文章采集工具(采集HtmlAgilityPack类库的应用)

优采云 发布时间: 2022-02-11 03:17

  文章采集工具(采集HtmlAgilityPack类库的应用)

  我们通常或多或少需要采集互联网上的一些信息。那个时候采集的方法很多。为了更高效的采集数据,我们基本上都需要使用多线程,采集下内容,最重要的是分析网页的内容,我们可以使用正则来分析网页中的内容,今天我们采集 HtmlAgilityPack 类库。

  使用的工具类库包括:HtmlAgilityPack,以及苏飞的一个HttpHelper类,开发环境VisualStudio 2008,.NetFramework 2.0,最终结果如图:

  

  同时我也看到了几个主要的类,这里采集工厂模式,目的是为了让扩展更容易,CollectorFactoryManager.cs的代码如下:

  using System;

using System.Collections.Generic;

namespace CollectDemo
{
    /// <summary>
    /// Coordinates a fixed list of <see cref="CollectorFactory"/> instances.
    /// <see cref="Run"/> starts up to <c>initCount</c> factories; each completion
    /// reported through <see cref="CollectorFactoryCalback"/> starts the next pending
    /// factory, and the callback supplied to the constructor is invoked once every
    /// factory has reported completion.
    /// </summary>
    public class CollectorFactoryManager
    {
        // Maximum number of factories started up-front by Run().
        private const int initCount = 5;

        // Private gate for the two counters below (replaces lock(this), which any
        // outside code holding a reference to this object could also lock on).
        private readonly object syncRoot = new object();

        private IList<CollectorFactory> factoryList;

        private Action callback;

        // Index of the most recently started factory; -1 before any has been started.
        private int collectFactoryIndex;

        // Number of completion notifications received since the last Run().
        private int completedFactoryCount;

        /// <summary>
        /// Creates the manager and registers the factories to run.
        /// </summary>
        /// <param name="callback">Invoked by <see cref="End"/>; may be null.</param>
        public CollectorFactoryManager(Action callback)
        {
            this.callback = callback;
            this.factoryList = new List<CollectorFactory>();

            // Further factories can be appended here.
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/", this.CollectorFactoryCalback));
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/sitehome/p/2", this.CollectorFactoryCalback));
        }

        /// <summary>
        /// Starts collecting. At most <c>initCount</c> factories are started here;
        /// the remaining ones are started as earlier ones complete.
        /// </summary>
        public void Run()
        {
            lock (this.syncRoot)
            {
                this.collectFactoryIndex = -1;
                this.completedFactoryCount = 0;
            }

            // With nothing registered no completion will ever arrive, so finish now.
            if (this.factoryList.Count == 0)
            {
                this.End();
                return;
            }

            for (int index = 0; index < initCount && index < this.factoryList.Count; index++)
            {
                this.CollectorFactoryData();
            }
        }

        /// <summary>
        /// Starts the next factory that has not been started yet; does nothing when
        /// every factory has already been started.
        /// </summary>
        private void CollectorFactoryData()
        {
            lock (this.syncRoot)
            {
                int nextIndex = this.collectFactoryIndex + 1;
                if (nextIndex >= this.factoryList.Count)
                {
                    return;
                }

                this.collectFactoryIndex = nextIndex;

                // As in the original code, the factory is launched while the gate is
                // held, so launches stay serialized one after another.
                this.factoryList[nextIndex].Run();
            }
        }

        /// <summary>
        /// Completion notification handed to every factory. Counts the completion and
        /// either finishes the whole run (when all factories have reported) or starts
        /// the next pending factory.
        /// </summary>
        public void CollectorFactoryCalback()
        {
            bool allCompleted;
            lock (this.syncRoot)
            {
                this.completedFactoryCount++;

                // '==' rather than '>=' so End() fires exactly once per run.
                allCompleted = this.completedFactoryCount == this.factoryList.Count;
            }

            if (allCompleted)
            {
                // Invoked outside the gate so user code never runs under our lock.
                this.End();
            }
            else
            {
                this.CollectorFactoryData();
            }
        }

        /// <summary>
        /// Signals that collecting has finished by invoking the constructor callback, if any.
        /// </summary>
        public void End()
        {
            if (this.callback != null)
            {
                this.callback();
            }
        }
    }
}

  CollectorFactory.cs代码如下:

  using System;

using System.Collections.Generic;

using System.Threading;

using HtmlAgilityPack;

namespace CollectDemo
{
    /// <summary>
    /// Base class for collecting one listing page. <see cref="Run"/> spawns a worker
    /// thread that fetches the page, lets the subclass turn it into
    /// <see cref="CollectorItem"/> entries, and then runs those items with at most
    /// <c>initCount</c> started up-front. The constructor callback is invoked once
    /// every item has reported completion through <see cref="CollectorItemCalback"/>.
    /// </summary>
    public class CollectorFactory
    {
        // Maximum number of items started up-front by CollectorPageData().
        private const int initCount = 10;

        // One shared generator: separate "new Random()" instances created close
        // together can yield identical values. Random is not thread-safe, so every
        // use is wrapped in a lock on this instance.
        private static readonly Random delayRandom = new Random();

        // Private gate for collectItemIndex / completedItemCount (replaces lock(this)).
        private readonly object syncRoot = new object();

        // Number of item completion notifications received for the current page.
        private int completedItemCount;

        protected string htmlText;

        protected string urlPath;

        protected IList<CollectorItem> collectorItemList;

        protected Action callback;

        // Index of the most recently started item; -1 before any has been started.
        protected int collectItemIndex;

        /// <summary>
        /// Creates a factory for one page.
        /// </summary>
        /// <param name="urlPath">Address of the page to collect.</param>
        /// <param name="callback">Invoked by <see cref="End"/>; may be null.</param>
        public CollectorFactory(string urlPath, Action callback)
        {
            this.urlPath = urlPath;
            this.callback = callback;
        }

        /// <summary>
        /// Starts collecting: blocks the calling thread for a random 1000-2999 ms
        /// (so requests look less like a crawler) and then runs <see cref="Start"/>
        /// on a new thread.
        /// </summary>
        public virtual void Run()
        {
            int sleepData;
            lock (delayRandom)
            {
                sleepData = delayRandom.Next(1000, 3000);
            }

            Thread.Sleep(sleepData);

            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Worker-thread entry point: fetch the page, analyse it, then start its items.
        /// </summary>
        protected virtual void Start()
        {
            this.CreateAndGetHtmlContent();
            this.AnalysisHtmlContent();
            this.CollectorPageData();
        }

        /// <summary>
        /// Hook for building the request and fetching the page. Empty in the base class.
        /// </summary>
        protected virtual void CreateAndGetHtmlContent()
        {
        }

        /// <summary>
        /// Hook for analysing the fetched page. Empty in the base class.
        /// </summary>
        protected virtual void AnalysisHtmlContent()
        {
        }

        /// <summary>
        /// Starts up to <c>initCount</c> items of <c>collectorItemList</c>. When the
        /// list is null or empty there is nothing to wait for, so <see cref="End"/>
        /// is called immediately instead of never being called at all.
        /// </summary>
        protected virtual void CollectorPageData()
        {
            lock (this.syncRoot)
            {
                this.collectItemIndex = -1;
                this.completedItemCount = 0;
            }

            if (this.collectorItemList == null || this.collectorItemList.Count == 0)
            {
                this.End();
                return;
            }

            for (int index = 0; index < initCount && index < this.collectorItemList.Count; index++)
            {
                this.CollectorItemData();
            }
        }

        /// <summary>
        /// Starts the next item that has not been started yet; does nothing when the
        /// list is null or every item has already been started.
        /// </summary>
        public virtual void CollectorItemData()
        {
            lock (this.syncRoot)
            {
                if (this.collectorItemList == null)
                {
                    return;
                }

                int nextIndex = this.collectItemIndex + 1;
                if (nextIndex >= this.collectorItemList.Count)
                {
                    return;
                }

                this.collectItemIndex = nextIndex;

                // As in the original code, the item is launched while the gate is held.
                // CollectorItem.Run() sleeps before spawning its thread, so launches
                // stay spaced out one after another.
                this.collectorItemList[nextIndex].Run();
            }
        }

        /// <summary>
        /// Completion notification handed to every item. Counts the completion and
        /// either finishes this factory (when all items have reported) or starts the
        /// next pending item.
        /// </summary>
        public void CollectorItemCalback()
        {
            bool allCompleted;
            lock (this.syncRoot)
            {
                this.completedItemCount++;

                // '==' rather than '>=' so End() fires exactly once per page.
                allCompleted = this.collectorItemList != null
                    && this.completedItemCount == this.collectorItemList.Count;
            }

            if (allCompleted)
            {
                // Invoked outside the gate so the callback never runs under our lock.
                this.End();
            }
            else
            {
                this.CollectorItemData();
            }
        }

        /// <summary>
        /// Signals that this factory has finished by invoking the constructor callback, if any.
        /// </summary>
        public virtual void End()
        {
            if (this.callback != null)
            {
                this.callback();
            }
        }
    }
}

  CollectorItem.cs 代码如下:

  using System;

using System.Collections.Generic;

using System.Threading;

using HtmlAgilityPack;

namespace CollectDemo
{
    /// <summary>
    /// Base class for collecting a single page that belongs to a
    /// <see cref="CollectorFactory"/>. <see cref="Run"/> spawns a worker thread that
    /// calls the two hooks in order. The base class never calls <see cref="End"/>
    /// itself; a subclass has to call it when its work is finished so that the
    /// completion callback fires.
    /// </summary>
    public class CollectorItem
    {
        // One shared generator: separate "new Random()" instances created close
        // together can yield identical values. Random is not thread-safe, so every
        // use is wrapped in a lock on this instance.
        private static readonly Random delayRandom = new Random();

        protected string htmlText;

        protected CollectorFactory collectorFactory;

        protected string urlPath;

        protected Action callback;

        /// <summary>
        /// Creates an item for one page.
        /// </summary>
        /// <param name="collectorFactory">The factory this item belongs to.</param>
        /// <param name="urlPath">Address of the page to collect.</param>
        /// <param name="callback">Invoked by <see cref="End"/>; may be null.</param>
        public CollectorItem(CollectorFactory collectorFactory, string urlPath, Action callback)
        {
            this.collectorFactory = collectorFactory;
            this.urlPath = urlPath;
            this.callback = callback;
        }

        /// <summary>
        /// Starts collecting: blocks the calling thread for a random 2000-5999 ms
        /// (so requests look less like a crawler) and then runs <see cref="Start"/>
        /// on a new thread.
        /// </summary>
        public void Run()
        {
            int sleepData;
            lock (delayRandom)
            {
                sleepData = delayRandom.Next(2000, 6000);
            }

            Thread.Sleep(sleepData);

            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Worker-thread entry point: fetch the page, then analyse it.
        /// </summary>
        protected virtual void Start()
        {
            this.CreateAndGetHtmlContent();
            this.AnalysisHtmlContent();
        }

        /// <summary>
        /// Hook for building the request and fetching the page. Empty in the base class.
        /// </summary>
        protected virtual void CreateAndGetHtmlContent()
        {
        }

        /// <summary>
        /// Hook for analysing the fetched page. Empty in the base class.
        /// </summary>
        protected virtual void AnalysisHtmlContent()
        {
        }

        /// <summary>
        /// Signals that this item has finished by invoking the constructor callback, if any.
        /// </summary>
        public virtual void End()
        {
            if (this.callback != null)
            {
                this.callback();
            }
        }
    }
}

  这个例子采集是博客园的前两页数据,所以我们需要一个CollectorFactoryOne.cs类来解析两页的数据链接,代码如下:

  using System;

using System.Collections.Generic;

using System.Threading;

using HtmlAgilityPack;

namespace CollectDemo
{
    /// <summary>
    /// Factory for a cnblogs listing page: downloads the page and creates one
    /// <see cref="CollectorItemOne"/> for every anchor whose class is <c>titlelnk</c>.
    /// </summary>
    public class CollectorFactoryOne : CollectorFactory
    {
        /// <summary>
        /// Creates a factory for one listing page.
        /// </summary>
        /// <param name="urlPath">Address of the listing page.</param>
        /// <param name="callback">Passed through to the base class.</param>
        public CollectorFactoryOne(string urlPath, Action callback) : base(urlPath, callback)
        {
        }

        /// <summary>
        /// Issues a GET request for <c>urlPath</c> with browser-like headers and stores
        /// the returned HTML in <c>htmlText</c>.
        /// </summary>
        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        /// <summary>
        /// Parses <c>htmlText</c> and fills <c>collectorItemList</c> with one item per
        /// article link. The list is always assigned; it is left empty when there is
        /// no HTML or no matching link.
        /// </summary>
        protected override void AnalysisHtmlContent()
        {
            this.collectorItemList = new List<CollectorItem>();

            // NOTE(review): assumes a failed request can leave Html null or empty —
            // confirm against HttpHelperUtils. LoadHtml must not be given null.
            if (string.IsNullOrEmpty(this.htmlText))
            {
                return;
            }

            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(this.htmlText);

            // The quotes were HTML-escaped ("&#39;") in the published listing, which
            // is not a valid XPath expression; plain single quotes are required.
            HtmlNodeCollection hrefList = htmlDocument.DocumentNode.SelectNodes("//a[@class = 'titlelnk']");
            if (hrefList == null)
            {
                return;
            }

            foreach (HtmlNode hrefNode in hrefList)
            {
                // Skip anchors that carry no usable href instead of dereferencing null.
                HtmlAttribute htmlAttribute = hrefNode.Attributes["href"];
                if (htmlAttribute == null || string.IsNullOrEmpty(htmlAttribute.Value))
                {
                    continue;
                }

                this.collectorItemList.Add(new CollectorItemOne(this, htmlAttribute.Value, this.CollectorItemCalback));
            }
        }
    }
}

  还有一个CollectorItemOne.cs类,解析博客园各个页面的内容,代码如下:

  using System;

using System.Collections.Generic;

using System.Threading;

using HtmlAgilityPack;

using System.IO;

namespace CollectDemo
{
    /// <summary>
    /// Item for a single cnblogs article page: downloads the page and saves its HTML
    /// to a new, GUID-named <c>.txt</c> file in the <c>txt</c> folder under the
    /// application's startup path.
    /// </summary>
    public class CollectorItemOne : CollectorItem
    {
        /// <summary>
        /// Creates an item for one article page.
        /// </summary>
        /// <param name="collectorFactory">The factory this item belongs to.</param>
        /// <param name="urlPath">Address of the article page.</param>
        /// <param name="callback">Passed through to the base class.</param>
        public CollectorItemOne(CollectorFactory collectorFactory, string urlPath, Action callback)
            : base(collectorFactory, urlPath, callback)
        {
        }

        /// <summary>
        /// Issues a GET request for <c>urlPath</c> with browser-like headers and stores
        /// the returned HTML in <c>htmlText</c>.
        /// </summary>
        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        /// <summary>
        /// Parses <c>htmlText</c> and writes the document's inner HTML to disk.
        /// <see cref="CollectorItem.End"/> is called on every path, including an
        /// empty page or a thrown exception, because the base class never calls it
        /// and the completion callback would otherwise never fire.
        /// </summary>
        protected override void AnalysisHtmlContent()
        {
            // No lock here: this instance is processed by its own worker thread only,
            // and holding a lock while calling End() would run the callback under it.
            try
            {
                // NOTE(review): assumes a failed request can leave Html null or empty —
                // confirm against HttpHelperUtils. LoadHtml must not be given null.
                if (string.IsNullOrEmpty(this.htmlText))
                {
                    return;
                }

                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(this.htmlText);

                this.SaveHtml(htmlDocument.DocumentNode.InnerHtml);
            }
            finally
            {
                // Must always run, see the summary above.
                this.End();
            }
        }

        /// <summary>
        /// Writes <paramref name="html"/> as UTF-8 to a new GUID-named file, creating
        /// the target folder when it is missing. Saving is best-effort: failures are
        /// traced instead of thrown.
        /// </summary>
        private void SaveHtml(string html)
        {
            try
            {
                string directoryPath = Path.Combine(System.Windows.Forms.Application.StartupPath, "txt");

                // No-op when the folder already exists.
                Directory.CreateDirectory(directoryPath);

                string filePath = Path.Combine(directoryPath, System.Guid.NewGuid().ToString() + ".txt");

                // Kept from the original: never overwrite an existing file.
                if (File.Exists(filePath))
                {
                    return;
                }

                File.WriteAllText(filePath, html, System.Text.Encoding.UTF8);
            }
            catch (Exception ex)
            {
                // The original swallowed every failure silently; keep the best-effort
                // behaviour but leave a trace for diagnosis.
                System.Diagnostics.Trace.TraceError("Failed to save page '{0}': {1}", this.urlPath, ex);
            }
        }
    }
}

  主要的多线程操作已经封装,只需要处理采集并解析网页内容即可实现快速扩展。

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线