文章采集工具(采集HtmlAgilityPack类库的应用)

优采云发布时间: 2022-02-11 03:17

　　我们平时或多或少都需要采集互联网上的一些信息，采集的方法也有很多。为了更高效地采集数据，基本上都需要使用多线程；内容采集下来之后，最重要的就是分析网页的内容。我们可以使用正则表达式来分析网页中的内容，而今天我们使用 HtmlAgilityPack 类库来完成解析。

　　使用的工具类库包括：HtmlAgilityPack，以及苏飞的一个 HttpHelper 类；开发环境为 Visual Studio 2008、.NET Framework 2.0，最终结果如图：

　　从图中也可以看到几个主要的类。这里采用工厂模式，目的是让扩展更容易。CollectorFactoryManager.cs 的代码如下：

　　using System;

using System.Collections.Generic;

namespace CollectDemo
{
    /// <summary>
    /// Manages the list of <see cref="CollectorFactory"/> instances: starts a
    /// bounded number of them, starts the next one whenever a factory reports
    /// completion, and invokes the completion callback once every factory has
    /// reported back.
    /// </summary>
    public class CollectorFactoryManager
    {
        // Upper bound on the number of factories started up front by Run(),
        // because the number of threads is limited.
        private const int initCount = 5;

        // Private gate; locking on "this" would let outside code contend for
        // the same monitor.
        private readonly object syncRoot = new object();

        private IList<CollectorFactory> factoryList;

        private Action callback;

        // Index of the most recently claimed factory; -1 before the first one.
        private int collectFactoryIndex;

        // Number of factories that have been started and have not reported back yet.
        private int runningCount;

        // Set once the completion callback has been scheduled, so it fires only once per Run().
        private bool finished;

        /// <summary>
        /// Creates the manager and registers the factories to run.
        /// </summary>
        /// <param name="callback">Invoked when collection has finished; may be null.</param>
        public CollectorFactoryManager(Action callback)
        {
            this.callback = callback;

            this.factoryList = new List<CollectorFactory>();

            // Any number of additional factories can be registered here.
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/", this.CollectorFactoryCalback));
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/sitehome/p/2", this.CollectorFactoryCalback));
        }

        /// <summary>
        /// Starts collecting: launches up to <c>initCount</c> factories; the
        /// remaining ones are started as earlier factories report completion.
        /// </summary>
        public void Run()
        {
            lock (this.syncRoot)
            {
                this.collectFactoryIndex = -1;
                this.runningCount = 0;
                this.finished = false;
            }

            if (this.factoryList.Count == 0)
            {
                // Nothing to start; this call only reports completion.
                this.CollectorFactoryData();
                return;
            }

            for (int index = 0; index < initCount && index < this.factoryList.Count; index++)
            {
                this.CollectorFactoryData();
            }
        }

        /// <summary>
        /// Claims the next factory and starts it, or, when no factory is left
        /// and none is still running, reports the end of collection.
        /// </summary>
        private void CollectorFactoryData()
        {
            bool allDone = false;

            lock (this.syncRoot)
            {
                this.collectFactoryIndex++;

                if (this.collectFactoryIndex < this.factoryList.Count)
                {
                    // Collection not finished yet: start the next factory in order.
                    // Run() stays inside the lock, as before, so factory starts
                    // remain serialized.
                    CollectorFactory collectorFactory = this.factoryList[this.collectFactoryIndex];
                    this.runningCount++;
                    collectorFactory.Run();
                }
                else if (this.runningCount <= 0 && !this.finished)
                {
                    // Every factory has been started and has reported back.
                    this.finished = true;
                    allDone = true;
                }
            }

            // Invoke the user callback outside the lock.
            if (allDone)
            {
                this.End();
            }
        }

        /// <summary>
        /// Callback handed to each factory; a factory calls it when it has finished.
        /// </summary>
        public void CollectorFactoryCalback()
        {
            lock (this.syncRoot)
            {
                // Clamp at zero so an extra notification cannot drive the count negative.
                if (this.runningCount > 0)
                {
                    this.runningCount--;
                }
            }

            this.CollectorFactoryData();
        }

        /// <summary>
        /// Signals the end of collection by invoking the callback, if one was supplied.
        /// </summary>
        public void End()
        {
            if (this.callback != null)
            {
                this.callback();
            }
        }
    }
}

　　CollectorFactory.cs代码如下：

　　using System;

using System.Collections.Generic;

using System.Threading;

using HtmlAgilityPack;

namespace CollectDemo
{
    /// <summary>
    /// Base class for a page-level collector. On its own thread it fetches a
    /// page, analyses it into a list of <see cref="CollectorItem"/> objects,
    /// runs those items with bounded concurrency, and invokes its callback
    /// once all of them have reported back.
    /// Subclasses override <see cref="CreateAndGetHtmlContent"/> and
    /// <see cref="AnalysisHtmlContent"/>.
    /// </summary>
    public class CollectorFactory
    {
        // Upper bound on the number of items started up front by CollectorPageData().
        private const int initCount = 10;

        // Shared generator: a time-seeded Random created per call can repeat
        // values when several are created close together. Random is not
        // thread-safe, so access is guarded by locking on the instance.
        private static readonly Random delayRandom = new Random();

        // Private gate; locking on "this" would let outside code contend for
        // the same monitor.
        private readonly object syncRoot = new object();

        protected string htmlText;

        protected string urlPath;

        protected IList<CollectorItem> collectorItemList;

        protected Action callback;

        // Index of the most recently claimed item; -1 before the first one.
        protected int collectItemIndex;

        // Number of items that have been started and have not reported back yet.
        private int runningCount;

        // Set once End() has been scheduled, so it fires only once per page.
        private bool finished;

        /// <summary>
        /// Creates a collector for one page.
        /// </summary>
        /// <param name="urlPath">URL of the page to collect.</param>
        /// <param name="callback">Invoked when this factory has finished; may be null.</param>
        public CollectorFactory(string urlPath, Action callback)
        {
            this.urlPath = urlPath;
            this.callback = callback;
        }

        /// <summary>
        /// Starts collecting: pauses for a random 1000-2999 ms on the calling
        /// thread, then runs <see cref="Start"/> on a new thread.
        /// </summary>
        public virtual void Run()
        {
            // Random pause so the requests are less likely to be treated as a crawler.
            int sleepData;
            lock (delayRandom)
            {
                sleepData = delayRandom.Next(1000, 3000);
            }

            Thread.Sleep(sleepData);

            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Thread entry point: fetch the page, analyse it, then collect its items.
        /// </summary>
        protected virtual void Start()
        {
            this.CreateAndGetHtmlContent();
            this.AnalysisHtmlContent();
            this.CollectorPageData();
        }

        /// <summary>
        /// Builds the request and downloads the page content. Empty in the base class.
        /// </summary>
        protected virtual void CreateAndGetHtmlContent()
        {
        }

        /// <summary>
        /// Analyses the downloaded content. Empty in the base class.
        /// </summary>
        protected virtual void AnalysisHtmlContent()
        {
        }

        /// <summary>
        /// Starts up to <c>initCount</c> items from <c>collectorItemList</c>.
        /// When the list is null or empty, completion is reported immediately
        /// so the owner of this factory is not left waiting.
        /// </summary>
        protected virtual void CollectorPageData()
        {
            bool nothingToCollect;

            lock (this.syncRoot)
            {
                this.collectItemIndex = -1;
                this.runningCount = 0;
                nothingToCollect = this.collectorItemList == null || this.collectorItemList.Count == 0;
                this.finished = nothingToCollect;
            }

            if (nothingToCollect)
            {
                this.End();
                return;
            }

            for (int index = 0; index < initCount && index < this.collectorItemList.Count; index++)
            {
                this.CollectorItemData();
            }
        }

        /// <summary>
        /// Claims the next item and starts it, or, when no item is left and
        /// none is still running, reports the end of this page.
        /// </summary>
        public virtual void CollectorItemData()
        {
            bool allDone = false;

            lock (this.syncRoot)
            {
                this.collectItemIndex++;

                if (this.collectItemIndex < this.collectorItemList.Count)
                {
                    // Run() stays inside the lock, as before, so item starts
                    // remain serialized.
                    CollectorItem collectorItem = this.collectorItemList[this.collectItemIndex];
                    this.runningCount++;
                    collectorItem.Run();
                }
                else if (this.runningCount <= 0 && !this.finished)
                {
                    // Every item has been started and has reported back.
                    this.finished = true;
                    allDone = true;
                }
            }

            // Invoke the callback outside the lock.
            if (allDone)
            {
                this.End();
            }
        }

        /// <summary>
        /// Callback handed to each item; an item calls it when it has finished.
        /// </summary>
        public void CollectorItemCalback()
        {
            lock (this.syncRoot)
            {
                // Clamp at zero so an extra notification cannot drive the count negative.
                if (this.runningCount > 0)
                {
                    this.runningCount--;
                }
            }

            this.CollectorItemData();
        }

        /// <summary>
        /// Signals that this factory has finished by invoking the callback, if one was supplied.
        /// </summary>
        public virtual void End()
        {
            if (this.callback != null)
            {
                this.callback();
            }
        }
    }
}

　　CollectorItem.cs 代码如下：

　　using System;

using System.Collections.Generic;

using System.Threading;

using HtmlAgilityPack;

namespace CollectDemo
{
    /// <summary>
    /// Base class for collecting a single page that belongs to a
    /// <see cref="CollectorFactory"/>. Subclasses override
    /// <see cref="CreateAndGetHtmlContent"/> and <see cref="AnalysisHtmlContent"/>.
    /// The base <see cref="Start"/> does not call <see cref="End"/>; the
    /// overriding code is expected to do so when it has finished.
    /// </summary>
    public class CollectorItem
    {
        // Shared generator: a time-seeded Random created per call can repeat
        // values when several are created close together. Random is not
        // thread-safe, so access is guarded by locking on the instance.
        private static readonly Random delayRandom = new Random();

        protected string htmlText;

        protected CollectorFactory collectorFactory;

        protected string urlPath;

        protected Action callback;

        /// <summary>
        /// Creates an item for one page.
        /// </summary>
        /// <param name="collectorFactory">The factory this item belongs to.</param>
        /// <param name="urlPath">URL of the page to collect.</param>
        /// <param name="callback">Invoked by <see cref="End"/>; may be null.</param>
        public CollectorItem(CollectorFactory collectorFactory, string urlPath, Action callback)
        {
            this.collectorFactory = collectorFactory;
            this.urlPath = urlPath;
            this.callback = callback;
        }

        /// <summary>
        /// Starts collecting: pauses for a random 2000-5999 ms on the calling
        /// thread, then runs <see cref="Start"/> on a new thread.
        /// </summary>
        public void Run()
        {
            // Random pause so the requests are less likely to be treated as a crawler.
            int sleepData;
            lock (delayRandom)
            {
                sleepData = delayRandom.Next(2000, 6000);
            }

            Thread.Sleep(sleepData);

            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Thread entry point: fetch the page, then analyse it.
        /// </summary>
        protected virtual void Start()
        {
            this.CreateAndGetHtmlContent();
            this.AnalysisHtmlContent();
        }

        /// <summary>
        /// Builds the request and downloads the page content. Empty in the base class.
        /// </summary>
        protected virtual void CreateAndGetHtmlContent()
        {
        }

        /// <summary>
        /// Analyses the downloaded content. Empty in the base class.
        /// </summary>
        protected virtual void AnalysisHtmlContent()
        {
        }

        /// <summary>
        /// Signals that this item has finished by invoking the callback, if one was supplied.
        /// </summary>
        public virtual void End()
        {
            if (this.callback != null)
            {
                this.callback();
            }
        }
    }
}

　　这个例子采集的是博客园首页的前两页数据，所以我们需要一个 CollectorFactoryOne.cs 类来解析这两页中的文章链接，代码如下：

　　using System;

using System.Collections.Generic;

using System.Threading;

using HtmlAgilityPack;

namespace CollectDemo
{
    /// <summary>
    /// Page-level collector for a list page: downloads the page and turns each
    /// anchor with class <c>titlelnk</c> into a <see cref="CollectorItemOne"/>.
    /// </summary>
    public class CollectorFactoryOne : CollectorFactory
    {
        /// <summary>
        /// Creates a collector for one list page.
        /// </summary>
        /// <param name="urlPath">URL of the list page.</param>
        /// <param name="callback">Invoked when this factory has finished; may be null.</param>
        public CollectorFactoryOne(string urlPath, Action callback) : base(urlPath, callback)
        {
        }

        /// <summary>
        /// Requests <c>urlPath</c> through the project's HttpHelperUtils with a
        /// browser-like User-Agent and Accept header, and stores the returned
        /// HTML in <c>htmlText</c>.
        /// </summary>
        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        /// <summary>
        /// Parses <c>htmlText</c> and fills <c>collectorItemList</c> with one
        /// item per article link. The list is left empty when there is no
        /// content or no matching anchors.
        /// </summary>
        protected override void AnalysisHtmlContent()
        {
            // Always publish a list, even when nothing can be parsed.
            this.collectorItemList = new List<CollectorItem>();

            if (string.IsNullOrEmpty(this.htmlText))
            {
                // No content was downloaded; there is nothing to parse.
                return;
            }

            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(this.htmlText);

            HtmlNodeCollection hrefList = htmlDocument.DocumentNode.SelectNodes("//a[@class = 'titlelnk']");
            if (hrefList == null)
            {
                return;
            }

            foreach (HtmlNode hrefNode in hrefList)
            {
                HtmlAttribute htmlAttribute = hrefNode.Attributes["href"];

                // Skip anchors that carry no usable link instead of failing on them.
                if (htmlAttribute == null || string.IsNullOrEmpty(htmlAttribute.Value))
                {
                    continue;
                }

                this.collectorItemList.Add(new CollectorItemOne(this, htmlAttribute.Value, this.CollectorItemCalback));
            }
        }
    }
}

　　还有一个CollectorItemOne.cs类，解析博客园各个页面的内容，代码如下：

　　using System;

using System.Collections.Generic;

using System.Threading;

using HtmlAgilityPack;

using System.IO;

namespace CollectDemo
{
    /// <summary>
    /// Collects a single article page: downloads it and saves the parsed HTML
    /// to a new <c>.txt</c> file under the <c>txt</c> folder next to the
    /// application.
    /// </summary>
    public class CollectorItemOne : CollectorItem
    {
        /// <summary>
        /// Creates an item for one article page.
        /// </summary>
        /// <param name="collectorFactory">The factory this item belongs to.</param>
        /// <param name="urlPath">URL of the article page.</param>
        /// <param name="callback">Invoked when this item has finished; may be null.</param>
        public CollectorItemOne(CollectorFactory collectorFactory, string urlPath, Action callback)
            : base(collectorFactory, urlPath, callback)
        {
        }

        /// <summary>
        /// Requests <c>urlPath</c> through the project's HttpHelperUtils with a
        /// browser-like User-Agent and Accept header, and stores the returned
        /// HTML in <c>htmlText</c>.
        /// </summary>
        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";

            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        /// <summary>
        /// Parses <c>htmlText</c> and writes the document's inner HTML to a
        /// UTF-8 file named with a fresh GUID. <see cref="CollectorItem.End"/>
        /// is always called afterwards, whether or not saving succeeded,
        /// because this is the only place this item reports completion.
        /// </summary>
        protected override void AnalysisHtmlContent()
        {
            try
            {
                HtmlDocument htmlDocument = new HtmlDocument();
                htmlDocument.LoadHtml(this.htmlText);

                // Make sure the output folder exists before writing into it.
                string directoryPath = Path.Combine(System.Windows.Forms.Application.StartupPath, "txt");
                Directory.CreateDirectory(directoryPath);

                // A fresh GUID names the file; StreamWriter creates it.
                string filePath = Path.Combine(directoryPath, System.Guid.NewGuid() + ".txt");

                using (StreamWriter streamWriter = new StreamWriter(filePath, false, System.Text.Encoding.UTF8))
                {
                    streamWriter.Write(htmlDocument.DocumentNode.InnerHtml);
                }
            }
            catch (Exception ex)
            {
                // Best effort: one failed page must not stop the run, but the
                // failure is traced rather than silently dropped.
                System.Diagnostics.Trace.TraceError("Failed to save page '{0}': {1}", this.urlPath, ex);
            }
            finally
            {
                // Processing is finished; this call is mandatory.
                this.End();
            }
        }
    }
}

　　主要的多线程操作已经封装，只需要处理采集并解析网页内容即可实现快速扩展。

0

2022-02-11

文章采集工具

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

文章采集工具(采集HtmlAgilityPack类库的应用)

0 个评论

发起人

AI时代内容工厂

文章采集工具(采集HtmlAgilityPack类库的应用)

0 个评论

发起人

相关问题