文章采集工具(采用HtmlAgilityPack类库的应用)
优采云 发布时间: 2022-02-11 03:17 | 文章采集工具(采用HtmlAgilityPack类库的应用)
我们平时或多或少都需要采集互联网上的一些信息,采集的方法也有很多。为了更高效地采集数据,基本上都需要使用多线程;而采集到内容之后,最重要的是解析网页的内容。我们可以使用正则表达式来解析网页中的内容,而本文采用 HtmlAgilityPack 类库。
使用的工具类库包括:HtmlAgilityPack,以及苏飞的 HttpHelper 类;开发环境为 Visual Studio 2008、.NET Framework 2.0,最终效果如图:
同时也可以看到几个主要的类。这里采用工厂模式,目的是让扩展更容易。CollectorFactoryManager.cs 的代码如下:
using System;
using System.Collections.Generic;
namespace CollectDemo
{
    /// <summary>
    /// Owns the list of page factories, starts them with a bounded initial
    /// fan-out, and notifies the caller once every factory has reported back.
    /// </summary>
    public class CollectorFactoryManager
    {
        // Upper bound on how many factories Run() starts up-front.
        private const int initCount = 5;

        // Private gate: locking on "this" would let outside code contend for the same monitor.
        private readonly object syncRoot = new object();

        private IList<CollectorFactory> factoryList;
        private Action callback;

        // Index of the most recently started factory (-1 before anything has started).
        private int collectFactoryIndex;

        // Number of completion callbacks received so far.
        private int finishedFactoryCount;

        /// <summary>
        /// Creates the manager and registers the factories to run.
        /// </summary>
        /// <param name="callback">Invoked when collection is finished; may be null.</param>
        public CollectorFactoryManager(Action callback)
        {
            this.callback = callback;
            this.factoryList = new List<CollectorFactory>();
            // Any number of factories can be registered here.
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/", this.CollectorFactoryCalback));
            this.factoryList.Add(new CollectorFactoryOne("http://www.cnblogs.com/sitehome/p/2", this.CollectorFactoryCalback));
        }

        /// <summary>
        /// Starts collecting. At most <c>initCount</c> factories are started here;
        /// the remaining ones are started as earlier ones report completion.
        /// </summary>
        public void Run()
        {
            lock (this.syncRoot)
            {
                this.collectFactoryIndex = -1;
                this.finishedFactoryCount = 0;
            }

            // Nothing to wait for: report completion right away instead of never.
            if (this.factoryList.Count == 0)
            {
                this.End();
                return;
            }

            // Threads are a limited resource, so only a bounded number is started initially.
            for (int index = 0; index < initCount && index < this.factoryList.Count; index++)
            {
                this.CollectorFactoryData();
            }
        }

        /// <summary>
        /// Starts the next factory that has not been started yet, if there is one.
        /// </summary>
        private void CollectorFactoryData()
        {
            CollectorFactory nextFactory = null;
            lock (this.syncRoot)
            {
                if (this.collectFactoryIndex + 1 < this.factoryList.Count)
                {
                    this.collectFactoryIndex++;
                    nextFactory = this.factoryList[this.collectFactoryIndex];
                }
            }

            // Run() is called outside the lock because CollectorFactory.Run sleeps
            // before spawning its thread; holding the lock there would block every
            // completion callback for the duration of the sleep.
            if (nextFactory != null)
            {
                nextFactory.Run();
            }
        }

        /// <summary>
        /// Completion callback handed to each factory. Counts the completion and
        /// either starts the next pending factory or, once the number of
        /// completions equals the number of factories, signals the end.
        /// </summary>
        /// <remarks>
        /// Assumes each factory invokes this callback exactly once.
        /// </remarks>
        public void CollectorFactoryCalback()
        {
            bool allFinished;
            lock (this.syncRoot)
            {
                this.finishedFactoryCount++;
                // "==" (not ">=") so End() is raised a single time even if a
                // factory reports more than once.
                allFinished = this.finishedFactoryCount == this.factoryList.Count;
            }

            if (allFinished)
            {
                this.End();
            }
            else
            {
                this.CollectorFactoryData();
            }
        }

        /// <summary>
        /// Signals that collection has finished by invoking the caller's callback, if any.
        /// </summary>
        public void End()
        {
            if (this.callback != null) this.callback();
        }
    }
}
CollectorFactory.cs代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
namespace CollectDemo
{
    /// <summary>
    /// Base class for collecting one listing page: downloads the page, extracts
    /// the items to collect, runs them with a bounded initial fan-out, and
    /// invokes the completion callback once all items have reported back.
    /// Subclasses override <see cref="CreateAndGetHtmlContent"/> and
    /// <see cref="AnalysisHtmlContent"/>.
    /// </summary>
    public class CollectorFactory
    {
        // Upper bound on how many items CollectorPageData() starts up-front.
        private const int initCount = 10;

        // Private gate: locking on "this" would let outside code contend for the same monitor.
        private readonly object syncRoot = new object();

        // Number of item completion callbacks received so far.
        private int completedItemCount;

        protected string htmlText;
        protected string urlPath;
        protected IList<CollectorItem> collectorItemList;
        protected Action callback;

        // Index of the most recently started item (-1 before anything has started).
        protected int collectItemIndex;

        /// <summary>
        /// Creates a factory for the given listing page.
        /// </summary>
        /// <param name="urlPath">URL of the listing page.</param>
        /// <param name="callback">Invoked when this factory has finished; may be null.</param>
        public CollectorFactory(string urlPath, Action callback)
        {
            this.urlPath = urlPath;
            this.callback = callback;
        }

        /// <summary>
        /// Starts collecting: sleeps 1-3 seconds on the calling thread, then runs
        /// <see cref="Start"/> on a new thread.
        /// </summary>
        public virtual void Run()
        {
            // Random delay so the requests look less like a crawler.
            int sleepData = new Random().Next(1000, 3000);
            Thread.Sleep(sleepData);
            Thread thread = new Thread(new ThreadStart(this.Start));
            thread.Start();
        }

        /// <summary>
        /// Thread entry point: download, analyse, then collect the discovered items.
        /// </summary>
        protected virtual void Start()
        {
            this.CreateAndGetHtmlContent();
            this.AnalysisHtmlContent();
            this.CollectorPageData();
        }

        /// <summary>
        /// Builds the request and downloads the page. No-op in the base class.
        /// </summary>
        protected virtual void CreateAndGetHtmlContent()
        {
        }

        /// <summary>
        /// Analyses the downloaded page and fills <see cref="collectorItemList"/>.
        /// No-op in the base class.
        /// </summary>
        protected virtual void AnalysisHtmlContent()
        {
        }

        /// <summary>
        /// Starts up to <c>initCount</c> items. When there is nothing to collect,
        /// completion is reported immediately so the owner is not left waiting.
        /// </summary>
        protected virtual void CollectorPageData()
        {
            int itemCount = this.collectorItemList == null ? 0 : this.collectorItemList.Count;
            lock (this.syncRoot)
            {
                this.collectItemIndex = -1;
                this.completedItemCount = 0;
            }

            if (itemCount == 0)
            {
                this.End();
                return;
            }

            for (int index = 0; index < initCount && index < itemCount; index++)
            {
                this.CollectorItemData();
            }
        }

        /// <summary>
        /// Starts the next item that has not been started yet, if there is one.
        /// Completion is signalled from <see cref="CollectorItemCalback"/>, not here.
        /// </summary>
        public virtual void CollectorItemData()
        {
            CollectorItem nextItem = null;
            lock (this.syncRoot)
            {
                if (this.collectorItemList != null
                    && this.collectItemIndex + 1 < this.collectorItemList.Count)
                {
                    this.collectItemIndex++;
                    nextItem = this.collectorItemList[this.collectItemIndex];
                }
            }

            // Run() is called outside the lock because CollectorItem.Run sleeps
            // before spawning its thread; holding the lock there would block every
            // item callback for the duration of the sleep.
            if (nextItem != null)
            {
                nextItem.Run();
            }
        }

        /// <summary>
        /// Completion callback handed to each item. Counts the completion and
        /// either starts the next pending item or, once the number of completions
        /// equals the number of items, calls <see cref="End"/>.
        /// </summary>
        /// <remarks>
        /// Assumes each item invokes this callback exactly once.
        /// </remarks>
        public void CollectorItemCalback()
        {
            bool allCompleted;
            lock (this.syncRoot)
            {
                this.completedItemCount++;
                int itemCount = this.collectorItemList == null ? 0 : this.collectorItemList.Count;
                // "==" (not ">=") so End() is raised a single time even if an
                // item reports more than once.
                allCompleted = this.completedItemCount == itemCount;
            }

            if (allCompleted)
            {
                this.End();
            }
            else
            {
                this.CollectorItemData();
            }
        }

        /// <summary>
        /// Signals that this factory has finished by invoking its callback, if any.
        /// </summary>
        public virtual void End()
        {
            if (this.callback != null) this.callback();
        }
    }
}
CollectorItem.cs 代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
namespace CollectDemo
{
    /// <summary>
    /// Base class for collecting a single page. Subclasses override
    /// <see cref="CreateAndGetHtmlContent"/> and <see cref="AnalysisHtmlContent"/>;
    /// <see cref="End"/> invokes the completion callback.
    /// </summary>
    public class CollectorItem
    {
        protected string htmlText;
        protected CollectorFactory collectorFactory;
        protected string urlPath;
        protected Action callback;

        /// <summary>
        /// Creates an item for the given page.
        /// </summary>
        /// <param name="collectorFactory">Factory that owns this item.</param>
        /// <param name="urlPath">URL of the page to collect.</param>
        /// <param name="callback">Invoked by <see cref="End"/>; may be null.</param>
        public CollectorItem(CollectorFactory collectorFactory, string urlPath, Action callback)
        {
            this.callback = callback;
            this.urlPath = urlPath;
            this.collectorFactory = collectorFactory;
        }

        /// <summary>
        /// Sleeps 2-6 seconds on the calling thread, then runs <see cref="Start"/>
        /// on a new thread.
        /// </summary>
        public void Run()
        {
            // Random pause so the requests look less like a crawler.
            Random delaySource = new Random();
            Thread.Sleep(delaySource.Next(2000, 6000));

            ThreadStart entryPoint = new ThreadStart(this.Start);
            Thread worker = new Thread(entryPoint);
            worker.Start();
        }

        /// <summary>
        /// Thread entry point: download the page, then analyse it.
        /// </summary>
        protected virtual void Start()
        {
            this.CreateAndGetHtmlContent();
            this.AnalysisHtmlContent();
        }

        /// <summary>
        /// Builds the request and downloads the page. No-op in the base class.
        /// </summary>
        protected virtual void CreateAndGetHtmlContent()
        {
        }

        /// <summary>
        /// Analyses the downloaded page. No-op in the base class.
        /// </summary>
        protected virtual void AnalysisHtmlContent()
        {
        }

        /// <summary>
        /// Invokes the completion callback when one was supplied.
        /// </summary>
        public virtual void End()
        {
            if (this.callback == null)
            {
                return;
            }

            this.callback();
        }
    }
}
这个例子采集的是博客园的前两页数据,所以我们需要一个 CollectorFactoryOne.cs 类来解析这两页中的文章链接,代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
namespace CollectDemo
{
    /// <summary>
    /// Factory for a cnblogs listing page: downloads the page and creates one
    /// <see cref="CollectorItemOne"/> per article link found on it.
    /// </summary>
    public class CollectorFactoryOne : CollectorFactory
    {
        /// <summary>
        /// Creates a factory for the given listing page.
        /// </summary>
        /// <param name="urlPath">URL of the listing page.</param>
        /// <param name="callback">Invoked when this factory has finished; may be null.</param>
        public CollectorFactoryOne(string urlPath, Action callback) : base(urlPath, callback)
        {
        }

        /// <summary>
        /// Issues a GET request for <c>urlPath</c> with browser-like headers and
        /// stores the response body in <c>htmlText</c>.
        /// </summary>
        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        /// <summary>
        /// Parses the downloaded page and fills <c>collectorItemList</c> with one
        /// item per anchor whose class is <c>titlelnk</c>. The list is always
        /// created, and is left empty when there is no content or no matching link.
        /// </summary>
        protected override void AnalysisHtmlContent()
        {
            this.collectorItemList = new List<CollectorItem>();

            // LoadHtml rejects null; with no content there is nothing to parse.
            if (string.IsNullOrEmpty(this.htmlText))
            {
                return;
            }

            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(this.htmlText);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection hrefList = htmlDocument.DocumentNode.SelectNodes("//a[@class = 'titlelnk']");
            if (hrefList == null)
            {
                return;
            }

            foreach (HtmlNode hrefNode in hrefList)
            {
                // The attribute indexer returns null for an anchor without href; skip those.
                HtmlAttribute htmlAttribute = hrefNode.Attributes["href"];
                if (htmlAttribute == null || string.IsNullOrEmpty(htmlAttribute.Value))
                {
                    continue;
                }

                this.collectorItemList.Add(new CollectorItemOne(this, htmlAttribute.Value, this.CollectorItemCalback));
            }
        }
    }
}
还有一个 CollectorItemOne.cs 类,用来解析博客园各个文章页面的内容,代码如下:
using System;
using System.Collections.Generic;
using System.Threading;
using HtmlAgilityPack;
using System.IO;
namespace CollectDemo
{
    /// <summary>
    /// Item for a single cnblogs article page: downloads the page and saves its
    /// HTML to a uniquely named file under the application's <c>txt</c> folder.
    /// </summary>
    public class CollectorItemOne : CollectorItem
    {
        /// <summary>
        /// Creates an item for the given article page.
        /// </summary>
        /// <param name="collectorFactory">Factory that owns this item.</param>
        /// <param name="urlPath">URL of the article page.</param>
        /// <param name="callback">Invoked when this item has finished; may be null.</param>
        public CollectorItemOne(CollectorFactory collectorFactory, string urlPath, Action callback)
            : base(collectorFactory, urlPath, callback)
        {
        }

        /// <summary>
        /// Issues a GET request for <c>urlPath</c> with browser-like headers and
        /// stores the response body in <c>htmlText</c>.
        /// </summary>
        protected override void CreateAndGetHtmlContent()
        {
            HttpItem httpItem = new HttpItem();
            httpItem.URL = this.urlPath;
            httpItem.Method = "get";
            httpItem.UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:24.0) Gecko/20100101 Firefox/24.0";
            httpItem.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            HttpResult httpResult = new HttpHelperUtils().GetHtml(httpItem);
            this.htmlText = httpResult.Html;
        }

        /// <summary>
        /// Parses the downloaded page and writes its HTML, UTF-8 encoded, to
        /// <c>&lt;StartupPath&gt;\txt\&lt;guid&gt;.txt</c>. <c>End()</c> is called
        /// exactly once whether or not saving succeeded, so the owner is always
        /// notified that this item is done.
        /// </summary>
        protected override void AnalysisHtmlContent()
        {
            // No lock is taken: everything touched here is local to this item, and
            // the file name is a fresh GUID, so items cannot collide with each other.
            try
            {
                HtmlDocument htmlDocument = new HtmlDocument();
                // LoadHtml rejects null; treat a missing response as an empty document.
                htmlDocument.LoadHtml(this.htmlText ?? string.Empty);

                string directoryPath = Path.Combine(System.Windows.Forms.Application.StartupPath, "txt");
                // Creates the folder on first use; does nothing when it already exists.
                Directory.CreateDirectory(directoryPath);

                string filePath = Path.Combine(directoryPath, System.Guid.NewGuid() + ".txt");
                File.WriteAllText(filePath, htmlDocument.DocumentNode.InnerHtml, System.Text.Encoding.UTF8);
            }
            catch (Exception ex)
            {
                // Best effort per page: one failed page must not take down the
                // worker thread, but the failure is recorded instead of being
                // silently dropped.
                System.Diagnostics.Trace.TraceError("Failed to save page {0}: {1}", this.urlPath, ex);
            }
            finally
            {
                // Must always run: the owner waits for this notification.
                this.End();
            }
        }
    }
}
主要的多线程操作已经封装好,扩展时只需要处理网页内容的采集和解析,即可实现快速扩展。