c#抓取网页数据( 类的代码:2016年10月26日(周四) )
优采云 发布时间: 2021-11-28 19:16
c#抓取网页数据(类的代码:2016年10月26日(周四))
正则表达式相关:C#抓取网页类(获取网页中的所有信息)
类的代码:
<p> 1 using System;
2 using System.Data;
3 using System.Configuration;
4 using System.Net;
5 using System.IO;
6 using System.Text;
7 using System.Collections.Generic;
8 using System.Text.RegularExpressions;
9 using System.Threading;
10 using System.Web;
11 using System.Web.UI.MobileControls;
12 ///
13 /// 网页类
14 ///
15 public class WebPage
16 {
17 #region 私有成员
18 private Uri m_uri; // URI of this page
19 private List m_links; // links found on this page. NOTE(review): the generic type argument looks stripped by HTML extraction; presumably List<Link>, since getLinks() adds Link objects to it - confirm
20 private string m_title; // page title
21 private string m_html; // raw HTML of the page
22 private string m_outstr; // plain-text rendering of the page
23 private bool m_good; // whether the page is usable
24 private int m_pagesize; // size of the page
25 private static Dictionary webcookies = new Dictionary();// cookies for all pages, shared across instances. NOTE(review): the generic type arguments look stripped by HTML extraction - TODO restore from the original source
26
27 #endregion
28
29 #region 属性
30
/// <summary>
/// Gets the absolute URL of this page. Read-only.
/// </summary>
public string URL
{
    get { return m_uri.AbsoluteUri; }
}
41
/// <summary>
/// Gets the title of this page. Read-only.
/// </summary>
/// <remarks>
/// The title is parsed out of m_html on first access and cached in m_title.
/// When no title element is found, m_title is returned unchanged.
/// </remarks>
public string Title
{
    get
    {
        // Parse lazily. Skip parsing when there is no HTML, because Regex.Match
        // throws ArgumentNullException on a null input.
        if (string.IsNullOrEmpty(m_title) && m_html != null)
        {
            // The named group must be called "title": that is the name the
            // Groups[...] lookup below reads.
            Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
            Match mc = reg.Match(m_html);
            if (mc.Success)
            {
                m_title = mc.Groups["title"].Value.Trim();
            }
        }
        return m_title;
    }
}
/// <summary>
/// Gets the raw HTML of this page. A null backing field is replaced
/// with an empty string before it is returned.
/// </summary>
public string M_html
{
    get
    {
        // Normalize null to "" once, storing the result back in the field.
        return m_html ?? (m_html = "");
    }
}
/// <summary>
/// Gets the links found on this page. Read-only.
/// </summary>
/// <remarks>
/// getLinks() only parses while m_links is still empty and always returns
/// m_links, so no separate emptiness check is needed here.
/// </remarks>
public List<Link> Links
{
    get { return getLinks(); }
}
81
82
/// <summary>
/// Gets the plain-text content of this page. Read-only.
/// </summary>
/// <remarks>
/// While m_outstr is still the empty string, getContext is called with
/// Int16.MaxValue first; the property then returns m_outstr itself.
/// </remarks>
public string Context
{
    get
    {
        bool notExtractedYet = string.Equals(m_outstr, "");
        if (notExtractedYet)
        {
            getContext(short.MaxValue);
        }
        return m_outstr;
    }
}
94
/// <summary>
/// Gets the size of this page, as stored in m_pagesize.
/// </summary>
public int PageSize
{
    get { return m_pagesize; }
}
/// <summary>
/// Gets the links on this page that point to the page's own host.
/// </summary>
/// <remarks>
/// Builds a URL pattern anchored on m_uri.Host and passes it to
/// getSpecialLinksByUrl, asking for at most Int16.MaxValue links.
/// The filtering itself is done by getSpecialLinksByUrl.
/// </remarks>
public List<Link> InsiteLinks
{
    get
    {
        // Escape the host so its dots match literally, accept both http and
        // https, and require the host to end right after the match so that
        // "a.com" does not also match "a.com.other.net".
        string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?:[:/?#]|$)";
        return getSpecialLinksByUrl(pattern, Int16.MaxValue);
    }
}
115
/// <summary>
/// Gets a value indicating whether this page is usable, as stored in m_good.
/// </summary>
public bool IsGood
{
    get { return m_good; }
}
/// <summary>
/// Gets the host name of the site this page belongs to.
/// </summary>
public string Host
{
    get { return m_uri.Host; }
}
136 #endregion
137
138
139 ///
140 /// Parses the link information out of the page HTML (m_html) and stores it in m_links.
141 ///
142 /// Returns m_links; parsing only runs while m_links is still empty.
143 private List getLinks()
144 {
145 if (m_links.Count == 0)
146 {
147 Regex[] regex = new Regex[2];
148 regex[0] = new Regex(@"(?[^]*>", RegexOptions.IgnoreCase); // NOTE(review): this pattern looks truncated and no assignment to regex[1] is visible (the embedded listing jumps from 148 to 150); markup inside the literals was probably stripped when the page was extracted - restore both patterns from the original source
150
151 for (int i = 0; i < 2; i++)
152 {
153 Match match = regex[i].Match(m_html);
154 while (match.Success)
155 {
156 try
157 {
158 string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["URL"].Value).AbsoluteUri); // resolve the captured URL against the page URI, then URL-decode it
159
160 string text = "";
161 if (i == 0) text = new Regex("(]+>)|(\\s)|( )|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["text"].Value, ""); // only the first pattern supplies link text; NOTE(review): this cleanup pattern also looks garbled by extraction - confirm against the original
162
163 Link link = new Link();
164 link.Text = text;
165 link.NavigateUrl = url;
166
167 m_links.Add(link);
168 }
169 catch (Exception ex) { Console.WriteLine(ex.Message); }; // a failure on one match is printed and the scan moves on to the next match
170 match = match.NextMatch();
171 }
172 }
173 }
174 return m_links;
175 }
176 ///
177 /// Extracts up to a given number of characters of plain text from a piece of HTML.
178 ///
179 /// instr: the HTML code.
180 /// firstN: how many characters to take from the start.
181 /// withLink: whether to keep the text inside links.
182 /// Returns the plain text, cut to at most firstN characters.
183 private string getFirstNchar(string instr, int firstN, bool withLink)
184 {
185 if (m_outstr == "") // the conversion runs only while m_outstr is empty; afterwards instr and withLink are ignored and the cached text is reused
186 {
187 m_outstr = instr.Clone() as string;
188 m_outstr = new Regex(@"(?m)]*>(\w|\W)*?]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, ""); // NOTE(review): lines 188-191 carry the same pattern text; the tag names were probably stripped when the page was extracted - restore from the original source
189 m_outstr = new Regex(@"(?m)]*>(\w|\W)*?]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
190 m_outstr = new Regex(@"(?m)]*>(\w|\W)*?]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, "");
191 if (!withLink) m_outstr = new Regex(@"(?m)]*>(\w|\W)*?]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, ""); // applied only when link text is not wanted; presumably meant to remove anchor elements - confirm
192 Regex objReg = new System.Text.RegularExpressions.Regex("(]+?>)| ", RegexOptions.Multiline | RegexOptions.IgnoreCase); // NOTE(review): this pattern also looks garbled by extraction - confirm
193 m_outstr = objReg.Replace(m_outstr, "");
194 Regex objReg2 = new System.Text.RegularExpressions.Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);
195 m_outstr = objReg2.Replace(m_outstr, " "); // collapse each run of whitespace into a single space
196
197 }
198 return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr; // cut to firstN characters when the text is longer
199 }
200
201
202 #region 公有文法
/// <summary>
/// Extracts up to a given number of characters of plain text from this page,
/// including the text inside links.
/// </summary>
/// <param name="firstN">Maximum number of characters to return.</param>
/// <returns>The result of getFirstNchar for m_html with link text included.</returns>
public string getContext(int firstN)
{
    const bool includeLinkText = true;
    string plainText = getFirstNchar(m_html, firstN, includeLinkText);
    return plainText;
}
212
213 ///
214 /// 此公有方法从本网页的链接中提取一定数量的链接,该链接的URL满足某正则式
215 ///
216 /// 正则式
217 /// 返回的链接的个数
218 /// List
219 public List getSpecialLinksByUrl(string pattern, int count)
220 {
221 if (m_links.Count == 0) getLinks();
222 List SpecialLinks = new List();
223 List.Enumerator i;
224 i = m_links.GetEnumerator();
225 int cnt = 0;
226 while (i.MoveNext() && cnt 1