c#抓取网页数据( 类的代码:2016年10月26日(周四) )

优采云 发布时间: 2021-11-28 19:16

  c#抓取网页数据(

类的代码:2016年10月26日(周四)

)

  正则表达式相关:C#抓取网页类(获取网页中的所有信息)

  类的代码:

<p> 1 using System;

2 using System.Data;

3 using System.Configuration;

4 using System.Net;

5 using System.IO;

6 using System.Text;

7 using System.Collections.Generic;

8 using System.Text.RegularExpressions;

9 using System.Threading;

10 using System.Web;

11 using System.Web.UI.MobileControls;

12 ///

13 /// 网页类

14 ///

15 public class WebPage

16 {

17 #region 私有成员

private Uri m_uri;          // Address of this page.
private List<Link> m_links; // Links found on this page; filled on demand by getLinks().
private string m_title;     // Page title; extracted on demand from m_html by the Title property.
private string m_html;      // HTML source of the page.
private string m_outstr;    // Plain-text rendering of the page; cached by getFirstNchar().
private bool m_good;        // Whether the page is usable.
private int m_pagesize;     // Size of the page.

// Cookies shared by all pages (static, so one store for every WebPage instance).
// NOTE(review): the generic type arguments were lost from the published listing;
// <string, CookieContainer> is assumed here - confirm against the code that uses webcookies.
private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

26

27 #endregion

28

29 #region 属性

30

/// <summary>
/// Gets the absolute URI of this page as a string (read-only).
/// </summary>
public string URL
{
    get { return m_uri.AbsoluteUri; }
}

41

/// <summary>
/// Gets the title of this page (read-only). The title is extracted from the
/// page's HTML the first time it is needed and then cached in m_title.
/// </summary>
public string Title
{
    get
    {
        // Treat a null title like an empty one so extraction still runs, and
        // skip extraction when there is no HTML to search.
        if (string.IsNullOrEmpty(m_title) && m_html != null)
        {
            // Matches <title ...>text</title ...>; the named group "title" captures the text.
            // (\w|\W) is used so the title may span line breaks.
            Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:\w|\W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
            Match mc = reg.Match(m_html);
            if (mc.Success)
            {
                m_title = mc.Groups["title"].Value.Trim();
            }
        }
        return m_title;
    }
}

/// <summary>
/// Gets the HTML source of this page (read-only). A null m_html is replaced
/// by an empty string before it is returned.
/// </summary>
public string M_html
{
    get
    {
        // Assign-and-return: a null field is overwritten with "" exactly once.
        return m_html ?? (m_html = "");
    }
}

/// <summary>
/// Gets all links found on this page (read-only). The links are parsed from
/// the HTML by getLinks() when the list is still empty.
/// </summary>
public List<Link> Links
{
    get
    {
        if (m_links.Count == 0)
        {
            getLinks();
        }
        return m_links;
    }
}

81

82

/// <summary>
/// Gets the plain text of this page (read-only). When the cached text is
/// empty it is first produced through getContext(Int16.MaxValue).
/// </summary>
public string Context
{
    get
    {
        if (m_outstr != "")
        {
            return m_outstr;
        }

        // The call fills m_outstr as a side effect; its (possibly truncated)
        // return value is deliberately not used here.
        getContext(Int16.MaxValue);
        return m_outstr;
    }
}

94

/// <summary>
/// Gets the size of this page (read-only).
/// </summary>
public int PageSize
{
    get { return m_pagesize; }
}

/// <summary>
/// Gets the links on this page that point to the page's own host (read-only).
/// </summary>
public List<Link> InsiteLinks
{
    get
    {
        // Accept both http and https, escape the host so its dots are matched
        // literally, and require the host to end there so that e.g.
        // "example.com.other.org" is not taken for "example.com".
        string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?:[:/?#]|$)";
        return getSpecialLinksByUrl(pattern, Int16.MaxValue);
    }
}

115

/// <summary>
/// Gets a value indicating whether this page is usable (read-only).
/// </summary>
public bool IsGood
{
    get { return m_good; }
}

/// <summary>
/// Gets the host (web site) part of this page's URI (read-only).
/// </summary>
public string Host
{
    get { return m_uri.Host; }
}

136 #endregion

137

138

/// <summary>
/// Parses the page's HTML for links and stores them in m_links. Nothing is
/// parsed when m_links already has entries or when there is no HTML.
/// </summary>
/// <returns>The list of links of this page (m_links).</returns>
private List<Link> getLinks()
{
    if (m_links.Count == 0 && m_html != null)
    {
        // NOTE(review): the patterns in the published listing were destroyed by tag
        // stripping (and the regex[1] assignment was missing altogether, leaving a
        // null entry). The two patterns below are re-written to provide the "URL"
        // and "text" groups the loop reads - confirm against the original source.
        Regex[] regex = new Regex[2];
        // [0]: <a ... href=URL ...>text</a   (anchor links, with link text)
        regex[0] = new Regex(@"<a\s[^><]*?href\s*=\s*(""|')?(?<URL>[^>""'#\s]+)(""|')?[^>]*>(?<text>(\w|\W)*?)</a", RegexOptions.IgnoreCase);
        // [1]: <frame ... src=URL ...> / <iframe ... src=URL ...>   (no link text)
        regex[1] = new Regex(@"<i?frame[^><]+src\s*=\s*(""|')?(?<URL>[^>""'\s]+)(""|')?[^>]*>", RegexOptions.IgnoreCase);

        // Removes nested tags, whitespace, &nbsp;, '&' and '"' from the link text.
        // Built once instead of once per match.
        Regex textCleaner = new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.Multiline | RegexOptions.IgnoreCase);

        for (int i = 0; i < regex.Length; i++)
        {
            for (Match match = regex[i].Match(m_html); match.Success; match = match.NextMatch())
            {
                try
                {
                    // Resolve the (possibly relative) address against this page's URI.
                    string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["URL"].Value).AbsoluteUri);

                    // Only the anchor pattern captures link text.
                    string text = "";
                    if (i == 0)
                    {
                        text = textCleaner.Replace(match.Groups["text"].Value, "");
                    }

                    Link link = new Link();
                    link.Text = text;
                    link.NavigateUrl = url;

                    m_links.Add(link);
                }
                catch (Exception ex)
                {
                    // Best effort: a link that cannot be turned into a URI is
                    // reported and skipped; the remaining links are still collected.
                    Console.WriteLine(ex.Message);
                }
            }
        }
    }
    return m_links;
}

/// <summary>
/// Extracts plain text from a piece of HTML and returns at most the first
/// <paramref name="firstN"/> characters of it.
/// The plain text is cached in m_outstr: once it is non-empty, later calls
/// return a prefix of the cached text and ignore <paramref name="instr"/> and
/// <paramref name="withLink"/>.
/// </summary>
/// <param name="instr">HTML source to convert.</param>
/// <param name="firstN">Maximum number of characters to return, counted from the start.</param>
/// <param name="withLink">Whether the text inside links (a elements) is kept.</param>
/// <returns>The plain text, cut to at most <paramref name="firstN"/> characters.</returns>
private string getFirstNchar(string instr, int firstN, bool withLink)
{
    // A null cache is treated like an empty one so the text is still produced.
    if (string.IsNullOrEmpty(m_outstr))
    {
        const RegexOptions options = RegexOptions.Multiline | RegexOptions.IgnoreCase;
        string text = instr ?? "";

        // Drop elements whose content is not readable page text.
        text = Regex.Replace(text, @"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", "", options);
        text = Regex.Replace(text, @"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", "", options);
        text = Regex.Replace(text, @"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", "", options);
        if (!withLink)
        {
            // Remove whole links, including their text.
            text = Regex.Replace(text, @"(?m)<a[^>]*>(\w|\W)*?</a[^>]*>", "", options);
        }

        // Remove every remaining tag and the &nbsp; entity.
        text = Regex.Replace(text, "(<[^>]+?>)|&nbsp;", "", options);
        // Collapse each run of whitespace into a single space.
        text = Regex.Replace(text, "(\\s)+", " ", options);

        m_outstr = text;
    }
    return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
}

200

201

202 #region 公有方法

/// <summary>
/// Returns at most the first <paramref name="firstN"/> characters of the
/// page's plain text, link text included.
/// </summary>
/// <param name="firstN">Maximum number of characters to return.</param>
/// <returns>The plain text produced by getFirstNchar for this page's HTML.</returns>
public string getContext(int firstN)
{
    string plainText = getFirstNchar(m_html, firstN, true);
    return plainText;
}

212

213 ///

214 /// 此公有方法从本网页的链接中提取一定数量的链接,该链接的URL满足某正则式

215 ///

216 /// 正则式

217 /// 返回的链接的个数

218 /// List

219 public List getSpecialLinksByUrl(string pattern, int count)

220 {

221 if (m_links.Count == 0) getLinks();

222 List SpecialLinks = new List();

223 List.Enumerator i;

224 i = m_links.GetEnumerator();

225 int cnt = 0;

226 while (i.MoveNext() && cnt 1

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线