c#抓取网页数据(魔兽世界几个坑说下就是)

优采云 发布时间: 2021-10-30 02:00

  c#抓取网页数据(魔兽世界几个坑说下就是)

  这里有几个坑:

  第一个是爬取网站时记得使用代理 IP:第一次忘了设置代理,结果 IP 被封了。

  第二个是要判断网页内容是否被压缩(例如 GZip):第一次没有做这个判断,结果没有得到正确的内容。

  

/// <summary>
/// Downloads a page with an HTTP GET request sent through a fixed proxy and returns
/// the body as text, decompressing it first when the server replied with gzip.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="post_parament">Not read by this method; kept so existing callers keep compiling.</param>
/// <returns>The response body decoded with <see cref="Encoding.Default"/>.</returns>
public string HttpGet(string url, string post_parament)
{
    HttpWebRequest Web_Request = (HttpWebRequest)WebRequest.Create(url);
    Web_Request.Timeout = 30000;
    Web_Request.Method = "GET";
    Web_Request.UserAgent = "Mozilla/4.0";

    // Only advertise gzip: it is the only encoding decoded below. Offering
    // "deflate" as well would let the server send a body this method returns
    // as undecoded garbage.
    Web_Request.Headers.Add("Accept-Encoding", "gzip");

    // Route the request through a proxy; it must be assigned before the request is sent.
    WebProxy proxy = new WebProxy("111.13.7.120", 80);
    Web_Request.Proxy = proxy;

    // Dispose the response as well as its stream so the underlying connection is released.
    using (HttpWebResponse Web_Response = (HttpWebResponse)Web_Request.GetResponse())
    using (Stream Stream_Receive = Web_Response.GetResponseStream())
    {
        // Ordinal, case-insensitive comparison: a culture-sensitive ToLower()
        // can fail to match "GZIP" under some cultures (e.g. Turkish).
        bool isGzip = string.Equals(
            Web_Response.ContentEncoding,
            "gzip",
            System.StringComparison.OrdinalIgnoreCase);

        Stream bodyStream = Stream_Receive;
        if (isGzip)
        {
            bodyStream = new GZipStream(Stream_Receive, CompressionMode.Decompress);
        }

        // NOTE(review): Encoding.Default is kept from the original; it ignores the
        // charset declared by the server, so confirm it matches the target sites.
        using (StreamReader Stream_Reader = new StreamReader(bodyStream, Encoding.Default))
        {
            return Stream_Reader.ReadToEnd();
        }
    }
}

  二、下面是使用正则表达式处理网页内容的部分。因为对正则表达式不熟悉,代码里重复的操作比较多。

  1.先获取网页内容

  

// Create the page fetcher, then download the HTML for the address held in Url_Txt.Text.
IWebHttpRepository webHttpRepository = new WebHttpRepository();
string targetUrl = Url_Txt.Text;
string html = webHttpRepository.HttpGet(targetUrl, "");

  2.获取书名和文章列表

  标题

  

  文章列表

  

<p>

string Novel_Name = Regex.Match(html, @"(?]+)\1[^>]*>(?(?:(?!

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线