c#抓取网页数据(附赠一个C#调用JS脚本的代码:)

优采云 发布时间: 2022-03-21 23:55

  c#抓取网页数据(附赠一个C#调用JS脚本的代码:)

  2021-09-24

  //不知道怎么删除,只好留着

  1. GET 方法(获取页面内容):

  // Download the page body as a string. WebClient is IDisposable, so it is
  // scoped with a using statement and released as soon as the request completes.
  string html;
  using (var web = new WebClient())
  {
      html = web.DownloadString(url);
  }

  2.POST 方法

   /// <summary>
   /// Sends <paramref name="queryString"/> to <paramref name="url"/> as the body of an
   /// HTTP POST and returns the response bytes decoded as UTF-8 text.
   /// </summary>
   /// <param name="web">Client used to issue the request.</param>
   /// <param name="url">Address the data is posted to.</param>
   /// <param name="queryString">
   /// Request body, sent as-is (it is not URL-encoded by this method).
   /// Format: paramname=value&amp;name2=value2
   /// </param>
   /// <param name="clearHeads">
   /// When true, every header currently set on <paramref name="web"/> is removed
   /// before the request is sent.
   /// </param>
   /// <returns>The response body decoded as UTF-8.</returns>
   /// <exception cref="System.ArgumentNullException"><paramref name="web"/> is null.</exception>
   public static string Post(this MyWebClient web, string url, string queryString, bool clearHeads = false)
   {
       if (web == null)
       {
           throw new System.ArgumentNullException(nameof(web));
       }

       // The body is the raw form data: one name=value pair per form field, which can be
       // worked out by hand or with a packet-capture tool. Values are not URL-encoded here
       // (WebUtility.UrlEncode is deliberately not applied to the whole string).
       // UTF-8 is used for the bytes; check the target page's encoding beforehand,
       // especially when the data contains Chinese characters.
       byte[] postData = Encoding.UTF8.GetBytes(queryString);
       web.RequestConentLength = postData.Length;

       if (clearHeads)
       {
           web.Headers.Clear();
       }

       // A form POST needs a Content-Type. Supply the default whenever the caller has not
       // set one, instead of only when the headers were cleared.
       if (string.IsNullOrEmpty(web.Headers["Content-Type"]))
       {
           web.Headers["Content-Type"] = "application/x-www-form-urlencoded";
       }

       byte[] responseData = web.UploadData(url, "POST", postData);
       return Encoding.UTF8.GetString(responseData);
   }

  3.请求头(Header)设置

   // Request headers for an XMLHttpRequest-style form post to wenshu.court.gov.cn.
   web.Headers.Add(HttpRequestHeader.Accept, "*/*");
   // NOTE(review): advertising gzip/deflate means the response may arrive compressed;
   // confirm the request enables automatic decompression or the caller decompresses it.
   web.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip, deflate");
   web.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.9");
   //web.Headers.Add(HttpRequestHeader.Connection, "keep-alive");
   web.Headers.Add("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
   web.Headers.Add(HttpRequestHeader.Host, "wenshu.court.gov.cn");
   web.Headers.Add("Origin", "http://wenshu.court.gov.cn");
   //web.Headers.Add("Proxy-Connection", "keep-alive");
   web.Headers.Add(HttpRequestHeader.UserAgent, userAgent);
   web.Headers.Add("X-Requested-With", "XMLHttpRequest");
   // The Referer has to stay a well-formed URL. Uri.AbsoluteUri percent-encodes only the
   // characters that need it (e.g. non-ASCII text in the query) and keeps "://" and "/"
   // intact, whereas WebUtility.UrlEncode would escape those separators as well.
   web.Headers.Add(HttpRequestHeader.Referer, new System.Uri(Referer1).AbsoluteUri);

  4.Cookie、超时等高可用基类

   /// <summary>
   /// A <see cref="WebClient"/> that applies a cookie container, a timeout and a content
   /// length to every HTTP request it creates, and remembers the most recent request and
   /// response.
   /// </summary>
   public class MyWebClient : WebClient
   {
       /// <summary>Cookie container assigned to every HTTP request created by this client.</summary>
       public CookieContainer Cookies;

       /// <summary>Creates a client that uses <paramref name="cookieContainer"/> for its requests.</summary>
       /// <param name="cookieContainer">Container stored in <see cref="Cookies"/>.</param>
       public MyWebClient(CookieContainer cookieContainer)
       {
           this.Cookies = cookieContainer;
       }

       /// <summary>Request timeout in seconds; defaults to 60.</summary>
       public int TimeoutSeconds { get; set; } = 60;

       /// <summary>The request most recently produced by <see cref="GetWebRequest"/>.</summary>
       public WebRequest Request { get; set; }

       /// <summary>Value copied into the Content-Length of each HTTP request.</summary>
       public int RequestConentLength;

       /// <summary>
       /// Creates the request for <paramref name="address"/>. HTTP requests are forced to
       /// POST and receive the cookie container, timeout and content length configured on
       /// this client; other request types are returned unchanged.
       /// </summary>
       protected override WebRequest GetWebRequest(Uri address)
       {
           WebRequest baseRequest = base.GetWebRequest(address);
           HttpWebRequest httpRequest = baseRequest as HttpWebRequest;

           if (httpRequest != null)
           {
               // HTTP method tokens are case-sensitive, so use the canonical upper-case form.
               httpRequest.Method = "POST";
               httpRequest.CookieContainer = Cookies;
               httpRequest.Timeout = 1000 * TimeoutSeconds;
               // NOTE(review): this is applied to every request, including downloads;
               // a stale non-zero value with no body written may be rejected - confirm callers
               // always set RequestConentLength before each call.
               httpRequest.ContentLength = RequestConentLength;
           }

           // Return the base request even when it is not an HttpWebRequest, so non-HTTP
           // addresses are not turned into a null request.
           Request = baseRequest;
           return baseRequest;
       }

       /// <summary>The response most recently returned by <see cref="GetWebResponse"/>.</summary>
       public WebResponse Response { get; set; }

       /// <summary>Obtains the response from the base class and records it in <see cref="Response"/>.</summary>
       protected override WebResponse GetWebResponse(WebRequest request)
       {
           this.Response = base.GetWebResponse(request);
           return this.Response;
       }

       /// <summary>
       /// Looks up a cookie by name among the cookies that apply to the URI of the most
       /// recent request.
       /// </summary>
       /// <param name="cookieName">Name of the cookie to read.</param>
       /// <returns>
       /// The cookie value, or null when the cookie does not exist, no request has been
       /// made yet, or no cookie container is set.
       /// </returns>
       public string GetCookieValue(string cookieName)
       {
           // Before the first request there is no URI to look cookies up for.
           if (this.Cookies == null || this.Request == null)
           {
               return null;
           }

           CookieCollection cookies = this.Cookies.GetCookies(this.Request.RequestUri);
           Cookie match = cookies[cookieName];
           return match?.Value;
       }
   }

  特别注意:模拟同一个浏览器会话时,需要为多个请求创建多个 WebClient 对象,但它们应该共享同一个 CookieContainer;而在编写爬虫并模拟多个浏览器会话时,不同的会话不应该共用同一个 CookieContainer 对象,以避免会话冲突。

  附上调用JS脚本的C#代码:

   /// <summary>
   /// Evaluates a JavaScript expression through the "ScriptControl" COM component.
   /// </summary>
   /// <param name="jsCall">Expression passed to the component's Eval method, e.g. "time(3, 5, '3 + 5')".</param>
   /// <param name="jsFunctions">
   /// Script source registered through AddCode before evaluation, e.g.
   /// "function time(a, b, msg){ var sum = a + b; return new Date().getTime() + ': ' + msg + ' = ' + sum }".
   /// </param>
   /// <returns>
   /// The evaluation result converted to a string, or null when the "ScriptControl"
   /// ProgID cannot be resolved or the evaluation yields null.
   /// </returns>
   public string CallJs(string jsCall , string jsFunctions)
   {
       Type scriptType = Type.GetTypeFromProgID("ScriptControl");
       if (scriptType == null)
       {
           return null;
       }

       object scriptControl = Activator.CreateInstance(scriptType);
       try
       {
           scriptType.InvokeMember("Language", BindingFlags.SetProperty, null, scriptControl, new object[] { "JavaScript" });
           scriptType.InvokeMember("AddCode", BindingFlags.InvokeMethod, null, scriptControl, new object[] { jsFunctions });

           object result = scriptType.InvokeMember("Eval", BindingFlags.InvokeMethod, null, scriptControl, new object[] { jsCall });

           // Eval can hand back null; avoid calling ToString() on it.
           return result?.ToString();
       }
       finally
       {
           // Release the COM instance deterministically instead of waiting for finalization.
           if (scriptControl != null && System.Runtime.InteropServices.Marshal.IsComObject(scriptControl))
           {
               System.Runtime.InteropServices.Marshal.ReleaseComObject(scriptControl);
           }
       }
   }

  使用示例:

  // The quotes inside the script must be escaped; otherwise the C# string literal
  // ends early and the line does not compile.
  string js = "function jsfunction(parm){ return parm + \"abc\"; }";
  // NOTE(review): csvar is placed inside single quotes in the script; a value that
  // contains a single quote would break the call - confirm or escape it first.
  string val = CallJs($"jsfunction('{csvar}')", js);

  分类:

  技术要点:

  相关文章:

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线