c#抓取网页数据(附赠一个C#调用JS脚本的代码:)
优采云 发布时间: 2022-03-21 23:55
C# 抓取网页数据(附赠一个 C# 调用 JS 脚本的代码)
2021-09-24
//不知道怎么删除,只好留着
1. GET 方法:
// Download the page at `url` as a string.
// WebClient implements IDisposable, so it is released as soon as the download finishes.
string html;
using (WebClient web = new WebClient())
{
    html = web.DownloadString(url);
}
2. POST 方法:
/// <summary>
/// Sends <paramref name="queryString"/> as a UTF-8 encoded form body in an HTTP POST to
/// <paramref name="url"/> and returns the response body decoded as UTF-8.
/// </summary>
/// <param name="web">Client used to send the request.</param>
/// <param name="url">Address to post to.</param>
/// <param name="queryString">
/// Form body, format: paramname=value&amp;name2=value2 (the original note showed '@' as the
/// separator, presumably a garbled '&amp;' - confirm). It is sent exactly as given, so the caller
/// is responsible for URL-encoding the values. Every "name" of the target form should be
/// included; a packet capture of the real request is the easiest way to find them.
/// A null value is treated as an empty body.
/// </param>
/// <param name="clearHeads">
/// When true, all headers currently set on <paramref name="web"/> are removed and only the
/// form Content-Type header is sent.
/// </param>
/// <returns>The response body decoded as UTF-8.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="web"/> or <paramref name="url"/> is null.
/// </exception>
public static string Post(this MyWebClient web, string url, string queryString, bool clearHeads=false)
{
    const string formContentType = "application/x-www-form-urlencoded";

    if (web == null)
    {
        throw new ArgumentNullException(nameof(web));
    }
    if (url == null)
    {
        throw new ArgumentNullException(nameof(url));
    }

    // Encode the body; check the target page's charset first, especially for Chinese text.
    byte[] postData = Encoding.UTF8.GetBytes(queryString ?? string.Empty);
    web.RequestConentLength = postData.Length;

    if (clearHeads)
    {
        web.Headers.Clear();
        web.Headers.Add("Content-Type", formContentType);
    }
    else if (string.IsNullOrEmpty(web.Headers[HttpRequestHeader.ContentType]))
    {
        // A form POST needs this header. WebClient moves Content-Type out of its Headers
        // collection when it builds a request, so a reused client may no longer carry it.
        web.Headers[HttpRequestHeader.ContentType] = formContentType;
    }

    byte[] responseData = web.UploadData(url, "POST", postData);
    return Encoding.UTF8.GetString(responseData);
}
3. 请求头(Header)设置:
// Request headers for an AJAX-style form POST to wenshu.court.gov.cn.
web.Headers.Add(HttpRequestHeader.Accept, "*/*");
// NOTE(review): advertising gzip/deflate lets the server compress the response; the body is
// only readable as text if the underlying request decompresses it - confirm on the client used.
web.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip, deflate");
web.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.9");
//web.Headers.Add(HttpRequestHeader.Connection, "keep-alive");
web.Headers.Add("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
web.Headers.Add(HttpRequestHeader.Host, "wenshu.court.gov.cn");
web.Headers.Add("Origin", "http://wenshu.court.gov.cn");
//web.Headers.Add("Proxy-Connection", "keep-alive");
web.Headers.Add(HttpRequestHeader.UserAgent, userAgent);
web.Headers.Add("X-Requested-With", "XMLHttpRequest");
// The Referer must remain a well-formed URL. Uri.AbsoluteUri percent-encodes non-ASCII
// characters but keeps the scheme, slashes and query delimiters intact, whereas
// WebUtility.UrlEncode would also encode "://" and "/" and yield an invalid Referer.
web.Headers.Add(HttpRequestHeader.Referer, new Uri(Referer1).AbsoluteUri);
4.Cookie、超时等高可用基类
/// <summary>
/// A <see cref="WebClient"/> that attaches a cookie container and a timeout to every HTTP
/// request, and exposes the most recent request and response objects.
/// </summary>
public class MyWebClient : WebClient
{
    /// <summary>Cookie container assigned to each HTTP request created by this client.</summary>
    public CookieContainer Cookies ;

    /// <summary>Creates a client that sends and stores cookies in <paramref name="cookieContainer"/>.</summary>
    /// <param name="cookieContainer">Container to use for the requests made by this client.</param>
    public MyWebClient(CookieContainer cookieContainer)
    {
        this.Cookies = cookieContainer;
    }

    /// <summary>Timeout applied to each HTTP request, in seconds. Defaults to 60.</summary>
    public int TimeoutSeconds { get; set; } = 60;

    /// <summary>The request most recently created by <see cref="GetWebRequest"/>.</summary>
    public WebRequest Request { get; set; }

    /// <summary>
    /// Kept so existing callers that assign it still compile. It is no longer copied onto the
    /// request: the base <see cref="WebClient"/> already sets Content-Length from the uploaded
    /// payload, and overwriting it with a stale or zero value made the request fail.
    /// </summary>
    public int RequestConentLength;

    /// <summary>
    /// Creates the request through the base class and, for HTTP requests, attaches the cookie
    /// container, the timeout and automatic gzip/deflate decompression.
    /// </summary>
    /// <param name="address">Address of the resource being requested.</param>
    /// <returns>The request created by the base class (never replaced by null for non-HTTP schemes).</returns>
    protected override WebRequest GetWebRequest(Uri address)
    {
        // Keep the base request itself: returning the result of an `as HttpWebRequest` cast
        // handed null back to WebClient for file:// or ftp:// addresses.
        WebRequest request = base.GetWebRequest(address);

        HttpWebRequest httpRequest = request as HttpWebRequest;
        if (httpRequest != null)
        {
            // The HTTP method is left as chosen by the base class (GET for downloads, the
            // caller's method for uploads) instead of forcing every request to POST.
            httpRequest.CookieContainer = Cookies;
            httpRequest.Timeout = 1000 * TimeoutSeconds;
            // Lets responses compressed with gzip/deflate be read as plain text.
            httpRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        }

        Request = request;
        return request;
    }

    /// <summary>The response most recently returned by <see cref="GetWebResponse"/>.</summary>
    public WebResponse Response { get; set; }

    /// <summary>Records the response produced by the base class and returns it.</summary>
    /// <param name="request">The request whose response is being fetched.</param>
    /// <returns>The response returned by the base class.</returns>
    protected override WebResponse GetWebResponse(WebRequest request)
    {
        this.Response = base.GetWebResponse(request);
        return this.Response;
    }

    /// <summary>
    /// Looks up a cookie by name among the cookies that apply to the URI of the most recent request.
    /// </summary>
    /// <param name="cookieName">Name of the cookie to read.</param>
    /// <returns>
    /// The cookie value, or null when the cookie does not exist, no request has been made yet,
    /// or no cookie container is set.
    /// </returns>
    public string GetCookieValue(string cookieName)
    {
        if (this.Cookies == null || this.Request == null)
        {
            return null;
        }

        CookieCollection cookies = this.Cookies.GetCookies(this.Request.RequestUri);
        Cookie ck = cookies[cookieName];
        return ck?.Value;
    }
}
特别注意:模拟同一个浏览器会话时,可以为多个请求分别创建多个 WebClient 对象,但它们应该共享同一个 CookieContainer;而在编写爬虫并模拟多个不同的浏览器会话时,每个会话应使用各自独立的 CookieContainer 对象,不应该都使用同一个,以避免会话冲突。
附上调用JS脚本的C#代码:
/// <summary>
/// Evaluates a JavaScript expression through the "ScriptControl" COM component after loading
/// the supplied function definitions into it.
/// </summary>
/// <param name="jsCall">
/// JavaScript expression to evaluate, e.g. <c>time(3, 5, '3 + 5')</c>.
/// It is executed as code, so never build it from untrusted input.
/// </param>
/// <param name="jsFunctions">
/// JavaScript source defining the functions used by <paramref name="jsCall"/>, e.g.
/// <c>function time(a, b, msg){ var sum = a + b; return new Date().getTime() + ': ' + msg + ' = ' + sum }</c>.
/// Null or empty means no code is loaded.
/// </param>
/// <returns>
/// The result converted to a string, or null when the "ScriptControl" ProgID is not registered
/// or the expression evaluates to no value.
/// </returns>
public string CallJs(string jsCall , string jsFunctions)
{
    Type scriptType = Type.GetTypeFromProgID("ScriptControl");
    if (scriptType == null)
    {
        return null;
    }

    object scriptControl = Activator.CreateInstance(scriptType);
    try
    {
        scriptType.InvokeMember("Language", BindingFlags.SetProperty, null, scriptControl, new object[] { "JavaScript" });

        if (!string.IsNullOrEmpty(jsFunctions))
        {
            scriptType.InvokeMember("AddCode", BindingFlags.InvokeMethod, null, scriptControl, new object[] { jsFunctions });
        }

        object result = scriptType.InvokeMember("Eval", BindingFlags.InvokeMethod, null, scriptControl, new object[] { jsCall });

        // Eval yields null when the script returns undefined/null; avoid a NullReferenceException.
        return result?.ToString();
    }
    finally
    {
        // Release the COM object deterministically instead of waiting for finalization.
        if (scriptControl != null && System.Runtime.InteropServices.Marshal.IsComObject(scriptControl))
        {
            System.Runtime.InteropServices.Marshal.ReleaseComObject(scriptControl);
        }
    }
}
使用示例:
// The JavaScript source contains double quotes, which must be escaped inside the C# literal.
string js = "function jsfunction(parm){ return parm + \"abc\"; }";
// NOTE: csvar is spliced into JavaScript source as-is; a value containing a quote or backslash
// breaks (or alters) the script, so escape it first if it can come from outside.
string val = CallJs($"jsfunction('{csvar}')", js);
分类:
技术要点:
相关文章: