php 抓取网页标题(风流不在谈锋胜,袖手无言味最长--C#)
优采云 发布时间: 2022-02-22 07:14 php 抓取网页标题(风流不在谈锋胜,袖手无言味最长--C#)
代码描绘生活 2020-08-20 173次浏览 原C#抓取带有网页参数的img src图片链接并下载
关键词:
风流不在谈锋胜,袖手无言味最长。本文主要介绍如何用 C# 抓取网页中带有参数的 img src 图片链接并下载图片,希望对大家有所帮助。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;
namespace ImageCollection
{
    /// <summary>
    /// Main window of the image collector. Fetches a web page, shows its title and
    /// downloads the images it references (restricted to <see cref="ImageHost"/>)
    /// into the local image folder.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Folder under the application base directory that receives the downloaded images.</summary>
        private static readonly string Path = System.IO.Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "img");

        /// <summary>Only images served from this host are downloaded.</summary>
        private const string ImageHost = "www.xxx.com";

        /// <summary>Matches an img tag and captures the value of its src attribute in the "imgUrl" group.</summary>
        private static readonly Regex ImgTagRegex = new Regex(
            @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
            RegexOptions.IgnoreCase);

        /// <summary>Captures the text between the opening and closing title tags in the "title" group.</summary>
        private static readonly Regex TitleRegex = new Regex(
            @"<title[^>]*>(?<title>.*?)</title>",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler of the "scrape" button: validates the URL typed by the user
        /// and starts the scraping on a background thread so the UI stays responsive.
        /// </summary>
        private void btnshuaqu_Click(object sender, EventArgs e)
        {
            string url = txturl.Text.Trim();
            if (string.IsNullOrEmpty(url))
            {
                MessageBox.Show("请输入URl");
                return;
            }

            AppendLog("开始抓取中:");
            Thread th = new Thread(() => ShuaQu(url)) { IsBackground = true };
            th.Start();
        }

        /// <summary>
        /// Downloads the page at <paramref name="url"/>, displays its title and starts one
        /// background download per image hosted on <see cref="ImageHost"/>.
        /// </summary>
        /// <param name="url">Address of the page to scrape.</param>
        private void ShuaQu(string url)
        {
            // Every run starts from an empty output folder.
            if (System.IO.Directory.Exists(Path))
            {
                System.IO.Directory.Delete(Path, true);
            }
            System.IO.Directory.CreateDirectory(Path);

            string result = WebHttp.HttpGet(url, null, 3);
            if (string.IsNullOrEmpty(result))
            {
                // Without a page body there is nothing to match; the regex calls below would throw on null.
                AppendLog("未获取到网页内容,抓取终止!");
                return;
            }

            string[] str = GetHtmlImageUrlList(result);
            AppendLog("已经获取到数据!" + str.Length);

            // Extract the page title and strip backslashes and double quotes from it.
            string title = TitleRegex.Match(result).Groups["title"].Value;
            string cleanTitle = Regex.Replace(title, @"[\\""]+", "").Trim();
            txttitle.Invoke(new Action(() =>
            {
                txttitle.Text = cleanTitle;
            }));

            foreach (string s in str)
            {
                // Relative or malformed src values cannot be host-checked; skip them instead of
                // letting a UriFormatException terminate this worker thread.
                Uri u;
                if (!Uri.TryCreate(s, UriKind.Absolute, out u) || u.Host != ImageHost)
                {
                    continue;
                }

                // Copy the loop variable so every thread captures its own URL.
                string imageUrl = s;
                Thread downimg = new Thread(() => DownloadImageSafely(imageUrl)) { IsBackground = true };
                downimg.Start();
                AppendLog(imageUrl);
            }

            // Note: the download threads are not joined, so transfers may still be running here.
            AppendLog("全部抓取完成!");
        }

        /// <summary>
        /// Thread entry point for a single download. Reports a failure in the log box,
        /// because an exception escaping a thread would terminate the whole process.
        /// </summary>
        /// <param name="imageUrl">Absolute URL of the image to download.</param>
        private void DownloadImageSafely(string imageUrl)
        {
            try
            {
                Get_img(imageUrl);
            }
            catch (Exception ex)
            {
                AppendLog("下载失败:" + imageUrl + " " + ex.Message);
            }
        }

        /// <summary>
        /// Appends one line to the log box, marshalling to the UI thread when required.
        /// </summary>
        /// <param name="message">Text to append; a line break is added automatically.</param>
        private void AppendLog(string message)
        {
            string line = message + "\r\n";
            if (txtimg.InvokeRequired)
            {
                txtimg.Invoke(new Action(() => txtimg.AppendText(line)));
            }
            else
            {
                txtimg.AppendText(line);
            }
        }

        /// <summary>
        /// Downloads one image into the image folder. The local file name is the last
        /// path segment of the URL with the query string removed.
        /// </summary>
        /// <param name="imgpath">URL of the image; may carry a query string.</param>
        /// <exception cref="ArgumentNullException"><paramref name="imgpath"/> is null.</exception>
        /// <exception cref="WebException">The download failed.</exception>
        public void Get_img(string imgpath)
        {
            if (imgpath == null)
            {
                throw new ArgumentNullException("imgpath");
            }

            // Everything after '?' is request parameters and must not end up in the file name.
            string[] file = imgpath.Split('?');
            string name = System.IO.Path.GetFileName(file[0]);

            using (WebClient mywebclient = new WebClient())
            {
                mywebclient.DownloadFile(imgpath, System.IO.Path.Combine(Path, name));
            }
        }

        /// <summary>
        /// Gets the URLs of all images in a piece of HTML.
        /// </summary>
        /// <param name="sHtmlText">HTML source.</param>
        /// <returns>The src value of every img tag, in document order.</returns>
        private string[] GetHtmlImageUrlList(string sHtmlText)
        {
            MatchCollection matches = ImgTagRegex.Matches(sHtmlText);
            string[] sUrlList = new string[matches.Count];
            int i = 0;
            foreach (Match match in matches)
            {
                sUrlList[i++] = match.Groups["imgUrl"].Value;
            }
            return sUrlList;
        }
    }
}
#region 下载图片到Image
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into an image
/// that is independent of the download buffer.
/// </summary>
/// <param name="url">Address of the image to download.</param>
/// <returns>A bitmap copy of the downloaded image; the caller is responsible for disposing it.</returns>
public static Image UrlToImage(string url) {
    using (WebClient mywebclient = new WebClient()) {
        byte[] Bytes = mywebclient.DownloadData(url);
        using (MemoryStream ms = new MemoryStream(Bytes))
        using (Image decoded = Image.FromStream(ms)) {
            // An image created by FromStream needs its stream kept open for its whole lifetime.
            // Return a copy so the result stays usable after the stream is disposed here.
            return new Bitmap(decoded);
        }
    }
}
#endregion
以上就是用 C# 抓取网页中带有参数的 img src 图片链接并下载图片的全部内容。如果未能解决您的问题,请参考以下文章:
相关文章
C#将图片保存为Base64格式
jquery写的,把网站中的所有图片src变成URL格式的demo
当我用cheerio抓取img src时,我得到一个巨大的字符串,而不仅仅是链接
php远程下载图片
网页图片热点链接及坐标值
beautifulsoup 库简单地抓取网页——获取所有链接示例
从网页中抓取图像
html 基础知识(imga 列表)