Scraping web page content with PHP (list-page crawling code for two recently scraped sites; the first uses the phpQuery plugin)

优采云 Published: 2021-10-11 19:20


Below is the code I recently used to scrape content from two websites.

List-page crawling: the first site uses the phpQuery plugin, which makes extraction quick; the second site exposes an API, so the data is fetched directly.

load_third("phpQuery.php");

/********* www.sosobtc.com *********/

$re = phpQuery::newDocumentFile('https://www.sosobtc.com/news/all'); // URL of the news list page to crawl
$data = array();

// Collect the target URL of each list item
foreach (pq('.news-list .news-thumbnail a') as $key => $value) {
    $href = $value->getAttribute('href');
    $data[$key]['source_url'] = "https://www.sosobtc.com" . $href;
}

// Collect the titles
foreach (pq('.news-list .news-title h3') as $key => $value) {
    $data[$key]['title'] = pq($value)->text();
}

// Collect the cover image URL and publish time, both encoded in the share link
foreach (pq('.news-list .share-box ul') as $key => $value) {
    $re = pq($value)->find('li')->eq(0)->find('a')->attr('href');
    $str = strrchr($re, "&");            // last query parameter of the share link
    $arr = explode("=", $str);
    $data[$key]['pic'] = $arr[1];        // cover image URL
    $str2 = explode("/", $arr[1]);
    $data[$key]['add_time'] = strtotime($str2[5]); // date segment of the image path
}

// Collect the original source of each item
foreach (pq('.category') as $key => $value) {
    $data[$key]['source'] = pq($value)->text();
}

foreach ($data as $v) {
    $adddata['title'] = $v['title'];
    $adddata['source_url'] = $v['source_url'];
    $adddata['add_time'] = $v['add_time'];
    $adddata['pic'] = $v['pic'];
    $adddata['source'] = $v['source'];
    $result = News::add($adddata);
    if (!$result['insert_id']) {
        // var_export() returns a string; var_dump() only prints and returns null
        file_put_contents("/data/log/fail_spider.log", var_export($result, true) . "," . $v['source_url'] . "," . $v['pic'] . "\r\n", FILE_APPEND);
    }
}

/********* www.sosobtc.com *********/

/********* www.36kr.com *********/

$result = file_get_contents("http://36kr.com/api/search-column/208?per_page=20&page=1");
if (!$result) {
    die;
}
$result = json_decode($result, true);
if (count($result['data']['items']) == 0) {
    die;
}

foreach ($result['data']['items'] as $k => $v) {
    $sdata['add_time'] = strtotime($v['published_at']);
    $sdata['title'] = $v['title'];
    $sdata['pic'] = $v['template_info']['template_cover'][0];
    $info = json_decode($v['user_info'], true);
    $sdata['source'] = $info['name'];
    $sdata['source_url'] = "http://36kr.com/p/" . $v['id'] . ".html";
    $re = News::add($sdata);
    if (!$re['insert_id']) {
        // Log $sdata here: the raw API item ($v) has no source_url or pic keys
        file_put_contents("/data/log/fail_spider.log", var_export($re, true) . "," . $sdata['source_url'] . "," . $sdata['pic'] . "\r\n", FILE_APPEND);
    }
}

/********* www.36kr.com *********/

First the list content is fetched, then the details are crawled one by one using each list item's target URL.

Detail-page crawling:

load_third("phpQuery.php");

// Download a sosobtc cover image, keeping the date segment of the source URL as the sub-directory
function download($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
    $file = curl_exec($ch);
    curl_close($ch);

    $filename = pathinfo($url, PATHINFO_BASENAME);
    $path = '/data/xxxxx.com/phone/wwwroot/upimg/'; // make sure this directory is writable
    $dirarr = explode("/", $url);
    $path .= $dirarr[5] . "/";
    if (!is_dir($path)) mkdir($path, 0755, true);

    $resource = fopen($path . $filename, 'w'); // overwrite rather than append
    fwrite($resource, $file);
    fclose($resource);
    return "/" . $dirarr[5] . "/" . $filename;
}

// Download a 36kr cover image into a directory named after today's date
function download2($url)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
    $file = curl_exec($ch);
    curl_close($ch);

    $filename = pathinfo($url, PATHINFO_BASENAME) . ".jpg";
    $path = '/data/xxxxx.com/phone/wwwroot/upimg/'; // make sure this directory is writable
    $path .= date("Ymd") . "/";
    if (!is_dir($path)) mkdir($path, 0755, true);

    $resource = fopen($path . $filename, 'w'); // overwrite rather than append
    fwrite($resource, $file);
    fclose($resource);
    return "/" . date("Ymd") . "/" . $filename;
}

$result = News::getdown();
if (count($result) == 0) {
    exit(2);
}

foreach ($result as $v) {
    if (strpos($v['source_url'], 'sosobtc') !== false) {
        $path = download($v['pic']); // save the cover image locally
        $re = phpQuery::newDocumentFile($v['source_url']); // load the article detail page
        $content = pq(".article-main")->html();
        $data['pic'] = $path;
        $data['content'] = addslashes(trim($content));
        $data['status'] = 1;
        $result = News::modify($v['id'], $data);
        if (!$result) {
            file_put_contents("/data/log/fail_spiderdown.log", $v['id'] . "|" . var_export($result, true) . "|" . json_encode($data) . "\r\n", FILE_APPEND);
        }
    } else if (strpos($v['source_url'], '36kr') !== false) {
        $path = download2($v['pic']); // save the cover image locally
        $re = file_get_contents($v['source_url']); // fetch the article detail page
        // The article body is embedded in a JS variable, so pull it out with a regex
        preg_match("/var props=(.*),locationnal={/", $re, $match);
        $info = json_decode($match[1], true);
        $content = $info['detailArticle|post']['content'];
        $data['pic'] = $path;
        $data['content'] = $content;
        $data['status'] = 1;
        $result = News::modify($v['id'], $data);
        if (!$result) {
            file_put_contents("/data/log/fail_spiderdown.log", $v['id'] . "|" . var_export($result, true) . "|" . json_encode($data) . "\r\n", FILE_APPEND);
        }
    }
}

For the first site, the detail content is again extracted with phpQuery. For the second, a look at the page source shows the article data is lazy-loaded by JavaScript, so I match the part I need directly with a PHP regular expression. Cover images from both sites are downloaded locally; the upimg directory must be writable, otherwise creating the per-date sub-directories will fail. One more point: the MySQL field that stores the target URL, source_url, carries a unique index, so running both scripts on a daily schedule keeps pulling in the latest items without ever inserting duplicates.
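To make the duplicate-prevention point concrete, here is a minimal PDO sketch of the same idea. The table name news, the column list, and the connection credentials are assumptions for illustration, not the actual schema behind News::add(): with a unique index on source_url, re-inserting an already-stored URL is skipped and no insert id comes back, which is exactly the condition the !$result['insert_id'] checks above react to.

<?php
// Minimal sketch, assuming a hypothetical `news` table and local MySQL credentials.
$pdo = new PDO('mysql:host=127.0.0.1;dbname=spider;charset=utf8mb4', 'user', 'pass');

// One-time setup: a unique index so each target URL can be stored only once.
$pdo->exec("ALTER TABLE news ADD UNIQUE KEY uniq_source_url (source_url)");

// INSERT IGNORE silently skips rows whose source_url already exists.
$stmt = $pdo->prepare("INSERT IGNORE INTO news (title, source_url, add_time) VALUES (?, ?, ?)");
$stmt->execute(['Example title', 'https://www.sosobtc.com/news/all', time()]);

if ($pdo->lastInsertId() == 0) {
    // Duplicate source_url: nothing was inserted, so a daily re-run can safely crawl the same list again.
}

Whether a duplicate is skipped with INSERT IGNORE, rejected by a plain INSERT, or upserted with ON DUPLICATE KEY UPDATE is a design choice; the crawler only needs the failed insert to be detectable.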
