轻松获取微信公众号文章链接,phpspider爬虫框架教程!
优采云 发布时间: 2023-03-21 15:30

微信公众号作为一种新兴的媒体形式,越来越受到人们的关注,而如何获取微信公众号文章链接,也成为了很多人关心的问题。本文将详细介绍如何使用phpspider爬虫框架获取微信公众号文章链接。
1.爬取目标明确
首先要明确我们要爬取的是哪个公众号的文章。可以通过微信公众平台后台查看该公众号的历史消息,找到需要爬取的文章链接。
2.安装phpspider
phpspider是一个开源的PHP爬虫框架,具有容易上手、功能强大等特点。可以通过composer安装,具体步骤可参考官方文档。
3.抓取列表页
通过Fiddler等工具可以抓取到微信公众号历史消息页面的URL,然后使用phpspider抓取该页面,并解析出文章列表页中所有文章的URL。
php
<?php
// Crawl the history ("profile_ext") page of one WeChat official account with
// phpspider and export the extracted "url" field to a CSV file.
require_once './vendor/autoload.php';
use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;

// History-page URL of the target account; it is the only seed URL below.
$public_account_url = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzI0MjEwNDg0MA==&scene=126&bizpsid=0#wechat_redirect";

// Field rules, written as one entry per field with "name" and "selector".
$rules = [
    [
        'name' => 'url',
        // NOTE(review): the class suffix below looks like a placeholder left
        // behind by text extraction, not a real class name on the page.
        // Replace it with the class of the list-item element before running.
        'selector' => "//div[contains(@class, 'weui_media_126e05b3e2d9883e5b70518117bf6294')]/@href",
    ],
];

// Cookie header of a logged-in WeChat session. The literal is single-quoted
// on both ends; substitute a value captured from your own session.
requests::set_cookies('_ga=GA1.2.1735125465.1647779698;_gid=GA1.2.1924099627.1647779698; rewardsn=; wxtokenkey=777; wxuin=561518569; devicetype=Windows10; version=63010201; lang=zh_CN; pass_ticket=BsU6YDv6H2QyfX9gk%2B%2BxFpWz7TJhKloRfYVvLJGnWc%3D; wap_sid2=CJvMxqoBEooBdzhDWm5BS1l5b3ZmWWlXSEd0RERMbVhJN1ZtUGkyaU9STnNwVjZrNkpWLVVXaXgzQmFneGZzSWQxN1d3ZXhpdEhvcFRweGtqeFZfRnA5eHNEaFByZElEQUFBfjD+tv7lBTgNQAE=');
requests::set_useragent("Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36");

$config = [
    'name' => 'weixin',
    'log_show' => true,
    'tasknum' => 1,
    'interval' => 1000,
    'max_try' => 5,
    'domains' => [
        'mp.weixin.qq.com',
        'weixin.qq.com',
    ],
    'scan_urls' => [
        $public_account_url,
    ],
    'list_url_regexes' => [
        "https://mp.weixin.qq.com/mp/profile_ext.*",
    ],
    // NOTE(review): no content-page pattern is configured here; confirm that
    // phpspider applies "fields" to list pages, otherwise add a pattern.
    'content_url_regexes' => [],
    'export' => [
        'type' => 'csv',
        'file' => './data/weixin.csv',
    ],
    'fields' => $rules,
];

$spider = new phpspider($config);
$spider->start();
4.解析详情页内容
在获取到文章列表页中每篇文章的URL之后,我们需要进一步访问该URL获取文章详情页内容。使用phpspider的on_extract_page等页面回调函数,可以在每次请求详情页时进行数据解析和存储;下面的示例则直接通过详情页URL规则和fields配置来抽取文章的标题与正文。
php
<?php
// Crawl one WeChat official account with phpspider: start from its history
// page, treat matching article URLs as detail pages, extract the title and
// body of each article, and export the rows to a CSV file.
require_once './vendor/autoload.php';
use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;

// History-page URL of the target account; it is the only seed URL below.
$public_account_url = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzI0MjEwNDg0MA==&scene=126&bizpsid=0#wechat_redirect";

// Cookie header of a logged-in WeChat session. The literal is single-quoted
// on both ends; substitute a value captured from your own session.
requests::set_cookies('_ga=GA1.2.1735125465.1647779698;_gid=GA1.2.1924099627.1647779698; rewardsn=; wxtokenkey=777; wxuin=561518569; devicetype=Windows10; version=63010201; lang=zh_CN; pass_ticket=BsU6YDv6H2QyfX9gk%2B%2BxFpWz7TJhKloRfYVvLJGnWc%3D; wap_sid2=CJvMxqoBEooBdzhDWm5BS1l5b3ZmWWlXSEd0RERMbVhJN1ZtUGkyaU9STnNwVjZrNkpWLVVXaXgzQmFneGZzSWQxN1d3ZXhpdEhvcFRweGtqeFZfRnA5eHNEaFByZElEQUFBfjD+tv7lBTgNQAE=');
requests::set_useragent("Mozilla/5.0(Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36");

$config = [
    'name' => 'weixin',
    'log_show' => true,
    'tasknum' => 1,
    'interval' => 1000,
    'max_try' => 5,
    'domains' => [
        'mp.weixin.qq.com',
        'weixin.qq.com',
    ],
    'scan_urls' => [
        $public_account_url,
    ],
    'list_url_regexes' => [
        "https://mp.weixin.qq.com/mp/profile_ext.*",
    ],
    // URL patterns that identify article detail pages. Written without
    // surrounding "/" delimiters, matching the form of list_url_regexes.
    // NOTE(review): the pattern requires a literal "&amp;" in the URL;
    // confirm that discovered links are still HTML-entity-encoded.
    'content_url_regexes' => [
        'https://mp\.weixin\.qq\.com/s\?src.*&amp;',
    ],
    // Fields extracted from each detail page (XPath selectors).
    'fields' => [
        [
            'name' => 'title',
            'selector' => "//div[@class='rich_media_area_primary_inner']//h2[@class='rich_media_title']",
            'required' => true,
        ],
        [
            'name' => 'content',
            // The trailing space inside the class value is kept from the
            // original selector; an exact @class comparison depends on it.
            'selector' => "//div[@class='rich_media_content ']",
            'required' => true,
        ],
    ],
    // Export settings.
    'export' => [
        'type' => 'csv',
        // Output file path.
        'file' => './data/weixin.csv',
    ],
];

$spider = new phpspider($config);
$spider->start();
5.总结
本文详细介绍了如何使用phpspider框架获取微信公众号文章链接,并提供了完整代码示例。需要注意的是,在实际应用中要遵守相关法律法规,不得用于非法用途。