轻松获取微信公众号文章链接,phpspider爬虫框架教程!

优采云 发布时间: 2023-03-21 15:30

  微信公众号作为一种新兴的媒体形式,越来越受到人们的关注。而如何获取微信公众号文章链接,成为了很多人关注的问题。本文将详细介绍如何使用phpspider爬虫框架获取微信公众号文章链接。

  1.爬取目标明确

  首先要明确我们要爬取的是哪个公众号的文章。可以通过微信公众平台后台查看该公众号的历史消息,找到需要爬取的文章链接。

  

  2.安装phpspider

  phpspider是一个开源的PHP爬虫框架,具有容易上手、功能强大等特点。可以通过composer安装(在项目目录下执行 composer require owner888/phpspider),具体步骤可参考官方文档。

  3.抓取列表页

  

  通过Fiddler等工具可以抓取到微信公众号历史消息页面的URL,然后使用phpspider抓取该页面,并解析出文章列表页中所有文章的URL。

  示例代码(PHP):

<?php

/**
 * Step 3 example: fetch a WeChat official-account history page with phpspider
 * and export the article URLs found on it to a CSV file.
 *
 * The session cookie can be supplied through the WEIXIN_COOKIE environment
 * variable; when that variable is unset or empty, the sample cookie embedded
 * below is used instead.
 */

require_once './vendor/autoload.php';

use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;

// History page of the target account, as captured with a proxy tool such as Fiddler.
$public_account_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzI0MjEwNDg0MA==&scene=126&bizpsid=0#wechat_redirect';

// phpspider expects `fields` to be a list of definitions, each with a `name`
// and a `selector` (XPath by default), not a map of name => [css, attribute].
$rules = [
    [
        'name'     => 'url',
        // NOTE(review): the class name in the published example was garbled.
        // `weui_media_box` entries carrying the link in an `hrefs` (or `href`)
        // attribute is an assumption about the history-page markup; confirm
        // against the HTML that is actually returned.
        'selector' => "//div[contains(@class, 'weui_media_box')]/@*[name()='hrefs' or name()='href']",
        'required' => true,
        // One history page lists many articles.
        // NOTE(review): confirm how the CSV exporter serialises multi-valued fields.
        'repeated' => true,
    ],
];

// Prefer a cookie injected from the environment so a fresh session can be
// used without editing the script.
$cookie = getenv('WEIXIN_COOKIE');
if ($cookie === false || $cookie === '') {
    // Sample value from the original example; presumably expired, replace before running.
    $cookie = '_ga=GA1.2.1735125465.1647779698;_gid=GA1.2.1924099627.1647779698; rewardsn=; wxtokenkey=777; wxuin=561518569; devicetype=Windows10; version=63010201; lang=zh_CN; pass_ticket=BsU6YDv6H2QyfX9gk%2B%2BxFpWz7TJhKloRfYVvLJGnWc%3D; wap_sid2=CJvMxqoBEooBdzhDWm5BS1l5b3ZmWWlXSEd0RERMbVhJN1ZtUGkyaU9STnNwVjZrNkpWLVVXaXgzQmFneGZzSWQxN1d3ZXhpdEhvcFRweGtqeFZfRnA5eHNEaFByZElEQUFBfjD+tv7lBTgNQAE=';
}
requests::set_cookies($cookie);

// Well-formed desktop Chrome user agent (the original lacked the space after "Mozilla/5.0").
requests::set_useragent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36');

$config = [
    'name'      => 'weixin',
    'log_show'  => true,
    'tasknum'   => 1,
    'interval'  => 1000,
    'max_try'   => 5,
    'domains'   => [
        'mp.weixin.qq.com',
        'weixin.qq.com',
    ],
    'scan_urls' => [
        $public_account_url,
    ],
    // No separate list pages in this step: the history page itself is the
    // page the fields are extracted from (see content_url_regexes).
    'list_url_regexes' => [],
    // phpspider documents `fields` as extraction rules for content pages, so
    // the history page has to be registered here for $rules to run at all.
    'content_url_regexes' => [
        'https://mp\.weixin\.qq\.com/mp/profile_ext.*',
    ],
    'export' => [
        'type' => 'csv',
        // NOTE(review): the directory name in the published example was garbled;
        // `data` is a placeholder, adjust to the desired output location.
        'file' => './data/weixin.csv',
    ],
    'fields' => $rules,
];

$spider = new phpspider($config);
$spider->start();

  4.解析详情页内容

  

  在获取到文章列表页中每篇文章的URL之后,我们需要进一步访问该URL获取文章详情页内容。使用phpspider的on_extract_page回调函数可以在每个详情页的字段抽取完成后对数据做进一步处理和存储。

  示例代码(PHP):

<?php

/**
 * Step 4 example: crawl a WeChat official-account history page with phpspider,
 * follow the article links, and export each article's title and body to CSV.
 *
 * The session cookie can be supplied through the WEIXIN_COOKIE environment
 * variable; when that variable is unset or empty, the sample cookie embedded
 * below is used instead.
 */

require_once './vendor/autoload.php';

use phpspider\core\phpspider;
use phpspider\core\requests;
use phpspider\core\selector;

// History page of the target account, as captured with a proxy tool such as Fiddler.
$public_account_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MzI0MjEwNDg0MA==&scene=126&bizpsid=0#wechat_redirect';

// Prefer a cookie injected from the environment so a fresh session can be
// used without editing the script.
$cookie = getenv('WEIXIN_COOKIE');
if ($cookie === false || $cookie === '') {
    // Sample value from the original example; presumably expired, replace before running.
    $cookie = '_ga=GA1.2.1735125465.1647779698;_gid=GA1.2.1924099627.1647779698; rewardsn=; wxtokenkey=777; wxuin=561518569; devicetype=Windows10; version=63010201; lang=zh_CN; pass_ticket=BsU6YDv6H2QyfX9gk%2B%2BxFpWz7TJhKloRfYVvLJGnWc%3D; wap_sid2=CJvMxqoBEooBdzhDWm5BS1l5b3ZmWWlXSEd0RERMbVhJN1ZtUGkyaU9STnNwVjZrNkpWLVVXaXgzQmFneGZzSWQxN1d3ZXhpdEhvcFRweGtqeFZfRnA5eHNEaFByZElEQUFBfjD+tv7lBTgNQAE=';
}
requests::set_cookies($cookie);

// Well-formed desktop Chrome user agent (the original lacked the space after "Mozilla/5.0").
requests::set_useragent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36');

$config = [
    'name'      => 'weixin',
    'log_show'  => true,
    'tasknum'   => 1,
    'interval'  => 1000,
    'max_try'   => 5,
    'domains'   => [
        'mp.weixin.qq.com',
        'weixin.qq.com',
    ],
    'scan_urls' => [
        $public_account_url,
    ],
    // NOTE(review): confirm that the fetched history page exposes article
    // links in a form phpspider discovers on its own; if it does not, they
    // have to be queued manually (e.g. from an on_list_page callback).
    'list_url_regexes' => [
        'https://mp\.weixin\.qq\.com/mp/profile_ext.*',
    ],
    // Detail (article) pages. The pattern is written without delimiters, in
    // the same form as list_url_regexes, and accepts any "/s?" article URL
    // rather than only those carrying a "src" parameter.
    'content_url_regexes' => [
        'https?://mp\.weixin\.qq\.com/s\?.+',
    ],
    // Extraction rules applied to each detail page (XPath selectors).
    'fields' => [
        [
            'name'     => 'title',
            // Match the class as a whole token so surrounding whitespace or
            // additional classes on the element do not break the selector.
            'selector' => "//h2[contains(concat(' ', normalize-space(@class), ' '), ' rich_media_title ')]",
            'required' => true,
        ],
        [
            'name'     => 'content',
            // Select the article body by id instead of an exact class string
            // that depended on a trailing space.
            'selector' => "//div[@id='js_content']",
            'required' => true,
        ],
    ],
    // Export settings.
    'export' => [
        'type' => 'csv',
        // NOTE(review): the directory name in the published example was garbled;
        // `data` is a placeholder, adjust to the desired output location.
        'file' => './data/weixin.csv',
    ],
];

$spider = new phpspider($config);
$spider->start();

  5.总结

  本文详细介绍了如何使用phpspider框架获取微信公众号文章链接,并提供了完整代码示例。需要注意的是,在实际应用中要遵守相关法律法规,不得用于非法用途。

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线