querylist采集微信公众号文章( 处理跳转向微信注入js的方法：以上就是对处理代理服务器拦截到的数据进行处理)

优采云发布时间: 2022-03-13 10:09

　　querylist采集微信公众号文章(

处理跳转向微信注入js的方法：以上就是对处理代理服务器拦截到的数据进行处理)

　　public void getMsgExt(String str,String url) {

// TODO Auto-generated method stub

String biz = "";

String sn = "";

Map queryStrs = HttpUrlParser.parseUrl(url);

if(queryStrs != null){

biz = queryStrs.get("__biz");

biz = biz + "==";

sn = queryStrs.get("sn");

sn = "%" + sn + "%";

}

/**

* $sql = "select * from `文章表` where `biz`='".$biz."'

* and `content_url` like '%".$sn."%'" limit 0,1;

* 根据biz和sn找到对应的文章

*/

Post post = postMapper.selectByBizAndSn(biz, sn);

if(post == null){

System.out.println("biz:"+biz);

System.out.println("sn:"+sn);

tmpListMapper.deleteByLoad(1);

return;

}

// System.out.println("json数据:"+str);

Integer read_num;

Integer like_num;

try{

read_num = JsonPath.read(str, "['appmsgstat']['read_num']");//阅读量

like_num = JsonPath.read(str, "['appmsgstat']['like_num']");//点赞量

}catch(Exception e){

read_num = 123;//阅读量

like_num = 321;//点赞量

System.out.println("read_num:"+read_num);

System.out.println("like_num:"+like_num);

System.out.println(e.getMessage());

}

/**

* 在这里同样根据sn在采集队列表中删除对应的文章，代表这篇文章可以移出采集队列了

* $sql = "delete from `队列表` where `content_url` like '%".$sn."%'"

*/

tmpListMapper.deleteBySn(sn);

//然后将阅读量和点赞量更新到文章表中。

post.setReadnum(read_num);

post.setLikenum(like_num);

postMapper.updateByPrimaryKey(post);

}

　　将js注入微信的处理跳转方法：

　　public String getWxHis() {

String url = "";

// TODO Auto-generated method stub

/**

* 当前页面为公众号历史消息时，读取这个程序

* 在采集队列表中有一个load字段，当值等于1时代表正在被读取

* 首先删除采集队列表中load=1的行

* 然后从队列表中任意select一行

*/

tmpListMapper.deleteByLoad(1);

TmpList queue = tmpListMapper.selectRandomOne();

System.out.println("queue is null?"+queue);

if(queue == null){//队列表为空

/**

* 队列表如果空了，就从存储公众号biz的表中取得一个biz，

* 这里我在公众号表中设置了一个采集时间的time字段，按照正序排列之后，

* 就得到时间戳最小的一个公众号记录，并取得它的biz

*/

WeiXin weiXin = weiXinMapper.selectOne();

String biz = weiXin.getBiz();

url = "https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=" + biz +

"#wechat_redirect";//拼接公众号历史消息url地址（第二种页面形式）

//更新刚才提到的公众号表中的采集时间time字段为当前时间戳。

weiXin.setCollect(System.currentTimeMillis());

int result = weiXinMapper.updateByPrimaryKey(weiXin);

System.out.println("getHis weiXin updateResult:"+result);

}else{

//取得当前这一行的content_url字段

url = queue.getContentUrl();

//将load字段update为1

tmpListMapper.updateByContentUrl(url);

}

//将下一个将要跳转的$url变成js脚本，由anyproxy注入到微信页面中。

//echo "setTimeout(function(){window.location.href='".$url."';},2000);";

int randomTime = new Random().nextInt(3) + 3;

String jsCode = "setTimeout(function(){window.location.href='"+url+"';},"+randomTime*1000+");";

return jsCode;

}

　　以上是处理代理服务器截获的数据的程序。这里有一个需要注意的问题。程序会依次访问数据库中每个收录的公众号，甚至会再次访问存储的文章，以不断更新收录的阅读点赞数@文章的。如果需要抓取大量公众号，建议修改添加任务队列和添加条件的代码，否则多轮公众号抓取重复数据的效率会受到很大影响。

　　至此，微信公众号的文章链接全部被爬取完毕，且该链接为永久有效链接，可在浏览器中打开。接下来就是编写爬虫程序，从数据库中爬取链接文章的内容等信息。

　　我用webmagic写了一个爬虫，轻量级，好用。

　　public class SpiderModel implements PageProcessor{

private static PostMapper postMapper;

private static List posts;

// 抓取网站的相关配置，包括编码、抓取间隔、重试次数等

private Site site = Site.me().setRetryTimes(3).setSleepTime(100);

public Site getSite() {

// TODO Auto-generated method stub

return this.site;

}

public void process(Page page) {

// TODO Auto-generated method stub

Post post = posts.remove(0);

String content = page.getHtml().xpath("//div[@id='js_content']").get();

//存在和谐文章此处做判定如果有直接删除记录或设置表示位表示文章被和谐

if(content == null){

System.out.println("文章已和谐！");

//postMapper.deleteByPrimaryKey(post.getId());

return;

}

String contentSnap = content.replaceAll("data-src", "src").replaceAll("preview.html", "player.html");//快照

String contentTxt = HtmlToWord.stripHtml(content);//纯文本内容

Selectable metaContent = page.getHtml().xpath("//div[@id='meta_content']");

String pubTime = null;

String wxname = null;

String author = null;

if(metaContent != null){

pubTime = metaContent.xpath("//em[@id='post-date']").get();

if(pubTime != null){

pubTime = HtmlToWord.stripHtml(pubTime);//文章发布时间

}

wxname = metaContent.xpath("//a[@id='post-user']").get();

if(wxname != null){

wxname = HtmlToWord.stripHtml(wxname);//公众号名称

}

author = metaContent.xpath("//em[@class='rich_media_meta rich_media_meta_text' and @id!='post-date']").get();

if(author != null){

author = HtmlToWord.stripHtml(author);//文章作者

}

// System.out.println("发布时间:"+pubTime);

// System.out.println("公众号名称:"+wxname);

// System.out.println("文章作者:"+author);

String title = post.getTitle().replaceAll(" ", "");//文章标题

String digest = post.getDigest();//文章摘要

int likeNum = post.getLikenum();//文章点赞数

int readNum = post.getReadnum();//文章阅读数

String contentUrl = post.getContentUrl();//文章链接

WechatInfoBean wechatBean = new WechatInfoBean();

wechatBean.setTitle(title);

wechatBean.setContent(contentTxt);//纯文本内容

wechatBean.setSourceCode(contentSnap);//快照

wechatBean.setLikeCount(likeNum);

wechatBean.setViewCount(readNum);

wechatBean.setAbstractText(digest);//摘要

wechatBean.setUrl(contentUrl);

wechatBean.setPublishTime(pubTime);

wechatBean.setSiteName(wxname);//站点名称公众号名称

wechatBean.setAuthor(author);

wechatBean.setMediaType("微信公众号");//来源媒体类型

WechatStorage.saveWechatInfo(wechatBean);

//标示文章已经被爬取

post.setIsSpider(1);

postMapper.updateByPrimaryKey(post);

}

public static void startSpider(List inposts,PostMapper myPostMapper,String... urls){

long startTime, endTime;

startTime = System.currentTimeMillis();

postMapper = myPostMapper;

posts = inposts;

HttpClientDownloader httpClientDownloader = new HttpClientDownloader();

SpiderModel spiderModel = new SpiderModel();

Spider mySpider = Spider.create(spiderModel).addUrl(urls);

mySpider.setDownloader(httpClientDownloader);

try {

SpiderMonitor.instance().register(mySpider);

mySpider.thread(1).run();

} catch (JMException e) {

e.printStackTrace();

}

endTime = System.currentTimeMillis();

System.out.println("爬取时间" + ((endTime - startTime) / 1000) + "秒--");

}

　　其他一些不相关的数据存储代码将不会发布。这里我将代理服务器抓取的数据存储在mysql中，将我的爬虫爬取的数据存储在mongodb中。

　　以下是我爬取的公众号信息：

　　打开应用程序并阅读笔记

0

2022-03-13

querylist采集微信公众号文章

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

querylist采集微信公众号文章( 处理跳转向微信注入js的方法：以上就是对处理代理服务器拦截到的数据进行处理)

0 个评论

发起人

AI时代内容工厂

querylist采集微信公众号文章( 处理跳转向微信注入js的方法：以上就是对处理代理服务器拦截到的数据进行处理)

0 个评论

发起人

相关问题