今日头条文章采集软件( 地址如何提取url地址根据接口数据链接中的pager页码 )

优采云发布时间: 2021-10-08 16:04

　　今日头条文章采集软件(

地址如何提取url地址根据接口数据链接中的pager页码

)

　　显示更多

　　可以看到相关数据界面，里面有新闻标题和新闻详情的url地址

　　如何提取url地址

　　1、转成json，键值对取值；

2、用正则表达式匹配url地址；

　　根据界面数据链接中的pager变化进行翻页，对应页码。

　　在详情页，可以看到新闻内容在div标签中的p标签中。根据正常分析网站，可以获取新闻内容。

　　保存方式

　　txt 文本格式

　　PDF 格式

　　整体爬取思路总结

在栏目列表页中，点击更多新闻内容，获取接口数据url

接口数据url中返回的数据内容中匹配新闻详情页url

使用常规解析网站操作（re、css、xpath）提取新闻内容

保存数据

　　 import parsel

import requests

import re

#### 获取网页源代码

def get_html(html_url):

"""

获取网页源代码 response

:param html_url: 网页url地址

:return: 网页源代码

"""

headers = {

"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",

"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", }

response = requests.get(url=html_url, headers=headers)

return response

#### 获取每篇新闻url地址

def get_page_url(html_data):

"""

获取每篇新闻url地址

:param html_data: response.text

:return: 每篇新闻的url地址

"""

page_url_list = re.findall('"url":"(.*?)"', html_data)

return page_url_list

#### 文件保存命名不能含有特殊字符，需要对新闻标题进行处理

def file_name(name):

"""

文件命名不能携带特殊字符

:param name: 新闻标题

:return: 无特殊字符的标题

"""

replace = re.compile(r'[\\/\:\*\?\"\|]')

new_name = re.sub(replace, '_', name)

return new_name

####保存数据

def download(content, title):

"""

with open 保存新闻内容 txt

:param content: 新闻内容

:param title: 新闻标题

:return:

"""

path = '新闻\' + title + '.txt'

with open(path, mode='a', encoding='utf-8') as f:

f.write(content)

print('正在保存', title)

### 主函数

def main(url):

"""

主函数

:param url: 新闻列表页 url地址

:return:

"""

html_data = get_html(url).text # 获得接口数据response.text

lis = get_page_url(html_data) # 获得新闻url地址列表

for li in lis:

page_data = get_html(li).content.decode('utf-8', 'ignore') # 新闻详情页 response.text

selector = parsel.Selector(page_data)

title = re.findall('(.*?)', page_data, re.S)[0] # 获取新闻标题

new_title = file_name(title)

new_data = selector.css('#cont_1_1_2 div.left_zw p::text').getall()

content = ''.join(new_data)

download(content, new_title)

if __name__ == '__main__':

for page in range(1, 101):

url_1 = 'https://channel.chinanews.com/cns/cjs/gj.shtml?pager={}&pagenum=9&t=5_58'.format(page)

main(url_1)

　　在浏览器开发者模式网中，可以快速找到一个带有'?category=new_hot...'字样的文件，查看该文件，发现新闻内容的所有数据都存储在data中，即可发现数据类型是json；

　　只要找到这个文件的requests url，就可以通过python请求爬取网页；

　　查看请求的网址，

　　找到的链接是：

　　max_behot_time 是从获取到的json数据中获取：

　　我在网上找到了大神对as和cp算法的分析，

　　在js文件中找到两个参数：home_4abea46.js，具体算法如下：

　　 !function(t) {

var e = {};

e.getHoney = function() {

var t = Math.floor((new Date).getTime() / 1e3)

, e = t.toString(16).toUpperCase()

, i = md5(t).toString().toUpperCase();

if (8 != e.length)

return {

as: "479BB4B7254C150",

cp: "7E0AC8874BB0985"

};

for (var n = i.slice(0, 5), a = i.slice(-5), s = "", o = 0; 5 > o; o++)

s += n[o] + e[o];

for (var r = "", c = 0; 5 > c; c++)

r += e[c + 3] + a[c];

return {

as: "A1" + s + e.slice(-3),

cp: e.slice(0, 3) + r + "E1"

}

,

t.ascp = e

}(window, document),

　　python获取as和cp值的代码如下：（代码参考博客：）

　　 def get_as_cp(): # 该函数主要是为了获取as和cp参数，程序参考今日头条中的加密js文件：home_4abea46.js

zz = {}

now = round(time.time())

print(now) # 获取当前计算机时间

e = hex(int(now)).upper()[2:] #hex()转换一个整数对象为16进制的字符串表示

print('e:', e)

a = hashlib.md5() #hashlib.md5().hexdigest()创建hash对象并返回16进制结果

print('a:', a)

a.update(str(int(now)).encode('utf-8'))

i = a.hexdigest().upper()

print('i:', i)

if len(e)!=8:

zz = {'as':'479BB4B7254C150',

'cp':'7E0AC8874BB0985'}

return zz

n = i[:5]

a = i[-5:]

r = ''

s = ''

for i in range(5):

s= s+n[i]+e[i]

for j in range(5):

r = r+e[j+3]+a[j]

zz ={

'as':'A1'+s+e[-3:],

'cp':e[0:3]+r+'E1'

}

print('zz:', zz)

return zz

　　这样完整的链接就构成了，另外提一点就是：

_signature参数去掉也是可以获取到json数据的，

　　 import requests

import json

from openpyxl import Workbook

import time

import hashlib

import os

import datetime

start_url = 'https://www.toutiao.com/api/pc/feed/?category=news_hot&utm_source=toutiao&widen=1&max_behot_time='

url = 'https://www.toutiao.com'

headers={

'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'

}

cookies = {'tt_webid':'6649949084894053895'} # 此处cookies可从浏览器中查找，为了避免被头条禁止爬虫

max_behot_time = '0' # 链接参数

title = [] # 存储新闻标题

source_url = [] # 存储新闻的链接

s_url = [] # 存储新闻的完整链接

source = [] # 存储发布新闻的公众号

media_url = {} # 存储公众号的完整链接

def get_as_cp(): # 该函数主要是为了获取as和cp参数，程序参考今日头条中的加密js文件：home_4abea46.js

zz = {}

now = round(time.time())

print(now) # 获取当前计算机时间

e = hex(int(now)).upper()[2:] #hex()转换一个整数对象为16进制的字符串表示

print('e:', e)

a = hashlib.md5() #hashlib.md5().hexdigest()创建hash对象并返回16进制结果

print('a:', a)

a.update(str(int(now)).encode('utf-8'))

i = a.hexdigest().upper()

print('i:', i)

if len(e)!=8:

zz = {'as':'479BB4B7254C150',

'cp':'7E0AC8874BB0985'}

return zz

n = i[:5]

a = i[-5:]

r = ''

s = ''

for i in range(5):

s= s+n[i]+e[i]

for j in range(5):

r = r+e[j+3]+a[j]

zz ={

'as':'A1'+s+e[-3:],

'cp':e[0:3]+r+'E1'

}

print('zz:', zz)

return zz

def getdata(url, headers, cookies): # 解析网页函数

r = requests.get(url, headers=headers, cookies=cookies)

print(url)

data = json.loads(r.text)

return data

def savedata(title, s_url, source, media_url): # 存储数据到文件

# 存储数据到xlxs文件

wb = Workbook()

if not os.path.isdir(os.getcwd()+'/result'): # 判断文件夹是否存在

os.makedirs(os.getcwd()+'/result') # 新建存储文件夹

filename = os.getcwd()+'/result/'+datetime.datetime.now().strftime('%Y-%m-%d-%H-%m')+'.xlsx' # 新建存储结果的excel文件

ws = wb.active

ws.title = 'data' # 更改工作表的标题

ws['A1'] = '标题' # 对表格加入标题

ws['B1'] = '新闻链接'

ws['C1'] = '头条号'

ws['D1'] = '头条号链接'

for row in range(2, len(title)+2): # 将数据写入表格

_= ws.cell(column=1, row=row, value=title[row-2])

_= ws.cell(column=2, row=row, value=s_url[row-2])

_= ws.cell(column=3, row=row, value=source[row-2])

_= ws.cell(column=4, row=row, value=media_url[source[row-2]])

wb.save(filename=filename) # 保存文件

def main(max_behot_time, title, source_url, s_url, source, media_url): # 主函数

for i in range(3): # 此处的数字类似于你刷新新闻的次数，正常情况下刷新一次会出现10条新闻，但夜存在少于10条的情况；所以最后的结果并不一定是10的倍数

ascp = get_as_cp() # 获取as和cp参数的函数

demo = getdata(start_url+max_behot_time+'&max_behot_time_tmp='+max_behot_time+'&tadrequire=true&as='+ascp['as']+'&cp='+ascp['cp'], headers, cookies)

print(demo)

# time.sleep(1)

for j in range(len(demo['data'])):

# print(demo['data'][j]['title'])

if demo['data'][j]['title'] not in title:

title.append(demo['data'][j]['title']) # 获取新闻标题

source_url.append(demo['data'][j]['source_url']) # 获取新闻链接

source.append(demo['data'][j]['source']) # 获取发布新闻的公众号

if demo['data'][j]['source'] not in media_url:

media_url[demo['data'][j]['source']] = url+demo['data'][j]['media_url'] # 获取公众号链接

print(max_behot_time)

max_behot_time = str(demo['next']['max_behot_time']) # 获取下一个链接的max_behot_time参数的值

for index in range(len(title)):

print('标题：', title[index])

if 'https' not in source_url[index]:

s_url.append(url+source_url[index])

print('新闻链接：', url+source_url[index])

else:

print('新闻链接：', source_url[index])

s_url.append(source_url[index])

# print('源链接：', url+source_url[index])

print('头条号：', source[index])

print(len(title)) # 获取的新闻数量

if __name__ == '__main__':

main(max_behot_time, title, source_url, s_url, source, media_url)

savedata(title, s_url, source, media_url)

0

2021-10-08

今日头条文章采集软件

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

今日头条文章采集软件( 地址如何提取url地址根据接口数据链接中的pager页码 )

0 个评论

发起人

AI时代内容工厂

今日头条文章采集软件( 地址如何提取url地址根据接口数据链接中的pager页码 )

0 个评论

发起人

相关问题