两个简单的版本,关于百度搜索结果的抓取版本
优采云 发布时间: 2021-07-13 18:04
两个简单的版本,关于百度搜索结果的抓取版本
seo必备网站分析工具,关键词百度搜索结果查询导出源码
这里给出两个简单版本的百度搜索结果抓取脚本:只需输入关键词和要抓取的搜索页数,即可获取搜索结果中竞争对手的网站,便于进一步分析研究。两个版本仅供参考,希望能对你有所帮助!
版本一功能
#百度搜索结果抓取
#author/微信:huguo00289
# -*- coding: utf-8 -*-
import random
import threading
import time
from urllib.parse import quote

import requests
import xlsxwriter
from fake_useragent import UserAgent
from lxml import etree
class Baidu_search():
    """Scrape Baidu web-search results and export them to an Excel file.

    Every result is stored in ``self.search_datas`` as a ``(title, url)``
    tuple, where ``url`` is the redirect target of Baidu's result link
    when one is reported, otherwise the Baidu link itself.
    """

    def __init__(self):
        # Search endpoint; the percent-encoded keyword is appended to it.
        self.url = "https://www.baidu.com/s?wd="
        # Supplies a random User-Agent string for each request.
        self.ua = UserAgent()
        # Accumulated (title, url) tuples from every page fetched so far.
        self.search_datas = []

    def get_cookies(self):
        """Return one randomly chosen cookie line from ``cookie.txt``.

        Blank lines are ignored so that an empty ``Cookie`` header is never
        sent. ``random.choice`` raises ``IndexError`` when the file holds no
        usable line.
        """
        with open("cookie.txt", "r", encoding="utf-8") as f:
            cookies = [line.strip() for line in f if line.strip()]
        return random.choice(cookies)

    def get_search_objects(self, search_url):
        """Fetch one result page and append its ``(title, url)`` pairs.

        Args:
            search_url: Full Baidu search URL for a single result page.
        """
        headers = {
            "User-Agent": self.ua.random,
            'Cookie': self.get_cookies(),
        }
        html = requests.get(search_url, headers=headers, timeout=8).content.decode("utf-8")
        # Pause after every request to throttle the crawl.
        time.sleep(2)
        tree = etree.HTML(html)
        anchors = tree.xpath('//div[@class="result c-container new-pmd"]/h3[@class="t"]/a')
        for anchor in anchors:
            # Read the href from the anchor itself so a title can never be
            # paired with another result's link.
            href = anchor.get('href')
            if not href:
                continue
            title = ''.join(anchor.xpath('.//text()'))
            data = title, self.get_website_url(href)
            self.search_datas.append(data)
            print(data)

    def get_website_url(self, baidu_url):
        """Resolve a Baidu result link to the address it redirects to.

        Args:
            baidu_url: The ``href`` taken from a Baidu result entry.

        Returns:
            The ``Location`` header of the HEAD response, or ``baidu_url``
            unchanged when the request fails or no redirect is reported.
        """
        try:
            # requests.head() does not follow redirects by default, so the
            # Location header of the first response is still available.
            r = requests.head(baidu_url, timeout=8)
        except requests.RequestException:
            # One unreachable link should not abort the whole scrape.
            return baidu_url
        return r.headers.get('Location', baidu_url)

    def write_to_xlsx(self, file_name):
        """Write all collected results to ``<file_name>_<date> .xlsx``.

        Args:
            file_name: Prefix of the output file and name of the worksheet.
        """
        # NOTE(review): the date format ends with a space, so the file name
        # has a space before ".xlsx"; kept as-is to preserve existing output.
        path = f'{file_name}_{time.strftime("%Y-%m-%d ", time.localtime())}.xlsx'
        # The context manager closes (and saves) the workbook even if a
        # write fails part-way through.
        with xlsxwriter.Workbook(path) as workbook:
            worksheet = workbook.add_worksheet(file_name)
            worksheet.write_row('A1', ['标题', '网址'])  # header row
            # Data rows start at spreadsheet row 2, right below the header.
            for row_number, data in enumerate(self.search_datas, start=2):
                worksheet.write_row(f'A{row_number}', data)
        print("搜索结果数据插入excel表格成功!")

    def _build_search_url(self, keyword, page_index):
        """Return the search URL for zero-based result page ``page_index``."""
        # Percent-encode the keyword so characters such as '&', '#', spaces
        # or non-ASCII text cannot corrupt the query string.
        # Baidu's ``pn`` parameter is a result offset: ten results per page.
        return f"{self.url}{quote(keyword, safe='')}&ie=UTF-8&pn={page_index * 10}"

    def main(self, keyword, num):
        """Scrape ``num`` result pages sequentially, then export to Excel.

        Args:
            keyword: Search phrase; also used as the output file prefix.
            num: Number of result pages to fetch.
        """
        for i in range(num):
            print(f'正在查询第{i+1}页百度搜索结果数据..')
            self.get_search_objects(self._build_search_url(keyword, i))
        self.write_to_xlsx(keyword)

    def Thread_main(self, keyword, num):
        """Scrape ``num`` result pages using one thread per page.

        The collected data is printed rather than written to Excel, and its
        order depends on the order in which the threads finish.

        Args:
            keyword: Search phrase.
            num: Number of result pages to fetch.
        """
        threadings = []
        for i in range(num):
            print(f'正在查询第{i+1}页百度搜索结果数据..')
            search_url = self._build_search_url(keyword, i)
            t = threading.Thread(target=self.get_search_objects, args=(search_url,))
            threadings.append(t)
            t.start()
        # Wait for every page to finish before reporting.
        for x in threadings:
            x.join()
        print("多线程查询百度搜索结果完成")
        print(self.search_datas)
if __name__ == '__main__':
    # Example run: scrape the first ten result pages for one keyword and
    # export them to an Excel file.
    spider = Baidu_search()
    keyword, num = "工业设计", 10
    spider.main(keyword, num)
    # Threaded alternative (prints the results instead of exporting them):
    # spider.Thread_main(keyword, num)
版本二功能