Web crawler for scraping Baidu images (using the Python third-party request library requests, the parser library lxml, and similar tools)
A small crawler program to scrape images from Baidu Tieba
Use Python third-party libraries, the HTTP request library requests and the parsing library lxml among other tools, to scrape all the images under a given Tieba forum.
The requirements are as follows:
1. Programming paradigm: object-oriented.
2. Take simple anti-crawling countermeasures: for example, do not send requests too frequently, and the User-Agent in the request headers should hide the crawler tool; generate the User-Agent randomly to avoid being blocked (see the sketch after this list).
3. Only scrape images posted by the thread starter; images in other posts must not be scraped.
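
As a minimal sketch of the measure in requirement 2 (the class below applies the same idea), a browser-like User-Agent can be drawn at random from the fake_useragent library for every request, and requests can be spaced out with a random pause; the exact header dict and delay range here are illustrative, not from the original post:

import random
import time

from fake_useragent import UserAgent

ua = UserAgent()
headers = {'User-Agent': ua.random}  # a fresh, randomly chosen browser User-Agent
# ... send the request with these headers ...
time.sleep(random.uniform(1, 5))     # pause 1-5 seconds so requests are not too frequent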
The code is as follows:
import os
import random
import time
import warnings

import requests
from fake_useragent import UserAgent
from lxml import etree

warnings.filterwarnings('ignore')


class BaiduSpider(object):
    def __init__(self, keyword, page_number):
        self.url = 'http://tieba.baidu.com/'
        self.useragent = UserAgent()
        self.keyword = keyword
        self.page_number = page_number

    # Build fresh request headers with a randomly chosen User-Agent (requirement 2)
    def get_headers(self):
        return {'User-Agent': self.useragent.random}

    # Fetch one page of the forum and collect the links of its threads
    def get_tlink(self, data):
        res = requests.get(self.url, headers=self.get_headers(), params=data)
        res.encoding = 'utf-8'
        html = res.text
        # Tieba wraps the thread list in HTML comments; strip the comment
        # markers so that lxml can see those nodes
        html = html.replace('<!--', '').replace('-->', '')
        parse_html = etree.HTML(html)
        t_list = parse_html.xpath(
            '//ul[@id="thread_list"]/li[@class=" j_thread_list clearfix"]/div//a/@href')
        for t in t_list:
            # Join the relative href into a full thread URL
            t_link = 'http://tieba.baidu.com' + t
            # Request the thread page, extract its image links, then download
            # and save each image locally
            self.get_ilink(t_link)

    # Extract image links from the first floor only, i.e. the thread
    # starter's post (requirement 3)
    def get_ilink(self, t_link):
        res = requests.get(t_link, headers=self.get_headers())
        res.encoding = 'utf-8'
        html = res.text
        parse_html = etree.HTML(html)
        i_list = parse_html.xpath(
            '//div[@class="d_post_content_main d_post_content_firstfloor"]'
            '//div[@class="d_post_content j_d_post_content clearfix"]'
            '/img[@class="BDE_Image"]/@src')
        print(i_list)
        for i in i_list:
            image = requests.get(i, headers=self.get_headers()).content
            self.write_image(image, i)

    # Save one image, named after the last ten characters of its URL
    def write_image(self, image, i):
        filename = './' + self.keyword + '/' + i[-10:]
        with open(filename, 'wb') as f:
            f.write(image)

    def main(self):
        # Make sure the download directory exists before saving into it
        if not os.path.exists(self.keyword):
            os.mkdir(self.keyword)
        for i in range(1, self.page_number + 1):
            data = {
                'kw': self.keyword,
                'pn': str((i - 1) * 50)  # Tieba paginates threads in steps of 50
            }
            self.get_tlink(data)
            print('Page %d downloaded' % i)
            # Random pause between pages to keep the request rate low
            time.sleep(random.randint(1, 10))


if __name__ == "__main__":
    spider = BaiduSpider('高考吧', 1)
    spider.main()
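
Two practical notes. The script needs three third-party packages, installable from PyPI (the fake_useragent import is provided by the fake-useragent package) with pip install requests lxml fake-useragent. Also, the code above never checks whether Tieba actually returned a valid page; an optional hardening, my addition rather than the author's, is to add a timeout and fail fast on HTTP errors before parsing:

res = requests.get(t_link, headers=self.get_headers(), timeout=10)
res.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page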