Keyword article collection source code (a quick-and-dirty Baidu image scraper that is easy to get started with)

优采云 Published: 2021-12-30 06:11


  As an AI beginner, you can't master web crawling overnight, yet you are short on training data and too lazy to download images one by one. What do you do?

  Don't worry: here is a quick-and-dirty piece of source code for scraping Baidu images.

  It is easy to use; all you have to do is run it.

  Without further ado, here is the source code for scraping Baidu images.

import os
import re

import requests
from bs4 import BeautifulSoup

num = 0          # how many images have been downloaded so far
numPicture = 0   # how many images the user wants
file = ''        # name of the folder the images are saved into
List = []        # lists of image URLs, one list per results page


def Find(url):
    """Count how many images Baidu returns for the keyword (checks up to pn=4000)."""
    global List
    print('Counting the total number of images, please wait...')
    t = 0
    s = 0
    while t < 4000:
        Url = url + str(t)
        try:
            Result = requests.get(Url, timeout=7)
        except requests.exceptions.RequestException:
            t = t + 60
            continue
        else:
            result = Result.text
            # Use a regular expression to pull the image URLs out of the page
            pic_url = re.findall('"objURL":"(.*?)",', result, re.S)
            s += len(pic_url)
            if len(pic_url) == 0:
                break
            else:
                List.append(pic_url)
                t = t + 60
    return s


def recommend(url):
    """Collect Baidu's related-search suggestions from the results page."""
    Re = []
    try:
        html = requests.get(url, timeout=7)
    except requests.exceptions.RequestException:
        return Re
    else:
        html.encoding = 'utf-8'
        bsObj = BeautifulSoup(html.text, 'html.parser')
        div = bsObj.find('div', id='topRS')
        if div is not None:
            listA = div.findAll('a')
            for i in listA:
                if i is not None:
                    Re.append(i.get_text())
        return Re


def downloadPicture(html, keyword):
    """Download every image found in one results page, stopping at numPicture."""
    global num
    # Use a regular expression to pull the image URLs out of the page
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    print('Found images for keyword "' + keyword + '", starting download...')
    for each in pic_url:
        print('Downloading image No.' + str(num + 1) + ', URL: ' + str(each))
        try:
            if each is not None:
                pic = requests.get(each, timeout=7)
            else:
                continue
        except requests.exceptions.RequestException:
            print('Error: this image could not be downloaded')
            continue
        else:
            path = os.path.join(file, keyword + '_' + str(num) + '.jpg')
            with open(path, 'wb') as fp:
                fp.write(pic.content)
            num += 1
        if num >= numPicture:
            return


if __name__ == '__main__':  # entry point
    word = input('Enter a search keyword (a person, a place, etc.): ')
    # example page: http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%E5%BC%A0%E5%A4%A9%E7%88%B1&pn=120
    url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn='
    tot = Find(url)
    Recommend = recommend(url)  # record the related-search suggestions
    print('Detected a total of %d images for "%s"' % (tot, word))
    numPicture = int(input('How many images do you want to download? '))
    file = input('Name a folder to store the images in: ')
    if os.path.exists(file):
        print('That folder already exists, please enter another name')
        file = input('Name a folder to store the images in: ')
        os.mkdir(file)
    else:
        os.mkdir(file)
    t = 0
    tmp = url
    while t < numPicture:
        try:
            url = tmp + str(t)
            result = requests.get(url, timeout=10)
            print(url)
        except requests.exceptions.RequestException:
            print('Network error, please check your connection and retry')
            t = t + 60
        else:
            downloadPicture(result.text, word)
            t = t + 60
    print('Search finished, thanks for using this script')
    print('You may also like:')
    for each in Recommend:
        print(each, end=' ')
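  The one line the whole script hinges on is the re.findall call: Baidu's flip-style results page embeds the original address of every image in a "objURL":"..." field, and the regular expression simply captures whatever sits between those quotes. Below is a tiny, self-contained illustration of that step; the sample string is made up and far shorter than a real response.

import re

# Made-up fragment shaped like Baidu's flip results page (illustration only)
sample = '{"objURL":"http://example.com/a.jpg","width":800},{"objURL":"http://example.com/b.jpg","width":600},'

pic_url = re.findall('"objURL":"(.*?)",', sample, re.S)
print(pic_url)  # ['http://example.com/a.jpg', 'http://example.com/b.jpg']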

  Beyond that, I won't explain the principles behind it here; if you really want to know how the source code works, there is plenty of material online.

  You can use it as-is, no matter what kind of images you want to scrape.
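  One caveat: Baidu will sometimes serve a stripped-down page (and therefore zero objURL entries) to clients that do not look like a browser. The script above sends no headers at all, so if Find() keeps reporting 0 images, a workaround worth trying is to pass a browser-style User-Agent to every requests.get call. This is an assumption about Baidu's behaviour rather than part of the original script; a minimal sketch:

# Assumed workaround, not part of the original script: pretend to be a browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/96.0 Safari/537.36'
}

# then add headers=headers to each call, e.g. inside Find():
Result = requests.get(Url, timeout=7, headers=headers)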

  
