修改后三、代码分析对无头设置进行了简单的修改

优采云 发布时间: 2021-08-07 02:02

  

修改后三、代码分析对无头设置进行了简单的修改

  <a id="_0"></a>系列文章目录

<p>第一章:selenium实现爬虫功能selenium爬取图片实例

第二章:selenium实现增量式爬虫功能增量式爬虫

第三章:selenium搜索关键字爬虫

<a id="_17"></a>一、源代码

  声明一下:由于 Chrome 的高版本与博主的 selenium 配置不兼容,博主将浏览器换成了 Firefox。

import requests

from selenium import webdriver

import os

import pymysql

def hide():
    """Build and return a Firefox WebDriver running in headless mode."""
    opts = webdriver.FirefoxOptions()
    opts.add_argument('-headless')
    return webdriver.Firefox(options=opts)

def Gethtml(url):
    """Crawl listing pages starting at *url*.

    The last ``<a>`` under ``div.slist`` is the "next page" link; when its
    URL's second-to-last path segment is ``4kmeinv`` it is excluded from the
    picture links.  The original recursed once per page and created a fresh
    WebDriver on every call without ever quitting it, leaking one browser
    process per page and eventually hitting RecursionError; this version
    iterates and releases each driver in a ``finally`` block.
    """
    while url:
        driver = hide()
        next_url = None
        try:
            driver.get(url)
            links = driver.find_elements_by_css_selector("div[class='slist'] li a")
            if not links:
                break  # nothing on this page: stop instead of crashing on s[-1]
            next_url = str(links[-1].get_attribute("href"))
            if next_url.split("/")[-2] == "4kmeinv":
                geturl(links[:-1])  # last anchor is the pager, not a picture
            else:
                geturl(links)
            print(next_url)
        finally:
            driver.quit()  # was leaked on every call in the original
        url = next_url

def huoqvpicture(url):
    """Open the photo detail page *url*, record it in the DB, download the image.

    Bug fix: the original called ``webdriver.Chrome(options=hide())`` —
    passing an already-constructed WebDriver as the ``options`` argument.
    Use the headless Firefox driver from ``hide()`` directly, and always
    release it.
    """
    driver = hide()
    try:
        driver.get(url)
        img = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
        title = str(img.get_attribute("title"))
        src = str(img.get_attribute("src"))
        print(title)
        insert(url, src, title)
        GetPicture(src, title)
    finally:
        driver.quit()  # original never released the browser

def GetPicture(url, name):
    """Download the image at *url* to ``../dist/<name>.jpg``.

    Spaces are stripped from *name*; the target directory is created on
    first use and an existing file is never re-downloaded.
    """
    root = "../dist/"
    path = root + name.replace(" ", "") + ".jpg"
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r = requests.get(url)
            r.raise_for_status()  # don't save an HTTP error page as a .jpg
            with open(path, 'wb') as f:
                f.write(r.content)
            # no explicit f.close(): the with-statement already closed it
            print("文件保存成功")
        else:
            print("文件已存在")
    except Exception:  # was a bare except:, which also swallows KeyboardInterrupt
        print("爬取失败")

def geturl(s):
    """Crawl the page behind every anchor in *s*, skipping URLs already stored."""
    for anchor in s:
        href = anchor.get_attribute("href")
        print(href)
        if not qvchong(href):
            huoqvpicture(str(href))

def insert(html, jpg, name):
    """Insert one (html, jpg, name) row into the ``suoyin`` table.

    The original built the statement by string concatenation — SQL-injectable
    and broken by any quote in the title — printed the literal ``"sql"``
    instead of the statement, left the execute commented out, and never
    closed the connection.  Use a parameterized query and close the
    connection in a ``finally`` block.
    """
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456", port=3306, db="tupian", charset="utf8")
    try:
        cur = con.cursor()
        sql = "insert into suoyin(html,jpg,name) values(%s,%s,%s);"
        print(sql)  # original printed the literal string "sql" by mistake
        cur.execute(sql, (str(html), str(jpg), str(name)))
        con.commit()
    finally:
        con.close()

def qvchong(i):
    """Return True when URL *i* is already stored in ``suoyin`` (dedup check).

    The original fetched the entire ``html`` column and scanned it in
    Python on every call; let the database do the lookup with a WHERE
    clause, and close the connection when done.
    """
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456", port=3306, db="tupian",
                          charset="utf8")
    try:
        cur = con.cursor()
        cur.execute("select html from suoyin where html=%s", (str(i),))
        if cur.fetchone() is not None:
            print("数据已存在")
            return True
        return False
    finally:
        con.close()  # original leaked one connection per call

def main():
    """Entry point: crawl the 4k catalogue starting from its first page."""
    url = "https://pic.netbian.com/4kmeinv/index.html"
    Gethtml(url)


if __name__ == "__main__":  # guard so importing this module does not start the crawl
    main()

</p>

  二、修改后

  import requests

from selenium import webdriver

import os

import pymysql

import time

def hide():
    """Return a headless Firefox WebDriver ready for use."""
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument('-headless')
    headless_driver = webdriver.Firefox(options=firefox_options)
    return headless_driver

def huoqvpicture(url):
    """Open the photo detail page *url*, store its metadata, download the image."""
    driver = hide()
    try:
        driver.get(url)
        img = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
        title = str(img.get_attribute("title"))
        src = str(img.get_attribute("src"))
        print(title)
        insert(url, src, title)
        GetPicture(src, title)
    finally:
        # quit() (not close()) terminates the whole browser/geckodriver
        # process; close() only closed the window and leaked the process.
        # The original also skipped cleanup whenever any step raised.
        driver.quit()

def GetPicture(url, name):
    """Download the image at *url* to ``../dist/<name>.jpg``.

    Spaces are stripped from *name*; the target directory is created on
    first use and an existing file is never re-downloaded.
    """
    root = "../dist/"
    path = root + name.replace(" ", "") + ".jpg"
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r = requests.get(url)
            r.raise_for_status()  # don't save an HTTP error page as a .jpg
            with open(path, 'wb') as f:
                f.write(r.content)
            # no explicit f.close(): the with-statement already closed it
            print("文件保存成功")
        else:
            print("文件已存在")
    except Exception:  # was a bare except:, which also swallows KeyboardInterrupt
        print("爬取失败")

def geturl(s):
    """Visit every anchor in *s* whose URL is not yet recorded in the DB."""
    for element in s:
        page = element.get_attribute("href")
        print(page)
        if qvchong(page):
            continue  # already crawled
        huoqvpicture(str(page))

def insert(html, jpg, name):
    """Insert one (html, jpg, name) row into the ``suoyin`` table.

    The original concatenated values directly into the SQL text, which is
    SQL-injectable and breaks on quotes; use a parameterized query and
    close the connection.  ``qvdian`` is still applied to *name* so the
    stored titles remain quote-free, as before.
    """
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456", port=3306, db="tupian", charset="utf8")
    try:
        cur = con.cursor()
        sql = "insert into suoyin(html,jpg,name) values(%s,%s,%s);"
        print("插入一条数据")
        cur.execute(sql, (str(html), str(jpg), qvdian(str(name))))
        con.commit()
    finally:
        con.close()  # original leaked the connection on every insert

def qvchong(i):
    """Return True when URL *i* already exists in ``suoyin`` (skip duplicates).

    Replaces the original full-table fetch + linear scan in Python with a
    single WHERE lookup, and releases the connection in ``finally``.
    """
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456", port=3306, db="tupian",
                          charset="utf8")
    try:
        cur = con.cursor()
        cur.execute("select html from suoyin where html=%s", (str(i),))
        if cur.fetchone() is not None:
            print("数据已存在")
            return True
        return False
    finally:
        con.close()

def geisuo(driver, a):
    """Crawl every result page reachable from *driver*'s current page.

    *a* is the 1-based page counter, used only for progress output.  The
    original recursed once per page — guaranteeing either RecursionError on
    long result sets or a NoSuchElementException crash on the final page
    (no "nextpage" button).  Iterate instead and stop cleanly when the
    next-page button is absent.
    """
    while True:
        s = driver.find_elements_by_css_selector("div[class='slist'] li a")
        print(a)  # progress: current page number
        a = a + 1
        # When the last anchor's URL has "result" as its second-to-last
        # path segment it is the pager link, so exclude it from crawling.
        if str(s[-1].get_attribute("href")).split("/")[-2] == "result":
            geturl(s[:-1])
        else:
            geturl(s)
        try:
            bt = driver.find_element_by_class_name("nextpage")
        except Exception:  # no next-page button: this was the last page
            break
        bt.click()

def click(url, keyword="美女"):
    """Open *url*, search for *keyword*, then crawl every result page.

    *keyword* generalizes the previously hard-coded search term; the
    default preserves the original behavior, so existing callers are
    unaffected.  The driver is now released when crawling finishes or fails.
    """
    driver = hide()
    driver.implicitly_wait(3)
    try:
        driver.get(url)
        keyboard = driver.find_element_by_name("keyboard")  # the search box
        time.sleep(1)  # brief pause to look human
        keyboard.send_keys(keyword)
        bt = driver.find_element_by_name("submit")  # the search button
        time.sleep(1)
        bt.click()
        geisuo(driver, 1)  # crawl starting from result page 1
    finally:
        driver.quit()  # original never released the browser

def qvdian(s):
    """Return *s* as a string with every single-quote character removed.

    Originally written to stop titles containing ``'`` from breaking the
    concatenated SQL in ``insert``.  ``str.replace`` does the original
    split-on-quote / join dance in one call.
    """
    return str(s).replace("'", "")

def main():
    """Entry point: run the keyword-search crawl against the search result page."""
    url = "https://pic.netbian.com/e/search/result/index.php?page=1&searchid=16"
    click(url)


if __name__ == "__main__":  # guard so importing this module does not start the crawl
    main()

  三、代码分析

  def hide():

options = webdriver.FirefoxOptions()

options.add_argument('-headless')

driver = webdriver.Firefox(options=options)

return driver

  更改浏览器后对headless设置的简单修改,现在直接返回浏览器,提高代码运行效率。

  def click(url):

driver = hide() #创建无头浏览器

driver.implicitly_wait(3) #设计浏览器等待时间

driver.get(url) #获取html页面

keyboard = driver.find_element_by_name("keyboard")

#找到搜索框

time.sleep(1) #等待1秒(拟人化操作)

keyboard.send_keys("美女")

#搜索框输入相应的关键字

bt = driver.find_element_by_name("submit")

#找到搜索按钮

time.sleep(1) #等待1秒(拟人化操作)

bt.click() #点击搜索按钮

geisuo(driver,1) #将浏览器和页数作为参数传递

  这段代码可以说是整个新增代码中的核心代码,他负责关键字搜索。

  def geisuo(driver,a):

s = driver.find_elements_by_css_selector("div[class='slist'] li a") #获取相关的url(最后一个是下一页)

print(a) #打印当前页页数,方便人了解进度

a = a + 1 #页数加一

if str(s[-1].get_attribute("href")).split("/")[-2] == "result":

#查看是否到达最后一页

geturl(s[:-1])

#除最后一个进行爬取(未到达最后一页)

else:

geturl(s)

#全部爬取(到达最后一页)

bt=driver.find_element_by_class_name("nextpage")

#找到下一页的点击按钮,进行翻页

bt.click()

#点击下一页

geisuo(driver,a)

#调用自身重新获取数据

  以前,翻页是通过重新获取url来完成的。这次使用的是鼠标点击。

  

def huoqvpicture(url):

driver = hide()

driver.get(url)

s=driver.find_element_by_css_selector("div[class='photo-pic'] a img")

print(s.get_attribute("title"))

insert(url,s.get_attribute("src"),s.get_attribute("title"))

GetPicture(str(s.get_attribute("src")),str(s.get_attribute("title")))

driver.close() #关掉浏览器,节省内存

  建议大家在代码调试完成之后再开启 headless 模式:调试阶段打开浏览器窗口便于观察爬取过程,正式爬取时使用 headless 模式可以节省内存。

  def qvdian(s):

s=str(s) #将数据转化为字符串

ls=s.split("'") #以单引号为边界切开

s="".join(ls) #在直接合并成字符串

return s

  数据插入数据库时,如果字段值中含有单引号('),拼接出的 SQL 语句会出错导致插入失败;写个小函数去掉单引号,修复这个 bug。

  总结

  selenium 的大部分知识到此结束。博主开始学习爬虫框架scrapy。学习之后,基本上把python的爬虫内容都学完了,selinium文章就结束了。

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线