A Simple Example of Collecting Data with Scrapy Combined with Selenium

优采云 | Published: 2020-08-14 11:45

  Anyone who writes crawlers soon notices that on some paginated pages, clicking "next page" or jumping to a specific page does not change the URL at all. If you are collecting with the Scrapy framework alone, there is then no new URL to yield, so you cannot iterate over pages and parse them that way.

  A while back I happened to use Selenium to automate a real browser for data collection; it can also simulate human keyboard and mouse actions. It is powerful, and it works together with Scrapy very nicely.

  Below is a simple example: type a keyword into Baidu's search box, click the "百度一下" (search) button to run the query, and then parse the resulting page:

  Create the project quickly:

  scrapy startproject test

scrapy genspider crawltest www.baidu.com
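  After these two commands, the project layout should look roughly like this (the standard structure Scrapy generates; middlewares.py is created but not used directly in this article):

test/
    scrapy.cfg
    test/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            crawltest.py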

  items.py source:

import scrapy

class TestItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()

  Changes to settings.py (uncomment and adjust these entries):

import random

# List of user agents to choose from
USER_AGENT_LIST = [
    'MSIE (MSIE 6.0; X11; Linux; i686) Opera 7.23',
    'Opera/9.20 (Macintosh; Intel Mac OS X; U; en)',
    'Opera/9.0 (Macintosh; PPC Mac OS X; U; en)',
    'iTunes/9.0.3 (Macintosh; U; Intel Mac OS X 10_6_2; en-ca)',
    'Mozilla/4.76 [en_jp] (X11; U; SunOS 5.8 sun4u)',
    'iTunes/4.2 (Macintosh; U; PPC Mac OS X 10.2)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:5.0) Gecko/20100101 Firefox/5.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:9.0) Gecko/20100101 Firefox/9.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:16.0) Gecko/20120813 Firefox/16.0',
    'Mozilla/4.77 [en] (X11; I; IRIX;64 6.5 IP30)',
    'Mozilla/4.8 [en] (X11; U; SunOS; 5.7 sun4u)',
]

# Pick a random user agent (note: this is evaluated only once, when the settings are loaded)
USER_AGENT = random.choice(USER_AGENT_LIST)
#USER_AGENT = 'test (+http://www.yourdomain.com)'  # fixed user agent

ROBOTSTXT_OBEY = False  # set to False so requests are not filtered by robots.txt, which makes debugging easier

ITEM_PIPELINES = {
    'test.pipelines.SpiderctoPipeline': 1,  # outputs the scraped items; the actual work is in pipelines.py
}
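  One caveat: USER_AGENT = random.choice(USER_AGENT_LIST) is evaluated only once, when Scrapy loads the settings, so the whole run uses a single user agent. If you want a different one per request, a downloader middleware is the usual route. A minimal sketch of that idea (my addition; the class name RandomUserAgentMiddleware, its placement in middlewares.py, and the priority 400 are assumptions, not part of the original project):

# middlewares.py (sketch)
import random

class RandomUserAgentMiddleware(object):

    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Read USER_AGENT_LIST from settings.py
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        # Pick a fresh user agent for every outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)

# and enable it in settings.py:
# DOWNLOADER_MIDDLEWARES = {
#     'test.middlewares.RandomUserAgentMiddleware': 400,
# }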

  Spider file crawltest.py source:

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from test.items import TestItem
import lxml.html
import time, random

class CrawlSpider(scrapy.Spider):
    name = 'crawl'
    allowed_domains = ['baidu.com']
    start_urls = ['https://www.baidu.com/']

    def open_page(self):
        chrome_options = Options()
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--headless')
        # note: this targets the Selenium 3 API; newer versions use options=
        # and find_element(By.XPATH, ...) instead of find_element_by_xpath
        browser = webdriver.Chrome(chrome_options=chrome_options)
        browser.get(self.start_urls[0])
        browser.implicitly_wait(10)
        return browser

    def parse(self, response):
        browser = self.open_page()
        doc_source = lxml.html.document_fromstring(browser.page_source)
        su = response.xpath('.//input[@id="su"]/@value').extract()
        es = doc_source.xpath('.//input[@id="su"]/@value')
        keywd = browser.find_element_by_xpath("//input[@id='kw']")
        keywd.send_keys('scrapy')
        time.sleep(random.randint(3, 5))
        browser.find_element_by_xpath("//input[@id='su']").click()
        time.sleep(random.randint(3, 5))  # pause after the click so the result page has time to load
        # Are the two values identical? i.e. does the page Selenium rendered match what Scrapy downloaded?
        print(es[0], 'ppppppppppppppppp', su[0])
        doc_source_01 = lxml.html.document_fromstring(browser.page_source)
        result = doc_source_01.xpath('//span[@class="nums_text"]/text()')
        print(result, '000000000000000000')
        item = TestItem()
        item['title'] = su[0]
        yield item
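  In parse() above, the Selenium-rendered HTML is parsed with lxml directly. If you would rather keep using Scrapy's selector API on that rendered source, you can wrap it in an HtmlResponse. A small sketch (an assumption on my part, not taken from the original code; the variable names are made up):

from scrapy.http import HtmlResponse

# Wrap Selenium's rendered page so Scrapy selectors (xpath/css) work on it
rendered = HtmlResponse(url=browser.current_url,
                        body=browser.page_source,
                        encoding='utf-8')
nums = rendered.xpath('//span[@class="nums_text"]/text()').get()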

  Output pipelines.py source:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Write to JSON
# import codecs
# import json
# from scrapy.exceptions import DropItem
# class SpiderctoPipeline(object):
#     def __init__(self):
#         self.file = codecs.open('data.json', 'w', encoding='utf-8')
#     def process_item(self, item, spider):
#         line = json.dumps(dict(item), ensure_ascii=False) + '\n'
#         self.file.write(line)
#         return item

# Write to a database
from twisted.enterprise import adbapi
import pymysql
import pymysql.cursors

class SpiderctoPipeline(object):

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbpool = adbapi.ConnectionPool('pymysql', host='127.0.0.1',
            db='test', user='root', password='123456', charset='utf8',
            cursorclass=pymysql.cursors.DictCursor, use_unicode=True)
        return cls(dbpool)

    def process_item(self, item, spider):
        self.dbpool.runInteraction(self.do_insert, item)
        return item

    def do_insert(self, cursor, item):
        # NOTE: the columns below must match the fields declared on your Item;
        # the TestItem above only declares `title`, so trim this list accordingly.
        insert_info = """
            insert into ctolist(title,url,score,hour,student,couse_long,price,updata)
            values (%s,%s,%s,%s,%s,%s,%s,%s)
        """
        params = (item['title'], item['url'], item['score'], item['hour'],
                  item['student'], item['couse_long'], item['price'], item['updata'])
        cursor.execute(insert_info, params)
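  One thing to be aware of: dbpool.runInteraction returns a Deferred, and any exception raised inside do_insert is dropped silently unless you attach an errback. A small sketch of how you might surface those errors (my addition; handle_error is a name I made up, assuming you want failures in the spider's log):

    def process_item(self, item, spider):
        d = self.dbpool.runInteraction(self.do_insert, item)
        d.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item, spider):
        # Log the failed insert instead of letting the error disappear
        spider.logger.error('MySQL insert failed: %s', failure)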

  And we're done. Start the crawler:
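  Since the spider's name attribute is 'crawl', the run command should be (from the project root):

scrapy crawl crawl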
