Crawling web page data (the crawl framework provides two types of spiders)
The Scrapy framework provides two kinds of spiders: Spider and CrawlSpider. This case uses the CrawlSpider class to implement a whole-site crawler.
CrawlSpider is a subclass of Spider. The Spider class is designed to crawl only the pages listed in start_urls, whereas CrawlSpider defines a set of rules that provide a convenient mechanism for following links: it extracts links from the pages it has already crawled and keeps crawling them.
Create a CrawlSpider from the template:
scrapy genspider -t crawl <spider_name> www.xxxx.com
LinkExtractor: a LinkExtractor's job is to extract links. Its extract_links() method is called on each response, and its constructor arguments act as filters on which links are returned: allow and deny take regular expressions, allow_domains and deny_domains restrict hosts, and restrict_xpaths / restrict_css limit which parts of the page links are taken from.
Rule: the rules attribute holds one or more Rule objects, and each Rule defines one specific behaviour for crawling the site. If multiple rules match the same link, the first one, in the order they are defined in this attribute, is used.
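As a minimal illustration of how these two pieces fit together (the URL patterns and callback name below are made-up placeholders, not Lagou's real ones):
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

# The LinkExtractor is configured entirely through its constructor filters.
job_links = LinkExtractor(
    allow=r"/jobs/\d+\.html",   # regular expression the link URL must match
    deny=r"/login",             # regular expression the link URL must not match
    restrict_css=".job-list",   # only take links from this part of the page
)

# A Rule wires an extractor to a callback; follow=True keeps crawling the extracted pages.
# When several rules match the same link, only the first matching rule is applied.
job_rule = Rule(job_links, callback="parse_job", follow=True)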
Below is an example that crawls Lagou (lagou.com):
spider.py
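A minimal sketch of what the CrawlSpider for this case could look like, wiring the rule to the item loader defined in items.py below; the URL pattern, CSS selectors and the handful of fields filled in are illustrative assumptions, not the original spider:
from datetime import datetime

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from LaGouSpider.items import LagouspiderItem, LagouJobItemLoader


class LagouSpider(CrawlSpider):
    name = "lagou"
    allowed_domains = ["www.lagou.com"]
    start_urls = ["https://www.lagou.com/"]

    rules = (
        # Follow job-detail pages and hand them to parse_job (pattern is an assumption).
        Rule(LinkExtractor(allow=r"jobs/\d+\.html"), callback="parse_job", follow=True),
    )

    def parse_job(self, response):
        # Run raw extractions through the loader so the processors declared in
        # items.py (remove_splash, handle_jobaddr, TakeFirst, ...) are applied.
        loader = LagouJobItemLoader(item=LagouspiderItem(), response=response)
        loader.add_css("title", ".job-name::attr(title)")  # selector is an assumption
        loader.add_value("url", response.url)
        loader.add_css("salary", ".salary::text")          # selector is an assumption
        loader.add_value("crawl_time", datetime.now())
        return loader.load_item()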
items.py
import scrapy
from scrapy.loader.processors import MapCompose, TakeFirst, Join
from scrapy.loader import ItemLoader
from w3lib.html import remove_tags
from LaGouSpider.settings import SQL_DATETIME_FORMAT


class LagouJobItemLoader(ItemLoader):
    # Custom ItemLoader: every field keeps only the first extracted value by default
    default_output_processor = TakeFirst()


def remove_splash(value):
    # Strip "/" separators from extracted values
    return value.replace("/", "")


def handle_jobaddr(value):
    # Drop the "查看地图" (view map) link text and blank lines from the address block
    addr_list = value.split("\n")
    addr_list = [item.strip() for item in addr_list if item.strip() != "查看地图"]
    return "".join(addr_list)


class LagouspiderItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    salary = scrapy.Field()
    job_city = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    work_years = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    degree_need = scrapy.Field(
        input_processor=MapCompose(remove_splash),
    )
    job_type = scrapy.Field()
    publish_time = scrapy.Field()
    job_advantage = scrapy.Field()
    job_desc = scrapy.Field()
    job_address = scrapy.Field(
        input_processor=MapCompose(remove_tags, handle_jobaddr),
    )
    company_name = scrapy.Field()
    company_url = scrapy.Field()
    tags = scrapy.Field(
        input_processor=Join(",")
    )
    crawl_time = scrapy.Field()

    def get_insert_sql(self):
        # Each item builds its own SQL; the pipeline just executes whatever it returns
        insert_sql = """
            insert into lagou_job(title, url, url_object_id, salary, job_city, work_years, degree_need,
            job_type, publish_time, job_advantage, job_desc, job_address, company_name, company_url,
            tags, crawl_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE salary=VALUES(salary), job_desc=VALUES(job_desc)
        """
        params = (
            self["title"], self["url"], self["url_object_id"], self["salary"], self["job_city"],
            self["work_years"], self["degree_need"], self["job_type"],
            self["publish_time"], self["job_advantage"], self["job_desc"],
            self["job_address"], self["company_name"], self["company_url"],
            self["tags"], self["crawl_time"].strftime(SQL_DATETIME_FORMAT),
        )
        return insert_sql, params
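SQL_DATETIME_FORMAT is imported from the project's settings.py but never shown here; any strftime pattern that MySQL's DATETIME type accepts will do, for example (assumed value):
# settings.py
SQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"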
pipelines.py
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors


class LagouspiderPipeline(object):
    def process_item(self, item, spider):
        return item


class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            password=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Use twisted to turn the MySQL insert into an asynchronous call
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)  # handle exceptions
        return item

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Perform the actual insert: build the SQL from the item itself,
        # so different item classes can supply different statements
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)
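For this pipeline to receive items and reach the database, the MYSQL_* settings read in from_settings must exist and the pipeline must be enabled in settings.py; a sketch of the relevant entries, with placeholder credentials and an assumed priority value:
# settings.py
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "lagou"
MYSQL_USER = "root"
MYSQL_PASSWORD = "your_password"

ITEM_PIPELINES = {
    "LaGouSpider.pipelines.MysqlTwistedPipeline": 300,
}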