[Crawler] Collecting Meituan Shop Information - Detail Link Collection



  The previous article analyzed the page structure of Meituan's mobile site and the technical design: crawl the detail links first, then crawl the detail content. This article implements the collection of the detail links.

  First, take care of the data that never changes, such as the city data: collect it once up front and keep it in a database, a cache, or a file.
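  For instance, the city data can be collected once and dumped to a local JSON file, and the `citys(begin, count)` generator used by the crawl function below can simply read it back. A minimal sketch, assuming a hypothetical cache file `i_citys.json` and field names of my own choosing (not the original implementation):

import json

CITY_FILE = 'i_citys.json'  # hypothetical cache file: [{"id": 1, "name": "北京", "pinyin": "beijing"}, ...]

def save_citys(citys_data, filename=CITY_FILE):
    # dump the city list once; it rarely changes, so there is no need to re-request it
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(citys_data, f, ensure_ascii=False)

def citys(begin=0, count=1, filename=CITY_FILE):
    # yield (rowid, city_name, city_pinyin) for the slice of cities this worker is responsible for
    with open(filename, encoding='utf-8') as f:
        for city in json.load(f)[begin:begin + count]:
            yield city['id'], city['name'], city['pinyin']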

  

  If you think about it for a moment, you might assume the categories are fixed data as well. That is what I thought at first too, but while debugging I found that the categories differ from city to city: some categories simply do not exist in more remote cities, so the category list has to be requested once for every city.

  OK, here is the code first (copy it into an editor for easier reading):

  

import json
import re
import threading
from collections import OrderedDict

import requests
from lxml import etree

# load_location, citys, i_request, i_city_area, i_parse_shop_list, write_error_url,
# crawl_rate, mysqldb and the `category` list are project helpers defined elsewhere
# (some of them are shown further below).

def crawl_shop(begin=0, count=1, detail=False):
    crawl_rate_file = f'i_crawl_rate_correct_{threading.current_thread().name}.txt'  # checkpoint file for this thread
    error_urls_file = f'i_error_urls_{threading.current_thread().name}.txt'  # log of failed requests
    location = load_location(crawl_rate_file)  # load the checkpoint
    if not location:
        location = {'cityid': 0, 'kind1': 0, 'kind2': 0, 'areaid': 0, 'page': 0}
    for rowid, city_name, city_pinyin in citys(begin, count):  # city data was collected beforehand
        # resume point: skip cities that were already crawled
        if rowid < location['cityid']:
            continue
        with requests.session() as session:
            # fetch the categories for this city
            category_url = f'https://i.meituan.com/category?city={city_pinyin}'
            cate_parser = etree.HTML(i_request(session, category_url).text)  # wrapped request function
            ikinds = OrderedDict()
            # `category` is the list of categories to collect; only those are crawled
            for kind in category:
                cate_node = cate_parser.xpath(f'//h4[contains(text(),"{kind}")]/following-sibling::ul[1]/li')
                for li in cate_node:
                    text = li.xpath('./a/text()')[0].strip()
                    href = li.xpath('./a/@href')[0]
                    if text == '全部' or 'cateType=poi' not in href:
                        continue
                    ikinds.setdefault(kind, []).append({text: re.search('cid=(.*?)&', href).group(1)})
            for index1, (kind1, kind2s) in enumerate(ikinds.items(), 1):
                # resume point: first-level category
                if location['cityid'] == rowid and index1 < location['kind1']:
                    continue
                for index2, kitem in enumerate(kind2s, 1):
                    # resume point: second-level category
                    if location['cityid'] == rowid and location['kind1'] == index1 and index2 < location['kind2']:
                        continue
                    kind2, cid = list(kitem.items())[0]
                    area_url = f'https://i.meituan.com/{city_pinyin}/all/?cid={cid}'
                    city_area = i_city_area(session, area_url)
                    for area in city_area:
                        # resume point: city area
                        if location['cityid'] == rowid and location['kind1'] == index1 and \
                                location['kind2'] == index2 and area['id'] < location['areaid']:
                            continue
                        # crawl each city area page by page;
                        # resume point: the page inside the area that was interrupted
                        if location['cityid'] == rowid and location['kind1'] == index1 and \
                                location['kind2'] == index2 and area['id'] == location['areaid']:
                            page = location['page'] + 1
                        else:
                            page = 1
                        while True:
                            datas = []
                            shop_list_url = f'https://i.meituan.com/select/{city_pinyin}/page_{page}.html?cid={cid}&bid={area["id"]}&sid=rating&p={page}&bizType=area&csp=&cateType=poi&stid_b=_b2&nocount=true'
                            print(f'city: {city_name}, kind1: {kind1}, kind2: {kind2}, area: {area["name"]}, page: {page}, url: {shop_list_url}')
                            try:
                                res = i_request(session, shop_list_url)
                            except Exception as e:
                                write_error_url(json.dumps({
                                    'shop_list_url': shop_list_url, 'kind1': kind1, 'kind2': kind2, 'cid': cid,
                                    'bid': area['id'],
                                    'area': area['name'] if area['name'] is not None else city_name, 'city': city_name
                                }), e, filename=error_urls_file)
                                page += 1
                                continue
                            if '暂无此类团购,请查看其他分类' in res.text:
                                break
                            for shop_url, shop_name in i_parse_shop_list(res.text):
                                shop = {'name': shop_name, 'crawled': 0, 'deleted': 0}
                                shop['kind1'], shop['kind2'] = kind1, kind2
                                shop['cid'], shop['bid'] = cid, area['id']
                                shop['area'], shop['city'] = area['name'] if area['name'] is not None else city_name, city_name
                                shop['url'] = shop_url
                                datas.append(shop)
                            # write to the database once per page
                            if datas:
                                with mysqldb() as db:
                                    sql = f'insert into i_shop({",".join(datas[0].keys())}) ' \
                                          f'values ({",".join(map(lambda k: "%({})s".format(k), datas[0].keys()))}) ' \
                                          f'on duplicate key update name=values(name), ' \
                                          f'kind1=values(kind1),kind2=values(kind2),area=values(area),city=values(city),cid=values(cid),bid=values(bid),version=version+1'
                                    db.executemany(sql, datas)
                            # record progress (checkpoint) after every page
                            crawl_rate(
                                json.dumps({'cityid': rowid, 'kind1': index1, 'kind2': index2, 'areaid': area['id'], 'page': page}),
                                filename=crawl_rate_file
                            )
                            # pagination: stop when there is no "下一页" (next page) link
                            parser = etree.HTML(res.text)
                            next_page = parser.xpath('//a[contains(text(),"下一页")]/@href')
                            if not next_page:
                                break
                            page += 1

  The logic of the code above: first fetch the city data to be collected, loop over the cities and request each one, fetch that city's categories, then loop over the areas under each category, and within each area page through the listings to collect shop detail links, saving the collected data to MySQL. The shop list pages themselves are parsed by `i_parse_shop_list`, sketched below.
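  `i_parse_shop_list` is not shown in the article. The following is only a sketch of what such a generator could look like, assuming each shop entry on the list page is an `<a>` element linking to the detail page; the XPath and URL pattern here are assumptions, not the original selectors:

from urllib.parse import urljoin

from lxml import etree

def i_parse_shop_list(html):
    # assumption: shop links on the list page point at detail pages under /poi/;
    # adjust the XPath to whatever the real list-page markup is
    parser = etree.HTML(html)
    for a in parser.xpath('//a[contains(@href, "/poi/")]'):
        shop_name = ''.join(a.xpath('.//text()')).strip()
        if shop_name:
            yield urljoin('https://i.meituan.com/', a.get('href')), shop_name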

  There are several levels of nesting here. To avoid collecting the same data twice, we record the position of every loop level: after each page is collected, a checkpoint is written, so that the next time the script restarts it can `continue` past the positions that have already been crawled.
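  `load_location` and `crawl_rate` are the two checkpoint helpers referenced above but not shown. A minimal sketch, assuming the checkpoint is a single JSON object overwritten in the per-thread file named at the top of `crawl_shop`:

import json
import os

def crawl_rate(content, filename):
    # overwrite the checkpoint file with the latest position (one JSON object per thread)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)

def load_location(filename):
    # return the last recorded position, or None on a fresh start
    if not os.path.exists(filename):
        return None
    with open(filename, encoding='utf-8') as f:
        content = f.read().strip()
    return json.loads(content) if content else None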

  Here are the two wrapped helper functions:

  

@retry(stop_max_attempt_number=5, wait_random_min=200, wait_random_max=330, retry_on_exception=retry_callback)
def i_request(session, url):
    on_proxy(session)  # optionally switch proxy before the request
    res = session.get(url, timeout=10)
    if 'Forbidden' in res.text and res.status_code == 403:
        raise Exception('403 Forbidden')
    return res

  The request function is wrapped with the `retry` decorator: whenever an exception is raised inside the function the call is retried, and the error is only raised once the maximum number of attempts is reached. This decorator is very useful when writing crawler requests; when a timeout or some other transient problem causes a failure, retrying at random intervals works well.
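  The decorator comes from the `retrying` library; `retry_on_exception` expects a callable that receives the raised exception and returns True when the call should be retried. `retry_callback` and `on_proxy` are not shown in the article, so the following is only a sketch of what they might look like, with the proxy handling left as an illustrative stub:

from retrying import retry  # pip install retrying

def retry_callback(exception):
    # retry on any exception (timeouts, connection errors, the 403 raised above)
    print(f'request failed, will retry: {exception}')
    return True

def on_proxy(session):
    # illustrative stub: switch to a fresh proxy before each request if a proxy pool is available,
    # e.g. session.proxies = {'https': get_proxy_from_pool()}
    pass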

  

@contextmanager
def mysqldb(database='meituan'):
    conn, cursor = None, None
    try:
        conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='xxxxx',
            database=database,
            charset='utf8'
        )
        cursor = conn.cursor()
        yield cursor
        conn.commit()
    except Exception as e:
        print(e)
        if conn is not None:
            conn.rollback()
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

  A context manager for database operations is implemented with `contextmanager`; for more on context managers, see the earlier article [python] Context Managers.
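  Usage is then just a `with` block; the cursor is committed and closed automatically when the block exits, for example:

with mysqldb() as db:
    db.execute('select count(*) from i_shop where crawled = 0')
    print(db.fetchone())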
