[Crawler] Collecting Meituan Shop Information - Detail Link Collection



  The previous article analyzed the page structure of Meituan's mobile site and the technical design: crawl the detail links first, then crawl the detail content. This article implements the collection of the detail links.

  First, take care of the data that never changes, such as the city data: collect it once up front and keep it in a database, a cache, or a file.
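  For instance, the city data can be collected once and dumped to a local JSON file, and the `citys(begin, count)` generator used by the crawl function below can simply read it back. A minimal sketch, assuming a hypothetical cache file `i_citys.json` and field names of my own choosing (not the original implementation):

import json

CITY_FILE = 'i_citys.json'  # hypothetical cache file: [{"id": 1, "name": "北京", "pinyin": "beijing"}, ...]

def save_citys(citys_data, filename=CITY_FILE):
    # dump the city list once; it rarely changes, so there is no need to re-request it
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(citys_data, f, ensure_ascii=False)

def citys(begin=0, count=1, filename=CITY_FILE):
    # yield (rowid, city_name, city_pinyin) for the slice of cities this worker is responsible for
    with open(filename, encoding='utf-8') as f:
        for city in json.load(f)[begin:begin + count]:
            yield city['id'], city['name'], city['pinyin']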

  

  If you think about it for a moment, you might assume the categories are fixed data as well. That is what I thought at first too, but while debugging I found that the categories differ from city to city: some categories simply do not exist in more remote cities, so the category list has to be requested once for every city.

  OK, here is the code first (copy it into an editor for easier reading):

  

import json
import re
import threading
from collections import OrderedDict

import requests
from lxml import etree

# load_location, citys, i_request, i_city_area, i_parse_shop_list, write_error_url,
# crawl_rate, mysqldb and the `category` list are project helpers defined elsewhere
# (some of them are shown further below).

def crawl_shop(begin=0, count=1, detail=False):
    crawl_rate_file = f'i_crawl_rate_correct_{threading.current_thread().name}.txt'  # checkpoint file for this thread
    error_urls_file = f'i_error_urls_{threading.current_thread().name}.txt'  # log of failed requests
    location = load_location(crawl_rate_file)  # load the checkpoint
    if not location:
        location = {'cityid': 0, 'kind1': 0, 'kind2': 0, 'areaid': 0, 'page': 0}
    for rowid, city_name, city_pinyin in citys(begin, count):  # city data was collected beforehand
        # resume point: skip cities that were already crawled
        if rowid < location['cityid']:
            continue
        with requests.session() as session:
            # fetch the categories for this city
            category_url = f'https://i.meituan.com/category?city={city_pinyin}'
            cate_parser = etree.HTML(i_request(session, category_url).text)  # wrapped request function
            ikinds = OrderedDict()
            # `category` is the list of categories to collect; only those are crawled
            for kind in category:
                cate_node = cate_parser.xpath(f'//h4[contains(text(),"{kind}")]/following-sibling::ul[1]/li')
                for li in cate_node:
                    text = li.xpath('./a/text()')[0].strip()
                    href = li.xpath('./a/@href')[0]
                    if text == '全部' or 'cateType=poi' not in href:
                        continue
                    ikinds.setdefault(kind, []).append({text: re.search('cid=(.*?)&', href).group(1)})
            for index1, (kind1, kind2s) in enumerate(ikinds.items(), 1):
                # resume point: first-level category
                if location['cityid'] == rowid and index1 < location['kind1']:
                    continue
                for index2, kitem in enumerate(kind2s, 1):
                    # resume point: second-level category
                    if location['cityid'] == rowid and location['kind1'] == index1 and index2 < location['kind2']:
                        continue
                    kind2, cid = list(kitem.items())[0]
                    area_url = f'https://i.meituan.com/{city_pinyin}/all/?cid={cid}'
                    city_area = i_city_area(session, area_url)
                    for area in city_area:
                        # resume point: city area
                        if location['cityid'] == rowid and location['kind1'] == index1 and \
                                location['kind2'] == index2 and area['id'] < location['areaid']:
                            continue
                        # crawl each city area page by page;
                        # resume point: the page inside the area that was interrupted
                        if location['cityid'] == rowid and location['kind1'] == index1 and \
                                location['kind2'] == index2 and area['id'] == location['areaid']:
                            page = location['page'] + 1
                        else:
                            page = 1
                        while True:
                            datas = []
                            shop_list_url = f'https://i.meituan.com/select/{city_pinyin}/page_{page}.html?cid={cid}&bid={area["id"]}&sid=rating&p={page}&bizType=area&csp=&cateType=poi&stid_b=_b2&nocount=true'
                            print(f'city: {city_name}, kind1: {kind1}, kind2: {kind2}, area: {area["name"]}, page: {page}, url: {shop_list_url}')
                            try:
                                res = i_request(session, shop_list_url)
                            except Exception as e:
                                write_error_url(json.dumps({
                                    'shop_list_url': shop_list_url, 'kind1': kind1, 'kind2': kind2, 'cid': cid,
                                    'bid': area['id'],
                                    'area': area['name'] if area['name'] is not None else city_name, 'city': city_name
                                }), e, filename=error_urls_file)
                                page += 1
                                continue
                            if '暂无此类团购,请查看其他分类' in res.text:
                                break
                            for shop_url, shop_name in i_parse_shop_list(res.text):
                                shop = {'name': shop_name, 'crawled': 0, 'deleted': 0}
                                shop['kind1'], shop['kind2'] = kind1, kind2
                                shop['cid'], shop['bid'] = cid, area['id']
                                shop['area'], shop['city'] = area['name'] if area['name'] is not None else city_name, city_name
                                shop['url'] = shop_url
                                datas.append(shop)
                            # write to the database once per page
                            if datas:
                                with mysqldb() as db:
                                    sql = f'insert into i_shop({",".join(datas[0].keys())}) ' \
                                          f'values ({",".join(map(lambda k: "%({})s".format(k), datas[0].keys()))}) ' \
                                          f'on duplicate key update name=values(name), ' \
                                          f'kind1=values(kind1),kind2=values(kind2),area=values(area),city=values(city),cid=values(cid),bid=values(bid),version=version+1'
                                    db.executemany(sql, datas)
                            # record progress (checkpoint) after every page
                            crawl_rate(
                                json.dumps({'cityid': rowid, 'kind1': index1, 'kind2': index2, 'areaid': area['id'], 'page': page}),
                                filename=crawl_rate_file
                            )
                            # pagination: stop when there is no "下一页" (next page) link
                            parser = etree.HTML(res.text)
                            next_page = parser.xpath('//a[contains(text(),"下一页")]/@href')
                            if not next_page:
                                break
                            page += 1

  The logic of the code above: first fetch the city data to be collected, loop over the cities and request each one, fetch that city's categories, then loop over the areas under each category, and within each area page through the listings to collect shop detail links, saving the collected data to MySQL. The shop list pages themselves are parsed by `i_parse_shop_list`, sketched below.
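  `i_parse_shop_list` is not shown in the article. The following is only a sketch of what such a generator could look like, assuming each shop entry on the list page is an `<a>` element linking to the detail page; the XPath and URL pattern here are assumptions, not the original selectors:

from urllib.parse import urljoin

from lxml import etree

def i_parse_shop_list(html):
    # assumption: shop links on the list page point at detail pages under /poi/;
    # adjust the XPath to whatever the real list-page markup is
    parser = etree.HTML(html)
    for a in parser.xpath('//a[contains(@href, "/poi/")]'):
        shop_name = ''.join(a.xpath('.//text()')).strip()
        if shop_name:
            yield urljoin('https://i.meituan.com/', a.get('href')), shop_name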

  There are several levels of nesting here. To avoid collecting the same data twice, we record the position of every loop level: after each page is collected, a checkpoint is written, so that the next time the script restarts it can `continue` past the positions that have already been crawled.
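  `load_location` and `crawl_rate` are the two checkpoint helpers referenced above but not shown. A minimal sketch, assuming the checkpoint is a single JSON object overwritten in the per-thread file named at the top of `crawl_shop`:

import json
import os

def crawl_rate(content, filename):
    # overwrite the checkpoint file with the latest position (one JSON object per thread)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)

def load_location(filename):
    # return the last recorded position, or None on a fresh start
    if not os.path.exists(filename):
        return None
    with open(filename, encoding='utf-8') as f:
        content = f.read().strip()
    return json.loads(content) if content else None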

  Here are the two wrapped helper functions:

  

@retry(stop_max_attempt_number=5, wait_random_min=200, wait_random_max=330, retry_on_exception=retry_callback)
def i_request(session, url):
    on_proxy(session)  # optionally switch proxy before the request
    res = session.get(url, timeout=10)
    if 'Forbidden' in res.text and res.status_code == 403:
        raise Exception('403 Forbidden')
    return res

  The request function is wrapped with the `retry` decorator: whenever an exception is raised inside the function the call is retried, and the error is only raised once the maximum number of attempts is reached. This decorator is very useful when writing crawler requests; when a timeout or some other transient problem causes a failure, retrying at random intervals works well.
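  The decorator comes from the `retrying` library; `retry_on_exception` expects a callable that receives the raised exception and returns True when the call should be retried. `retry_callback` and `on_proxy` are not shown in the article, so the following is only a sketch of what they might look like, with the proxy handling left as an illustrative stub:

from retrying import retry  # pip install retrying

def retry_callback(exception):
    # retry on any exception (timeouts, connection errors, the 403 raised above)
    print(f'request failed, will retry: {exception}')
    return True

def on_proxy(session):
    # illustrative stub: switch to a fresh proxy before each request if a proxy pool is available,
    # e.g. session.proxies = {'https': get_proxy_from_pool()}
    pass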

  

@contextmanager
def mysqldb(database='meituan'):
    conn, cursor = None, None
    try:
        conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='xxxxx',
            database=database,
            charset='utf8'
        )
        cursor = conn.cursor()
        yield cursor
        conn.commit()
    except Exception as e:
        print(e)
        if conn is not None:
            conn.rollback()
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

  A context manager for database operations is implemented with `contextmanager`; for more on context managers, see the earlier article [python] Context Managers.
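  Usage is then just a `with` block; the cursor is committed and closed automatically when the block exits, for example:

with mysqldb() as db:
    db.execute('select count(*) from i_shop where crawled = 0')
    print(db.fetchone())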
