Web Scraping Decryption (movie genres come back as a list and need converting)
A movie can have several genres; they come back as a list, so the value needs to be converted before it is stored. The runtime contains newlines and similar characters, so it has to be turned into a str and cleaned, the same handling as when the review data was fetched earlier.
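For example, the cleanup amounts to something like this (a minimal sketch; genre_list and raw_length are placeholder names for the values the spider extracts in parse_detail further down):

import json

genre_list = ['剧情', '爱情']            # placeholder for the extracted genre list
raw_length = '\n        115分钟\n    '   # placeholder for the raw runtime text

# the genre is a list, so serialise it before storing
film_type = json.dumps(genre_list, ensure_ascii=False)
# the runtime carries newlines and spaces, so cast to str and strip them
length = str(raw_length).replace('\n', '').replace(' ', '')
print(film_type, length)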
Reference page for the data that needs decrypting:
Idea:
First, request the page and download the font file it references. As in the earlier Dianping write-up, the trick is to download and reuse the font; each visit serves, and downloads, one font file. After downloading, build the corresponding dictionary and save the font into your own project so it can be opened again later. Render the saved font file into images with a drawing routine, then read the drawn characters out of those images. That gives a dictionary of digits keyed by character code; take the encoded characters from the page, check which ones the dictionary says to replace, replace them, and the encrypted data comes out readable.
Summary: on top of the baseline font obtained the Dianping way, draw it into images, recognise them, and compare the recognised text against the page!
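To make that concrete, here is a minimal sketch of turning one downloaded woff into a code-to-digit dictionary. It is not the project's own get_font; build_base_mapping, the canvas size and the Tesseract options are assumptions for illustration:

from fontTools.ttLib import TTFont
from PIL import Image, ImageDraw, ImageFont
from pytesseract import image_to_string

def build_base_mapping(font_path):
    """Render every glyph of the baseline woff to an image, OCR it,
    and return a dict of {code as it appears in the HTML: digit}."""
    font = TTFont(font_path)
    # convert the woff to a plain ttf so PIL's FreeType loader can open it
    font.flavor = None
    ttf_path = font_path + ".ttf"
    font.save(ttf_path)
    pil_font = ImageFont.truetype(ttf_path, 60)
    mapping = {}
    # the first two glyphs are usually placeholders ('glyph00000' and 'x'), skip them
    for name in font.getGlyphOrder()[2:]:  # e.g. 'uniE8E4'
        img = Image.new("RGB", (100, 100), "white")
        draw = ImageDraw.Draw(img)
        # draw the private-use character that this glyph name encodes
        draw.text((20, 20), chr(int(name[3:], 16)), fill="black", font=pil_font)
        digit = image_to_string(img, config="--psm 10 -c tessedit_char_whitelist=0123456789.").strip()
        # key in the same form the entity appears in the HTML source: '&#xe8e4'
        mapping["&#x" + name[3:].lower()] = digit
    return mapping

Because OCR on single glyphs can misread, it is worth checking the ten digits of this baseline mapping once by eye and fixing them by hand; after that the dictionary can be saved and reused, which is exactly what the summary above describes.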
On to the code:
猫眼.py
# -*- coding: utf-8 -*-
import json
import numpy
import os
import re

import requests
import scrapy
from PIL import Image, ImageDraw, ImageFont
from fontTools.ttLib import TTFont
from lxml import html
from pytesseract import image_to_string
from scrapy import Request

from myfilm import settings
from myfilm.items import MyfilmItem

# lxml.html exposes the etree module
etree = html.etree

class MaoyanSpider(scrapy.Spider):
    name = 'maoyan'
    allowed_domains = ['maoyan.com']
    # start_urls = ['http://maoyan.com/']
    fontstr = ""  # file name of the currently saved font
    headers = {
        # put your own browser's User-Agent here
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36"
    }
    # the cookie values are collected as described above; I keep them in settings
    cookies = settings.cookies
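    # What settings.cookies is assumed to look like (the key names and values
    # below are placeholders, not real Maoyan cookies; copy the actual
    # name/value pairs from your own logged-in browser session):
    # cookies = {
    #     "some_cookie_name": "value_copied_from_the_browser",
    #     "another_cookie_name": "value_copied_from_the_browser",
    # }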

    def start_requests(self):
        '''
        Override start_requests
        :return:
        '''
        urllist = [
            'https://maoyan.com/films?showType=3'
        ]
        for url in urllist:
            yield Request(url=url, headers=self.headers, cookies=self.cookies, callback=self.parse)

    def parse(self, response):
        '''
        Parse the movie list page
        :param response:
        :return:
        '''
        # grab the list of <dd> entries
        ddlist = response.xpath("//div[@class='container']//div[@class='movies-panel']//dl[@class='movie-list']//dd")
        # walk the list and collect the basic info of every movie
        for dd in ddlist:
            filmdic = {
                "id": dd.xpath(".//div[@class='movie-item']//a//@href").get() or "",
                "film_id": str(dd.xpath(".//div[@class='movie-item']//a//@href").get()).split('/')[2] or "",
                "cname": dd.xpath(".//div[contains(@class,'movie-item-title')]//a//text()").get() or "null",
                "thumb": dd.xpath(".//div[@class='movie-item']//a//img[last()]//@data-src").get() or "default",
                "score": dd.xpath(".//div[contains(@class,'channel-detail-orange')]//text()").get() or "0",
            }
            # print("filmdic=====", filmdic)
            # build the detail-page URL
            _url = "https://maoyan.com" + filmdic["id"]
            yield Request(url=_url, headers=self.headers, cookies=self.cookies, callback=self.parse_detail,
                          meta={"filmdic": filmdic})
        # check whether there is a next page: read the current page number from the active pager item
        page = response.xpath("//div[@class='container']//div[@class='movies-panel']//div[@class='movies-pager']//li[@class='active']//a//@class").get() or "null"
        # print("page=========", page)
        now_page = str(page).split('_')[1]
        lastpage = response.xpath("//div[@class='container']//div[@class='movies-panel']//div[@class='movies-pager']//li[last()]//a//@class").get() or "null"
        # print("lastpage=====", lastpage)
        last_page = str(lastpage).split('_')[1]
        if int(last_page) == int(now_page) + 1:
            # there is a next page
            href = response.xpath("//div[@class='container']//div[@class='movies-panel']//div[@class='movies-pager']//li[last()]//a//@href").get()  # relative path to append
            nexturl = "https://maoyan.com/films" + href
            # print("nexturl==== next page url", nexturl)
            yield Request(url=nexturl, headers=self.headers, cookies=self.cookies, callback=self.parse, dont_filter=True)

    def parse_detail(self, response):
        '''
        Parse the detail page
        :param response:
        :return:
        '''
        filmItem = MyfilmItem()
        # read the genre list and the runtime from the detail page
        type = response.xpath("//div[@class='movie-brief-container']//li[1]//a//text()").extract()
        length = response.xpath("//div[@class='movie-brief-container']//li[2]//text()").get()
        # print("type====", type, "length====", length)
        # values passed over from the list page
        filmdic = response.meta.get("filmdic")
        # first fill in the plain, unencrypted fields
        filmItem["film_id"] = filmdic["film_id"]
        filmItem["thumb"] = filmdic["thumb"]
        filmItem["cname"] = filmdic["cname"]
        filmItem["ename"] = response.xpath("//div[contains(@class,'ename')]//text()").get()
        filmItem["type"] = json.dumps(type)
        filmItem["length"] = str(length).replace('\n', '').replace(' ', '')
        filmItem["time"] = response.xpath("//div[@class='movie-brief-container']//li[3]//text()").get()
        filmItem["tickets_unit"] = response.xpath("//div[contains(@class,'celeInfo-right')]//div[@class='movie-index'][2]//span[@class='unit']//text()").get() or "none"
        # take the raw HTML of the whole page
        htmllist = response.text
        # print(htmllist)
        # regex out the font file used by this response
        font_file = re.findall(r'//vfile.meituan.net/colorstone/(\w+\.woff)', htmllist)[0]
        # check first whether a font was downloaded before, to avoid wasting space on duplicates
        if self.fontstr == "":
            # remember this font file name in the instance variable
            self.fontstr = font_file
        else:
            # delete the previous font first, then remember the new name
            os.remove("static/" + self.fontstr)
            self.fontstr = font_file
        # build the download URL
        fonturl = "https://vfile.meituan.net/colorstone/" + font_file
        # call the download helper to save this font file
        self.save_font(font_file, fonturl)
        # every response ships a different font file, and the order of codes and values inside it changes too;
        # the codes keep changing but the outline of each character is fixed, so the new font's glyphs have to
        # be compared against the glyphs of the first downloaded font (see the sketch after the spider code)
        fontdic = self.get_font(self.fontstr)
        # check which codes appear in the page and replace them with the digits from the dictionary
        for key in fontdic:
            if key in htmllist:
                htmllist = htmllist.replace(key, str(fontdic[key]))
        # print(htmllist)
        htmldata = etree.HTML(htmllist)
        # box-office figure
        ticketslist = htmldata.xpath("//div[contains(@class,'celeInfo-right')]//div[@class='movie-index'][2]//span[@class='stonefont']//text()")
        if ticketslist:
            # box office is present
            filmItem["ticketNumber"] = "".join(str(ticketslist[0]).split(';'))
        else:
            filmItem["ticketNumber"] = "暂无"
        # score and number of raters; if there is no score yet, set the count to 0
        if filmdic["score"] == "暂无评分":
            filmItem["score"] = filmdic["score"]
            filmItem["score_num"] = "0"
        else:
            # a score exists, read the numeric values
            index_left = htmldata.xpath("//div[contains(@class,'celeInfo-right')]//div[@class='movie-index'][1]//span[contains(@class,'index-left')]//span[@class='stonefont']//text()")
            filmItem["score"] = "".join(str(index_left[0]).split(';'))
            index_right = htmldata.xpath("//div[contains(@class,'celeInfo-right')]//div[@class='movie-index'][1]//div[@class='index-right']//span[@class='stonefont']//text()")
            filmItem["score_num"] = "".join(str(index_right[0]).split(';'))
        yield filmItem

    def save_font(self, file_name, url):
        '''
        Save the font file into the project
        :param file_name:
        :return:
        '''
        # make sure the target folder exists, create it if necessary
        dir = "static"
        self.creatDir(dir)
        savePath = dir + "/" + file_name
        response = requests.get(url)
        # write the content to the file
        with open(savePath, 'wb') as f:
            f.write(response.content)
            f.flush()
        return file_name

    def creatDir(self, dir):
        '''
        Check whether the given folder exists and create it if it does not
        :param dir:
        :return:
        '''
        dirlist = dir.split("/")
        # print("dirlist====", dirlist)
        for index, name in enumerate(dirlist):
            # print("index==", index, name)
            itemdir = os.path.join(os.getcwd(), name)
            # create the current folder if it does not exist
            if not os.path.exists(itemdir):
                os.mkdir(itemdir)
            # if the current folder is not the last level, step into it so the
            # next level is created inside it (assumed completion of the truncated line)
            if index != len(dirlist) - 1:
                os.chdir(itemdir)
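As referenced in the parse_detail comment, every newly downloaded font has to be matched against the first saved font by glyph outline. Here is a minimal sketch of that comparison; map_new_font and the key format are assumptions for illustration, not the spider's own get_font, and it relies on the outline coordinates of the same character being identical across font files, as the comment above claims:

from fontTools.ttLib import TTFont

def map_new_font(new_font_path, base_font_path, base_mapping):
    """For every glyph in the freshly downloaded font, find the glyph in the
    baseline font with the same outline and reuse its known digit."""
    new_font = TTFont(new_font_path)
    base_font = TTFont(base_font_path)
    result = {}
    for new_name in new_font.getGlyphOrder()[2:]:   # skip the placeholder glyphs
        new_glyph = new_font["glyf"][new_name]
        for base_key, digit in base_mapping.items():
            # base_mapping keys look like '&#xe8e4'; turn them back into glyph names
            base_glyph = base_font["glyf"]["uni" + base_key[3:].upper()]
            # identical outlines mean the same character, even though the code changed
            if new_glyph.coordinates == base_glyph.coordinates:
                result["&#x" + new_name[3:].lower()] = digit
                break
    return result

With base_mapping built once from the first font (for example with the build_base_mapping sketch near the top of this post), map_new_font returns the replacement dictionary for every later detail page.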