php抓取网页动态数据(动态网站和静态网站的区别在于什么是动态信息文件 )

优采云发布时间: 2021-10-20 11:11

　　php抓取网页动态数据(动态网站和静态网站的区别在于什么是动态信息文件)

　　1.什么是动态网站？

　　动态网站和静态网站的区别在于，动态网站的网页往往包含JS、CSS等具有动态效果的内容或文件，它们也是网页的有机组成部分。那么浏览器是如何处理这些额外文件的呢？浏览器首先下载html文件，然后根据需要自动下载js等附加文件。如果我们要抓取这些网页中的动态信息，就需要自己构造请求去获取这些数据。

　　2.如何找到这些动态效果的附加文件？

　　例子：

　　我们先打开一个电影网站（例如下文代码所抓取的时光网 mtime.com）的电影页面，然后按F12，在开发者工具中找到“网络”选项，我用的是FireFox，如图：

　　刷新：

　　可以发现，除了html文件外，网页中还加载了其他文件，如CSS、JS等。

　　例如，网页中的评分信息就是由上图中我点击后变成蓝色的那一行js文件动态加载的。那么我是怎么找到这个文件的呢？不好意思，目前只能说是凭经验：网站中动态加载的信息大部分在js文件中，所以我们可以逐个查看js和xhr类型请求的“响应”正文，看其中是否有我们要的数据，一般都能找到。

　　3.这些动态信息文件有什么用，如何下载？

　　如前所述，这些文件的作用是动态加载网页信息。比如上图中的“票房：10.25亿元”就是我们要爬取的数据，但它并不存在于网页的html源码中，所以可以推测它是由这些js等文件动态加载的。

　　如何下载它们？先点击文件，再点击“消息头”，会看到“消息头”、“Cookie”、“参数”等按钮，如图：

　　然后我们就可以看到这个js文件的请求地址了。请求 URL 的结构是有规律的，动态变化的只有3个部分，即电影的URL、时间和电影的编号。其中电影的URL和电影编号可以从静态html网页中获取，时间可以自己构造，然后访问js文件的URL获取数据。数据以字典的形式呈现，内容在“响应”中，如图：

　　我们可以使用json模块来处理，比较方便。

　　4.以下是项目的结构和代码： 4.1. 目录结构：

　　4.2.代码模块：

　　1.HtmlDownloader模块中的下载方法用于下载网页信息：

　　import requests

import chardet

class HtmlDownloader(object):
    """Download the text of a web page over HTTP using ``requests``."""

    # Browser-like User-Agent sent with every request ("MSIE", not "MISE").
    USER_AGENT = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'

    def download(self, url, timeout=10):
        """Fetch ``url`` and return the response body as text.

        :param url: address to request; ``None`` or an empty string yields
            ``None`` without touching the network.
        :param timeout: seconds to wait for the server before giving up, so
            a stalled connection cannot hang the crawl forever.
        :return: the body decoded as UTF-8 when the server answers 200,
            otherwise ``None`` (non-200 status or a network error).
        """
        if not url:
            return None

        headers = {'User-Agent': self.USER_AGENT}
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # Failure is already signalled to callers by returning None, so
            # network errors are reported the same way instead of escaping.
            print('Download failed:', url, e)
            return None

        if response.status_code != 200:
            return None

        # Force UTF-8 rather than trusting the encoding guessed by requests.
        response.encoding = 'utf-8'
        return response.text

　　2.HtmlParser模块根据首页的电影网址找到所有js动态文件，然后下载我们需要的数据。

　　import re

from bs4 import BeautifulSoup

import json

class HtmlParser(object):
    """Extract movie links and rating/box-office records from mtime.com data.

    Every record returned by :meth:`parser_json` is a 14-tuple in this order:
    MovieId, movieTitle, RatingFinal, ROtherFinal, RPictureFinal,
    RDirectoryFinal, RStoryFinal, Usercount, AttitudeCount, total box office,
    today's box office, Rank, ShowDays, isRelease.
    """

    # Group 1 is the full movie URL, group 2 the numeric movie id. The dots
    # are escaped so they only match a literal "." in the host name.
    _MOVIE_URL_RE = re.compile(r'(http://movie\.mtime\.com/(\d+)/)')

    # Captures the text between the first "=" and the next ";".
    _PAYLOAD_RE = re.compile(r'=(.*?);')

    def parser_url(self, page_url, response):
        """Collect the movie links found in a page.

        :param page_url: address the page was downloaded from (unused here).
        :param response: page text; ``None`` or empty is treated as no links.
        :return: list of unique ``(url, movie_id)`` tuples in order of first
            appearance, or ``None`` when no link is found.
        """
        if not response:
            return None
        urls = self._MOVIE_URL_RE.findall(response)
        if not urls:
            return None
        # dict.fromkeys removes duplicates while keeping a stable order.
        return list(dict.fromkeys(urls))

    def parser_json(self, page_url, response):
        """Parse the rating response of one movie.

        :param page_url: request address, only used in diagnostics.
        :param response: response text in which the JSON document sits
            between the first "=" and the following ";".
        :return: a 14-tuple record (see the class docstring), or ``None``
            when the response is missing or cannot be parsed.
        """
        if not response:
            return None
        match = self._PAYLOAD_RE.search(response)
        if match is None or not match.group(1):
            return None

        try:
            value = json.loads(match.group(1))
            isRelease = value.get('value').get('isRelease')
        except (ValueError, AttributeError):
            # ValueError: invalid JSON; AttributeError: unexpected structure.
            print('json异常')
            return None

        if not isRelease:
            return self._parser_no_release(page_url, value)
        if value.get('value').get('hotValue') is None:
            return self._parser_release(page_url, value)
        # Flagged as released but still carrying a hotValue block: stored
        # with isRelease=2 (presumably an upcoming movie - confirm).
        return self._parser_no_release(page_url, value, isRelease=2)

    @staticmethod
    def _rating_fields(value):
        """Return the nine leading columns shared by both record layouts."""
        info = value.get('value')
        rating = info.get('movieRating')
        return (
            rating.get('MovieId'),
            info.get('movieTitle'),
            rating.get('RatingFinal'),
            rating.get('ROtherFinal'),
            rating.get('RPictureFinal'),
            rating.get('RDirectoryFinal'),
            rating.get('RStoryFinal'),
            rating.get('Usercount'),
            rating.get('AttitudeCount'),
        )

    def _parser_release(self, page_url, value):
        """Build the record of a movie that has already been released.

        :param page_url: movie request address, printed when parsing fails.
        :param value: decoded JSON data.
        :return: the 14-tuple record with ``isRelease`` set to 1, or ``None``
            when an expected section is missing or malformed.
        """
        try:
            box_office = value.get('value').get('boxOffice')
            total = (box_office.get('TotalBoxOffice')
                     + box_office.get('TotalBoxOfficeUnit'))
            today = (box_office.get('TodayBoxOffice')
                     + box_office.get('TodayBoxOfficeUnit'))
            return self._rating_fields(value) + (
                total,
                today,
                # Read the real ranking; it used to be copied from ShowDays.
                box_office.get('Rank', 0),
                box_office.get('ShowDays'),
                1,
            )
        except (AttributeError, TypeError):
            # AttributeError: a nested section is absent (None);
            # TypeError: an amount and its unit cannot be concatenated.
            print(page_url, value)
            return None

    def _parser_no_release(self, page_url, value, isRelease=0):
        """Build the record of a movie without box-office data.

        :param page_url: movie request address, printed when parsing fails.
        :param value: decoded JSON data.
        :param isRelease: flag stored in the last column of the record.
        :return: the 14-tuple record whose box-office columns are u'无' and
            whose ShowDays is 0, or ``None`` when the rating section is
            missing.
        """
        try:
            try:
                rank = value.get('value').get('hotValue').get('Ranking')
            except AttributeError:
                # No hotValue section available: fall back to rank 0.
                rank = 0
            return self._rating_fields(value) + (
                u'无', u'无', rank, 0, isRelease,
            )
        except (AttributeError, TypeError):
            print(page_url, value)
            return None

　　3.DataOutput 模块用于在数据库表中存储数据。

　　import sqlite3

class DataOutput(object):
    """Buffer parsed movie records and persist them in a SQLite table."""

    def __init__(self, db_path='MTime.db'):
        """Open the database and (re)create the ``MTime`` table.

        :param db_path: SQLite database file; ``':memory:'`` also works.
        """
        self.cx = sqlite3.connect(db_path)
        self.create_table('MTime')
        # Records waiting to be written by output_db().
        self.datas = []

    def create_table(self, table_name):
        """Drop ``table_name`` if it exists and create it empty.

        :param table_name: name of the table; it is interpolated into the
            SQL text, so it must come from trusted code, never user input.
        :return: None
        """
        values = '''
        id integer primary key,
        MovieId integer,
        MovieTitle varchar(40) NULL,
        RatingFinal REAL NULL DEFAULT 0.0,
        ROtherFinal REAL NULL DEFAULT 0.0,
        RPictureFinal REAL NULL DEFAULT 0.0,
        RDirectoryFinal REAL NULL DEFAULT 0.0,
        RStoryFinal REAL NULL DEFAULT 0.0,
        Usercount integer NULL DEFAULT 0,
        AttitudeCount integer NULL DEFAULT 0,
        TotalBoxOffice varchar(20) NULL,
        TodayBoxOffice varchar(20) NULL,
        Rank integer NULL DEFAULT 0,
        ShowDays integer NULL DEFAULT 0,
        isRelease integer NULL
        '''
        self.cx.execute("DROP TABLE IF EXISTS %s" % table_name)
        self.cx.execute("CREATE TABLE %s( %s );" % (table_name, values))

    def store_data(self, data):
        """Queue one record and flush once more than 10 are pending.

        :param data: 14-item record matching the INSERT column order used by
            output_db(); ``None`` is ignored.
        :return: None
        """
        if data is None:
            return
        self.datas.append(data)
        print('passby')
        if len(self.datas) > 10:
            self.output_db('MTime')
            print('Output successfully!')

    def output_db(self, table_name):
        """Write every pending record to ``table_name`` and empty the buffer.

        :param table_name: destination table (trusted name, see
            create_table()).
        :return: None
        """
        sql = ("INSERT INTO %s (MovieId, MovieTitle,"
               "RatingFinal, ROtherFinal, RPictureFinal,"
               "RDirectoryFinal, RStoryFinal, Usercount,"
               "AttitudeCount, TotalBoxOffice, TodayBoxOffice,"
               "Rank, ShowDays, isRelease) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
               "" % table_name)
        # The connection context manager commits on success and rolls back
        # on error, so a failed batch is never half-written.
        with self.cx:
            self.cx.executemany(sql, self.datas)
        # Clear only after the batch is stored. Removing items while looping
        # over the same list skipped every other record.
        self.datas = []

    def output_end(self):
        """Flush the remaining records and close the database.

        :return: None
        """
        try:
            if self.datas:
                self.output_db('MTime')
        finally:
            # Close the connection even if the last flush fails.
            self.cx.close()

　　4.SpiderMan 模块用于调用各个模块，实现功能的统一。

　　import time

from the_python_spider_for_dynamic_websites.HtmlDownloader import HtmlDownloader

from the_python_spider_for_dynamic_websites.HtmlParser import HtmlParser

from the_python_spider_for_dynamic_websites.DataOutput import DataOutput

class SpiderMan(object):
    """Coordinate the downloader, parser and storage modules for one crawl."""

    # Address of the rating/box-office service. The placeholders are the
    # movie page URL, a timestamp and the movie id, in that order.
    _RATING_URL = ('http://service.library.mtime.com/Movie.api'
                   '?Ajax_CallBack=true'
                   '&Ajax_CallBackType=MTime.Library.Services'
                   '&Ajax_CallBackMethod=GetMovieOverviewRating'
                   '&Ajax_CrossDomain=1'
                   '&Ajax_RequestUrl=%s'
                   '&t=%s'
                   '&Ajax_CallBackArgument0=%s')

    def __init__(self):
        # Was misspelled "__int__", which defined an int() hook instead of
        # the constructor.
        pass

    def crawl(self, root_url):
        """Crawl every movie linked from ``root_url`` and store its record.

        :param root_url: page whose movie links are followed.
        :return: None
        """
        downloader = HtmlDownloader()
        parser = HtmlParser()
        output = DataOutput()
        try:
            content = downloader.download(root_url)
            # parser_url() returns None when nothing is found; treat a failed
            # download the same way so the loop below simply does nothing.
            urls = parser.parser_url(root_url, content) if content else None

            # Build the rating/box-office request for each movie link.
            for url in urls or []:
                try:
                    movie_url, movie_id = url[0], url[1]
                    print(movie_url, movie_id)
                    # Current local time with a fixed "3282" suffix.
                    t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                    rank_url = self._RATING_URL % (movie_url, t, movie_id)
                    rank_content = downloader.download(rank_url)
                    print("rank_content:", rank_content)
                    data = parser.parser_json(rank_url, rank_content)
                    print("data:", data)
                    output.store_data(data)
                except Exception as e:
                    # One failing movie must not abort the whole crawl.
                    print("Crawl failed:", e)
        finally:
            # Always flush pending records and close the database.
            output.output_end()
        print("Crawl finish")

# Script entry point: crawl the Beijing theater listing when run directly.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('http://theater.mtime.com/China_Beijing/')

　　最后放一张爬取数据的图片：

0

2021-10-20

php抓取网页动态数据

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

php抓取网页动态数据(动态网站和静态网站的区别在于什么是动态信息文件 )

0 个评论

发起人

AI时代内容工厂

php抓取网页动态数据(动态网站和静态网站的区别在于什么是动态信息文件 )

0 个评论

发起人

相关问题