ajax抓取网页内容(渲染引擎的基本流程和流程、流程介绍及流程 )
优采云 发布时间: 2021-10-09 14:17ajax抓取网页内容(渲染引擎的基本流程和流程、流程介绍及流程
)
文章内容
介绍
本文介绍了动态页面和Ajax渲染页面数据捕获的示例,以及相应的页面分析过程。
Ajax 抓取示例
越来越多网页的原始 HTML 文档不包含任何数据,数据而是由 Ajax 统一加载。网页通过发送 Ajax 请求来更新内容的过程:
打开浏览器的开发者工具,进入 Network 选项卡,使用 XHR 过滤器筛选请求。运行前需要根据 all_config_file.py 文件创建对应的文件夹(例如日志目录 logs/),修改配置,并启动相关服务(例如本地的 MongoDB)。
all_config_file.py
#coding=utf-8
# Module metadata: author name and a short description of this file.
__author__ = 'Mr数据杨'
__explain__ = '各目标网站爬虫脚本配置文件'
# Load the modules this configuration file depends on
import time
import pymongo
import pandas as pd
def news_page_num():
    """Ask the operator how many pages to crawl for each site.

    Prompts on standard input and converts the reply with ``int``.

    Returns:
        int: The number typed by the operator.

    Raises:
        ValueError: If the reply is not a valid integer literal.
    """
    reply = input("输入每个网站页面爬取的页面数:")
    return int(reply)
def title_error_num():
    """Ask the operator for the maximum number of title errors.

    Prompts on standard input and converts the reply with ``int``.

    Returns:
        int: The number typed by the operator.

    Raises:
        ValueError: If the reply is not a valid integer literal.
    """
    # Use a distinct local name so the function's own name is not shadowed.
    reply = input("输入错误标题爬取最大数:")
    return int(reply)
def body_error_num():
    """Ask the operator for the maximum number of page (body) errors.

    Prompts on standard input and converts the reply with ``int``.

    Returns:
        int: The number typed by the operator.

    Raises:
        ValueError: If the reply is not a valid integer literal.
    """
    # Use a distinct local name so the function's own name is not shadowed.
    reply = input("输入错误页面爬取最大数:")
    return int(reply)
def mongodb_client(host="localhost", port=27017, db_name="news"):
    """Create a MongoDB client and return one of its databases.

    Called with no arguments this behaves as before: a client for
    ``localhost:27017`` is created and its ``news`` database is returned.
    A new ``pymongo.MongoClient`` is constructed on every call.

    Args:
        host: Server host name passed to ``pymongo.MongoClient``.
        port: Server port passed to ``pymongo.MongoClient``.
        db_name: Name of the database to select on the client.

    Returns:
        The database object selected from the new client.
    """
    # Obtain the MongoClient object
    client = pymongo.MongoClient(host, port)
    # Select the database to use (client["news"] is the same as client.news)
    db = client[db_name]
    print("加载MongoDB数据库完毕......")
    return db
# Module-level database handle, created when this module is imported;
# get_title_links_from_MongoDB reads it as a global.
db=mongodb_client()
def time_today():
    """Return today's date in local time as a ``YYYY-MM-DD`` string.

    Also prints a progress message each time it is called.

    Returns:
        str: The current local date, e.g. ``'2021-10-09'``.
    """
    # Global helper. strftime() without a time tuple formats the current
    # local time, so the explicit localtime(time.time()) is unnecessary.
    today = time.strftime('%Y-%m-%d')
    print("加载全局日期函数完毕......")
    return today
# Error log helper (title errors)
def error_text_title(text, time_today):
    """Append one line to the day's title-error log file.

    The line is written to ``logs/<time_today> news_title_error.txt``
    relative to the current working directory. A progress message is
    printed on every call.

    Args:
        text: The message to log; a newline is appended.
        time_today: Date string used as the file-name prefix.
    """
    import os  # local import so this block does not depend on file-level imports

    print("加载错误信息日志完毕......")
    # Create the log directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs("logs", exist_ok=True)
    log_path = "logs/" + time_today + " news_title_error.txt"
    # Explicit UTF-8 so Chinese text is written the same way on every platform.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(text + '\n')
# Error log helper (page body errors)
def error_text_body(text, time_today):
    """Append one line to the day's body-error log file.

    The line is written to ``logs/<time_today> news_body_error.txt``
    relative to the current working directory.

    Args:
        text: The message to log; a newline is appended.
        time_today: Date string used as the file-name prefix.
    """
    import os  # local import so this block does not depend on file-level imports

    # Create the log directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs("logs", exist_ok=True)
    log_path = "logs/" + time_today + " news_body_error.txt"
    # Explicit UTF-8 so Chinese text is written the same way on every platform.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(text + '\n')
# Find the links of every page to crawl
def get_title_links_from_MongoDB(label, type):
    """Fetch the stored links for one label/type pair as a DataFrame.

    Queries the ``news_tmp`` collection of the module-level ``db`` for
    documents whose ``label`` and ``type`` fields equal the arguments,
    projecting only the ``url`` and ``_id`` fields.

    Args:
        label: Value matched against the ``label`` field.
        type: Value matched against the ``type`` field. The name shadows
            the builtin but is kept so keyword callers keep working.

    Returns:
        pandas.DataFrame: Columns ``['url', '_id']``, one row per matching
        document; empty (with the same columns) when nothing matches.
    """
    cursor = db.news_tmp.find({'label': label, 'type': type}, {'url': 1, '_id': 1})
    # Materialise the cursor directly instead of appending item by item.
    return pd.DataFrame(list(cursor), columns=['url', '_id'])
主程序
#加载引用模块
import urllib
import urllib.request
import requests
import datetime
from bs4 import BeautifulSoup
import all_config_file
from all_config_file import error_text_title
from all_config_file import error_text_body
from all_config_file import get_title_links_from_MongoDB
# URL of the cqcoal news-list action; presumably the Ajax endpoint the
# crawler below requests (its use is not visible in this excerpt — confirm).
cqcoal = "http://news.cqcoal.com/manage/newsaction.do?method:webListPageNewsArchivesByTypeid"
print("加载目标网址完毕......")
# Database object returned by all_config_file.mongodb_client().
db = all_config_file.mongodb_client()
# Date string returned by all_config_file.time_today(), computed once at start-up.
time_today = all_config_file.time_today()
def cqcoal_title_start(num):
def start_type(url, label, typeid, pagenum, type):
try:
page_num = 1
while page_num