python抓取动态网页(python关于python编写爬虫的一些东西是怎样的？(图))

优采云发布时间: 2022-03-07 23:25

　　前段时间浏览知乎的时候，发现一篇关于用python写爬虫的帖子。这是帖子的链接

　　于是想到了用python来尝试爬取一些东西。本来打算根据关键词爬取百度图片并下载，但过程中遇到障碍，暂时停止。然后去看了内涵段子的页面结构，发现还是比较简单的，于是在下面实现了一个爬虫。

　　写这个程序的时候参考了博主 pleasecallmewhy 的相关博文：blog.csdn.net/pleasecallmewhy/article/details/8929576

　　编写这个程序主要分为以下几个步骤：

　　1.分析内涵社区（内涵段子，neihanshequ.com）的页面结构

　　2.使用正则表达式查找要下载的url

　　3.下载这些图片

　　分析页面结构是第一步，也是比较关键的一步。如果页面分析不正确，则后续步骤将无法进行。

　　1.打开内涵段子的囧图页面

　　我们将看到以下页面

　　这个页面下面有一些我们想要的搞笑图片，但是我们首先需要得到这个页面的html文件，这里我使用python的urllib库，代码如下

def get_html(url):
    """Fetch *url* with browser-like request headers and return the body.

    The headers (including the Cookie) were copied from a real Firefox
    session; according to the article the site does not return the HTML
    when the Cookie is missing.

    Args:
        url: Address of the page to download.

    Returns:
        The raw response body, or None when the request failed with an
        urllib2.HTTPError or urllib2.URLError (the reason is printed).
    """
    print("---------------now get html from url :" + url + "----------")
    send_headers = {
        'Host': 'neihanshequ.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:37.0) Gecko/20100101 Firefox/37.0',
        'Cookie': "pksrqup=1; csrftoken=237f4451075fe45cef3a4f5449f70658; tt_webid=3379513254; uuid=\"w:33266c46f0cc4fa6944c073b1b1bccea\"",
        'Connection': 'keep-alive',
    }
    req = urllib2.Request(url, headers=send_headers)
    try:
        response = urllib2.urlopen(req, timeout=100)
        try:
            return response.read()
        finally:
            # Release the connection even if reading the body fails.
            response.close()
    # HTTPError is a subclass of URLError, so it has to be handled first.
    except urllib2.HTTPError as e:
        print('The server couldn\'t fulfill the request.')
        print('Error code:  %s' % (e.code,))
    except urllib2.URLError as e:
        print('We failed to reach a server.')
        print('Reason:  %s' % (e.reason,))
    # The former "else" branch could never run because the try body always
    # returns; failures fall through to an explicit None instead.
    return None

　　这里需要用 urllib2 模拟浏览器发送的请求头：用 Firefox 的 Firebug 插件可以看到浏览器实际发送的头部信息，把它复制填入上面的 send_headers 即可。其中的 Cookie 必须添加，如果不添加，将无法获取 html 文件。urllib2 的使用具体介绍见上面博主的博客，说的很清楚。

　　既然得到了html文件，我们来观察这个文件。这个html文件的结构比较清晰。

　　每个帖子由一个 div 组成，然后是标题、图像和评论的另一个 div

　　在class = content-wrapper的div中我们找到了这句话

　　这个data-text就是图片的说明文字，data-pic就是图片的地址，所以我们的工作就是获取所有的data-pic和data-text（后者可以作为图片的文件名)

　　要解析这个 html 中的所有这两个字段，我们需要使用 python 的正则表达式。我们这里使用的非常简单。我是通过模仿得到的。具体的re教程也可以从上面的博主那里获得。

　　下面是我的 re（正则表达式）解析代码

　　这样我就可以根据刚才得到的html文件解析出所有图片的地址，然后就可以在下面下载了。下载使用与urllib相关的函数。

　　-----------------到此结束，就可以下载几十张图片了

　　为什么只有几十张图片？

　　原因是我们刚刚获取的只是首页的html文件，那么如何获取更多的html文件呢？

　　我们注意到页面底部有一个“加载更多”(Load More)按钮，点击它可以获取更多图片。

　　我们同样使用 Firebug 来抓包。

　　打开这个 Get 请求和结果

　　请求：

　　响应：我们在浏览器中输入这个请求地址，得到一个json响应

　　逐步展开json得到

　　在 large_image 下面就有我们需要的图片地址。

　　仔细观察得到的json响应，你会发现有一个min_time字段，是一个unix时间戳。而这个 min_time 正是下一个请求的 max_time

　　这个循环可以得到所有的图片！！

　　在第一次获取的html文件中，也可以找到一个 max_time 字段，作为第一次请求的参数

　　那么我们的任务基本上就是不断的解析json文件并下载

　　下面是我的第一个版本的源代码

　　# -*- coding: utf-8 -*-

import urllib2

import urllib

import re

import thread

import time

import os

import random

import json

#内涵段子抓取类

class neiHanSpider:
    """Crawler that downloads the pictures listed on neihanshequ.com/pic/.

    The first page is plain HTML: picture addresses are taken from its
    ``data-pic`` attributes and the paging cursor from its ``max_time``
    value.  Further pages are requested as JSON from ``base_url`` followed
    by the cursor, until the server stops returning a usable cursor.

    Methods whose names start with two underscores are private
    (Python name-mangles them).
    """

    # Characters that are not allowed in Windows file names, plus line breaks.
    _UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')
    # Upper bound for a file name built from a picture caption.
    _MAX_NAME_LENGTH = 80

    def __init__(self, save_dir="C:\\Users\\Administrator\\Desktop\\python\\"):
        """Create a spider.

        Args:
            save_dir: Directory the pictures are written to.  Defaults to
                the path that used to be hard-coded in the download method.
        """
        self.primer_url = 'http://neihanshequ.com/pic/'
        # URL requested after clicking "load more"; the cursor is appended.
        self.base_url = 'http://neihanshequ.com/pic/?is_json=1&max_time='
        self.save_dir = save_dir

    def Start(self):
        """Download the first page's pictures, then follow the JSON pages."""
        # Fetch the first page and read its data-pic values and max_time.
        primer_html = self.__getHtml(self.primer_url)
        if not primer_html:
            print("failed to download the first page, nothing to do")
            return
        data_pic = self.__getDataPic(primer_html)
        max_time = self.__getMaxTime(primer_html)
        # The first page carries no captions, so random file names are used.
        self.__downloadPic(data_pic)
        count = 0
        # Pictures that the page loads through the "load more" button.
        while max_time:
            count = count + 1
            print("=--------------------THIS　IS THE " + str(count) + " Json Data Time : " + str(max_time) + "--------------------")
            url = self.base_url + str(max_time)
            json_data = self.__getHtml(url)
            json_ret = self.__parseJson(json_data)
            if json_ret is None:
                # Download or parsing failed; there is no cursor to go on with.
                break
            next_time = json_ret['max_time']
            print(next_time)
            self.__downloadPic(json_ret['image_url'], json_ret['image_content'])
            if str(next_time) == str(max_time):
                # Same cursor again: requesting it once more would only
                # return the same page forever.
                print("max_time did not change, stop")
                break
            max_time = next_time

    def __parseJson(self, json_data):
        """Extract captions, picture URLs and the next cursor from a JSON page.

        Args:
            json_data: JSON text as returned by the server (may be None when
                the download failed).

        Returns:
            A dict with the keys 'image_content', 'image_url' (two lists of
            equal length) and 'max_time', or None when the text is not JSON
            or lacks the expected 'data' structure.  Items without a caption
            or a large image are skipped.
        """
        print("------This is parse_json --------")
        try:
            payload = json.loads(json_data)['data']
            max_time = payload['max_time']
            items = payload['data']
        except (ValueError, TypeError, KeyError):
            print("json_parse error")
            return None
        image_content = []
        image_url = []
        for item in items or []:
            try:
                group = item['group']
                content = group['content']
                url = group['large_image']['url_list'][0]['url']
            except (KeyError, IndexError, TypeError):
                # Skip the malformed entry but keep both lists aligned.
                continue
            image_content.append(content)
            image_url.append(url)
        return {
            'image_content': image_content,
            'image_url': image_url,
            'max_time': max_time,
        }

    def __safeFileName(self, name):
        """Return *name* reduced to something usable as a file name ('' if empty)."""
        if not name:
            return ""
        cleaned = self._UNSAFE_NAME_CHARS.sub("_", name).strip(" .")
        return cleaned[:self._MAX_NAME_LENGTH]

    def __downloadPic(self, imageAddressList, contentList=None):
        """Download every picture in *imageAddressList* into ``save_dir``.

        Args:
            imageAddressList: Picture URLs.
            contentList: Optional captions, parallel to the URLs; a caption
                becomes the file name.  A random number is used when the
                caption is missing or empty.

        A failed download is reported and does not stop the remaining ones.
        """
        print("---download------")
        if contentList is None:
            contentList = []
        if not os.path.isdir(self.save_dir):
            try:
                os.makedirs(self.save_dir)
            except OSError as e:
                print("cannot create download directory: %s" % (e,))
                return
        for count, image in enumerate(imageAddressList, 1):
            print(image)
            caption = contentList[count - 1] if count <= len(contentList) else ""
            tail = self.__safeFileName(caption) or str(random.randint(0, 30000000))
            fullPath = os.path.join(self.save_dir, tail + ".jpg")
            try:
                urllib.urlretrieve(image, fullPath)
            except (IOError, OSError, UnicodeError):
                failedMsg = "第" + str(count) + "张下载失败，URL： " + str(image) + ""
                print(failedMsg)

    def __getDataPic(self, html):
        """Return the values of all data-pic attributes found in *html*."""
        re_str = r'data-pic="([^"]*)"'
        return self.__getDataByRe(html, re_str)

    def __getMaxTime(self, html):
        """Return the first max_time value found in *html* as a string.

        An empty string is returned when the page contains none, so the
        result can be tested for truth and appended to base_url directly.
        """
        re_str = r'max_time: \'([\d]*)\''
        found = self.__getDataByRe(html, re_str)
        return found[0] if found else ""

    def __getDataByRe(self, text, re_str):
        """Return all matches of the pattern *re_str* in *text*."""
        return re.compile(re_str).findall(text)

    def __getHtml(self, url):
        """Download *url*, trying up to 5 times; return None if all fail."""
        print("GET　HTML********")
        for attempt in range(1, 6):
            print(str(attempt) + " times ,try download html")
            html = self.__getDataByUrl(url)
            if html:
                return html
        return None

    def __getDataByUrl(self, url):
        """Perform one GET request with browser-like headers.

        Returns:
            The response body, or None when urllib2 raised an HTTPError or
            URLError (the reason is printed).
        """
        print("---------------now get html from url :" + url + "----------")
        send_headers = {
            'Host': 'neihanshequ.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:37.0) Gecko/20100101 Firefox/37.0',
            'Cookie': "pksrqup=1; csrftoken=237f4451075fe45cef3a4f5449f70658; tt_webid=3379513254; uuid=\"w:33266c46f0cc4fa6944c073b1b1bccea\"",
            'Connection': 'keep-alive',
        }
        req = urllib2.Request(url, headers=send_headers)
        try:
            response = urllib2.urlopen(req, timeout=100)
            try:
                return response.read()
            finally:
                # Release the connection even if reading the body fails.
                response.close()
        # HTTPError is a subclass of URLError, so it has to be handled first.
        except urllib2.HTTPError as e:
            print('The server couldn\'t fulfill the request.')
            print('Error code:  %s' % (e.code,))
        except urllib2.URLError as e:
            print('We failed to reach a server.')
            print('Reason:  %s' % (e.reason,))
        # The former "else" branch was unreachable because the try body
        # always returns; failures end up here.
        return None

#------------------------------------------Program entry point------------------------------

# Create the crawler and run it; Start() is called as soon as this module is executed.
mySpider = neiHanSpider()

mySpider.Start()

　　之后我又尝试写了一个多线程版本

　　# -*- coding: utf-8 -*-

import urllib2

import urllib

import re

import threading

import time

import os

import random

import json

#内涵段子抓取类

class neiHanSpider :

def __init__(self ):

self.primer_url = 'http://neihanshequ.com/pic/'

#点击加载更多之后请求的url

self.base_url = 'http://neihanshequ.com/pic/?is_json=1&max_time='

def Start(self):

#首先获取第一个页面的html数据，并分析其中的data-pic和max_time

primer_html = self.__getHtml(self.primer_url)

data_pic = self.__getDataPic(primer_html)

max_time = self.__getMaxTime(primer_html)

#download pic

#self.__downloadPic(data_pic)

global downloadUrlList

global downloadTitleList

#downloadList = downloadList + data_pic

count = 0

#下面开始下载点击更多之后的图片

while max_time and count " + str(len(downloadTitleList))

# NOTE(review): this listing lost its indentation, and part of the original
# code is missing just above it. The loop bodies below are therefore inferred
# from the syntax only. myDownLoad and startTime are not defined anywhere in
# the visible text — presumably they belong to the missing part; confirm
# against the original post.

# Lock created for the worker threads; it is not used by the statements visible here.
threadLock = threading.Lock()

# Keeps a reference to every worker that was started.
threads = []

# Number of queued picture URLs, only used in the summary message at the end.
size = len(downloadUrlList)

# range(1, 10) yields 1..9, so nine workers are created and started.
for i in range(1,10) :

thread = myDownLoad(i,"Thread-" + str(i));

thread.start()

threads.append(thread)

# Start value for the first loop test and the first message.
aliveCount = 10

# Poll every 10 seconds until threading.activeCount() reports at most one
# thread (presumably only the main thread is left at that point).
while aliveCount > 1 :

print "Now There is " + str(aliveCount) + "Threads alive"

aliveCount = threading.activeCount()

time.sleep(10)

endTime = time.time()

# Elapsed time is converted from seconds to minutes.
print " Download " + str(size) + "张图，共耗时 " + str((endTime - startTime) / 60) + "min"

print "Exiting Main Thread"

　　代码可能写得不是很整齐，有时间再整理一下。Python 我也是现学现用，欢迎批评指正

0

2022-03-07

python抓取动态网页

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

python抓取动态网页(python关于python编写爬虫的一些东西是怎样的？(图))

0 个评论

发起人