python抓取网页数据(1.2.3.3.上代码,网页抓取和网页分析。)
优采云 发布时间: 2022-02-03 22:21python抓取网页数据(1.2.3.3.上代码,网页抓取和网页分析。)
最近,组里的一位老师在做研究,需要一些站点的水文资料。最近刚学python,了解到python中有很多方便的库,用于网页抓取和网页分析,于是开始写一个。勉强能用,哈哈。
在实现上,简单使用了线程中队列的使用、线程池的使用、网页分析。主要参考:
1. （原文中的参考链接在转载时丢失）
2. （原文中的参考链接在转载时丢失）
3. （原文中的参考链接在转载时丢失）
关于代码,后面是简要说明
代码块
# -*- coding: utf-8 -*-
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
from datetime import *
from dateutil.relativedelta import *
import Queue
import threading
# Shared work queues for the fetch/parse pipeline:
#   queue     — dates waiting to be fetched (filled by main, drained by ThreadUrl)
#   out_queue — (date, html) pairs from the fetcher, drained by DatamineThread
queue = Queue.Queue()
out_queue = Queue.Queue()
# Date iterator: yields every day from start up to (but not including) end.
def loopDay(start, end):
    """Yield each ``date`` in the half-open range [start, end).

    Uses the stdlib ``datetime.timedelta`` instead of
    ``dateutil.relativedelta``; stepping a ``date`` by one day is
    identical with either, and this drops the third-party dependency
    for this helper.  (``timedelta`` is already in scope via the
    file's ``from datetime import *``.)
    """
    step = timedelta(days=1)
    while start < end:
        yield start
        start = start + step
# POST a form to the given URL (the Hunan hydrology query system) and
# return the raw response body as a byte string.
def post(url, data):
    request = urllib2.Request(url)
    encoded_form = urllib.urlencode(data)
    # enable cookie — the site tracks the query session via cookies,
    # so build an opener with a cookie processor attached
    cookie_opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    reply = cookie_opener.open(request, encoded_form)
    return reply.read()
# Fetch the station report page for one day at the fixed 08:00
# observation time, returning the raw HTML.
def getHtml(time):
    posturl = 'http://61.187.56.156/wap/hnsq_BB2.asp'
    form = {}
    form['nian'] = str(time.year)   # year
    form['yue'] = str(time.month)   # month
    form['ri'] = str(time.day)      # day
    form['shi'] = '08:00'           # observation hour (fixed)
    return post(posturl, form)
# Find the position of the wanted keyword within the list of Tags.
def findKeyword(keyword, tagList):
    """Return the index in ``tagList`` of the first tag whose text
    contains ``keyword``.

    NOTE(review): the match encodes the keyword as GBK and the tag text
    as windows-1252 — a Python-2 byte-level trick that only works
    because the page's GBK bytes survive a windows-1252 round trip;
    confirm before porting to Python 3.
    NOTE(review): if the keyword is never found, this returns
    ``len(tagList) - 1`` (the last index), not -1; callers then index
    relative to a wrong cell.
    """
    location = -1
    for tag in tagList:
        location += 1
        if (keyword.encode('GBK') in unicode(tag.string).encode("windows-1252")):
            break
    return location
#用于抓取某一天的对应的页面
class ThreadUrl(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, queue, out_queue):
threading.Thread.__init__(self)
self.queue = queue
self.out_queue = out_queue
def run(self):
while True:
oneDay = self.queue.get()
print oneDay
#grabs webpage
webpage = getHtml(oneDay)
#place page into out queue
self.out_queue.put((oneDay, webpage))
# Worker thread: parses each fetched page and appends the water level /
# water storage readings for each station keyword to per-station files.
class DatamineThread(threading.Thread):
    """Threaded Url Parsing"""
    def __init__(self, out_queue):
        threading.Thread.__init__(self)
        self.out_queue = out_queue  # in: (date, html) pairs from ThreadUrl
    def run(self):
        while True:
            (oneDay, page) = self.out_queue.get()
            # parse the page (the site serves gb2312-encoded HTML)
            soup = BeautifulSoup(page, fromEncoding = "gb2312")
            tagList = soup.findAll('td')
            # locate each station keyword among the <td> cells
            locations = []
            for keyword in keywords:
                locations.append(findKeyword(keyword, tagList))
            # NOTE(review): only the first 5 of the 6 keywords are ever
            # written; kept as-is in case the last station is skipped on
            # purpose — confirm.
            for i in range(5):
                fileName = keywords[i] + ".txt"
                # offsets +2 / +5 from the keyword cell, per the site's
                # result-table layout — TODO confirm against a live page
                waterLevel = unicode(tagList[locations[i] + 2].string).strip().encode("windows-1252")
                waterStorage = unicode(tagList[locations[i] + 5].string).strip().encode("windows-1252")
                # open() instead of the py2-only file() builtin; the with
                # block guarantees the handle closes even if a write fails
                with open(fileName, "a+") as f:
                    f.write(str(oneDay) +"\t" + waterLevel + "\t" + waterStorage + "\n")
            # BUGFIX: acknowledge the consumed item — without task_done()
            # the main block's out_queue.join() never returns.
            self.out_queue.task_done()
if __name__ == '__main__':
    # Station-name keywords to locate in each result page (one output
    # file per station).
    keywords = [u"寸滩", u"万县", u"巫山", u"清溪场", u"忠县", u"武隆"]
    # Scrape the whole year 2006: [2006-01-01, 2007-01-01).
    startDay = date(2006, 1, 1)
    endDay = date(2007, 1, 1)
    days = loopDay(startDay, endDay)
    # Single fetcher thread; daemonized so it dies with the main thread.
    t = ThreadUrl(queue, out_queue)
    t.setDaemon(True)
    t.start()
    # Enqueue every day for the fetcher.
    for oneDay in days:
        queue.put(oneDay)
    # Single parser thread, also daemonized.
    dt = DatamineThread(out_queue)
    dt.setDaemon(True)
    dt.start()
    # NOTE(review): join() only returns once a task_done() has been
    # called for every item taken from the queue — verify both worker
    # threads do so, otherwise these joins block forever.
    queue.join()
    out_queue.join()
注册好久了，第一次发文。写得比较仓促，细节稍后再补充修改。