Scraping web data with Python (web scraping and page parsing, with code)

优采云 · Published 2022-02-03 22:21


  Recently, a teacher in our group needed hydrological data for a few gauging stations for her research. I had just started learning Python and found that it has many convenient libraries for web scraping and page parsing, so I wrote a script. It barely works, haha.

  The implementation makes simple use of thread-safe queues shared between threads, a small pool of worker threads, and HTML parsing (a minimal sketch of this queue pattern follows the references). Main references:

  1.

  2.

  3.
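
  Before the full listing, here is a minimal, self-contained sketch of the two-queue producer/consumer pattern the script is built on. This is a toy example of mine (Python 2, like the post), not code from the post; the fetch and parse steps are stand-ins:

# Toy sketch: two queues chain a fetch thread to a parse thread;
# join() waits for both stages to drain.
import Queue
import threading

work_q = Queue.Queue()    # items waiting to be fetched
result_q = Queue.Queue()  # (item, page) pairs waiting to be parsed

def fetcher():
    while True:
        item = work_q.get()
        result_q.put((item, "page for %s" % item))  # stand-in for a real fetch
        work_q.task_done()   # without this, work_q.join() blocks forever

def parser():
    while True:
        item, page = result_q.get()
        print item, page     # stand-in for real parsing
        result_q.task_done()

for func in (fetcher, parser):
    t = threading.Thread(target=func)
    t.setDaemon(True)
    t.start()

for i in range(3):
    work_q.put(i)

work_q.join()    # all items fetched...
result_q.join()  # ...and all pages parsed

  The detail that matters is task_done(): Queue.join() only returns once every item that was put() has been matched by a task_done() call.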

  The code is below, with brief explanations in the comments.

  Code:

# -*- coding: utf-8 -*-
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
from datetime import date
from dateutil.relativedelta import relativedelta
import Queue
import threading

queue = Queue.Queue()
out_queue = Queue.Queue()

# Date iterator: yields every day from start (inclusive) to end (exclusive).
def loopDay(start, end):
    while start < end:
        yield start
        start = start + relativedelta(days=1)

# POST a form to the target site (the Hunan hydrology query system)
# and return the response body.
def post(url, data):
    req = urllib2.Request(url)
    data = urllib.urlencode(data)
    # enable cookies
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    response = opener.open(req, data)
    return response.read()

# Request the page holding the 08:00 readings for a given day.
def getHtml(time):
    posturl = 'http://61.187.56.156/wap/hnsq_BB2.asp'
    data = {}
    data['nian'] = str(time.year)   # year
    data['yue'] = str(time.month)   # month
    data['ri'] = str(time.day)      # day
    data['shi'] = '08:00'           # time of the reading
    return post(posturl, data)

# Find the position of a keyword (station name) in the list of <td> tags;
# returns -1 if the station does not appear on the page.
def findKeyword(keyword, tagList):
    for location, tag in enumerate(tagList):
        if keyword in unicode(tag.string):
            return location
    return -1

# Fetches the page for each day taken from the input queue.
class ThreadUrl(threading.Thread):
    """Threaded URL grab"""
    def __init__(self, queue, out_queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        while True:
            oneDay = self.queue.get()
            print oneDay
            # grab the web page and hand it to the parsing queue
            webpage = getHtml(oneDay)
            self.out_queue.put((oneDay, webpage))
            self.queue.task_done()  # needed so queue.join() can return

# Extracts the data we need from each fetched page.
class DatamineThread(threading.Thread):
    """Threaded page parsing"""
    def __init__(self, out_queue):
        threading.Thread.__init__(self)
        self.out_queue = out_queue

    def run(self):
        while True:
            (oneDay, page) = self.out_queue.get()
            # parse the page
            soup = BeautifulSoup(page, fromEncoding="gb2312")
            tagList = soup.findAll('td')
            locations = [findKeyword(keyword, tagList) for keyword in keywords]
            for i in range(len(keywords)):
                if locations[i] < 0:
                    continue  # station not found on this page
                fileName = keywords[i] + u".txt"
                f = open(fileName, "a+")
                # water level sits 2 cells after the station name, storage 5 cells after
                waterLevel = unicode(tagList[locations[i] + 2].string).strip().encode('utf-8')
                waterStorage = unicode(tagList[locations[i] + 5].string).strip().encode('utf-8')
                f.write(str(oneDay) + "\t" + waterLevel + "\t" + waterStorage + "\n")
                f.close()
            self.out_queue.task_done()

if __name__ == '__main__':
    keywords = [u"寸滩", u"万县", u"巫山", u"清溪场", u"忠县", u"武隆"]
    startDay = date(2006, 1, 1)
    endDay = date(2007, 1, 1)
    days = loopDay(startDay, endDay)

    t = ThreadUrl(queue, out_queue)
    t.setDaemon(True)
    t.start()

    for oneDay in days:
        queue.put(oneDay)

    dt = DatamineThread(out_queue)
    dt.setDaemon(True)
    dt.start()

    queue.join()
    out_queue.join()
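
  Note that the script is Python 2: urllib2, Queue, and the old BeautifulSoup module do not exist under Python 3. For anyone trying to run it today, here is a rough Python 3 sketch of just the fetch step, using only the standard library. The URL and the nian/yue/ri/shi form fields come from the script above; the function names and the gbk decoding choice are my assumptions, and the server may no longer respond:

# Python 3 sketch of the fetch step (stdlib only). The URL and the
# nian/yue/ri/shi form fields come from the original script; the
# names and the 'gbk' decode are assumptions.
import urllib.parse
import urllib.request
from datetime import date, timedelta

def loop_day(start, end):
    # same day-by-day iterator, with timedelta replacing relativedelta
    while start < end:
        yield start
        start += timedelta(days=1)

def get_html(day):
    post_url = 'http://61.187.56.156/wap/hnsq_BB2.asp'
    form = {
        'nian': str(day.year),   # year
        'yue': str(day.month),   # month
        'ri': str(day.day),      # day
        'shi': '08:00',          # time of the reading
    }
    data = urllib.parse.urlencode(form).encode('ascii')
    with urllib.request.urlopen(post_url, data) as resp:
        # the site served GB2312/GBK pages, so decode with 'gbk'
        return resp.read().decode('gbk', errors='replace')

if __name__ == '__main__':
    for day in loop_day(date(2006, 1, 1), date(2006, 1, 3)):
        print(day, len(get_html(day)))  # page size as a sanity check

  The parsing side would work the same way with bs4 or html.parser, and the two-queue structure carries over unchanged.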

  I registered ages ago and this is my first post. My connection is poor, so I will tidy this up later.
