c#抓取网页数据(Java开发教程:匹配时为点任意匹配模式,点 )

优采云 发布时间: 2022-04-04 00:18

  c#抓取网页数据(Java开发教程:匹配时为点任意匹配模式,点)

  1、确定URL并抓取页面代码

  import urllib

import urllib.error
import urllib.request

# Step 1: build the URL of the page to download and fetch its raw HTML.
# Ported to Python 3: urllib2 was split into urllib.request / urllib.error,
# `print` is a function, and `except X, e` became `except X as e`.
page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)

try:
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    # read() returns bytes in Python 3; decode as UTF-8, the encoding the
    # later steps of this tutorial use for the same page.
    print(response.read().decode('utf-8'))
except urllib.error.URLError as e:
    # HTTPError (a URLError subclass) carries an HTTP status in `code`;
    # plain URLError only has `reason`, hence the hasattr checks.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)

  2、添加请求头(headers)并抓取页面代码

  try:

#定义请求头

headrs={"User-Agent":" Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}

#定义请求,传入请求头

req=request.Request(url,headers=headrs)

#打开网页

resp=request.urlopen(req)

#打印响应码,解码

# print(resp.read().decode('utf-8'))

  3、提取页面的所有段子

  content = response.read().decode('utf-8')

pattern = re.compile('(.*?).*?(.*?).*?(.*?)',re.S)

items = re.findall(pattern,content)

for item in items:

haveImg = re.search("img",item[3])

if not haveImg:

print item[0],item[1],item[2],item[4]

except urllib2.URLError, e:

if hasattr(e,"code"):

print e.code

if hasattr(e,"reason"):

print e.reason

  4、面向对象的模式

from urllib import error, request

import re


class tieba:
    """Downloader for Baidu Tieba listing pages.

    Attributes:
        url: Base listing URL; ``getPage`` appends the page number to the
            trailing ``pn=`` query parameter.
        headrs: HTTP request headers sent with every request.
        stories: List intended to hold the parsed results.
    """

    def __init__(self):
        """Set up the base URL, the request headers and the result list."""
        # Base URL; the page number is appended by getPage.
        self.url = "https://tieba.baidu.com/f?kw=%E6%AE%B5%E5%AD%90&ie=utf-8&pn="
        # Request headers (browser-like User-Agent).
        self.headrs = {"User-Agent": " Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
        # Storage for the parsed results.
        self.stories = []

    def getPage(self, page_number):
        """Download one page and return its body decoded as UTF-8.

        Args:
            page_number: Value appended (via ``str``) to ``self.url``.

        Returns:
            The decoded page text, or ``None`` when the request fails with
            a ``URLError`` (whose code/reason are printed).
        """
        try:
            # Build the request with the configured headers.
            req = request.Request(self.url + str(page_number), headers=self.headrs)
            # Open the page; the context manager closes the response even
            # if read()/decode() raises.
            with request.urlopen(req) as resp:
                return resp.read().decode("utf-8")
        except error.URLError as e:
            # HTTPError carries a status code; plain URLError only a reason.
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
            return None

#解析页面

def rexgPage(self,content):

# 定义正则表达式

#

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线