Scraping web data with Python (a small practice script to fetch web page data (HTML))

优采云 · Published: 2021-09-22 13:27


  I have been learning Python recently and wrote a small piece of code as practice.

  1. Fetching the web page data (HTML)

  Note the target URL, the page title, and the request header information (see figure).

  Collect a few commonly used User-Agent strings:

  headers = [

'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',

'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',

'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',

'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',

'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',

]

  

  We usually fetch the HTML like this:

'''
Fetch the HTML of a page
'''
def get_html(url, headers):
    req = urllib2.Request(url)
    # pick a random User-Agent from the pool so requests look less uniform
    header = random.choice(headers)
    req.add_header('User-Agent', header)
    req.add_header('Sec-Fetch-User', '?1')
    req.add_header('Sec-Fetch-Site', 'none')
    req.add_header('Sec-Fetch-Mode', 'navigate')
    html = urllib2.urlopen(req).read()
    return html
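  As a quick sanity check, a call might look like this (the URL below is just a placeholder, not from the original post):

# Hypothetical usage: fetch a page with a randomly chosen User-Agent.
html = get_html('http://example.com', headers)
print(len(html))  # how many bytes were downloaded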

  2. Turning the fetched HTML into usable data (with XPath)

  You can install an XPath plugin for Chrome to help work out the XPath expressions you need.

  soup.xpath('//div[@class="witkey-item-top"]') is the kind of expression that selects the data you want to extract; adjust it to your own page.
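  As a small illustration (the HTML snippet below is made up; only the witkey-item-top class comes from the example above), you can test an XPath expression directly against a string with lxml:

from lxml import etree

# A made-up snippet, just to show what the XPath expression selects.
sample = '<div class="witkey-item-top"><a href="/job/1">Job title</a></div>'
tree = etree.HTML(sample)
print(tree.xpath('//div[@class="witkey-item-top"]/a/text()'))  # ['Job title']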

'''
Extract data from the HTML page
'''
def get_page_data(html):
    soup = etree.HTML(html)
    # replace "class" with the real class attribute of the elements you want
    div_list = soup.xpath('//div[@class="class"]')
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'a') as f:
        for div in div_list:
            title = div.xpath('.//div[@class="class"]/text()')[0]
            f.write('{}\n'.format(title))


  3. Creating the CSV file: the CSV file must be created first, and only then is data written into it.

'''
Create the CSV file
'''
def creat_csv():
    csv_headers = ['Title']
    # 'wb' works with the csv module on Python 2; see the Python 3 note below
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'wb') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(csv_headers)
        # f_csv.writerows()
    # the with block closes the file, so the original f.close() is not needed
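  A note on the 'wb' mode: it only works with the csv module on Python 2. On Python 3 the rough equivalent (same assumed file path) opens in text mode with newline='':

import csv

def creat_csv_py3():
    # Python 3 sketch: text mode with newline='' so csv adds no blank rows on Windows.
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(['Title'])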

  4. Remember to import the required modules

import urllib2
import random
import csv
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 only: force the default encoding to UTF-8
from lxml import etree
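  For reference, the Python 3 equivalents of these Python 2 imports would roughly be (a sketch, not part of the original script):

import random
import csv
from urllib.request import Request, urlopen  # replaces urllib2
from urllib.parse import quote                # replaces urllib2.quote
from lxml import etree
# reload(sys) / sys.setdefaultencoding() are neither needed nor available on Python 3.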

  5. Here is the complete code in one piece; replace the placeholders with your own URL and class names.

# -*- coding:utf-8 -*-
import urllib2
import random
import csv
import sys
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2 only
from lxml import etree

# pool of common User-Agent strings; one is chosen at random per request
headers = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
]

'''
Fetch the HTML of a page
'''
def get_html(url, headers):
    req = urllib2.Request(url)
    header = random.choice(headers)
    req.add_header('User-Agent', header)
    req.add_header('Sec-Fetch-User', '?1')
    req.add_header('Sec-Fetch-Site', 'none')
    req.add_header('Sec-Fetch-Mode', 'navigate')
    html = urllib2.urlopen(req).read()
    return html

'''
Create the CSV file
'''
def creat_csv():
    csv_headers = ['Name', 'URL']
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'wb') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(csv_headers)
        # f_csv.writerows()

'''
Extract data from the HTML page
'''
def get_page_data(html):
    soup = etree.HTML(html)
    # replace "your-class-name" with the class attribute of the elements you want
    div_list = soup.xpath('//div[@class="your-class-name"]')
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'a') as f:
        for div in div_list:
            title = div.xpath('.//div[@class="your-class-name"]/a/text()')[0]
            link = div.xpath('.//div[@class="your-class-name"]/a/@href')[0]
            f.write('{},{}\n'.format(title, link))

'''
Main function
'''
def main():
    num = input('How many pages do you want to crawl? ')
    keyword = raw_input('Which keyword do you want to crawl? ')
    keyword = urllib2.quote(keyword)
    for i in range(int(num)):
        # page offset formula used by the target site
        page = (i - 1) * 5 + i * 65
        if page < 0:
            page = 0
        url = 'your-site-url?page={}&key={}'.format(page, keyword)
        html = get_html(url, headers)
        get_page_data(html)

creat_csv()  # create the csv file first
main()       # then run the main function
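  The page offset formula above looks specific to the site being crawled; a quick check shows the offsets it produces for the first few pages:

# Offsets generated by the loop in main(), assuming the formula is the
# target site's own pagination scheme.
for i in range(3):
    page = (i - 1) * 5 + i * 65
    print(max(page, 0))  # 0, 65, 135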

  6. If you have a better approach or cleaner logic, feel free to leave a message and share it.
