Scraping web page data with Python (a short practice script for fetching page HTML)
I've been learning Python recently and wrote a short piece of code as practice.
1. Fetching the web page data (HTML)
You need two inputs: the target URL and the request-header information (the original post showed these in a screenshot).
First, collect a few common User-Agent strings:
headers = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
]
We then fetch the page through urllib2, attaching a randomly chosen User-Agent to each request:
'''
Fetch the HTML page
'''
def get_html(url, headers):
    req = urllib2.Request(url)
    header = random.choice(headers)   # pick one User-Agent at random
    req.add_header('User-Agent', header)
    req.add_header('Sec-Fetch-User', '?1')
    req.add_header('Sec-Fetch-Site', 'none')
    req.add_header('Sec-Fetch-Mode', 'navigate')
    html = urllib2.urlopen(req).read()
    return html
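A quick usage sketch (the URL here is just a placeholder; any reachable page works):

html = get_html('http://www.example.com', headers)
print len(html)   # number of bytes fetched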
2. Turning the fetched HTML into usable data (with XPath)
You can install an XPath helper plugin for Chrome to build and test expressions against the page. An expression such as soup.xpath('//div[@class="witkey-item-top"]') selects the elements to extract; substitute whatever class your target data uses.
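To see the idea in isolation, here is a minimal, self-contained sketch; the HTML snippet and its contents are made up for illustration:

from lxml import etree

sample = '<div class="witkey-item-top"><a href="/p/1">Demo title</a></div>'
doc = etree.HTML(sample)
for div in doc.xpath('//div[@class="witkey-item-top"]'):
    print div.xpath('./a/text()')[0]   # -> Demo title
    print div.xpath('./a/@href')[0]    # -> /p/1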
'''
Extract data from the HTML page
'''
def get_page_data(html):
    soup = etree.HTML(html)
    div_list = soup.xpath('//div[@class="your-class-name"]')
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'a') as f:
        for div in div_list:
            title = div.xpath('.//div[@class="your-class-name"]/text()')[0]
            f.write('{}\n'.format(title))
3. Creating the CSV file
The CSV file must be created first (with its header row); the scraped data is then appended to it.
'''
Create the CSV
'''
def creat_csv():
    csv_headers = ['Title']
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'wb') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(csv_headers)
        # f_csv.writerows() would write many rows in one call
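Putting the pieces together, the intended flow is create-then-append (the URL is again a placeholder):

creat_csv()                                          # truncate and write the header row
html = get_html('http://www.example.com', headers)
get_page_data(html)                                  # append one line per matched div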
4. Don't forget the module imports
import urllib2
import random
import csv
import sys
reload(sys)
sys.setdefaultencoding("utf-8")   # Python 2-only hack: default string encoding to UTF-8
from lxml import etree
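This block is Python 2 throughout. For comparison, a hypothetical Python 3 equivalent of the same imports (a sketch only; the rest of the script would need further porting):

# Hypothetical Python 3 equivalents of the imports above
import urllib.request, urllib.parse   # together these replace urllib2
import random
import csv
from lxml import etree
# reload(sys) / sys.setdefaultencoding() are unnecessary in Python 3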
5. The complete code in one place; swap the placeholders (paths, class names, URL) for your own:
# -*- coding:utf-8 -*-
import urllib2
import random
import csv
import sys
reload(sys)
sys.setdefaultencoding("utf-8")   # Python 2-only hack: default string encoding to UTF-8
from lxml import etree

headers = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
]

'''
Fetch the HTML page
'''
def get_html(url, headers):
    req = urllib2.Request(url)
    header = random.choice(headers)   # pick one User-Agent at random
    req.add_header('User-Agent', header)
    req.add_header('Sec-Fetch-User', '?1')
    req.add_header('Sec-Fetch-Site', 'none')
    req.add_header('Sec-Fetch-Mode', 'navigate')
    html = urllib2.urlopen(req).read()
    return html

'''
Create the CSV
'''
def creat_csv():
    csv_headers = ['Name', 'URL']
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'wb') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(csv_headers)
        # f_csv.writerows() would write many rows in one call

'''
Extract data from the HTML page
'''
def get_page_data(html):
    soup = etree.HTML(html)
    div_list = soup.xpath('//div[@class="your-class-name"]')
    with open(r'C:\Users\Administrator\Desktop\onelinux\os\ces.csv', 'a') as f:
        for div in div_list:
            title = div.xpath('.//div[@class="your-class-name"]/a/text()')[0]
            link = div.xpath('.//div[@class="your-class-name"]/a/@href')[0]   # note: @herf was a typo for @href
            f.write('{},{}\n'.format(title, link))

'''
Main function
'''
def main():
    num = raw_input('How many pages do you want to scrape? ')
    keyword = raw_input('Which keyword do you want to scrape? ')
    keyword = urllib2.quote(keyword)
    for i in range(int(num)):
        page = (i - 1) * 5 + i * 65   # site-specific page offset
        if page < 0:
            page = 0
        url = 'your-url?page={}&key={}'.format(page, keyword)
        html = get_html(url, headers)
        get_page_data(html)

creat_csv()   # create the CSV with its header row first
main()        # then run the scraper
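One optional hardening idea, sketched here rather than part of the script above: give urlopen a timeout and catch network errors so a single bad page doesn't abort the whole run. get_html_safe is a hypothetical variant:

def get_html_safe(url, headers):
    # like get_html, but with a timeout and error handling
    req = urllib2.Request(url)
    req.add_header('User-Agent', random.choice(headers))
    try:
        return urllib2.urlopen(req, timeout=10).read()
    except urllib2.URLError as e:
        print 'request failed:', e
        return ''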
6. If you have a better method or cleaner logic, feel free to leave a message and share it.