网页数据抓取(自己边看边实践一些简单的实际应用,发现Python真的 )
优采云 发布时间: 2021-11-13 23:03 网页数据抓取(自己边看边实践一些简单的实际应用,发现Python真的很方便)
自己观看并练习一些简单的实际应用。下面的程序是从某个网站中获取需要的数据。
在写的过程中,通过学习学到了一些方法,发现Python真的很方便。
尤其是使用pandas获取网页中的表格数据,真的很方便!!!
程序可能不太好,但基本满足了它的需求。
希望有高手指点~~
Version 04 (Jan 12 2017) [获取表单信息推荐使用此方法]
# Code based on Python 3.x
# _*_ coding: utf-8 _*_
# __Author: "LEMON"
#
# Version 04: scrape every page of the Beijing ETS trading-data listing
# with pandas.read_html and accumulate all tables into one CSV file.

import pandas as pd

url2 = 'http://www.bjets.com.cn/article/jyxx/?'
links = []
for n in range(2, 40):
    # 39 pages in total; the count is hard-coded for now — it could be
    # scraped from the site itself as a later improvement.
    link = url2 + str(n)
    links.append(link)
links.insert(0, url2)  # page 1 is the bare URL without a page number

df2 = pd.DataFrame()  # accumulator for all scraped tables
for url in links:
    # pandas fetches and parses the page; requires html5lib (or lxml).
    dfs = pd.read_html(url, header=0)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported way to accumulate frames.
    df2 = pd.concat([df2, *dfs], ignore_index=True)

# df2.to_excel('MktDataBJ.xlsx')  # alternatively, store as an Excel file
df2.to_csv('MktDataBJ-1.csv')  # store the data in a csv file
版本 03(2017 年 1 月 12 日)
# Code based on Python 3.x
# _*_ coding: utf-8 _*_
# __Author: "LEMON"
#
# Version 03: fetch each listing page with requests, locate the data
# table with BeautifulSoup, and append its rows to a CSV file.

from bs4 import BeautifulSoup
import requests
import csv

url2 = 'http://www.bjets.com.cn/article/jyxx/?'
links = []
for n in range(2, 40):
    # 39 pages in total; hard-coded for now, could be scraped later.
    link = url2 + str(n)
    links.append(link)
links.insert(0, url2)  # page 1 is the bare URL

for url in links:
    rep = requests.get(url)
    # content = rep.text.encode(rep.encoding).decode('utf-8')
    # # when using requests directly, Chinese text may need transcoding;
    # # passing rep.content (bytes) lets BeautifulSoup detect the encoding.

    soup = BeautifulSoup(rep.content, 'html.parser')

    # table = soup.table
    table = soup.find('table')  # either form works

    trs = table.find_all('tr')
    trs2 = trs[1:]  # skip the header row
    list1 = []
    for tr in trs2:
        td = tr.find_all('td')
        row = [i.text for i in td]
        list1.append(row)

    # open in append mode so each page's rows accumulate in one file
    with open('MktDataBJ.csv', 'a', errors='ignore', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(list1)
版本 02(2017 年 1 月 9 日)
# Code based on Python 3.x
# _*_ coding: utf-8 _*_
# __Author: "LEMON"
#
# Version 02: like version 03, but the table is located through the
# page's 'list_right' div instead of the first <table> element.

from bs4 import BeautifulSoup
import requests
import csv

url2 = 'http://www.bjets.com.cn/article/jyxx/?'
links = []
for n in range(2, 40):
    # 39 pages in total; hard-coded for now, could be scraped later.
    link = url2 + str(n)
    links.append(link)
links.insert(0, url2)  # page 1 is the bare URL
# print(links)

for url in links:
    rep = requests.get(url)
    # content = rep.text.encode(rep.encoding).decode('utf-8')
    # # when using requests directly, Chinese text may need transcoding

    soup = BeautifulSoup(rep.content, 'html.parser')
    body = soup.body
    data = body.find('div', {'class': 'list_right'})

    quotes = data.find_all('tr')
    quotes1 = quotes[1:]  # skip the header row

    list1 = []
    for x in quotes1:
        # one list per daily record, built from the row's <td> cells
        list2 = [y.text for y in x.find_all('td')]
        list1.append(list2)
    # print(list1)  # list1 holds all daily records of this page

    with open('MktDataBJ.csv', 'a', errors='ignore', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(list1)
版本 01(2017 年 1 月 8 日)
# Code based on Python 3.x
# _*_ coding: utf-8 _*_
# __Author: "LEMON"
#
# Version 01: first attempt — flattens every <td> of a page into one
# list, then splits it back into date / volume / mean-price / total
# columns by index modulo 4 before writing the rows to a CSV file.

from bs4 import BeautifulSoup
import requests
import csv

urllink = 'http://www.bjets.com.cn/article/jyxx/?'
links = []
for n in range(2, 40):
    # 39 pages in total; hard-coded for now, could be scraped later.
    link = urllink + str(n)
    links.append(link)
links.insert(0, urllink)  # page 1 is the bare URL
# print(links)

for url in links:

    rep = requests.get(url)
    # content = rep.text.encode(rep.encoding).decode('utf-8')
    # # when using requests directly, Chinese text may need transcoding

    soup = BeautifulSoup(rep.content, 'html.parser')

    # print(soup.prettify())
    # # prettify()

    body = soup.body
    data = body.find('div', {'class': 'list_right'})

    # table title
    titles = data.find_all('th')

    title = [x.text for x in titles]
    # print(title)

    quotes = data.find_all('tr')
    quotes1 = quotes[1:]  # skip the header row
    # print(quotes1)

    # flatten every cell of the page into a single list;
    # the table has exactly 4 columns, so cells repeat in groups of 4
    list1 = []
    for x in quotes1:
        for y in x.find_all('td'):
            list1.append(y.text)
    # print(list1)  # list1 holds all cell values of this page

    date = []
    volumes = []
    meanprice = []
    totalmoney = []

    # demultiplex the flat cell list back into the 4 columns
    for i in range(len(list1)):
        if i % 4 == 0:
            date.append(list1[i])
        elif i % 4 == 1:
            volumes.append(list1[i])
        elif i % 4 == 2:
            meanprice.append(list1[i])
        else:
            totalmoney.append(list1[i])

    # print(date)
    # print(volumes)
    # print(meanprice)
    # print(totalmoney)

    # re-zip the columns into one row per daily record
    final = []
    for i in range(len(date)):
        temp = [date[i], volumes[i], meanprice[i], totalmoney[i]]
        final.append(temp)
    # print(final)

    with open('bj_carbon.csv', 'a', errors='ignore', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(final)