Journal record counts for "convolutional neural network" on two sites

优采云 · Published: 2021-04-27 01:13


I recently needed to collect CNKI data (title, source, keywords, authors, affiliation, classification code, abstract, similar documents) for work. CNKI's anti-crawling is strong and the content-page links are encrypted, so pyspider, Scrapy, and Selenium all failed to reach the content pages and were redirected straight to the CNKI homepage. I therefore fell back on CNKI's own search interface (yuanjian.cnki.net, used in the code below). First, a comparison of the journal record counts for "卷积神经网络" (convolutional neural network) on the two sites:

  

[Two screenshots: journal record counts for "卷积神经网络" on each of the two sites]

A closer look shows that the site sends a POST request, and the interesting part is the request parameters. Open yuanjian.cnki.net, search for what you need, press F12 to open the developer tools, and inspect the Form Data of the request. In my case the search was for journal articles on 卷积神经网络; substitute your own values here. (A minimal sketch that posts this form data follows the block below.)

formdata = {'Type': 1,
            'Order': 1,
            'Islegal': 'false',
            'ArticleType': 1,
            'Theme': '卷积神经网络',
            'searchType': 'MulityTermsSearch',
            'ParamIsNullOrEmpty': 'true',
            'Page': i}
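
Before writing the full crawler, it is worth checking that the endpoint actually accepts this form data. Here is a minimal sketch (my own addition, not part of the original capture) that posts a single page and prints the HTTP status and response size; the endpoint URL is the one used in the full script below, and a real session may also need the browser cookies shown there:

import requests

url = 'http://yuanjian.cnki.net/Search/Result'
formdata = {'Type': 1, 'ArticleType': 1,
            'Theme': '卷积神经网络', 'Page': 1}
# Post one page of the search and confirm the server responds with HTML
r = requests.post(url, data=formdata, timeout=10)
print(r.status_code, len(r.text))  # expect 200 and a non-empty body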

The full implementation is below:

# encoding: utf-8
import json
import codecs

import requests
from lxml import etree

class CNKI(object):
    # Request headers for the search endpoint
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'text/html, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'yuanjian.cnki.net',
        'Origin': 'http://yuanjian.cnki.net',
        'Referer': 'http://yuanjian.cnki.net/Search/Result',
        'X-Requested-With': 'XMLHttpRequest'}
    # Session cookie copied from the browser; replace with your own
    cookies = {
        'Cookie': 'Ecp_ClientId=4181108101501154830; cnkiUserKey=ec1ef785-3872-fac6-cad3-402229207945; UM_distinctid=166f12b44b1654-05e4c1a8d86edc-b79183d-1fa400-166f12b44b2ac8; KEYWORD=%E5%8D%B7%E7%A7%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C%24%E5%8D%B7%E7%A7%AF%20%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C; Ecp_IpLoginFail=1811121.119.135.10; amid=73b0014b-8b61-4e24-a333-8774cb4dd8bd; SID=110105; CNZZDATA1257838113=579682214-1541655561-http%253A%252F%252Fsearch.cnki.net%252F%7C1542070177'}

    def content(self):
        li = []
        url = 'http://yuanjian.cnki.net/Search/Result'
        # Walk every page of the result list
        for page in range(1, 134):
            print('Current page:', page)
            # Form data for the POST request; Theme is the search query
            formdata = {'Type': 1,
                        'ArticleType': 1,
                        'Theme': '卷积神经网络',
                        'Page': page}
            print(formdata)
            try:
                r = requests.post(url, data=formdata,
                                  headers=self.headers, cookies=self.cookies)
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                data = etree.HTML(r.text)
                # Links to the article detail pages
                url_list = data.xpath("//*[@id='article_result']/div/div/p[1]/a[1]/@href")
                # Keywords, one ';'-joined string per article
                key_wordlist = []
                all_items = data.xpath("//*[@id='article_result']/div/div")
                for idx in range(1, len(all_items) + 1):
                    key_word = data.xpath(
                        "//*[@id='article_result']/div/div[%s]/div[1]/p[1]/a/text()" % idx)
                    key_wordlist.append(';'.join(key_word))
                # Journal names, one per article
                sources = data.xpath("//*[@id='article_result']/div/div/p[3]/a[1]/span/text()")
                for index, link in enumerate(url_list):
                    items = {}
                    try:
                        print('Current link:', link)
                        detail = requests.get(link, headers=self.headers)
                        contents = etree.HTML(detail.text)
                        # Title
                        title = contents.xpath("//h1[@class='xx_title']/text()")[0]
                        print('Title:', title)
                        items['title'] = title
                        # Source (journal)
                        items['source'] = sources[index]
                        print('Source:', sources[index])
                        # Keywords
                        print('Keywords:', key_wordlist[index])
                        items['keywordsEn'] = ''
                        items['keywordsCh'] = key_wordlist[index]
                        # Authors
                        author = contents.xpath("//*[@id='content']/div[2]/div[3]/a/text()")
                        items['author'] = author
                        print('Authors:', author)
                        # Affiliation
                        unit = contents.xpath("//*[@id='content']/div[2]/div[5]/a[1]/text()")
                        units = ''.join(unit).strip(';')
                        items['unit'] = units
                        print('Affiliation:', units)
                        # Classification code
                        classify = contents.xpath("//*[@id='content']/div[2]/div[5]/text()")[-1]
                        items['classify'] = classify
                        print('Classification:', classify)
                        # Abstract
                        abstract = contents.xpath("//div[@class='xx_font'][1]/text()")[1].strip()
                        print('Abstract:', abstract)
                        items['abstractCh'] = abstract
                        items['abstractEn'] = ''
                        # Similar documents: split on '期' (issue marker) and re-append it
                        similar = contents.xpath(
                            "//*[@id='xiangsi']/table[2]/tbody/tr[3]/td/table/tbody/tr/td/text()")
                        si = ''.join(similar).replace('\r\n', '').split('期')
                        po = []
                        for s in si:
                            sis = s + '期'
                            if len(sis) > 3:
                                po.append(sis)
                        items['similar_article'] = po
                        li.append(items)
                    except Exception as e:
                        print(e)
                print(len(li))
            except Exception as e:
                print(e)
        return li

if __name__ == '__main__':
    con = CNKI()
    items = con.content()
    print(items)
    try:
        # 'with' closes the file automatically; ensure_ascii=False keeps Chinese readable
        with codecs.open('./cnki_data.json', 'a+', encoding='utf-8') as fp:
            for i in items:
                fp.write(json.dumps(i, ensure_ascii=False) + ',\n')
    except IOError as err:
        print('error' + str(err))
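
One caveat about the output: each record is written as a JSON object followed by a trailing comma, so cnki_data.json is not one valid JSON document. A small sketch for loading it back, assuming the line-per-record format written above:

import json
import codecs

records = []
with codecs.open('./cnki_data.json', 'r', encoding='utf-8') as fp:
    for line in fp:
        # Strip the trailing comma added by the writer before decoding
        line = line.strip().rstrip(',')
        if line:
            records.append(json.loads(line))
print(len(records))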

That's it~

I'm a beginner; I hope this is helpful. If anything is wrong, please correct me.
