Nodejs学习笔记(11)
优采云 发布时间: 2020-08-08 17:04目录写在前面
许多人需要数据采集,这可以用不同的语言和不同的方式来实现. 我以前也用C#编写过它. 发送各种请求和定期分析数据主要比较麻烦. 总的来说,没有什么不好,但是效率更差,
使用nodejs(可能仅相对于C#)编写采集程序效率更高. 今天,我将主要使用一个示例来说明如何使用nodejs来实现数据采集器,主要是使用request和cheerio.
请求: 用于http请求
cheerio: 用于提取请求返回的html中所需的信息(与jquery的用法一致)
示例
仅讲API使用并不有趣,也无需记住所有API. 让我们开始一个例子.
说些八卦:
仍然有许多nodejs开发工具. 我还建议崇高. 自从Microsoft启动Visual Studio Code之后,我便转向了nodejs开发.
使用它进行开发比较舒适,无需配置,快速启动,自动完成,视图定义和引用,快速搜索等,并且具有一致的VS风格,因此它应该越来越好,所以我建议^ _ ^!
样品要求
在其中获取文章的“标题”,“地址”,“发布时间”和“封面图片”
采集器
1. 创建项目文件夹sampleDAU
2. 创建package.json文件
{
"name": "Wilson_SampleDAU",
"version": "0.0.1",
"private": false,
"dependencies": {
"request":"*",
"cheerio":"*"
}
}
3. 使用npm在终端中安装参考
cd 项目根目录
npm install
4. 构建app.js并编写采集器代码
首先,使用浏览器打开要采集的URL,使用开发人员工具查看HTML结构,然后根据该结构编写解析代码
/*
* 功能: 数据采集
* 创建人: Wilson
* 时间: 2015-07-29
*/
var request = require('request'),
cheerio = require('cheerio'),
URL_36KR = 'http://36kr.com/'; //36氪
/* 开启数据采集器 */
function dataCollectorStartup() {
dataRequest(URL_36KR);
}
/* 数据请求 */
function dataRequest(dataUrl)
{
request({
url: dataUrl,
method: 'GET'
}, function(err, res, body) {
if (err) {
console.log(dataUrl)
console.error('[ERROR]Collection' + err);
return;
}
switch(dataUrl)
{
case URL_36KR:
dataParse36Kr(body);
break;
}
});
}
/* 36kr 数据解析 */
function dataParse36Kr(body)
{
console.log('============================================================================================');
console.log('======================================36kr==================================================');
console.log('============================================================================================');
var $ = cheerio.load(body);
var articles = $('article')
for (var i = 0; i < articles.length; i++) {
var article = articles[i];
var descDoms = $(article).find('.desc');
if(descDoms.length == 0)
{
continue;
}
var coverDom = $(article).children().first();
var titleDom = $(descDoms).find('.info_flow_news_title');
var timeDom = $(descDoms).find('.timeago');
var titleVal = titleDom.text();
var urlVal = titleDom.attr('href');
var timeVal = timeDom.attr('title');
var coverUrl = coverDom.attr('data-lazyload');
//处理时间
var timeDateSecs = new Date(timeVal).getTime() / 1000;
if(urlVal != undefined)
{
console.info('--------------------------------');
console.info('标题:' + titleVal);
console.info('地址:' + urlVal);
console.info('时间:' + timeDateSecs);
console.info('封面:' + coverUrl);
console.info('--------------------------------');
}
};
}
dataCollectorStartup();
测试结果
此采集器已完成. 实际上,这是一个获取请求. 正文或HTML代码将在请求回调中返回,并且cheerio库将以与jquery库语法相同的方式进行解析,以检索所需的数据!
加入代理商
对采集器进行演示,然后基本完成. 如果您需要长时间使用它以防止网站被阻止,则仍然需要添加代理列表
对于该示例,我从Internet上的免费代理中提出了一些示例,并将其放入proxylist.js中,该列表提供了随机选择代理的功能
var PROXY_LIST = [{"ip":"111.1.55.136","port":"55336"},{"ip":"111.1.54.91","port":"55336"},{"ip":"111.1.56.19","port":"55336"}
,{"ip":"112.114.63.16","port":"55336"},{"ip":"106.58.63.83","port":"55336"},{"ip":"119.188.133.54","port":"55336"}
,{"ip":"106.58.63.84","port":"55336"},{"ip":"183.95.132.171","port":"55336"},{"ip":"11.12.14.9","port":"55336"}
,{"ip":"60.164.223.16","port":"55336"},{"ip":"117.185.13.87","port":"8080"},{"ip":"112.114.63.20","port":"55336"}
,{"ip":"188.134.19.102","port":"3129"},{"ip":"106.58.63.80","port":"55336"},{"ip":"60.164.223.20","port":"55336"}
,{"ip":"106.58.63.78","port":"55336"},{"ip":"112.114.63.23","port":"55336"},{"ip":"112.114.63.30","port":"55336"}
,{"ip":"60.164.223.14","port":"55336"},{"ip":"190.202.82.234","port":"3128"},{"ip":"60.164.223.15","port":"55336"}
,{"ip":"60.164.223.5","port":"55336"},{"ip":"221.204.9.28","port":"55336"},{"ip":"60.164.223.2","port":"55336"}
,{"ip":"139.214.113.84","port":"55336"} ,{"ip":"112.25.49.14","port":"55336"},{"ip":"221.204.9.19","port":"55336"}
,{"ip":"221.204.9.39","port":"55336"},{"ip":"113.207.57.18","port":"55336"} ,{"ip":"112.25.62.15","port":"55336"}
,{"ip":"60.5.255.143","port":"55336"},{"ip":"221.204.9.18","port":"55336"},{"ip":"60.5.255.145","port":"55336"}
,{"ip":"221.204.9.16","port":"55336"},{"ip":"183.232.82.132","port":"55336"},{"ip":"113.207.62.78","port":"55336"}
,{"ip":"60.5.255.144","port":"55336"} ,{"ip":"60.5.255.141","port":"55336"},{"ip":"221.204.9.23","port":"55336"}
,{"ip":"157.122.96.50","port":"55336"},{"ip":"218.61.39.41","port":"55336"} ,{"ip":"221.204.9.26","port":"55336"}
,{"ip":"112.112.43.213","port":"55336"},{"ip":"60.5.255.138","port":"55336"},{"ip":"60.5.255.133","port":"55336"}
,{"ip":"221.204.9.25","port":"55336"},{"ip":"111.161.35.56","port":"55336"},{"ip":"111.161.35.49","port":"55336"}
,{"ip":"183.129.134.226","port":"8080"} ,{"ip":"58.220.10.86","port":"80"},{"ip":"183.87.117.44","port":"80"}
,{"ip":"211.23.19.130","port":"80"},{"ip":"61.234.249.107","port":"8118"},{"ip":"200.20.168.140","port":"80"}
,{"ip":"111.1.46.176","port":"55336"},{"ip":"120.203.158.149","port":"8118"},{"ip":"70.39.189.6","port":"9090"}
,{"ip":"210.6.237.191","port":"3128"},{"ip":"122.155.195.26","port":"8080"}];
module.exports.GetProxy = function () {
var randomNum = parseInt(Math.floor(Math.random() * PROXY_LIST.length));
var proxy = PROXY_LIST[randomNum];
return 'http://' + proxy.ip + ':' + proxy.port;
}
proxylist.js
对app.js代码进行以下更改
/*
* 功能: 数据采集
* 创建人: Wilson
* 时间: 2015-07-29
*/
var request = require('request'),
cheerio = require('cheerio'),
URL_36KR = 'http://36kr.com/', //36氪
Proxy = require('./proxylist.js');
...
/* 数据请求 */
function dataRequest(dataUrl)
{
request({
url: dataUrl,
proxy: Proxy.GetProxy(),
method: 'GET'
}, function(err, res, body) {
...
}
}
...
dataCollectorStartup()
setInterval(dataCollectorStartup, 10000);
这样,转换就完成了,添加了代码,并添加了setInterval以定期执行!