采集器(一个示例来说一下使用nodejs实现数据采集器,你值得拥有)
优采云 发布时间: 2021-10-25 03:21采集器(一个示例来说一下使用nodejs实现数据采集器,你值得拥有)
目录写在前面
很多人都有做数据采集的需求。它可以用不同的语言和不同的方式来实现。我之前也用 C# 编写过。主要是发送各种请求和用正则解析数据比较麻烦。总体来说没有什么不好,就是效率比较差。
使用 nodejs 编写采集程序效率更高(可能仅相对于 C# 而言)。今天主要通过一个例子来说明如何使用 nodejs 实现数据采集器,主要用到 request 和 cheerio 两个库。
request:用于http请求
cheerio:用于从请求返回的html中提取需要的信息(用法与jquery一致)
例子
单独讲API的用法没什么意思,也没必要记住所有的API。下面直接开始例子。
先说句题外话:
nodejs开发工具还是很多的。我以前也推荐 Sublime。自从微软推出 Visual Studio Code 后,我就转用它来做 nodejs 开发了。
用它开发比较舒服,免配置,启动快,自动补全,查看定义和引用,快速搜索等,秉承了VS一贯的风格,应该会越来越好,所以推荐^_^!
示例要求
从 http://36kr.com/ 中抓取文章的“标题”、“地址”、“发布时间”和“封面图”
采集器
1.创建项目文件夹sampleDAU
2.创建 package.json 文件
{
"name": "Wilson_SampleDAU",
"version": "0.0.1",
"private": false,
"dependencies": {
"request":"*",
"cheerio":"*"
}
}
3.在终端使用npm安装依赖
cd 项目根目录
npm install
4.创建app.js并编写采集器代码
首先使用浏览器打开采集的URL,使用开发者工具查看HTML结构,然后根据结构编写解析代码
检测结果
这个采集器就完成了,其实就是发起一个 GET 请求。请求回调中会返回 body,也就是 HTML 代码,再通过 cheerio 库以 jquery 的语法对其进行解析,取出想要的数据!
加入代理
做一个采集器DEMO 基本上就完成了。如果需要长期使用,为了防止被网站屏蔽,还是需要加入一个代理列表
举个例子,我从网上的免费代理中挑了一些做例子,做成proxylist.js,其中提供了随机选取一个代理的函数
/**
 * Pool of HTTP proxies to choose from.
 * Every entry stores the proxy host as `ip` and its port as `port`,
 * both kept as strings.
 */
var PROXY_LIST = [
    { ip: '111.1.55.136', port: '55336' },
    { ip: '111.1.54.91', port: '55336' },
    { ip: '111.1.56.19', port: '55336' },
    { ip: '112.114.63.16', port: '55336' },
    { ip: '106.58.63.83', port: '55336' },
    { ip: '119.188.133.54', port: '55336' },
    { ip: '106.58.63.84', port: '55336' },
    { ip: '183.95.132.171', port: '55336' },
    { ip: '11.12.14.9', port: '55336' },
    { ip: '60.164.223.16', port: '55336' },
    { ip: '117.185.13.87', port: '8080' },
    { ip: '112.114.63.20', port: '55336' },
    { ip: '188.134.19.102', port: '3129' },
    { ip: '106.58.63.80', port: '55336' },
    { ip: '60.164.223.20', port: '55336' },
    { ip: '106.58.63.78', port: '55336' },
    { ip: '112.114.63.23', port: '55336' },
    { ip: '112.114.63.30', port: '55336' },
    { ip: '60.164.223.14', port: '55336' },
    { ip: '190.202.82.234', port: '3128' },
    { ip: '60.164.223.15', port: '55336' },
    { ip: '60.164.223.5', port: '55336' },
    { ip: '221.204.9.28', port: '55336' },
    { ip: '60.164.223.2', port: '55336' },
    { ip: '139.214.113.84', port: '55336' },
    { ip: '112.25.49.14', port: '55336' },
    { ip: '221.204.9.19', port: '55336' },
    { ip: '221.204.9.39', port: '55336' },
    { ip: '113.207.57.18', port: '55336' },
    { ip: '112.25.62.15', port: '55336' },
    { ip: '60.5.255.143', port: '55336' },
    { ip: '221.204.9.18', port: '55336' },
    { ip: '60.5.255.145', port: '55336' },
    { ip: '221.204.9.16', port: '55336' },
    { ip: '183.232.82.132', port: '55336' },
    { ip: '113.207.62.78', port: '55336' },
    { ip: '60.5.255.144', port: '55336' },
    { ip: '60.5.255.141', port: '55336' },
    { ip: '221.204.9.23', port: '55336' },
    { ip: '157.122.96.50', port: '55336' },
    { ip: '218.61.39.41', port: '55336' },
    { ip: '221.204.9.26', port: '55336' },
    { ip: '112.112.43.213', port: '55336' },
    { ip: '60.5.255.138', port: '55336' },
    { ip: '60.5.255.133', port: '55336' },
    { ip: '221.204.9.25', port: '55336' },
    { ip: '111.161.35.56', port: '55336' },
    { ip: '111.161.35.49', port: '55336' },
    { ip: '183.129.134.226', port: '8080' },
    { ip: '58.220.10.86', port: '80' },
    { ip: '183.87.117.44', port: '80' },
    { ip: '211.23.19.130', port: '80' },
    { ip: '61.234.249.107', port: '8118' },
    { ip: '200.20.168.140', port: '80' },
    { ip: '111.1.46.176', port: '55336' },
    { ip: '120.203.158.149', port: '8118' },
    { ip: '70.39.189.6', port: '9090' },
    { ip: '210.6.237.191', port: '3128' },
    { ip: '122.155.195.26', port: '8080' }
];
/**
 * Pick one entry of PROXY_LIST at random and format it as a proxy URL.
 *
 * @returns {string} Proxy address of the form 'http://<ip>:<port>'.
 * @throws {Error} If PROXY_LIST is empty, so the caller gets a clear message
 *     instead of a TypeError from reading `ip` of undefined.
 */
module.exports.GetProxy = function () {
if (PROXY_LIST.length === 0) {
throw new Error('PROXY_LIST is empty: no proxy available');
}
// Math.random() is in [0, 1), so the floored product is always a valid index.
// Math.floor already returns an integer, so no parseInt round-trip is needed.
var randomNum = Math.floor(Math.random() * PROXY_LIST.length);
var proxy = PROXY_LIST[randomNum];
return 'http://' + proxy.ip + ':' + proxy.port;
};
proxylist.js
对 app.js 代码进行以下更改
/*
* 功能: 数据采集
* 创建人: Wilson
* 时间: 2015-07-29
*/
var request = require('request'),
cheerio = require('cheerio'),
URL_36KR = 'http://36kr.com/', //36氪
Proxy = require('./proxylist.js');
...
/* 数据请求 */
function dataRequest(dataUrl)
{
request({
url: dataUrl,
proxy: Proxy.GetProxy(),
method: 'GET'
}, function(err, res, body) {
...
}
}
...
/* Run one collection pass right away, then repeat it every 10 seconds. */
var COLLECT_INTERVAL_MS = 10000;
dataCollectorStartup();
setInterval(dataCollectorStartup, COLLECT_INTERVAL_MS);
这样改造就完成了,加了代理,还加了setInterval,定时执行!