Scraping data from web pages with Java (I solved this by writing a small program in node, called extract.js, to scrape the text)


  I got around this by writing a small program in node.js (called extract.js) to scrape the text. I used this page to help me:

  Each HTML page that comes back contains several book pages, so if we simply increment the page parameter in the URL by 1, we can easily scrape duplicate book pages (this is the part I was particularly stuck on). I solved it with a jQuery selector that picks out only the single book page named in the URL and ignores the other pages in the HTML. That way I could quickly build the input text file in a spreadsheet program, with one URL per page in strict sequence (since the increment is always 1); a sketch of such a list follows below.
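  A minimal sketch of what that URL list might look like, assuming the Google Books query parameters the script relies on (pg= and &output=text); VOLUME_ID is only a placeholder for the real book identifier:

http://books.google.com/books?id=VOLUME_ID&pg=PA1&output=text
http://books.google.com/books?id=VOLUME_ID&pg=PA2&output=text
http://books.google.com/books?id=VOLUME_ID&pg=PA3&output=text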

  So far I have successfully grabbed the first two volumes, with five more still to go! The code is given below; it may be a useful starting point for scraping other Google Books.

// Usage: node extract.js input output
// where input (mandatory) is the text file containing your list of urls
// and output (optional) is the directory where the output files will be saved

var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

// Read the command line parameters
var input = process.argv[2];
var output = process.argv[3];

if (!input) {
    console.log("Missing input parameter");
    return;
}

// Read the url input file, each url is on a new line
var urls = fs.readFileSync(input).toString().split('\n');

// Discard any lines that are not urls
urls = urls.filter(function(url) {
    return url.slice(0, 4) === 'http';
});

// Iterate through the urls
for (var i = 0; i < urls.length; i++) {
    var url = urls[i];

    // The request function is asynchronous, hence the self-executing function
    // to capture the current url. The callbacks may complete in any order,
    // so each page is saved to its own file.
    request(url, (function(url) {
        return function(err, resp, body) {
            if (err)
                throw err;

            // Extract the pg parameter (book page) from the url.
            // We will use it to extract the text of this book page only,
            // because a retrieved html page contains multiple book pages.
            var pg = url.slice(url.indexOf('pg=') + 3, url.indexOf('&output=text'));

            //
            // Define the filename
            //
            var number = pg.slice(2, pg.length);
            var zeroes = 4 - number.length;

            // Insert leading zeroes so the files sort in page order
            for (var j = 0; j < zeroes; j++) {
                number = '0' + number;
            }

            var filename = pg.slice(0, 2) + number + '.txt';

            // Add path to filename
            if (output) {
                if (!fs.existsSync(output))
                    fs.mkdirSync(output);
                filename = output + '/' + filename;
            }

            // Delete the file if it already exists
            if (fs.existsSync(filename))
                fs.unlinkSync(filename);

            // Make the DOM available to jquery-style selectors
            var $ = cheerio.load(body);

            // Select the book page.
            // Pages are contained within 'div' elements (where class='flow'),
            // each of which contains an 'a' element whose id is equal to the page.
            // Use ^ to match pages because sometimes page ids have a trailing hyphen and extra characters.
            var page = $('div.flow:has(a[id=' + pg + ']), div.flow:has(a[id^=' + pg + '-])');

            //
            // Extract and save the text of the book page to the file
            //
            var hasText = false;

            // Text is in 'gtxt_body', 'gtxt_column' and 'gtxt_footnote'
            page.find('div.gtxt_body, div.gtxt_column, div.gtxt_footnote').each(function() {
                // Inside each(), 'this' is the raw DOM node, so wrap it in $() before calling find()
                $(this).find('p.gtxt_body, p.gtxt_column, p.gtxt_footnote').each(function() {
                    hasText = true;
                    fs.appendFileSync(filename, $(this).text());
                    fs.appendFileSync(filename, '\n\n');
                });
            });

            // Log progress
            if (hasText) {
                console.log("Retrieved and saved page: " + pg);
            } else {
                console.log("Skipping page: " + pg);
            }
        };
    })(url));
}
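  Assuming the URL list above is saved as urls.txt and the pages should go into a books directory (both names are only examples), the script is invoked as described in the usage comment at the top, and each book page ends up in its own numbered .txt file:

node extract.js urls.txt books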
