使用chrome插件批量读取浏览器页面内容并写入数据库

优采云 发布时间: 2020-08-08 02:33

  想象一下,如果您每天需要采集100页甚至更多的网页数据,手动采集会让人吐血,因此用程序采集就成了唯一的选择。首先想到的肯定是Java、PHP、C#等高级语言,但一遇到登录和验证码就无从下手了。您是否还在为采集网页数据而发愁?恭喜,您来对地方了。

  应用场景:

  1. 每天需要在网络上重复采集大量数据

  2. 需要登录后才能采集的网页数据

  3. 网页需要翻页

  解决方案:

  手动登录,然后使用chrome插件的方式进行采集。当然,您会说用Selenium等自动化测试工具采集更酷,可以每天定时采集,完全自动化、无需人工干预。但作为Chrome的忠实粉丝,只需要前端js、服务器端接收文件和数据库就能完美解决这个问题,而且部署和操作都很简单。成为粉丝的理由有很多。好吧,就算这是一种"曲线救国"的实现方式。

  思考:

  

  帮助手册:

  示例:

  

  获取电子商务公司的后端订单数据

  1. 创建一个项目文件夹并导入所需的文件,例如 D:\tool\chrome_server_plugin

  jquery-2.1.1.min.js,icon.png

  2,创建background.html

  3,创建配置文件manifest.json文件

  {

"name": "获取某电商后台订单信息",

"version": "1.0",

"manifest_version": 2,

"description": "*********获取某电商后台订单信息*********",

"browser_action": {

"default_icon": "icon.png"

},

"permissions": [

"webNavigation",

"tabs",

"contextMenus",

"http://服务器接受数据url/"

],

"background": {

"scripts": ["eventPage.js","jquery-2.1.1.min.js"]

},

"content_scripts": [

{

"matches": ["http://抓取页面url/*"],

"js": ["jquery-2.1.1.min.js", "contentscript.js"]

}

]

}

  4,创建前端js文件contentscript.js

  // Pagination state shared with getOrderInfo(), which refreshes both values on every run.
let totalPage;
let page = 0;

// Start scraping only when the background page sends its start command
// ({greeting: "start working"}). The background page also sends other
// messages to this tab, for example {"cmd": "end"} once an upload has
// finished; treating those as a start command as well would scrape and
// upload the same page again, so every message without a `greeting` field
// is ignored.
chrome.extension.onMessage.addListener(function (request, sender, sendResponse) {
  if (!request || request.greeting === undefined) {
    return;
  }
  totalPage = $("input[name=totalPage]").val();
  console.log("totalPage----------" + totalPage);
  getOrderInfo(sendResponse);
});

/**
 * Scrapes every order shown on the current page, sends the rows to the
 * background page through sendMsg() and moves on to the next page when
 * there is one.
 *
 * Each order becomes one string of eleven fields joined with "@_@":
 * shop id, order id, review state, payment amount, order time, payment type,
 * shipping fee, order status, buyer account, remark and tracking number.
 * A field the page does not provide is emitted as an empty string.
 *
 * Side effects: updates the script-level `page` and `totalPage` variables,
 * calls sendMsg(rows, "next" | "end") exactly once, and clicks the
 * next-page link when more pages remain.
 *
 * @param {Function} sendResponse Reply callback handed over by the message
 *     listener. It is accepted for interface compatibility and not used.
 */
function getOrderInfo(sendResponse) {
  const FIELD_SEPARATOR = "@_@";
  // Hard stop so that a run never walks past this page number.
  const MAX_PAGES = 100;

  // Removes every carriage return, line feed and space character.
  const stripBlanks = (text) => String(text).replace(/[\r\n ]+/g, "");

  // Returns what follows `label` inside `text`, or null when `label` is absent.
  const textAfter = (text, label) => {
    const at = text.indexOf(label);
    return at > -1 ? text.slice(at + label.length) : null;
  };

  // Maps the inline style of the review link to a review state. Whitespace is
  // removed first so "display: none;" and "display:none;" are treated alike.
  const describeComment = (style) => {
    const compact = String(style == null ? "" : style).replace(/\s+/g, "");
    if (/display:block/i.test(compact)) {
      return "已评价";
    }
    if (/display:none/i.test(compact)) {
      return "未评价";
    }
    return compact;
  };

  // Takes everything after the first comma of the onclick handler text and
  // drops the closing ")" and ";".
  const extractShopId = (onclick) => {
    const handler = String(onclick == null ? "" : onclick);
    return handler.slice(handler.indexOf(",") + 1).replace(/[);]/g, "").trim();
  };

  // Payment amount and order time from the header rows.
  const payMoney = [];
  const orderTime = [];
  $("tr[class=head] span").each((index, element) => {
    const text = $(element).text();
    const money = textAfter(text, "货款金额:");
    if (money !== null) {
      payMoney.push(money);
      return;
    }
    const time = textAfter(text, "下单时间:");
    if (time !== null) {
      orderTime.push(time);
    }
  });

  // Payment type and shipping fee.
  const paytype = [];
  const yunfei = [];
  $("td[class=p-values]").each((index, element) => {
    const text = $(element).text();
    // NOTE(review): "*敏*感*词*" looks like a censorship placeholder left by
    // the site this code was published on. The intended marker text
    // (presumably the cash-on-delivery label) has to be restored before this
    // branch can ever match - confirm with the original author.
    paytype.push(text.indexOf("*敏*感*词*") > -1 ? "*敏*感*词*" : "在线支付");
    const fee = textAfter(text, "运费:");
    yunfei.push(fee === null ? 0 : fee);
  });

  // Order status, buyer account and remark. The cells are consumed in groups
  // of five; positions 1, 2 and 3 of each group carry the values of interest.
  const orderStatus = [];
  const users = [];
  const remark = [];
  $("tr[class=content] td[class=t-c]").each((index, element) => {
    const text = stripBlanks($(element).text());
    switch (index % 5) {
      case 1:
        orderStatus.push(text);
        break;
      case 2:
        users.push(text);
        break;
      case 3:
        remark.push(text);
        break;
      default:
        break;
    }
  });

  // Tracking numbers.
  const express = [];
  $("tr[class=content] td div[style='text-align: center;']").each((index, element) => {
    express.push(stripBlanks($(element).text()));
  });

  // One joined row per order link, combining the columns collected above by
  // position.
  const orderInfo = [];
  $("tr[class=head] a[track=orderinfopagebeta]").each((index, element) => {
    const orderid = $(element).text();
    const mycomment = describeComment($("a[id=comment_" + orderid + "]").attr("style"));
    const shopid = extractShopId($("img[id=remarkFlag_" + orderid + "]").attr("onclick"));
    const fields = [
      shopid,
      orderid,
      mycomment,
      payMoney[index],
      orderTime[index],
      paytype[index],
      yunfei[index],
      orderStatus[index],
      users[index],
      remark[index],
      express[index],
    ];
    // Array.prototype.join renders undefined entries as "", so a missing
    // column no longer uploads the literal text "undefined".
    const orderdesc = fields.join(FIELD_SEPARATOR);
    console.log("---------orderdesc-------" + orderdesc);
    orderInfo.push(orderdesc);
  });

  page = parseInt($("a[class=current]").text(), 10);
  totalPage = parseInt($("input[name=totalPage]").val(), 10);
  console.log(page + "--page-----------totalPage---" + totalPage);

  // The second "next" link is preferred, as before (presumably the page
  // renders two pagers - confirm); fall back to the first one if it is alone.
  const nextLinks = $("a.next");
  const nextLink = nextLinks[1] || nextLinks[0];

  // NaN page numbers compare false and therefore end the run. Without a link
  // to click the run has to end too, otherwise the background page would
  // wait for a navigation that never happens.
  if (page < totalPage && page < MAX_PAGES && nextLink) {
    console.log("---------next-------");
    sendMsg(orderInfo, "next");
    nextLink.click();
  } else {
    console.log("---------end-------");
    sendMsg(orderInfo, "end");
  }
}

/**
 * Hands scraped content over to the background page for processing.
 *
 * @param {*} msg Value sent under the "msg" key.
 * @param {*} cmd Value sent under the "cmd" key.
 */
function sendMsg(msg, cmd) {
  const payload = { "msg": msg, "cmd": cmd };
  // The reply of the background page is not needed here.
  const ignoreReply = () => {};
  chrome.extension.sendMessage(payload, ignoreReply);
}

  5. 创建一个后台处理js文件eventPage.js

  // True while a scraping run is active; page loads only trigger the next scrape while it is set.
let flag = false;
// Id of the tab in which the toolbar button was clicked.
let currentTabId;

// Toolbar button: start a run in the tab the button was clicked in.
chrome.browserAction.onClicked.addListener(function (tab) {
  console.log('Turning ' + tab.url);
  flag = true;
  currentTabId = tab.id;
  // onClicked already supplies the tab, so the extra lookup through the
  // deprecated chrome.tabs.getSelected is not needed.
  sendMsg(tab.id);
});

// Page finished loading: continue the run on the freshly loaded page.
chrome.webNavigation.onCompleted.addListener(function (details) {
  console.log('加载完成***' + details.tabId);
  if (!flag) {
    return;
  }
  // onCompleted fires for every tab and for every sub-frame. Only the
  // top-level document (frameId 0) of the tab being scraped may start the
  // next scrape, otherwise unrelated tabs get messaged and a page with
  // iframes is scraped several times.
  if (details.tabId !== currentTabId || details.frameId !== 0) {
    return;
  }
  sendMsg(details.tabId);
});

// Scraped rows arriving from the content script: upload them to the server.
chrome.extension.onMessage.addListener(function (request, sender, sendResponse) {
  console.log("*******evenPage.js***chrome.extension.onMessage.addListener");

  // Ignore messages that do not carry a list of rows instead of throwing.
  if (!request || !Array.isArray(request.msg)) {
    return;
  }

  // Acknowledge to the tab the rows came from; fall back to the tab of the
  // current run when the sender has no tab.
  const replyTabId = sender && sender.tab ? sender.tab.id : currentTabId;

  $.ajax({
    url: "服务器接受数据URL/getOrderinfo.php",
    cache: false,
    type: "POST",
    data: {'orderinfo': request.msg.join("#$#")},
    dataType: "json"
  }).done(function (msg) {
    console.log('*******************json*************' + (msg ? msg.sql : msg));
    // NOTE(review): the receiving content script must not treat this
    // acknowledgement as a start command; its listener should react only to
    // messages that carry `greeting`, otherwise every upload restarts a scrape.
    chrome.tabs.sendMessage(replyTabId, {"cmd": "end"}, function (response) {
      console.log(response);
    });
  }).fail(function (jqXHR, textStatus) {
    // Report the failure instead of dropping it silently.
    console.error('upload of order info failed: ' + textStatus);
  });

  if ('end' === request.cmd) {
    // Last page reached: stop reacting to page loads.
    flag = false;
  }
});

/**
 * Sends {"cmd": "ok", "sku": colores} to the active tab of the current
 * window and logs the tab's reply.
 *
 * @param {*} colores Value forwarded under the "sku" key.
 */
function sendSku2Info(colores) {
  chrome.tabs.query({active: true, currentWindow: true}, function (tabs) {
    // The query may return no tab at all; reading tabs[0].id would then throw
    // inside the callback, so report the situation and stop instead.
    if (!tabs || tabs.length === 0) {
      console.warn("sendSku2Info: no active tab to send the sku message to");
      return;
    }
    chrome.tabs.sendMessage(tabs[0].id, {"cmd": "ok", "sku": colores}, function (response) {
      console.log(response);
    });
  });
}

/**
 * Tells the content script in the given tab to start working.
 *
 * @param {number} tabid Id of the tab that receives the start message.
 */
function sendMsg(tabid) {
  const startCommand = { greeting: "start working" };
  // The content script's reply is not used.
  const ignoreReply = () => {};

  console.log(tabid + "--sendMsg()----eventPage.js");
  chrome.tabs.sendMessage(tabid, startCommand, ignoreReply);
}

  6. 创建服务器端接收文件getOrderInfo.php(亲,记得把它放到服务器上!)

<p>

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线