httpclient抓取网页的需求分析与应用（本文与shell编程无关）
优采云 发布时间: 2022-01-03 00:16
一、灵感来源
前两天有朋友跟我提过这样的需求,希望从某个网页抓取数据,自动填入本地Excel表格中。当你需要做大量的数据统计时,将简单的任务自动化会显得很方便。这不就是shell编程的目的吗? (本文与shell编程无关,有感就贴)
二、需求分析
首先,对于需求,需要进行粗略的分析和设计(因为只是一个简单的测试demo,所以不需要考虑可行性、可维护性等),主要归结为到以下步骤:
1. 要抓取网页数据，必须知道网页的网址（URL），这是我们抓取数据的入口网页。
2. 根据发送的 URL 生成请求，服务器返回一个 HTML 响应页面。
3. 判断返回的结果是否是我们需要的。
4. 对返回的 HTML 页面进行解析，将解析后的数据打包写入本地磁盘。
三、解决方案
这个例子测试了“Program it”信息的检索。
1、用到的jar包,主要包括
2、演示结构
模型类(存储对象):
package com.crawler.bean;
/**
 * Data holder for one forum post scraped from the target page:
 * title, author, body text and creation date.
 */
public class Model {
    private String cardTitle;   // post title
    private String authorName;  // post author
    private String cardContent; // post body text
    private String cardDate;    // date the post was created

    /** @return the post title */
    public String getCardTitle() {
        return cardTitle;
    }

    /** @param cardTitle the post title */
    public void setCardTitle(String cardTitle) {
        this.cardTitle = cardTitle;
    }

    /** @return the post author */
    public String getAuthorName() {
        return authorName;
    }

    /** @param authorName the post author */
    public void setAuthorName(String authorName) {
        this.authorName = authorName;
    }

    /** @return the post body text */
    public String getCardContent() {
        return cardContent;
    }

    /** @param cardContent the post body text */
    public void setCardContent(String cardContent) {
        this.cardContent = cardContent;
    }

    /** @return the post creation date */
    public String getCardDate() {
        return cardDate;
    }

    /** @param cardDate the post creation date */
    public void setCardDate(String cardDate) {
        this.cardDate = cardDate;
    }
}
UrlToHtml 类(返回 html 响应页面):
package com.crawler.util;
import com.crawler.bean.Model;
import com.crawler.parser.DataParse;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicHttpResponse;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Fetches the HTML page at a given URL and delegates parsing to
 * {@link DataParse}.
 *
 * <p>Bug fixed vs. the original: it pre-built a dummy 200-OK
 * {@code BasicHttpResponse}, swallowed the {@code IOException} from
 * {@code client.execute()}, and then — because the dummy's status code is
 * 200 — called {@code EntityUtils.toString(null)} and crashed. A failed
 * request now returns an empty list instead.
 */
public class UrlToHtml {
    /**
     * Downloads {@code url} via HTTP GET and parses the thread list from it.
     *
     * @param url entry-point URL to fetch
     * @return the parsed posts; an empty list when the request fails or the
     *         server does not answer 200
     * @throws Exception if reading or parsing the response body fails
     */
    public List URLParser(String url) throws Exception {
        // Collector for the parsed results; also the failure fallback.
        List<Model> cardDatas = new ArrayList<Model>();
        HttpClient client = new DefaultHttpClient();
        HttpGet getMethod = new HttpGet(url);
        HttpResponse response;
        try {
            response = client.execute(getMethod);
        } catch (IOException e) {
            // Best-effort crawl: log and return no data rather than
            // crashing later on a null entity.
            e.printStackTrace();
            return cardDatas;
        }
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode == HttpStatus.SC_OK) {
            // Decode the body as UTF-8 and hand it to the parser.
            String entity = EntityUtils.toString(response.getEntity(), "utf-8");
            cardDatas = DataParse.getData(entity);
        }
        // Always drain the entity so the connection can be reused/released.
        EntityUtils.consume(response.getEntity());
        return cardDatas;
    }
}
DataParse 类(解析 html 响应页面):
package com.crawler.parser;
import com.crawler.bean.Model;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Parses the thread list out of a Tieba-style HTML page with jsoup.
 */
public class DataParse {
    /**
     * Extracts one {@link Model} per post found in {@code html}.
     *
     * <p>Bug fixed vs. the original: {@code authorName.substring(6)} threw
     * {@code StringIndexOutOfBoundsException} whenever the title attribute
     * was missing or shorter than the expected prefix; it is now guarded.
     *
     * @param html raw HTML of the page
     * @return parsed posts; empty list when no post nodes match
     * @throws Exception kept for signature compatibility with callers
     */
    public static List getData(String html) throws Exception {
        List<Model> cardDatas = new ArrayList<Model>();
        Document doc = Jsoup.parse(html);
        // NOTE: "t_con cleafix" is the (misspelled) class name actually used
        // by the target site's markup — do not "correct" the selector.
        Elements elements = doc.select("div[class=content]").select("ul[id=thread_list]").select("div[class=t_con cleafix]");
        for (Element ele : elements) {
            // Post title: text of the anchor(s) inside the post node.
            String cardName = ele.select("a").text();
            // Author: the span's title attribute carries a fixed-length
            // prefix (presumably "主题作者: ") before the name — TODO confirm.
            String authorName = ele.select("div[class=threadlist_author pull_right]").select("span").attr("title");
            String newAuthorName = authorName.length() >= 6 ? authorName.substring(6) : authorName;
            // Post body text.
            String cardContent = ele.select("div[class=threadlist_text pull_left]").text();
            // Creation date shown next to the author.
            String cardDate = ele.select("div[class=threadlist_author pull_right]").select("span[class=pull-right is_show_create_time]").text();
            Model cd = new Model();
            cd.setCardTitle(cardName);
            cd.setAuthorName(newAuthorName);
            cd.setCardContent(cardContent);
            cd.setCardDate(cardDate);
            cardDatas.add(cd);
        }
        return cardDatas;
    }
}
WriteToLocal 类(写入本地磁盘):
package com.crawler.service;
import com.crawler.bean.Model;
import org.apache.poi.hssf.usermodel.*;
import org.apache.poi.hssf.util.HSSFColor;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
public class WriteToLocal {
public void writeToExcel(List cardDatas, int columeCount, String[] titles, String path) {
HSSFWorkbook hssfWorkbook = new HSSFWorkbook();
HSSFSheet sheet = hssfWorkbook.createSheet("我的表格");
//创建标题行
HSSFRow headRow = sheet.createRow(0);
for (int i = 0; i