httpclient抓取网页的需求分析与应用（本文与shell编程无关）
优采云 发布时间: 2022-01-03 00:16
一、灵感来源
前两天有朋友跟我提过这样的需求,希望从某个网页抓取数据,自动填入本地Excel表格中。当你需要做大量的数据统计时,将简单的任务自动化会显得很方便。这不就是shell编程的目的吗? (本文与shell编程无关,有感就贴)
二、需求分析
首先,对于需求,需要进行粗略的分析和设计(因为只是一个简单的测试demo,所以不需要考虑可行性、可维护性等),主要归结为到以下步骤:
1. 要抓取网页数据，必须知道网页的网址（URL），这是我们抓取数据的入口网页。
2. 根据发送的 URL 生成请求，服务器返回一个 HTML 响应页面。
3. 判断返回的结果是否是我们需要的。
4. 对返回的 HTML 页面进行解析，将解析后的数据打包写入本地磁盘。
三、解决方案
这个例子测试了“Program it”信息的检索。
1、用到的jar包,主要包括
2、演示结构
模型类(存储对象):
package com.crawler.bean;
/**
 * Data holder for one forum post scraped from the target page:
 * title, author, body text and creation date.
 */
public class Model {
    private String cardTitle;   // post title
    private String authorName;  // post author
    private String cardContent; // post body text
    private String cardDate;    // date the post was created

    /** @return the post title */
    public String getCardTitle() {
        return cardTitle;
    }

    /** @param cardTitle the post title */
    public void setCardTitle(String cardTitle) {
        this.cardTitle = cardTitle;
    }

    /** @return the post author */
    public String getAuthorName() {
        return authorName;
    }

    /** @param authorName the post author */
    public void setAuthorName(String authorName) {
        this.authorName = authorName;
    }

    /** @return the post body text */
    public String getCardContent() {
        return cardContent;
    }

    /** @param cardContent the post body text */
    public void setCardContent(String cardContent) {
        this.cardContent = cardContent;
    }

    /** @return the post creation date */
    public String getCardDate() {
        return cardDate;
    }

    /** @param cardDate the post creation date */
    public void setCardDate(String cardDate) {
        this.cardDate = cardDate;
    }
}
UrlToHtml 类(返回 html 响应页面):
package com.crawler.util;
import com.crawler.bean.Model;
import com.crawler.parser.DataParse;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicHttpResponse;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Fetches the HTML page at a given URL and delegates parsing to
 * {@link DataParse}.
 *
 * <p>Bug fixed vs. the original: it pre-built a dummy 200-OK
 * {@code BasicHttpResponse}, swallowed the {@code IOException} from
 * {@code client.execute()}, and then — because the dummy's status code is
 * 200 — called {@code EntityUtils.toString(null)} and crashed. A failed
 * request now returns an empty list instead.
 */
public class UrlToHtml {
    /**
     * Downloads {@code url} via HTTP GET and parses the thread list from it.
     *
     * @param url entry-point URL to fetch
     * @return the parsed posts; an empty list when the request fails or the
     *         server does not answer 200
     * @throws Exception if reading or parsing the response body fails
     */
    public List URLParser(String url) throws Exception {
        // Collector for the parsed results; also the failure fallback.
        List<Model> cardDatas = new ArrayList<Model>();
        HttpClient client = new DefaultHttpClient();
        HttpGet getMethod = new HttpGet(url);
        HttpResponse response;
        try {
            response = client.execute(getMethod);
        } catch (IOException e) {
            // Best-effort crawl: log and return no data rather than
            // crashing later on a null entity.
            e.printStackTrace();
            return cardDatas;
        }
        int statusCode = response.getStatusLine().getStatusCode();
        if (statusCode == HttpStatus.SC_OK) {
            // Decode the body as UTF-8 and hand it to the parser.
            String entity = EntityUtils.toString(response.getEntity(), "utf-8");
            cardDatas = DataParse.getData(entity);
        }
        // Always drain the entity so the connection can be reused/released.
        EntityUtils.consume(response.getEntity());
        return cardDatas;
    }
}
DataParse 类(解析 html 响应页面):
package com.crawler.parser;
import com.crawler.bean.Model;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Parses the thread list out of a Tieba-style HTML page with jsoup.
 */
public class DataParse {
    /**
     * Extracts one {@link Model} per post found in {@code html}.
     *
     * <p>Bug fixed vs. the original: {@code authorName.substring(6)} threw
     * {@code StringIndexOutOfBoundsException} whenever the title attribute
     * was missing or shorter than the expected prefix; it is now guarded.
     *
     * @param html raw HTML of the page
     * @return parsed posts; empty list when no post nodes match
     * @throws Exception kept for signature compatibility with callers
     */
    public static List getData(String html) throws Exception {
        List<Model> cardDatas = new ArrayList<Model>();
        Document doc = Jsoup.parse(html);
        // NOTE: "t_con cleafix" is the (misspelled) class name actually used
        // by the target site's markup — do not "correct" the selector.
        Elements elements = doc.select("div[class=content]").select("ul[id=thread_list]").select("div[class=t_con cleafix]");
        for (Element ele : elements) {
            // Post title: text of the anchor(s) inside the post node.
            String cardName = ele.select("a").text();
            // Author: the span's title attribute carries a fixed-length
            // prefix (presumably "主题作者: ") before the name — TODO confirm.
            String authorName = ele.select("div[class=threadlist_author pull_right]").select("span").attr("title");
            String newAuthorName = authorName.length() >= 6 ? authorName.substring(6) : authorName;
            // Post body text.
            String cardContent = ele.select("div[class=threadlist_text pull_left]").text();
            // Creation date shown next to the author.
            String cardDate = ele.select("div[class=threadlist_author pull_right]").select("span[class=pull-right is_show_create_time]").text();
            Model cd = new Model();
            cd.setCardTitle(cardName);
            cd.setAuthorName(newAuthorName);
            cd.setCardContent(cardContent);
            cd.setCardDate(cardDate);
            cardDatas.add(cd);
        }
        return cardDatas;
    }
}
WriteToLocal 类(写入本地磁盘):
package com.crawler.service;
import com.crawler.bean.Model;
import org.apache.poi.hssf.usermodel.*;
import org.apache.poi.hssf.util.HSSFColor;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.List;
public class WriteToLocal {
public void writeToExcel(List cardDatas, int columeCount, String[] titles, String path) {
HSSFWorkbook hssfWorkbook = new HSSFWorkbook();
HSSFSheet sheet = hssfWorkbook.createSheet("我的表格");
//创建标题行
HSSFRow headRow = sheet.createRow(0);
for (int i = 0; i