Java从网页抓取数据(commons-io工具获取页面或Json、Jsoup工具)
优采云 发布时间: 2022-01-31 17:11
Java从网页抓取数据(commons-io工具获取页面或Json、Jsoup工具)
1)JDK自带HTTP连接,获取页面或Json
2)JDK自带URL连接,获取页面或Json
3)HttpClient Get工具,获取页面或Json
4)commons-io工具,获取页面或Json
5)Jsoup工具(一般用于html字段解析),获取页面,非Json返回格式
完整代码:
package com.yeezhao.common.http;

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;

/**
 * Side-by-side comparison of several ways to issue an HTTP GET and read the
 * response body as text: the JDK's {@link HttpURLConnection}, the JDK's
 * {@link URL#openStream()}, Apache Commons HttpClient 3.x, Apache Commons IO
 * and Jsoup.
 *
 * <p>All methods are static and propagate any failure to the caller.
 *
 * @author Administrator -> junhong
 *
 * 2016-12-27
 */
public class HttpFetchUtil {

    /** Browser-like User-Agent header sent by {@link #JDKFetch(String, String)}. */
    private static final String BROWSER_USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36"
            + " (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";

    /** Timeout, in milliseconds, passed to Jsoup by {@link #jsoupFetch(String)}. */
    private static final int JSOUP_TIMEOUT_MILLIS = 2 * 1000;

    /**
     * Returns the HTTP status code obtained by connecting to the given URL.
     *
     * @param request the URL to connect to; must resolve to an HTTP(S) connection
     * @return the HTTP response status code
     * @throws Exception if the URL is malformed or the connection fails
     */
    public static int getResponseCode(String request) throws Exception {
        URL url = new URL(request);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            return conn.getResponseCode();
        } finally {
            // Release the connection on every path instead of leaving it to GC.
            conn.disconnect();
        }
    }

    /**
     * 1) Fetches a page or JSON document using the JDK's {@link HttpURLConnection}.
     *
     * <p>A browser-like {@code User-Agent} header is sent. The body is read line
     * by line and every line is terminated with {@code "\n"} in the result.
     *
     * @param request the URL to fetch; must resolve to an HTTP(S) connection
     * @param charset name of the charset used to decode the response body
     * @return the response body when the status is {@code 200 OK}; otherwise an
     *         empty string
     * @throws Exception if the URL is malformed, the charset is unsupported, or
     *         an I/O error occurs
     */
    public static String JDKFetch(String request, String charset) throws Exception {
        URL url = new URL(request);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            // Imitate a browser so that servers filtering on User-Agent still answer.
            conn.setRequestProperty("User-Agent", BROWSER_USER_AGENT);
            if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return "";
            }
            InputStream input = conn.getInputStream();
            try {
                BufferedReader reader =
                        new BufferedReader(new InputStreamReader(input, charset));
                StringBuilder body = new StringBuilder();
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line).append('\n');
                }
                return body.toString();
            } finally {
                // Closed here so the stream is released even when decoding or
                // reading fails part-way through.
                input.close();
            }
        } finally {
            // Also runs for non-200 responses, which previously skipped disconnect().
            conn.disconnect();
        }
    }

    /**
     * 2) Fetches a page or JSON document using the JDK's {@link URL#openStream()}.
     *
     * @param request the URL to fetch
     * @param charset name of the charset used to decode the response body
     *        (NOTE(review): commons-io 2.x treats {@code null} as the platform
     *        default charset; confirm for the version in use)
     * @return the response body decoded with {@code charset}
     * @throws Exception if the URL is malformed, the charset is unsupported, or
     *         an I/O error occurs
     */
    public static String URLFetch(String request, String charset) throws Exception {
        InputStream input = new URL(request).openStream();
        try {
            // Decode with the requested charset; the one-argument overload would
            // silently fall back to the platform default.
            return IOUtils.toString(input, charset);
        } finally {
            // IOUtils.toString(InputStream, ...) does not close the stream itself.
            input.close();
        }
    }

    /**
     * 3) Fetches a page or JSON document with an Apache Commons HttpClient GET.
     *
     * @param url the URL to fetch
     * @param charset charset name configured as the client's content charset
     * @return the response body as a string
     * @throws Exception if the request cannot be executed or the body cannot
     *         be read
     */
    public static String httpClientFetch(String url, String charset) throws Exception {
        HttpClient httpClient = new HttpClient();
        httpClient.getParams().setContentCharset(charset);
        HttpMethod method = new GetMethod(url); // GET
        try {
            httpClient.executeMethod(method);
            // The body must be read before the connection is released.
            return method.getResponseBodyAsString();
        } finally {
            // HttpClient 3.x requires an explicit release after every request.
            method.releaseConnection();
        }
    }

    /**
     * 4) Fetches a page or JSON document using Apache Commons IO.
     *
     * @param url the URL to fetch
     * @param charset name of the charset used to decode the response body
     * @return the response body decoded with {@code charset}
     * @throws Exception if the URL is malformed or an I/O error occurs
     */
    public static String commonsIOFetch(String url, String charset) throws Exception {
        return IOUtils.toString(new URL(url), charset);
    }

    /**
     * 5) Fetches a page using Jsoup (typically used for parsing HTML fields).
     * Intended for HTML pages, not for JSON responses.
     *
     * <p>The request uses a timeout of {@value #JSOUP_TIMEOUT_MILLIS} milliseconds.
     *
     * @param url the URL to fetch
     * @return the HTML of the document as parsed by Jsoup
     * @throws Exception if the URL is malformed or the page cannot be fetched
     */
    public static String jsoupFetch(String url) throws Exception {
        return Jsoup.parse(new URL(url), JSOUP_TIMEOUT_MILLIS).html();
    }
}
测试代码:
附:相关jar依赖
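...
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.7.3</version>
</dependency>
<dependency>
    <groupId>commons-httpclient</groupId>
    <artifactId>commons-httpclient</artifactId>
    <version>3.1</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>
...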
后记:
在当前的数据时代,有“数据就是财富”的说法。因此,数据采集技术会不断地发展和更新。后续将在本文的基础上进一步补充POST方式的数据采集,敬请期待!
以上就是Java实现http数据抓取的几种方式的详细内容。更多详情请关注宏旺互联网其他相关话题文章!