c httpclient抓取网页(GET方法模拟抓取网页使用org.apache.HttpClient )
优采云 发布时间: 2021-09-08 22:14
c httpclient抓取网页(GET方法模拟抓取网页使用org.apache.HttpClient)
我目前正在学习Android,并开发了一个类似于Super Course Schedule和Campus Today的APP。然而,我一直卡在抓取课表这一步,查阅了很多资料还是没能解决,于是下定决心系统地学习HttpClient。先写一个Hello World,边学习边记录!
一、GET 方法模拟爬取网页
使用Apache HttpClient(包名org.apache.http)的GET方法模拟访问网页并抓取数据,需要先引入HttpClient依赖包
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Minimal Apache HttpClient example: sends a GET request and prints the
 * response body (the page source) to standard output.
 */
public class HellWord {

    /**
     * Requests {@code http://hll520.cn} and prints the returned page source.
     *
     * <p>I/O failures (for example network problems) are reported with a stack
     * trace; the client and the response are closed in every case.
     *
     * @param a command-line arguments (unused)
     */
    public static void main(String[] a) {
        // Build the HTTP GET request.
        HttpGet httpGet = new HttpGet("http://hll520.cn");

        // try-with-resources closes the response first and then the client,
        // even when execute() or the body read throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // The entity carries the response body; it may be absent.
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity == null) {
                System.err.println("Response has no body");
                return;
            }
            // Decode explicitly as UTF-8 to avoid garbled characters.
            String h = EntityUtils.toString(httpEntity, "UTF-8");
            // print, not printf: the page source is data, not a format string,
            // and any '%' in the HTML would otherwise raise a format exception.
            System.out.print(h);
        } catch (IOException e) {
            // I/O error (typically a network problem).
            e.printStackTrace();
        }
    }
}
运行结果,模拟打开网页,使用getEntity显示网页的HTML源代码
二、模拟浏览器UA并返回状态
有些网页会针对不同的浏览器返回不同的页面,或者限制机器抓取。这时就需要设置User-Agent(UA)来模拟浏览器访问页面,并可以通过getStatusLine获取响应状态。
1、设置请求头的UA模拟火狐浏览器
httpGet.setHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0");
2、返回状态
response.getStatusLine();//获取当前状态
如果只返回状态码(200)
response.getStatusLine().getStatusCode()
3、返回类型
确定链接的目标类型
entity.getContentType().getValue()
三、GET 带参数
使用URIBuilder构造一个URI并设置参数,有多个参数时就多次调用setParameter
URIBuilder uriBuilder=new URIBuilder("http://baidu.com");
//写入参数 (可以设置多参数)
uriBuilder.setParameter("key","JAVA");
uriBuilder.setParameter("keys","c#");
使用build()方法转换为URI
httpGet=new HttpGet(uriBuilder.build());//使用builder写入URI
带参数的完整代码
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.net.URISyntaxException;
/**
 * GET request with query parameters and a browser User-Agent header.
 * Prints the request URI, the response status code, the content type and
 * the response body.
 */
public class HelloWordUA {

    /**
     * Builds {@code http://baidu.com} with two query parameters, sends the
     * request with a Firefox User-Agent and prints the result.
     *
     * <p>A malformed URI or an I/O failure is reported with a stack trace;
     * the client and the response are closed in every case.
     *
     * @param ars command-line arguments (unused)
     */
    public static void main(String[] ars) {
        HttpGet httpGet;
        try {
            URIBuilder uriBuilder = new URIBuilder("http://baidu.com");
            // Add query parameters (call setParameter once per parameter).
            uriBuilder.setParameter("key", "JAVA");
            uriBuilder.setParameter("keys", "c#");
            // Build the URI once and create the GET request from it.
            httpGet = new HttpGet(uriBuilder.build());
            System.out.println(httpGet.getURI());
        } catch (URISyntaxException e) {
            e.printStackTrace();
            // Without a valid URI there is no request to send.
            return;
        }

        // Set the User-Agent request header to imitate Firefox.
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0");

        // try-with-resources closes the response first and then the client,
        // even when execute() or the body read throws.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // getStatusLine() is the full status line; getStatusCode() is just the code (e.g. 200).
            System.out.println("Status:" + response.getStatusLine().getStatusCode());

            // The entity carries the response body; it may be absent.
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                System.err.println("Response has no body");
                return;
            }

            // The Content-Type header is optional, so guard against null.
            String contentType = entity.getContentType() != null
                    ? entity.getContentType().getValue()
                    : "unknown";
            System.out.println("ContentType:" + contentType);
            System.out.println(EntityUtils.toString(entity, "UTF-8"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}