抓取ajax动态网页java(如何一次性把网站中的ajax获取这里介绍的文件)

优采云发布时间: 2022-02-04 10:03

　　通常情况下，网络爬虫抓取到的基本上只是网页的静态内容；而对于通过 ajax 动态加载的内容，我个人还不清楚如何一次性把整个网站的 ajax 数据都抓取下来。

　　这里介绍的是：针对某个网站中通过 ajax 刷新的某张表格，抓取所需的数据，并在此基础上进行其他操作，比如下载文件：

　　假设我们需要挖掘某个网站：

　　示例：网站中的那些 pdf 文件，并下载它们

　　首先：需要分析网页的构成，看看数据是如何被读取和处理的。本文只讨论 ajax 的情况（其他方式大同小异，ajax 只是针对数据单独发起了一次请求）。

　　具体操作主要通过下面的案例来介绍：

　　先分析 ajax 所使用的请求 url 的含义以及请求所需的参数，然后按这些参数发起请求并获取响应内容：

　　 /**

* 获取某个请求的内容

* @param url 请求的地址

* @param code 请求的编码，不传就代表UTF-8

* @return 请求响应的内容

* @throws IOException

*/

public static String fetch_url(String url, String code) throws IOException {

BufferedReader bis = null;

InputStream is = null;

InputStreamReader inputStreamReader = null;

try {

URLConnection connection = new URL(url).openConnection();

connection.setConnectTimeout(20000);

connection.setReadTimeout(20000);

connection.setUseCaches(false);

connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11");

is = connection.getInputStream();

inputStreamReader = new InputStreamReader(is, code);

bis = new BufferedReader(inputStreamReader);

String line = null;

StringBuffer result = new StringBuffer();

while ((line = bis.readLine()) != null) {

result.append(line);

}

return result.toString();

} finally {

if (inputStreamReader != null) {

try {

inputStreamReader.close();

} catch (IOException e) {

e.printStackTrace();

}

if (bis != null) {

try {

bis.close();

} catch (IOException e) {

e.printStackTrace();

}

if (is != null) {

try {

is.close();

} catch (IOException e) {

e.printStackTrace();

}

　　通过上面的 url 请求，观察响应的数据格式；这里测试得到的响应数据为 json 格式

　　 /**

* 数据转化成json格式

* @param s

*/

public static void getJSON(String s) {

JSONObject object = JSONObject.fromObject(s);

JSONArray array = JSONArray.fromObject(object.get("disclosureInfos"));

//System.out.println(array);

String filePath = object.getString("filePath");//解析数据中的某一个值

//System.out.println(array.size());

List listFilePath = getJSONArray(array,filePath);//将数据解析成条数

/*System.out.println(listFilePath);

System.out.println(listFilePath.size());*/

writer(listFilePath);//根据数据的内容开始挖取下载

}

　　需要下载的数据较多，这里逐个进行下载：

　　 public static void writer(List listFilePath) {

for (String string : listFilePath) {

downloadFile(string);

}

　　从响应数据中解析出 json 里的文件地址

　　 /**

* 解析文件url

* @param array

* @return

*/

public static List getJSONArray(JSONArray array,String filePath) {

List listFilePath = new ArrayList();

for (Object object : array) {

JSONObject ob = JSONObject.fromObject(object);

filePath = filePath + ob.get("filePath").toString();

// System.out.println(filePath);

listFilePath.add(filePath);

}

return listFilePath;

}

　　文件下载处理

　　 /* 下载 url 指向的网页 */

public static String downloadFile(String url) {

String filePath = null;

/* 1.生成 HttpClinet 对象并设置参数 */

HttpClient httpClient = new HttpClient();

// 设置 Http 连接超时 5s

httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);

/* 2.生成 GetMethod 对象并设置参数 */

GetMethod getMethod = new GetMethod(url);

// 设置 get 请求超时 5s

getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);

// 设置请求重试处理

getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());

/* 3.执行 HTTP GET 请求 */

try {

int statusCode = httpClient.executeMethod(getMethod);

// 判断访问的状态码

if (statusCode != HttpStatus.SC_OK) {

System.err.println("Method failed: " + getMethod.getStatusLine());

filePath = null;

}

/* 4.处理 HTTP 响应内容 */

byte[] responseBody = getMethod.getResponseBody();// 读取为字节数组

// 根据网页 url 生成保存时的文件名

filePath = "e:\spider\";

String fileName = getFileNameByUrl(url, getMethod.getResponseHeader("Content-Type").getValue());

saveToLocal(responseBody, filePath,fileName);

} catch (HttpException e) {

// 发生致命的异常，可能是协议不对或者返回的内容有问题

System.out.println("Please check your provided http address!");

e.printStackTrace();

} catch (IOException e) {

// 发生网络异常

e.printStackTrace();

} finally {

// 释放连接

getMethod.releaseConnection();

}

return filePath;

}

　　确认文件名和文件格式

　　 /**

* 根据 url 和网页类型生成需要保存的网页的文件名去除掉 url 中非文件名字符

*/

public static String getFileNameByUrl(String url, String contentType) {

// remove http://

url = url.substring(7);

// text/html类型

if (contentType.indexOf("html") != -1) {

url = url.replaceAll("[\?/:*|\"]", "_") + ".html";

return url;

}

// 如application/pdf类型

else {

return url.replaceAll("[\?/:*|\"]", "_") + "." + contentType.substring(contentType.lastIndexOf("/") + 1);

}

　　保存要写入的文件地址

　　 /**

* 保存网页字节数组到本地文件 filePath 为要保存的文件的相对地址

*/

private static void saveToLocal(byte[] data, String fileDir,String fileName) {

try {

File fileNew=new File(fileDir+"\"+fileName);//new 一个文件构造参数是字符串

File rootFile=fileNew.getParentFile();//得到父文件夹

if( !fileNew.exists()) {

rootFile.mkdirs();

fileNew.createNewFile();

}

DataOutputStream out = new DataOutputStream(new FileOutputStream(fileNew));

for (int i = 0; i < data.length; i++)

out.write(data[i]);

out.flush();

out.close();

} catch (IOException e) {

e.printStackTrace();

}

　　测试，这里写一个测试URL，网站地址和URL参数可能会发生变化，需要适当调整

/**
 * Manual test entry point: fetches one sample announcement-list URL and downloads the files
 * listed in its JSON response.
 *
 * <p>The address and its query parameters are hard-coded and may need adjusting.
 *
 * @param args unused
 * @throws Exception if fetching or processing the response fails
 */
public static void main(String[] args) throws Exception{
    final String requestUrl = "http://www.neeq.cc/controller/GetDisclosureannouncementPage?type=7&key=&startDate=2015-05-20&endDate=2015-05-21&queryParams=0&page=1&_=1432187131769";
    getJSON(fetch_url(requestUrl, "utf-8"));
}

　　完（欢迎转载）

0

2022-02-04

抓取ajax动态网页java

0 个评论

要回复文章请先登录或注册

AI时代内容工厂

抓取ajax动态网页java(如何一次性把网站中的ajax获取这里介绍的文件)

0 个评论

发起人

AI时代内容工厂

抓取ajax动态网页java(如何一次性把网站中的ajax获取这里介绍的文件)

0 个评论

发起人

相关问题