笑话集网站最近更新网站内容采集
优采云 发布时间: 2020-08-24 07:14笑话集网站最近更新网站内容采集
本篇博客主要介绍笑话集(www.jokeji.cn)最近更新列表页内容的抓取实现方法,程序源代码下载地址:
首先介绍一下抓取入口。这里没有实现抓取程序的周期性采集,可以按照自己的需要来编写相应的线程。
/**
*@Description: 笑话集抓取调度入口
*/
package cn.lulei.crawl.jokeji;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.concurrent.TimeUnit;
import cn.lulei.db.jokeji.JokeDbOperation;
import cn.lulei.model.Jokeji;
import cn.lulei.util.ParseUtil;
public class JokeCrawl {
//笑话集更新列表页url格式
private static String listPageUrl = "http://www.jokeji.cn/list_%pno%.htm";
//两次访问页面事件间隔,单位ms
private static int sleepTime = 500;
/**
* @param start 起始页
* @param end 终止页
* @throws IOException
* @Date: 2014-2-12
* @Author: lulei
* @Description: 抓取更新列表页上的内容
*/
public void crawlMain(int start, int end) throws IOException{
start = start < 1 ? 1 : start;
JokeDbOperation jokeDbOperation = new JokeDbOperation();
for ( ; start 0) {
try {
if (httpClient.executeMethod(method) != HttpStatus.SC_OK){
log.error("can not connect " + urlStr);
return false;
}
//获取头信息
responseHeaders = method.getResponseHeaders();
//获取页面源代码
InputStream inputStream = method.getResponseBodyAsStream();
BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));
StringBuffer stringBuffer = new StringBuffer();
String lineString = null;
while ((lineString = bufferedReader.readLine()) != null){
stringBuffer.append(lineString);
}
pageSourceCode = stringBuffer.toString();
return true;
} catch (Exception e) {
System.out.println(urlStr + " -- can't connect " + (maxConnectTimes - n + 1));
n--;
}
}
return false;
}
/**
 * Builds a GET request for the given url, applying each entry of
 * {@code params} as an HTTP <em>request header</em> (note: headers, not
 * query-string parameters — the pairs go through setRequestHeader).
 *
 * @param urlStr url to request
 * @param params header-name -> header-value map; may be null for no headers
 * @return GetMethod configured with the given headers
 * @Date: 2013-9-12
 * @Author: lulei
 */
@SuppressWarnings("rawtypes")
private GetMethod createGetMethod(String urlStr, HashMap params){
    GetMethod getMethod = new GetMethod(urlStr);
    if (params == null){
        return getMethod;
    }
    // enhanced for-loop instead of an explicit Iterator — same iteration
    // order and behavior, less boilerplate
    for (Object entryObj : params.entrySet()) {
        Map.Entry entry = (Map.Entry) entryObj;
        getMethod.setRequestHeader((String) entry.getKey(), (String) entry.getValue());
    }
    return getMethod;
}
/**
 * Builds a POST request for the given url, applying each entry of
 * {@code params} as a POST body parameter (via setParameter — unlike
 * createGetMethod, which sets request headers).
 *
 * @param urlStr url to request
 * @param params parameter-name -> value map; may be null for no parameters
 * @return PostMethod configured with the given body parameters
 * @Date: 2013-9-12
 * @Author: lulei
 */
@SuppressWarnings("rawtypes")
private PostMethod createPostMethod(String urlStr, HashMap params){
    PostMethod postMethod = new PostMethod(urlStr);
    if (params == null){
        return postMethod;
    }
    // enhanced for-loop instead of an explicit Iterator — same behavior
    for (Object entryObj : params.entrySet()) {
        Map.Entry entry = (Map.Entry) entryObj;
        postMethod.setParameter((String) entry.getKey(), (String) entry.getValue());
    }
    return postMethod;
}
/**
 * Fetches a page via GET without setting any custom request headers.
 *
 * @param urlStr url to read
 * @param charsetName charset used to decode the response body
 * @return whether the page was fetched successfully
 * @throws IOException on connection failure
 * @Date: 2013-9-12
 * @Author: lulei
 */
public boolean readPageByGet(String urlStr, String charsetName) throws IOException{
    // delegate to the three-argument overload with no header params
    return readPageByGet(urlStr, charsetName, null);
}
/**
 * Returns the page source captured by the most recent successful read.
 * NOTE(review): presumably null/empty before any page has been fetched —
 * confirm against the field initializer (not visible in this chunk).
 * @return String page source code of the last fetched page
 * @Date: 2013-9-12
 * @Author: lulei
 */
public String getPageSourceCode(){
return pageSourceCode;
}
/**
 * Returns the response headers captured by the most recent page read.
 * NOTE(review): this hands out the internal array directly, so callers can
 * mutate shared state — consider returning a defensive copy.
 * @return Header[] response headers of the last fetched page
 * @Date: 2013-9-12
 * @Author: lulei
 */
public Header[] getHeader(){
return responseHeaders;
}
/**
 * Sets the connection timeout on the shared HttpClient.
 * @param timeout connect timeout in milliseconds
 * @Date: 2013-9-12
 * @Author: lulei
 */
public void setConnectTimeout(int timeout){
httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);
}
/**
 * Sets the socket read (SO_TIMEOUT) timeout on the shared HttpClient.
 * @param timeout read timeout in milliseconds
 * @Date: 2013-9-12
 * @Author: lulei
 */
public void setReadTimeout(int timeout){
httpClient.getHttpConnectionManager().getParams().setSoTimeout(timeout);
}
/**
 * Sets the maximum number of connection attempts used when a fetch fails.
 * NOTE(review): this mutates a static field, so it affects every
 * CrawlBase instance in the process, not just one crawler.
 * @param maxConnectTimes maximum number of connection attempts
 * @Date: 2014-2-12
 * @Author: lulei
 */
public static void setMaxConnectTimes(int maxConnectTimes) {
CrawlBase.maxConnectTimes = maxConnectTimes;
}
/**
 * Convenience setter for both the connection timeout and the read timeout.
 * @param connectTimeout connect timeout in milliseconds
 * @param readTimeout socket read timeout in milliseconds
 * @Date: 2013-9-12
 * @Author: lulei
 */
public void setTimeout(int connectTimeout, int readTimeout){
setConnectTimeout(connectTimeout);
setReadTimeout(readTimeout);
}
}
对于更新列表页上详情页面的链接url,由于多数网站都有相同的共性,因此对CrawlBase再做一次封装,得到CrawlListPageBase类,实现更新列表页中链接url的获取。
/**
*@Description: 获取页面链接地址信息基类
*/
package cn.lulei.crawl;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import cn.lulei.util.DoRegex;
public abstract class CrawlListPageBase extends CrawlBase {
    // url of the list page this instance was built from; handed to DoRegex
    // so relative links can be resolved against it
    private final String pageUrl;

    /**
     * Fetches the list page with a plain GET request (no custom headers).
     * @param urlStr list page url
     * @param charsetName charset used to decode the page
     * @throws IOException if the page cannot be fetched
     */
    public CrawlListPageBase(String urlStr, String charsetName) throws IOException{
        readPageByGet(urlStr, charsetName);
        pageUrl = urlStr;
    }

    /**
     * Fetches the list page using the given HTTP method and parameters.
     * @param urlStr list page url
     * @param charsetName charset used to decode the page
     * @param method HTTP method name — presumably "get" or "post"; confirm
     *               against CrawlBase.readPage (not visible in this chunk)
     * @param params request headers (get) or body parameters (post)
     * @throws IOException if the page cannot be fetched
     */
    @SuppressWarnings("rawtypes")
    public CrawlListPageBase(String urlStr, String charsetName, String method, HashMap params) throws IOException{
        readPage(urlStr, charsetName, method, params);
        pageUrl = urlStr;
    }

    /**
     * Extracts the required link urls from the fetched page source.
     * @return ArrayList of link urls matched by getUrlRegexString()
     * @Date: 2013-9-13
     * @Author: lulei
     */
    public ArrayList getPageUrls(){
        // return the extracted list directly — the original allocated a
        // throwaway ArrayList that was immediately overwritten
        return DoRegex.getArrayList(getPageSourceCode(), getUrlRegexString(), pageUrl, getUrlRegexStringNum());
    }

    /**
     * @return the regular expression used to match link urls on the page
     */
    public abstract String getUrlRegexString();

    /**
     * @return the capture-group position to take from the regular expression
     */
    public abstract int getUrlRegexStringNum();
}
继承该类,只需要实现public abstract String getUrlRegexString();和public abstract int getUrlRegexStringNum();这两个抽象方法即可,对于笑话集的更新列表页的实现如下:
<p> /**
*@Description: 笑话集最近更新列表页面
*/
package cn.lulei.crawl.jokeji;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import cn.lulei.crawl.CrawlListPageBase;
/**
*@Description:
*@Author: lulei
*@Date: 2014-2-12
*@Version: 1.1.0
*/
public class JokeList extends CrawlListPageBase{
//请求jokeji最新更新列表页参数
private static HashMap params = new HashMap();
static {
params.put("Host", "www.jokeji.cn");
params.put("Pragma", "no-cache");
params.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");
}
public JokeList(String urlStr) throws IOException {
this(urlStr, "gb2312");
}
public JokeList(String urlStr, String charsetName) throws IOException {
super(urlStr, charsetName, "get", params);
// TODO Auto-generated constructor stub
}
@Override
public String getUrlRegexString() {
// TODO Auto-generated method stub
return "<b>