Crawling the Recently Updated Content of the 笑话集 Joke Website



  This post mainly describes how to crawl the content of the recently-updated list pages of 笑话集 (); the program source code can be downloaded at:

  First, the crawl entry point. The program does not schedule periodic crawls by itself; you can wrap the entry point in a scheduling thread to suit your own needs, as in the sketch below.
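  As a minimal scheduling sketch (the JokeCrawlScheduler class name, the hourly interval, and the page range 1 to 3 are illustrative assumptions, not part of the original program), a ScheduledExecutorService can re-run the crawl periodically:

package cn.lulei.crawl.jokeji;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class JokeCrawlScheduler {

    public static void main(String[] args) {
        ScheduledExecutorService scheduler = Executors.newSingleThreadScheduledExecutor();
        scheduler.scheduleWithFixedDelay(new Runnable() {
            public void run() {
                try {
                    //re-crawl the first three update list pages
                    new JokeCrawl().crawlMain(1, 3);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }, 0, 1, TimeUnit.HOURS); //run immediately, then wait an hour between runs
    }
}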

   /**

*@Description: crawl scheduling entry point for the 笑话集 site

*/

package cn.lulei.crawl.jokeji;

import java.io.IOException;

import java.util.ArrayList;

import java.util.HashSet;

import java.util.concurrent.TimeUnit;

import cn.lulei.db.jokeji.JokeDbOperation;

import cn.lulei.model.Jokeji;

import cn.lulei.util.ParseUtil;

public class JokeCrawl {

//url pattern of the 笑话集 update list pages

private static String listPageUrl = "http://www.jokeji.cn/list_%pno%.htm";

//interval between two page requests, in ms

private static int sleepTime = 500;

/**

* @param start start page

* @param end end page

* @throws IOException

* @Date: 2014-2-12

* @Author: lulei

* @Description: crawl the content on the update list pages

*/

public void crawlMain(int start, int end) throws IOException{

start = start < 1 ? 1 : start;

JokeDbOperation jokeDbOperation = new JokeDbOperation();

for ( ; start <= end; start++) {

//build the url of the current update list page
String url = listPageUrl.replaceAll("%pno%", start + "");

//crawl every detail page linked from this list page, parse it into a Jokeji
//object and save it through jokeDbOperation, pausing sleepTime ms between
//two requests; the full loop body ships with the downloadable source

}

}

}

  The generic page fetching is factored into the CrawlBase base class, which wraps Apache Commons HttpClient (3.x) and retries failed connections:

   /**

*@Description: base class for fetching web pages

*/

package cn.lulei.crawl;

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.util.HashMap;

import java.util.Iterator;

import java.util.Map;

import org.apache.commons.httpclient.Header;

import org.apache.commons.httpclient.HttpClient;

import org.apache.commons.httpclient.HttpMethod;

import org.apache.commons.httpclient.HttpStatus;

import org.apache.commons.httpclient.methods.GetMethod;

import org.apache.commons.httpclient.methods.PostMethod;

import org.apache.log4j.Logger;

public abstract class CrawlBase {

private static Logger log = Logger.getLogger(CrawlBase.class);

//source code of the last fetched page
private String pageSourceCode = "";

//response headers of the last request
private Header[] responseHeaders = null;

//maximum number of attempts when a connection fails
private static int maxConnectTimes = 3;

private static HttpClient httpClient = new HttpClient();

/**
* @Description: dispatch to a get or post request according to method
*/
@SuppressWarnings("rawtypes")
public boolean readPage(String urlStr, String charsetName, String method, HashMap params) throws IOException{
if ("post".equalsIgnoreCase(method)){
return readPageByPost(urlStr, charsetName, params);
}
return readPageByGet(urlStr, charsetName, params);
}

/**
* @Description: fetch the page with a get request; params are attached as request headers
*/
@SuppressWarnings("rawtypes")
public boolean readPageByGet(String urlStr, String charsetName, HashMap params) throws IOException{
GetMethod getMethod = createGetMethod(urlStr, params);
return readPage(getMethod, charsetName, urlStr);
}

/**
* @Description: fetch the page with a post request; params are attached as form parameters
*/
@SuppressWarnings("rawtypes")
public boolean readPageByPost(String urlStr, String charsetName, HashMap params) throws IOException{
PostMethod postMethod = createPostMethod(urlStr, params);
return readPage(postMethod, charsetName, urlStr);
}

/**
* @Description: execute the request, retrying up to maxConnectTimes times
*/
private boolean readPage(HttpMethod method, String charsetName, String urlStr) throws IOException{

int n = maxConnectTimes;

while (n > 0) {

try {

if (httpClient.executeMethod(method) != HttpStatus.SC_OK){

log.error("can not connect " + urlStr);

return false;

}

//capture the response headers

responseHeaders = method.getResponseHeaders();

//read the page source code

InputStream inputStream = method.getResponseBodyAsStream();

BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, charsetName));

StringBuffer stringBuffer = new StringBuffer();

String lineString = null;

while ((lineString = bufferedReader.readLine()) != null){

stringBuffer.append(lineString);

}

pageSourceCode = stringBuffer.toString();

return true;

} catch (Exception e) {

System.out.println(urlStr + " -- can't connect " + (maxConnectTimes - n + 1));

n--;

}

}

return false;

}

/**

* @param urlStr

* @param params

* @return GetMethod

* @Date: 2013-9-12

* @Author: lulei

* @Description: build a get request; params are attached as request headers

*/

@SuppressWarnings("rawtypes")

private GetMethod createGetMethod(String urlStr, HashMap params){

GetMethod getMethod = new GetMethod(urlStr);

if (params == null){

return getMethod;

}

Iterator iter = params.entrySet().iterator();

while (iter.hasNext()) {

Map.Entry entry = (Map.Entry) iter.next();

String key = (String) entry.getKey();

String val = (String) entry.getValue();

getMethod.setRequestHeader(key, val);

}

return getMethod;

}

/**

* @param urlStr

* @param params

* @return PostMethod

* @Date: 2013-9-12

* @Author: lulei

* @Description: build a post request; params are attached as form parameters

*/

@SuppressWarnings("rawtypes")

private PostMethod createPostMethod(String urlStr, HashMap params){

PostMethod postMethod = new PostMethod(urlStr);

if (params == null){

return postMethod;

}

Iterator iter = params.entrySet().iterator();

while (iter.hasNext()) {

Map.Entry entry = (Map.Entry) iter.next();

String key = (String) entry.getKey();

String val = (String) entry.getValue();

postMethod.setParameter(key, val);

}

return postMethod;

}

/**

* @param urlStr

* @param charsetName

* @return whether the page was fetched successfully

* @throws IOException

* @Date: 2013-9-12

* @Author: lulei

* @Description: fetch the page directly, without any extra request headers

*/

public boolean readPageByGet(String urlStr, String charsetName) throws IOException{

return this.readPageByGet(urlStr, charsetName, null);

}

/**

* @return String

* @Date: 2013-9-12

* @Author: lulei

* @Description: get the page source code

*/

public String getPageSourceCode(){

return pageSourceCode;

}

/**

* @return Header[]

* @Date: 2013-9-12

* @Author: lulei

* @Description: get the response headers

*/

public Header[] getHeader(){

return responseHeaders;

}

/**

* @param timeout

* @Date: 2013-9-12

* @Author: lulei

* @Description: set the connection timeout

*/

public void setConnectTimeout(int timeout){

httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(timeout);

}

/**

* @param timeout

* @Date: 2013-9-12

* @Author: lulei

* @Description: set the read (socket) timeout

*/

public void setReadTimeout(int timeout){

httpClient.getHttpConnectionManager().getParams().setSoTimeout(timeout);

}

/**

* @param maxConnectTimes

* @Date: 2014-2-12

* @Author: lulei

* @Description: set the maximum number of attempts, used when a connection fails

*/

public static void setMaxConnectTimes(int maxConnectTimes) {

CrawlBase.maxConnectTimes = maxConnectTimes;

}

/**

* @param connectTimeout

* @param readTimeout

* @Date: 2013-9-12

* @Author: lulei

* @Description: set both the connection timeout and the read timeout

*/

public void setTimeout(int connectTimeout, int readTimeout){

setConnectTimeout(connectTimeout);

setReadTimeout(readTimeout);

}

}
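  As a quick usage sketch (the SimplePage class name, the url, and the timeout values are illustrative assumptions), a concrete crawler only has to extend CrawlBase and call readPageByGet:

package cn.lulei.crawl;

import java.io.IOException;

public class SimplePage extends CrawlBase {

    public static void main(String[] args) throws IOException {
        SimplePage page = new SimplePage();
        //3 s to establish the connection, 5 s to read the response
        page.setTimeout(3000, 5000);
        //jokeji.cn serves gb2312-encoded pages
        if (page.readPageByGet("http://www.jokeji.cn/", "gb2312")) {
            System.out.println(page.getPageSourceCode());
        }
    }
}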

  Because the detail-page links on an update list page follow a common pattern on most sites, CrawlBase is wrapped once more into a CrawlListPageBase class, which extracts the detail-page urls from an update list page.

   /**

*@Description: base class for extracting link addresses from a page

*/

package cn.lulei.crawl;

import java.io.IOException;

import java.util.ArrayList;

import java.util.HashMap;

import cn.lulei.util.DoRegex;

public abstract class CrawlListPageBase extends CrawlBase {

private String pageurl;

/**

* @param urlStr

* @param charsetName

* @throws IOException

*/

public CrawlListPageBase(String urlStr, String charsetName) throws IOException{

readPageByGet(urlStr, charsetName);

pageurl = urlStr;

}

/**

* @param urlStr

* @param charsetName

* @param method

* @param params

* @throws IOException

*/

public CrawlListPageBase(String urlStr, String charsetName, String method, HashMap params) throws IOException{

readPage(urlStr, charsetName, method, params);

pageurl = urlStr;

}

/**

* @return ArrayList

* @Date: 2013-9-13

* @Author: lulei

* @Description: return the link addresses we need from the page

*/

public ArrayList getPageUrls(){

return DoRegex.getArrayList(getPageSourceCode(), getUrlRegexString(), pageurl, getUrlRegexStringNum());

}

/**

* @return String

* @Date: 2013-9-13

* @Author: lulei

* @Description: return the regular expression that matches the urls we need on the page

*/

public abstract String getUrlRegexString();

/**

* @return int

* @Date: 2013-9-13

* @Author: lulei

* @Description: index of the capture group to extract from the regular expression

*/

public abstract int getUrlRegexStringNum();

}
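  The DoRegex utility itself is not listed in this post. A minimal sketch of what its getArrayList is assumed to do (run the regex over the page source, pull out the given capture group, and resolve relative links against the page url) could look like this:

package cn.lulei.util;

import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class DoRegex {

    public static ArrayList<String> getArrayList(String source, String regex, String pageUrl, int groupNum) {
        ArrayList<String> urls = new ArrayList<String>();
        if (source == null || regex == null) {
            return urls;
        }
        Matcher matcher = Pattern.compile(regex).matcher(source);
        while (matcher.find()) {
            try {
                //resolve relative links against the list page's url
                urls.add(new URL(new URL(pageUrl), matcher.group(groupNum)).toString());
            } catch (Exception e) {
                //skip malformed links
            }
        }
        return urls;
    }
}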

  To build a concrete list-page crawler, a subclass only needs to implement the two abstract methods public abstract String getUrlRegexString(); and public abstract int getUrlRegexStringNum();. The implementation for the 笑话集 update list pages is as follows:

   /**

*@Description: latest-update list page of 笑话集

*/

package cn.lulei.crawl.jokeji;

import java.io.IOException;

import java.util.ArrayList;

import java.util.HashMap;

import cn.lulei.crawl.CrawlListPageBase;

/**

*@Description:

*@Author: lulei

*@Date: 2014-2-12

*@Version: 1.1.0

*/

public class JokeList extends CrawlListPageBase{

//request headers used when fetching the jokeji latest-update list pages

private static HashMap params = new HashMap();

static {

params.put("Host", "www.jokeji.cn");

params.put("Pragma", "no-cache");

params.put("User-Agent", "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36");

}

public JokeList(String urlStr) throws IOException {

this(urlStr, "gb2312");

}

public JokeList(String urlStr, String charsetName) throws IOException {

super(urlStr, charsetName, "get", params);


}

@Override

public String getUrlRegexString() {


return "<b>

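  Finally, a short driver sketch for JokeList (the JokeListDemo class name and the choice of page 1 are illustrative; the url follows the listPageUrl pattern shown at the top, and the regex literal above is completed in the downloadable source):

package cn.lulei.crawl.jokeji;

import java.io.IOException;
import java.util.ArrayList;

public class JokeListDemo {

    public static void main(String[] args) throws IOException {
        //page 1 of the update list, following the listPageUrl pattern
        JokeList jokeList = new JokeList("http://www.jokeji.cn/list_1.htm");
        ArrayList urls = jokeList.getPageUrls();
        for (Object url : urls) {
            System.out.println(url);
        }
    }
}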