解决方案:定时任务模块,附带定时采集实现

优采云 发布时间: 2022-12-20 19:23

  解决方案:定时任务模块,附带定时采集实现

  今天终于实现了定时任务模块。 发完我们一起优化吧,代码如下,你看不看,就在那儿:)

  代码清单:

  ================================ SQL================= === ==================

  1:计划任务表 jc_scheduler

  =================================定时任务模块类============= ===== =====================

  计划管理DAO接口CmsSchedulerDao.java

  计划管理DAO接口实现类CmsSchedulerDaoImpl.java

  定时任务管理服务接口CmsSchedulerMng.java

  定时任务管理服务接口实现类CmsSchedulerMngImpl.java

  定时任务管理接口SchedulerTaskManageSvc.java

  定时任务管理接口实现类SchedulerTaskManageSvcImpl.java

  定时任务接口SchedulerTaskSvc.java

  定时任务抽象实现类AbstractSchedulerTaskSvc.java

  定时任务接口-采集器实现类-多线程版SchedulerAcquisitionSvcImpl.java

  定时服务关联任务bean SchedulerTaskBean.java

  定时任务ControllerCmsSchedulerAct.java

  持久对象基类 BaseCmsScheduler.java

  持久对象 CmsScheduler.java

  HBM 文件 CmsScheduler.hbm.xml

  ===================================定时任务模块相关互助=========== ==== ========================

  计划框架

  计划框架-任务调度Scheduler.java

  计划框架-时间迭代器接口ScheduleIterator.java

  定时任务抽象类SchedulerTask.java

  计划框架-时间迭代器接口实现类SimpleScheduleIterator.java

  调度参数beanScheduleParamBean.java

  采集相关

  HTML解析工具类接口ParseHtmlTool.java

  HTML解析工具,HtmlParser实现类HtmlParserImpl.java

  采集参数封装beanParamBean.java

  队列 Queue.java

  URL 队列 UrlQueue.java

  接下来是XML配置

  =================================定时任务模块XML配置=========== ===== =======================

  DAO配置

  管理配置

  服务配置

  接下来是messages_zh_CN.properties添加的常量

  ================================ messages_zh_CN.properties=============== ==== ====================

  messages_zh_CN.properties

  接下来是模板

  ================================模板================= ==== ==================

  generate_left.html 已被修改

  scheduler/add.html

  scheduler/edit.html

  scheduler/list.html

  具体代码如下:

  ================================ SQL================= === ==================

  1:计划任务表

  /*

MySQL Data Transfer

Source Host: localhost

Source Database: jeecms

Target Host: localhost

Target Database: jeecms

Date: 2011-11-8 11:36:55

*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for jc_scheduler
-- ----------------------------
CREATE TABLE `jc_scheduler` (
  `scheduler_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '任务主键',
  `site_id` int(11) DEFAULT NULL,
  `associate_id` int(11) DEFAULT NULL COMMENT '相关ID',
  `module_type` varchar(100) DEFAULT NULL COMMENT '模块类型',
  `name` varchar(100) DEFAULT NULL COMMENT '任务名称',
  `start_time` datetime DEFAULT NULL COMMENT '开始时间',
  `end_time` datetime DEFAULT NULL COMMENT '结束时间',
  `status` int(1) NOT NULL DEFAULT '0' COMMENT '当前状态(0:静止;1:采集)',
  `expression` varchar(50) NOT NULL COMMENT '计划表达式',
  PRIMARY KEY (`scheduler_id`)
) ENGINE=InnoDB AUTO_INCREMENT=10 DEFAULT CHARSET=utf8;

-- ----------------------------
-- Records
-- ----------------------------
-- Columns are listed explicitly so the sample rows keep loading if the table later gains or
-- reorders columns. `expression` holds six comma-separated fields; '*' leaves a field unset.
INSERT INTO `jc_scheduler`
  (`scheduler_id`, `site_id`, `associate_id`, `module_type`, `name`, `start_time`, `end_time`, `status`, `expression`)
VALUES
  ('4', '1', '1', 'schedulerAcquisitionSvc', '测试', '2011-11-07 18:02:30', '2011-11-07 18:04:00', '0', '*,*,*,*,3,0');

INSERT INTO `jc_scheduler`
  (`scheduler_id`, `site_id`, `associate_id`, `module_type`, `name`, `start_time`, `end_time`, `status`, `expression`)
VALUES
  ('8', '1', '5', 'schedulerAcquisitionSvc', '测试采集java', '2011-11-08 10:25:15', '2011-11-08 10:27:04', '0', '*,*,*,*,26,0');

-- The original listing omitted the terminating semicolon on this last statement.
INSERT INTO `jc_scheduler`
  (`scheduler_id`, `site_id`, `associate_id`, `module_type`, `name`, `start_time`, `end_time`, `status`, `expression`)
VALUES
  ('9', '1', '1', 'schedulerAcquisitionSvc', '测试采集新闻', '2011-11-08 10:37:58', '2011-11-08 10:38:11', '0', '*,*,*,*,38,0');

  =================================定时任务模块类============= ===== =====================

  计划管理DAO接口CmsSchedulerDao.java

  package com.jeecms.cms.dao.assist;

import java.util.List;

import com.jeecms.cms.entity.assist.CmsScheduler;

import com.jeecms.common.hibernate3.Updater;

/**
 * DAO contract for {@link CmsScheduler} scheduled-task records.
 *
 * <p>The element types below were restored after the article's extraction stripped all generic
 * type arguments. NOTE(review): {@code Updater} is assumed to be generic in the entity type —
 * confirm against {@code com.jeecms.common.hibernate3.Updater}.
 *
 * @author javacoo
 * @since 2011-11-07
 */
public interface CmsSchedulerDao {

    /**
     * @return every scheduler record
     */
    List<CmsScheduler> getList();

    /**
     * Lists scheduler records using {@code bean} as a filter carrier (module type and site).
     *
     * @param bean filter values
     * @return matching scheduler records
     */
    List<CmsScheduler> getListBy(CmsScheduler bean);

    /**
     * @param id primary key
     * @return the record with that key
     */
    CmsScheduler findById(Integer id);

    /**
     * @param bean record to persist
     * @return the persisted record
     */
    CmsScheduler save(CmsScheduler bean);

    /**
     * @param updater update descriptor wrapping the changed record
     * @return the updated record
     */
    CmsScheduler updateByUpdater(Updater<CmsScheduler> updater);

    /**
     * @param id primary key of the record to delete
     * @return the deleted record
     */
    CmsScheduler deleteById(Integer id);
}

  计划管理DAO接口实现类CmsSchedulerDaoImpl.java

  package com.jeecms.cms.dao.assist.impl;

import java.util.List;

import org.apache.commons.lang.StringUtils;

import org.springframework.stereotype.Repository;

import com.jeecms.cms.dao.assist.CmsSchedulerDao;

import com.jeecms.cms.entity.assist.CmsScheduler;

import com.jeecms.common.hibernate3.Finder;

import com.jeecms.common.hibernate3.HibernateBaseDao;

/**
 * Hibernate-backed implementation of {@link CmsSchedulerDao}.
 *
 * <p>NOTE(review): generic type arguments appear to have been stripped from this listing.
 * {@code HibernateBaseDao} presumably takes {@code <CmsScheduler, Integer>}; restore that together
 * with the matching argument on {@code CmsSchedulerDao.updateByUpdater} — confirm against the
 * project sources.
 */
@Repository
public class CmsSchedulerDaoImpl extends
        HibernateBaseDao implements CmsSchedulerDao {

    /**
     * @return every scheduler record ordered by ascending id
     */
    @SuppressWarnings("unchecked")
    public List<CmsScheduler> getList() {
        Finder f = Finder.create("from CmsScheduler bean order by bean.id asc");
        return find(f);
    }

    /**
     * Lists records ordered by ascending id. The module-type/site filter is applied only when the
     * bean carries both a non-empty module type and a site with an id; otherwise all records are
     * returned.
     *
     * @param bean filter carrier; a null bean or a bean without a site no longer triggers a
     *             NullPointerException and is treated as "no filter"
     * @return matching scheduler records
     */
    @SuppressWarnings("unchecked")
    public List<CmsScheduler> getListBy(CmsScheduler bean) {
        Finder f = Finder.create("from CmsScheduler bean");
        String moduleType = bean == null ? null : bean.getModuleType();
        Integer siteId = (bean == null || bean.getSite() == null) ? null : bean.getSite().getId();
        if (StringUtils.isNotEmpty(moduleType) && siteId != null) {
            f.append(" where bean.moduleType=:moduleType and bean.site.id=:siteId");
            f.setParam("moduleType", moduleType);
            f.setParam("siteId", siteId);
        }
        f.append(" order by bean.id asc");
        return find(f);
    }

    /**
     * @param id primary key
     * @return whatever the inherited {@code get(id)} yields for that key
     */
    public CmsScheduler findById(Integer id) {
        CmsScheduler entity = get(id);
        return entity;
    }

    /**
     * Saves the bean through the current Hibernate session.
     *
     * @param bean record to persist
     * @return the same bean instance
     */
    public CmsScheduler save(CmsScheduler bean) {
        getSession().save(bean);
        return bean;
    }

    /**
     * Deletes the record with the given id if it exists.
     *
     * @param id primary key
     * @return the deleted record, or null when the lookup found nothing
     */
    public CmsScheduler deleteById(Integer id) {
        CmsScheduler entity = super.get(id);
        if (entity != null) {
            getSession().delete(entity);
        }
        return entity;
    }

    @Override
    protected Class getEntityClass() {
        return CmsScheduler.class;
    }
}

  定时任务管理服务接口CmsSchedulerMng.java

  package com.jeecms.cms.manager.assist;

import java.util.List;

import com.jeecms.cms.entity.assist.CmsScheduler;

/**
 * Management service for scheduled-task ({@link CmsScheduler}) records.
 *
 * @author javacoo
 * @since 2011-11-07
 * @version 1.0
 */
public interface CmsSchedulerMng {

    /**
     * @return all scheduled tasks
     */
    List<CmsScheduler> getList();

    /**
     * Lists the scheduled tasks of one site and one module.
     *
     * @param bean carrier for the site and module type to filter by
     * @return the matching scheduled tasks
     */
    List<CmsScheduler> getListBy(CmsScheduler bean);

    /**
     * @param id scheduled-task id
     * @return the scheduled task with that id
     */
    CmsScheduler findById(Integer id);

    /**
     * Marks the task as stopped, leaving its timestamps alone
     * (compare {@link #end(Integer)}, which also records the end time).
     *
     * @param id scheduled-task id
     */
    void stop(Integer id);

    /**
     * Marks the task as started.
     *
     * @param id scheduled-task id
     * @return the started task
     */
    CmsScheduler start(Integer id);

    /**
     * Marks the task as stopped and records its end time.
     *
     * @param id scheduled-task id
     */
    void end(Integer id);

    /**
     * @param bean task to save
     * @return the saved task
     */
    CmsScheduler save(CmsScheduler bean);

    /**
     * @param bean task carrying the changed values
     * @return the updated task
     */
    CmsScheduler update(CmsScheduler bean);

    /**
     * @param id id of the task to delete
     * @return the deleted task
     */
    CmsScheduler deleteById(Integer id);

    /**
     * Deletes several tasks.
     *
     * @param ids ids of the tasks to delete
     * @return the deleted tasks, in the order of {@code ids}
     */
    CmsScheduler[] deleteByIds(Integer[] ids);
}

  定时任务管理服务接口实现类CmsSchedulerMngImpl.java

  package com.jeecms.cms.manager.assist.impl;

import java.util.Date;

import java.util.List;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.stereotype.Service;

import org.springframework.transaction.annotation.Transactional;

import com.jeecms.cms.dao.assist.CmsSchedulerDao;

import com.jeecms.cms.entity.assist.CmsAcquisition;

import com.jeecms.cms.entity.assist.CmsScheduler;

import com.jeecms.cms.manager.assist.CmsSchedulerMng;

import com.jeecms.common.hibernate3.Updater;

/**
 * {@link CmsSchedulerMng} implementation that delegates persistence to {@link CmsSchedulerDao}.
 *
 * <p>The status-changing methods ({@link #stop}, {@link #start}, {@link #end}) only mutate the
 * entity returned by {@link #findById}; no explicit DAO update is issued. NOTE(review): this
 * presumably relies on the surrounding {@code @Transactional} Hibernate session flushing the
 * change — confirm.
 *
 * @author javacoo
 * @since 2011-11-07
 * @version 1.0
 */
@Service
@Transactional
public class CmsSchedulerMngImpl implements CmsSchedulerMng {

    private CmsSchedulerDao dao;

    /** {@inheritDoc} */
    @SuppressWarnings("unchecked") // the DAO listing exposes a raw List
    @Transactional(readOnly = true)
    public List<CmsScheduler> getList() {
        return dao.getList();
    }

    /** {@inheritDoc} */
    @SuppressWarnings("unchecked") // the DAO listing exposes a raw List
    @Transactional(readOnly = true)
    public List<CmsScheduler> getListBy(CmsScheduler bean) {
        return dao.getListBy(bean);
    }

    /** {@inheritDoc} */
    @Transactional(readOnly = true)
    public CmsScheduler findById(Integer id) {
        return dao.findById(id);
    }

    /**
     * Sets the status to {@link CmsScheduler#STOP} when the task is currently started.
     * Unknown ids and tasks without a status are ignored.
     */
    public void stop(Integer id) {
        CmsScheduler scheduler = findById(id);
        if (scheduler == null) {
            return;
        }
        // Compare through a null check: the status is a nullable Integer and unboxing null throws.
        Integer status = scheduler.getStatus();
        if (status != null && status.intValue() == CmsScheduler.START) {
            scheduler.setStatus(CmsScheduler.STOP);
        }
    }

    /**
     * Sets the status to {@link CmsScheduler#START}, stamps the start time with "now" and clears
     * the end time.
     *
     * @return the modified task, or null when no task has that id
     */
    public CmsScheduler start(Integer id) {
        CmsScheduler scheduler = findById(id);
        if (scheduler == null) {
            return null;
        }
        // Use the scheduler's own constants rather than CmsAcquisition's.
        scheduler.setStatus(CmsScheduler.START);
        scheduler.setStartTime(new Date());
        scheduler.setEndTime(null);
        return scheduler;
    }

    /**
     * Sets the status to {@link CmsScheduler#STOP} and stamps the end time with "now".
     * Unknown ids are ignored.
     */
    public void end(Integer id) {
        CmsScheduler scheduler = findById(id);
        if (scheduler == null) {
            return;
        }
        scheduler.setStatus(CmsScheduler.STOP);
        scheduler.setEndTime(new Date());
    }

    /**
     * Applies {@link CmsScheduler#init()} defaults and saves the bean.
     *
     * @return the same bean instance
     */
    public CmsScheduler save(CmsScheduler bean) {
        bean.init();
        dao.save(bean);
        return bean;
    }

    /**
     * Updates through an {@link Updater} wrapping the bean.
     *
     * @return the record returned by the DAO
     */
    public CmsScheduler update(CmsScheduler bean) {
        Updater<CmsScheduler> updater = new Updater<CmsScheduler>(bean);
        return dao.updateByUpdater(updater);
    }

    /** {@inheritDoc} */
    public CmsScheduler deleteById(Integer id) {
        return dao.deleteById(id);
    }

    /**
     * Deletes each id in turn.
     *
     * @param ids ids to delete; null is treated as empty
     * @return one entry per id, holding whatever {@link #deleteById} returned for it
     */
    public CmsScheduler[] deleteByIds(Integer[] ids) {
        if (ids == null) {
            return new CmsScheduler[0];
        }
        CmsScheduler[] beans = new CmsScheduler[ids.length];
        for (int i = 0; i < ids.length; i++) {
            beans[i] = deleteById(ids[i]);
        }
        return beans;
    }

    @Autowired
    public void setDao(CmsSchedulerDao dao) {
        this.dao = dao;
    }
}

  定时任务管理接口SchedulerTaskManageSvc.java

  package com.jeecms.cms.service.scheduler;

import java.util.List;

import com.jeecms.cms.entity.assist.CmsScheduler;

/**
 * Entry point for starting and stopping the in-process execution of scheduled tasks.
 *
 * @author javacoo
 * @since 2011-11-07
 */
public interface SchedulerTaskManageSvc {

    /**
     * Starts executing the given scheduled task.
     *
     * @param scheduler task definition
     * @return true/false
     */
    boolean start(CmsScheduler scheduler);

    /**
     * Stops executing the given scheduled task.
     *
     * @param scheduler task definition
     * @return true/false
     */
    boolean stop(CmsScheduler scheduler);

    /**
     * Lists the tasks that can be associated with the given scheduler.
     *
     * @param scheduler task definition
     * @return list of associable tasks (a list, not a map as the original comment said)
     */
    List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler);
}

  定时任务管理接口实现类SchedulerTaskManageSvcImpl.java

  package com.jeecms.cms.service.scheduler;

import java.util.List;

import java.util.Map;

import java.util.concurrent.ConcurrentHashMap;

import org.apache.commons.lang.StringUtils;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.stereotype.Service;

import com.jeecms.cms.entity.assist.CmsScheduler;

import com.jeecms.common.scheduling.core.Scheduler;

import com.jeecms.common.scheduling.core.SchedulerTask;

import com.jeecms.common.scheduling.impl.ScheduleParamBean;

import com.jeecms.common.scheduling.impl.SimpleScheduleIterator;

/**
 * {@link SchedulerTaskManageSvc} implementation that keeps one {@code TaskManage} per started
 * scheduler id and delegates the work to the {@link SchedulerTaskSvc} registered under the
 * scheduler's module type.
 *
 * <p>Generic type arguments, stripped from the original listing, are restored here; without them
 * {@code taskManageMap.get(..)} yields {@code Object} and the class does not compile.
 *
 * @author javacoo
 * @since 2011-11-07
 */
@Service
public class SchedulerTaskManageSvcImpl implements SchedulerTaskManageSvc {

    /** Number of comma-separated fields a plan expression must contain. */
    private static final int EXPRESSION_FIELD_COUNT = 6;

    /** Running task managers keyed by scheduler id. */
    private static final Map<Integer, TaskManage> taskManageMap =
            new ConcurrentHashMap<Integer, TaskManage>();

    /**
     * Task services keyed by module type.
     * NOTE(review): presumably Spring injects all SchedulerTaskSvc beans keyed by bean name, so
     * module_type values must equal bean names — confirm against the XML service configuration.
     */
    @Autowired
    private Map<String, SchedulerTaskSvc> schedulerTaskSvcMap;

    /**
     * Binds one {@link Scheduler} to one task service and one scheduler record.
     *
     * @author javacoo
     * @since 2011-11-07
     */
    private static final class TaskManage {
        /** Task scheduling helper owned by this manager. */
        private final Scheduler scheduler = new Scheduler();
        /** Service that performs the work on every trigger. */
        private final SchedulerTaskSvc schedulerTaskSvc;
        /** Scheduler record this manager runs. */
        private final CmsScheduler cmsScheduler;
        /** Parsed plan expression; set by {@link #parseSchedulerParam()}. */
        private ScheduleParamBean scheduleParamBean;

        TaskManage(SchedulerTaskSvc schedulerSvc, CmsScheduler cmsScheduler) {
            this.schedulerTaskSvc = schedulerSvc;
            this.cmsScheduler = cmsScheduler;
        }

        /**
         * Parses the record's expression: six comma-separated fields in the order week-of-month,
         * day-of-week, day-of-month, hour, minute, second. A field that is not purely numeric
         * (for example {@code *}) is left unset on the parameter bean.
         *
         * @return false when the expression is empty, does not have exactly six fields, or a
         *         numeric field does not fit in an int
         */
        private boolean parseSchedulerParam() {
            String expression = cmsScheduler.getExpression();
            System.out.println("计划表达式:" + expression);
            if (StringUtils.isEmpty(expression)) {
                return false;
            }
            String[] fields = expression.split(",");
            if (fields.length != EXPRESSION_FIELD_COUNT) {
                return false;
            }
            ScheduleParamBean parsed = new ScheduleParamBean();
            try {
                if (isNumber(fields[0])) {
                    parsed.setWeekOfMonth(Integer.valueOf(fields[0]));
                }
                if (isNumber(fields[1])) {
                    parsed.setDayOfWeek(Integer.valueOf(fields[1]));
                }
                if (isNumber(fields[2])) {
                    parsed.setDayOfMonth(Integer.valueOf(fields[2]));
                }
                if (isNumber(fields[3])) {
                    parsed.setHourOfDay(Integer.valueOf(fields[3]));
                }
                if (isNumber(fields[4])) {
                    parsed.setMinute(Integer.valueOf(fields[4]));
                }
                if (isNumber(fields[5])) {
                    parsed.setSecond(Integer.valueOf(fields[5]));
                }
            } catch (NumberFormatException e) {
                // All digits but out of int range: reject the expression instead of propagating.
                return false;
            }
            scheduleParamBean = parsed;
            return true;
        }

        /**
         * commons-lang 2.x {@code StringUtils.isNumeric("")} returns true, so empty fields
         * (e.g. from {@code "1,,3,..."}) must be excluded before {@code Integer.valueOf}.
         */
        private static boolean isNumber(String field) {
            return StringUtils.isNotEmpty(field) && StringUtils.isNumeric(field);
        }

        /**
         * Schedules the task service to run at the times described by the parsed expression.
         *
         * @return false when the expression could not be parsed and nothing was scheduled
         */
        boolean start() {
            if (!parseSchedulerParam()) {
                return false;
            }
            scheduler.schedule(new SchedulerTask() {
                public void run() {
                    System.out.println("============开始执行计划任务=================");
                    schedulerTaskSvc.start(cmsScheduler);
                }
            }, new SimpleScheduleIterator(scheduleParamBean));
            return true;
        }

        /** Asks the task service to stop, then cancels the scheduler. */
        void cancel() {
            schedulerTaskSvc.stop(cmsScheduler);
            scheduler.cancel();
        }
    }

    /**
     * Starts executing a scheduler record. A manager already registered for the same id is
     * cancelled first so its scheduler is not orphaned.
     *
     * @param scheduler scheduler record
     * @return true when the task was scheduled; false when the record or its id is null, no
     *         service is registered for its module type, or its expression is invalid
     */
    public boolean start(CmsScheduler scheduler) {
        if (scheduler == null || scheduler.getId() == null) {
            return false;
        }
        SchedulerTaskSvc schedulerSvc = getSchedulerTaskSvcByModuleType(scheduler.getModuleType());
        if (schedulerSvc == null) {
            return false;
        }
        TaskManage previous = taskManageMap.remove(scheduler.getId());
        if (previous != null) {
            previous.cancel();
        }
        TaskManage taskManage = new TaskManage(schedulerSvc, scheduler);
        if (!taskManage.start()) {
            return false;
        }
        taskManageMap.put(scheduler.getId(), taskManage);
        return true;
    }

    /**
     * Stops executing a scheduler record and forgets its manager.
     *
     * @param scheduler scheduler record
     * @return true when a running manager was found and cancelled; false when nothing was
     *         registered for that id (previously a NullPointerException)
     */
    public boolean stop(CmsScheduler scheduler) {
        if (scheduler == null || scheduler.getId() == null) {
            return false;
        }
        TaskManage taskManage = taskManageMap.remove(scheduler.getId());
        if (taskManage == null) {
            return false;
        }
        taskManage.cancel();
        return true;
    }

    /**
     * Lists the tasks that can be associated with the scheduler, as reported by the service
     * registered for its module type.
     *
     * @param scheduler scheduler record
     * @return the associable tasks; an empty list when no service matches or it reports none
     */
    @SuppressWarnings("unchecked") // SchedulerTaskSvc is listed with a raw List return type
    public List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler) {
        SchedulerTaskSvc schedulerSvc =
                scheduler == null ? null : getSchedulerTaskSvcByModuleType(scheduler.getModuleType());
        List<SchedulerTaskBean> tasks =
                schedulerSvc == null ? null : schedulerSvc.associateTaskList(scheduler);
        if (tasks == null) {
            return java.util.Collections.<SchedulerTaskBean>emptyList();
        }
        return tasks;
    }

    /**
     * Looks up the task service for a module type.
     *
     * @param moduleType module type
     * @return the registered service, or null when the type is null or unknown
     */
    private SchedulerTaskSvc getSchedulerTaskSvcByModuleType(String moduleType) {
        if (moduleType == null) {
            return null;
        }
        return schedulerTaskSvcMap.get(moduleType);
    }
}

  定时任务接口SchedulerTaskSvc.java

  package com.jeecms.cms.service.scheduler;

import java.util.List;

import com.jeecms.cms.entity.assist.CmsScheduler;

/**
 * A unit of work that can be driven by a {@link CmsScheduler} record.
 *
 * @author javacoo
 * @since 2011-11-04
 */
public interface SchedulerTaskSvc {

    /**
     * Starts the work described by the scheduler record.
     *
     * @param cmsScheduler scheduler record
     * @return true/false
     */
    boolean start(CmsScheduler cmsScheduler);

    /**
     * Stops the work described by the scheduler record.
     *
     * @param cmsScheduler scheduler record
     * @return true/false
     */
    boolean stop(CmsScheduler cmsScheduler);

    /**
     * Lists the tasks that can be associated with the scheduler record.
     *
     * @param cmsScheduler scheduler record
     * @return list of associable tasks (a list, not a map as the original comment said)
     */
    List<SchedulerTaskBean> associateTaskList(CmsScheduler cmsScheduler);
}

  定时任务抽象实现类AbstractSchedulerTaskSvc.java

  package com.jeecms.cms.service.scheduler;

import java.util.List;

import com.jeecms.cms.entity.assist.CmsScheduler;

/**
 * Skeleton {@link SchedulerTaskSvc}: subclasses supply {@link #execute(CmsScheduler)} and may
 * override the no-op defaults for stopping and for listing associable tasks.
 *
 * @author javacoo
 * @since 2011-11-08
 */
public abstract class AbstractSchedulerTaskSvc implements SchedulerTaskSvc {

    /**
     * Starts the task by delegating to {@link #execute(CmsScheduler)}.
     *
     * @return the result of {@code execute}
     */
    public boolean start(CmsScheduler scheduler) {
        return execute(scheduler);
    }

    /**
     * Stops the task. The default does nothing.
     *
     * @return always true
     */
    public boolean stop(CmsScheduler scheduler) {
        return true;
    }

    /**
     * Lists the tasks that can be associated with the scheduler. The default has none.
     *
     * @return an empty, immutable list (previously null, which forced every caller and template
     *         to null-check)
     */
    public List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler) {
        return java.util.Collections.<SchedulerTaskBean>emptyList();
    }

    /**
     * Performs the actual work for one trigger of the scheduler.
     *
     * @param scheduler scheduler record being executed
     * @return true/false, passed through unchanged by {@link #start(CmsScheduler)}
     */
    protected abstract boolean execute(CmsScheduler scheduler);
}

  定时任务接口-采集器实现类-多线程版SchedulerAcquisitionSvcImpl.java

  package com.jeecms.cms.service.scheduler;

import java.io.IOException;

import java.net.URI;

import java.net.URISyntaxException;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import java.util.concurrent.CountDownLatch;

import java.util.concurrent.ExecutorService;

import java.util.concurrent.Executors;

import org.apache.commons.lang.StringUtils;

import org.apache.http.HttpEntity;

import org.apache.http.HttpHost;

import org.apache.http.HttpResponse;

import org.apache.http.StatusLine;

import org.apache.http.client.ClientProtocolException;

import org.apache.http.client.HttpClient;

import org.apache.http.client.HttpResponseException;

import org.apache.http.client.ResponseHandler;

import org.apache.http.client.methods.HttpGet;

import org.apache.http.conn.params.ConnRoutePNames;

import org.apache.http.impl.client.DefaultHttpClient;

import org.apache.http.util.EntityUtils;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.stereotype.Service;

import com.jeecms.cms.entity.assist.CmsAcquisition;

import com.jeecms.cms.entity.main.Content;

import com.jeecms.cms.manager.assist.CmsAcquisitionMng;

import com.jeecms.common.crawler.UrlQueue;

import com.jeecms.common.crawler.util.HtmlParserImpl;

import com.jeecms.common.crawler.util.ParseHtmlTool;

/**

* 计划任务接口-采集器实现类-多线程版

* @author javacoo

* @since 2011-11-02

* @version 1.0

*/

@Service

public class SchedulerAcquisitionSvcImpl extends AbstractSchedulerTaskSvc {

// Logger for this service.
private Logger log = LoggerFactory.getLogger(SchedulerAcquisitionSvcImpl.class);

/** Number of threads to start. */

private static int THREAD_NUM = 2;

/** Milliseconds each thread sleeps. */

private static int SLEEP_TIME = 100;

/** Map key under which a link URL is stored. */

private static String LINK_KEY = "linkKey";

/** Map key under which a title is stored. */

private static String TITLE_KEY = "titleKey";

/** Acquisition management service. */

private CmsAcquisitionMng cmsAcquisitionMng;

// NOTE(review): the four ThreadLocals below are raw; their type arguments were presumably
// stripped from this listing (HttpClient, ParseHtmlTool, UrlQueue, UrlQueue) — confirm.

/** ThreadLocal holding the HttpClient. */

private static ThreadLocal httpClientThreadLocal = new ThreadLocal();

/** ThreadLocal holding the ParseHtmlTool. */

private static ThreadLocal parseHtmlToolThreadLocal = new ThreadLocal();

/** ThreadLocal holding the UrlQueue. */

private static ThreadLocal urlQueueThreadLocal = new ThreadLocal();

/** ThreadLocal holding the plan UrlQueue. */

private static ThreadLocal planUrlQueueThreadLocal = new ThreadLocal();

// Setter used by Spring to inject the acquisition management service.
@Autowired

public void setCmsAcquisitionMng(CmsAcquisitionMng cmsAcquisitionMng) {

this.cmsAcquisitionMng = cmsAcquisitionMng;

}

/**
 * Runs the acquisition referenced by the scheduler's associate id on a new thread.
 *
 * <p>NOTE(review): {@code CmsScheduler} is not among the imports shown for this file; add
 * {@code com.jeecms.cms.entity.assist.CmsScheduler} if it is really missing.
 *
 * @param scheduler scheduler record whose associate id names the acquisition to run
 * @return true when an acquisition was found and its worker thread started; false when the
 *         record or its associate id is null, or no acquisition has that id
 */
@Override
protected boolean execute(CmsScheduler scheduler) {
    if (scheduler == null || scheduler.getAssociateId() == null) {
        return false;
    }
    CmsAcquisition acqu = cmsAcquisitionMng.findById(scheduler.getAssociateId());
    if (acqu == null) {
        return false;
    }
    // Use the class logger instead of System.out so the message honours the logging config.
    log.info("===============开始执行采集任务");
    new Thread(new MainThreadProcesser(this, acqu)).start();
    return true;
}

/**
 * Lists the acquisitions of the scheduler's site as associable tasks, copying each acquisition's
 * id and name into a {@link SchedulerTaskBean}.
 *
 * @param scheduler scheduler record; its site selects the acquisitions
 * @return one bean per acquisition; empty when the record or its site is missing or the site has
 *         no acquisitions
 */
@SuppressWarnings("unchecked") // tolerates a raw List from CmsAcquisitionMng.getList
public List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler) {
    List<SchedulerTaskBean> resultList = new ArrayList<SchedulerTaskBean>();
    if (scheduler == null || scheduler.getSite() == null) {
        return resultList;
    }
    // The element type must be declared: a for-each over a raw List yields Object.
    List<CmsAcquisition> acquisitions = cmsAcquisitionMng.getList(scheduler.getSite().getId());
    if (acquisitions == null) {
        return resultList;
    }
    for (CmsAcquisition acquisition : acquisitions) {
        SchedulerTaskBean schedulerTaskBean = new SchedulerTaskBean();
        schedulerTaskBean.setId(acquisition.getId());
        schedulerTaskBean.setName(acquisition.getName());
        resultList.add(schedulerTaskBean);
    }
    return resultList;
}

/**

* 主线程处理类

* @author javacoo

* @since 2011-11-02

*/

private class MainThreadProcesser implements Runnable {

private CmsAcquisition acqu;

private SchedulerTaskSvc schedulerAcquisitionSvc;

public MainThreadProcesser(SchedulerTaskSvc schedulerAcquisitionSvc,CmsAcquisition acqu) {

this.acqu = acqu;

this.schedulerAcquisitionSvc = schedulerAcquisitionSvc;

}

//线程锁

Object threadLock = new Object();

public void run() {

long tStart = System.currentTimeMillis();

System.out.println("主线程:"+Thread.currentThread().getName() + "开始...");

try {

CountDownLatch latch = new CountDownLatch(THREAD_NUM);

ExecutorService exec = Executors.newCachedThreadPool();

getHttpClient().getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,new HttpHost("128.160.64.5", 1235));

CharsetHandler handler = new CharsetHandler(acqu.getPageEncoding());

//取得当前任务所有计划

getAllPlans(acqu,getPlanUrlQueue());

//开启一线程执行抓取计划下URL

Thread thread = new Thread(new FetchUrlThread(schedulerAcquisitionSvc,latch,getHttpClient(),getPlanUrlQueue(),getUrlQueue(),getParseHtmlTool(acqu),handler,threadLock));

exec.execute(thread);

//开启指定数目线程执行采集内容

for(int i=0;i= 0; i--) {

planMap.put(LINK_KEY, plans[i]);

planMap.put(TITLE_KEY, acqu.getName());

addUrlAndTitleMap(planMap,urlQueue);

}

System.out.println("=======当前线程:"+Thread.currentThread().getName() + "计划URL连接数:"+urlQueue.getUnVisitedUrlNum());

}

/**
 * Fetches the page named by {@code map}'s link entry, extracts its link/title pairs and adds
 * them to the queue.
 *
 * @param httpClient client used for the request
 * @param parseHtmlTool parser that extracts link/title maps from the page
 * @param handler response handler that decodes the body
 * @param urlQueue queue receiving the extracted entries
 * @param map entry holding the page link under {@code LINK_KEY}
 * @throws URISyntaxException if the link is not a valid URI
 * @throws ClientProtocolException on an HTTP protocol error
 * @throws IOException on a connection problem
 */
private void getAllUrls(HttpClient httpClient, ParseHtmlTool parseHtmlTool, CharsetHandler handler,
        UrlQueue urlQueue, Map<String, String> map)
        throws URISyntaxException, ClientProtocolException, IOException {
    String link = map == null ? null : map.get(LINK_KEY);
    if (StringUtils.isBlank(link)) {
        // Nothing to fetch; previously this surfaced as a NullPointerException on trim().
        log.warn("skip plan entry without a link: {}", map);
        return;
    }
    HttpGet httpGet = new HttpGet(new URI(link.trim()));
    String html = httpClient.execute(httpGet, handler);
    if (html != null) { // the handler returns null when the response has no entity
        for (Map<String, String> planMap : parseHtmlTool.getUrlAndTitleMap(html)) {
            addUrlAndTitleMap(planMap, urlQueue);
        }
    }
    log.info("=======当前线程:" + Thread.currentThread().getName() + "URL连接数:"
            + urlQueue.getUnVisitedUrlNum());
}

/**
 * Fetches one content page and runs it through the HTML parser.
 *
 * <p>NOTE(review): the call that would persist the parsed text is commented out, so this method
 * currently always returns null. The base URL for relative links is hard-coded to
 * {@code http://localhost/v7/}; it presumably should come from the acquisition settings.
 *
 * @param acqu acquisition being processed (only referenced by the disabled save call)
 * @param httpClient client used for the request
 * @param parseHtmlTool parser that extracts the content from the page
 * @param handler response handler that decodes the body
 * @param map entry holding the link under {@code LINK_KEY} and the title under {@code TITLE_KEY}
 * @return always null (see note); failures are logged and also yield null
 */
private synchronized Content saveContent(CmsAcquisition acqu, HttpClient httpClient,
        ParseHtmlTool parseHtmlTool, CharsetHandler handler, Map<String, String> map) {
    String link = map == null ? null : map.get(LINK_KEY);
    if (StringUtils.isBlank(link)) {
        log.warn("skip content entry without a link: {}", map);
        return null;
    }
    // Trim first so leading whitespace cannot defeat the scheme check.
    link = link.trim();
    try {
        String target = hasHttpScheme(link) ? link : "http://localhost/v7/" + link;
        HttpGet httpGet = new HttpGet(new URI(target));
        String html = httpClient.execute(httpGet, handler);
        log.info("=============================子线程:" + Thread.currentThread().getName() + "执行");
        String txt = parseHtmlTool.getHtml(html);
        //return cmsAcquisitionMng.saveContent(map.get(TITLE_KEY), txt,acqu.getId());
        return null;
    } catch (Exception e) {
        // One log call with context replaces log.warn(null, e) plus printStackTrace().
        log.warn("failed to fetch or parse content from " + link, e);
        return null;
    }
}

/**
 * Tells whether the link starts with an http or https scheme (case-insensitive). The original
 * used {@code contains("http://")}, which treated https links as relative and relative links
 * with an embedded URL as absolute.
 */
private static boolean hasHttpScheme(String link) {
    return link.regionMatches(true, 0, "http://", 0, 7)
            || link.regionMatches(true, 0, "https://", 0, 8);
}

/**
 * Response handler that returns the response body as a String, decoded with a configured charset
 * when one is given.
 *
 * <p>Declared as {@code ResponseHandler<String>} so that {@code HttpClient.execute(request,
 * handler)} is typed as returning a String; with the raw interface it is not.
 */
private class CharsetHandler implements ResponseHandler<String> {

    /** Charset name used to decode the body; blank means "let EntityUtils decide". */
    private final String charset;

    public CharsetHandler(String charset) {
        this.charset = charset;
    }

    /**
     * @param response HTTP response to convert
     * @return the decoded body, or null when the response carries no entity
     * @throws HttpResponseException when the status code is 300 or above
     * @throws IOException when reading the body fails
     */
    public String handleResponse(HttpResponse response)
            throws ClientProtocolException, IOException {
        StatusLine statusLine = response.getStatusLine();
        if (statusLine.getStatusCode() >= 300) {
            throw new HttpResponseException(statusLine.getStatusCode(),
                    statusLine.getReasonPhrase());
        }
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }
        if (StringUtils.isBlank(charset)) {
            return EntityUtils.toString(entity);
        }
        return EntityUtils.toString(entity, charset);
    }
}

}

  定时服务关联任务bean SchedulerTaskBean.java

  package com.jeecms.cms.service.scheduler;

/**
 * Plain value holder describing a task that a scheduler can be associated with:
 * the task's primary key and its display name.
 *
 * @author javacoo
 * @since 2011-11-07
 */
public class SchedulerTaskBean {

    /** Task name. */
    private String name;

    /** Task primary key. */
    private Integer id;

    /**
     * @param id task primary key to store
     */
    public void setId(Integer id) {
        this.id = id;
    }

    /**
     * @return the stored task primary key, or null when none was set
     */
    public Integer getId() {
        return this.id;
    }

    /**
     * @param name task name to store
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the stored task name, or null when none was set
     */
    public String getName() {
        return this.name;
    }
}

  定时任务ControllerCmsSchedulerAct.java

  package com.jeecms.cms.action.admin.assist;

import java.util.List;

import javax.servlet.http.HttpServletRequest;

import javax.servlet.http.HttpServletResponse;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import org.springframework.beans.factory.annotation.Autowired;

import org.springframework.stereotype.Controller;

import org.springframework.ui.ModelMap;

import org.springframework.web.bind.annotation.RequestMapping;

import com.jeecms.cms.entity.assist.CmsAcquisition;

import com.jeecms.cms.entity.assist.CmsScheduler;

import com.jeecms.cms.entity.main.CmsSite;

import com.jeecms.cms.manager.assist.CmsSchedulerMng;

import com.jeecms.cms.manager.main.CmsLogMng;

import com.jeecms.cms.service.scheduler.SchedulerTaskManageSvc;

import com.jeecms.cms.service.scheduler.SchedulerTaskBean;

import com.jeecms.cms.web.CmsUtils;

import com.jeecms.cms.web.WebErrors;

/**

* 计划任务Controller

* @author javacoo

* @since 2011-11-7

*/

@Controller

public class CmsSchedulerAct {

private static final Logger log = LoggerFactory

.getLogger(CmsSchedulerAct.class);

/**日志服务*/

@Autowired

private CmsLogMng cmsLogMng;

/**计划管理服务*/

@Autowired

private CmsSchedulerMng manager;

/**计划任务管理服务*/

@Autowired

private SchedulerTaskManageSvc schedulerTaskManageSvc;

@RequestMapping("/scheduler/v_list.do")

public String list(HttpServletRequest request, ModelMap model) {

List list = manager.getList();

model.addAttribute("list", list);

return "scheduler/list";

}

@RequestMapping("/scheduler/v_listBy.do")

public String listBy(String moduleType,HttpServletRequest request, ModelMap model) {

CmsSite site = CmsUtils.getSite(request);

CmsScheduler scheduler = new CmsScheduler();

scheduler.setModuleType(moduleType);

scheduler.setSite(site);

List list = manager.getListBy(scheduler);

model.addAttribute("list", list);

model.addAttribute("moduleType", moduleType);

return "scheduler/list";

}

@RequestMapping("/scheduler/v_add.do")

public String add(String moduleType,HttpServletRequest request, ModelMap model) {

CmsSite site = CmsUtils.getSite(request);

CmsScheduler scheduler = new CmsScheduler();

scheduler.setModuleType(moduleType);

scheduler.setSite(site);

List schedulerTaskList = schedulerTaskManageSvc.associateTaskList(scheduler);

model.addAttribute("schedulerTaskList", schedulerTaskList);

model.addAttribute("moduleType", moduleType);

return "scheduler/add";

}

@RequestMapping("/scheduler/v_edit.do")

public String edit(Integer id, HttpServletRequest request, ModelMap model) {

WebErrors errors = validateEdit(id, request);

if (errors.hasErrors()) {

return errors.showErrorPage(model);

}

CmsSite site = CmsUtils.getSite(request);

CmsScheduler scheduler = manager.findById(id);

scheduler.setSite(site);

List schedulerTaskList = schedulerTaskManageSvc.associateTaskList(scheduler);

model.addAttribute("schedulerTaskList", schedulerTaskList);

model.addAttribute("cmsScheduler", scheduler);

return "scheduler/edit";

}

@RequestMapping("/scheduler/o_save.do")

public String save(CmsScheduler bean,HttpServletRequest request, ModelMap model) {

CmsSite site = CmsUtils.getSite(request);

bean.setSite(site);

bean = manager.save(bean);

model.addAttribute("moduleType", bean.getModuleType());

log.info("save CmsScheduler id={}", bean.getId());

cmsLogMng.operating(request, "cmsAcquisition.log.save", "id="

+ bean.getId() + ";name=" + bean.getName());

return "redirect:v_listBy.do";

}

@RequestMapping("/scheduler/o_update.do")

public String update(CmsScheduler bean, HttpServletRequest request, ModelMap model) {

WebErrors errors = validateUpdate(bean.getId(), request);

if (errors.hasErrors()) {

return errors.showErrorPage(model);

}

bean = manager.update(bean);

log.info("update CmsAcquisition id={}.", bean.getId());

cmsLogMng.operating(request, "cmsAcquisition.log.update", "id="

+ bean.getId() + ";name=" + bean.getName());

return listBy(bean.getModuleType(),request, model);

}

@RequestMapping("/scheduler/o_delete.do")

public String delete(String moduleType,Integer[] ids, HttpServletRequest request,

ModelMap model) {

WebErrors errors = validateDelete(ids, request);

if (errors.hasErrors()) {

return errors.showErrorPage(model);

}

CmsScheduler[] beans = manager.deleteByIds(ids);

for (CmsScheduler bean : beans) {

log.info("delete CmsAcquisition id={}", bean.getId());

cmsLogMng.operating(request, "cmsScheduler.log.delete", "id="

+ bean.getId() + ";name=" + bean.getName());

}

return listBy(moduleType,request, model);

}

@RequestMapping("/scheduler/o_start.do")

public String start(Integer id, HttpServletRequest request,

HttpServletResponse response, ModelMap model) {

CmsScheduler scheduler = manager.findById(id);

schedulerTaskManageSvc.start(scheduler);

manager.start(id);

model.addAttribute("moduleType", scheduler.getModuleType());

log.info("start CmsAcquisition id={}", id);

return "redirect:v_listBy.do";

}

@RequestMapping("/scheduler/o_end.do")

public String end(Integer id, HttpServletRequest request,

HttpServletResponse response, ModelMap model) {

manager.end(id);

CmsScheduler scheduler = manager.findById(id);

schedulerTaskManageSvc.stop(scheduler);

model.addAttribute("moduleType", scheduler.getModuleType());

log.info("end CmsScheduler id={}", id);

return "redirect:v_listBy.do";

}

private WebErrors validateEdit(Integer id, HttpServletRequest request) {

WebErrors errors = WebErrors.create(request);

CmsSite site = CmsUtils.getSite(request);

if (vldExist(id, site.getId(), errors)) {

return errors;

}

return errors;

}

private WebErrors validateUpdate(Integer id, HttpServletRequest request) {

WebErrors errors = WebErrors.create(request);

CmsSite site = CmsUtils.getSite(request);

if (vldExist(id, site.getId(), errors)) {

return errors;

}

return errors;

}

private WebErrors validateDelete(Integer[] ids, HttpServletRequest request) {

WebErrors errors = WebErrors.create(request);

CmsSite site = CmsUtils.getSite(request);

if (errors.ifEmpty(ids, "ids")) {

return errors;

}

for (Integer id : ids) {

vldExist(id, site.getId(), errors);

}

return errors;

}

private boolean vldExist(Integer id, Integer siteId, WebErrors errors) {

if (errors.ifNull(id, "id")) {

return true;

}

CmsScheduler entity = manager.findById(id);

if (errors.ifNotExist(entity, CmsAcquisition.class, id)) {

return true;

}

return false;

}

}

  持久对象基类 BaseCmsScheduler.java

  package com.jeecms.cms.entity.assist.base;

import java.io.Serializable;

import java.util.Date;

/**
 * Base persistent class for scheduler records: holds the mapped properties and their accessors.
 * Subclasses may override {@link #initialize()} to set defaults; every constructor calls it.
 */
public abstract class BaseCmsScheduler implements Serializable {

    // Entity and property names. Declared final so they cannot be reassigned at runtime.
    public static final String REF = "CmsScheduler";
    public static final String PROP_ID = "id";
    public static final String PROP_SITE = "site";
    public static final String PROP_ASSOCIATE_ID = "associateId";
    public static final String PROP_MODULE_TYPE = "moduleType";
    public static final String PROP_NAME = "name";
    public static final String PROP_START_TIME = "startTime";
    public static final String PROP_END_TIME = "endTime";
    public static final String PROP_STATUS = "status";
    public static final String PROP_EXPRESSION = "expression";

    private int hashCode = Integer.MIN_VALUE;

    // primary key
    private Integer id;

    // fields
    private String name;
    private Date startTime;
    private Date endTime;
    private Integer status;
    private Integer associateId;
    private String moduleType;
    private String expression;
    private com.jeecms.cms.entity.main.CmsSite site;

    /** Creates an empty record. */
    public BaseCmsScheduler() {
        initialize();
    }

    /**
     * Constructor for primary key.
     */
    public BaseCmsScheduler(Integer id) {
        this.setId(id);
        initialize();
    }

    /**
     * Constructor for all fields. Now calls {@link #initialize()} like the other constructors.
     */
    public BaseCmsScheduler(Integer id, String name, Date startTime, Date endTime,
            Integer status, Integer associateId, String moduleType, String expression,
            com.jeecms.cms.entity.main.CmsSite site) {
        this.id = id;
        this.name = name;
        this.startTime = startTime;
        this.endTime = endTime;
        this.status = status;
        this.associateId = associateId;
        this.moduleType = moduleType;
        this.expression = expression;
        this.site = site;
        initialize();
    }

    /** Hook for subclasses to set defaults; does nothing here. */
    protected void initialize() {}

    public int getHashCode() {
        return hashCode;
    }

    public void setHashCode(int hashCode) {
        this.hashCode = hashCode;
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public Date getStartTime() {
        return startTime;
    }

    public void setStartTime(Date startTime) {
        this.startTime = startTime;
    }

    public Date getEndTime() {
        return endTime;
    }

    public void setEndTime(Date endTime) {
        this.endTime = endTime;
    }

    public Integer getStatus() {
        return status;
    }

    public void setStatus(Integer status) {
        this.status = status;
    }

    public Integer getAssociateId() {
        return associateId;
    }

    public void setAssociateId(Integer associateId) {
        this.associateId = associateId;
    }

    public String getModuleType() {
        return moduleType;
    }

    public void setModuleType(String moduleType) {
        this.moduleType = moduleType;
    }

    public String getExpression() {
        return expression;
    }

    public void setExpression(String expression) {
        this.expression = expression;
    }

    public com.jeecms.cms.entity.main.CmsSite getSite() {
        return site;
    }

    public void setSite(com.jeecms.cms.entity.main.CmsSite site) {
        this.site = site;
    }
}

  持久对象 CmsScheduler.java

  package com.jeecms.cms.entity.assist;

import java.util.Date;

import com.jeecms.cms.entity.assist.base.BaseCmsScheduler;

/**
 * Persistent scheduler record.
 *
 * @author javacoo
 * @since 2011-11-07
 */
public class CmsScheduler extends BaseCmsScheduler {

    private static final long serialVersionUID = 1L;

    /**
     * Status value: stopped.
     */
    public static final int STOP = 0;

    /**
     * Status value: started.
     */
    public static final int START = 1;

    /**
     * Tells whether the record is stopped.
     *
     * @return true when the status is {@link #STOP}, or when no status is set — the same default
     *         {@link #init()} applies. Previously a null status caused a NullPointerException on
     *         unboxing.
     */
    public boolean isStop() {
        Integer status = getStatus();
        return status == null || status.intValue() == STOP;
    }

    /** Defaults the status to {@link #STOP} when none is set. */
    public void init() {
        if (getStatus() == null) {
            setStatus(STOP);
        }
    }

    public CmsScheduler() {
        super();
    }

    public CmsScheduler(java.lang.Integer id) {
        super(id);
    }

    public CmsScheduler(Integer id, String name, Date startTime, Date endTime,
            Integer status, Integer associateId, String moduleType, String expression,
            com.jeecms.cms.entity.main.CmsSite site) {
        super(id, name, startTime, endTime, status, associateId, moduleType, expression, site);
    }
}

  HBM 文件 CmsScheduler.hbm.xml

  

false

  ===================================定时任务模块相关互助=========== ==== ========================

  规划框架

  规划框架-任务调度Scheduler.java

  package com.jeecms.common.scheduling.core;

import java.util.Date;

import java.util.Timer;

import java.util.TimerTask;

/**
 * Scheduling framework: the task scheduler.
 *
 * Each Scheduler owns one {@link Timer}. A recurring schedule is built by
 * chaining one-shot timer tasks: after a {@link SchedulerTask} has run, the
 * next execution time is taken from its {@link ScheduleIterator} and a new
 * one-shot timer task is armed for it.
 *
 * @author javacoo
 * @since 2011-11-02
 */
public class Scheduler {

    /** Timer that performs the actual waiting and dispatching. */
    private final Timer timer = new Timer();

    /**
     * One-shot timer task that runs the wrapped SchedulerTask and then asks
     * the scheduler to arm the next execution.
     *
     * @author javacoo
     * @since 2011-11-02
     */
    class SchedulerTimerTask extends TimerTask {

        private SchedulerTask task;

        private ScheduleIterator times;

        public SchedulerTimerTask(SchedulerTask schedulerTask,
                ScheduleIterator iterator) {
            task = schedulerTask;
            times = iterator;
        }

        public void run() {
            // The next run is armed only after this one returns normally;
            // an exception thrown by the task propagates out of run().
            task.run();
            reschedule(task, times);
        }

    }

    public Scheduler() {
    }

    /**
     * Cancels the underlying timer.
     */
    public void cancel() {
        timer.cancel();
    }

    /**
     * Entry point: schedules a task for the times produced by an iterator.
     *
     * The first call to {@code iterator.next()} yields the first execution
     * time. A null time cancels the task immediately. Otherwise the task must
     * still be in the VIRGIN state; it is moved to SCHEDULED and a one-shot
     * timer task is armed for that time.
     *
     * @param schedulerTask the task to run
     * @param iterator      source of execution times
     * @throws IllegalStateException if the task was already scheduled or cancelled
     */
    public void schedule(SchedulerTask schedulerTask, ScheduleIterator iterator) {
        final Date firstRun = iterator.next();
        if (firstRun == null) {
            schedulerTask.cancel();
            return;
        }
        synchronized (schedulerTask.lock) {
            if (schedulerTask.state != SchedulerTask.VIRGIN) {
                throw new IllegalStateException("任务已经执行/取消");
            }
            schedulerTask.state = SchedulerTask.SCHEDULED;
            arm(schedulerTask, iterator, firstRun);
        }
    }

    /**
     * Arms the next execution of a task that has just run.
     *
     * A null next time cancels the task; a task that was cancelled in the
     * meantime is left alone.
     *
     * @param schedulerTask the task that just ran
     * @param iterator      source of execution times
     */
    private void reschedule(SchedulerTask schedulerTask,
            ScheduleIterator iterator) {
        final Date nextRun = iterator.next();
        if (nextRun == null) {
            schedulerTask.cancel();
            return;
        }
        synchronized (schedulerTask.lock) {
            if (schedulerTask.state == SchedulerTask.CANCELLED) {
                return;
            }
            arm(schedulerTask, iterator, nextRun);
        }
    }

    /** Creates a fresh one-shot timer task for the given time; the caller holds the task lock. */
    private void arm(SchedulerTask schedulerTask, ScheduleIterator iterator, Date when) {
        schedulerTask.timerTask = new SchedulerTimerTask(schedulerTask, iterator);
        timer.schedule(schedulerTask.timerTask, when);
    }

}

  规划框架——时间迭代器接口ScheduleIterator.java

  package com.jeecms.common.scheduling.core;

import java.util.Date;

/**

* Scheduling framework: time iterator interface.

* Expresses the planned execution times of a SchedulerTask as a series of java.util.Date objects.

* next() walks those dates in chronological order; a null return value cancels the task (it will never run again).

* @author javacoo

* @since 2011-11-02

*/

public interface ScheduleIterator {

/**

* Returns the next planned execution time.

* @return the next planned execution time, or null when there are no more executions

*/

Date next();

}

  定时任务抽象类SchedulerTask.java

  package com.jeecms.common.scheduling.core;

import java.util.TimerTask;

/**
 * Base class for tasks run by the scheduling framework.
 *
 * A task moves through three states. It starts as VIRGIN (never scheduled),
 * becomes SCHEDULED once handed to a scheduler, and ends as CANCELLED after
 * {@link #cancel()}. Code that may change the state must hold {@link #lock}.
 *
 * @author javacoo
 * @since 2011-11-02
 */
public abstract class SchedulerTask implements Runnable {

    /** State: never scheduled. */
    static final int VIRGIN = 0;

    /** State: scheduled. */
    static final int SCHEDULED = 1;

    /** State: cancelled. */
    static final int CANCELLED = 2;

    /** Monitor guarding {@link #state} and {@link #timerTask}. */
    final Object lock = new Object();

    /** Current lifecycle state. */
    int state = VIRGIN;

    /** Timer task currently armed for this task, or null if none. */
    TimerTask timerTask;

    protected SchedulerTask() {
    }

    /** The work to perform; supplied by subclasses. */
    public abstract void run();

    /**
     * Cancels this task so that it is not armed again.
     *
     * The armed timer task, if any, is cancelled and the state becomes CANCELLED.
     *
     * @return true if the task was in the SCHEDULED state when this was called
     */
    public boolean cancel() {
        synchronized (lock) {
            final boolean wasScheduled = (SCHEDULED == state);
            if (null != timerTask) {
                timerTask.cancel();
            }
            state = CANCELLED;
            return wasScheduled;
        }
    }

    /**
     * Returns the scheduled execution time reported by the armed timer task.
     *
     * @return that time in milliseconds, or 0 when no timer task is armed
     */
    public long scheduledExecutionTime() {
        synchronized (lock) {
            if (timerTask == null) {
                return 0;
            }
            return timerTask.scheduledExecutionTime();
        }
    }

}

  调度框架-时间迭代器接口实现类SimpleScheduleIterator.java

  package com.jeecms.common.scheduling.impl;

import java.util.Calendar;

import java.util.Date;

import java.util.GregorianCalendar;

import com.jeecms.common.scheduling.core.ScheduleIterator;

/**
 * Scheduling framework: simple implementation of the time iterator.
 *
 * Produces the next execution time for a monthly / weekly / daily / hourly /
 * per-minute / per-second plan described by a {@link ScheduleParamBean}.
 *
 * The bean corresponds to an expression of six comma-separated positions in
 * which {@code *} means "no value":
 * <ol>
 *   <li>week of the month</li>
 *   <li>day of the week</li>
 *   <li>day of the month</li>
 *   <li>hour (24-hour clock)</li>
 *   <li>minute</li>
 *   <li>second</li>
 * </ol>
 * The repetition period is decided by the first position that has a value:
 * <pre>
 * 1,6,4,15,20,30   every month
 * *,6,4,15,20,30   every week
 * *,*,4,15,20,30   every day
 * *,*,*,15,20,30   every hour
 * *,*,*,*,20,30    every minute
 * *,*,*,*,*,30     every second
 * </pre>
 * When both day-of-week and day-of-month are given, day-of-month is ignored
 * while computing the start time.
 *
 * @author javacoo
 * @since 2011-11-03
 */
public class SimpleScheduleIterator implements ScheduleIterator {

    /** Length of the weekly period in days. */
    private static final int DAYS_PER_WEEK = 7;

    private final ScheduleParamBean scheduleParamBean;

    /** Moving cursor; each call to {@link #next()} advances it by one period. */
    private final Calendar calendar = Calendar.getInstance();

    /** Snapshot of the reference date passed to the constructor. */
    private final Calendar orginCalendar = Calendar.getInstance();

    /**
     * Creates an iterator relative to the current time.
     *
     * @param scheduleParamBean schedule parameters
     */
    public SimpleScheduleIterator(final ScheduleParamBean scheduleParamBean) {
        this(scheduleParamBean, new Date());
    }

    /**
     * Creates an iterator relative to a reference date.
     *
     * The cursor is set to the reference date with every configured field
     * overridden. If that moment is not before the reference date, the cursor
     * is moved back one period so that the first {@link #next()} returns
     * exactly that moment. Otherwise the original catch-up adjustment is applied.
     *
     * @param scheduleParamBean schedule parameters
     * @param date              reference date
     */
    public SimpleScheduleIterator(final ScheduleParamBean scheduleParamBean, Date date) {
        this.scheduleParamBean = scheduleParamBean;
        orginCalendar.setTime(date);
        calendar.setTime(date);
        if (null != scheduleParamBean.getWeekOfMonth()) {
            calendar.set(Calendar.WEEK_OF_MONTH, scheduleParamBean.getWeekOfMonth());
        }
        // Day-of-week takes precedence over day-of-month when both are configured.
        if (null != scheduleParamBean.getDayOfWeek()) {
            calendar.set(Calendar.DAY_OF_WEEK, scheduleParamBean.getDayOfWeek());
        } else if (null != scheduleParamBean.getDayOfMonth()) {
            calendar.set(Calendar.DAY_OF_MONTH, scheduleParamBean.getDayOfMonth());
        }
        if (null != scheduleParamBean.getHourOfDay()) {
            calendar.set(Calendar.HOUR_OF_DAY, scheduleParamBean.getHourOfDay());
        }
        if (null != scheduleParamBean.getMinute()) {
            calendar.set(Calendar.MINUTE, scheduleParamBean.getMinute());
        }
        if (null != scheduleParamBean.getSecond()) {
            calendar.set(Calendar.SECOND, scheduleParamBean.getSecond());
        }
        calendar.set(Calendar.MILLISECOND, 0);
        if (!calendar.getTime().before(date)) {
            // Configured moment is now or in the future: step back one period
            // so the first next() lands on it.
            System.out.println(calendar.getTime() +"大于当前时间:"+date);
            shiftByPeriods(-1);
        } else {
            // Configured moment is already past. The original author notes the
            // task would otherwise fire several times in a row, and offsets the
            // cursor by the difference to the reference date.
            // NOTE(review): this branch ignores week-of-month / day-of-week and
            // only adjusts the first configured field of day/hour/minute/second;
            // kept unchanged — verify it produces the intended start time.
            System.out.println(calendar.getTime() +"小于当前时间:"+date);
            if (null != scheduleParamBean.getDayOfMonth()) {
                calendar.add(Calendar.DAY_OF_MONTH, orginCalendar.get(Calendar.DAY_OF_MONTH) - scheduleParamBean.getDayOfMonth());
            } else if (null != scheduleParamBean.getHourOfDay()) {
                calendar.add(Calendar.HOUR_OF_DAY, orginCalendar.get(Calendar.HOUR_OF_DAY) - scheduleParamBean.getHourOfDay());
            } else if (null != scheduleParamBean.getMinute()) {
                calendar.add(Calendar.MINUTE, orginCalendar.get(Calendar.MINUTE) - scheduleParamBean.getMinute());
            } else if (null != scheduleParamBean.getSecond()) {
                calendar.add(Calendar.SECOND, orginCalendar.get(Calendar.SECOND) - scheduleParamBean.getSecond());
            }
        }
    }

    /**
     * Advances the cursor by one period and returns the new moment.
     *
     * @return the next execution time; this implementation never returns null
     */
    public Date next() {
        shiftByPeriods(1);
        System.out.println("下次执行时间:"+calendar.getTime());
        return calendar.getTime();
    }

    /**
     * Moves the cursor by whole periods; the period is chosen by the first
     * configured field (month, week, day, hour, minute, second).
     *
     * The weekly period is seven days. The previous code stepped six days,
     * which shifted the weekday on every repetition.
     *
     * @param periods number of periods to move; negative moves backwards
     */
    private void shiftByPeriods(int periods) {
        if (null != scheduleParamBean.getWeekOfMonth()) {
            calendar.add(Calendar.MONTH, periods);
        } else if (null != scheduleParamBean.getDayOfWeek()) {
            calendar.add(Calendar.DAY_OF_MONTH, periods * DAYS_PER_WEEK);
        } else if (null != scheduleParamBean.getDayOfMonth()) {
            calendar.add(Calendar.DAY_OF_MONTH, periods);
        } else if (null != scheduleParamBean.getHourOfDay()) {
            calendar.add(Calendar.HOUR_OF_DAY, periods);
        } else if (null != scheduleParamBean.getMinute()) {
            calendar.add(Calendar.MINUTE, periods);
        } else if (null != scheduleParamBean.getSecond()) {
            calendar.add(Calendar.SECOND, periods);
        }
    }

}

  调度参数beanScheduleParamBean.java

  package com.jeecms.common.scheduling.impl;

/**
 * Parameter bean for a time schedule.
 *
 * Every field is optional; null means "no value" for that position.
 *
 * @author javacoo
 * @since 2011-11-04
 */
public class ScheduleParamBean {

    /** Week of the month. */
    private Integer weekOfMonth;

    /** Day of the week. */
    private Integer dayOfWeek;

    /** Day of the month. */
    private Integer dayOfMonth;

    /** Hour of the day (24-hour clock). */
    private Integer hourOfDay;

    /** Minute. */
    private Integer minute;

    /** Second. */
    private Integer second;

    /** Creates a bean with every field unset. */
    public ScheduleParamBean() {
    }

    /**
     * Creates a fully specified bean.
     *
     * @param weekOfMonth week of the month, or null
     * @param dayOfWeek   day of the week, or null
     * @param dayOfMonth  day of the month, or null
     * @param hourOfDay   hour of the day (24-hour clock), or null
     * @param minute      minute, or null
     * @param second      second, or null
     */
    public ScheduleParamBean(Integer weekOfMonth, Integer dayOfWeek,
            Integer dayOfMonth, Integer hourOfDay, Integer minute,
            Integer second) {
        this.weekOfMonth = weekOfMonth;
        this.dayOfWeek = dayOfWeek;
        this.dayOfMonth = dayOfMonth;
        this.hourOfDay = hourOfDay;
        this.minute = minute;
        this.second = second;
    }

    public Integer getWeekOfMonth() {
        return weekOfMonth;
    }

    public void setWeekOfMonth(Integer weekOfMonth) {
        this.weekOfMonth = weekOfMonth;
    }

    public Integer getDayOfWeek() {
        return dayOfWeek;
    }

    public void setDayOfWeek(Integer dayOfWeek) {
        this.dayOfWeek = dayOfWeek;
    }

    public Integer getDayOfMonth() {
        return dayOfMonth;
    }

    public void setDayOfMonth(Integer dayOfMonth) {
        this.dayOfMonth = dayOfMonth;
    }

    public Integer getHourOfDay() {
        return hourOfDay;
    }

    public void setHourOfDay(Integer hourOfDay) {
        this.hourOfDay = hourOfDay;
    }

    public Integer getMinute() {
        return minute;
    }

    public void setMinute(Integer minute) {
        this.minute = minute;
    }

    public Integer getSecond() {
        return second;
    }

    public void setSecond(Integer second) {
        this.second = second;
    }

    /** Renders all six fields, in alphabetical field-name order. */
    @Override
    public String toString() {
        return "ScheduleParamBean [dayOfMonth=" + dayOfMonth + ", dayOfWeek="
                + dayOfWeek + ", hourOfDay=" + hourOfDay + ", minute=" + minute
                + ", second=" + second + ", weekOfMonth=" + weekOfMonth + "]";
    }

}

  采集相关

  HTML解析工具类接口ParseHtmlTool.java

  package com.jeecms.common.crawler.util;

import java.util.List;

import java.util.Map;

/**
 * Contract for HTML parsing tools used by the crawler.
 *
 * The element types are declared explicitly so callers do not need casts.
 * Implementations written against the earlier raw {@code List} signatures
 * still compile, with an unchecked warning.
 *
 * @author javacoo
 * @since 2011-10-31
 */
public interface ParseHtmlTool {

    /**
     * Extracts the link URLs.
     *
     * @param orginHtml raw HTML
     * @return the link URLs
     */
    List<String> getUrlList(String orginHtml);

    /**
     * Extracts the link titles.
     *
     * @param orginHtml raw HTML
     * @return the link titles
     */
    List<String> getTitleList(String orginHtml);

    /**
     * Extracts the HTML of the configured region.
     *
     * @param orginHtml raw HTML
     * @return the HTML of the configured region
     */
    String getHtml(String orginHtml);

    /**
     * Extracts link/title pairs.
     *
     * @param orginHtml raw HTML
     * @return one map per link, holding its URL and its title
     */
    List<Map<String, String>> getUrlAndTitleMap(String orginHtml);

}

  HTML解析工具,HtmlParser实现类HtmlParserImpl.java

  package com.jeecms.common.crawler.util;

import java.io.BufferedReader;

import java.io.File;

import java.io.FileInputStream;

import java.io.IOException;

import java.io.InputStreamReader;

import java.net.URISyntaxException;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.Iterator;

import java.util.List;

import java.util.Map;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;

import org.htmlparser.Node;

import org.htmlparser.NodeFilter;

import org.htmlparser.Parser;

import org.htmlparser.filters.HasAttributeFilter;

import org.htmlparser.filters.NodeClassFilter;

import org.htmlparser.filters.TagNameFilter;

import org.htmlparser.nodes.RemarkNode;

import org.htmlparser.util.NodeList;

import org.htmlparser.util.ParserException;

import com.jeecms.cms.entity.assist.CmsAcquisition;

import com.jeecms.common.crawler.ParamBean;

/**
 * HTML parsing tool built on the HtmlParser library.
 *
 * The acquisition settings are turned into four maps: link-area select and
 * remove filters, and content-area select and remove filters. Each public
 * method first cuts the HTML down to the configured area and then, for the
 * link methods, pulls URLs and titles out of the anchors with a regular
 * expression.
 *
 * @author javacoo
 * @since 2011-10-31
 */
public class HtmlParserImpl implements ParseHtmlTool {

    /** Map key holding a link URL. */
    private static final String LINK_KEY = "linkKey";

    /** Map key holding a link title. */
    private static final String TITLE_KEY = "titleKey";

    /** Map key under which bare tag names (as opposed to attribute=value pairs) are stored. */
    private static final String SINGLE_TAG = "singleTag";

    /**
     * Anchor pattern: group 1 is the href value, group 2 the anchor body.
     *
     * NOTE(review): the published source only showed {@code "(.*?)</a>"}. The
     * opening-tag part had been stripped, leaving one group while the code
     * reads group 2. This is a reconstruction — confirm against the original project.
     */
    private static final String LINK_REGX = "<a[^>]*href=\"(.*?)\"[^>]*>(.*?)</a>";

    /** Compiled form of {@link #LINK_REGX}. */
    private static final Pattern LINK_PATTERN = Pattern.compile(LINK_REGX);

    /** Regex group index of the URL. */
    private static final int LINK_GROUP = 1;

    /** Regex group index of the title. */
    private static final int TITLE_GROUP = 2;

    /** Parsed acquisition parameters. */
    private final ParamBean paramBean;

    /**
     * Creates a parser configured from acquisition settings.
     *
     * @param acqu acquisition settings
     */
    public HtmlParserImpl(CmsAcquisition acqu) {
        this.paramBean = parseRequestParam(acqu);
    }

    /**
     * Extracts the link titles from the configured link area.
     *
     * @param orginHtml raw HTML
     * @return the titles, or null when the link area is empty
     */
    public List<String> getTitleList(String orginHtml) {
        return getUrlOrTitleListByType(linkAreaOf(orginHtml), TITLE_KEY);
    }

    /**
     * Extracts the link URLs from the configured link area.
     *
     * @param orginHtml raw HTML
     * @return the URLs, or null when the link area is empty
     */
    public List<String> getUrlList(String orginHtml) {
        return getUrlOrTitleListByType(linkAreaOf(orginHtml), LINK_KEY);
    }

    /**
     * Extracts the HTML of the configured content area.
     *
     * @param orginHtml raw HTML
     * @return the content-area HTML, or an empty string when parsing fails
     */
    public String getHtml(String orginHtml) {
        return getHtmlByFilter(paramBean.getContentStartMap(), paramBean.getContentEndMap(), orginHtml);
    }

    /**
     * Extracts URL/title pairs from the configured link area.
     *
     * Anchors whose URL or title is empty are skipped.
     *
     * @param orginHtml raw HTML
     * @return one map per anchor with the keys {@code linkKey} and {@code titleKey}
     */
    public List<Map<String, String>> getUrlAndTitleMap(String orginHtml) {
        List<Map<String, String>> pairs = new ArrayList<Map<String, String>>();
        Matcher m = LINK_PATTERN.matcher(linkAreaOf(orginHtml));
        while (m.find()) {
            String link = m.group(LINK_GROUP);
            String title = m.group(TITLE_GROUP);
            if (StringUtils.isNotEmpty(link) && StringUtils.isNotEmpty(title)) {
                Map<String, String> pair = new HashMap<String, String>();
                pair.put(LINK_KEY, link);
                pair.put(TITLE_KEY, title);
                pairs.add(pair);
            }
        }
        return pairs;
    }

    /** Cuts the HTML down to the configured link area. */
    private String linkAreaOf(String orginHtml) {
        return getHtmlByFilter(paramBean.getLinksetStartMap(), paramBean.getLinksetEndMap(), orginHtml);
    }

    /**
     * Parses the acquisition settings into a ParamBean.
     *
     * Settings that are empty are left at the bean's defaults.
     *
     * @param acqu acquisition settings
     * @return the populated bean
     */
    private ParamBean parseRequestParam(CmsAcquisition acqu) {
        ParamBean bean = new ParamBean();
        if (StringUtils.isNotEmpty(acqu.getLinksetStart())) {
            bean.setLinksetStartMap(populateParamMap(acqu.getLinksetStart()));
        }
        if (StringUtils.isNotEmpty(acqu.getLinksetEnd())) {
            bean.setLinksetEndMap(populateParamMap(acqu.getLinksetEnd()));
        }
        if (StringUtils.isNotEmpty(acqu.getContentStart())) {
            bean.setContentStartMap(populateParamMap(acqu.getContentStart()));
        }
        if (StringUtils.isNotEmpty(acqu.getContentEnd())) {
            bean.setContentEndMap(populateParamMap(acqu.getContentEnd()));
        }
        return bean;
    }

    /**
     * Collects either the URLs or the titles of every anchor in the HTML.
     *
     * @param html link-area HTML
     * @param type {@link #TITLE_KEY} for titles; anything else yields URLs
     * @return the collected values, or null when {@code html} is empty
     */
    private List<String> getUrlOrTitleListByType(String html, String type) {
        if (StringUtils.isEmpty(html)) {
            return null;
        }
        int group = TITLE_KEY.equals(type) ? TITLE_GROUP : LINK_GROUP;
        List<String> values = new ArrayList<String>();
        Matcher m = LINK_PATTERN.matcher(html);
        while (m.find()) {
            values.add(m.group(group));
        }
        return values;
    }

    /**
     * Selects the configured area of the HTML and strips unwanted parts.
     *
     * Step 1 concatenates the HTML of every node matching a select filter,
     * step 2 removes every node matching a remove filter, step 3 removes
     * HTML comments. In both maps a value may list alternatives separated
     * by {@code |}.
     *
     * @param tagMap       select filters (attribute name or SINGLE_TAG to value list)
     * @param removeTagMap remove filters, same layout
     * @param orginHtml    raw HTML
     * @return the resulting HTML, or an empty string when parsing fails
     */
    private String getHtmlByFilter(Map<String, String> tagMap,
            Map<String, String> removeTagMap, String orginHtml) {
        try {
            Parser parser = new Parser();
            // Step 1: collect the nodes selected by attribute/tag filters.
            StringBuilder selected = new StringBuilder();
            for (Map.Entry<String, String> entry : tagMap.entrySet()) {
                for (String value : entry.getValue().split("\\|")) {
                    appendHtmlByFilter(parser, populateFilter(entry.getKey(), value), orginHtml, selected);
                }
            }
            // Step 2: drop the nodes matched by the remove filters.
            String contentHtml = selected.toString();
            for (Map.Entry<String, String> entry : removeTagMap.entrySet()) {
                for (String value : entry.getValue().split("\\|")) {
                    contentHtml = removeHtmlByFilter(parser, populateFilter(entry.getKey(), value), contentHtml);
                }
            }
            // Step 3: drop HTML comments.
            contentHtml = removeHtmlByFilter(parser, new NodeClassFilter(RemarkNode.class), contentHtml);
            System.out.println("=================================结果=======================================");
            System.out.println(contentHtml);
            return contentHtml;
        } catch (ParserException e) {
            // Best effort: report the failure and fall through to the empty result.
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Parses a filter specification into a map.
     *
     * Accepted forms, freely mixed and separated by commas:
     * <pre>
     * attribute=value list   class=articleList|tips,id=fxwb|fxMSN
     * tag names              div,p,span
     * </pre>
     * Tag names are joined with {@code |} and stored under SINGLE_TAG. Only the
     * first {@code =} separates name from value, so values may contain
     * {@code =}. Pairs with an empty name or value are skipped; they used to
     * raise ArrayIndexOutOfBoundsException.
     *
     * @param paramStr filter specification
     * @return the parsed map
     */
    private Map<String, String> populateParamMap(String paramStr) {
        Map<String, String> paramMap = new HashMap<String, String>();
        StringBuilder tagNames = new StringBuilder();
        for (String token : paramStr.split(",")) {
            int eq = token.indexOf('=');
            if (eq >= 0) {
                String key = token.substring(0, eq);
                String value = token.substring(eq + 1);
                if (key.length() > 0 && value.length() > 0) {
                    paramMap.put(key, value);
                }
            } else if (StringUtils.isNotEmpty(token)) {
                if (tagNames.length() > 0) {
                    tagNames.append("|");
                }
                tagNames.append(token);
            }
        }
        if (tagNames.length() > 0) {
            paramMap.put(SINGLE_TAG, tagNames.toString());
        }
        return paramMap;
    }

    /**
     * Builds the node filter for one key/value pair.
     *
     * @param key   attribute name, or SINGLE_TAG for a tag-name filter
     * @param value attribute value, or the tag name
     * @return the filter
     */
    private NodeFilter populateFilter(String key, String value) {
        if (SINGLE_TAG.equals(key)) {
            return new TagNameFilter(value);
        }
        return new HasAttributeFilter(key, value);
    }

    /**
     * Removes the HTML of every node matching the filter.
     *
     * @param parser    parser to reuse
     * @param filter    filter selecting the nodes to remove
     * @param orginHtml HTML to clean
     * @return the HTML without the matched nodes
     * @throws ParserException if parsing fails
     */
    private String removeHtmlByFilter(Parser parser, NodeFilter filter, String orginHtml) throws ParserException {
        parser.setInputHTML(orginHtml);
        NodeList nodes = parser.extractAllNodesThatMatch(filter);
        String cleaned = orginHtml;
        for (int i = 0; i < nodes.size(); i++) {
            cleaned = StringUtils.remove(cleaned, nodes.elementAt(i).toHtml());
        }
        return cleaned;
    }

    /**
     * Appends the HTML of every node matching the filter.
     *
     * The input is fed to the parser again on every call, as
     * {@link #removeHtmlByFilter} does. NOTE(review): without that, a second
     * extraction on the same parser appears to continue from the end of the
     * input and find nothing — confirm against the HtmlParser documentation.
     *
     * @param parser parser to reuse
     * @param filter filter selecting the nodes to keep
     * @param html   HTML to search
     * @param sb     receives the HTML of the matched nodes
     * @throws ParserException if parsing fails
     */
    private void appendHtmlByFilter(Parser parser, NodeFilter filter, String html,
            StringBuilder sb) throws ParserException {
        parser.setInputHTML(html);
        NodeList nodes = parser.extractAllNodesThatMatch(filter);
        for (int i = 0; i < nodes.size(); i++) {
            sb.append(nodes.elementAt(i).toHtml());
        }
    }

    /**
     * Parses a filter specification into an existing map.
     *
     * Delegates to {@link #populateParamMap(String)} so both overloads share
     * one set of parsing rules.
     *
     * @param paramMap map receiving the parsed entries
     * @param paramStr filter specification
     */
    private void populateParamMap(Map<String, String> paramMap, String paramStr) {
        paramMap.putAll(populateParamMap(paramStr));
    }

    /**
     * Test helper: reads a whole text file.
     *
     * Every line is terminated with {@code \n} in the result. The stream is
     * closed even when reading fails.
     *
     * @param szFileName absolute file path
     * @param charset    charset name
     * @return the file content, or an empty string if anything goes wrong
     */
    public static String openFile(String szFileName, String charset) {
        FileInputStream in = null;
        try {
            in = new FileInputStream(new File(szFileName));
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
            StringBuilder szContent = new StringBuilder();
            String szTemp;
            while ((szTemp = reader.readLine()) != null) {
                szContent.append(szTemp).append("\n");
            }
            return szContent.toString();
        } catch (Exception e) {
            // Deliberate best effort: any failure yields an empty result.
            return "";
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
        }
    }

    /**
     * Manual test: prints the link URLs and titles found in a sample file.
     *
     * @throws ParserException declared for parity with the other manual tests
     */
    public void testFetchLinkAndTitle() throws ParserException {
        String html = openFile("F:\\4.htm", "UTF-8");
        Map<String, String> map = new HashMap<String, String>();
        map.put("class", "m_list");
        Map<String, String> notMap = new HashMap<String, String>();
        //notMap.put("class", "atc_ic_f");
        String result = getHtmlByFilter(map, notMap, html);
        System.out.println("=============================result============================");
        System.out.println(result);
        System.out.println("==========================================================");
        Matcher m = LINK_PATTERN.matcher(result);
        while (m.find()) {
            String link = m.group(LINK_GROUP);
            String title = m.group(TITLE_GROUP);
            if (StringUtils.isNotEmpty(link)) {
                System.out.println("url : " + link);
                System.out.println("title : " + title);
            }
        }
    }

    /**
     * Manual test: extracts the content area of a sample file.
     *
     * @throws ParserException declared for parity with the other manual tests
     */
    public void testFetchContent() throws ParserException {
        String html = openFile("F:\\6.shtml", "GB2312");
        Map<String, String> map = new HashMap<String, String>();
        map.put("id", "artibody");
        Map<String, String> notMap = new HashMap<String, String>();
        notMap.put(SINGLE_TAG, "style|script");
        notMap.put("type", "text/javascript");
        notMap.put("class", "icon_fx|blkComment otherContent_01");
        notMap.put("style", "text-align: right;padding-right:10px;|margin-top:6px;|font-size: 12px ! important;|font-size:12px");
        notMap.put("id", "fxwb|fxMSN|fxMSN|comment_t_show_top");
        getHtmlByFilter(map, notMap, html);
    }

    /**
     * Manual test: prints how a filter specification is parsed.
     */
    public void testParseParam() {
        Map<String, String> map = new HashMap<String, String>();
        populateParamMap(map, "class=articleList|tips,p,div");
        for (Map.Entry<String, String> entry : map.entrySet()) {
            for (String value : entry.getValue().split("\\|")) {
                System.out.println("tempKey:" + entry.getKey());
                System.out.println("value:" + value);
            }
        }
    }

    /**
     * Manual test: prints a sample file before and after comment removal.
     *
     * @throws ParserException if parsing fails
     */
    public void testRemarkFilter() throws ParserException {
        String html = openFile("F:\\6.shtml", "GB2312");
        System.out.println("=========================过滤注释前HTML==================================");
        System.out.println(html);
        NodeFilter filter = new NodeClassFilter(RemarkNode.class);
        html = removeHtmlByFilter(new Parser(), filter, html);
        System.out.println("=========================过滤注释后HTML==================================");
        System.out.println(html);
    }

    /**
     * Manual entry point; uncomment one of the calls to run that test.
     */
    public static void main(String[] args) throws ParserException,
            URISyntaxException, IOException {
        HtmlParserImpl parseHtmlTool = new HtmlParserImpl(new CmsAcquisition());
        //parseHtmlTool.testParseParam();
        //parseHtmlTool.testFetchLinkAndTitle();
        //parseHtmlTool.testFetchContent();
        //parseHtmlTool.testRemarkFilter();
    }

}

  采集参数封装beanParamBean.java

  package com.jeecms.common.crawler;

import java.util.HashMap;

import java.util.Map;

/**

* 采集参数封装bean

* @author javacoo

* @since 2011-10-31

*/

public class ParamBean {

/**待采集连接区域属性MAP*/

private Map linksetStartMap = new HashMap();

/**待采集连接区域过滤属性MAP*/

private Map linksetEndMap = new HashMap();

/**待采集内容区域属性MAP*/

private Map contentStartMap = new HashMap();

/**待采集内容区域过滤属性MAP*/

private Map contentEndMap = new HashMap();

public Map getLinksetStartMap() {

return linksetStartMap;

}

public void setLinksetStartMap(Map linksetStartMap) {

this.linksetStartMap = linksetStartMap;

}

public Map ge

0 个评论

要回复文章请先登录注册


官方客服QQ群

微信人工客服

QQ人工客服


线