jeecms 采集功能优化,基于htmlparser实现,多线程版

panyingdao 2011-11-03

为了熟悉多线程相关知识,我把jeecms采集器类改成了多线程版。目前还不是很完善,贴出来供大家一起完善、改进。

说明:暂不支持暂停、停止功能。

用法:和我上一篇jeecms采集功能优化,基于htmlparser实现里面的用法一样。

思路:想法很简单。在主线程处理类中,先取得当前采集任务下的所有URL并放入队列,然后开启指定数目的线程(默认是2)采集内容。

代码清单:

采集器主类:MultiThreadAcquisitionSvcImpl.java

HTML解析工具类接口:ParseHtmlTool.java

HTML解析工具,HtmlParser实现类:HtmlParserImpl.java

采集参数封装bean:ParamBean.java

队列类:Queue.java

URL队列:UrlQueue.java

代码如下:

采集器主类:MultiThreadAcquisitionSvcImpl.java

packagecom.jeecms.cms.service;

importjava.io.IOException;

importjava.net.URI;

importjava.net.URISyntaxException;

importjava.util.List;

importjava.util.Map;

importjava.util.concurrent.CountDownLatch;

importjava.util.concurrent.ExecutorService;

importjava.util.concurrent.Executors;

importorg.apache.commons.lang.StringUtils;

importorg.apache.http.HttpEntity;

importorg.apache.http.HttpHost;

importorg.apache.http.HttpResponse;

importorg.apache.http.StatusLine;

importorg.apache.http.client.ClientProtocolException;

importorg.apache.http.client.HttpClient;

importorg.apache.http.client.HttpResponseException;

importorg.apache.http.client.ResponseHandler;

importorg.apache.http.client.methods.HttpGet;

importorg.apache.http.conn.params.ConnRoutePNames;

importorg.apache.http.impl.client.DefaultHttpClient;

importorg.apache.http.util.EntityUtils;

importorg.slf4j.Logger;

importorg.slf4j.LoggerFactory;

importorg.springframework.beans.factory.annotation.Autowired;

importorg.springframework.stereotype.Service;

importcom.jeecms.cms.entity.assist.CmsAcquisition;

importcom.jeecms.cms.entity.main.Content;

importcom.jeecms.cms.manager.assist.CmsAcquisitionMng;

/**

*采集器-多线程版

*@authorjavacoo

*@since2011-11-02

*@version1.0

*/

@Service

publicclassMultiThreadAcquisitionSvcImplimplementsAcquisitionSvc{

privateLoggerlog=LoggerFactory.getLogger(MultiThreadAcquisitionSvcImpl.class);

/**开启线程数*/

privatestaticintTHREAD_NUM=2;

/**每个线程休眠毫秒数*/

privatestaticintSLEEP_TIME=100;

/**连接集合标志*/

privatestaticStringLINK_KEY="linkKey";

/**标题集合标志*/

privatestaticStringTITLE_KEY="titleKey";

/**采集管理对象*/

privateCmsAcquisitionMngcmsAcquisitionMng;

/**存放HttpClient的ThreadLocal对象*/

privatestaticThreadLocal<HttpClient>httpClientThreadLocal=newThreadLocal<HttpClient>();

/**存放ParseHtmlTool的ThreadLocal对象*/

privatestaticThreadLocal<ParseHtmlTool>parseHtmlToolThreadLocal=newThreadLocal<ParseHtmlTool>();

/**存放UrlQueue的ThreadLocal对象*/

privatestaticThreadLocal<UrlQueue>urlQueueThreadLocal=newThreadLocal<UrlQueue>();

@Autowired

publicvoidsetCmsAcquisitionMng(CmsAcquisitionMngcmsAcquisitionMng){

this.cmsAcquisitionMng=cmsAcquisitionMng;

}

/**

*开始执行采集任务

*/

publicbooleanstart(Integerid){

CmsAcquisitionacqu=cmsAcquisitionMng.findById(id);

if(acqu==null||acqu.getStatus()==CmsAcquisition.START){

returnfalse;

}

newThread(newMainThreadProcesser(this,acqu)).start();

returntrue;

}

/**

*主线程处理类

*@authorjavacoo

*@since2011-11-02

*/

privateclassMainThreadProcesserimplementsRunnable{

privateCmsAcquisitionacqu;

privateAcquisitionSvcacquisitionSvc;

publicMainThreadProcesser(AcquisitionSvcacquisitionSvc,CmsAcquisitionacqu){

this.acqu=acqu;

this.acquisitionSvc=acquisitionSvc;

}

publicvoidrun(){

longtStart=System.currentTimeMillis();

System.out.println("主线程:"+Thread.currentThread().getName()+"开始...");

try{

getHttpClient().getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,newHttpHost("128.160.64.5",1235));

CharsetHandlerhandler=newCharsetHandler(acqu.getPageEncoding());

getAllUrls(acqu,handler);

CountDownLatchlatch=newCountDownLatch(THREAD_NUM);

ExecutorServiceexec=Executors.newCachedThreadPool();

for(inti=0;i<THREAD_NUM;i++){

Threadthread=newThread(newProcesser(acquisitionSvc,acqu,latch,getHttpClient(),getUrlQueue(),getParseHtmlTool(acqu),handler));

exec.execute(thread);

}

latch.await();

exec.shutdown();

}catch(InterruptedExceptione){

e.printStackTrace();

}catch(ClientProtocolExceptione){

e.printStackTrace();

}catch(URISyntaxExceptione){

e.printStackTrace();

}catch(IOExceptione){

e.printStackTrace();

}finally{

httpClientThreadLocal.get().getConnectionManager().shutdown();

cmsAcquisitionMng.end(acqu.getId());

httpClientThreadLocal.remove();

parseHtmlToolThreadLocal.remove();

urlQueueThreadLocal.remove();

longtEnd=System.currentTimeMillis();

System.out.println("主线程:"+Thread.currentThread().getName()+"结束...");

System.out.println("主线程:"+Thread.currentThread().getName()+"总共用时:"+(tEnd-tStart)+"ms");

}

}

}

/**

*处理类

*@authorjavacoo

*@since2011-11-02

*/

privateclassProcesserimplementsRunnable{

privateAcquisitionSvcacquisitionSvc;

privateCmsAcquisitionacqu;

privateCountDownLatchlatch;

privateUrlQueueurlQueue;

privateHttpClienthttpClient;

privateParseHtmlToolparseHtmlTool;

privateCharsetHandlerhandler;

publicProcesser(AcquisitionSvcacquisitionSvc,CmsAcquisitionacqu,CountDownLatchlatch,HttpClienthttpClient,UrlQueueurlQueue,ParseHtmlToolparseHtmlTool,CharsetHandlerhandler){

this.acquisitionSvc=acquisitionSvc;

this.acqu=acqu;

this.latch=latch;

this.urlQueue=urlQueue;

this.httpClient=httpClient;

this.parseHtmlTool=parseHtmlTool;

this.handler=handler;

}

publicvoidrun(){

System.out.println("======================子线程:"+Thread.currentThread().getName()+"开始...");

try{

Map<String,String>urlMap=null;

while(!urlAndTitleMapIsEmpty(urlQueue)){

urlMap=getUrlAndTitleMap(urlQueue);

saveContent(acqu,httpClient,parseHtmlTool,handler,urlMap);

Thread.sleep(SLEEP_TIME);

}

}catch(Exceptione){

e.printStackTrace();

log.warn(null,e);

}finally{

System.out.println("======================子线程:"+Thread.currentThread().getName()+"结束.");

log.info("Acquisition#{}complete",acqu.getId());

latch.countDown();

}

}

}

/**

*取得当前主线程的HttpClient对象

*@return当前主线程的HttpClient对象

*/

privatestaticHttpClientgetHttpClient(){

if(httpClientThreadLocal.get()==null){

HttpClientclient=newDefaultHttpClient();

httpClientThreadLocal.set(client);

returnclient;

}else{

returnhttpClientThreadLocal.get();

}

}

/**

*取得当前主线程的UrlQueue对象

*@return当前主线程的UrlQueue对象

*/

privatestaticUrlQueuegetUrlQueue(){

if(urlQueueThreadLocal.get()==null){

UrlQueueurlQueue=newUrlQueue();

urlQueueThreadLocal.set(urlQueue);

returnurlQueue;

}else{

returnurlQueueThreadLocal.get();

}

}

/**

*取得当前主线程的ParseHtmlTool对象

*@paramacqu采集参数对象

*@return当前主线程的ParseHtmlTool对象

*/

privatestaticParseHtmlToolgetParseHtmlTool(CmsAcquisitionacqu){

if(parseHtmlToolThreadLocal.get()==null){

ParseHtmlToolparseHtmlTool=newHtmlParserImpl(acqu);

parseHtmlToolThreadLocal.set(parseHtmlTool);

returnparseHtmlTool;

}else{

returnparseHtmlToolThreadLocal.get();

}

}

/**

*连接和标题map对象入队列

*@parammap连接和标题map对象

*/

privatesynchronizedvoidaddUrlAndTitleMap(Map<String,String>map){

getUrlQueue().addUnVisitedUrl(map);

}

/**

*连接和标题map对象出队列

*@paramurlQueue当前线程的队列

*@return连接和标题map对象

*/

privatesynchronizedMap<String,String>getUrlAndTitleMap(UrlQueueurlQueue){

returnurlQueue.unVisitedUrlDeQueue();

}

/**

*判断当前对象是否为空

*@paramurlQueue当前线程的队列

*@returntrue/flase

*/

privatesynchronizedbooleanurlAndTitleMapIsEmpty(UrlQueueurlQueue){

returnurlQueue.isEmpty();

}

/**

*取得当前线程下所有计划的连接,并加入队列

*@paramacqu采集参数对象

*@paramhandler字符集对象

*@throwsURISyntaxException

*@throwsIOException

*@throwsClientProtocolException

*/

privatevoidgetAllUrls(CmsAcquisitionacqu,CharsetHandlerhandler)throwsURISyntaxException,ClientProtocolException,IOException{

acqu=cmsAcquisitionMng.start(acqu.getId());

String[]plans=acqu.getAllPlans();

Stringurl=null;

Stringhtml=null;

List<Map<String,String>>urlAndTitleListMap=null;

HttpGethttpGet=null;

for(inti=plans.length-acqu.getCurrNum();i>=0;i--){

url=plans[i];

httpGet=newHttpGet(newURI(url.trim()));

html=getHttpClient().execute(httpGet,handler);

urlAndTitleListMap=getParseHtmlTool(acqu).getUrlAndTitleMap(html);

for(Map<String,String>map:urlAndTitleListMap){

addUrlAndTitleMap(map);

}

}

System.out.println("=======当前线程:"+Thread.currentThread().getName()+"URL连接数:"+getUrlQueue().getUnVisitedUrl().getSize());

}

/**

*保存内容

*@paramacqu请求参数对象

*@paramhttpClienthttpClient对象

*@paramparseHtmlToolparseHtmlTool对象

*@paramhandlerCharsetHandler对象

*@parammap连接和标题map对象

*@returnContent

*/

privatesynchronizedContentsaveContent(CmsAcquisitionacqu,HttpClienthttpClient,ParseHtmlToolparseHtmlTool,CharsetHandlerhandler,Map<String,String>map){

try{

HttpGethttpGet=null;

if(map.get(LINK_KEY).contains("http://")){

httpGet=newHttpGet(newURI(map.get(LINK_KEY).trim()));

}else{

httpGet=newHttpGet(newURI("http://localhost/v7/"+map.get(LINK_KEY).trim()));

}

Stringhtml=httpClient.execute(httpGet,handler);

System.out.println("=============================子线程:"+Thread.currentThread().getName()+"执行");

Stringtxt=parseHtmlTool.getHtml(html);

returncmsAcquisitionMng.saveContent(map.get(TITLE_KEY),txt,acqu.getId());

//returnnull;

}catch(Exceptione){

log.warn(null,e);

e.printStackTrace();

returnnull;

}

}

/**

*字符集帮助类

*@authorAdministrator

*

*/

privateclassCharsetHandlerimplementsResponseHandler<String>{

privateStringcharset;

publicCharsetHandler(Stringcharset){

this.charset=charset;

}

publicStringhandleResponse(HttpResponseresponse)

throwsClientProtocolException,IOException{

StatusLinestatusLine=response.getStatusLine();

if(statusLine.getStatusCode()>=300){

thrownewHttpResponseException(statusLine.getStatusCode(),

statusLine.getReasonPhrase());

}

HttpEntityentity=response.getEntity();

if(entity!=null){

if(!StringUtils.isBlank(charset)){

returnEntityUtils.toString(entity,charset);

}else{

returnEntityUtils.toString(entity);

}

}else{

returnnull;

}

}

}

}

相关辅助类

HTML解析工具类接口:ParseHtmlTool.java

packagecom.jeecms.cms.service;

importjava.util.List;

importjava.util.Map;

/**
 * HTML parsing tool used by the acquisition service.
 *
 * @author javacoo
 * @since 2011-10-31
 */
public interface ParseHtmlTool {
    /**
     * Extracts the links found in the link area of a page.
     *
     * @param orginHtml original HTML
     * @return the links
     */
    List<String> getUrlList(String orginHtml);

    /**
     * Extracts the link titles found in the link area of a page.
     *
     * @param orginHtml original HTML
     * @return the titles
     */
    List<String> getTitleList(String orginHtml);

    /**
     * Extracts the HTML of the configured content area.
     *
     * @param orginHtml original HTML
     * @return HTML of the content area
     */
    String getHtml(String orginHtml);

    /**
     * Extracts link/title pairs, one map per link.
     *
     * @param orginHtml original HTML
     * @return list of link/title maps
     */
    List<Map<String, String>> getUrlAndTitleMap(String orginHtml);
}

HTML解析工具,HtmlParser实现类:HtmlParserImpl.java

packagecom.jeecms.cms.service;

importjava.io.BufferedReader;

importjava.io.File;

importjava.io.FileInputStream;

importjava.io.IOException;

importjava.io.InputStreamReader;

importjava.net.URISyntaxException;

importjava.util.ArrayList;

importjava.util.HashMap;

importjava.util.Iterator;

importjava.util.List;

importjava.util.Map;

importjava.util.regex.Matcher;

importjava.util.regex.Pattern;

importorg.apache.commons.lang.StringUtils;

importorg.htmlparser.Node;

importorg.htmlparser.NodeFilter;

importorg.htmlparser.Parser;

importorg.htmlparser.filters.HasAttributeFilter;

importorg.htmlparser.filters.NodeClassFilter;

importorg.htmlparser.filters.TagNameFilter;

importorg.htmlparser.nodes.RemarkNode;

importorg.htmlparser.util.NodeList;

importorg.htmlparser.util.ParserException;

importcom.jeecms.cms.entity.assist.CmsAcquisition;

/**

*HTML解析工具,HtmlParser实现类

*@authorjavacoo

*@since2011-10-31

*/

publicclassHtmlParserImplimplementsParseHtmlTool{

/**连接集合标志*/

privatestaticStringLINK_KEY="linkKey";

/**标题集合标志*/

privatestaticStringTITLE_KEY="titleKey";

/**单标签标志*/

privatestaticStringSINGLE_TAG="singleTag";

/**连接正则表达式*/

privatestaticStringLINK_REGX="<a.*href=\"(.*?)\".*>(.*?)</a>";

/**正则表达式对象*/

privatePatternpt=Pattern.compile(LINK_REGX);

/**采集参数bean*/

privateParamBeanparamBean;

publicHtmlParserImpl(CmsAcquisitionacqu){

parseRequestParam(acqu);

}

/**

*取得标题集合

*@paramorginHtml原始HTML

*@return标题集合

*/

publicList<String>getTitleList(StringorginHtml){

orginHtml=getHtmlByFilter(paramBean.getLinksetStartMap(),paramBean.getLinksetEndMap(),orginHtml);

if(StringUtils.isNotEmpty(orginHtml)){

returngetUrlOrTitleListByType(orginHtml,TITLE_KEY);

}

returnnull;

}

/**

*取得连接集合

*@paramorginHtml原始HTML

*@return连接集合

*/

publicList<String>getUrlList(StringorginHtml){

orginHtml=getHtmlByFilter(paramBean.getLinksetStartMap(),paramBean.getLinksetEndMap(),orginHtml);

if(StringUtils.isNotEmpty(orginHtml)){

returngetUrlOrTitleListByType(orginHtml,LINK_KEY);

}

returnnull;

}

/**

*取得指定区域的HTML内容

*@paramorginHtml原始HTML

*@return指定区域的HTML内容

*@throwsParserException

*/

publicStringgetHtml(StringorginHtml){

orginHtml=getHtmlByFilter(paramBean.getContentStartMap(),paramBean.getContentEndMap(),orginHtml);

returnorginHtml;

}

/**

*取得连接标题Map

*@paramorginHtml原始HTML

*@return连接标题Map

*/

publicList<Map<String,String>>getUrlAndTitleMap(StringorginHtml){

returngetUrlAandTitleMap(orginHtml);

}

/**

*解析采集参数,并封装到ParamBean

*@paramacqu原始采集参数

*@return采集参数封装bean

*/

privatevoidparseRequestParam(CmsAcquisitionacqu){

paramBean=newParamBean();

if(!StringUtils.isEmpty(acqu.getLinksetStart())){

paramBean.setLinksetStartMap(populateParamMap(acqu.getLinksetStart()));

}

if(!StringUtils.isEmpty(acqu.getLinksetEnd())){

paramBean.setLinksetEndMap(populateParamMap(acqu.getLinksetEnd()));

}

if(!StringUtils.isEmpty(acqu.getContentStart())){

paramBean.setContentStartMap(populateParamMap(acqu.getContentStart()));

}

if(!StringUtils.isEmpty(acqu.getContentEnd())){

paramBean.setContentEndMap(populateParamMap(acqu.getContentEnd()));

}

}

/**

*得到连接标题MAP

*@paramhtmlhtml内容

*@return连接或者标题集合

*/

privateList<Map<String,String>>getUrlAandTitleMap(Stringhtml){

html=getHtmlByFilter(paramBean.getLinksetStartMap(),paramBean.getLinksetEndMap(),html);

List<Map<String,String>>resultMapList=newArrayList<Map<String,String>>();

Map<String,String>resultMap=null;

Matcherm=pt.matcher(html);

while(m.find()){

if(StringUtils.isNotEmpty(m.group(1))&&StringUtils.isNotEmpty(m.group(2))){

resultMap=newHashMap<String,String>();

resultMap.put(LINK_KEY,m.group(1));

resultMap.put(TITLE_KEY,m.group(2));

resultMapList.add(resultMap);

}

}

returnresultMapList;

}

/**

*得到地址集

*@paramhtmlhtml内容

*@paramtype1:取得连接集合,2:取得标题集合

*@return连接或者标题集合

*/

privateList<String>getUrlOrTitleListByType(Stringhtml,Stringtype){

List<String>resultList=newArrayList<String>();

Matcherm=pt.matcher(html);

Stringresult="";

intpos=1;

if(TITLE_KEY.equals(type)){

pos=2;

}

while(m.find()){

result=m.group(pos);

resultList.add(result);

}

returnresultList;

}

/**

*取得指定区域的HTML内容

*@paramtagMap标签MAP

*@paramremoveTagMap要过滤的标签MAP

*@paramorginHtml原始HTML

*@return指定区域的HTML内容

*@throwsParserException

*/

privateStringgetHtmlByFilter(Map<String,String>tagMap,

Map<String,String>removeTagMap,StringorginHtml){

try{

Parserparser=newParser();

parser.setInputHTML(orginHtml);

//第一步取得指定属性/标签内容

StringtempKey=null;

StringtempValue=null;

String[]tempValueArr=null;

StringBuildersb=newStringBuilder();

NodeFilterfilter=null;

for(Iterator<String>it=tagMap.keySet().iterator();it.hasNext();){

tempKey=it.next();

tempValue=tagMap.get(tempKey);

if(tempValue.contains("|")){

tempValueArr=tempValue.split("\\|");

}else{

tempValueArr=newString[]{tempValue};

}

for(Stringvalue:tempValueArr){

filter=populateFilter(tempKey,value);

appendHtmlByFilter(parser,filter,sb);

}

}

//第二步过滤指定属性/标签内容

StringcontentHtml=sb.toString();

for(Iterator<String>it=removeTagMap.keySet().iterator();it

.hasNext();){

tempKey=it.next();

tempValue=removeTagMap.get(tempKey);

if(tempValue.contains("|")){

tempValueArr=tempValue.split("\\|");

}else{

tempValueArr=newString[]{tempValue};

}

for(Stringvalue:tempValueArr){

filter=populateFilter(tempKey,value);

contentHtml=removeHtmlByFilter(parser,filter,contentHtml);

}

}

//第三步过滤注释

filter=newNodeClassFilter(RemarkNode.class);

contentHtml=removeHtmlByFilter(parser,filter,contentHtml);

//System.out.println("=================================结果=======================================");

//System.out.println(contentHtml);

returncontentHtml;

}catch(ParserExceptione){

//TODOAuto-generatedcatchblock

e.printStackTrace();

}

return"";

}

/**

*解析并组装采集参数,支持标签属性/值形式和标签名称形式,可混合使用

*

约定采集参数格式如下

*

1,标签属性/值形式,如:class=articleList|tips,id=fxwb|fxMSN|fxMSN

*

2,标签名称形式,如:div,p,span

*

3,混合形式,如:class=articleList|tips,id=fxwb|fxMSN|fxMSN,div,p,span

*@paramparamStr参数字符串

*/

privateMap<String,String>populateParamMap(StringparamStr){

Map<String,String>paramMap=newHashMap<String,String>();

String[]paramStrArr=paramStr.split(",");

String[]tempStrArr=null;

StringBuildersb=newStringBuilder();

for(Stringtemp:paramStrArr){

if(temp.contains("=")){

tempStrArr=temp.split("=");

paramMap.put(tempStrArr[0],tempStrArr[1]);

}else{

if(StringUtils.isNotEmpty(temp)){

sb.append(temp).append("|");

}

}

}

if(StringUtils.isNotEmpty(sb.toString())){

paramMap.put(SINGLE_TAG,sb.substring(0,sb.length()-1));

}

returnparamMap;

}

/**

*组装过滤器

*@paramkey键

*@paramvalue值

*@return过滤器

*/

privateNodeFilterpopulateFilter(Stringkey,Stringvalue){

NodeFilterfilter;

if(SINGLE_TAG.equals(key)){

filter=newTagNameFilter(value);

}else{

filter=newHasAttributeFilter(key,value);

}

returnfilter;

}

/**

*过滤指定属性标签HTML

*@paramparser解析器

*@paramfilter属性过滤器

*@paramorginHtml原始HTML

*@return过滤后HTML

*@throwsParserException

*/

privateStringremoveHtmlByFilter(Parserparser,NodeFilterfilter,StringorginHtml)throwsParserException{

parser.setInputHTML(orginHtml);

NodeListnodes=parser.extractAllNodesThatMatch(filter);

for(inti=0;i<nodes.size();i++){

Nodetextnode=(Node)nodes.elementAt(i);

orginHtml=StringUtils.remove(orginHtml,textnode.toHtml());

}

returnorginHtml;

}

/**

*取得所有指定属性/标签的HTML

*@paramparser解析器

*@paramfilter过滤器

*@paramsb

*@throwsParserException

*/

privatevoidappendHtmlByFilter(Parserparser,NodeFilterfilter,

StringBuildersb)throwsParserException{

NodeListnodes=parser.extractAllNodesThatMatch(filter);

for(inti=0;i<nodes.size();i++){

Nodetextnode=(Node)nodes.elementAt(i);

sb.append(textnode.toHtml());

}

}

/**

*解析并组装采集参数,支持标签属性/值形式和标签名称形式,可混合使用

*

约定采集参数格式如下

*

1,标签属性/值形式,如:class=articleList|tips,id=fxwb|fxMSN|fxMSN

*

2,标签名称形式,如:div,p,span

*

3,混合形式,如:class=articleList|tips,id=fxwb|fxMSN|fxMSN,div,p,span

*@paramparamMap参数map

*@paramstr参数字符串

*/

privatevoidpopulateParamMap(Map<String,String>paramMap,StringparamStr){

String[]paramStrArr=paramStr.split(",");

String[]tempStrArr=null;

StringBuildersb=newStringBuilder();

for(Stringtemp:paramStrArr){

if(temp.contains("=")){

tempStrArr=temp.split("=");

paramMap.put(tempStrArr[0],tempStrArr[1]);

}else{

if(StringUtils.isNotEmpty(temp)){

sb.append(temp).append("|");

}

}

}

if(StringUtils.isNotEmpty(sb.toString())){

paramMap.put(SINGLE_TAG,sb.substring(0,sb.length()-1));

}

}

/**

*测试方法-打开文件并返回内容

*@paramszFileName文件绝对地址

*@paramcharset字符集

*@return内容

*/

publicstaticStringopenFile(StringszFileName,Stringcharset){

try{

BufferedReaderbis=newBufferedReader(newInputStreamReader(

newFileInputStream(newFile(szFileName)),charset));

StringBuilderszContent=newStringBuilder();

StringszTemp;

while((szTemp=bis.readLine())!=null){

szContent.append(szTemp).append("\n");

}

bis.close();

returnszContent.toString();

}catch(Exceptione){

return"";

}

}

/**

*测试取得连接地址和标题

*@throwsParserException

*/

publicvoidtestFetchLinkAndTitle()throwsParserException{

Stringhtml=openFile("F:\\4.htm","UTF-8");

Stringresult="";

Map<String,String>map=newHashMap<String,String>();

map.put("class","m_list");

Map<String,String>notMap=newHashMap<String,String>();

//notMap.put("class","atc_ic_f");

result=getHtmlByFilter(map,notMap,html);

System.out.println("=============================result============================");

System.out.println(result);

System.out.println("==========================================================");

Patternpt=Pattern.compile("<a.*href=\"(.*?)\".*>(.*?)</a>");

Matcherm=pt.matcher(result);

Stringlink=null;

Stringtitle=null;

while(m.find()){

link=m.group(1);

title=m.group(2);

if(StringUtils.isNotEmpty(link)){

System.out.println("url:"+link);

System.out.println("title:"+title);

}

}

}

/**

*测试取得内容

*@throwsParserException

*/

publicvoidtestFetchContent()throwsParserException{

Stringhtml=openFile("F:\\6.shtml","GB2312");

Map<String,String>map=newHashMap<String,String>();

map.put("id","artibody");

Map<String,String>notMap=newHashMap<String,String>();

notMap.put(SINGLE_TAG,"style|script");

notMap.put("type","text/javascript");

notMap.put("class","icon_fx|blkCommentotherContent_01");

notMap.put("style","text-align:right;padding-right:10px;|margin-top:6px;|font-size:12px!important;|font-size:12px");

notMap.put("id","fxwb|fxMSN|fxMSN|comment_t_show_top");

getHtmlByFilter(map,notMap,html);

}

/**

*测试解析参数

*/

publicvoidtestParseParam(){

Map<String,String>map=newHashMap<String,String>();

populateParamMap(map,"class=articleList|tips,p,div");

StringtempKey=null;

StringtempValue=null;

String[]tempValueArr=null;

for(Iterator<String>it=map.keySet().iterator();it.hasNext();){

tempKey=it.next();

tempValue=map.get(tempKey);

if(tempValue.contains("|")){

tempValueArr=tempValue.split("\\|");

}else{

tempValueArr=newString[]{tempValue};

}

for(Stringvalue:tempValueArr){

System.out.println("tempKey:"+tempKey);

System.out.println("value:"+value);

}

}

}

/**

*测试过滤标签

*@throwsParserException

*/

publicvoidtestRemarkFilter()throwsParserException{

Stringhtml=openFile("F:\\6.shtml","GB2312");

System.out.println("=========================过滤注释前HTML==================================");

System.out.println(html);

NodeFilterfilter=newNodeClassFilter(RemarkNode.class);

html=removeHtmlByFilter(newParser(),filter,html);

System.out.println("=========================过滤注释后HTML==================================");

System.out.println(html);

}

publicstaticvoidmain(String[]args)throwsParserException,

URISyntaxException,IOException{

HtmlParserImplparseHtmlTool=newHtmlParserImpl(newCmsAcquisition());

//parseHtmlTool.testParseParam();

//parseHtmlTool.testFetchLinkAndTitle();

//parseHtmlTool.testFetchContent();

//parseHtmlTool.testRemarkFilter();

}

}

采集参数封装bean:ParamBean.java

packagecom.jeecms.cms.service;

importjava.util.HashMap;

importjava.util.Map;

/**
 * Holder of the parsed acquisition parameters. Every map defaults to an empty map.
 *
 * @author javacoo
 * @since 2011-10-31
 */
public class ParamBean {
    /** Filters selecting the link area. */
    private Map<String, String> linksetStartMap = new HashMap<String, String>();

    /** Filters selecting what is stripped from the link area. */
    private Map<String, String> linksetEndMap = new HashMap<String, String>();

    /** Filters selecting the content area. */
    private Map<String, String> contentStartMap = new HashMap<String, String>();

    /** Filters selecting what is stripped from the content area. */
    private Map<String, String> contentEndMap = new HashMap<String, String>();

    public Map<String, String> getLinksetStartMap() {
        return linksetStartMap;
    }

    public void setLinksetStartMap(Map<String, String> linksetStartMap) {
        this.linksetStartMap = linksetStartMap;
    }

    public Map<String, String> getLinksetEndMap() {
        return linksetEndMap;
    }

    public void setLinksetEndMap(Map<String, String> linksetEndMap) {
        this.linksetEndMap = linksetEndMap;
    }

    public Map<String, String> getContentStartMap() {
        return contentStartMap;
    }

    public void setContentStartMap(Map<String, String> contentStartMap) {
        this.contentStartMap = contentStartMap;
    }

    public Map<String, String> getContentEndMap() {
        return contentEndMap;
    }

    public void setContentEndMap(Map<String, String> contentEndMap) {
        this.contentEndMap = contentEndMap;
    }
}

队列类:Queue.java

packagecom.jeecms.cms.service;

importjava.util.LinkedList;

/**
 * Minimal FIFO queue backed by a {@link LinkedList}. The class does no locking of its own.
 *
 * @author javacoo
 * @since 2011-11-01
 * @param <T> element type
 */
public class Queue<T> {
    private final LinkedList<T> queue = new LinkedList<T>();

    /**
     * Appends an element at the tail.
     *
     * @param t element to add
     */
    public void enQueue(T t) {
        queue.addLast(t);
    }

    /**
     * Removes and returns the head element.
     *
     * @return the head element
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public T deQueue() {
        return queue.removeFirst();
    }

    /**
     * Tells whether the queue holds no elements.
     *
     * @return {@code true} if the queue is empty
     */
    public boolean isEmpty() {
        return queue.isEmpty();
    }

    /**
     * Tells whether the queue holds an element equal to {@code t}.
     *
     * @param t element to look for
     * @return {@code true} if such an element is queued
     */
    public boolean contains(T t) {
        return queue.contains(t);
    }

    /**
     * Returns the number of queued elements.
     *
     * @return queue size
     */
    public int getSize() {
        return queue.size();
    }
}

URL队列:UrlQueue.java

packagecom.jeecms.cms.service;

importjava.util.Map;

importorg.springframework.util.CollectionUtils;

/**

*URL队列

*@authorjavacoo

*@since2011-11-01

*@param<T>

*/

publicclassUrlQueue{

/**待访问URL集合*/

privateQueue<Map<String,String>>unVisitedUrl=newQueue<Map<String,String>>();

/**

*获得URL队列

*@return

*/

publicQueue<Map<String,String>>getUnVisitedUrl(){

returnunVisitedUrl;

}

/**

*未访问的URL出队列

*@return

*/

publicMap<String,String>unVisitedUrlDeQueue(){

returnunVisitedUrl.deQueue();

}

/**

*保证每个URL只被访问一次

*@paramurl

*/

publicvoidaddUnVisitedUrl(Map<String,String>urlMap){

if(!CollectionUtils.isEmpty(urlMap)&&!unVisitedUrl.contains(urlMap)){

unVisitedUrl.enQueue(urlMap);

}

}

/**

*判断是否为空

*@return

*/

publicbooleanisEmpty(){

returnunVisitedUrl.isEmpty();

}

}

相关推荐