zzcchunter 2007-11-16
我现在是用一台主机抓取数据,所以我想把Heritrix的链接散列到多个线程中。可是,当我用于散列的ELFHashAssignmentPolicy写好后,第一次执行时,只解析出30个dns:任务,抓取就自动结束了;而第二次或第三次执行时,就可以实现多线程抓取了。
另外我已经把Heritrix.properties文件和AbstractFrontier中相应的位置都已经改了,希望您能帮我看看,谢谢了。
/*******************************************************************************
*文件说明:
*
*项目名:WebCrawler
*文件名:ELFHashAssignmentPolicy.java
*包名:com.hotct.heritrixExt.common.frontier
*
*创建人:zhangzhenxin
*创建时间:下午03:50:01
*创建日期:2007-10-30
******************************************************************************/
packagecom.hotct.heritrixExt.common.frontier;
importjava.util.logging.Level;
importjava.util.logging.Logger;
importorg.apache.commons.httpclient.URIException;
importorg.archive.crawler.datamodel.CandidateURI;
importorg.archive.crawler.framework.CrawlController;
importorg.archive.crawler.frontier.HostnameQueueAssignmentPolicy;
importorg.archive.crawler.frontier.QueueAssignmentPolicy;
importorg.archive.net.UURI;
importorg.archive.net.UURIFactory;
/**
*<h>类型描述</h>
*
*@authorzhangzhenxin
*@date2007-10-30
*/
publicclassELFHashAssignmentPolicyextendsQueueAssignmentPolicy{
privatestaticfinalLoggerlogger=Logger
.getLogger(ELFHashAssignmentPolicy.class.getName());
privatestaticStringDEFAULT_CLASS_KEY="default...";
privatestaticfinalStringDNS="dns";
/**
*
*/
@Override
publicStringgetClassKey(CrawlControllercontroller,CandidateURIcauri){
Stringuri=cauri.getUURI().toString();
Stringscheme=cauri.getUURI().getScheme();
Stringcandidate=null;
try{
if(scheme.equals(DNS)){
if(cauri.getVia()!=null){
//SpecialhandlingforDNS:treatasbeing
//ofthesameclassasthetriggeringURI.
//WhenaURIincludesaport,thisensures
//theDNSlookupgoesatopthehost:port
//queuethattriggeredit,ratherthan
//someotherhostqueue
UURIviaUuri=UURIFactory.getInstance(cauri.flattenVia());
candidate=viaUuri.getAuthorityMinusUserinfo();
//adoptschemeoftriggeringURI
scheme=viaUuri.getScheme();
}else{
candidate=cauri.getUURI().getReferencedHost();
}
}else{
//Stringuri=cauri.getUURI().toString();
longhash=ELFHash(uri);
candidate=Long.toString(hash%100);
}
if(candidate==null||candidate.length()==0){
candidate=DEFAULT_CLASS_KEY;
}
}catch(URIExceptione){
logger.log(Level.INFO,
"unabletoextractclasskey;usingdefault",e);
candidate=DEFAULT_CLASS_KEY;
}
returncandidate.replace(':','#');
}
publicstaticlongELFHash(Stringstr){
longhash=0;
longx=0;
for(inti=0;i<str.length();i++){
hash=(hash<<4)+str.charAt(i);
if((x=hash&0xF0000000L)!=0){
hash^=(x>>24);
hash&=~x;
}
}
return(hash&0x7FFFFFFF);
}
}