spider简单的爬虫程序

wusiye 2010-01-05

spider简单的爬虫程序

1、基础准备

htmlparser

首页:http://sourceforge.net/projects/htmlparser/

下载:http://sourceforge.net/project/showfiles.php?group_id=24399

文件:htmlparser1_6_20060610.zip

<dependency>

<groupId>org.htmlparser</groupId>

<artifactId>htmlparser</artifactId>

<version>1.6</version>

</dependency>

cpdetector

首页:http://cpdetector.sourceforge.net/

下载:http://sourceforge.net/project/showfiles.php?group_id=114421

文件:cpdetector_eclipse_project_1.0.7.zip

<dependency>

<groupId>cpdetector</groupId>

<artifactId>cpdetector</artifactId>

<version>1.0.5</version>

</dependency>

spindle

首页:http://www.bitmechanic.com/projects/spindle/(但是已经无法访问)

2、修改spindle代码得到的spider

简单的将URL打印出来了,解析的内容等等都没有处理

解析HTML的基类HtmlParserUtil.java

packagecom.sillycat.api.commons.utils.html;

importjava.io.BufferedReader;

importjava.io.FileNotFoundException;

importjava.io.IOException;

importjava.io.InputStream;

importjava.io.InputStreamReader;

importjava.io.UnsupportedEncodingException;

importjava.net.MalformedURLException;

importjava.net.SocketException;

importjava.net.SocketTimeoutException;

importjava.net.URL;

importjava.net.UnknownHostException;

importjava.nio.charset.Charset;

importorg.htmlparser.Parser;

importorg.htmlparser.util.NodeList;

importorg.htmlparser.util.ParserException;

importorg.htmlparser.visitors.HtmlPage;

importcpdetector.io.ASCIIDetector;

importcpdetector.io.CodepageDetectorProxy;

importcpdetector.io.JChardetFacade;

importcpdetector.io.ParsingDetector;

importcpdetector.io.UnicodeDetector;

publicclassHtmlParserUtil{

/*StringBuffer的缓冲区大小*/

publicstaticintTRANSFER_SIZE=4096;

/*当前平台的行分隔符*/

publicstaticStringlineSep=System.getProperty("line.separator");

/*自动探测页面编码,避免中文乱码的出现*/

publicstaticStringautoDetectCharset(URLurl){

CodepageDetectorProxydetector=CodepageDetectorProxy.getInstance();

/**

*ParsingDetector可用于检查HTML、XML等文件或字符流的编码构造方法中的参数用于指示是否显示探测过程的详细信息

*为false则不显示

*/

detector.add(newParsingDetector(false));

detector.add(JChardetFacade.getInstance());

detector.add(ASCIIDetector.getInstance());

detector.add(UnicodeDetector.getInstance());

Charsetcharset=null;

try{

charset=detector.detectCodepage(url);

}catch(MalformedURLExceptionmue){

mue.printStackTrace();

}catch(IOExceptionie){

ie.printStackTrace();

}

if(charset==null)

charset=Charset.defaultCharset();

returncharset.name();

}

/*按照指定编码解析标准的html页面,为建立索引做准备*/

publicstaticString[]parseHtml(Stringurl,Stringcharset){

Stringresult[]=null;

Stringcontent=null;

try{

URLsource=newURL(url);

InputStreamin=source.openStream();

BufferedReaderreader=newBufferedReader(newInputStreamReader(

in,charset));

Stringline=newString();

StringBuffertemp=newStringBuffer(TRANSFER_SIZE);

while((line=reader.readLine())!=null){

temp.append(line);

temp.append(lineSep);

}

reader.close();

in.close();

content=temp.toString();

}catch(UnsupportedEncodingExceptionuee){

uee.printStackTrace();

}catch(MalformedURLExceptionmue){

System.err.println("InvalidURL:"+url);

}catch(UnknownHostExceptionuhe){

System.err.println("UnknowHost:"+url);

}catch(SocketExceptionse){

System.err.println("SocketError:"+se.getMessage()+""+url);

}catch(SocketTimeoutExceptionste){

System.err.println("SocketConnectionTimeOut:"+url);

}catch(FileNotFoundExceptionfnfe){

System.err.println("brokenlink"

+((FileNotFoundException)fnfe.getCause()).getMessage()

+"ignored");

}catch(IOExceptionie){

ie.printStackTrace();

}

if(content!=null){

ParsermyParser=Parser.createParser(content,charset);

HtmlPagevisitor=newHtmlPage(myParser);

try{

myParser.visitAllNodesWith(visitor);

Stringbody=null;

Stringtitle="Untitled";

if(visitor.getBody()!=null){

NodeListnodelist=visitor.getBody();

body=nodelist.asString().trim();

}

if(visitor.getTitle()!=null){

title=visitor.getTitle();

}

result=newString[]{body,title};

}catch(ParserExceptionpe){

pe.printStackTrace();

}

}

returnresult;

}

}

多线程爬虫类HtmlCaptureRunner.java

packagecom.sillycat.api.thread.runner;

importjava.io.FileNotFoundException;

importjava.io.IOException;

importjava.net.HttpURLConnection;

importjava.net.MalformedURLException;

importjava.net.SocketException;

importjava.net.SocketTimeoutException;

importjava.net.URL;

importjava.net.UnknownHostException;

importjava.util.ArrayList;

importjava.util.HashSet;

importorg.apache.commons.logging.Log;

importorg.apache.commons.logging.LogFactory;

importorg.htmlparser.Parser;

importorg.htmlparser.PrototypicalNodeFactory;

importorg.htmlparser.filters.AndFilter;

importorg.htmlparser.filters.HasAttributeFilter;

importorg.htmlparser.filters.NodeClassFilter;

importorg.htmlparser.tags.BaseHrefTag;

importorg.htmlparser.tags.FrameTag;

importorg.htmlparser.tags.LinkTag;

importorg.htmlparser.tags.MetaTag;

importorg.htmlparser.util.EncodingChangeException;

importorg.htmlparser.util.NodeIterator;

importorg.htmlparser.util.NodeList;

importorg.htmlparser.util.ParserException;

importcom.sillycat.api.commons.utils.StringUtil;

importcom.sillycat.api.commons.utils.html.HtmlParserUtil;

publicclassHtmlCaptureRunnerimplementsRunnable{

publicLoglogger=LogFactory.getLog(getClass());

/*基准(初始)URL*/

protectedStringbaseURL=null;

privateStringcontentPath=null;

/**

*待解析的URL地址集合,所有新检测到的链接均存放于此;解析时按照先入先出(First-InFirst-Out)法则线性取出

*/

protectedArrayListURLs=newArrayList();

/*已存储的URL地址集合,避免链接的重复抓取*/

protectedHashSetindexedURLs=newHashSet();

protectedParserparser=newParser();;

/*程序运行线程数,默认2个线程*/

protectedintthreads=2;

/*解析页面时的字符编码*/

protectedStringcharset;

/*基准端口*/

protectedintbasePort;

/*基准主机*/

protectedStringbaseHost;

/*是否存储,默认true*/

protectedbooleanjustDatabase=true;

/*检测索引中是否存在当前URL信息,避免重复抓取*/

protectedbooleanisRepeatedCheck=false;

publicHtmlCaptureRunner(){

PrototypicalNodeFactoryfactory=newPrototypicalNodeFactory();

factory.registerTag(newLocalLinkTag());

factory.registerTag(newLocalFrameTag());

factory.registerTag(newLocalBaseHrefTag());

parser.setNodeFactory(factory);

}

publicvoidcapture(){

URLs.clear();

URLs.add(getBaseURL());

intresponseCode=0;

StringcontentType="";

try{

HttpURLConnectionuc=(HttpURLConnection)newURL(baseURL)

.openConnection();

responseCode=uc.getResponseCode();

contentType=uc.getContentType();

}catch(MalformedURLExceptionmue){

logger.error("InvalidURL:"+getBaseURL());

}catch(UnknownHostExceptionuhe){

logger.error("UnknowHost:"+getBaseURL());

}catch(SocketExceptionse){

logger.error("SocketError:"+se.getMessage()+""

+getBaseURL());

}catch(IOExceptionie){

logger.error("IOException:"+ie);

}

if(responseCode==HttpURLConnection.HTTP_OK

&&contentType.startsWith("text/html")){

try{

charset=HtmlParserUtil.autoDetectCharset(newURL(baseURL));

basePort=newURL(baseURL).getPort();

baseHost=newURL(baseURL).getHost();

if(charset.equals("windows-1252"))

charset="GBK";

longstart=System.currentTimeMillis();

ArrayListthreadList=newArrayList();

for(inti=0;i<threads;i++){

Threadt=newThread(this,"SpiderThread#"+(i+1));

t.start();

threadList.add(t);

}

while(threadList.size()>0){

Threadchild=(Thread)threadList.remove(0);

try{

child.join();

}catch(InterruptedExceptionie){

logger.error("InterruptedException:"+ie);

}

}

//for(inti=0;i<threads;i++){

//threadPool.getThreadPoolExcutor().execute(new

//Thread(this,"SpiderThread#"+(i+1)));

//}

longelapsed=System.currentTimeMillis()-start;

logger.info("Finishedin"+(elapsed/1000)+"seconds");

logger.info("TheCountoftheLinksCapturedis"

+indexedURLs.size());

}catch(MalformedURLExceptione){

e.printStackTrace();

}

}

}

publicvoidrun(){

Stringurl;

while((url=dequeueURL())!=null){

if(justDatabase){

process(url);

}

}

threads--;

}

/**

*处理单独的URL地址,解析页面并加入到lucene索引中;通过自动探测页面编码保证抓取工作的顺利执行

*/

protectedvoidprocess(Stringurl){

Stringresult[];

Stringcontent=null;

Stringtitle=null;

result=HtmlParserUtil.parseHtml(url,charset);

content=result[0];

title=result[1];

if(content!=null&&content.trim().length()>0){

//content

System.out.println(url);

//title

//DateTools.timeToString(System.currentTimeMillis()

}

}

/*从URL队列mPages里取出单个的URL*/

publicsynchronizedStringdequeueURL(){

while(true)

if(URLs.size()>0){

Stringurl=(String)URLs.remove(0);

indexedURLs.add(url);

if(isToBeCaptured(url)){

NodeListlist;

try{

intbookmark=URLs.size();

/*获取页面所有节点*/

parser.setURL(url);

try{

list=newNodeList();

for(NodeIteratore=parser.elements();e

.hasMoreNodes();)

list.add(e.nextNode());

}catch(EncodingChangeExceptionece){

/*解码出错的异常处理*/

parser.reset();

list=newNodeList();

for(NodeIteratore=parser.elements();e

.hasMoreNodes();)

list.add(e.nextNode());

}

/**

*依据http://www.robotstxt.org/wc/meta-user.html处理

*Robots<META>tag

*/

NodeListrobots=list

.extractAllNodesThatMatch(

newAndFilter(newNodeClassFilter(

MetaTag.class),

newHasAttributeFilter("name",

"robots")),true);

if(0!=robots.size()){

MetaTagrobot=(MetaTag)robots.elementAt(0);

Stringcontent=robot.getAttribute("content")

.toLowerCase();

if((-1!=content.indexOf("none"))

||(-1!=content.indexOf("nofollow")))

for(inti=bookmark;i<URLs.size();i++)

URLs.remove(i);

}

}catch(ParserExceptionpe){

logger.error("ParserException:"+pe);

}

returnurl;

}

}else{

threads--;

if(threads>0){

try{

wait();

threads++;

}catch(InterruptedExceptionie){

logger.error("InterruptedException:"+ie);

}

}else{

notifyAll();

returnnull;

}

}

}

privatebooleanisHTML(Stringurl){

if(!url.endsWith(".html")){

returnfalse;

}

if(StringUtil.isNotBlank(contentPath)){

if(!url.startsWith(baseURL+"/"+contentPath)){

returnfalse;

}

}

returntrue;

}

/**

*判断提取到的链接是否符合解析条件;标准为Port及Host与基准URL相同且类型为text/html或text/plain

*/

publicbooleanisToBeCaptured(Stringurl){

booleanflag=false;

HttpURLConnectionuc=null;

intresponseCode=0;

StringcontentType="";

Stringhost="";

intport=0;

try{

URLsource=newURL(url);

Stringprotocol=source.getProtocol();

if(protocol!=null&&protocol.equals("http")){

host=source.getHost();

port=source.getPort();

uc=(HttpURLConnection)source.openConnection();

uc.setConnectTimeout(8000);

responseCode=uc.getResponseCode();

contentType=uc.getContentType();

}

}catch(MalformedURLExceptionmue){

logger.error("InvalidURL:"+url);

}catch(UnknownHostExceptionuhe){

logger.error("UnknowHost:"+url);

}catch(SocketExceptionse){

logger.error("SocketError:"+se.getMessage()+""+url);

}catch(SocketTimeoutExceptionste){

logger.error("SocketConnectionTimeOut:"+url);

}catch(FileNotFoundExceptionfnfe){

logger.error("brokenlink"+url+"ignored");

}catch(IOExceptionie){

logger.error("IOException:"+ie);

}

if(port==basePort

&&responseCode==HttpURLConnection.HTTP_OK

&&host.equals(baseHost)

&&(contentType.startsWith("text/html")||contentType

.startsWith("text/plain")))

flag=true;

returnflag;

}

classLocalLinkTagextendsLinkTag{

publicvoiddoSemanticAction(){

Stringlink=getLink();

if(link.endsWith("/"))

link=link.substring(0,link.length()-1);

intpos=link.indexOf("#");

if(pos!=-1)

link=link.substring(0,pos);

/*将链接加入到处理队列中*/

if(!(indexedURLs.contains(link)||URLs.contains(link))){

if(isHTML(link)){

URLs.add(link);

}

}

setLink(link);

}

}

/**

*FrametagthatrewritestheSRCURLs.TheSRCURLsaremappedtolocal

*targetsiftheymatchthesource.

*/

classLocalFrameTagextendsFrameTag{

publicvoiddoSemanticAction(){

Stringlink=getFrameLocation();

if(link.endsWith("/"))

link=link.substring(0,link.length()-1);

intpos=link.indexOf("#");

if(pos!=-1)

link=link.substring(0,pos);

/*将链接加入到处理队列中*/

if(!(indexedURLs.contains(link)||URLs.contains(link))){

if(isHTML(link)){

URLs.add(link);

}

}

setFrameLocation(link);

}

}

/**

*Basetagthatdoesn'tshow.ThetoHtml()methodisoverriddentoreturn

*anemptystring,effectivelyshuttingoffthebasereference.

*/

classLocalBaseHrefTagextendsBaseHrefTag{

publicStringtoHtml(){

return("");

}

}

publicStringgetBaseURL(){

returnbaseURL;

}

publicvoidsetBaseURL(StringbaseURL){

this.baseURL=baseURL;

}

publicintgetThreads(){

returnthreads;

}

publicvoidsetThreads(intthreads){

this.threads=threads;

}

publicStringgetCharset(){

returncharset;

}

publicvoidsetCharset(Stringcharset){

this.charset=charset;

}

publicintgetBasePort(){

returnbasePort;

}

publicvoidsetBasePort(intbasePort){

this.basePort=basePort;

}

publicStringgetBaseHost(){

returnbaseHost;

}

publicvoidsetBaseHost(StringbaseHost){

this.baseHost=baseHost;

}

publicbooleanisJustDatabase(){

returnjustDatabase;

}

publicvoidsetJustDatabase(booleanjustDatabase){

this.justDatabase=justDatabase;

}

publicStringgetContentPath(){

returncontentPath;

}

publicvoidsetContentPath(StringcontentPath){

this.contentPath=contentPath;

}

}

spring上的配置文件applicationContext-bean.xml:

<bean id="productCapture"

class="com.sillycat.api.thread.runner.HtmlCaptureRunner">

<property name="contentPath" value="${product.contentPath}"/>

<property name="basePort" value="${product.base.port}"/>

<property name="baseURL" value="${product.base.url}"/>

<property name="charset" value="${product.base.code}"/>

<property name="threads" value="${product.base.threads}"/>

</bean>

<bean id="messageCapture"

class="com.sillycat.api.thread.runner.HtmlCaptureRunner">

<property name="contentPath" value="${message.contentPath}"/>

<property name="basePort" value="${message.base.port}"/>

<property name="baseURL" value="${message.base.url}"/>

<property name="charset" value="${message.base.code}"/>

<property name="threads" value="${message.base.threads}"/>

</bean>

easySearch.properties配置文件:

#==========================================

# spider configuration

#=========================================

product.contentPath=product

product.base.port=80

product.base.url=http://www.safedv.com

product.base.code=UTF-8

product.base.threads=3

message.contentPath=message

message.base.port=80

message.base.url=http://www.safedv.com

message.base.code=UTF-8

message.base.threads=3

单元测试类HtmlRunnerTest.java文件:

packagecom.sillycat.api.thread;

importcom.sillycat.api.commons.base.BaseManagerTest;

importcom.sillycat.api.thread.runner.HtmlCaptureRunner;

publicclassHtmlRunnerTestextendsBaseManagerTest{

privateHtmlCaptureRunnerproductCapture;

privateHtmlCaptureRunnermessageCapture;

protectedvoidsetUp()throwsException{

super.setUp();

productCapture=(HtmlCaptureRunner)appContext.getBean("productCapture");

messageCapture=(HtmlCaptureRunner)appContext.getBean("messageCapture");

}

protectedvoidtearDown()throwsException{

super.tearDown();

}

publicvoidtestDumy(){

assertTrue(true);

}

publicvoidntestProductCapture(){

productCapture.capture();

}

publicvoidtestMessageCapture(){

messageCapture.capture();

}

}

相关推荐