htmlparser网页抓取

RedGuyanluo 2016-02-03

import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.log4j.Logger;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

/**

*分析www.cheshi.com首页新闻

*@authorj.li

*/

publicclassHtmlParser{

privatestaticLoggerlogger;

privateConnectionconn=null;

privatestaticfinalStringSitename="";

publicvoidindexNewsContent(Stringsitepath)throwsException{

logger.info("分析网站【"+sitepath+"】首页的新闻列表,内容为【<divclass=\"hotjd\"></div>】所有网页新闻地址的HTML内容。");

ParsermyParser=newParser(sitepath);

myParser.setEncoding("UTF-8");

NodeListnodeList=myParser.extractAllNodesThatMatch(newNodeFilter(){

publicbooleanaccept(Nodenode){

return((nodeinstanceofTag)

&&!((Tag)node).isEndTag()

&&((Tag)node).getTagName().equals("DIV")

&&((Tag)node).getAttribute("class")!=null

&&((Tag)node).getAttribute("class").equals("descclearfix"));

}

});

for(inti=0,len=nodeList.size();i<len;i++){

Nodenode=nodeList.elementAt(i);

logger.debug(node.toHtml());

System.out.println(node.toHtml());

System.out.println("------------------------------------------------------------------------------------------------------");

//extractText(node.toHtml());

}

}

publicvoidextractText(StringinputHtml)throwsException{

Parserparser=Parser.createParser(inputHtml,"GBK");

TagNameFilterfilter=newTagNameFilter("a");

NodeListnodeList=parser.extractAllNodesThatMatch(filter);

NodeIteratorit=nodeList.elements();

getConnection();

while(it.hasMoreNodes()){

LinkTagnode=(LinkTag)it.nextNode();

Stringhref=node.getLink();

Stringtitle=node.getLinkText();

logger.info("分析首页新闻【"+title+"】,链接地址【"+href+"】");

try{

if(!newsExist(title)){

insertDataBase(title,extractContent(href));

}else{

logger.info("新闻【"+title+"】数据库中已经存在,忽略进入下一个新闻分析!");

}

}catch(SQLExceptione){

logger.error("插入数据库新闻记录异常!"+e.getMessage());

e.printStackTrace();

}catch(Exceptione){

logger.error(e.getMessage());

logger.info("分析新闻【"+title+"】,链接地址【"+href+"】失败,进入下一个新闻分析。");

e.printStackTrace();

}

}

closeConnection();

}

publicStringextractContent(Stringcontent)throwsException{

try{

ParsermyParser=newParser(content);

myParser.setEncoding("GBK");

NodeListnodeList=myParser.extractAllNodesThatMatch(newNodeFilter(){

publicbooleanaccept(Nodenode){

return((nodeinstanceofTag)

&&!((Tag)node).isEndTag()

&&((Tag)node).getTagName().equals("DIV")

&&((Tag)node).getAttribute("class")!=null

&&((Tag)node).getAttribute("class").equals("cs_content"));

}

});

intsize=nodeList.size();

Nodenode=nodeList.elementAt(size-1);

content=node.toHtml();

logger.debug("==========extractContent==============");

logger.debug(content);

}catch(Exceptionpe){

logger.error("分析新闻页面出现异常!"+pe.getMessage()+"原因可能出现于新闻页面不存在<divclass=\"cs_content\"></div>标记。");

throwpe;

}

returnremoveTagA(content);

}

/**

*去除新闻中href包含cheshi.com的<a>标签

*@paramcontent分析html内容

*@return分析处理后的html内容

*/

publicStringremoveTagA(Stringcontent)throwsParserException{

ParsermyParser=newParser(content);

myParser.setEncoding("GBK");

NodeListnodeList=myParser.extractAllNodesThatMatch(newTagNameFilter("a"));

SimpleNodeIteratorit=nodeList.elements();

while(it.hasMoreNodes()){

LinkTagnode=(LinkTag)it.nextNode();

logger.info("移除新闻内容中包含的文字、图片的链接【"+node.toHtml()+"】。");

if(node.getLink().indexOf("cheshi.com")>-1)

content=content.replace(node.toHtml(),node.getStringText());

}

logger.debug("==========removeTagA==============");

logger.debug(content);

returndownloadImages(content,"D:\\autodata\\upload\\intersite",SiteName+"upload/intersite");

}

publicStringdownloadImages(Stringcontent,StringuploadImgPath,Stringlocalhost)throwsParserException{

Filef=newFile(uploadImgPath);

if(!f.exists()){

f.mkdirs();

}

ParsermyParser=newParser(content);

myParser.setEncoding("GBK");

NodeListnodeList=myParser.extractAllNodesThatMatch(newTagNameFilter("img"));

SimpleNodeIteratorit=nodeList.elements();

while(it.hasMoreNodes()){

Tagtag=(Tag)it.nextNode();

Stringsrc=tag.getAttribute("src");

Stringfilename=src.substring(src.lastIndexOf("/")+1);

InputStreamis=null;

FileOutputStreamfos=null;

try{

URLurl=newURL(src);

is=url.openStream();

intbytesRead=0;

byte[]buff=newbyte[1024];

fos=newFileOutputStream(uploadImgPath+"/"+filename);

while((bytesRead=is.read(buff,0,buff.length))!=-1){

fos.write(buff,0,bytesRead);

}

content=content.replace(src,localhost+"/"+filename);

}catch(FileNotFoundExceptionnotFoundException){

notFoundException.printStackTrace();

}catch(IOExceptionioe){

ioe.printStackTrace();

}finally{

try{

if(fos!=null)fos.close();

if(is!=null)is.close();

}catch(IOExceptionioe){

ioe.printStackTrace();

}

}

}

logger.debug("=================downloadImages==================");

logger.debug(content);

returncontent;

}

publicvoidgetConnection(){

try{

Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");

StringstrCon="jdbc:microsoft:sqlserver://192.168.99.188:12580;databaseName=Project2009;SelectMethod=cursor";

StringstrUsername="sa";

StringstrPWD="qsyjcsxdl@@@web2009@@@";

conn=DriverManager.getConnection(strCon,strUserName,strPWD);

}catch(java.lang.ClassNotFoundExceptioncnfe){

cnfe.printStackTrace();

}catch(SQLExceptionse){

se.printStackTrace();

}

}

publicvoidcloseConnection(){

try{

if(conn!=null&&!conn.isClosed())conn.close();

}catch(SQLExceptionse){

se.printStackTrace();

}

}

publicvoidinsertDataBase(StringnewsTitle,StringnewsContent)throwsSQLException{

PreparedStatementpstmt=null;

try{

pstmt=conn.prepareStatement("INSERTINTOFumNews(NewsTitle,NewsContext,NewsState)values(?,?,?)");

pstmt.setString(1,newsTitle);

pstmt.setString(2,newsContent);

pstmt.setInt(3,1);

pstmt.executeUpdate();

}catch(SQLExceptione){

throwe;

}finally{

try{

if(pstmt!=null)pstmt.close();

}catch(SQLExceptione){

e.printStackTrace();

}

}

}

publicbooleannewsExist(Stringtitle)throwsSQLException{

PreparedStatementpstmt=null;

try{

pstmt=conn.prepareStatement("SELECTtop1NewsIdfromFumNewswhereNewsTitle=?");

pstmt.setString(1,title);

ResultSetrs=pstmt.executeQuery();

returnrs.next();

}catch(SQLExceptione){

throwe;

}finally{

try{

if(pstmt!=null)pstmt.close();

}catch(SQLExceptione){

e.printStackTrace();

}

}

}

publicstaticvoidmain(String[]args){

HtmlParserhtml=newHtmlParser();

//设置代理链接网络

//System.getProperties().put("proxySet","true");

//System.getProperties().put("proxyHost","192.168.99.100");

//System.getProperties().put("proxyPort","80");

//URLurl=html.getClass().getResource("log4j.properties");

//PropertyConfigurator.configure("www.cheshi.com");

logger=Logger.getLogger(HtmlParser.class);

try{

html.indexNewsContent("http://www.kaola.com/activity/detail/3245.html?navindex=1");

}catch(Exceptione){

e.printStackTrace();

logger.error("分析网页遇到错误,原因:"+e.getMessage());

}

logger.info("分析网页内容完成。");

}

}

相关推荐