Java抓取网页图片:利用正则表达式抓取网站上的图片

shqhope 2014-08-27

利用java抓取网页上的所有图片:

用两个正则表达式:

1、匹配html中img标签的正则:<img.*src=(.*?)[^>]*?>

2、匹配img标签的src属性中http路径的正则:http:\"?(.*?)(\"|>|\\s+)

实现:

完整代码如下(Java):

package org.swinglife.main;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads the images referenced by {@code <img>} tags of a web page.
 *
 * <p>The work is done in four steps: fetch the page HTML, collect every
 * {@code <img>} tag, pull the absolute {@code http}/{@code https} URLs out of
 * those tags, and save each image into the current working directory.
 * Relative image paths (for example {@code src="/logo.png"}) are ignored.
 *
 * @author swinglife
 */
public class CatchImage {

    /** Address of the page whose images are downloaded. */
    private static final String PAGE_URL = "http://www.csdn.net";

    /** Name of the charset used to decode the downloaded page. */
    private static final String ENCODING = "UTF-8";

    /**
     * Regex for a single {@code <img>} tag that carries a {@code src} attribute.
     * Every part of it is limited to "anything but '>'", so one match can never
     * run across several tags that sit on the same line.
     */
    private static final String IMGURL_REG = "<img\\b[^>]*\\bsrc\\s*=[^>]*>";

    /**
     * Regex for an absolute http/https URL inside a tag. The URL ends at the
     * first quote, whitespace or '>' character, none of which is part of the match.
     */
    private static final String IMGSRC_REG = "https?://[^\"'\\s>]+";

    /** Compiled once; HTML tag and attribute names are case-insensitive. */
    private static final Pattern IMG_TAG_PATTERN =
            Pattern.compile(IMGURL_REG, Pattern.CASE_INSENSITIVE);

    /** Compiled once instead of once per tag. */
    private static final Pattern IMG_SRC_PATTERN =
            Pattern.compile(IMGSRC_REG, Pattern.CASE_INSENSITIVE);

    /**
     * Fetches {@link #PAGE_URL} and downloads every image found on it.
     *
     * @param args unused
     * @throws Exception if the page itself cannot be fetched or decoded
     */
    public static void main(String[] args) throws Exception {
        CatchImage cm = new CatchImage();
        // Fetch the HTML text of the page.
        String html = cm.getHTML(PAGE_URL);
        // Collect the <img> tags.
        List<String> imgUrl = cm.getImageUrl(html);
        // Extract the image addresses from the tags.
        List<String> imgSrc = cm.getImageSrc(imgUrl);
        // Download the images.
        cm.Download(imgSrc);
    }

    /**
     * Reads the complete content behind {@code url} and decodes it with
     * {@link #ENCODING}.
     *
     * @param url address of the page to read
     * @return the decoded page content; empty if the resource is empty
     * @throws Exception if the URL is malformed, the resource cannot be read,
     *                   or the charset is not supported
     */
    private String getHTML(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        // try-with-resources closes the stream even when a read fails.
        try (InputStream in = connection.getInputStream()) {
            byte[] buf = new byte[8192];
            int length;
            while ((length = in.read(buf)) != -1) {
                // Only the bytes actually read are kept, never the stale tail of buf.
                bytes.write(buf, 0, length);
            }
        }
        // Decode once over the whole content so a multi-byte character that
        // straddles two reads is not torn apart.
        return bytes.toString(ENCODING);
    }

    /**
     * Collects every {@code <img>} tag with a {@code src} attribute.
     *
     * @param HTML the page content to scan
     * @return the matched tags in document order; empty if there are none
     */
    private List<String> getImageUrl(String HTML) {
        List<String> listImgUrl = new ArrayList<String>();
        Matcher matcher = IMG_TAG_PATTERN.matcher(HTML);
        while (matcher.find()) {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }

    /**
     * Extracts the absolute http/https URLs contained in the given tags.
     *
     * @param listImageUrl {@code <img>} tags as returned by {@link #getImageUrl(String)}
     * @return the URLs in encounter order, without surrounding quotes or whitespace;
     *         empty if no tag contains an absolute URL
     */
    private List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<String>();
        for (String image : listImageUrl) {
            Matcher matcher = IMG_SRC_PATTERN.matcher(image);
            while (matcher.find()) {
                // The pattern excludes the terminator, so no trimming is needed.
                listImgSrc.add(matcher.group());
            }
        }
        return listImgSrc;
    }

    /**
     * Downloads every URL into the current working directory, named after the
     * last path segment of the URL. A file with the same name is overwritten.
     * A failing URL is reported and skipped; the remaining URLs are still downloaded.
     *
     * @param listImgSrc absolute image URLs to download
     */
    private void Download(List<String> listImgSrc) {
        for (String url : listImgSrc) {
            String imageName = fileNameOf(url);
            if (imageName.isEmpty()) {
                // Nothing to name the file after (e.g. the URL ends with '/').
                System.err.println("下载失败:" + url + " (URL中没有文件名)");
                continue;
            }
            System.out.println("开始下载:" + url);
            // Both streams are closed automatically, also when the copy fails midway.
            try (InputStream in = new URL(url).openStream();
                 FileOutputStream fo = new FileOutputStream(new File(imageName))) {
                byte[] buf = new byte[8192];
                int length;
                while ((length = in.read(buf)) != -1) {
                    fo.write(buf, 0, length);
                }
                System.out.println(imageName + "下载完成");
            } catch (IOException e) {
                // Report which URL failed and why, then carry on with the next one.
                System.err.println("下载失败:" + url + " (" + e + ")");
            }
        }
    }

    /**
     * Derives a local file name from a URL: the text after the last '/' of the
     * URL with any {@code ?query} or {@code #fragment} part removed.
     *
     * @param url the image URL
     * @return the file name, or an empty string if the URL ends with '/'
     */
    private static String fileNameOf(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = query;
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0 && fragment < end) {
            end = fragment;
        }
        String path = url.substring(0, end);
        return path.substring(path.lastIndexOf('/') + 1);
    }
}

相关推荐