shqhope 2014-08-27
利用java抓取网页上的所有图片:
用两个正则表达式:
1、匹配html中img标签的正则:<img.*src=(.*?)[^>]*?>
2、匹配img标签的src中http路径的正则(Java字符串转义写法):http:\"?(.*?)(\"|>|\\s+)
实现:
[java]viewplaincopyprint?
packageorg.swinglife.main;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads the images referenced by the {@code <img>} tags of a web page.
 *
 * <p>The page is fetched, every {@code <img>} tag carrying a {@code src}
 * attribute is located with a regular expression, the absolute http/https
 * URLs inside those tags are extracted, and each URL is saved to a file in
 * the current working directory.
 *
 * @author swinglife
 */
public class CatchImage {

    // Address of the page whose images are downloaded.
    private static final String URL = "http://www.csdn.net";
    // Charset used to decode the downloaded page.
    private static final String ECODING = "UTF-8";
    // Matches ONE complete <img ...> tag that has a src attribute. [^>] keeps
    // the match inside a single tag; a greedy ".*" here would run from the
    // first <img on a line to the last one and swallow unrelated tags.
    private static final String IMGURL_REG = "<img\\b[^>]*?src\\s*=[^>]*>";
    // Matches an absolute http/https URL, stopping before whitespace, a
    // double or single quote, or the closing '>' of the tag.
    private static final String IMGSRC_REG = "https?://[^\\s\"'>]+";

    // Compiled once; HTML tag/attribute names and URL schemes are case-insensitive.
    private static final Pattern IMG_TAG_PATTERN =
            Pattern.compile(IMGURL_REG, Pattern.CASE_INSENSITIVE);
    private static final Pattern IMG_SRC_PATTERN =
            Pattern.compile(IMGSRC_REG, Pattern.CASE_INSENSITIVE);

    /**
     * Fetches {@link #URL}, extracts its image URLs and downloads them.
     *
     * @param args unused
     * @throws Exception if the page itself cannot be fetched or decoded
     */
    public static void main(String[] args) throws Exception {
        CatchImage cm = new CatchImage();
        // Fetch the HTML text of the page.
        String html = cm.getHTML(URL);
        // Collect the <img> tags.
        List<String> imgUrl = cm.getImageUrl(html);
        // Extract the image URLs from the tags.
        List<String> imgSrc = cm.getImageSrc(imgUrl);
        // Download the images.
        cm.Download(imgSrc);
    }

    /**
     * Downloads the resource at {@code url} and decodes it as text.
     *
     * @param url address of the page to fetch
     * @return the page content decoded with {@link #ECODING}
     * @throws Exception if the URL is malformed, the transfer fails, or the
     *         charset is not supported
     */
    private String getHTML(String url) throws Exception {
        URLConnection connection = new URL(url).openConnection();
        InputStream in = connection.getInputStream();
        try {
            // Collect the raw bytes first and decode once at the end: decoding
            // chunk by chunk corrupts multi-byte characters that straddle a
            // chunk boundary, and only the bytes actually read may be used.
            ByteArrayOutputStream page = new ByteArrayOutputStream();
            byte[] buf = new byte[1024];
            int length;
            while ((length = in.read(buf, 0, buf.length)) != -1) {
                page.write(buf, 0, length);
            }
            return page.toString(ECODING);
        } finally {
            in.close();
        }
    }

    /**
     * Finds the {@code <img>} tags in an HTML document.
     *
     * @param html HTML text to scan
     * @return the full text of every {@code <img>} tag that has a {@code src}
     *         attribute, in document order; empty if there is none
     */
    private List<String> getImageUrl(String html) {
        List<String> listImgUrl = new ArrayList<String>();
        Matcher matcher = IMG_TAG_PATTERN.matcher(html);
        while (matcher.find()) {
            listImgUrl.add(matcher.group());
        }
        return listImgUrl;
    }

    /**
     * Extracts the absolute http/https URLs contained in the given tags.
     *
     * @param listImageUrl {@code <img>} tags as returned by {@link #getImageUrl}
     * @return every URL found, in order of appearance, without surrounding
     *         quotes or trailing delimiters
     */
    private List<String> getImageSrc(List<String> listImageUrl) {
        List<String> listImgSrc = new ArrayList<String>();
        for (String image : listImageUrl) {
            Matcher matcher = IMG_SRC_PATTERN.matcher(image);
            while (matcher.find()) {
                // The pattern excludes the delimiter, so the match is the URL itself.
                listImgSrc.add(matcher.group());
            }
        }
        return listImgSrc;
    }

    /**
     * Downloads every URL into the current working directory.
     *
     * <p>Each image is handled independently: a failure is reported on
     * standard output and the remaining images are still attempted.
     *
     * @param listImgSrc image URLs to download
     */
    private void Download(List<String> listImgSrc) {
        for (String url : listImgSrc) {
            String imageName = fileNameOf(url);
            System.out.println("开始下载:" + url);
            try {
                saveImage(url, imageName);
                System.out.println(imageName + "下载完成");
            } catch (Exception e) {
                // Best effort: URLs come from arbitrary page text, so besides
                // I/O errors the URL classes may reject them with unchecked
                // exceptions. Report which URL failed and why, then continue.
                System.out.println("下载失败:" + url + " (" + e + ")");
            }
        }
    }

    /**
     * Derives a local file name from the text after the last '/' of the URL.
     *
     * <p>Characters that are path separators or illegal in file names on
     * Windows are replaced by '_' so that remote text cannot direct the
     * download outside the working directory.
     */
    private static String fileNameOf(String url) {
        String name = url.substring(url.lastIndexOf('/') + 1);
        return name.replaceAll("[\\\\:*?\"<>|]", "_");
    }

    /** Copies the content at {@code url} into the file {@code imageName}, closing both streams. */
    private static void saveImage(String url, String imageName) throws IOException {
        InputStream in = new URL(url).openStream();
        try {
            FileOutputStream fo = new FileOutputStream(new File(imageName));
            try {
                byte[] buf = new byte[1024];
                int length;
                while ((length = in.read(buf, 0, buf.length)) != -1) {
                    fo.write(buf, 0, length);
                }
            } finally {
                fo.close();
            }
        } finally {
            in.close();
        }
    }
}