Java 抓取网页的图片

//注意:这段代码只能抓取到一部分图片。像折800这样的网站,有些子路径的页面会在同一行 HTML 里写很多个 img 标签,而且排列不规律,下面的正则表达式没法把它们逐个截取出来。

package test; import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class getImageByUrl4 { /** * @param args */ private List<String> imageUrl = new ArrayList<String>();//用于存储图片的url private int count = 0;//图片计数器 public static void main(String[] args) { String netUrl = "http://www.zhe800.com";//要爬的网页 new getImageByUrl4().init(netUrl); } public void init(String netUrl){ getPage(netUrl); while(imageUrl.size()!=0) { getImage(imageUrl.remove(0)); } } //获取网页信息line中的图片url并加入到集合中 public void getImageUrl(String line,String netUrl){ //三种正则表达式 //其他网站的图片,http开头如:src = "http://www.ecoc.com:8080/pic/jfjiejf.jpg //String searchImgReg = "(src|SRC|background|BACKGROUND|background-image|BACKGROUND-IMAGE)=('|")(http://([\w-]+\.)+[\w-]+(:[0-9]+)*(/+[\w-]+)*(/+[\w-]+\.(jpg|JPG|png|PNG|gif|GIF)))"; String searchImgReg = "(src|SRC|background|BACKGROUND|background-image|BACKGROUND-IMAGE)\w{0,2}=('|")http(s)*://.{1,}.(jpg|JPG|png|PNG|gif|GIF)"\s"; //项目中的图片,绝对路径如:src = "/ecoc/lala/jj/ooellaie.jpg //String searchImgReg2 = "(src|SRC|background|BACKGROUND|background-image|BACKGROUND-IMAGE)\w{0,2}=('|")/*(([\w-]+/)*([\w-]+\.(jpg|JPG|png|PNG|gif|GIF)))('")"; String searchImgReg2 = "(src|SRC|background|BACKGROUND|background-image|BACKGROUND-IMAGE)\w{0,2}=('|")/*(([\w-]+/)*([\w-]+\.(jpg|JPG|png|PNG|gif|GIF)))"\s"; try { Pattern pat = Pattern.compile(searchImgReg); Matcher matcher=pat.matcher(line); String str =null; while(matcher.find()) { str = matcher.group(); String []sttr = str.split(" ");System.out.println(str); for(int i = 0;i<sttr.length;i++){ String s = sttr[i]; Integer index_denghao = s.indexOf("=")+2; 
imageUrl.add(s.substring(index_denghao,s.length()-1)); } } pat = Pattern.compile(searchImgReg2); matcher=pat.matcher(line); while(matcher.find()) { str = matcher.group(); System.out.println(str); String []sttr = str.split(" "); for(int i = 0;i<sttr.length;i++){ String s = sttr[i];System.out.println(s); Integer index_denghao = s.indexOf("=")+2; Integer index_2 = netUrl.indexOf("/", 8); if(index_2==-1) index_2 = netUrl.length(); imageUrl.add(netUrl.substring(0, index_2)+"/"+s.substring(index_denghao,s.length()-1)); } } } catch (Exception e) { } } //爬取网页中的信息。 public void getPage(String netUrl){ BufferedReader mybr = null; try { URL myurl = new URL(netUrl); URLConnection myconn = myurl.openConnection(); InputStream myin = myconn.getInputStream(); mybr = new BufferedReader(new InputStreamReader(myin,"UTF-8")); String line; while((line = mybr.readLine())!= null) { getImageUrl(line,netUrl);//判断网页中的jpg图片 } } catch (MalformedURLException e) { System.out.println("getPage url异常"); } catch (IOException e) { System.out.println("url连接异常"); e.printStackTrace(); }finally { if( mybr != null) { try { mybr.close(); } catch (IOException e) { System.out.println("读入流关闭异常"); } } } } //下载该图片! 
public void getImage(String imageUrl){ InputStream myin = null; BufferedOutputStream myos = null; try { File file = new File("H:\pic\"); File[] files = file.listFiles(); for (File file2 : files) { Integer fileName = Integer.valueOf(file2.getName().substring(0, file2.getName().indexOf("."))); if(count<fileName){ count = fileName; } } URL myurl = new URL(imageUrl); URLConnection myconn = myurl.openConnection(); myin = myconn.getInputStream(); myos = new BufferedOutputStream(new FileOutputStream("H:\pic\"+(++count)+".jpg")); byte[] buff = new byte[1024]; int num = 0; while((num = myin.read(buff))!= -1) { myos.write(buff, 0, num); myos.flush(); } } catch (MalformedURLException e) { System.out.println("getImage url异常"); e.printStackTrace(); } catch (IOException e) { System.out.println("下载图片url连接异常"); e.printStackTrace(); } finally{ if( myin != null){ try { myin.close(); } catch (IOException e) { System.out.println("读入流关闭异常"); } } if( myos != null){ try { myos.close(); } catch (IOException e) { System.out.println("输出流关闭异常"); } } } } }

  

原文地址:https://www.cnblogs.com/jamsbwo/p/5490101.html