Crawling Website Images with Jsoup

The following program crawls the images under a chosen tag (or under all tags) of a picture site: it uses Jsoup to parse the tag index and listing pages, queues up the detail-page URLs, and downloads the images with a fixed pool of worker threads.

package com.test.pic.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import org.apache.commons.lang3.concurrent.BasicThreadFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * @Title: PicCrawler.java
 * @Package com.test.pic.crawler
 * @Description: Crawls the images under the specified tags of a target site, or under all tags
 * @author CoderZZ
 * @date 2018-01-12 23:22:41
 * @version V1.0
 */
public class PicCrawler implements Runnable {
    private static String pathString = "G:/test/pic/"; // download directory
    // holds the detail-page URLs that will actually be crawled
    static BlockingQueue<String> urlBlockingQueue = new LinkedBlockingDeque<String>(1000);
    static int threadNum = 10;

    /**
     * @Title: main
     * @Description: Collects the tag page URLs, enqueues every image detail-page URL, then starts the worker threads
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String homeurlString = "https://www.xxxx.com"; // base address of the site being crawled
        String tagPageUrl = "https://www.xxxx.com/tag/"; // tag index page
        // full URLs of the tag pages
        Set<String> tagFullHrefSet = new HashSet<String>(16);
        // which tags to crawl; if empty, crawl every tag, otherwise only the listed ones
        String[] crawlerTagArray = {"风景"};
        List<String> crawlerTagList = Arrays.asList(crawlerTagArray);
        try {
            // 1. Collect the full URL of every wanted tag
            Document tagListDocument = Jsoup.connect(tagPageUrl).get();
            Elements tagsListDivElements = tagListDocument.getElementsByClass("tags_list");
            for (Element element : tagsListDivElements) {
                Elements aElements = element.getElementsByTag("a");
                for (Element a : aElements) {
                    if (crawlerTagList.isEmpty() || crawlerTagList.contains(a.text())) {
                        String tagUrlString = homeurlString + a.attr("href");
                        // e.g. https://www.xxxx.com/tag/fengjing.html -> https://www.xxxx.com/tag/fengjing/1.html
                        tagUrlString = tagUrlString.substring(0, tagUrlString.lastIndexOf(".")) + "/1.html";
                        tagFullHrefSet.add(tagUrlString);
                    }
                }
            }
            // 2. Walk each tag's listing pages and enqueue the detail-page URLs
            for (String tagUrl : tagFullHrefSet) {
                String tempTagUrlString = tagUrl;
                int currentPageNum = 1;
                while (true) {
                    try {
                        Document imagePageDocument = Jsoup.connect(tempTagUrlString).get();
                        Elements imageListElements = imagePageDocument.getElementsByClass("Pli-litpic");
                        if (imageListElements.size() == 0) {
                            break; // no more entries: we are past the last page
                        }
                        for (Element image : imageListElements) {
                            urlBlockingQueue.offer(homeurlString + image.attr("href"));
                        }
                        // advance to the next listing page, e.g. https://www.xxxx.com/tag/fengjing/2.html
                        tempTagUrlString = tempTagUrlString.substring(0, tempTagUrlString.lastIndexOf("/") + 1) + (++currentPageNum) + ".html";
                    } catch (Exception e) {
                        break; // request failed: treat as end of pagination
                    }
                }
            }
            // 3. Start the worker threads that drain the queue and download images
            ScheduledExecutorService executor = new ScheduledThreadPoolExecutor(threadNum,
                    new BasicThreadFactory.Builder().namingPattern("my-crawler-thread-%d").daemon(false).build());
            for (int i = 0; i < threadNum; i++) {
                executor.submit(new PicCrawler());
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    @Override
    public void run() {
        while (true) {
            try {
                long begin = System.currentTimeMillis();
                String url = urlBlockingQueue.poll();
                if (null != url) {
                    Document doc = Jsoup.connect(url).get();
                    Elements titleElements = doc.select("#photos > h1");
                    if (!titleElements.isEmpty()) {
                        Set<String> imgSrcSet = new HashSet<String>(16);
                        Element titleElement = titleElements.get(0);
                        // the title looks like "SomeName(1/8)": the part before "(" is the
                        // folder name, the number after "/" is the total page count
                        String foldNameString = titleElement.text();
                        String[] nameArray = foldNameString.split("\\(");
                        foldNameString = nameArray[0];
                        nameArray = nameArray[1].split("/");
                        int totalPages = Integer.parseInt(nameArray[1].replace(")", ""));
                        // collect the image URLs from every page of this gallery
                        for (int i = 1; i <= totalPages; i++) {
                            String urlTemp = url.replace(".html", "_" + i + ".html");
                            Document docTemp = Jsoup.connect(urlTemp).get();
                            Element element = docTemp.getElementById("big-pic");
                            if (null == element) {
                                continue; // page missing or layout changed
                            }
                            Elements imgElements = element.getElementsByTag("img");
                            for (Element imgElement : imgElements) {
                                imgSrcSet.add(imgElement.attr("src"));
                            }
                        }
                        if (imgSrcSet.size() > 0) {
                            for (String imgSrc : imgSrcSet) {
                                // open a connection to the image URL with a 10s connect timeout
                                URL imgurl = new URL(imgSrc);
                                URLConnection con = imgurl.openConnection();
                                con.setConnectTimeout(10 * 1000);
                                InputStream is = con.getInputStream();
                                // 500 KB read buffer
                                byte[] bs = new byte[1024 * 500];
                                int len;
                                // make sure the per-gallery folder exists
                                File sf = new File(pathString + "/" + foldNameString);
                                if (!sf.exists()) {
                                    sf.mkdirs();
                                }
                                // file name is the last segment of the image URL
                                String filename = imgSrc.split("/")[imgSrc.split("/").length - 1];
                                OutputStream os = new FileOutputStream(sf.getPath() + "/" + filename);
                                // stream the image to disk
                                while ((len = is.read(bs)) != -1) {
                                    os.write(bs, 0, len);
                                }
                                os.close();
                                is.close();
                                System.out.println(imgSrc + " downloaded!");
                            }
                        }
                        long end = System.currentTimeMillis();
                        System.out.println("================================================================");
                        System.out.println(Thread.currentThread().getName() + " ****************** gallery fully downloaded, took " + ((end - begin) / 1000) + "s");
                    }
                } else {
                    System.out.println("======================== BlockingQueue is empty, crawl complete! =======================");
                }
            } catch (Exception e) {
                System.out.println("======================== crawl error =======================");
            }
        }
    }
}
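
A note on the Jsoup calls: every page fetch above uses Jsoup.connect(url).get() with default settings. Many sites reject Java's default user agent or respond slowly, so a fetch helper along these lines can make the crawl more robust. This is a minimal sketch; the user-agent string and timeout value are illustrative assumptions, not part of the original post:

    import java.io.IOException;
    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    class FetchHelper {
        // Sketch of a more defensive fetch; the UA string and the 10s timeout
        // are illustrative choices, not taken from the original code.
        static Document fetch(String url) throws IOException {
            return Jsoup.connect(url)
                    .userAgent("Mozilla/5.0") // some sites block Java's default user agent
                    .timeout(10_000)          // per-request timeout in milliseconds
                    .get();
        }
    }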
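
Also worth noting: run() spins forever once the queue drains, because poll() returns null immediately and the while(true) loop keeps printing the "queue is empty" message. A timed poll lets each worker exit cleanly instead. Below is a minimal sketch of that loop; the 5-second idle threshold is an assumption (idle that long means the crawl is treated as finished), not part of the original code:

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingDeque;
    import java.util.concurrent.TimeUnit;

    class WorkerLoopSketch implements Runnable {
        static BlockingQueue<String> urlBlockingQueue = new LinkedBlockingDeque<String>(1000);

        @Override
        public void run() {
            try {
                while (true) {
                    // block up to 5 seconds for the next URL (idle threshold is an assumption);
                    // null means the queue stayed empty, so the crawl is treated as done
                    String url = urlBlockingQueue.poll(5, TimeUnit.SECONDS);
                    if (url == null) {
                        break; // let the thread exit instead of spinning
                    }
                    // ... download the gallery for this URL, as in run() above ...
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // restore the interrupt flag and exit
            }
        }
    }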
Original article: https://www.cnblogs.com/Java-Script/p/11089630.html