Bored, so I wrote a little web crawler that grabs images

If you like it, take it; just don't crawl someone's hard-built site until it falls over. Comments and pointers are welcome.

import com.google.common.collect.Lists;

import org.apache.commons.io.IOUtils;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.client.methods.RequestBuilder;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.List;
import java.util.Stack;

public class SexlCrawler {

    // The start page to crawl and the local directory to save images into
    public static String sexpage_url = "https://www.mm131.net/xinggan/5354.html";
    public static String local_path = "d://temp//sexImg//";

    // Stack of page URLs still waiting to be visited
    public Stack<String> imgStack = new Stack<>();
    // Page number parsed from the current URL, used to prefix saved file names
    public String pageNum;

    public SexlCrawler() {
        imgStack.push(sexpage_url);
    }

    public static void main(String[] args) {
        SexlCrawler sc = new SexlCrawler();
        while (!sc.imgStack.empty()) {
            String url = sc.imgStack.pop();
            sc.startCrawler(url);
        }
    }
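    // Note on the loop above: the crawl is stack-driven. dealWith() pushes each
    // page's "next" link onto imgStack, and main() keeps popping until the stack
    // runs dry, so the walk through a gallery's pages is effectively depth-first.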

    public void startCrawler(String startPage_url) {

        // Build the request URI and a set of default headers
        try {
            URI uri = new URIBuilder(startPage_url).build();

            List<Header> headerList = Lists.newArrayList();
            headerList.add(new BasicHeader(HttpHeaders.ACCEPT_ENCODING, "gzip, deflate"));
            headerList.add(new BasicHeader(HttpHeaders.CONNECTION, "keep-alive"));
            headerList.add(new BasicHeader(HttpHeaders.ACCEPT_LANGUAGE, "zh-cn,zh;q=0.5"));

            // Build the HttpClient with those defaults
            HttpClient httpClient = HttpClients.custom().setDefaultHeaders(headerList).build();

            // Build the GET request
            HttpUriRequest httpUriRequest = RequestBuilder.get().setUri(uri).build();

            // Extra headers that get past the site's anti-crawler check
            httpUriRequest.setHeader("Referer", "sogou.com");
            httpUriRequest.setHeader("User-Agent", "sogou.com");

            // Execute the request and pull out the response entity
            HttpResponse httpResponse = httpClient.execute(httpUriRequest);
            HttpEntity entity = httpResponse.getEntity();

            // The site serves GBK-encoded pages; decoding with the wrong charset
            // would hand Jsoup mojibake
            String rawHTMLContent = EntityUtils.toString(entity, "GBK");

            // Release the entity's underlying stream
            EntityUtils.consume(entity);

            this.setPageNum(startPage_url);

            // Pull the image URL out of the page (and queue the next pages)
            String imgUrl = this.dealWith(rawHTMLContent, startPage_url);
            // Save the image to disk
            this.savepic(imgUrl, httpClient);

        } catch (URISyntaxException | IOException e) {
            e.printStackTrace();
        }
    }

    public String dealWith(String inputHtml, String startPage_url) {

        Document doc = Jsoup.parse(inputHtml);
        String imgurl = "";

        // The photo sits in <div class="content-pic">, wrapped in an <a> that
        // points at the next page
        Elements elements = doc.select("div[class=content-pic]").select("img");
        Elements next_elements = doc.select("div[class=content-pic]").select("a");

        for (Element element : elements) {
            imgurl = element.attr("src");
        }

        for (Element element : next_elements) {
            String next_url = element.attr("href");

            // Within a gallery every "next" link is relative, e.g. "666_6.html",
            // but the last page links to the next gallery (e.g. "667.html") with a
            // full https URL, so the two cases need different handling
            if (next_url.indexOf("http") < 0) {
                int p = startPage_url.lastIndexOf("/");
                next_url = startPage_url.substring(0, p + 1) + next_url;
            } else {
                // Moving on to a new gallery: pause so the site isn't hammered
                try {
                    Thread.sleep(2000);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }

            this.imgStack.push(next_url);
        }

        return imgurl;
    }
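
    // A worked example of the link handling above (URLs are illustrative,
    // following the patterns described in the comments, not taken from a live page):
    //   current page: https://www.mm131.net/xinggan/5354_3.html
    //   href "5354_4.html" is relative and becomes
    //        https://www.mm131.net/xinggan/5354_4.html
    //   href "https://www.mm131.net/xinggan/5355.html" is absolute and is pushed
    //        as-is, after the two-second pause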

    public String setPageNum(String url) {
        String out = "";

        int m = url.indexOf(".html");
        int p = 0;
        if (url.lastIndexOf("_") > 0) {
            // URL like ".../5354_3.html": the page number sits between the last "/" and the "_"
            int j = url.lastIndexOf("_");
            int k = url.lastIndexOf("/");
            out = url.substring(k + 1, j);
        } else {
            // URL like ".../5354.html": the page number sits between the last "/" and ".html"
            p = url.lastIndexOf("/");
            out = url.substring(p + 1, m);
        }

        this.pageNum = out;
        return out;
    }
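
    // Quick check of setPageNum, given the substring logic above:
    //   setPageNum("https://www.mm131.net/xinggan/5354.html")   -> "5354"
    //   setPageNum("https://www.mm131.net/xinggan/5354_3.html") -> "5354"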

    public void savepic(String ImgURL, HttpClient httpClient) {
        if (ImgURL == null || ImgURL.isEmpty()) {
            return;
        }

        // Save as "<pageNum>_<original file name>" under the local directory
        String[] strs = ImgURL.split("/");
        String fileName = strs[strs.length - 1];
        String savePath = local_path + File.separator + this.pageNum + "_" + fileName;
        new File(local_path).mkdirs();

        HttpEntity entity = null;
        try {
            HttpGet get = new HttpGet(ImgURL);

            // Optional: per-request timeouts, in case the site hangs
            // RequestConfig requestConfig = RequestConfig.custom()
            //         .setConnectTimeout(50000).setConnectionRequestTimeout(1000)
            //         .setSocketTimeout(50000).build();
            // get.setConfig(requestConfig);

            // This step is the crucial one. The site's anti-crawler defence is an
            // anti-hotlinking check: the Referer header must carry the site's own
            // domain, otherwise every URL returns the same placeholder image.
            // The User-Agent value, by contrast, doesn't seem to matter.
            get.setHeader("Referer", "https://www.mm131.net/");
            get.setHeader("User-Agent", "sogou.com");

            HttpResponse response = httpClient.execute(get);
            entity = response.getEntity();

            System.out.println("Saving image >>>>>>>>>> " + fileName);
            InputStream is = entity.getContent();
            OutputStream os = new FileOutputStream(savePath);
            IOUtils.copy(is, os);
            IOUtils.closeQuietly(os);
            IOUtils.closeQuietly(is);

        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Failed to save image");
        }
    }
}
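
Since the Referer header is the whole anti-hotlinking story, here it is isolated as a minimal sketch you can run on its own, using only the JDK. The image URL below is a placeholder, not a real address; substitute one scraped from a gallery page.

    import java.io.FileOutputStream;
    import java.io.InputStream;
    import java.io.OutputStream;
    import java.net.HttpURLConnection;
    import java.net.URL;

    public class RefererDemo {
        public static void main(String[] args) throws Exception {
            // Placeholder URL; swap in a real image address from a gallery page
            URL url = new URL("https://example.invalid/pic/5354/1.jpg");
            HttpURLConnection conn = (HttpURLConnection) url.openConnection();
            // The anti-hotlink check: without this header the server sends back
            // the same placeholder image for every request
            conn.setRequestProperty("Referer", "https://www.mm131.net/");
            try (InputStream in = conn.getInputStream();
                 OutputStream out = new FileOutputStream("demo.jpg")) {
                byte[] buf = new byte[8192];
                int n;
                while ((n = in.read(buf)) != -1) {
                    out.write(buf, 0, n);
                }
            }
        }
    }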

Took me about two hours; it's just something I dashed off.

Comments welcome.

Original post: https://www.cnblogs.com/alexgl2008/p/12346875.html