Web Crawlers

An Introduction to Crawlers

What is a crawler?

A web crawler (bot) is a program that automatically traverses the internet and scrapes data from it.

What crawlers are used for:

  • Search engines
  • Product price comparison
  • Data analysis platforms (such as Zhihu's)

How do you write a crawler?

Three characteristics of web pages:

  • Every page is located by a unique URL (Uniform Resource Locator)
  • Pages present their content as HTML (HyperText Markup Language) text
  • Pages are delivered over the HTTP (HyperText Transfer Protocol) or HTTPS protocol

The crawling workflow:

1. Analyze the target site and collect the starting URLs.
2. For each URL, send a request and fetch the page's HTML source.

3. Extract from the page:
a. the target data, which is filtered and persisted;
b. new URLs, for which step 2 is repeated.

4. The crawl ends once every target URL has been processed, all data has been collected, and no request tasks remain. (A minimal sketch of this loop follows.)
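
Before the real project code, here is a minimal sketch of that loop using only jsoup. The seed URL, the 20-page cap, and printing just the page title are illustrative assumptions, not part of the project below.

package com.javaxl.crawler;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

// Minimal sketch of the four steps above, using only jsoup.
// The seed URL, the 20-page cap and the "print the title" step
// are illustrative assumptions, not part of the project below.
public class MiniCrawler {
    public static void main(String[] args) throws Exception {
        Deque<String> queue = new ArrayDeque<>();
        Set<String> visited = new HashSet<>();
        queue.add("https://www.cnblogs.com/");                  // step 1: start from a known URL
        while (!queue.isEmpty() && visited.size() < 20) {       // step 4: stop when nothing is left
            String url = queue.poll();
            if (!visited.add(url)) continue;                    // skip URLs we already crawled
            Document doc = Jsoup.connect(url)                   // step 2: request the page
                    .userAgent("Mozilla/5.0").timeout(8000).get();
            System.out.println(doc.title());                    // step 3a: extract target data
            for (Element a : doc.select("a[href]")) {           // step 3b: collect new URLs
                String next = a.absUrl("href");
                if (next.startsWith("https://www.cnblogs.com")) {
                    queue.add(next);
                }
            }
        }
    }
}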

Maven dependencies (pom.xml)

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.javaxl</groupId>
    <artifactId>T226_jsoup</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>T226_jsoup</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.44</version>
        </dependency>

        <!-- HttpClient, for sending HTTP requests -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.2</version>
        </dependency>

        <!-- jsoup, for parsing HTML -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.1</version>
        </dependency>

        <!-- log4j, for logging -->
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.16</version>
        </dependency>

        <!-- ehcache, for deduplicating already-crawled URLs -->
        <dependency>
            <groupId>net.sf.ehcache</groupId>
            <artifactId>ehcache</artifactId>
            <version>2.10.3</version>
        </dependency>

        <!-- commons-io, for saving streams to files -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>

        <!-- fastjson, for JSON parsing -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.47</version>
        </dependency>
    </dependencies>
</project>
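
The pom pulls in log4j 1.2, and every class below logs through it, so a configuration file must be on the classpath. The article does not show one; a minimal log4j.properties, assuming console output is enough, might look like:

# Minimal log4j 1.2 configuration (an assumption; not shown in the article)
log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1} - %m%n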

Let's start by crawling a single image

DownloadImg.java
package com.javaxl.crawler;

import java.io.File;
import java.io.IOException;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.log4j.Logger;

import com.javaxl.util.DateUtil;
import com.javaxl.util.PropertiesUtil;

public class DownloadImg {
    private static Logger logger = Logger.getLogger(DownloadImg.class);
    private static String URL = "https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1570570095818&di=f6f36a34fb29850cce69942cce9e8259&imgtype=0&src=http%3A%2F%2Fpic1.win4000.com%2Fpic%2F6%2F56%2F85ea1629713.jpg";

    public static void main(String[] args) {
        logger.info("Start crawling: " + URL);
        CloseableHttpClient httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(URL);
        // 5s connect timeout, 8s socket (read) timeout
        RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
        httpGet.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response == null) {
                logger.info("Connection timed out!");
            } else {
                HttpEntity entity = response.getEntity();
                // Build a local path: <blogImages>/<yyyy/MM/dd>/<uuid>.<suffix>
                String imgPath = PropertiesUtil.getValue("blogImages");
                String dateDir = DateUtil.getCurrentDatePath();
                String uuid = UUID.randomUUID().toString();
                // Derive the file extension from the Content-Type header, e.g. "image/jpeg" -> "jpeg"
                String suffix = entity.getContentType().getValue().split("/")[1];
                String localFile = imgPath + dateDir + "/" + uuid + "." + suffix;
                // Stream the response body straight into the local file
                FileUtils.copyInputStreamToFile(entity.getContent(), new File(localFile));
            }
        } catch (ClientProtocolException e) {
            logger.error(URL + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(URL + "-IOException", e);
        } catch (Exception e) {
            logger.error(URL + "-Exception", e);
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
                if (httpClient != null) {
                    httpClient.close();
                }
            } catch (IOException e) {
                logger.error(URL + "-IOException", e);
            }
        }

        logger.info("Finished crawling: " + URL);
    }
}
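
The helper classes com.javaxl.util.PropertiesUtil and com.javaxl.util.DateUtil are used here but never shown in the article. A plausible minimal reconstruction, assuming a crawler.properties file on the classpath that maps blogImages (and, for the next program, ehcacheXmlPath) to local paths:

package com.javaxl.util;

import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Properties;

// Hypothetical reconstruction; the real helpers are not in the article.
// Assumes a classpath file crawler.properties, e.g.:
//   blogImages=D:/blogImages/
//   ehcacheXmlPath=D:/ehcache.xml
public class PropertiesUtil {
    public static String getValue(String key) {
        try (InputStream in = PropertiesUtil.class.getClassLoader()
                .getResourceAsStream("crawler.properties")) {
            Properties props = new Properties();
            props.load(in); // throws if the file is missing
            return props.getProperty(key);
        } catch (Exception e) {
            throw new RuntimeException("Failed to read crawler.properties", e);
        }
    }
}

// Would live in its own file, DateUtil.java, as a public class:
class DateUtil {
    // Returns a date-based sub-directory such as "2019/10/08"
    public static String getCurrentDatePath() {
        return new SimpleDateFormat("yyyy/MM/dd").format(new Date());
    }
}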

Crawling the cnblogs homepage

BlogCrawlerStarter.java
package com.javaxl.crawler;

import java.io.File;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.javaxl.util.DateUtil;
import com.javaxl.util.DbUtil;
import com.javaxl.util.PropertiesUtil;

import net.sf.ehcache.Cache;
import net.sf.ehcache.CacheManager;
import net.sf.ehcache.Status;

/**
 * @author Administrator
 */
public class BlogCrawlerStarter {

    private static Logger logger = Logger.getLogger(BlogCrawlerStarter.class);
    // alternative target: https://www.csdn.net/nav/newarticles
    private static String HOMEURL = "https://www.cnblogs.com/";
    private static CloseableHttpClient httpClient;
    private static Connection con;
    private static CacheManager cacheManager;
    private static Cache cache;

    /**
     * Fetch the homepage with HttpClient and hand its HTML to the parser.
     */
    public static void parseHomePage() {
        logger.info("Start crawling the homepage: " + HOMEURL);

        // The ehcache instance remembers which blog URLs were already stored
        cacheManager = CacheManager.create(PropertiesUtil.getValue("ehcacheXmlPath"));
        cache = cacheManager.getCache("cnblog");

        httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(HOMEURL);
        RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
        httpGet.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response == null) {
                logger.info(HOMEURL + ": no response");
                return;
            }

            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String homePageContent = EntityUtils.toString(entity, "utf-8");
                parseHomePageContent(homePageContent);
            }

        } catch (ClientProtocolException e) {
            logger.error(HOMEURL + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(HOMEURL + "-IOException", e);
        } finally {
            try {
                if (response != null) {
                    response.close();
                }

                if (httpClient != null) {
                    httpClient.close();
                }
            } catch (IOException e) {
                logger.error(HOMEURL + "-IOException", e);
            }
        }

        // Flush the cache to disk so already-crawled URLs survive restarts
        if (cache.getStatus() == Status.STATUS_ALIVE) {
            cache.flush();
        }
        cacheManager.shutdown();
        logger.info("Finished crawling the homepage: " + HOMEURL);
    }

    /**
     * Parse the homepage HTML with jsoup and extract the blog post links.
     *
     * @param homePageContent the homepage HTML source
     */
    private static void parseHomePageContent(String homePageContent) {
        Document doc = Jsoup.parse(homePageContent);
        // CSS selector for the post links in the cnblogs homepage list
        Elements aEles = doc.select("#post_list .post_item .post_item_body h3 a");
        for (Element aEle : aEles) {
            // Each <a> holds the URL of one blog post
            String blogUrl = aEle.attr("href");
            if (null == blogUrl || "".equals(blogUrl)) {
                logger.info("Empty blog link; skipping it.");
                continue;
            }
            if (cache.get(blogUrl) != null) {
                logger.info("This post was already crawled into the database; skipping it.");
                continue;
            }
            parseBlogUrl(blogUrl);
        }
    }

    /**
     * Fetch a single blog post by its URL.
     *
     * @param blogUrl the URL of the blog post
     */
    private static void parseBlogUrl(String blogUrl) {

        logger.info("Start crawling blog page: " + blogUrl);
        httpClient = HttpClients.createDefault();
        HttpGet httpGet = new HttpGet(blogUrl);
        RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
        httpGet.setConfig(config);
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response == null) {
                logger.info(blogUrl + ": no response");
                return;
            }

            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity entity = response.getEntity();
                String blogContent = EntityUtils.toString(entity, "utf-8");
                parseBlogContent(blogContent, blogUrl);
            }

        } catch (ClientProtocolException e) {
            logger.error(blogUrl + "-ClientProtocolException", e);
        } catch (IOException e) {
            logger.error(blogUrl + "-IOException", e);
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                logger.error(blogUrl + "-IOException", e);
            }
        }

        logger.info("Finished crawling blog page: " + blogUrl);
    }

    /**
     * Parse a blog post: extract its title and body, then insert them into the database.
     *
     * @param blogContent the blog post HTML source
     * @param link        the URL the post was fetched from
     */
    private static void parseBlogContent(String blogContent, String link) {
        Document doc = Jsoup.parse(blogContent);
        Elements titleEles = doc.select("#topics .post h1 a");
        if (titleEles.size() == 0) {
            logger.info("Blog title is empty; not inserting into the database!");
            return;
        }
        String title = titleEles.get(0).html();

        Elements blogContentEles = doc.select("#cnblogs_post_body");
        if (blogContentEles.size() == 0) {
            logger.info("Blog content is empty; not inserting into the database!");
            return;
        }
        String blogContentBody = blogContentEles.get(0).html();

//        Optional image localization: download the post's images and rewrite
//        their URLs to the local copies (see the helpers below).
//        Elements imgEles = doc.select("img");
//        List<String> imgUrlList = new LinkedList<String>();
//        if (imgEles.size() > 0) {
//            for (Element imgEle : imgEles) {
//                imgUrlList.add(imgEle.attr("src"));
//            }
//        }
//
//        if (imgUrlList.size() > 0) {
//            Map<String, String> replaceUrlMap = downloadImgList(imgUrlList);
//            blogContent = replaceContent(blogContent, replaceUrlMap);
//        }

        String sql = "insert into `t_jsoup_article` values(null,?,?,null,now(),0,0,null,?,0,null)";
        try {
            PreparedStatement pst = con.prepareStatement(sql);
            pst.setObject(1, title);
            pst.setObject(2, blogContentBody);
            pst.setObject(3, link);
            if (pst.executeUpdate() == 0) {
                logger.info("Failed to insert the crawled post into the database");
            } else {
                // Remember this URL so it is not crawled again
                cache.put(new net.sf.ehcache.Element(link, link));
                logger.info("Inserted the crawled post into the database");
            }
        } catch (SQLException e) {
            logger.error("Data error-SQLException:", e);
        }
    }

    /**
     * Rewrite the blog content, replacing the original image URLs with local ones.
     *
     * @param blogContent   the blog post HTML
     * @param replaceUrlMap remote URL -> local path
     * @return the rewritten content
     */
    private static String replaceContent(String blogContent, Map<String, String> replaceUrlMap) {
        for (Map.Entry<String, String> entry : replaceUrlMap.entrySet()) {
            blogContent = blogContent.replace(entry.getKey(), entry.getValue());
        }
        return blogContent;
    }

    /**
     * Download remote images to local storage.
     *
     * @param imgUrlList the image URLs found in the post
     * @return remote URL -> local path
     */
    private static Map<String, String> downloadImgList(List<String> imgUrlList) {
        Map<String, String> replaceMap = new HashMap<String, String>();
        for (String imgUrl : imgUrlList) {
            CloseableHttpClient httpClient = HttpClients.createDefault();
            HttpGet httpGet = new HttpGet(imgUrl);
            RequestConfig config = RequestConfig.custom().setConnectTimeout(5000).setSocketTimeout(8000).build();
            httpGet.setConfig(config);
            CloseableHttpResponse response = null;
            try {
                response = httpClient.execute(httpGet);
                if (response == null) {
                    logger.info(imgUrl + ": no response");
                } else {
                    if (response.getStatusLine().getStatusCode() == 200) {
                        HttpEntity entity = response.getEntity();
                        String blogImagesPath = PropertiesUtil.getValue("blogImages");
                        String dateDir = DateUtil.getCurrentDatePath();
                        String uuid = UUID.randomUUID().toString();
                        // Derive the file extension from the Content-Type header
                        String suffix = entity.getContentType().getValue().split("/")[1];
                        String fileName = blogImagesPath + dateDir + "/" + uuid + "." + suffix;

                        FileUtils.copyInputStreamToFile(entity.getContent(), new File(fileName));
                        replaceMap.put(imgUrl, fileName);
                    }
                }
            } catch (ClientProtocolException e) {
                logger.error(imgUrl + "-ClientProtocolException", e);
            } catch (IOException e) {
                logger.error(imgUrl + "-IOException", e);
            } catch (Exception e) {
                logger.error(imgUrl + "-Exception", e);
            } finally {
                try {
                    if (response != null) {
                        response.close();
                    }
                } catch (IOException e) {
                    logger.error(imgUrl + "-IOException", e);
                }
            }
        }
        return replaceMap;
    }

    public static void start() {
        // Re-crawl the homepage once a minute, forever
        while (true) {
            DbUtil dbUtil = new DbUtil();
            try {
                con = dbUtil.getCon();
                parseHomePage();
            } catch (Exception e) {
                logger.error("Database connection failed!", e);
            } finally {
                try {
                    if (con != null) {
                        con.close();
                    }
                } catch (SQLException e) {
                    logger.error("Error closing the connection-SQLException:", e);
                }
            }
            try {
                Thread.sleep(1000 * 60);
            } catch (InterruptedException e) {
                logger.error("Main thread sleep interrupted-InterruptedException:", e);
            }
        }
    }

    public static void main(String[] args) {
        start();
    }
}
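
Two more pieces this program relies on but the article never shows: the DbUtil class that hands out JDBC connections, and the ehcache XML file referenced by the ehcacheXmlPath property. Plausible minimal sketches follow; the JDBC URL, database name, credentials, disk-store path, and cache sizes are all assumptions.

package com.javaxl.util;

import java.sql.Connection;
import java.sql.DriverManager;

// Hypothetical reconstruction of the missing DbUtil; the JDBC URL,
// database name, user and password are placeholders.
public class DbUtil {
    public Connection getCon() throws Exception {
        Class.forName("com.mysql.jdbc.Driver"); // driver class for mysql-connector-java 5.1.x
        return DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/crawler?useUnicode=true&characterEncoding=utf8",
                "root", "root");
    }
}

And an assumed ehcache 2.x configuration. The cache name must match getCache("cnblog"); making it eternal and disk-persistent is what lets the cache.flush() call at the end of parseHomePage keep already-crawled URLs across restarts.

<!-- Assumed ehcache 2.x config; not shown in the article -->
<ehcache>
    <diskStore path="D:/ehcache" />
    <defaultCache maxElementsInMemory="10000" eternal="false"
        timeToIdleSeconds="120" timeToLiveSeconds="120" overflowToDisk="true" />
    <cache name="cnblog" maxElementsInMemory="10000" eternal="true"
        overflowToDisk="true" diskPersistent="true" />
</ehcache>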

The crawled posts can then be seen in the database.

Original article: https://www.cnblogs.com/xcn123/p/11639188.html