Java 抓取网页中的内容【持续更新】

背景:前几天复习 Java 的时候看到 URL 类,当时就想写个小程序试试,迫于考试没有动手,今天写了一下,感觉还不错。

内容1. 抓取网页中的URL

知识点:Java URL + 正则表达式

 1 import java.io.BufferedReader;
 2 import java.io.InputStreamReader;
 3 import java.net.URL;
 4 import java.util.regex.Matcher;
 5 import java.util.regex.Pattern;
 6 
 7 public class URLReader {
 8     public static void main(String[] args) throws Exception {
 9         System.out.println("开始!");
10         Pattern pattern = Pattern.compile("http(s)?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?");
11         URL baidu = new URL("http://www.cnblogs.com/A--Q/");
12         BufferedReader br = new BufferedReader(new InputStreamReader(baidu.openStream(), "utf-8"));
13         String inputLine;
14         while ((inputLine = br.readLine()) != null) {
15             Matcher matcher = pattern.matcher(inputLine);
16             while (matcher.find()) {
17                 System.out.println(matcher.group(0));
18             }
19         }
20         br.close();
21         System.out.println("程序执行结束!");
22     }
23 }

 效果:

内容2. 抓取网页中的图片

 1 import java.io.File;
 2 import java.io.FileOutputStream;
 3 import java.io.InputStream;
 4 import java.net.URL;
 5 import java.net.URLConnection;
 6 import java.util.ArrayList;
 7 import java.util.List;
 8 import java.util.regex.Matcher;
 9 import java.util.regex.Pattern;
10 
11 public class CatchImage {
12 
13     private static final String URL = "http://www.cnblogs.com/A--Q/p/5170713.html";
14     private static final String ECODING = "UTF-8";
15     private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
16     private static final String IMGSRC_REG = "http:"?(.*?)("|>|\s+)";
17 
18     public static void main(String[] args) throws Exception {
19         System.out.println("start");
20         CatchImage cm = new CatchImage();
21         String HTML = cm.getHTML(URL);
22         List<String> imgUrl = cm.getImageUrl(HTML);
23         List<String> imgSrc = cm.getImageSrc(imgUrl);
24         cm.Download(imgSrc);
25         System.out.println("END");
26     }
27 
28     private String getHTML(String url) throws Exception {
29         URL uri = new URL(url);
30         URLConnection connection = uri.openConnection();
31         InputStream in = connection.getInputStream();
32         byte[] buf = new byte[1024];
33         int length = 0;
34         StringBuffer sb = new StringBuffer();
35         while ((length = in.read(buf, 0, buf.length)) > 0) {
36             sb.append(new String(buf, ECODING));
37         }
38         in.close();
39         return sb.toString();
40     }
41 
42     private List<String> getImageUrl(String HTML) {
43         Matcher matcher = Pattern.compile(IMGURL_REG).matcher(HTML);
44         List<String> listImgUrl = new ArrayList<String>();
45         while (matcher.find()) {
46             listImgUrl.add(matcher.group());
47         }
48         return listImgUrl;
49     }
50 
51     private List<String> getImageSrc(List<String> listImageUrl) {
52         List<String> listImgSrc = new ArrayList<String>();
53         for (String image : listImageUrl) {
54             Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image);
55             while (matcher.find()) {
56                 listImgSrc.add(matcher.group().substring(0, matcher.group().length() - 1));
57             }
58         }
59         return listImgSrc;
60     }
61 
62     private void Download(List<String> listImgSrc) {
63         try {
64             for (String url : listImgSrc) {
65                 String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
66                 URL uri = new URL(url);
67                 InputStream in = uri.openStream();
68                 FileOutputStream fo = new FileOutputStream(new File(imageName));
69                 byte[] buf = new byte[1024];
70                 int length = 0;
71                 System.out.println("开始下载:" + url);
72                 while ((length = in.read(buf, 0, buf.length)) != -1) {
73                     fo.write(buf, 0, length);
74                 }
75                 in.close();
76                 fo.close();
77                 System.out.println(imageName + "下载完成");
78             }
79         } catch (Exception e) {
80             System.out.println("下载失败");
81         }
82     }
83 
84 }
View Code
原文地址:https://www.cnblogs.com/A--Q/p/6219556.html