使用正则表达式匹配一个网页中的所有超级链接

 2 
 3 import java.io.BufferedReader;
 4 import java.io.File;
 5 import java.io.FileInputStream;
 6 import java.io.InputStream;
 7 import java.io.InputStreamReader;
 8 import java.util.regex.Matcher;
 9 import java.util.regex.Pattern;
10 
/**
 * Scans a locally saved web page and prints every hyperlink element
 * ({@code <a ...>...</a>}) it contains to standard output.
 *
 * <p>The file is decoded as GBK and examined one line at a time, so an anchor
 * whose opening and closing tags are on different lines is not reported.
 */
public class WebPageContentInput {
    /** Charset used to decode the saved page; without it non-ASCII text is garbled. */
    private static final String PAGE_CHARSET = "gbk";

    /**
     * Matches one complete anchor element within a single line.
     *
     * <p>The tag name must be exactly {@code a} (followed by whitespace and
     * attributes, or directly by {@code >}), so tags such as {@code <abbr>}
     * or {@code <area>} do not start a match. Matching is case-insensitive
     * because HTML tag names are, and the body is matched reluctantly so that
     * several anchors on one line are reported separately.
     */
    private static final Pattern ANCHOR_PATTERN =
            Pattern.compile("<a(?:\\s[^>]*)?>.*?</a>", Pattern.CASE_INSENSITIVE);

    /** Path of the saved web page to scan. */
    private String filePath = null;

    /**
     * Creates an instance with no file path; call {@link #setFilePath(String)}
     * before {@link #printContent()}.
     */
    public WebPageContentInput() {
    }

    /**
     * Creates an instance that scans the given file.
     *
     * @param filePath path of the saved web page
     */
    public WebPageContentInput(String filePath) {
        // Assign the field directly instead of calling the overridable setter
        // from a constructor.
        this.filePath = filePath;
    }

    /**
     * Sets the path of the saved web page to scan.
     *
     * @param filePath path of the saved web page
     */
    public void setFilePath(String filePath) {
        this.filePath = filePath;
    }

    /**
     * Returns the path of the saved web page to scan.
     *
     * @return the configured path, or {@code null} if none has been set
     */
    public String getFilePath() {
        return this.filePath;
    }

    /**
     * Prints every anchor element found in the file to standard output, one
     * per line, followed by a separator line and a completion message.
     *
     * @throws IllegalStateException if no file path has been set
     * @throws Exception if the file cannot be opened or read; I/O failures are
     *                   propagated to the caller rather than handled here
     */
    public void printContent() throws Exception {
        String path = this.getFilePath();
        if (path == null) {
            throw new IllegalStateException("filePath has not been set");
        }
        File file = new File(path);

        // try-with-resources closes both streams even when reading fails.
        // BufferedReader only accepts a Reader, so the byte stream is bridged
        // through an InputStreamReader with an explicit charset.
        try (InputStream in = new FileInputStream(file);
             BufferedReader reader =
                     new BufferedReader(new InputStreamReader(in, PAGE_CHARSET))) {
            String line;
            while ((line = reader.readLine()) != null) {
                Matcher matcher = ANCHOR_PATTERN.matcher(line);
                while (matcher.find()) {
                    System.out.println(matcher.group());
                }
            }
        }

        System.out.println("******************************************");
        System.out.println("finished!\nCongratulations!");
    }

}

测试一下：

 2 
/**
 * 题目：使用正则表达式匹配一个网页中的所有超级链接，<br>
 * 并输出所有的链接(或者将链接保存在文件中)<br><br>
 *
 * 程序设计思路：<br>
 * 1.先下载一个网页，保存到本地；<br>
 * 2.使用文件操作连接这个网页；<br>
 * 3.使用输入流把文件内容输入到程序中；<br>
 * 4.使用整行读取的方式逐行读取文件内容：如果这行字符串中含有链接则输出到控制台；<br>
 * 5.关闭文件。<br>
 *
 * 我在编写这程序时，输入流使用的是Reader类的子类BufferedReader（先用InputStreamReader把字节流转换为字符流，再进行包装），<br>
 * 因为BufferedReader有每次读取文件整行字符串的readLine()方法，而普通的字节流和未缓冲的字符流都没有；<br>
 * 而且，在读取的过程中，如果直接使用字节流或未缓冲的字符流，在获得完整的“链接子串”上有些不方便；<br>
 * 另外，因为对于每一步文件操作，都需要处理异常，为了使代码更加简洁，<br>
 * 我直接把所有的异常由执行筛选操作的方法抛出，再由调用者处理异常。<br><br>
 *
 * 消耗时间：4h
 *
 * 结果：能完整地截取所有的链接；
 * @author fzh
 *
 */
27 public class GetResult {
28     public static void main(String[] args){
29         /**
30          * 文件存储路径
31          */
32         String filePath = "/home/fzh/jquery.html";
33         WebPageContentInput wpct = new WebPageContentInput(filePath);
34         /**
35          * WebPageContentInput类的printContent()方法抛出的异常，在这里捕获。</br>
36          */
37         try{
38             wpct.printContent();
39         }catch(Exception e){
40             e.printStackTrace();
41         }
42     }
43 }

Zhihong Fu