使用Jsoup抓取网站上的图片、链接

以 http://www.zgfjqw.roboo.com/ 中国风景区网为例，我们想要抓取首页的所有图片。

使用Jsoup可以轻松实现。

代码及注释如下：

 1 package test;
 2 import java.io.*;
 3 import java.net.URL;
 4 import java.io.IOException;
 5 import org.jsoup.Jsoup;
 6 import org.jsoup.nodes.Document;
 7 import org.jsoup.nodes.Element;
 8 import org.jsoup.select.Elements;
 9 
10 public class DownloadImages 
11 {
12     public static void main(String[] args) throws IOException   
13     {
14         //目标网站
15         String url = "http://www.zgfjqw.roboo.com/";
16         //获取到的图片的存储位置，这里采用的是相对路径，存储到项目中的img文件夹下
17         String path = "img";
18         Document doc = Jsoup.connect(url).timeout(100000).get();
19         //获取所有图像元素
20         Elements results = doc.getElementsByTag("img");
21         for(Element e : results)
22         {
23             //获取图像的url
24             String src = e.absUrl("src");
25             //存储到本地的函数
26             storeImg(src, path);
27         }
28         System.out.println("运行结束");
29     }
30     
31     //从文件的url获得其文件名，比如http://www.zgfjqw.roboo.com/a/b/c.gif转化为c.gif
32     private static String getName(String src)
33     {
34         int index = src.lastIndexOf('/');
35         return src.substring(index);
36     }
37 
38     //将src存储到path中
39     private static void storeImg(String src, String path) throws IOException 
40     {        
41         String name = getName(src);
42         URL url = new URL(src);
43         InputStream in = url.openStream();
44         OutputStream out = new BufferedOutputStream(new FileOutputStream(path + name));
45         for(int r;(r = in.read()) != -1; )
46         {
47             out.write(r);
48         }
49         in.close();
50         out.close();
51     }
52 }

以 http://www.cnblogs.com/huoxiayu/ 我的博客为例，我们想要抓取该页面的所有链接。

 1 package test;
 2 import java.io.IOException;
 3 import org.jsoup.Jsoup;
 4 import org.jsoup.nodes.Document;
 5 import org.jsoup.nodes.Element;
 6 import org.jsoup.select.Elements;
 7 
 8 public class ExtractLinks 
 9 {
10     public static void main(String[] args) throws IOException   
11     {
12         String url = "http://www.cnblogs.com/huoxiayu/";
13         Document doc = Jsoup.connect(url).get();
14         Elements results = doc.select("a[href]");
15         System.out.println("Total " + results.size() + " links");
16         for(Element e : results)
17         {
18             String name = e.text();
19             String href = e.attr("abs:href");
20             System.out.println(name + " : " + href);
21         }
22         System.out.println("运行结束");
23     }
24 }