记录一下利用自己掌握的技术,把某个网站上的 PPT 全部下载下来的过程

1、该网站的 PPT 链接全部都在页面上,先用正则表达式把所有链接提取出来,以 txt 文件的形式保存到指定位置。文件中每个 PPT 占两行:标题行以“【标题】”开头,下载地址行以“【下载地址】”开头,格式如下:

【标题】某个演讲的标题
【下载地址】该演讲 PPT 的下载链接

2、写一个 Java 程序读取上面的 txt 文件,逐条获取跳转后的真实地址并下载,代码如下:

  1 package platform;
  2 
  3 import java.io.BufferedInputStream;
  4 import java.io.BufferedOutputStream;
  5 import java.io.BufferedReader;
  6 import java.io.File;
  7 import java.io.FileInputStream;
  8 import java.io.FileNotFoundException;
  9 import java.io.FileOutputStream;
 10 import java.io.IOException;
 11 import java.io.InputStreamReader;
 12 import java.io.UnsupportedEncodingException;
 13 import java.net.HttpURLConnection;
 14 import java.net.URL;
 15 import java.util.HashMap;
 16 import java.util.Map;
 17 
 18 import org.apache.http.HttpResponse;
 19 import org.apache.http.client.ClientProtocolException;
 20 import org.apache.http.client.methods.HttpPost;
 21 import org.apache.http.impl.client.DefaultHttpClient;
 22 
 23 public class TestQConDownload {
 24 
 25     public static void main(String[] args) {
 26         BufferedReader bufferedReader;
 27         String lineTxt = null;
 28         String title="1";
 29         String url="";
 30         try {
 31             //读文件
 32             bufferedReader = readTxtFile("E:\test\downinfo.txt");
 33             //循环遍历每行
 34             while((lineTxt = bufferedReader.readLine()) != null){
 35                 if(lineTxt.startsWith("【标题】")){
 36                     title = lineTxt.substring(4).replaceAll(":", "");
 37                     System.out.println(title);
 38                 }
 39                 if(lineTxt.startsWith("【下载地址】")){
 40                     url= lineTxt.substring(6);
 41                     //获取跳转后的地址
 42                     url = getRedirectLocation(url);
 43                     System.out.println(url);
 44                     //下载到指定位置
 45                     downloadFile(url, "E:\test\download\"+title+".pdf");
 46                 }
 47             }
 48             bufferedReader.close();
 49         } catch (UnsupportedEncodingException e) {
 50             // TODO Auto-generated catch block
 51             e.printStackTrace();
 52         } catch (FileNotFoundException e) {
 53             // TODO Auto-generated catch block
 54             e.printStackTrace();
 55         } catch (IOException e) {
 56             // TODO Auto-generated catch block
 57             e.printStackTrace();
 58         }
 59         
 60     }
 61     
 62     public static String getRedirectLocation(String url) throws ClientProtocolException, IOException {
 63         String SEND_MESSAGE_URL = url;
 64         Map<String, Object> params = new HashMap<String, Object>();
 65         HttpPost get = new HttpPost(SEND_MESSAGE_URL);
 66         get.setHeader("Cookie", "dx_un=%E5%B9%B4%E8%BD%BB%E7%9A%84%E7%96%AF%E5%AD%90; dx_avatar=http%3A%2F%2F7xil0e.com1.z0.glb.clouddn.com%2Fuser_580d84f25ea61.png; dx_token=0c6b719ffff50f3746b64f058cb4e719");
 67         get.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
 68         get.setHeader("Accept-Encoding", "zh-CN,zh;q=0.8");
 69         get.setHeader("Connection", "keep-alive");
 70         get.setHeader("Host", "ppt.geekbang.org");
 71         get.setHeader("Referer", "http://2016.qconshanghai.com/schedule");
 72         get.setHeader("Upgrade-Insecure-Requests", "1");
 73         get.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36");
 74 
 75         // 设置编码
 76         HttpResponse re = new DefaultHttpClient().execute(get);
 77         /*if (re.getStatusLine().getStatusCode() == 200) {// 如果状态码为200,就是正常返回
 78             String result = EntityUtils.toString(re.getEntity());
 79             System.out.println(result);
 80         }*/
 81         String location = re.getFirstHeader("Location").getValue();
 82         get.releaseConnection();
 83         return location;
 84     }
 85     
 86     /**  
 87      * 下载远程文件并保存到本地  
 88      * @param remoteFilePath 远程文件路径   
 89      * @param localFilePath 本地文件路径  
 90      */
 91     public static void downloadFile(String remoteFilePath, String localFilePath)
 92     {
 93         URL urlfile = null;
 94         HttpURLConnection httpUrl = null;
 95         BufferedInputStream bis = null;
 96         BufferedOutputStream bos = null;
 97         File f = new File(localFilePath);
 98         try
 99         {
100             urlfile = new URL(remoteFilePath);
101             httpUrl = (HttpURLConnection)urlfile.openConnection();
102             httpUrl.connect();
103             bis = new BufferedInputStream(httpUrl.getInputStream());
104             bos = new BufferedOutputStream(new FileOutputStream(f));
105             int len = 2048;
106             byte[] b = new byte[len];
107             while ((len = bis.read(b)) != -1)
108             {
109                 bos.write(b, 0, len);
110             }
111             bos.flush();
112             bis.close();
113             httpUrl.disconnect();
114         }
115         catch (Exception e)
116         {
117             e.printStackTrace();
118         }
119         finally
120         {
121             try
122             {
123                 bis.close();
124                 bos.close();
125             }
126             catch (IOException e)
127             {
128                 e.printStackTrace();
129             }
130         }
131     }
132     
133     public static BufferedReader readTxtFile(String filePath) throws UnsupportedEncodingException, FileNotFoundException{
134                 String encoding="UTF-8";
135                 File file=new File(filePath);
136                     InputStreamReader read = new InputStreamReader(
137                     new FileInputStream(file),encoding);//考虑到编码格式
138                     BufferedReader bufferedReader = new BufferedReader(read);
139                     return bufferedReader;
140     }
141 }
原文地址:https://www.cnblogs.com/flying607/p/5993409.html