Java 爬虫案例

package com.zjazn;

import com.sun.org.apache.bcel.internal.generic.RETURN;
import com.sun.xml.internal.ws.api.server.InstanceResolver;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import sun.net.www.http.HttpClient;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

public class Data {

   //2、解析数据
    public static void main(String[] args) {
        String html = getData();
        Document htmledThisDocument = Jsoup.parse(html);
        List<MyData> myData=new ArrayList<MyData>();
        Elements courses = htmledThisDocument.select(".learn-path-container>div");
        for (Element course:courses){
            String courseName = course.select("a>div").first().text();
            String courseNum = course.select("a>div").last().text();
            if(courseNum.indexOf("门")>-1){
                int num = Integer.parseInt(courseNum.substring(0, courseNum.indexOf("门")));
                String imgPath = course.select("a>img").attr("src");
                String fuffix = imgPath.substring(imgPath.lastIndexOf("."));
                MyData myData6 = new MyData();
                    myData6.setName(courseName);
                    myData6.setImgPath(imgPath);
                    myData6.setNum(num);
                myData.add(myData6);
                downloadFile(imgPath,"E://myimg",courseName+fuffix);
            }
        }
        System.out.println(myData.toString());


    }
  //1、获取数据(html)
public static String getData(){ CloseableHttpClient httpClient = HttpClients.createDefault(); HttpGet httpGet = new HttpGet("https://www.lanqiao.cn/paths/"); CloseableHttpResponse response=null; HttpEntity entity=null; String html=null; try { response = httpClient.execute(httpGet);//发送请求 if(response.getStatusLine().getStatusCode() ==200){ entity = response.getEntity();//获取html html= EntityUtils.toString(entity,"UTF-8");//用指定编码解析html } return html; } catch (IOException e) { e.printStackTrace(); } return null; }
//传入资源链接,下载资源的方法,比如下载图片   
public static void downloadFile(String urlStr,String directory,String fileName){//#有些页面抓取不了,网站设置了反爬,拒绝反爬取,请看最下面
        FileOutputStream out =null;
        InputStream in=null;
        try {
            URL url=new URL(urlStr);
            URLConnection urlConnection = url.openConnection();
            in=urlConnection.getInputStream();
            byte[] buf=new byte[1000];
            File dir = new File(directory);
            if(!dir.exists() ){
                dir.mkdir();
            }
            out=new FileOutputStream(directory+"\"+fileName);
            int len=-1;
            while ((len=in.read(buf))!=-1){
                out.write(buf,0,len);

            }

        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            try {
                if(in != null){
                    in.close();
                }
                if (out !=null){
                    out.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}
package com.zjazn;

import lombok.Data;

/**
 * One entry scraped by {@code Data.main}: a name, an image location and a
 * course count.
 *
 * <p>No accessors are declared here; the setters and {@code toString()} that
 * {@code Data.main} relies on are expected to be generated by Lombok's
 * {@code @Data} annotation.
 */
@Data
public class MyData {
    /** Name text, taken from the first {@code a>div} label of an entry. */
    private String name;
    /** Value of the {@code src} attribute of the entry's {@code a>img} element. */
    private String imgPath;
    /** Number parsed from the text preceding "门" in the entry's last label. */
    private Integer num;

}

## 应对反爬取:模拟浏览器 User-Agent 发送请求

/**
 * Fetches a page with a browser-like User-Agent header, for sites that
 * reject requests which do not look like they come from a browser.
 *
 * @param TargetUrl URL of the page to fetch
 * @return the response body decoded as UTF-8; an empty string when the body is empty
 * @throws IOException if the URL is malformed, the connection fails or reading fails
 */
public static String getData(String TargetUrl) throws IOException {
    URL url = new URL(TargetUrl); // target URL

    // Imitate a browser so the server does not refuse the request as a crawler.
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("GET");
    conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36");

    // Start from an empty builder: appending to a null String would prefix
    // the result with the literal text "null".
    StringBuilder html = new StringBuilder();
    // try-with-resources closes the reader even when read() throws.
    try (BufferedReader fr = new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
        char[] chars = new char[1024];
        int len;
        while ((len = fr.read(chars)) != -1) {
            html.append(chars, 0, len);
        }
    }
    return html.toString();
}
原文地址:https://www.cnblogs.com/zjazn/p/14188395.html