新浪新闻页面抓取(JAVA-Jsoup)

1、使用gradle建立工程:

    工程格式如下:

// Register the single sub-project of this build.
include ':spider-demo'

// Name of the root project; the root build script uses project.name when naming its zip archives.
rootProject.name = 'my-spider-demo'
settings.gradle
/**
 * Forces a version on a requested dependency when its group matches.
 *
 * @param details dependency resolve details (exposes `requested` and `useVersion`)
 * @param group   group id to match against the requested dependency
 * @param version version to apply when the group matches
 */
def void forceVersion(details, group, version) {
    def requested = details.requested
    if (requested.group != group) {
        return
    }
    details.useVersion(version)
}

/**
 * Forces a version on a requested dependency when both its group and its
 * module name match.
 *
 * @param details dependency resolve details (exposes `requested` and `useVersion`)
 * @param group   group id to match
 * @param name    module name to match
 * @param version version to apply when group and name both match
 */
def void forceVersion(details, group, name, version) {
    def requested = details.requested
    boolean matches = requested.group == group && requested.name == name
    if (matches) {
        details.useVersion(version)
    }
}

// Configuration shared by the root project and every sub-project.
allprojects { p ->
    group = 'com.my.spider'
    version = '1.0.0'

    // Timestamp (year, month, day, hour, minute) used in artifact names and the jar manifest.
    // Declared first so that everything below can rely on it.
    p.ext.Timestamp = new Date().format('yyyyMMddHHmm')
    // Build number taken from the BUILD_NUMBER environment variable;
    // falls back to 'x' when it is unset or empty.
    p.ext.BuildNumber = System.env.BUILD_NUMBER ?: 'x'

    apply plugin: 'java'
    apply plugin: 'maven'
    apply plugin: 'maven-publish'

    [compileJava, compileTestJava]*.options*.encoding = 'UTF-8'

    // Fill in the manifest right before the jar is assembled.
    jar.doFirst {
        manifest {
            // Merge a hand-written manifest when the project ships one.
            def manifestFile = "${projectDir}/META-INF/MANIFEST.MF"
            if (new File(manifestFile).exists())
                from (manifestFile)

            attributes 'Implementation-Title': p.name
            // Snapshot builds get the timestamp appended so individual builds can be told apart.
            if (p.version.endsWith('-SNAPSHOT')) {
                attributes 'Implementation-Version': p.version + '-' + p.ext.Timestamp
            } else {
                attributes 'Implementation-Version': p.version
            }
            attributes 'Implementation-BuildDateTime': new Date()
        }
    }

    javadoc {
        options {
            encoding 'UTF-8'
            charSet 'UTF-8'
            author false
            version true
            // javadoc's -link option takes the directory that holds the external API docs,
            // not the index.html page inside it.
            links 'http://docs.oracle.com/javase/8/docs/api/'
            memberLevel = org.gradle.external.javadoc.JavadocMemberLevel.PRIVATE
        }
    }

    // Only *-api projects build sources and javadoc jars.
    if (p.name.endsWith('-api')) {
        task sourcesJar(type: Jar, dependsOn: classes) {
            classifier = 'sources'
            from sourceSets.main.allSource
        }

        task javadocJar(type: Jar, dependsOn: javadoc) {
            classifier = 'javadoc'
            from javadoc.destinationDir
        }
    }

    publishing {
        repositories {
            maven {
                credentials {
                    username "${repositoryUploadUsername}"
                    password "${repositoryUploadPassword}"
                }

                // Snapshots and releases are uploaded to different repositories.
                if (version.endsWith('-SNAPSHOT')) {
                    url "${repositoryUploadSnapshotUrl}"
                } else {
                    url "${repositoryUploadReleaseUrl}"
                }
            }
        }
        publications {
            mavenJava(MavenPublication) {
                from components.java

                // Only *-api projects need to publish sources and javadoc.
                if (p.name.endsWith('-api')) {
                    artifact sourcesJar {
                        classifier "sources"
                    }
                    artifact javadocJar {
                        classifier "javadoc"
                    }
                }
            }
        }
    }

    // Publish as part of `build` when the uploadArchives environment variable is set.
    if (System.env.uploadArchives) {
        build.dependsOn publish
    }

    buildscript {
        repositories {
            maven {
                name 'Maven Repository'
                url "${repositoryMavenUrl}"
                credentials {
                    username "${repositoryUsername}"
                    password "${repositoryPassword}"
                }
            }
        }
        dependencies { classpath 'org.springframework.boot:spring-boot-gradle-plugin:1.4.0.RELEASE' }
    }

    afterEvaluate { Project project ->
        if (project.pluginManager.hasPlugin('java')) {
            configurations.all {
                // Pin the versions of these groups across all configurations.
                resolutionStrategy.eachDependency { DependencyResolveDetails details ->
                    forceVersion details, 'org.springframework.boot', '1.4.1.RELEASE'
                    forceVersion details, 'org.slf4j', '1.7.21'
                    forceVersion details, 'org.springframework', '4.3.3.RELEASE'
                }

                exclude module: 'slf4j-log4j12'
                exclude module: 'log4j'
            }

            dependencies { testCompile 'junit:junit:4.12' }
        }
    }

    repositories {
        maven {
            name 'Maven Repository'
            url "${repositoryMavenUrl}"
            credentials {
                username "${repositoryUsername}"
                password "${repositoryPassword}"
            }
        }

        ivy {
            name 'Ivy Repository'
            url "${repositoryIvyUrl}"
            credentials {
                username "${repositoryUsername}"
                password "${repositoryPassword}"
            }
            layout "pattern", {
                artifact '[organisation]/[module]/[revision]/[type]s/[artifact]-[revision].[ext]'
                ivy '[organisation]/[module]/[revision]/[type]s/[artifact].[ext]'
                m2compatible = true
            }
        }
    }
}

// Packs the project sources into <name>-<version>-<timestamp>.<buildNumber>-sources.zip,
// leaving out hidden files and the per-project output/working directories.
task zipSources(type: Zip) {
    description '压缩源代码'

    def sourcesZipName = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-sources.zip'
    // Exposed on the project so other tasks can locate the archive by name.
    project.ext.zipSourcesFile = sourcesZipName
    archiveName = sourcesZipName
    includeEmptyDirs = false

    from project.projectDir

    exclude '**/.*'
    exclude 'build/*'
    // Skip the generated and runtime directories of every project in the build.
    for (prj in allprojects) {
        for (dirName in ['bin', 'build', 'data', 'work', 'logs']) {
            exclude '**/' + prj.name + '/' + dirName + '/*'
        }
    }
}

/**
 * Builds the copy spec that lays out one application inside the setup archive:
 * its jar, example configs, Windows start script and Unix conf file.
 *
 * @param prj     project whose artifacts are packaged
 * @param dstname target directory inside the archive; defaults to the project name
 * @return the assembled CopySpec
 */
def CopySpec appCopySpec(Project prj, dstname = null) {
    def targetDir = dstname ?: prj.name
    def projectRoot = prj.projectDir.toString()
    def jarPath = prj.buildDir.toString() + '/libs/' + prj.name + '-' + project.version + '.jar'

    return copySpec {
        // Fat jar
        from(jarPath) {
            into targetDir
        }

        // Example configuration files
        from(projectRoot + '/config/examples') {
            into targetDir + '/config'
        }

        // Windows start script
        from(projectRoot + '/' + prj.name + '.bat') {
            into targetDir
        }

        // Unix conf script, renamed so that it carries the version
        from(projectRoot + '/' + prj.name + '.conf') {
            into targetDir
            rename prj.name, prj.name + '-' + project.version
        }
    }
}


// Builds the installation package <name>-<version>-<timestamp>.<buildNumber>-setup.zip
// after all sub-projects have been built.
task zipSetup(type: Zip, dependsOn: subprojects.build) {
    description '制作安装包'

    def setupZipName = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-setup.zip'
    // Exposed on the project so other tasks can locate the archive by name.
    project.ext.zipSetupFile = setupZipName
    archiveName = setupZipName

    with appCopySpec(project(':spider-demo'))
}

import java.security.MessageDigest

/**
 * Computes the MD5 checksum of a file.
 *
 * @param file the file to hash (anything offering withInputStream, e.g. java.io.File)
 * @return the digest as a 32-character lowercase hexadecimal string
 */
def generateMD5(final file) {
    MessageDigest digest = MessageDigest.getInstance("MD5")
    file.withInputStream { is ->
        byte[] buffer = new byte[8192]
        int read
        // read() signals end of stream with -1.
        while ((read = is.read(buffer)) != -1) {
            digest.update(buffer, 0, read)
        }
    }
    // Pad to a fixed width: BigInteger.toString(16) would drop leading zeros
    // and yield a checksum shorter than 32 characters.
    return String.format('%032x', new BigInteger(1, digest.digest()))
}

// Prints the MD5 checksums of the setup and sources archives and writes them to
// build/distributions/<name>-<version>-<timestamp>.<buildNumber>-md5.txt.
task md5(dependsOn: [zipSetup, zipSources]) {
    // doLast replaces the `<<` shortcut, which newer Gradle versions deprecate and then remove.
    doLast {
        String md5_setup = generateMD5(file("${projectDir}/build/distributions/" + project.ext.zipSetupFile))
        String md5_sources = generateMD5(file("${projectDir}/build/distributions/" + project.ext.zipSourcesFile))
        println project.ext.zipSetupFile + '=' + md5_setup
        println project.ext.zipSourcesFile + '=' + md5_sources

        def newFile = new File("${projectDir}/build/distributions/"
                        + project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-md5.txt')
        // withPrintWriter flushes and closes the writer even when a write fails.
        newFile.withPrintWriter { writer ->
            writer.println project.ext.zipSetupFile + '=' + md5_setup
            writer.println project.ext.zipSourcesFile + '=' + md5_sources
        }
    }
}

// The root `build` also builds every sub-project, creates both zip archives and writes their MD5 checksums.
build.dependsOn subprojects.build, zipSetup, zipSources, md5
build.gradle

子工程相关依赖:

// Build script of the sub-project: a Spring Boot application with an 'application' distribution.
apply plugin: 'spring-boot'
apply plugin: 'application'

// Ship the example configuration files in the distribution under config/.
distributions {
    main {
        contents {
            from ("${projectDir}/config/examples") {
                into "config"
            }
        }
    }
}

// Do not produce the tar distribution.
distTar.enabled = false

springBoot {
    executable = true
    mainClass = 'com.my.spider.Application'
}

dependencies {
    compile 'org.springframework.boot:spring-boot-starter-web:1.4.0.RELEASE'
    compile 'dom4j:dom4j:1.6.1'
    // NOTE(review): this is HttpClient 3.x; classes under org.apache.http.* (HttpClient 4.x)
    // are presumably resolved transitively - confirm, or declare that dependency explicitly.
    compile 'commons-httpclient:commons-httpclient:3.1'
    compileOnly 'com.h2database:h2:1.4.191'
    compile 'javax.cache:cache-api:1.0.0'
    compile 'org.jboss.resteasy:resteasy-jaxrs:3.0.14.Final'
    compile 'org.jboss.resteasy:resteasy-client:3.0.14.Final'
    // Axis
    compile 'axis:axis:1.4'
    
    // HTML fetching and parsing
    compile 'org.jsoup:jsoup:1.10.1'
    
    // JSON parsing
    compile 'com.alibaba:fastjson:1.2.21'
    
}
build.gradle(子工程)

2、代码编写:

    入口:

package com.my.spider;

import java.io.IOException;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.annotation.EnableScheduling;

import com.my.spider.utils.CommonProperties;

/**
 * Spring Boot entry point of the spider application.
 *
 * Before the context starts, the properties files found in the config location
 * are copied into the system properties, together with the application version
 * and the application home directory (the parent of the config location).
 */
@SpringBootApplication
@EnableScheduling
@EnableAsync
public class Application {

    /**
     * Loads the external configuration, publishes the derived system properties
     * and starts Spring Boot.
     *
     * @param args command line arguments, passed on to Spring Boot
     * @throws IOException if a configuration file cannot be read
     */
    public static void main(String[] args) throws IOException {
        final String requestedLocation = System.getProperty("spring.config.location");
        final String configLocation = CommonProperties.loadProperties2System(requestedLocation);
        final String version = CommonProperties.getVersion(Application.class);

        System.setProperty("application.version", version);
        System.setProperty("app.home", configLocation + "/..");

        SpringApplication.run(Application.class, args);
    }

}
package com.my.spider.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

import org.springframework.util.StringUtils;

/**
 * Helpers for loading external configuration into the JVM system properties
 * and for reading application-level settings back from them.
 */
public final class CommonProperties {

    /** System property key that holds the application home directory. */
    public static final String PPT_KEY_APP_HOME = "app.home";

    /** Application home used when {@link #PPT_KEY_APP_HOME} is not set. */
    public static final String DEFAULT_APP_HOME = "./";

    /**
     * Returns the application home directory.
     *
     * @return the value of the {@code app.home} system property, or
     *         {@link #DEFAULT_APP_HOME} when it is not set
     */
    public static final String getAppHome() {
        // Look up the property by its key; the default value is only the fallback.
        return System.getProperty(PPT_KEY_APP_HOME, DEFAULT_APP_HOME);
    }

    /**
     * Loads every {@code *.properties} file found directly inside the config
     * directory into the system properties.
     *
     * @param location config directory; when {@code null} or empty,
     *                 {@code ./config} is used if it is a directory, otherwise
     *                 {@code ../config}
     * @return the config location that was actually used
     * @throws IOException if the location is not a readable directory or a
     *                     properties file cannot be read
     */
    public static String loadProperties2System(String location) throws IOException {
        String configLocation = location;
        if (configLocation == null || configLocation.isEmpty()) {
            configLocation = "./config";
            if (!new File(configLocation).isDirectory()) {
                configLocation = "../config";
            }
        }

        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] candidates = new File(configLocation).listFiles();
        if (candidates == null) {
            throw new IOException("Config location is not a readable directory: " + configLocation);
        }

        for (File file : candidates) {
            if (file.isFile() && file.getName().endsWith(".properties")) {
                Properties ppt = new Properties();
                try (FileInputStream fi = new FileInputStream(file)) {
                    ppt.load(fi);
                }
                System.getProperties().putAll(ppt);
            }
        }
        return configLocation;
    }

    /**
     * Reads the implementation version from the manifest of the package that
     * contains the given class.
     *
     * @param clazz a class from the package of interest
     * @return the implementation version, or {@code "undefined"} when the
     *         package or its version is unavailable
     */
    public static String getVersion(Class<?> clazz) {
        Package pkg = clazz.getPackage();
        String ver = (pkg != null ? pkg.getImplementationVersion() : null);
        return (ver == null ? "undefined" : ver);
    }
}

配置类:

package com.my.spider.config;

import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Configuration class that enables scheduling and component-scans the
 * packages listed in {@code basePackages}.
 */
@EnableScheduling
@Configuration
@ComponentScan(basePackages = {
        "com.my.spider.rs",
        "com.my.spider.schedule"
})
public class AppAutoConfiguration {

}

META-INF下spring.factories文件:

org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
    com.my.spider.config.AppAutoConfiguration

3、功能代码:

定时任务抽象类,提供三种定时任务的调用方法:

package com.my.spider.schedule;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * Base class for scheduled jobs. It declares three scheduled entry points,
 * each of which delegates to an abstract method that subclasses implement.
 */
@Component
public abstract class ParentSchedule implements InitializingBean,DisposableBean{

    public static Logger logger = LoggerFactory.getLogger(ParentSchedule.class);
    
    public final static ObjectMapper objectMapper = new ObjectMapper();
    
    /**
     * Fixed-delay trigger: the initial delay comes from agent.task.initialDelay
     * (default 1000) and the delay between runs from agent.task.fixedDelay
     * (default 10000).
     */
    @Scheduled(
            initialDelayString = "${agent.task.initialDelay:1000}", //
            fixedDelayString = "${agent.task.fixedDelay:10000}")
    public void dowork(){
        execute();
    }
    // Scheduled task 1: run by the fixed-delay trigger.
    public abstract void execute();

    /**
     * Cron trigger taken from agent.task.cron; the default expression fires at
     * 10:00, 14:00 and 16:00 every day.
     */
    @Scheduled(cron = "${agent.task.cron:0 0 10,14,16 * * ?}")
    public void timeTask(){
        executeTimeTask();
    }
    // Scheduled task 2: run by the configurable cron trigger.
    public abstract void executeTimeTask();
    
    // Fires every day at 12:00.
    @Scheduled(cron = "0 0 12 * * ?")
    public void otherTask(){
        executeOtherTask();
    }
    // Scheduled task 3: run by the daily 12:00 trigger.
    public abstract void executeOtherTask();
}
package com.my.spider.utils;

import java.util.HashMap;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Shared request settings for page scraping: two predefined sets of HTTP
 * request headers that are filled in once by the static initializer.
 */
public class HttpHtmlUtils {

    public static Logger logger = LoggerFactory.getLogger(HttpHtmlUtils.class);
    
    // Header set with a Firefox User-Agent and X-Requested-With: XMLHttpRequest.
    public static Map<String, String> header = new HashMap<String, String>();
    
    // Header set with a Chrome-based User-Agent.
    public static Map<String, String> header_a = new HashMap<String, String>();
    
    static {
        // Populate the request headers.
        header.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0");
        header.put("Accept","text/javascript, text/html, application/xml, text/xml, */*");
        header.put("Accept-Language","zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
        header.put("Accept-Encoding","gzip, deflate");
        header.put("X-Requested-With","XMLHttpRequest");
        header.put("Content-Type","text/*, application/xml");
        header.put("Connection","keep-alive");
        
        header_a.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0");
        header_a.put("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        header_a.put("Accept-Language","zh-CN,zh;q=0.8");
        header_a.put("Accept-Encoding","gzip, deflate, sdch");
        header_a.put("Content-Type","application/octet-stream");
        header_a.put("Connection","keep-alive");
        header_a.put("Upgrade-Insecure-Requests", "1");
    }
    
    
}

新浪滚动新闻抓取实现下载和分析:

package com.my.spider.schedule;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;

import com.my.spider.utils.FileUtils;
import com.my.spider.utils.HttpHtmlUtils;

/**
 * Scrapes the Sina rolling-news list page and the article pages it links to.
 *
 * The scheduled hooks inherited from {@link ParentSchedule} are still empty;
 * the scraping logic lives in the static helpers and is driven by
 * {@link #main(String[])}.
 */
@Component
public class SinaSchedule extends ParentSchedule {

    private static Logger logger = LoggerFactory.getLogger(SinaSchedule.class);

    // NOTE(review): SimpleDateFormat is not thread-safe and these instances are shared
    // statics; revisit before the scraping methods are called concurrently.
    public static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");

    public static SimpleDateFormat sdfYMD = new SimpleDateFormat("yyyy-MM-dd");

    /** Timeout in milliseconds for each page request. */
    private static int downloadtimeout = 5000;

    /** Article URLs collected by {@link #main(String[])}. */
    public static Set<String> titleSet = new HashSet<String>();

    @Value("${img.download.dir.prefix:D://testhtml}")
    public String dirpath;

    @Override
    public void afterPropertiesSet() throws Exception {
        // Nothing to initialise yet.
    }

    /**
     * Fetches the news list page and collects the absolute URLs of the listed articles.
     *
     * @param url address of the list page
     * @return the article URLs (possibly empty), or {@code null} when the page
     *         could not be fetched
     */
    public static List<String> getArticleList(String url) {

        List<String> urlList = new ArrayList<String>();
        logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date()));

        try {
            Document document = Jsoup.connect(url)
                    .headers(HttpHtmlUtils.header_a)
                    .timeout(downloadtimeout)
                    .get();

            // The list is the first <ul> inside the first element with class d_list_txt.
            Element listBox = document.getElementsByClass("d_list_txt").first();
            Element ul = (listBox == null ? null : listBox.getElementsByTag("ul").first());
            if (ul != null) {
                for (Element el : ul.getElementsByTag("li")) {
                    Element anchor = el.getElementsByTag("a").first();
                    if (anchor == null) {
                        // Entry without a link: nothing to download.
                        continue;
                    }
                    String elUrl = anchor.absUrl("href");
                    String urlName = anchor.text();
                    Element timeEl = el.getElementsByClass("c_time").first();
                    String time = (timeEl == null ? "" : timeEl.text());
                    logger.debug("获取新闻:{},访问地址:{},时间:{}", urlName, elUrl, time);
                    // absUrl yields an empty string when the href cannot be resolved.
                    if (!StringUtils.isEmpty(elUrl)) {
                        urlList.add(elUrl);
                    }
                }
            }
            logger.debug("获取文章列表信息:结束时间={}", sdf.format(new Date()));
            return urlList;
        } catch (IOException e) {
            logger.error("访问文章列表失败:" + url + "  原因" + e.getMessage(), e);
        }
        return null;
    }

    /**
     * Fetches one article page and extracts its details.
     *
     * @param url address of the article page
     * @return a map with the keys {@code title}, {@code tag} (comma separated
     *         keywords), {@code description}, {@code content} (HTML),
     *         {@code contentText} (plain text) and {@code imgs} (list of image
     *         URLs); {@code null} when the page could not be fetched
     */
    public static Map<String, Object> getArticleInfo(String url) {

        logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date()));
        try {
            Map<String, Object> map = new HashMap<String, Object>();
            Document document = Jsoup.connect(url)
                    .headers(HttpHtmlUtils.header)
                    .timeout(downloadtimeout)
                    .get();

            Element titleEl = document.getElementById("artibodyTitle");
            String tilte = (titleEl != null ? titleEl.text() : "");

            // Keywords: join the link texts of the first keyword box, if the page has one.
            StringBuilder tags = new StringBuilder();
            Element keywordBox = document.getElementsByClass("article-keywords").first();
            if (keywordBox != null) {
                for (Element t : keywordBox.getElementsByTag("a")) {
                    String text = t.text();
                    if (StringUtils.isEmpty(text)) {
                        continue;
                    }
                    if (tags.length() > 0) {
                        tags.append(",");
                    }
                    tags.append(text);
                }
            }
            String tag = tags.toString();

            String description = "";
            Element descEle = document.getElementsByAttributeValue("name", "description").first();
            if (descEle != null) {
                description = descEle.attr("content");
            }

            // Body and images; some pages have no artibody element at all.
            String content = "";
            String contentText = "";
            List<String> imgUrls = new ArrayList<>();
            Element contentEle = document.getElementById("artibody");
            if (contentEle != null) {
                content = contentEle.html();
                contentText = contentEle.text();
                for (Element img : contentEle.getElementsByTag("img")) {
                    String imgUrl = img.attr("src");
                    if (!StringUtils.isEmpty(imgUrl)) {
                        imgUrls.add(imgUrl);
                    }
                }
            }

            map.put("imgs", imgUrls);
            map.put("description", description);
            map.put("content", content);
            map.put("contentText", contentText);
            map.put("tag", tag);
            map.put("title", tilte);
            logger.debug("获取文章信息:结束时间={}", sdf.format(new Date()));

            return map;
        } catch (IOException e) {
            logger.error("访问文章页失败:" + url + "  原因" + e.getMessage(), e);
        }
        return null;
    }

    @Override
    public void destroy() throws Exception {
        // Nothing to release yet.
    }

    /**
     * Manual entry point: fetches the rolling-news list, then downloads and
     * parses every listed article.
     */
    public static void main(String[] args) {
        List<String> url = getArticleList("http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=0"
                    + "1&k=&offset_page=0&offset_num=0&num=60&asc=&page=1");
        // getArticleList returns null when the list page could not be fetched.
        if (url != null) {
            titleSet.addAll(url);
        }
        logger.debug("此次共获取到{}个", titleSet.size());

        for (String urlStr : titleSet) {
            try {
                // Save the raw page first, then parse it.
                FileUtils.downloadYunFile(urlStr, "D://testhtml//sina//" + sdfYMD.format(new Date()));

                getArticleInfo(urlStr);
            } catch (Throwable e) {
                // One failing article must not stop the rest, but it must not vanish silently either.
                logger.error("处理文章失败:" + urlStr, e);
            }
        }
    }

    @Override
    public void execute() {
        // Not implemented yet.
    }

    @Override
    public void executeTimeTask() {
        // Not implemented yet.
    }

    @Override
    public void executeOtherTask() {
        // Not implemented yet.
    }

}
View Code

下载html文件代码:

package com.my.spider.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.Arrays;

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpMethod;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.http.client.ClientHttpRequestFactory;
import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
import org.springframework.util.StreamUtils;
import org.springframework.web.client.RestTemplate;
import org.springframework.web.util.UriComponentsBuilder;

import com.fasterxml.jackson.databind.ObjectMapper;

public class FileUtils {

    private static final Logger logger = LoggerFactory.getLogger(FileUtils.class);

    private static ObjectMapper _objectMapper = new ObjectMapper();

    private static int downloadTimeout = 5000;

    public static void main(String[] args) throws Throwable {
        String filePath = "/temp/temp/test.mpg";
        String dirPrex = "/temp&Z:\\";
        String[] paths = dirPrex.split("&");
        System.out.println(paths[1] + filePath.substring(paths[0].length() + 1).replace("/", "\"));
    }

    // 文件复制
    public static void copy(String src, String dest) throws IOException {

        System.out.println("正在拷贝【" + src + "】到【" + dest + "】
");
        File destFile = new File(dest);
        if (!destFile.exists()) {
            String dir = dest.substring(0, dest.lastIndexOf(File.separator));
            File dirF = new File(dir);
            if (!dirF.exists() || !dirF.isDirectory()) {
                dirF.mkdirs();
            }
            destFile.createNewFile();
        }
        FileInputStream in = new FileInputStream(src);
        FileOutputStream out = new FileOutputStream(dest);
        byte[] buffer = new byte[40960];
        while (in.read(buffer) != -1) {
            out.write(buffer);
            out.flush();
        }
        in.close();
        out.close();
    }

    // 下载云文件
    public static String downloadYunFile(String url, String dir) throws Throwable {

        String fileName = getFileName(url);

        String filePath = dir + File.separator + fileName;

        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            HttpGet httpget = new HttpGet(url);
            httpget.setConfig(RequestConfig.custom() //
                    .setConnectionRequestTimeout(downloadTimeout) //
                    .setConnectTimeout(downloadTimeout) //
                    .setSocketTimeout(downloadTimeout) //
                    .build());
            try (CloseableHttpResponse response = httpclient.execute(httpget)) {
                org.apache.http.HttpEntity entity = response.getEntity();
                File desc = new File(filePath);
                File folder = desc.getParentFile();
                folder.mkdirs();
                try (InputStream is = entity.getContent(); //
                        OutputStream os = new FileOutputStream(desc)) {
                    StreamUtils.copy(is, os);
                }
            } catch (Throwable e) {
                throw new Throwable("文件下载失败......", e);
            }
        }
        return filePath;
    }

    public static String getFileName(String fileFullPath) {
        fileFullPath = fileFullPath.replace("/", "\");
        return fileFullPath.substring(fileFullPath.lastIndexOf("\") + 1, fileFullPath.length());
    }

    // 请求例子
    public void getToken(String url, String data) throws Throwable {

        RestTemplate restTemplate = new RestTemplate();
        ClientHttpRequestFactory clientFactory = new HttpComponentsClientHttpRequestFactory();
        restTemplate.setRequestFactory(clientFactory);

        HttpHeaders requestHeaders = new HttpHeaders();
        requestHeaders.setAccept(Arrays.asList(MediaType.APPLICATION_JSON_UTF8));
        requestHeaders.setContentType(MediaType.APPLICATION_JSON_UTF8);
        logger.debug("获取token的URL:" + url);

        URI uri = UriComponentsBuilder.fromUriString(url).build().encode().toUri();

        logger.debug("请求数据:{}", _objectMapper.writeValueAsString(data));

        HttpEntity<String> requestEntity = new HttpEntity<String>(data, requestHeaders);

        ResponseEntity<String> response = restTemplate.exchange(uri, HttpMethod.POST, requestEntity, String.class);
        String resp = response.getBody();
        logger.debug("请求返回值数据:{}", _objectMapper.writeValueAsString(resp));
    }

}

4、总结:

           Jsoup 对于这种页面抓取很好用!当然,也可能是因为这里只实现了一个最简单的页面抓取过程。

追加一个下载音频的代码:

package com.my.spider.service;


import java.net.HttpURLConnection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service;

import com.alibaba.fastjson.JSONObject;
import com.my.spider.model.AudioInfo;
import com.my.spider.utils.FileUtils;
import com.my.spider.utils.HttpHtmlUtils;
import com.my.spider.utils.HttpURLConnectionFactory;

@Service
public class XmlyAudioService {

    public static final Logger logger = LoggerFactory.getLogger(XmlyAudioService.class);
    static String url = "http://www.ximalaya.com/dq/comic/";
    static String requetUrl = "http://www.ximalaya.com/tracks/";

    public static void main(String[] args) {
        List<String> audioUrlList = new ArrayList<String>();
        int count = getCount(url);
        if(count > 1) {
            audioUrlList.addAll(getAudioList(1,url));
            for (int i = 2; i <= count; i++) {
                url = url +i+"/";
                audioUrlList.addAll(getAudioList(i,url));
                url = url.replace(i+"/", "");
            }
        }
        List<String> audioList = new ArrayList<String>();
        //解析
        if(audioUrlList.size() > 0) {
            for (String url : audioUrlList) {
                audioList.addAll(listAudio(url));
            }
        }
        System.out.println(audioUrlList.size() + "==" + audioList.size());
        List<AudioInfo> audioInfos = new ArrayList<>();
        //下载
        for (String sound_id : audioList) {
            requetUrl = requetUrl + sound_id+".json";
            System.out.println(requetUrl);
            audioInfos.add(downloadList(requetUrl));
            requetUrl = requetUrl.replace(sound_id+".json", "");
        }
    }

    //获取音频页详情
    public static List<String> getAudioList(int num,String url){
        List<String> list = new ArrayList<>();
        try {
                Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
                Document document = connect.timeout(5000).get();
                FileUtils.str2File(document.toString(), "G:\xmly\html\comic" + num + ".html");
                Element el = document.getElementById("explore_album_detail_entry");
                Elements els =  el.getElementsByClass("albumface");
                for (Element element : els) {
                    list.add(element.absUrl("href"));
                }
        } catch (Throwable e) {
            logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e);
        }
        return list;
    }
    
    public static List<String> listAudio(String url){
        List<String> list = new ArrayList<>();
        try {
                Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
                Document document = connect.timeout(5000).get();
                FileUtils.str2File(document.toString(), "G:\xmly\html\comic_"+System.currentTimeMillis()+".html");
                Elements els = document.getElementsByClass("personal_body");
                if(els!=null && els.size() > 0) {
                    String sound_ids = els.get(0).attr("sound_ids");
                    list.addAll(Arrays.asList(sound_ids.split(",")));
                }
        } catch (Throwable e) {
            logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e);
        }
        return list;
    }
    
    //
    @SuppressWarnings("unchecked")
    public static AudioInfo downloadList(String url){
        AudioInfo audioInfo = new AudioInfo();
        try {
        
            HttpURLConnection conn = HttpURLConnectionFactory.getConn(url);
            conn.setRequestProperty("Content-Type", "*/*; charset=utf-8");
            String audioJson  = HttpURLConnectionFactory.sendGet(conn);
            Map<String,Object> map =  (Map<String, Object>) JSONObject.parse(audioJson);
            audioInfo.setId(map.get("id").toString());
            audioInfo.setName(map.get("title").toString());
            audioInfo.setUrl(map.get("play_path").toString());
            try {
                FileUtils.downloadRenameFile(audioInfo.getUrl(), "G:\xmly", audioInfo.getName()+".mp3");
            } catch (Throwable e) {
                logger.error("{}下载失败,id={}",audioInfo.getName(),audioInfo.getId());;
            }
            
        } catch (Throwable e) {
            logger.error(e.getMessage(),e);
        }
        return audioInfo;
    }
    
    //获取总页数页数
    public static int getCount(String url) {
        try {
            Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
            Document document = connect.timeout(5000).get();
            Elements els = document.getElementsByClass("pagingBar_page");
            if(els.size() < 2) {
                return 1;
            }
            Element pageCout = els.get(els.size()-2);
            return Integer.valueOf(pageCout.text());
        } catch (Throwable e) {
            e.printStackTrace();
        }
        return 0;
    }
    
    
}
xmly.java
原文地址:https://www.cnblogs.com/liangblog/p/7594047.html