Java 提取(获取)博客信息(内容)

package com.wbg.my.service;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scrapes the post list of one cnblogs blog and exports it as a single HTML
 * page.
 *
 * <p>The program walks the paginated list pages, collects every post detail
 * URL, downloads each post to read its title and publication date, and finally
 * writes an HTML file whose embedded script renders the collected rows into a
 * table.
 *
 * @author Jack Chen
 */
public class BlogUtil {

    /** Base URL of the paginated post list; the page number is appended to it. */
    public final static String URL_PAGE = "https://www.cnblogs.com/weibanggang/default.html?page=";

    /**
     * Regular expression for a post detail URL; group 1 captures the post file
     * name (for example {@code 10019453.html}).
     *
     * <p>NOTE(review): the dots in this expression are unescaped, so each one
     * matches any character, not only a literal dot.
     */
    public final static String URL_PAGE_DETAIL = "https://www.cnblogs.com/weibanggang/p/([0-9]+.html)";

    /** Number of list pages that {@link #main(String[])} requests. */
    public final static int PAGE_COUNT = 20;

    /** All detail page URLs found so far; a sorted set, so duplicates collapse. */
    public static Set<String> urlLists = new TreeSet<String>();

    /** Compiled form of {@link #URL_PAGE_DETAIL}. */
    public final static Pattern p = Pattern.compile(URL_PAGE_DETAIL);

    /**
     * Path of the generated HTML file.
     *
     * <p>NOTE(review): {@code d:index.html} has no separator after the drive
     * letter; confirm that a drive-relative path is really intended.
     */
    public static String file = "d:index.html";

    /** One JavaScript array literal per post, filled by {@link #createFile(String)}. */
    static String[] arr = null;

    /** Index of the slot in {@link #arr} that the next post is written to. */
    static int sun = 0;

    /** Separator put in front of the next row: empty for the first row, "," afterwards. */
    static String fh = "";

    /**
     * Runs the whole export: collect URLs, download every post, build the page
     * and write it to {@link #file}.
     *
     * @param args ignored
     * @throws Exception if a page cannot be downloaded or the file cannot be written
     */
    public static void main(String[] args) throws Exception {
        for (int page = 1; page <= PAGE_COUNT; page++) {
            getUrls(page);
        }
        System.out.println("开始获取内容!");
        arr = new String[urlLists.size()];
        // Reset the row cursor and separator so a second run starts clean.
        sun = 0;
        fh = "";
        for (String url : urlLists) {
            createFile(url);
            sun++;
        }
        System.out.println("获取内容完毕!");
        System.out.println("开始写入文件!");
        StringBuffer stringBuffer = new StringBuffer(kais());
        for (String row : arr) {
            stringBuffer.append(row);
        }
        stringBuffer.append(jiehun());
        System.out.println("写入文件完毕!");
        System.out.println("开始导出文件!");
        createFile(file, stringBuffer);
        System.out.println("导出文件完毕!");
        System.out.println("输出文件地址为:" + file);
    }

    /**
     * Writes the generated page to disk, replacing any existing file.
     *
     * <p>The content is encoded as UTF-8 to match the {@code charset} declared
     * in the page head. Failures propagate to the caller so that a failed
     * export is never reported as a success.
     *
     * @param file   path of the file to write
     * @param buffer complete page content
     * @throws IOException if the file cannot be created or written
     */
    private static void createFile(String file, StringBuffer buffer) throws IOException {
        File target = new File(file);
        // FileOutputStream creates the file or truncates an existing one, so no
        // separate delete/create step is needed.
        try (Writer out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(target), "utf-8"))) {
            out.write(buffer.toString());
        }
    }

    /**
     * Returns the leading part of the generated page: document head, styles,
     * the empty table and the opening of the {@code sum} JavaScript array.
     *
     * @return page text up to and including {@code var sum=[}
     */
    public static String kais() {
        return "<!DOCTYPE html>\n" +
                "<html>\n" +
                "<head>\n" +
                "    <meta charset=\"utf-8\">\n" +
                "    <title>weibanggang.github.io</title>\n" +
                "    <meta name=\"renderer\" content=\"webkit\">\n" +
                "    <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\">\n" +
                "    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1, maximum-scale=1\">\n" +
                "    <style>\n" +
                "        html,body{width:100%;height: 100%}\n" +
                "        table{width: 1150px;height:500px;margin: auto}\n" +
                "        table,td,th{border: 1px solid #e6e6e6;border-collapse:collapse; }\n" +
                "        body{-moz-background-size:100% 100%; background-size:100% 100%;background-image:url(\"link.jpg\");background-repeat: no-repeat}         body{-moz-background-size:100% 100%; background-size:100% 100%;background-image:url(\"link.jpg\");background-repeat: no-repeat}\n" +
                "        * { margin: 0; padding: 0; }\n" +
                "        table { border-collapse: collapse; text-align: center;  }\n" +
                "        /*关键设置 tbody出现滚动条*/\n" +
                "        table tbody {\n" +
                "            display: block;\n" +
                "            height: 500px;\n" +
                "            overflow-y: scroll;overflow-x:hidden;\n" +
                "        }\n" +
                "  table thead,  tbody tr { display: table;width: 100%; table-layout: fixed;  }\n" +
                "        table thead th {  height: 40px  }\n" +
                "        table tbody td {height: 30px }\n" +
                "    </style>\n" +
                "</head>\n" +
                "\n" +
                "<body>\n" +
                "<marquee><h1 style=\"color:white;\">本网页仅作为参考博客、github等地址</h1></marquee>\n" +
                "<table width=\"80%\" border=\"1\">\n" +
                "    <thead>\n" +
                "    <tr>\n" +
                "        <th style=\"width:230px\">序号</th>\n" +
                "        <th style=\"width:231px\">标题链接</th>\n" +
                "        <th style=\"width:231px\">时间</th>\n" +
                "        <th style=\"width:231px\">来源</th>\n" +
                "        <th style=\"width:249px\">备注</th>\n" +
                "    </tr>\n" +
                "    </thead>\n" +
                "    <tbody>\n" +
                "\n" +
                "    </tbody>\n" +
                "</table>\n" +
                "</body>\n" +
                "<script src=\"js/jquery.js\"></script>\n" +
                "<script>\n" +
                "    var sum=[";
    }

    /**
     * Returns the trailing part of the generated page: the end of the
     * {@code sum} array and the script that turns each entry into a table row.
     *
     * @return page text from the closing {@code ]} to {@code </html>}
     */
    public static String jiehun() {
        return " ];\n" +
                "    \n" +
                "    for(var i=0;i<sum.length;i++){\n" +
                "        var tr=$(\"<tr/>\");\n" +
                "            //序号\n" +
                "            $(\"<td/>\").html(i+1).appendTo(tr);\n" +
                "            //标题链接\n" +
                "            var a=\"<a href='\"+sum[i][0]+\"' target='_blank'>\"+sum[i][1]+\"</a>\"\n" +
                "            $(\"<td/>\").html(a).appendTo(tr);\n" +
                "            //时间\n" +
                "            $(\"<td/>\").html(sum[i][2]).appendTo(tr);\n" +
                "            //来源\n" +
                "            $(\"<td/>\").html(sum[i][3]).appendTo(tr);\n" +
                "            //备注\n" +
                "            $(\"<td/>\").html(sum[i][4]).appendTo(tr);\n" +
                "            $(\"table tbody\").append(tr);\n" +
                "    }\n" +
                "</script>\n" +
                "</html>";
    }

    /**
     * Downloads one post and stores its row in {@link #arr} at index
     * {@link #sun}.
     *
     * <p>The row is a JavaScript array literal
     * {@code ["href","title","date","博客","正常"]}, prefixed with the current
     * separator {@link #fh}.
     *
     * @param url post detail URL; must match {@link #URL_PAGE_DETAIL}
     * @throws IllegalArgumentException if {@code url} is not a detail URL
     * @throws IOException              if the post cannot be downloaded
     */
    private static void createFile(String url) throws IOException {
        Matcher m = p.matcher(url);
        if (!m.find()) {
            throw new IllegalArgumentException("Not a post detail URL: " + url);
        }
        String fileName = m.group(1);

        // Lines are joined without separators, so the page becomes one long string.
        StringBuffer page = new StringBuffer();
        for (String line : readLines(url)) {
            page.append(line);
        }

        String href = "https://www.cnblogs.com/weibanggang/p/" + fileName;
        String title = getTitle(page);
        String data = getDate(page);
        // Scraped text is escaped so a quote or backslash in a title cannot
        // terminate the JavaScript string literal early.
        arr[sun] = fh + "[\"" + escapeJs(href) + "\",\"" + escapeJs(title) + "\",\""
                + escapeJs(data) + "\",\"博客\",\"正常\"]";
        fh = ",";
    }

    /**
     * Extracts the publication date from a post page.
     *
     * @param sb page content
     * @return trimmed text inside {@code <span id="post-date">}, or an empty
     *         string when that element is missing or unterminated
     */
    public static String getDate(StringBuffer sb) {
        String marker = "<span id=\"post-date\">";
        int start = sb.indexOf(marker);
        if (start < 0) {
            return "";
        }
        start += marker.length();
        int end = sb.indexOf("</span>", start);
        if (end < 0) {
            return "";
        }
        return sb.substring(start, end).trim();
    }

    /**
     * Extracts the post title from a post page.
     *
     * @param sb page content
     * @return trimmed text of the {@code <title>} element with the trailing
     *         "- 韦邦杠 - 博客园" suffix removed when present, or an empty string
     *         when the element is missing or unterminated
     */
    public static String getTitle(StringBuffer sb) {
        String open = "<title>";
        int start = sb.indexOf(open);
        if (start < 0) {
            return "";
        }
        start += open.length();
        int end = sb.indexOf("</title>", start);
        if (end < 0) {
            return "";
        }
        String title = sb.substring(start, end);
        int suffix = title.lastIndexOf("- 韦邦杠 - 博客园");
        if (suffix >= 0) {
            title = title.substring(0, suffix);
        }
        return title.trim();
    }

    /**
     * Reads one list page and adds every post detail URL on it to
     * {@link #urlLists}.
     *
     * @param idx page number appended to {@link #URL_PAGE}
     * @throws IOException if the list page cannot be downloaded
     */
    private static void getUrls(int idx) throws IOException {
        for (String line : readLines(URL_PAGE + idx)) {
            // Cheap prefilter before running the regular expression.
            if (!line.contains("https://www.cnblogs.com/weibanggang/p/")) {
                continue;
            }
            Matcher m = p.matcher(line);
            // A single line may carry several links; collect all of them.
            while (m.find()) {
                urlLists.add(m.group());
            }
        }
    }

    /**
     * Downloads a page and returns its lines, decoded as UTF-8.
     *
     * <p>The reader and the connection are released even when reading fails.
     *
     * @param address URL to download
     * @return the lines of the response body, without line terminators
     * @throws IOException if the connection or the read fails
     */
    private static List<String> readLines(String address) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(address).openConnection();
        try {
            conn.connect();
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "utf-8"))) {
                List<String> lines = new ArrayList<String>();
                String line;
                while ((line = br.readLine()) != null) {
                    lines.add(line);
                }
                return lines;
            }
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Escapes backslashes and double quotes so the text can be placed inside a
     * double-quoted JavaScript string literal.
     *
     * @param text raw text
     * @return text with {@code \} and {@code "} preceded by a backslash
     */
    private static String escapeJs(String text) {
        StringBuilder out = new StringBuilder(text.length() + 8);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c == '\\' || c == '"') {
                out.append('\\');
            }
            out.append(c);
        }
        return out.toString();
    }

}

原文地址:https://www.cnblogs.com/weibanggang/p/10019453.html