使用jsoup抓取新闻信息

1,jsoup简介

jsoup 是一款Java 的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。jsoup 是基于 MIT 协议发布的,可放心使用于商业项目。

jsoup 的主要功能如下:

1. 从一个 URL,文件或字符串中解析 HTML;

2. 使用 DOM 或 CSS 选择器来查找、取出数据;

3. 可操作 HTML 元素、属性、文本;

2,jsoup使用

1,下载jsoup的jar包:http://jsoup.org/download 

2, jsoup英文的开发手册:http://jsoup.org/cookbook/ 

3,jsoup cookbook 中文版:http://www.open-open.com/jsoup/ 

下面是一个简单例子

1,获取新浪财经的website 以及标题,打印输出。

2,获取1中一个website的正文信息,打印并输出。

代码实现:

package jSoupTesting;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;


public class GetSinaUrlAndTitle {

    public static void main(String[] args) {
        // TODO Auto-generated method stub
        getUrlAndTitle();
        getTextMes();
    }
    
    public static void getUrlAndTitle()
    {
        String url="http://finance.sina.com.cn/";
        try {
            Document doc=Jsoup.connect(url).timeout(10000).get();//get all infomation from url website
            //System.out.println(doc);  
            Elements ListDiv = doc.getElementsByAttributeValue("class","fin_tabs0_c0");
            //System.out.println(ListDiv);
            for (Element div :ListDiv) {
                 Elements links = div.getElementsByTag("a");
                // System.out.println(links);
                 for (Element link : links) {
                     String linkHref = link.attr("href").trim();
                     String linkText = link.text().trim();
                     System.out.println(linkHref+"	"+linkText);
                 }    
             }
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    
    public static void getTextMes()
    {
        String url="http://finance.sina.com.cn/hy/20140823/100220099682.shtml";
        String textMes="";
        try {
            Document doc=Jsoup.connect(url).timeout(10000).get();
            Elements ListDiv = doc.getElementsByAttributeValue("class","blkContainerSblkCon BSHARE_POP");
            //System.out.println(ListDiv);
            for(Element div:ListDiv)
            {
                Elements textInfos=div.getElementsByTag("p");
                //System.out.println(textInfos);
                for(Element textInfo:textInfos)
                {
                    String text=textInfo.text().trim();
                    textMes=textMes+text+"
";
                }
            }
            System.out.println(textMes);
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
}
View Code
3,新闻抓取要求

新闻筛选过程:(以“新浪财经”为例) http://finance.sina.com.cn/

1. 选择方向

(1)宏观新闻:包括一些重大的国内外宏观调控、我国银监会等监管机构出台的文件,以及自贸区发展、金砖银行成立等国内重大金融新闻。

(2)公司新闻:包括客户公司或其他大型金融机构的管理层变动,兼并收购,战略转型,新推产品等新闻。

2. 网页选择

1.宏观新闻:进入http://finance.sina.com.cn/       -----》         首页“要闻”

 

2.公司新闻:进入http://finance.sina.com.cn/       -----》         选择“银行” -》 “要闻”

             

3,抓取要求

1,要求抓取要闻部分所有网址,标题,关键字。

2,要求抓取1中网址下的正文。

3,并且前一天看过的新闻不能存在于后一天。

4,要求抓好的新闻放在txt文档中。

4,代码实现
package sinaSpider;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class GetSinaInfo {
    
    public static void main(String[] args) throws IOException {
        // TODO Auto-generated method stub
        getSinaInforamtion();
    }
    public static  void getSinaInforamtion() 
    {
        Map<String,String> pathMap=createNewFiles();
        try {
            getSinaYaoWen(pathMap);
            getSinaChangJing(pathMap);
            getSinaBank(pathMap);    
        } catch (IOException e) {
            e.printStackTrace();
        }     
    }
    public static void getSinaYaoWen(Map<String,String> pathMap) throws IOException
    {
        String YaoWenTextPath=pathMap.get("yaowen")+"//yaowen"+GetDate()+"outputText.txt";
        String YaoWenTitlePath=pathMap.get("yaowen")+"//yaowen"+GetDate()+"outputTitle.txt";
        String YaoWenUrlPath=pathMap.get("yaowen")+"//"+GetDate()+"url.txt";
        
        FileWriter urlWriter = new FileWriter(YaoWenUrlPath);
        FileWriter textWriter = new FileWriter(YaoWenTextPath);
        FileWriter titleWriter = new FileWriter(YaoWenTitlePath);

        String oldUrlPath=pathMap.get("yaowen")+"//"+GetYesterday()+"url.txt";
        String[] oldUrls=GetYesterdayInfo(oldUrlPath);
        
        Document doc = Jsoup.connect("http://finance.sina.com.cn/").timeout(5000).get();
        Elements ListDiv = doc.getElementsByAttributeValue("class","fin_tabs0_c0");
        //System.out.println(ListDiv);
                 for (Element element :ListDiv) {
             Elements links = element.getElementsByTag("a");
             for (Element link : links) {
                 String linkHref = link.attr("href").trim();
                 String linkText = link.text().trim();
                 if(judgeDup(oldUrls,linkHref))
                 {
                     getWebText(linkHref,linkText,textWriter,titleWriter,urlWriter);
                 }     
                 
             }    
         }
         textWriter.close();
         titleWriter.close();
         urlWriter.close();
    }
    
    public static void getSinaChangJing(Map<String,String> pathMap) throws IOException
    {
         String ChanJingTextPath=pathMap.get("chanjing")+"//chanjing"+GetDate()+"outputText.txt";
         String ChanJingTitlePath=pathMap.get("chanjing")+"//chanjing"+GetDate()+"outputTitle.txt";
         String ChanJingUrlPath=pathMap.get("chanjing")+"//"+GetDate()+"url.txt";    
         FileWriter urlWriter = new FileWriter(ChanJingUrlPath);
         FileWriter textWriter = new FileWriter(ChanJingTextPath);
         FileWriter titleWriter = new FileWriter(ChanJingTitlePath);
         
         String oldUrlPath=pathMap.get("chanjing")+"//"+GetYesterday()+"url.txt";
         String[] oldUrls=GetYesterdayInfo(oldUrlPath);
            
         Document doc = Jsoup.connect("http://finance.sina.com.cn/chanjing/").timeout(5000).get();
         Elements ListDiv = doc.getElementsByAttributeValue("class","blk_03");
        //System.out.println(ListDiv);
         for (Element element :ListDiv) {
             Elements links = element.getElementsByTag("a");
             for (Element link : links) {
                 
                 String linkHref = link.attr("href").trim();
                 String linkText = link.text().trim();
                 if(judgeDup(oldUrls,linkHref))
                 {
                     getWebText(linkHref,linkText,textWriter,titleWriter,urlWriter);
                 } 
             }    
         }
         textWriter.close();
         titleWriter.close();
         urlWriter.close();
    }
    public static void getSinaBank(Map<String,String> pathMap) throws IOException
    {
        
         String bankTextPath=pathMap.get("bank")+"//bank"+GetDate()+"outputText.txt";
         String bankTitlePath=pathMap.get("bank")+"//bank"+GetDate()+"outputTitle.txt";
         String bankUrlPath=pathMap.get("bank")+"//"+GetDate()+"url.txt";    
         FileWriter urlWriter = new FileWriter(bankUrlPath);
         FileWriter textWriter = new FileWriter(bankTextPath);
         FileWriter titleWriter = new FileWriter(bankTitlePath);
         
         String oldUrlPath=pathMap.get("bank")+"//"+GetYesterday()+"url.txt";
         String[] oldUrls=GetYesterdayInfo(oldUrlPath);
         
         Document doc = Jsoup.connect("http://finance.sina.com.cn/money/bank/").timeout(5000).get();
         Elements ListDiv = doc.getElementsByAttributeValue("class","blk05");
        //System.out.println(ListDiv);
         
         for (Element element :ListDiv) {
            Elements links = element.getElementsByTag("a");
            for (Element link : links) {
                
                String linkHref = link.attr("href").trim();
                String linkText = link.text().trim();
                if(judgeDup(oldUrls,linkHref))
                {
                    getWebText(linkHref,linkText,textWriter,titleWriter,urlWriter);
                } 
            }    
        }
         textWriter.close();
         titleWriter.close();
         urlWriter.close();
    }
    
    public static void getWebText(String url,String subTitle,
                                  FileWriter textWriter,FileWriter titleWriter,
                                  FileWriter urlWriter) throws IOException
    {
        
        Document doc;
        doc = Jsoup.connect(url).timeout(10000).get();        
        Elements ListDiv = doc.getElementsByAttributeValue("class","blkContainerSblkCon BSHARE_POP");
        if(ListDiv.isEmpty()!=true)
        {    
            String webTitleKeywords=getTitleAndWebsite(url,subTitle)+getKeyWords(doc);
            System.out.println(webTitleKeywords);
            writeSTK(webTitleKeywords, titleWriter);
            textWriter.write(webTitleKeywords+"
");
            urlWriter.write(url+"
");
            for (Element element :ListDiv) {
                 Elements links = element.getElementsByTag("p");
                 for (Element link : links) {
                     String linkText = link.text().trim();         
                     textWriter.write(linkText+"
");
                   //  System.out.println(linkText);
                 }
             }
        }
    }
    public static String getTitleAndWebsite(String url,String subTitle)
    {
        String titleAndWebsite;
        titleAndWebsite=url+"	"+subTitle;
        return titleAndWebsite;
    }
    public static void writeSTK(String webTitleKeywords,FileWriter writeWebTitle)
    {
        try {
            writeWebTitle.write(webTitleKeywords+"
");
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    public static String getKeyWords(Document doc)
    {
        Elements listKey=doc.getElementsByAttributeValue("class","art_keywords");
        String keywords ="	 keywords:";
        for(Element element:listKey)
        {
            Elements links = element.getElementsByTag("a");
             for (Element link : links) {
                 String linkText = link.text().trim();         
                 keywords = keywords+linkText+",";
             }
        }
        return keywords;
        
    }
    
    public static String GetDate()
    {
         Date dt=new Date();
         SimpleDateFormat simpleDate=new SimpleDateFormat("yyyy-MM-dd");
        // System.out.println(simpleDate.format(dt));
         return simpleDate.format(dt);    
    }
    
    public static String GetYesterday()
    {
        Calendar calendar = Calendar.getInstance();
        calendar.add(Calendar.DATE, -1);   
        String  yestedayDate = new SimpleDateFormat("yyyy-MM-dd").format(calendar.getTime());
       // System.out.println(yestedayDate);
        return yestedayDate;    
    }    
    public static String[] GetYesterdayInfo(String oldFilePath) throws IOException
    {
        String encoding="Utf-8";
        File file=new File(oldFilePath);
        if(file.exists())
        {
            return getOldUrls(file,encoding);
        }
        else
        {
            file.createNewFile();
            return getOldUrls(file,encoding);
        }
            
    }
    public static String[] getOldUrls(File file,String encoding) throws IOException
    {
        
            FileInputStream fis=new FileInputStream(file);
            InputStreamReader inStream=new InputStreamReader(fis,encoding);
            BufferedReader input=new BufferedReader(inStream);
            String url=input.readLine();
            StringBuilder sb = new StringBuilder("");
             while(url!=null){
                sb.append(url.trim());
                sb.append(",");
                url=input.readLine();
             }
            String sbStr = sb.toString();
            String oldUrls[]=sbStr.split(",");        
            return oldUrls;
        
    }
    
    public static boolean judgeDup(String[] oldUrls ,String newUrl)
    {
        for(int i=0;i<oldUrls.length;i++)
        {
            if(newUrl.equals(oldUrls[i])==true)
            {
                return false;
            }
        }
        return true;
    }
    
    public static Map<String,String> createNewFiles()
    {    
        String path=getWorkPath()+"//output";
        String [] fileNames = {"yaowen","chanjing","bank"};
        Map<String,String> pathMap=new HashMap<String,String>();
        String pathArray[] = new String[fileNames.length];
        for(int i=0;i<fileNames.length;i++)
        {
            String filePath=path+"//"+fileNames[i];
            File file=new File(filePath);
            if(!file.exists())
            {
                file.mkdirs();
            }
            pathArray[i]=file.getPath().replace("\", "//");
            pathMap.put(fileNames[i], pathArray[i]);
        }
        return pathMap;
    }
    
    public static String getWorkPath()
    {
        String workspacePath = null;
        try {
            File directory = new File("");//参数为空 
            workspacePath = directory.getCanonicalPath() ; 
            //System.out.println(workspacePath);
            workspacePath = workspacePath.replace("\", "//"); 
            //System.out.println(workspacePath);
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return workspacePath;
    }
}
View Code
原文地址:https://www.cnblogs.com/qianwen/p/3931432.html