Java 实现抓取某公司官网新闻

  做项目时,合作公司并没有提供获取新闻的接口,但项目又急着上线,所以总监让我做一个简单的抓取程序。现将主要的工具类 NewsUtil.java 贴出来供大家参考。

NewsUtil.java

  1 package org.news.util;
  2 
  3 import java.io.BufferedReader;
  4 import java.io.IOException;
  5 import java.io.InputStream;
  6 import java.io.InputStreamReader;
  7 import java.net.URL;
  8 import java.net.URLConnection;
  9 import java.util.ArrayList;
 10 import java.util.regex.Matcher;
 11 import java.util.regex.Pattern;
 12 
 13 /**
 14  * 抓取新闻内容的辅助类
 15  * @author geenkDC
 16  * @time 2015-07-28 15:15:04
 17  */
 18 public class NewsUtil {
 19     /**
 20      * 通过提交的URL来抓取出新闻的链接
 21      * @param url
 22      * @return
 23      * @throws Exception 
 24      */
 25     public static ArrayList<String> findUrlByUrl(String url) throws Exception
 26     {
 27         URL url0=new URL(url);
 28         ArrayList<String> urlList=new ArrayList<String>();
 29         URLConnection con;
 30         BufferedReader br=null;
 31         try {
 32             con = url0.openConnection();
 33             InputStream in=con.getInputStream();
 34             br=new BufferedReader(new InputStreamReader(in));
 35             String str="";
 36             while((str=br.readLine())!=null)
 37             {
 38                 urlList.addAll(findUrl(str));
 39             }
 40         } catch (IOException e) {
 41             throw new RuntimeException("URL读写错误:"+e.getMessage());
 42         }
 43         if(br!=null)
 44         {
 45             try {
 46                 br.close();
 47             } catch (IOException e) {
 48                 throw new RuntimeException("URL流关闭异常:"+e.getMessage());
 49             }
 50         }
 51         return urlList;
 52     }
 53     
 54     /**抓取新闻URL的真正实现类
 55      * @param str
 56      * @return
 57      */
 58     public static ArrayList<String> findUrl(String str)
 59     {
 60         ArrayList<String> urlList=new ArrayList<String>();
 61         //匹配新闻的URL
 62         String regex="http://[a-zA-Z0-9_\.:\d/?=&%]+\.jhtml";
 63         Pattern p=Pattern.compile(regex);
 64         Matcher m=p.matcher(str);
 65         //找符合正则匹配的字串
 66         while(m.find())
 67         {
 68             String subStr=m.group().substring(m.group().lastIndexOf("/")+1, m.group().lastIndexOf(".jhtml"));
 69 
 70             try {
 71                 if (subStr.matches("[0-9]*")) {
 72                     urlList.add(m.group());
 73                     
 74                 }
 75             } catch (Exception e) {
 76                 throw new RuntimeException("匹配新闻URL出错:"+e.getMessage());
 77             }
 78         }
 79         return urlList;
 80     }
 81     
 82     /**
 83      * 根据URL找到其的新闻内容
 84      * @param url
 85      * @return
 86      * @throws Exception 
 87      */
 88     public static ArrayList<String> findContentByUrl(String url) throws Exception {
 89         URL url1=new URL(url);
 90         ArrayList<String> conList=new ArrayList<String>();
 91         URLConnection con;
 92         BufferedReader br=null;
 93         try {
 94             con = url1.openConnection();
 95             InputStream in=con.getInputStream();
 96             InputStreamReader isr=new InputStreamReader(in, "utf-8");
 97             br=new BufferedReader(isr);
 98             String str="";
 99             StringBuffer sb=new StringBuffer();
100             while((str=br.readLine())!=null)
101             {
102                 sb.append(str);
103             }
104             conList.addAll(findContent(sb.toString()));
105         } catch (IOException e) {
106             throw new RuntimeException("URL读写错误:"+e.getMessage());
107         }
108         if(br!=null)
109         {
110             try {
111                 br.close();
112             } catch (IOException e) {
113                 throw new RuntimeException("URL流关闭异常:"+e.getMessage());
114             }
115         }
116         return conList;
117     }
118     
119     /**
120      * 抓取新闻内容的真正实现类
121      * @param str
122      * @return
123      */
124     public static ArrayList<String> findContent(String str) {
125         ArrayList<String> strList=new ArrayList<String>();
126         //匹配新闻内容div
127         String regex="<div class="con_box">([\s\S]*)</div>([\s\S]*)<div class="left_con">";
128         Pattern p=Pattern.compile(regex);
129         Matcher m=p.matcher(str);
130         //找符合正则匹配的字串
131         while(m.find())
132         {
133             try {
134                 strList.add(new String(m.group()));
135             } catch (Exception e) {
136                 throw new RuntimeException("抓取新闻内容出错:"+e.getMessage());
137             }
138         }
139         return strList;
140     }
141 }

功能简单说明:

  调用 findUrlByUrl 并传入网站首页的 URL,程序会获取其中匹配的新闻条目 URL;再对每个新闻条目的 URL 调用 findContentByUrl,即可抓取该新闻的所有内容。

原文地址:https://www.cnblogs.com/geekdc/p/5499226.html