MinerUtil.java 爬虫工具类

MinerUtil.java 爬虫工具类

package com.iteye.injavawetrust.miner;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TimeZone;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * 爬虫工具类
 * @author InJavaWeTrust
 *
 */
public class MinerUtil {
	
	private static final Log LOG = LogFactory.getLog(MinerUtil.class);
	
	public static long starTime = 0;
	
	/**
	 * 判断是否为空
	 * @param param
	 * @return true-为空;false-非空
	 */
	public static boolean isBlank(String param) {
		return (null == param || "".equals(param.trim())) ? true : false;
	}
	
	/**
	 * URL是否以html结尾
	 * @param url
	 * @return true-是;false-否
	 */
	public static boolean checkURL(String url) {
		String html = url.substring(url.lastIndexOf(".") + 1);
		return "html".equals(html) ? true : false;
	}
	/**
	 * URL列表是否包含关键字
	 * @param key 关键字
	 * @param keys URL列表
	 * @return true-是;false-否
	 */
	public static boolean checkKeys(String key, List<String> keys) {
		boolean flag = false;
		for(String k : keys) {
			if(key.contains(k)){
				flag = true;
				break;
			}
		}
		return flag;
	}
	
	public static boolean isValidFileName(String fileName) {
		if (fileName == null || fileName.length() > 255){
			return false;
		} else {
			return fileName
					.matches("[^\s\\/:\*\?\"<>\|](\x20|[^\s\\/:\*\?\"<>\|])*[^\s\\/:\*\?\"<>\|\.]$");
		}
	} 
	
	/**
	 * 获取URL
	 * @param url URL
	 * @return URL
	 */
	public static Set<String> getAllUrl(String url){
		Set<String> urls = new HashSet<String>();
		try {
			Connection conn = Jsoup.connect(url);
			conn.header("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13");//模拟浏览器  
			Document document = conn.timeout(5000).get();
			Elements hrefs = document.select("a[href]");
			Iterator<Element> hrefIter = hrefs.iterator();
			while (hrefIter.hasNext()) {
				Element href = hrefIter.next();
				urls.add(href.attr("href"));
			}
		} catch (Exception e) {
			LOG.info("获取URL出现异常,异常URL[" + url + "]");
			LOG.info("异常信息[" + e.getMessage() + "]");
		}
		return urls;
	}
	
	/**
	 * 毫秒转换成hhmmss
	 * @param ms 毫秒
	 * @return hh:mm:ss
	 */
	public static String msToss(long ms) {
		SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss");
		formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00"));
		String ss = formatter.format(ms);
		return ss;
	}
	
	/**
	 * 将html写入本地文件
	 * @param htmlText html内容
	 * @param htmlName html名称
	 */
	public static void getHtmlToLocal(Map<String, String> map){
		Writer writer = null;
		try {
			String path = MinerConstanits.HTMLPATH + getToday();
			makeDir(path);
			writer = new OutputStreamWriter(new FileOutputStream(new File(path
					+ File.separator + map.get("title"))), "UTF-8");
			writer.write(map.get("html"));
			writer.flush();
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (writer != null) {
				try {
					writer.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
	/**
	 * 文件名不能包含下列任何字符:<br>
	 * /:*?"<>|
	 * @param title 标题
	 * @return 去掉文件名不能包含的字符
	 */
	public static String fileName(String title){
		return title
				.replaceAll("\\", "")
				.replaceAll("/", "")
				.replaceAll(":", "")
				.replaceAll("\*", "")
				.replaceAll("\?", "")
				.replaceAll(""", "")
				.replaceAll("<", "")
				.replaceAll(">", "")
				.replaceAll("\|", "");
	}
	/**
	 * 获取当天日期
	 * @return 当天日期
	 */
	public static String getToday(){
		String result = "";
		Date date = new Date();
		result = format(date);
		return result;
	}
	/**
	 * 格式化日期
	 * @param date 日期
	 * @return yyyymmdd 日期
	 */
	public static String format(Date date){
		String format = "yyyyMMdd";
		SimpleDateFormat fmt = new SimpleDateFormat(format);
		return fmt.format(date);
	}
	/**
	 * 创建存储目录
	 * @param path 存储目录
	 */
	public static void makeDir(String path) {
		File file = new File(path);
		if(!file.exists()){
			file.mkdirs();
			LOG.info("创建存储目录[" + path + "]");
		}
	}
	
	public static boolean checkBeforeStart(MinerConfig config) {
		if(null == config){
			LOG.info("config未配置!!!");
			return false;
		}
		if(null == config.getKeys() || 0 == config.getKeys().size()){
			LOG.info("包含关键字未配置!!!");
			return false;
		}
		if(null == config.getStoreType()){
			LOG.info("存储方式未配置!!!");
			return false;
		}
		if(config.getMaxDepth() < 1){
			LOG.info("爬取页面最大深度配置错误!!!");
			return false;
		}
		if(config.getMinerHtmlThreadNum() < 1){
			LOG.info("下载页面线程数配置错误!!!");
			return false;
		}
		if(config.getMiseringThreadNum() < 1){
			LOG.info("分析页面线程数配置错误!!!");
			return false;
		}
		if(config.getMinserStoreThreadNum() < 1){
			LOG.info("存储线程数配置错误!!!");
			return false;
		}
		return true;
	}
	
	public static void main(String[] args) {
		String path = MinerConstanits.HTMLPATH + File.separator + getToday();
		makeDir(path);
//		System.out.println(getToday());
//		String test = "http://my.163.com/2015/11/27/17763_578935.html";
//		System.out.println(fileName(test));
//		System.out.println(MinerUtil.isBlank(null));
//		System.out.println(MinerUtil.isBlank(""));
//		System.out.println(MinerUtil.isBlank(" "));
//		System.out.println(MinerUtil.isBlank("bbb"));
//		System.out.println(MinerUtil.isBlank(" bbb "));
		
//		String key = "http://www.jqu.net.cn";
//		List<String> keys = new ArrayList<String>();
//		keys.add("http://www.jqu.net.cn");
//		System.out.println(MinerUtil.checkKeys(key, keys));
	}

}

返回列表

原文地址:https://www.cnblogs.com/new0801/p/6146682.html