// crawler utility classes: RegexUtils, a regular-expression helper class

package com.cph.crawler.core.utils;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * 
 * @ClassName: RegexUtils
 * @Description: 正则帮助类
 * @author cphmvp
 * @date 2013-9-9 下午3:48:59<br>
 *       适合单次抽取结果,不适合遍历抽取
 * 
 */
public final class RegexUtils {
    private RegexUtils() {

    }

    private static Log logger = LogFactory.getLog(RegexUtils.class);
    private static Pattern defaultPattern;
    private static Matcher defaultMatcher;
    private static final String NOT_MATCHER_DATA = "没有匹配到对应数据";

    /**
     * 返回单行结果集
     * 
     * @param input
     * @param regex
     * @param group
     * @return
     */
    public static String getString(String input, Pattern pattren, int group) {
        if (pattren.toString().equals(
                "<span class="l">当前位置:([\s\S]*?)</span>")) {
            System.out.println("warn");
        }
        String result = "";
        String splitStr = "⊙";
        defaultMatcher = pattren.matcher(input);
        while (defaultMatcher.find()) {
            result = defaultMatcher.group(group).trim() + "" + splitStr;
        }
        result = result.trim().replaceAll("</?[^>]+>", "");
        result = result.replaceAll("&gt;", ">");
        result = result.replaceAll("
", "");
        result = result.replaceAll("\r\n", "");
        result = result.replaceAll("\s", "");
        result = result.replaceAll("&nbsp", " ");
        result = result.replace("
", "");
        result = result.replace("	", "");
        result = result.replace("^p", "");
        result = result.replaceAll("⊙", " ");
        return result.trim();
    }

    /**
     * 返回单行结果集
     * 
     * @param input
     * @param regex
     * @param group
     * @return
     */
    public static String getString(String input, String regex, int group) {
        String result = " ";
        defaultMatcher = getMatcher(input, regex);
        while (defaultMatcher.find()) {
            result = defaultMatcher.group(group).trim();
        }
        getLog(result);
        return result;
    }

    /**
     * 获得可匹配对象
     * 
     * @param input
     * @param regex
     * @return
     */
    public static Matcher getMatcher(String input, String regex) {
        defaultPattern = getPattern(regex);
        defaultMatcher = defaultPattern.matcher(input);
        return defaultMatcher;
    }

    /**
     * 获得模式对象
     * 
     * @param regex
     * @return
     */
    public static Pattern getPattern(String regex) {
        defaultPattern = Pattern.compile(regex);
        return defaultPattern;
    }

    /**
     * 返回多行结果集
     * 
     * @param input
     * @param regex
     * @param group
     * @return
     */
    public static List<String> getStringList(String input, String regex,
            int group) {
        List<String> resultList = new ArrayList<String>();
        defaultMatcher = getMatcher(input, regex);
        while (defaultMatcher.find()) {
            resultList.add(defaultMatcher.group().trim());
        }
        if (resultList.size() < 1) {
            logger.error(NOT_MATCHER_DATA);
        }
        return resultList;
    }

    /**
     * 返回多行结果集
     * 
     * @param input
     * @param regex
     * @param group
     * @return
     */
    public static List<Integer> getIntList(String input, String regex, int group) {
        List<Integer> resultList = new ArrayList<Integer>();
        defaultMatcher = getMatcher(input, regex);
        while (defaultMatcher.find()) {
            resultList.add(Integer.parseInt(defaultMatcher.group().trim()));
        }
        if (resultList.size() < 1) {
            logger.error(NOT_MATCHER_DATA);
        }
        return resultList;
    }

    /**
     * 返回多行结果集
     * 
     * @param input
     * @param regex
     * @param group
     * @return
     */
    public static String getString(String input, String regex) {
        String result = " ";
        defaultMatcher = getMatcher(input, regex);
        while (defaultMatcher.find()) {
            result = defaultMatcher.group().trim();
        }
        getLog(result);
        return result;
    }

    /**
     * 返回单行结果集
     * 
     * @param input
     * @param regex
     * @param group
     * @return
     */
    public static int getInt(String input, String regex, int group) {
        int result = -1;
        defaultMatcher = getMatcher(input, regex);
        while (defaultMatcher.find()) {
            result = Integer.parseInt(defaultMatcher.group(group).trim());
        }
        getLog(result);
        return result;
    }

    /**
     * 返回单行结果集
     * 
     * @param input
     * @param regex
     * @param group
     * @return
     */
    public static int getInt(String input, String regex) {
        int result = -1;
        defaultMatcher = getMatcher(input, regex);
        while (defaultMatcher.find()) {
            result = Integer.parseInt(defaultMatcher.group().trim());
        }
        getLog(result);
        return result;
    }

    /**
     * 匹配中国邮政编码
     * 
     * @param postcode
     *            邮政编码
     * @return 验证成功返回true,验证失败返回false
     */
    public static boolean checkPostcode(String postcode) {
        String regex = "[1-9]\d{5}";
        return Pattern.matches(regex, postcode);
    }

    private static void getLog(String result) {
        if (result.trim().equals("")) {
            logger.error(NOT_MATCHER_DATA);
        }
    }

    private static void getLog(Integer result) {

        if (-1 == result) {
            logger.error(NOT_MATCHER_DATA);
        }
    }
}
// Original article: https://www.cnblogs.com/cphmvp/p/3588741.html