DFA算法实现敏感词过滤

DFA算法:即确定有穷自动机,简单点说就是,它是是通过event和当前的state得到下一个state,即event+state=nextstate。理解为系统中有多个节点,通过传递进入的event,来确定走哪个路由至另一个节点,而节点是有限的。

废话不多说,直接贴上代码:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * @description:敏感词工具
 * @author: maojialong
 * @date: 2018年1月30日 上午10:59:24
 */
public class SensitivewordEngine {
    
    private String ENCODING = "GBK";    //字符编码
    
    //敏感词库
    public static Map sensitiveWordMap = new HashMap();

    //只过滤最小敏感词
    public static int minMatchTYpe = 1;

    //过滤所有敏感词
    public static int maxMatchType = 2;
    
    //正则表达式中文、字母、数字
    public static Pattern pattern = Pattern.compile("^[a-zA-Z0-9u4E00-u9FA5]+$");
    
    /**
     * 读取敏感词库中的内容,将内容添加到set集合中
     * @author chenming 
     * @date 2014年4月20日 下午2:31:18
     * @return
     * @version 1.0
     * @throws Exception 
     */
    @SuppressWarnings("resource")
    private void readSensitiveWordFile() throws Exception{
        Set<String> set = null;
        
        File file = new File("D:\SensitiveWord.txt");    //读取文件
        InputStreamReader read = new InputStreamReader(new FileInputStream(file),ENCODING);
        try {
            if(file.isFile() && file.exists()){      //文件流是否存在
                set = new HashSet<String>();
                BufferedReader bufferedReader = new BufferedReader(read);
                String txt = null;
                while((txt = bufferedReader.readLine()) != null){    //读取文件,将文件内容放入到set中
                    set.add(txt);
                }
            }
            else{         //不存在抛出异常信息
                throw new Exception("敏感词库文件不存在");
            }
        } catch (Exception e) {
            throw e;
        }finally{
            read.close();     //关闭文件流
        }
        addNewSensitiveWord(set);
    }
    
    /**
     * @description: 新增敏感词库
     * @author: maojialong
     * @date: 2018年2月1日 上午11:55:10
     * @param keyWordSet
     */
    public static void addNewSensitiveWord(Set<String> keyWordSet) {
        sensitiveWordMap.putAll(getNewSensitiveWordToHashMap(keyWordSet));
    }
    
    /**
     * @description: 封装敏感词库
     * @author: maojialong
     * @date: 2018年1月30日 下午4:28:58
     * @param keyWordSet
     * @return
     */
    @SuppressWarnings("rawtypes")
    public static HashMap getNewSensitiveWordToHashMap(Set<String> keyWordSet) {
        // 初始化HashMap对象并控制容器的大小
        HashMap newSensitiveWordMap = new HashMap(keyWordSet.size());
        // 敏感词
        String key = null;
        // 用来按照相应的格式保存敏感词库数据
        Map nowMap = null;
        // 用来辅助构建敏感词库
        Map<String, String> newWorMap = null;
        // 使用一个迭代器来循环敏感词集合
        Iterator<String> iterator = keyWordSet.iterator();
        while (iterator.hasNext()) {
            key = iterator.next();
            // 等于敏感词库,HashMap对象在内存中占用的是同一个地址,所以此nowMap对象的变化,sensitiveWordMap对象也会跟着改变
            nowMap = sensitiveWordMap;
            for (int i = 0; i < key.length(); i++) {
                // 截取敏感词当中的字,在敏感词库中字为HashMap对象的Key键值
                char keyChar = key.charAt(i);
                //不是汉字数字字母直接跳过
                Matcher match = pattern.matcher(String.valueOf(keyChar));
                boolean matched = match.matches();
                if(!matched) {
                    continue;
                }

                // 判断这个字是否存在于敏感词库中
                Object wordMap = nowMap.get(keyChar);
                if (wordMap != null) {
                    nowMap = (Map) wordMap;
                } else {
                    newWorMap = new HashMap<String, String>();
                    newWorMap.put("isEnd", "0");
                    nowMap.put(keyChar, newWorMap);
                    nowMap = newWorMap;
                }

                // 如果该字是当前敏感词的最后一个字,则标识为结尾字
                if (i == key.length() - 1) {
                    nowMap.put("isEnd", "1");
                }
            }
        }
        return newSensitiveWordMap;
    }

    /**
     * @description: 敏感词库敏感词数量
     * @author: maojialong
     * @date: 2018年1月30日 下午4:07:20
     * @return
     */
    public static int getWordSize() {
        if (SensitivewordEngine.sensitiveWordMap == null) {
            return 0;
        }
        return SensitivewordEngine.sensitiveWordMap.size();
    }

    /**
     * @description: 是否包含敏感词
     * @author: maojialong
     * @date: 2018年1月30日 下午2:47:37
     * @param txt
     * @param matchType
     * @return
     */
    public static boolean isContaintSensitiveWord(String txt, int matchType) {
        boolean flag = false;
        for (int i = 0; i < txt.length(); i++) {
            int matchFlag = checkSensitiveWord(txt, i, matchType);
            if (matchFlag > 0) {
                flag = true;
            }
        }
        return flag;
    }

    /**
     * @description: 获取敏感词内容
     * @author: maojialong
     * @date: 2018年1月30日 下午2:47:27
     * @param txt
     * @param matchType
     * @return
     */
    public static Set<String> getSensitiveWord(String txt, int matchType) {
        Set<String> sensitiveWordList = new HashSet<String>();

        for (int i = 0; i < txt.length(); i++) {
            int length = checkSensitiveWord(txt, i, matchType);
            if (length > 0) {
                // 将检测出的敏感词保存到集合中
                sensitiveWordList.add(txt.substring(i, i + length));
                i = i + length - 1;
            }
        }

        return sensitiveWordList;
    }

    /**
     * @description: 替换敏感词
     * @author: maojialong
     * @date: 2018年1月30日 下午2:47:15
     * @param txt
     * @param matchType
     * @param replaceChar
     * @return
     */
    public static String replaceSensitiveWord(String txt, int matchType, String replaceChar) {
        String resultTxt = txt;
        Set<String> set = getSensitiveWord(txt, matchType);
        Iterator<String> iterator = set.iterator();
        String word = null;
        String replaceString = null;
        while (iterator.hasNext()) {
            word = iterator.next();
            replaceString = getReplaceChars(replaceChar, word.length());
            try {
                resultTxt = resultTxt.replaceAll(word, replaceString);
            }catch(Exception e) {
            }
        }

        return resultTxt;
    }

    /**
     * @description: 获取替换字符
     * @author: maojialong
     * @date: 2018年1月30日 下午2:46:40
     * @param replaceChar
     * @param length
     * @return
     */
    private static String getReplaceChars(String replaceChar, int length) {
        String resultReplace = replaceChar;
        for (int i = 1; i < length; i++) {
            resultReplace += replaceChar;
        }

        return resultReplace;
    }

    /**
     * @description: 检查敏感词
     * @author: maojialong
     * @date: 2018年1月30日 下午2:45:50
     * @param txt
     * @param beginIndex
     * @param matchType
     * @return
     */
    public static int checkSensitiveWord(String txt, int beginIndex, int matchType) {
        boolean flag = false;
        // 记录敏感词数量
        int matchFlag = 0;
        char word = 0;
        Map nowMap = SensitivewordEngine.sensitiveWordMap;
        for (int i = beginIndex; i < txt.length(); i++) {
            word = txt.charAt(i);
            if(matchFlag > 0 && !flag ) {
                Matcher match = pattern.matcher(String.valueOf(word));
                boolean matched = match.matches();
                if(!matched) {
                    matchFlag++;
                    continue;
                }
            }
            // 判断该字是否存在于敏感词库中
            nowMap = (Map) nowMap.get(word);
            if (nowMap != null) {
                matchFlag++;
                // 判断是否是敏感词的结尾字,如果是结尾字则判断是否继续检测
                if ("1".equals(nowMap.get("isEnd"))) {
                    flag = true;
                    // 判断过滤类型,如果是小过滤则跳出循环,否则继续循环
                    if (SensitivewordEngine.minMatchTYpe == matchType) {
                        break;
                    }
                }
            } else {
                break;
            }
        }
        if (matchFlag < 2 || !flag) {
            matchFlag = 0;
        }
        return matchFlag;
    }
    
    /**
     * @description: 删除敏感词
     * @author: maojialong
     * @date: 2018年2月1日 上午11:40:45
     * @param keyWord
     */
    public static void removeSensitiveWordToHashMap(String keyWord) {
        int length = keyWord.length();
        Map<Integer,Map> tempMap = new HashMap<Integer,Map>();
        char word = 0;
        boolean flag = false;
        Map nowMap = sensitiveWordMap;
        for(int i = 0; i < length ; i++){
            word = keyWord.charAt(i);
            Map lastMap = nowMap;
            nowMap = (Map) nowMap.get(word);     //获取指定key
            if(nowMap != null){     //存在,则判断是否为最后一个
                tempMap.put(i, lastMap);
            }else{     //不存在,直接返回
                break;
            }
            if (i == length -1 && "1".equals(nowMap.get("isEnd"))) {
                flag = true;
            }
        }
        if(flag) {
            for(int i = length - 1; i >= 0 ; i--){
                word = keyWord.charAt(i);
                nowMap = tempMap.get(i); 
                Map m = (Map) nowMap.get(word);
                boolean last = m.size() == 1 && "1".equals(m.get("isEnd")) && i == length - 1;
                boolean notLast = m.size() == 1 && "0".equals(m.get("isEnd"));
                if(last || notLast) {
                    nowMap.remove(keyWord.charAt(i));
                }else {
                    break;
                }
            }
        }
    }
    
    public static void main(String[] args) throws InterruptedException {
        Set<String> sensitiveWord = new HashSet<String>();
        sensitiveWord.add("大娃");
        SensitivewordEngine.addNewSensitiveWord(sensitiveWord);
        String result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,我们都是葫芦娃", 2,"*");
        System.out.println(result);
        System.out.println(SensitivewordEngine.sensitiveWordMap);
        
        //新增或者批量新增
        sensitiveWord.add("大二娃");
        sensitiveWord.add("大二");
        SensitivewordEngine.addNewSensitiveWord(sensitiveWord);
        result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,我们现在读大二,我们都是葫芦娃", 2,"*");
        System.out.println(result);
        System.out.println(SensitivewordEngine.sensitiveWordMap);
        
        //删除
        SensitivewordEngine.removeSensitiveWordToHashMap("大二娃");
        result = SensitivewordEngine.replaceSensitiveWord("我是大娃,我弟弟是大二娃,我们现在读大二,我们都是葫芦娃", 2,"*");
        System.out.println(result);
        System.out.println(SensitivewordEngine.sensitiveWordMap);
        
    }
    
    
}

复制代码即可食用,最后的removeSensitiveWordToHashMap是我一个朋友帮忙写的,其他方法时参考网上的其他博文整理的

原文地址:https://www.cnblogs.com/angry-scholar/p/8442191.html