敏感词校验 DFA 算法

为什么需要过滤

在Web系统中,用户需要进行评论/回复,或者发布一些观点,服务器需要对用户发布的内容进行过滤(如果想被请去喝茶的话可以选择不过滤),过滤的内容当然包括色情、暴力、政府相关等

在分析需求后,google 相关的其他系统是如何实现的. DFA 算法,可以比较高效的进行匹配 。如果使用 for 循环逐个匹配效率肯定是不行的

什么是DFA算法

Deterministic Finite Automaton ,也就是确定有穷自动机 ,通过 event 与 state 得到下一个 state ,即状态的转换

DFA 算法的实现

  1.  构建存储结构(构建类似树型结构匹配)
  2. 关键词匹配

通过加载数据库数据关键词,判断内容是否合法

/**
 *  修改为每次都读取一次 ,也可以系统启动读取一次(不实时)
 */
@Component
public class SensitiveWordFilter {

    private final SensitiveWordDao sensitiveWordDao;

    @SuppressWarnings("rawtypes")
    private Map sensitiveWordMap = null;
    public static int minMatchTYpe = 1;
    public static int maxMatchType = 2;


    public SensitiveWordFilter(SensitiveWordDao sensitiveWordDao){
        this.sensitiveWordDao = sensitiveWordDao;
//        Set<String> keyWordSet  = sensitiveWordDao.list();
//        addSensitiveWordToHashMap(keyWordSet);
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    private void addSensitiveWordToHashMap(Set<String> keyWordSet) {
        sensitiveWordMap = new HashMap(keyWordSet.size());
        String key = null;
        Map nowMap = null;
        Map<String, String> newWorMap = null;
        Iterator<String> iterator = keyWordSet.iterator();
        while(iterator.hasNext()){
            key = iterator.next();
            nowMap = sensitiveWordMap;
            for(int i = 0 ; i < key.length() ; i++){
                char keyChar = key.charAt(i);
                Object wordMap = nowMap.get(keyChar);

                if(wordMap != null){
                    nowMap = (Map) wordMap;
                }
                else{
                    newWorMap = new HashMap<String,String>();
                    newWorMap.put("isEnd", "0");
                    nowMap.put(keyChar, newWorMap);
                    nowMap = newWorMap;
                }

                if(i == key.length() - 1){
                    nowMap.put("isEnd", "1");
                }
            }
        }
    }



    public boolean isContainSensitiveWord(String txt, int matchType){
        boolean flag = false;
        for(int i = 0 ; i < txt.length() ; i++){
            int matchFlag = this.CheckSensitiveWord(txt, i, matchType);
            if(matchFlag > 0){
                flag = true;
            }
        }
        return flag;
    }


    public Set<String> getSensitiveWord(String txt , int matchType){
        Set<String> sensitiveWordList = new HashSet<String>();

        for(int i = 0 ; i < txt.length() ; i++){
            int length = CheckSensitiveWord(txt, i, matchType);
            if(length > 0){
                sensitiveWordList.add(txt.substring(i, i+length));
                i = i + length - 1;
            }
        }

        return sensitiveWordList;
    }


    public String replaceSensitiveWord(String txt,int matchType,String replaceChar){
        String resultTxt = txt;
        Set<String> set = getSensitiveWord(txt, matchType);
        Iterator<String> iterator = set.iterator();
        String word = null;
        String replaceString = null;
        while (iterator.hasNext()) {
            word = iterator.next();
            replaceString = getReplaceChars(replaceChar, word.length());
            resultTxt = resultTxt.replaceAll(word, replaceString);
        }

        return resultTxt;
    }


    private String getReplaceChars(String replaceChar,int length){
        String resultReplace = replaceChar;
        for(int i = 1 ; i < length ; i++){
            resultReplace += replaceChar;
        }

        return resultReplace;
    }


    @SuppressWarnings({ "rawtypes"})
    public int CheckSensitiveWord(String txt,int beginIndex,int matchType){
        Set<String> keyWordSet  = sensitiveWordDao.list();
        addSensitiveWordToHashMap(keyWordSet);
        boolean  flag = false;
        int matchFlag = 0;
        char word = 0;
        Map nowMap = sensitiveWordMap;
        for(int i = beginIndex; i < txt.length() ; i++){
            word = txt.charAt(i);
            nowMap = (Map) nowMap.get(word);
            if(nowMap != null){
                matchFlag++;
                if("1".equals(nowMap.get("isEnd"))){
                    flag = true;
                    if(SensitiveWordFilter.minMatchTYpe == matchType){
                        break;
                    }
                }
            }
            else{
                break;
            }
        }
        if(matchFlag < 2 || !flag){
            matchFlag = 0;
        }
        return matchFlag;
    }
}

 参考:

https://blog.csdn.net/weixin_43378396/article/details/105910145

https://blog.csdn.net/chenssy/article/details/26961957

https://www.cnblogs.com/twoheads/p/11349541.html

原文地址:https://www.cnblogs.com/bytecodebuffer/p/14486877.html