未登录词识别

未登录词:指不在词典中的词,例如:
---新词:杀马特
---命名实体:奥克兰


主要解决方案:基于规则合词,然后通过百度验证。

Start Char Char    1-2-Combine    #[图 n][里 f][市场 n][站 n]
Start Char Char Char    1-3-Combine    #
Start Char Char Char Char    1-4-Combine    #
Start Char Char Char Char Char    1-5-Combine    #
Start Char Char Char Char Char Char    1-6-Combine    #
Start Direction Char    1-2-Combine    #东澳站 南势站
Start Char Word    1-2-Combine    #[台 j][中港 nz][站 n]
Word Char Keyword    0-1-Combine    #[梨园 nz][寮 g][站 v][白沙 nz][屯 ng][站 n]
Char Char Keyword    0-1-Combine    #[商水县 ns][黄 a][寨 ng][站 n]
NumPrefix Num    0-1-Seq    #第五医院
Num NumSuffix    0-1-Seq    #93/号/酒家
Num Num    0-1-Combine #
Num Num Num    0-2-Combine #
Num Num Num Num    0-3-Combine #
Num Num Num Num Num    0-4-Combine #
Num Num Num Num Num Num    0-5-Combine #
Num Num Num Num Num Num Num    0-6-Combine #
Num Num Num Num Num Num Num Num    0-7-Combine #
Num Num Num Num Num Num Num Num Num    0-8-Combine #
Num Num Num Num Num Num Num Num Num Num    0-9-Combine #
Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-10-Combine    #
Letter Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-9-Combine    #
Letter Letter Letter Letter Letter Letter Letter Letter Letter    0-8-Combine    #
Letter Letter Letter Letter Letter Letter Letter Letter    0-7-Combine    #
Letter Letter Letter Letter Letter Letter Letter    0-6-Combine    #
Letter Letter Letter Letter Letter Letter    0-5-Combine    #
Letter Letter Letter Letter Letter    0-4-Combine    #
Letter Letter Letter Letter    0-3-Combine    #
Letter Letter Letter    0-2-Combine    #
Letter Letter    0-1-Combine    #
Num NumSuffix Keyword    0-1-Seq    #海口1号场BLACKSTONE球场
Num Char Char Keyword    0-2-Combine    #八里岔中学
Char Num Char Keyword    0-2-Combine    #八里岔中学
Char Char Num Keyword    0-2-Combine    #八里岔中学
package cn.tianditu.mt.common;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Rule grammar used to merge runs of segmented tokens into larger words.
 *
 * <p>Rules are read from tab-separated grammar files. Each rule maps a
 * sequence of {@link SegMarkType} values to one or more merge spans. The type
 * sequences are stored in a ternary search tree whose nodes are ordered by
 * the natural ordering of the {@code SegMarkType} enum, and {@link #tag(List)}
 * applies the longest matching rule at each position of a token list.
 */
public class Grammar {

    protected static Log logger = LogFactory.getLog(Grammar.class);

    /**
     * A node of the ternary search tree. {@code loNode}/{@code hiNode} hold
     * alternatives for the same position in a sequence, {@code eqNode} holds
     * the continuation for the next position.
     */
    public final class TSTNode {
        /** Rule attached to the sequence ending at this node, or {@code null} if none. */
        public CombinRule data = null;
        protected TSTNode loNode;
        protected TSTNode eqNode;
        protected TSTNode hiNode;
        /** The type this node compares against. */
        protected SegMarkType splitchar;

        public TSTNode(SegMarkType type) {
            this.splitchar = type;
        }
    }

    /** Root of the tree; {@code null} until the first sequence is added. */
    public TSTNode rootNode;

    /**
     * Inserts a type sequence into the tree, creating nodes as needed.
     *
     * @param word the type sequence to insert
     * @return the node at which the last element of {@code word} matched;
     *         the caller attaches the rule to its {@code data} field
     * @throws NullPointerException if {@code word} is {@code null}
     * @throws IndexOutOfBoundsException if {@code word} is empty
     */
    public TSTNode add(List<SegMarkType> word) {
        if (null == word) {
            throw new NullPointerException("空指针异常");
        }

        int charIndex = 0;
        if (null == rootNode) {
            rootNode = new TSTNode(word.get(0));
        }
        TSTNode currentNode = rootNode;
        while (true) {
            int charComp = word.get(charIndex).compareTo(currentNode.splitchar);
            if (charComp == 0) {
                // Element matched: advance in the sequence and descend.
                charIndex++;
                if (charIndex == word.size()) {
                    return currentNode;
                }
                if (null == currentNode.eqNode) {
                    currentNode.eqNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                if (null == currentNode.loNode) {
                    currentNode.loNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.loNode;
            } else {
                if (null == currentNode.hiNode) {
                    currentNode.hiNode = new TSTNode(word.get(charIndex));
                }
                currentNode = currentNode.hiNode;
            }
        }
    }

    /**
     * Looks up the node reached by an exact type sequence.
     *
     * @param word the type sequence to look up
     * @return the node at which the last element matched, or {@code null} if
     *         {@code word} is {@code null}, empty, or not present in the tree;
     *         the returned node's {@code data} may itself be {@code null} when
     *         the sequence is only a prefix of stored sequences
     */
    protected TSTNode getNode(List<SegMarkType> word) {
        if (null == word) {
            return null;
        }
        int len = word.size();
        if (len == 0) {
            return null;
        }
        TSTNode currentNode = rootNode; // current position in the tree
        int charIndex = 0; // position in the key of the element being compared
        SegMarkType cmpChar = word.get(charIndex);
        int charComp;
        while (true) {
            if (currentNode == null) { // not found
                return null;
            }
            charComp = cmpChar.compareTo(currentNode.splitchar);
            if (charComp == 0) { // equal: go down
                charIndex++;
                if (charIndex == len) { // found
                    return currentNode;
                } else {
                    cmpChar = word.get(charIndex); // advance in the key
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) { // smaller: go left
                currentNode = currentNode.loNode;
            } else { // larger: go right
                currentNode = currentNode.hiNode;
            }
        }
    }

    /**
     * Finds the longest rule whose type sequence matches the tokens starting
     * at {@code offset}.
     *
     * @param tokens the token list to match against
     * @param offset index of the first token to match
     * @return a {@link MatchRet} holding the last rule-carrying node reached
     *         and the index just past the matched tokens, or {@code null} when
     *         nothing matches, the tree is empty, {@code tokens} is
     *         {@code null}, or {@code offset} is outside the list
     */
    public MatchRet matchLong(List<WordInfo> tokens, int offset) {
        if (tokens == null || rootNode == null) {
            return null;
        }
        // An offset outside the list cannot match anything.
        if (offset < 0 || offset >= tokens.size()) {
            return null;
        }

        MatchRet ret = null;
        TSTNode currentNode = rootNode;
        int index = offset;
        while (currentNode != null) {
            int charComp = tokens.get(index).getType().compareTo(
                    currentNode.splitchar);
            if (charComp == 0) {
                index++;
                // Keep overwriting so the deepest rule-carrying node wins.
                if (currentNode.data != null) {
                    ret = new MatchRet(currentNode, index);
                }
                if (index == tokens.size()) {
                    return ret;
                }
                currentNode = currentNode.eqNode;
            } else if (charComp < 0) {
                currentNode = currentNode.loNode;
            } else {
                currentNode = currentNode.hiNode;
            }
        }
        return ret;
    }

    /**
     * Merges tokens according to the given spans and returns a new list.
     * The input list is not modified; each merged token keeps a view of its
     * source tokens.
     *
     * @param tokens the original token list
     * @param rules merge spans with absolute, inclusive start/end indexes
     *        into {@code tokens}; may be {@code null}
     * @return {@code tokens} itself when {@code rules} is {@code null},
     *         otherwise a new list with every span replaced by one token
     */
    private List<WordInfo> combineByRules(List<WordInfo> tokens, List<Combin> rules) {
        if (rules == null) {
            return tokens;
        }
        List<WordInfo> list = new ArrayList<WordInfo>();
        int i = 0;
        while (i < tokens.size()) {
            // First span starting here wins. Spans with end < start are ignored
            // so the loop always makes progress.
            Combin matched = null;
            for (Combin com : rules) {
                if (com.getStart() == i && com.getEnd() >= i) {
                    matched = com;
                    break;
                }
            }
            if (matched == null) {
                list.add(tokens.get(i));
                i++;
                continue;
            }

            int end = matched.getEnd();
            List<WordInfo> sub = tokens.subList(i, end + 1); // subList end is exclusive
            StringBuilder buff = new StringBuilder();
            for (WordInfo wordInfo : sub) {
                buff.append(wordInfo.getCn());
            }
            String cn = buff.toString();
            SegMarkType type = matched.getType();
            WordInfo info = new WordInfo(cn, null, type, sub);
            list.add(info);
            // Resume after the merged span. The previous code fell through here
            // and also added tokens.get(end + 1), which threw when the span
            // ended at the last token.
            i = end + 1;
        }
        return list;
    }

    /**
     * Merges spans in place, one rule after another. Only a single level of
     * merging is supported (no nested merges).
     *
     * <p>NOTE(review): each rule's indexes are applied to the list as it is
     * after the preceding rules have already shortened it, so with several
     * rules the later indexes must account for that shift. Currently unused.
     *
     * @param tokens the token list to modify
     * @param rules merge spans with inclusive start/end indexes
     */
    @SuppressWarnings("unused")
    private void combineOnce(LinkedList<WordInfo> tokens,
            List<Combin> rules) {

        for (Combin com : rules) {
            int start = com.getStart();
            int end = com.getEnd();
            SegMarkType type = com.getType();

            StringBuilder buff = new StringBuilder();
            for (int i = start; i <= end; i++) {
                WordInfo word = tokens.get(i);
                buff.append(word.getCn());
            }

            // Removing at 'start' repeatedly drops the whole span.
            int dis = end - start + 1;
            for (int i = 0; i < dis; i++) {
                tokens.remove(start);
            }

            String cn = buff.toString();
            WordInfo info = new WordInfo(cn, null, type);
            tokens.add(start, info);
        }
    }

    /**
     * Applies the grammar to a token list: at each position the longest
     * matching rule is looked up, its merge spans are translated to absolute
     * token indexes, and finally all spans are merged.
     *
     * @param tokens the token list to tag
     * @return the merged token list, or {@code null} when {@code tokens} is
     *         {@code null} or no grammar has been loaded
     */
    public List<WordInfo> tag(List<WordInfo> tokens) {
        if (tokens == null || rootNode == null) {
            return null;
        }
        List<Combin> rules = new ArrayList<Combin>();
        for (int i = 0; i < tokens.size();) {
            MatchRet ret = matchLong(tokens, i);
            if (null != ret) {
                CombinRule rule = ret.getNode().data; // rule stored in the tree
                // Index of the first token covered by the matched sequence.
                int base = ret.getIndex() - rule.getLen();
                for (Combin com : rule.getPosition()) {
                    // Rule spans are relative to the sequence; make them absolute.
                    rules.add(new Combin(base + com.getStart(),
                            base + com.getEnd(), com.getType()));
                }
                i = ret.getIndex(); // continue after the matched tokens
            } else {
                i++;
            }
        }
        return combineByRules(tokens, rules);
    }

    /**
     * Builds a grammar from the basic grammar file and the grammar file named
     * by the configuration, in that order.
     *
     * @param config supplies the two grammar file names
     */
    public Grammar(Config config) {
        loadGrammar(config.getBasicGramFileName());
        loadGrammar(config.getGramFileName());
    }

    /**
     * Loads rules from a grammar file into the tree. Each line holds a
     * space-separated type sequence, a tab, and a rule specification; any
     * further tab-separated columns are ignored.
     *
     * <p>Blank lines are skipped silently. Malformed lines are logged and
     * skipped so that one bad line does not discard the rest of the file.
     * A missing or unreadable file is logged and leaves the grammar as it was.
     *
     * @param gramFileName path of the grammar file
     */
    public void loadGrammar(String gramFileName) {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(gramFileName));
            String line;
            int lineNo = 0;
            while ((line = reader.readLine()) != null) {
                lineNo++;
                String[] arr = line.split("\t");
                if (arr.length < 2) {
                    if (line.trim().length() > 0) {
                        logger.warn("Skipping grammar line " + lineNo + " of "
                                + gramFileName + " (no tab-separated rule column): " + line);
                    }
                    continue;
                }
                try {
                    // Parse both columns before touching the tree so a bad
                    // line leaves no partial entry behind.
                    List<SegMarkType> seq = formSeq(arr[0]);
                    CombinRule rule = formRule(arr[1], seq.size());
                    TSTNode node = this.add(seq);
                    node.data = rule;
                } catch (IllegalArgumentException e) {
                    // Covers unknown type names, bad numbers and bad spans.
                    logger.warn("Skipping malformed grammar line " + lineNo + " of "
                            + gramFileName + ": " + line, e);
                }
            }
        } catch (FileNotFoundException e) {
            logger.error("Grammar file not found: " + gramFileName, e);
        } catch (IOException e) {
            logger.error("Failed to read grammar file: " + gramFileName, e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    logger.warn("Failed to close grammar file: " + gramFileName, e);
                }
            }
        }
    }

    /**
     * Parses a rule specification of the form {@code start-end-Type}, with
     * several such spans optionally separated by {@code #}.
     *
     * @param line the rule specification column
     * @param size length of the type sequence the rule belongs to
     * @return the parsed rule
     * @throws IllegalArgumentException if a span is not of the form
     *         {@code start-end-Type}, a number or type name is invalid, or the
     *         span does not satisfy {@code 0 <= start <= end < size}
     */
    private CombinRule formRule(String line, int size) {
        List<Combin> rec = new ArrayList<Combin>();
        String[] specs = line.split("#");
        for (String spec : specs) {
            String[] fields = spec.split("-");
            if (fields.length < 3) {
                throw new IllegalArgumentException(
                        "Expected start-end-Type but got: " + spec);
            }
            int start = Integer.parseInt(fields[0].trim());
            int end = Integer.parseInt(fields[1].trim());
            // Reject spans that would index outside the matched sequence.
            if (start < 0 || end < start || end >= size) {
                throw new IllegalArgumentException("Span " + start + "-" + end
                        + " is outside a sequence of length " + size);
            }
            SegMarkType type = Enum.valueOf(SegMarkType.class, fields[2].trim());
            rec.add(new Combin(start, end, type));
        }
        return new CombinRule(rec, size);
    }

    /**
     * Parses a space-separated list of {@link SegMarkType} names. Empty
     * entries caused by repeated spaces are ignored.
     *
     * @param string the type sequence column
     * @return the parsed sequence, never empty
     * @throws IllegalArgumentException if a name is not a {@code SegMarkType}
     *         constant or the column contains no names
     */
    private List<SegMarkType> formSeq(String string) {
        List<SegMarkType> list = new ArrayList<SegMarkType>();
        String[] arr = string.split(" ");
        for (String str : arr) {
            if (str.length() == 0) {
                continue;
            }
            SegMarkType type = Enum.valueOf(SegMarkType.class, str);
            list.add(type);
        }
        if (list.isEmpty()) {
            throw new IllegalArgumentException("Empty type sequence");
        }
        return list;
    }

}
原文地址:https://www.cnblogs.com/i80386/p/3965091.html