Lucene 随笔：IKAnalyzer 与 StandardAnalyzer

StandardAnalyzer 对中文按单字切分，英文按单词切分并转为小写：

String msg = "我喜欢你，我的祖国！china 中国,I love you！中华人民共和国";

分词后的结果：[我],[喜],[欢],[你],[我],[的],[祖],[国],[china],[中],[国],[i],[love],[you],[中],[华],[人],[民],[共],[和],[国]

IKAnalyzer 是中文分词器，按词语切分（下面是 new IKAnalyzer(true) 最大词长切分的结果）：

分词后的结果：[我],[喜欢],[你],[我],[的],[祖国],[china],[中国],[i],[love],[you],[中华人民共和国]

package com.shrio.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.IOException;
import java.io.StringReader;

/**
 * Demo that tokenizes a text with Lucene's {@link StandardAnalyzer} and with
 * {@link IKAnalyzer} and prints the resulting tokens to standard output.
 *
 * Created by luojie on 2018/4/24.
 */
public class ChineseAnalyerDemo {
    /**
     * Tokenizes {@code msg} with Lucene's built-in {@link StandardAnalyzer} and prints the tokens.
     * StandardAnalyzer is a general-purpose analyzer, not a Chinese word segmenter:
     * Chinese text comes out as single characters.
     *
     * @param msg the text to tokenize
     */
    public void standardAnalyer(String msg){
        // Analyzer is Closeable; release it as soon as the tokens have been printed.
        try (StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_4)) {
            this.getTokens(analyzer, msg);
        }
    }


    /**
     * Tokenizes {@code msg} with the IK Analyzer and prints the tokens.
     *
     * @param msg the text to tokenize
     */
    public void iKanalyer(String msg){
        // true => the tokenizer performs maximum-word-length segmentation;
        // the no-arg constructor new IKAnalyzer() is the alternative mode.
        try (IKAnalyzer analyzer = new IKAnalyzer(true)) {
            this.getTokens(analyzer, msg);
        }
    }

    /**
     * Runs {@code msg} through {@code analyzer} as field "content" and prints every token.
     * An {@link IOException} raised while opening, resetting, ending or closing the stream
     * is reported via its stack trace and not propagated.
     *
     * @param analyzer the analyzer producing the token stream
     * @param msg      the text to tokenize
     */
    private void getTokens(Analyzer analyzer, String msg) {
        // try-with-resources completes the TokenStream contract
        // (reset -> incrementToken -> end -> close) even when a step throws.
        try (TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(msg))) {
            tokenStream.reset();
            this.printTokens(analyzer.getClass().getSimpleName(), tokenStream);
            tokenStream.end();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Consumes {@code tokenStream} and prints one line of the form
     * {@code <analyzerType>->[t1],[t2],...}. If reading fails part-way, the stack trace
     * is printed and the tokens collected so far are still emitted.
     *
     * @param analyzerType label printed in front of the token list
     * @param tokenStream  a stream that has already been reset
     */
    private void printTokens(String analyzerType,TokenStream tokenStream){
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        // Local, single-threaded buffer: StringBuilder avoids StringBuffer's locking.
        StringBuilder result = new StringBuilder();
        try {
            while (tokenStream.incrementToken()) {
                if (result.length() > 0) {
                    result.append(",");
                }
                result.append("[").append(term.toString()).append("]");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        System.out.println(analyzerType + "->" + result.toString());
    }
}

package com.shrio.lucene;

import org.junit.Before;
import org.junit.Test;

/**
 * Smoke tests that feed a sample sentence to the analyzers of
 * {@link ChineseAnalyerDemo}. The token lists are printed, not asserted.
 *
 * Created by luojie on 2018/4/24.
 */
public class TestChineseAnalyizer {

    /** Mixed Chinese/English sample sentence passed to the analyzers. */
    private static final String SAMPLE_TEXT = "我喜欢你，我的祖国！china 中国,I love you！中华人民共和国";
    // Alternative English-only sample: "I love you, China!B2C"

    /** Demo under test; re-created before every test method. */
    private ChineseAnalyerDemo analyzerDemo;

    /** Creates a fresh demo instance for each test. */
    @Before
    public void setUp() throws Exception {
        this.analyzerDemo = new ChineseAnalyerDemo();
    }

    /** Prints the StandardAnalyzer tokens followed by the IKAnalyzer tokens for comparison. */
    @Test
    public void testStandardAnalyer() {
        this.analyzerDemo.standardAnalyer(SAMPLE_TEXT);
        this.analyzerDemo.iKanalyer(SAMPLE_TEXT);
    }

    /** Prints the IKAnalyzer tokens only. */
    @Test
    public void testIkAnalyzer() {
        this.analyzerDemo.iKanalyer(SAMPLE_TEXT);
    }
}

IKAnalyzer 独立使用时配置扩展词典

IKAnalyzer.cfg.xml 必须放在 classpath 根目录下（例如 src 根目录）

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>

    <comment>IK Analyzer 扩展配置</comment>
    <!-- Users can configure their own extension dictionary here -->
    <entry key="ext_dict">mydict.dic</entry>
    <!-- Users can configure their own extension stop-word dictionary here -->
    <entry key="ext_stopwords">ext_stopword.dic</entry>

</properties>