用 Lucene 4.10.2 的标准分词器(StandardAnalyzer)进行分词

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Demonstrates tokenization with Lucene 4.10.2's {@code StandardAnalyzer}.
 *
 * <p>StandardAnalyzer lower-cases tokens and drops stop words and punctuation.
 * This demo merges a custom stop-word list with Lucene's built-in English
 * stop-word set, then prints each token of a sample mixed Chinese/English
 * string on its own line.
 */
public class StandardAnalyzerTest {

	public static void main(String[] args) {
		// Sample text to tokenize: mixed Chinese, English, punctuation and an IP.
		String text = "你好*hello,哈哈! !'ni> < !' hao->。“我”.192.168.8.10";

		// Custom stop words. The empty string is added only to demonstrate the
		// mechanism; it never matches a produced token.
		List<String> sw = new LinkedList<String>();
		sw.add("");
		CharArraySet stopWords = new CharArraySet(sw, true); // true = ignore case

		// Merge in Lucene's default English stop-word set. CharArraySet is a
		// java.util.Set, so addAll replaces the former manual Iterator loop.
		stopWords.addAll(StandardAnalyzer.STOP_WORDS_SET);

		// try-with-resources closes both the analyzer and the token stream even
		// when an exception is thrown, fixing the leak that the previous
		// @SuppressWarnings("resource") merely hid (the analyzer was never
		// closed, and ts.close() was skipped on error).
		try (StandardAnalyzer analyzer = new StandardAnalyzer(stopWords);
				TokenStream ts = analyzer.tokenStream("field", text)) {
			CharTermAttribute ch = ts.addAttribute(CharTermAttribute.class);
			ts.reset(); // mandatory before the first incrementToken()
			while (ts.incrementToken()) {
				System.out.println(ch.toString());
			}
			ts.end(); // per TokenStream contract: record final offset state
		} catch (Exception ex) {
			ex.printStackTrace();
		}
	}

}
分词结果如下:
你
好
hello
哈
哈
ni
hao
我
192.168.8.10

  

原文地址:https://www.cnblogs.com/xiaodf/p/5027174.html