Lucene中Analyzer语句分析

    本文利用 Lucene 自带的词法分析工具 Analyzer,对句子进行分词分析。

源代码如下:

 1 package com.test;
 2 
 3 import java.io.IOException;
 4 import java.io.StringReader;
 5 import java.util.List;
 6 
 7 import org.apache.lucene.analysis.Analyzer;
 8 import org.apache.lucene.analysis.SimpleAnalyzer;
 9 import org.apache.lucene.analysis.StopAnalyzer;
10 import org.apache.lucene.analysis.Token;
11 import org.apache.lucene.analysis.TokenStream;
12 import org.apache.lucene.analysis.WhitespaceAnalyzer;
13 import org.apache.lucene.analysis.standard.StandardAnalyzer;
14 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
15 import org.apache.lucene.util.Version;
16 
17 import com.bean.mashupDerscriptionTest;
18 import com.daoImpl.MashupDaoImpl;
19 import com.gargoylesoftware.htmlunit.javascript.host.Comment;
20 
21 public class KeyWordsTest {
22 
23     /**
24      * Loads every mashup record through {@code MashupDaoImpl}, tokenizes its
25      * description with Lucene's {@code StopAnalyzer}, and prints the tokens
26      * of each record to standard output, followed by a numbered
27      * end-of-record line.
28      *
29      * @param args command-line arguments; not used
30      */
31     public static void main(String[] args) {
32         MashupDaoImpl mashupDao = new MashupDaoImpl();
33         List<mashupDerscriptionTest> list = mashupDao
34                 .findAllmashupDescripteonTest();
35         // A single analyzer is enough: each tokenStream() call below gets its own reader.
36         Analyzer analyzer = new StopAnalyzer();
37         int i = 1;
38         for (mashupDerscriptionTest mashup : list) {
39             // Use the name as the description when the description is missing or empty.
40             String comment = mashup.getComments();
41             if (comment == null || comment.equals("")) {
42                 comment = mashup.getName();
43             }
44             if (comment == null) {
45                 // StringReader rejects null, so analyze an empty text instead.
46                 comment = "";
47             }
48             // Analyze the description with the Lucene analyzer: the text is split on
49             // whitespace and symbols, and stop words (is, are, in, on, the, ...) are dropped.
50             TokenStream tStream = analyzer.tokenStream("", new StringReader(comment));
51             try {
52                 Token t;
53                 while ((t = tStream.next()) != null) {
54                     // Print each token followed by a single space.
55                     System.out.print(t.termText() + " ");
56                 }
57                 System.out.println((i++) + "条描述分词结束!");
58             } catch (IOException e) {
59                 e.printStackTrace();
60             } finally {
61                 // Always release the token stream, even when tokenizing fails.
62                 try {
63                     tStream.close();
64                 } catch (IOException e) {
65                     e.printStackTrace();
66                 }
67             }
68         }
69     }
70 }

  注:示例中的数据来自数据库。

原文地址:https://www.cnblogs.com/rememberme/p/Lucene_Analyzer.html