Adding Your Own Chinese Word-Segmentation Analyzer to DotLucene/Lucene.Net

A very simple, though not especially optimized, approach: write subclasses of Lucene.Net.Analysis.Analyzer, Lucene.Net.Analysis.Tokenizer, and Lucene.Net.Analysis.TokenFilter. The code below is modeled on the Lucene.Net.Analysis.Cn project, which segments Chinese text one character at a time (unigram segmentation).

The ChineseAnalyzer class, which inherits from Lucene.Net.Analysis.Analyzer:

using System;
using System.IO;
using System.Text;
using System.Collections;
using ShootSeg; // namespace of the segmenter; this open-source component comes from http://www.shootsoft.net -- thanks to its author
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.CnByKing
{
    public class ChineseAnalyzer : Analyzer
    {
        private Segment segment = new Segment(); // the Chinese word segmenter itself

        public ChineseAnalyzer()
        {
            segment.InitWordDics();  // load the dictionaries in the constructor
            segment.Separator = "|"; // separator inserted between segmented words
        }

        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new ChineseTokenizer(reader, segment); // hand the segmenter to the tokenizer
            result = new ChineseFilter(result);                         // then filter the tokenized result
            return result;
        }
    }
}
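
To sanity-check the chain, you can pull tokens straight off the analyzer. A minimal sketch, assuming the same usings as the class above plus System (TermText/StartOffset/EndOffset are the Token accessors in the Lucene.Net 1.9-era API; the input string is a placeholder):

Analyzer analyzer = new ChineseAnalyzer();
TokenStream stream = analyzer.TokenStream("content", new StringReader("待分词的中文文本"));
// print each token with its [start, end) offsets in the original text
for (Token t = stream.Next(); t != null; t = stream.Next())
    Console.WriteLine("{0} [{1},{2})", t.TermText(), t.StartOffset(), t.EndOffset());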

The ChineseTokenizer class inherits from Lucene.Net.Analysis.Tokenizer:

using System;
using System.IO;
using System.Text;
using System.Collections;
using System.Globalization;
using ShootSeg;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.CnByKing
{
    public sealed class ChineseTokenizer : Tokenizer
    {
        private Segment segment;
        private string[] Wordlist; // the segmented words end up in this array
        private string Allstr;     // the whole input stream read into this string
        private int offset = 0; int start = 0; int step = 0; // offset = search offset, start = word start position, step = index into Wordlist

        public ChineseTokenizer(TextReader _in, Segment segment)
        {
            input = _in;
            Allstr = input.ReadToEnd(); // read the stream into Allstr
            this.segment = segment;     // keep a reference to the segmenter (in hindsight this field is unnecessary)
            Wordlist = segment.SegmentText(Allstr).Split('|'); // put the segmented words into Wordlist
        }

        private Token Flush(string str)
        {
            if (str.Length > 0)
            {
                // return a Token carrying the word plus its start and end positions in the stream
                return new Token(str, start, start + str.Length);
            }
            else
                return null;
        }

        public override Token Next() // override Next, i.e. return the next Token
        {
            Token token = null;
            if (step < Wordlist.Length) // must be <, not <=, or Wordlist[step] throws after the last word
            {
                start = Allstr.IndexOf(Wordlist[step], offset); // find this word's start position in Allstr
                offset = start + 1;            // advance the search offset past this hit
                token = Flush(Wordlist[step]); // emit the segmented word
                step = step + 1;               // move on to the next word in Wordlist
            }
            return token;
        }
    }
}
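
The IndexOf/offset bookkeeping in Next() is what makes a repeated word map to its successive occurrences rather than always to the first one. A standalone trace of just that logic, with hypothetical segmenter output (no ShootSeg needed; drop it into a console Main):

string allstr = "今天天气今天很好";                     // illustrative input
string[] wordlist = { "今天", "天气", "今天", "很好" }; // assumed segmenter output
int offset = 0;
foreach (string w in wordlist)
{
    int start = allstr.IndexOf(w, offset); // search from just past the previous hit
    offset = start + 1;
    Console.WriteLine("{0} -> [{1},{2})", w, start, start + w.Length);
}
// prints: 今天 -> [0,2)  天气 -> [2,4)  今天 -> [4,6)  很好 -> [6,8)
// note the second 今天 lands at 4, not back at 0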

This ChineseFilter inherits from Lucene.Net.Analysis.TokenFilter and is copied verbatim from the same-named class in the Lucene.Net.Analysis.Cn project (it drops digits, symbols, and English stop words; to filter anything else, add the corresponding code, as in the sketch after the class):
using System;
using System.IO;
using System.Collections;
using System.Globalization;
using Lucene.Net.Analysis;

namespace Lucene.Net.Analysis.CnByKing
{
    /// <summary>
    /// Title: ChineseFilter
    /// Description: Filter with a stop word table
    /// Rule: No digital is allowed.
    /// English word/token should larger than 1 character.
    /// One Chinese character as one Chinese word.
    /// TO DO:
    /// 1. Add Chinese stop words, such as \ue400
    /// 2. Dictionary based Chinese word extraction
    /// 3. Intelligent Chinese word extraction
    ///
    /// Copyright: Copyright (c) 2001
    /// Company:
    /// @author Yiyi Sun
    /// @version $Id: ChineseFilter.java, v 1.4 2003/01/23 12:49:33 ehatcher Exp $
    /// </summary>
    public sealed class ChineseFilter : TokenFilter
    {
        // Only English now, Chinese to be added later.
        public static String[] STOP_WORDS =
        {
            "and", "are", "as", "at", "be", "but", "by",
            "for", "if", "in", "into", "is", "it",
            "no", "not", "of", "on", "or", "such",
            "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with"
        };

        private Hashtable stopTable;

        public ChineseFilter(TokenStream _in)
            : base(_in)
        {
            stopTable = new Hashtable(STOP_WORDS.Length);

            for (int i = 0; i < STOP_WORDS.Length; i++)
                stopTable[STOP_WORDS[i]] = STOP_WORDS[i];
        }

        public override Token Next()
        {
            for (Token token = input.Next(); token != null; token = input.Next())
            {
                String text = token.TermText();

                // why not key off token type here assuming ChineseTokenizer comes first?
                if (stopTable[text] == null)
                {
                    switch (Char.GetUnicodeCategory(text[0]))
                    {
                        case UnicodeCategory.LowercaseLetter:
                        case UnicodeCategory.UppercaseLetter:
                            // English word/token should larger than 1 character.
                            if (text.Length > 1)
                            {
                                return token;
                            }
                            break;

                        case UnicodeCategory.OtherLetter:
                            // One Chinese character as one Chinese word.
                            // Chinese word extraction to be added later here.
                            return token;
                    }
                }
            }
            return null;
        }
    }
}
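
Out of the box the filter only stops English words. A minimal sketch of one way to also drop common Chinese function words: add a second list to the same stopTable in the constructor (the list below is only a sample I am assuming for illustration; pick words to suit your corpus):

// hypothetical extra stop list -- extend to taste
private static readonly string[] CN_STOP_WORDS = { "的", "了", "和", "是", "在" };

public ChineseFilter(TokenStream _in)
    : base(_in)
{
    stopTable = new Hashtable(STOP_WORDS.Length + CN_STOP_WORDS.Length);
    foreach (string w in STOP_WORDS)
        stopTable[w] = w;
    foreach (string w in CN_STOP_WORDS)
        stopTable[w] = w;
}

Because the stopTable lookup in Next() runs before the Unicode-category switch, the Chinese entries are dropped with no other changes to the class.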

None of the above is technically sophisticated, but the payoff is that plugging in a new Chinese segmenter, whatever algorithm it uses, takes only a few lines of code, and the segmentation stays completely decoupled from DotLucene/Lucene.Net itself. To use it, just replace StandardAnalyzer with ChineseAnalyzer.
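
For instance, indexing and searching with the new analyzer looks roughly like this against the Lucene.Net 1.9-era API (a sketch; the index path, field name, and sample text are placeholders):

using System;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.CnByKing;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;

// index a document with the custom analyzer
Analyzer analyzer = new ChineseAnalyzer();
IndexWriter writer = new IndexWriter("index", analyzer, true);
Document doc = new Document();
doc.Add(new Field("content", "这里是要索引的中文内容", Field.Store.YES, Field.Index.TOKENIZED));
writer.AddDocument(doc);
writer.Close();

// search it with the same analyzer, so query terms are segmented the same way
IndexSearcher searcher = new IndexSearcher("index");
Query query = new QueryParser("content", analyzer).Parse("中文");
Hits hits = searcher.Search(query);
for (int i = 0; i < hits.Length(); i++)
    Console.WriteLine(hits.Doc(i).Get("content"));
searcher.Close();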
Click Here To Download gets you the compiled lucene.net 1.91, Lucene.Net.Analysis.CnByKing.dll, and ShootSeg.dll; reference these three assemblies and simple Chinese search is sorted.


Original article: https://www.cnblogs.com/HeroBeast/p/1361984.html