【Neo4j】踩坑大会-Neo4j用中文索引

正在用的Neo4j是当前最新版：3.1.0，各种踩坑。说一下如何在Neo4j 3.1.0中使用中文索引。选用了IKAnalyzer做分词器。

1. 首先参考文章：

https://segmentfault.com/a/1190000005665612

里面大致讲了用IKAnalyzer做索引的方式。但并不清晰，实际上，这篇文章的背景是用嵌入式Neo4j，即Neo4j一定要嵌入在你的Java应用中（https://neo4j.com/docs/java-reference/current/#tutorials-java-embedded），切记。否则无法使用自定义的Analyzer。其次，文中的方法现在用起来已经有问题了，因为Neo4j 3.1.0用了lucene5.5，故官方的IKAnalyzer已经不适用了。

2. 修正

转用 IKAnalyzer2012FF_u1.jar，在Google可以下载到（https://code.google.com/archive/p/ik-analyzer/downloads）。这个版本是有小伙伴为了修复IKAnalyzer不适配lucene3.5以上版本的问题而修改的一个版本。但是用了这个包仍有问题，报错提示：

Caused by: java.lang.AbstractMethodError: org.apache.lucene.analysis.Analyzer.createComponents(Ljava/lang/String;)Lorg/apache/lucene/analysis/Analyzer$TokenStreamComponents;

即IKAnalyzer的Analyzer类和当前版本的lucene仍有不适配的地方。

解决方案：再增加两个类


package com.uc.wa.function;
 
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
 
/**
 * IK-based {@link Analyzer} for Lucene 5.x.
 *
 * <p>Implements the single-argument {@code createComponents(String)} hook and
 * delegates tokenization to {@code IKTokenizer5x}. The older two-argument
 * variant that also received a {@code Reader} is intentionally not provided.
 */
public class IKAnalyzer5x extends Analyzer {

    /** Passed to every tokenizer this analyzer creates. */
    private boolean useSmart;

    /** Creates an analyzer with {@code useSmart} set to {@code false}. */
    public IKAnalyzer5x() {
        this(false);
    }

    /**
     * Creates an analyzer with the given segmentation flag.
     *
     * @param useSmart value handed to {@code IKTokenizer5x} when components are built
     */
    public IKAnalyzer5x(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * @return the current value of the {@code useSmart} flag
     */
    public boolean useSmart() {
        return this.useSmart;
    }

    /**
     * Replaces the {@code useSmart} flag used for tokenizers created afterwards.
     *
     * @param useSmart new flag value
     */
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /**
     * Builds the token stream components for a field.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @return components wrapping a new {@code IKTokenizer5x}
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Read the flag through the accessor so subclasses overriding it are honoured.
        return new TokenStreamComponents(new IKTokenizer5x(useSmart()));
    }
}


package com.uc.wa.function;
 
import java.io.IOException;
 
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
 
/**
 * Lucene 5.x {@link Tokenizer} that emits the lexemes produced by an
 * {@link IKSegmenter}.
 *
 * <p>Each lexeme is published through the term, offset and type attributes.
 */
public class IKTokenizer5x extends Tokenizer {

    /** IK segmenter that performs the actual segmentation. */
    private final IKSegmenter segmenter;

    /** Text of the current term. */
    private final CharTermAttribute termAtt;
    /** Begin/end offsets of the current term. */
    private final OffsetAttribute offsetAtt;
    /** Type of the current term (see the type constants in org.wltea.analyzer.core.Lexeme). */
    private final TypeAttribute typeAtt;
    /** End position of the last term emitted for the current input. */
    private int endPosition;

    /**
     * Creates a tokenizer using the no-argument {@link Tokenizer} constructor.
     *
     * @param useSmart flag forwarded to {@link IKSegmenter}
     */
    public IKTokenizer5x(boolean useSmart) {
        super();
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        // The segmenter is re-bound to the current reader on every reset().
        segmenter = new IKSegmenter(input, useSmart);
    }

    /**
     * Advances to the next lexeme and copies it into the attributes.
     *
     * @return {@code true} if a term was produced, {@code false} when the
     *         segmenter has no more lexemes
     * @throws IOException if the segmenter fails while reading the input
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public boolean incrementToken() throws IOException {
        // Drop whatever the previous term left in the attributes.
        clearAttributes();
        Lexeme nextLexeme = segmenter.next();
        if (nextLexeme == null) {
            // No more terms.
            return false;
        }
        // Term text and length.
        termAtt.append(nextLexeme.getLexemeText());
        termAtt.setLength(nextLexeme.getLength());
        // Term offsets.
        offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
        // Remember where this term ended; end() reports it as the final offset.
        endPosition = nextLexeme.getEndPosition();
        // Term type.
        typeAtt.setType(nextLexeme.getLexemeTypeString());
        return true;
    }

    /**
     * Resets the stream and points the segmenter at the current input.
     *
     * @throws IOException if the superclass reset fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        segmenter.reset(input);
        // Forget the previous input's last term so that an input with no
        // terms does not report a stale final offset from end().
        endPosition = 0;
    }

    /**
     * Finishes the stream and sets the final offset to the end of the last
     * term emitted.
     *
     * @throws IOException if the superclass end-of-stream handling fails
     */
    @Override
    public final void end() throws IOException {
        // Overrides of TokenStream.end() must call super.end() first.
        super.end();
        // set final offset
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}

以上两个类解决了 IKAnalyzer2012FF_u1.jar 和 lucene5 不适配的问题。使用时用IKAnalyzer5x替换IKAnalyzer即可。

3. 最后

Neo4j中文索引建立和搜索示例：


    /**
     * Adds selected properties of a single node to the node index named
     * "NodeFullTextIndex", which is configured with {@code IKAnalyzer5x} as
     * its analyzer.
     *
     * @param id       id of the node to index
     * @param propKeys keys of the node properties to add to the index
     */
    public static void createFullTextIndex(long id, List<String> propKeys) {
        log.info("method[createFullTextIndex] begin.propKeys<"+propKeys+">");

        try (Transaction tx = Neo4j.graphDb.beginTx()) {
            Index<Node> entityIndex = Neo4j.graphDb.index().forNodes("NodeFullTextIndex",
                    MapUtil.stringMap(IndexManager.PROVIDER, "lucene", "analyzer", IKAnalyzer5x.class.getName()));

            Node node = Neo4j.graphDb.getNodeById(id);
            // Use the defaulting overload: this line is only diagnostic, so a node
            // without "knowledge_name" must not abort the indexing transaction.
            log.info("method[createFullTextIndex] get node id<"+node.getId()+"> name<"
                    +node.getProperty("knowledge_name", null)+">");

            // Fetch the requested properties of the node.
            Set<Map.Entry<String, Object>> properties = node.getProperties(propKeys.toArray(new String[0]))
                    .entrySet();
            for (Map.Entry<String, Object> property : properties) {
                log.info("method[createFullTextIndex] index prop<"+property.getKey()+":"+property.getValue()+">");
                entityIndex.add(node, property.getKey(), property.getValue());
            }
            tx.success();
        }
    }


    /**
     * Queries the node index named "NodeFullTextIndex" and returns the
     * properties of every matching node.
     *
     * <p>The query is built by {@code IKQueryParser.parseMultiField(fields, query)}.
     * Each returned map holds all properties of one hit, plus a "score" entry
     * when the hit's score is not NaN.
     *
     * @param fields fields to search
     * @param query  search text
     * @return one property map per hit, in the order the index returned them
     * @throws IOException rethrown after being logged; presumably raised while
     *                     building the query
     */
    public static List<Map<String, Object>> selectByFullTextIndex(String[] fields, String query) throws IOException {
        List<Map<String, Object>> ret = Lists.newArrayList();
        try (Transaction tx = Neo4j.graphDb.beginTx()) {
            IndexManager index = Neo4j.graphDb.index();
            Index<Node> addressNodeFullTextIndex = index.forNodes("NodeFullTextIndex",
                    MapUtil.stringMap(IndexManager.PROVIDER, "lucene", "analyzer", IKAnalyzer5x.class.getName()));
            Query q = IKQueryParser.parseMultiField(fields, query);

            // Close the hits even if the loop body throws, so the underlying
            // index resources are always released.
            try (IndexHits<Node> foundNodes = addressNodeFullTextIndex.query(q)) {
                for (Node n : foundNodes) {
                    Map<String, Object> m = n.getAllProperties();
                    if (!Float.isNaN(foundNodes.currentScore())) {
                        m.put("score", foundNodes.currentScore());
                    }
                    log.info("method[selectByIndex] score<"+foundNodes.currentScore()+">");
                    ret.add(m);
                }
            }
            tx.success();
        } catch (IOException e) {
            log.error("method[selectByIndex] fields<"+Joiner.on(",").join(fields)+"> query<"+query+">", e);
            throw e;
        }
        return ret;
    }

注意到，在这里我用了IKQueryParser，即根据我们的查询词和要查询的字段，自动构造Query。这里是绕过了一个坑：用lucene查询语句直接查的话，是有问题的。比如：“address:南昌市” 查询语句，会搜到所有带市字的地址，这是非常不合理的。改用IKQueryParser即修正这个问题。IKQueryParser是IKAnalyzer自带的一个工具，但在 IKAnalyzer2012FF_u1.jar却被删减掉了。因此我这里重新引入了原版IKAnalyzer的jar包，项目最终是两个jar包共存的。

到这里坑就踩得差不多了。

       原文地址：https://blog.csdn.net/hereiskxm/article/details/54345261