Elasticsearch 2.x IK plugin

First, let's set up a standard analyzer (standard). The configuration is as follows:

curl -XPUT localhost:9200/local -d '{

    "settings" : {

        "analysis" : {

            "analyzer" : {

                "stem" : {

                    "tokenizer" : "standard",

                    "filter" : ["standard", "lowercase", "stop", "porter_stem"]

                }

            }

        }

    },

    "mappings" : {

        "article" : {

            "dynamic" : true,

            "properties" : {

                "title" : {

                    "type" : "string",

                    "analyzer" : "stem"

                }

            }

        }

    }

}'

index: local

type: article

default analyzer: stem (filters: lowercase, stop words, etc.)

field: title
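
Before indexing anything, the custom analyzer can be checked directly with the _analyze API (a minimal sketch; the index name local and analyzer name stem come from the configuration above):

curl 'http://localhost:9200/local/_analyze?analyzer=stem&pretty=true' -d 'Fighting for your life'

The response should show lowercased, stemmed tokens such as fight, your and life, with the stop word for removed.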

Test:

# Index Data

curl -XPUT localhost:9200/local/article/1 -d'{"title": "Fight for your life"}'

curl -XPUT localhost:9200/local/article/2 -d'{"title": "Fighting for your life"}'

curl -XPUT localhost:9200/local/article/3 -d'{"title": "My dad fought a dog"}'

curl -XPUT localhost:9200/local/article/4 -d'{"title": "Bruno fights Tyson tomorrow"}'

  

# search on the title field, which is stemmed on index and search

curl -XGET localhost:9200/local/_search?q=title:fight

  

# searching on _all will not do any stemming, unless the _all field is also configured in the mapping to be stemmed...

curl -XGET localhost:9200/local/_search?q=fight
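
The same title search can also be written with an explicit request body, which makes it easier to see that the query string is analyzed with the same stem analyzer as the indexed titles (a sketch using the sample documents above):

curl -XGET 'localhost:9200/local/_search?pretty=true' -d '{
    "query" : {
        "match" : { "title" : "fighting" }
    }
}'

Because porter_stem reduces both "fighting" and "fights" to "fight", this should match documents 1, 2 and 4, but not document 3 ("fought" does not stem to "fight").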

For example, the title

Fight for your life

is tokenized as follows:

{"tokens":[

{"token":"fight","start_offset":1,"end_offset":6,"type":"<ALPHANUM>","position":1},<br>{"token":"your","start_offset":11,"end_offset":15,"type":"<ALPHANUM>","position":3},<br>{"token":"life","start_offset":16,"end_offset":20,"type":"<ALPHANUM>","position":4}

]}

Deploying the IK analyzer

Configure the following in elasticsearch.yml:  index.analysis.analyzer.ik.type : "ik"
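
The plugin itself has to be on the node before this setting does anything. One common way to deploy it on 2.x (a sketch, assuming a plugin zip built from the medcl/elasticsearch-analysis-ik project, with a placeholder filename, and an installation directory $ES_HOME) is to unpack it into the plugins directory and restart the node:

mkdir -p $ES_HOME/plugins/ik
unzip elasticsearch-analysis-ik-*.zip -d $ES_HOME/plugins/ik
# restart Elasticsearch so the plugin is loaded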

Delete the previously created index and set it up again with the IK analyzer.
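
The delete step, assuming the index name local created above:

curl -XDELETE localhost:9200/local

Then recreate the index with the following configuration: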

curl -XPUT localhost:9200/local -d '{

    "settings" : {

        "analysis" : {

            "analyzer" : {

                "ik" : {

                    "tokenizer" : "ik"

                }

            }

        }

    },

    "mappings" : {

        "article" : {

            "dynamic" : true,

            "properties" : {

                "title" : {

                    "type" : "string",

                    "analyzer" : "ik"

                }

            }

        }

    }

}'
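
With the ik mapping in place, a quick end-to-end check is to index a Chinese title and search for one of its sub-words (a sketch; the document and query text are only illustrative, and --data-urlencode is used so the Chinese query string is URL-encoded correctly):

curl -XPUT localhost:9200/local/article/1 -d '{"title": "中华人民共和国国歌"}'

curl -G 'localhost:9200/local/_search?pretty=true' --data-urlencode 'q=title:国歌'

The second command should return the document just indexed, because the ik analyzer produces 国歌 as a token at both index and search time.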

Test:

curl 'http://localhost:9200/local/_analyze?analyzer=ik&pretty=true' -d'  

{  

    "text":"中华人民共和国国歌" 

}  

'  

{

  "tokens" : [ {

    "token" : "text",

    "start_offset" : 12,

    "end_offset" : 16,

    "type" : "ENGLISH",

    "position" : 1

  }, {

    "token" : "中华人民共和国",

    "start_offset" : 19,

    "end_offset" : 26,

    "type" : "CN_WORD",

    "position" : 2

  }, {

    "token" : "国歌",

    "start_offset" : 26,

    "end_offset" : 28,

    "type" : "CN_WORD",

    "position" : 3

  } ]

}
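
Note that the whole request body above, including the JSON wrapper, appears to have been analyzed as plain text, which is why a "text" token shows up and the Chinese offsets do not start at 0. Passing the parameters in the URL instead avoids this (a sketch; -G with --data-urlencode is used so the Chinese text is encoded into the query string):

curl -G 'http://localhost:9200/local/_analyze?pretty=true' --data-urlencode 'analyzer=ik' --data-urlencode 'text=中华人民共和国国歌'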

If we want the finest-grained tokenization results, we need to add the following configuration to elasticsearch.yml:

index:

  analysis:

    analyzer:

      ik:

          alias: [ik_analyzer]

          type: org.elasticsearch.index.analysis.IkAnalyzerProvider

      ik_smart:

          type: ik

          use_smart: true

      ik_max_word:

          type: ik

          use_smart: false
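
These are node-level settings, so the node has to be restarted for them to take effect. Alternatively (a sketch built from the same type and use_smart options shown above), the two analyzers can be defined per index in the index settings; note that the test below runs against an index called index rather than local:

curl -XPUT localhost:9200/index -d '{
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "ik_smart" : { "type" : "ik", "use_smart" : true },
                "ik_max_word" : { "type" : "ik", "use_smart" : false }
            }
        }
    }
}'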

Test:

curl 'http://localhost:9200/index/_analyze?analyzer=ik_max_word&pretty=true' -d'  

{  

    "text":"中华人民共和国国歌" 

}  

'  

{

  "tokens" : [ {

    "token" : "text",

    "start_offset" : 12,

    "end_offset" : 16,

    "type" : "ENGLISH",

    "position" : 1

  }, {

    "token" : "中华人民共和国",

    "start_offset" : 19,

    "end_offset" : 26,

    "type" : "CN_WORD",

    "position" : 2

  }, {

    "token" : "中华人民",

    "start_offset" : 19,

    "end_offset" : 23,

    "type" : "CN_WORD",

    "position" : 3

  }, {

    "token" : "中华",

    "start_offset" : 19,

    "end_offset" : 21,

    "type" : "CN_WORD",

    "position" : 4

  }, {

    "token" : "华人",

    "start_offset" : 20,

    "end_offset" : 22,

    "type" : "CN_WORD",

    "position" : 5

  }, {

    "token" : "人民共和国",

    "start_offset" : 21,

    "end_offset" : 26,

    "type" : "CN_WORD",

    "position" : 6

  }, {

    "token" : "人民",

    "start_offset" : 21,

    "end_offset" : 23,

    "type" : "CN_WORD",

    "position" : 7

  }, {

    "token" : "共和国",

    "start_offset" : 23,

    "end_offset" : 26,

    "type" : "CN_WORD",

    "position" : 8

  }, {

    "token" : "共和",

    "start_offset" : 23,

    "end_offset" : 25,

    "type" : "CN_WORD",

    "position" : 9

  }, {

    "token" : "",

    "start_offset" : 25,

    "end_offset" : 26,

    "type" : "CN_CHAR",

    "position" : 10

  }, {

    "token" : "国歌",

    "start_offset" : 26,

    "end_offset" : 28,

    "type" : "CN_WORD",

    "position" : 11

  } ]

}
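
For comparison, the coarse-grained ik_smart analyzer can be queried the same way (a sketch, assuming the ik_smart analyzer configured above); it should keep only the longest segments, roughly 中华人民共和国 and 国歌:

curl 'http://localhost:9200/index/_analyze?analyzer=ik_smart&pretty=true' -d '中华人民共和国国歌'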
Original article: https://www.cnblogs.com/jiu0821/p/5625578.html