(53)ElasticSearch之如何计算相关度分数

  ElasticSearch查询的相关度分数是3部分综合的分数,使用的是TF/IDF算法(Term Frequency & Inverse Document Frequency)

  1、根据Term Frequency(词条出现频率)

  我们查询的文本中的词条在本document中出现了多少次,出现次数越多,相关度越高。 

  例如搜索内容:hello world

  在文档1:hello,I love china.中出现了hello,出现了一次

  在文档2:hello world,how are you!中出现了hello world,相当于出现了两次,所以文档2的相关度分数高于文档1。

  2、根据Inverse Document Frequency(逆文档频率)

  根据查询的文本中的词条在索引的全部文档中出现了多少次,出现的次数越多,相关度越低。

  例如搜索内容:hello world

  在文档1:hello,what are you doing?中hello出现了一次。

  在文档2:I like the world.中world出现了一次。

  按照第1项算,这两个文档的分数是一样的,但是还要比较hello在该索引的所有文档中出现多少次,world在该索引的所有文档中出现多少次,假如hello在索引的所有文档中出现了500次,world出现了100次。world出现的次数更少,也就更稀有、权重更高,那么包含world的文档2的相关度分数要高于文档1。

  3、根据Field-length norm(字段长度规约)

  field越长,相关度越低。

  例如搜索内容:hello world,有下面两个文档。

  文档1:{"title":"hello,what's your name?","content":"qwieurowieuolsdjflk"}

  文档2:{"title":"hi,good morning","content":"lkjkljkj....world"}

  在文档1的title字段中搜索到hello,在文档2的content字段中搜索到world,content字段的长度比title字段长,所以文档2的相关度低

  4、演示查看分数是如何计算的

  准备数据:

PUT /lib
{
    "settings":{
        "number_of_shards":3,
        "number_of_replicas":0
      },
        "mappings":{
            "user":{
                "properties":{
                    "name":{"type":"text"},
                    "address":{"type":"text"},
                    "age":{"type":"integer"},
                    "interests":{
                      "type":"text"
                    },
                    "birthday":{"type":"date"}
                }
            }
        }
}
put /lib/user/1
{
    "name":"zhaoliu",
    "address":"hei long jiang sheng tie ling shi",
    "age":50,
    "birthday":"1970-12-12",
    "interests":"xi huang hejiu,duanlian,lvyou"
}

put /lib/user/2
{
    "name":"zhaoming",
    "address":"bei jing hai dian qu qing he zhen",
    "age":20,
    "birthday":"1998-10-12",
    "interests":"xi huan hejiu,duanlian,changge"
}

put /lib/user/3
{
    "name":"lisi",
    "address":"bei jing hai dian qu qing he zhen",
    "age":23,
    "birthday":"1998-10-12",
    "interests":"xi huan hejiu,duanlian,changge"
}

put /lib/user/4
{
    "name":"wangwu",
    "address":"bei jing hai dian qu qing he zhen",
    "age":26,
    "birthday":"1998-10-12",
    "interests":"xi huan biancheng,tingyinyue,lvyou"
}

put /lib/user/5
{
    "name":"zhangsan",
    "address":"bei jing chao yang qu",
    "age":29,
    "birthday":"1988-10-12",
    "interests":"xi huan tingyinyue,changge,tiaowu"
}

  在查询后面添加explain=true

GET lib/user/_search?explain=true
{
  "query": {
    "match": {
      "interests": "duanlian,changge"
    }
  }
}

  查询结果,在每个文档的_explanation中可以看到:每个词条的分数由idf和tfNorm相乘得到(tfNorm的计算中包含了词频和字段长度),各个词条的分数加起来是总的分数

{
  "took": 8,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 4,
    "max_score": 1.3862944,
    "hits": [
      {
        "_shard": "[lib][2]",
        "_node": "AJ3x6yc8TfKj6_zx6VRm0g",
        "_index": "lib",
        "_type": "user",
        "_id": "2",
        "_score": 1.3862944,
        "_source": {
          "name": "zhaoming",
          "address": "bei jing hai dian qu qing he zhen",
          "age": 20,
          "birthday": "1998-10-12",
          "interests": "xi huan hejiu,duanlian,changge"
        },
        "_explanation": {
          "value": 1.3862944,
          "description": "sum of:",
          "details": [
            {
              "value": 0.6931472,
              "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.6931472,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 0.6931472,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 2,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 1,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            },
            {
              "value": 0.6931472,
              "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.6931472,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 0.6931472,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 2,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 1,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard": "[lib][4]",
        "_node": "AJ3x6yc8TfKj6_zx6VRm0g",
        "_index": "lib",
        "_type": "user",
        "_id": "3",
        "_score": 0.5753642,
        "_source": {
          "name": "lisi",
          "address": "bei jing hai dian qu qing he zhen",
          "age": 23,
          "birthday": "1998-10-12",
          "interests": "xi huan hejiu,duanlian,changge"
        },
        "_explanation": {
          "value": 0.5753642,
          "description": "sum of:",
          "details": [
            {
              "value": 0.2876821,
              "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.2876821,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 0.2876821,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 1,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 1,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            },
            {
              "value": 0.2876821,
              "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.2876821,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 0.2876821,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 1,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 1,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard": "[lib][1]",
        "_node": "AJ3x6yc8TfKj6_zx6VRm0g",
        "_index": "lib",
        "_type": "user",
        "_id": "5",
        "_score": 0.2876821,
        "_source": {
          "name": "zhangsan",
          "address": "bei jing chao yang qu",
          "age": 29,
          "birthday": "1988-10-12",
          "interests": "xi huan tingyinyue,changge,tiaowu"
        },
        "_explanation": {
          "value": 0.2876821,
          "description": "sum of:",
          "details": [
            {
              "value": 0.2876821,
              "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.2876821,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 0.2876821,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 1,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 1,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard": "[lib][3]",
        "_node": "AJ3x6yc8TfKj6_zx6VRm0g",
        "_index": "lib",
        "_type": "user",
        "_id": "1",
        "_score": 0.2876821,
        "_source": {
          "name": "zhaoliu",
          "address": "hei long jiang sheng tie ling shi",
          "age": 50,
          "birthday": "1970-12-12",
          "interests": "xi huang hejiu,duanlian,lvyou"
        },
        "_explanation": {
          "value": 0.2876821,
          "description": "sum of:",
          "details": [
            {
              "value": 0.2876821,
              "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.2876821,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 0.2876821,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 1,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 1,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      }
    ]
  }
}

  5、查看一个文档能否匹配上某个查询

  使用上面的数据,id为2的可以匹配

GET /lib/user/2/_explain
{
  "query":{
    "match":{
      "interests":"duanlian,changge"
    }
  }
}

  查询结果:

{
  "_index": "lib",
  "_type": "user",
  "_id": "2",
  "matched": true,
  "explanation": {
    "value": 1.3862944,
    "description": "sum of:",
    "details": [
      {
        "value": 0.6931472,
        "description": "weight(interests:duanlian in 0) [PerFieldSimilarity], result of:",
        "details": [
          {
            "value": 0.6931472,
            "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
            "details": [
              {
                "value": 0.6931472,
                "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                "details": [
                  {
                    "value": 1,
                    "description": "docFreq",
                    "details": []
                  },
                  {
                    "value": 2,
                    "description": "docCount",
                    "details": []
                  }
                ]
              },
              {
                "value": 1,
                "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                "details": [
                  {
                    "value": 1,
                    "description": "termFreq=1.0",
                    "details": []
                  },
                  {
                    "value": 1.2,
                    "description": "parameter k1",
                    "details": []
                  },
                  {
                    "value": 0.75,
                    "description": "parameter b",
                    "details": []
                  },
                  {
                    "value": 5,
                    "description": "avgFieldLength",
                    "details": []
                  },
                  {
                    "value": 5,
                    "description": "fieldLength",
                    "details": []
                  }
                ]
              }
            ]
          }
        ]
      },
      {
        "value": 0.6931472,
        "description": "weight(interests:changge in 0) [PerFieldSimilarity], result of:",
        "details": [
          {
            "value": 0.6931472,
            "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
            "details": [
              {
                "value": 0.6931472,
                "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                "details": [
                  {
                    "value": 1,
                    "description": "docFreq",
                    "details": []
                  },
                  {
                    "value": 2,
                    "description": "docCount",
                    "details": []
                  }
                ]
              },
              {
                "value": 1,
                "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                "details": [
                  {
                    "value": 1,
                    "description": "termFreq=1.0",
                    "details": []
                  },
                  {
                    "value": 1.2,
                    "description": "parameter k1",
                    "details": []
                  },
                  {
                    "value": 0.75,
                    "description": "parameter b",
                    "details": []
                  },
                  {
                    "value": 5,
                    "description": "avgFieldLength",
                    "details": []
                  },
                  {
                    "value": 5,
                    "description": "fieldLength",
                    "details": []
                  }
                ]
              }
            ]
          }
        ]
      }
    ]
  }
}

  使用上面的数据,id为10的不能匹配:

GET /lib/user/10/_explain
{
  "query":{
    "match":{
      "interests":"duanlian,changge"
    }
  }
}

  查询结果:

{
  "_index": "lib",
  "_type": "user",
  "_id": "10",
  "matched": false
}
原文地址:https://www.cnblogs.com/javasl/p/12661972.html