# elastic_search 指令

#!/usr/bin/env python
# -*- coding: utf-8 -*-

""" pass
"""

import os
import sys
import jieba

sys.path.append(os.path.dirname(os.path.split(os.path.realpath(__file__))[0]))

from elasticsearch import Elasticsearch
from conf.settings import FAQ_ES_CONF    # [{'host': '192.168.7.173', 'port': 9200}]


# Client for the Elasticsearch node(s) listed in FAQ_ES_CONF.
es_ser = Elasticsearch(FAQ_ES_CONF)

# Rebuild the 'customer' index from scratch on every run.  The ignore codes
# ask the client not to raise for those HTTP statuses: 404 on the delete
# (nothing to remove) and 400 on the create (request rejected).
es_ser.indices.delete(index='customer', ignore=404)
es_ser.indices.create(index='customer', ignore=400)

# Field mapping for the 'round_FAQ2' doc type.  'company' is the only string
# field marked not_analyzed (presumably so it is matched as one exact term).
body = {
    'properties': {
        'about': {'type': 'string'},
        'name': {'type': 'string'},
        'age': {'type': 'integer'},
        'score': {'type': 'integer'},
        'company': {'type': 'string', 'index': 'not_analyzed'},
        'interests': {'type': 'string'},
        'timestamp': {'type': 'date'},
        'id': {'type': 'integer'},
    },
}

es_ser.indices.put_mapping(
    index='customer',
    doc_type='round_FAQ2',
    body=body,
)

def _index_customer(doc_id, document):
    """ Store *document* under *doc_id* in index 'customer', type 'round_FAQ2'.
    """
    es_ser.index(index='customer',
                 doc_type='round_FAQ2',
                 id=doc_id,
                 body=document)


# Three sample people.  'about' is sent as the token list produced by
# jieba.lcut rather than as the raw sentence.
_index_customer(1, {
    "name": "wulangzhou",
    "age": 25,
    "score": [85, 75, 95],
    "about": jieba.lcut('i like think deep'),
    "company": 'zhangyue',
    "interests": ["music"],
    "timestamp": '2016',
})

_index_customer(2, {
    "name": "yanweihong",
    "age": 28,
    "about": jieba.lcut('i like exercise more'),
    "score": [90, 85, 77],
    "company": 'zhangyue',
    "interests": ["forestry", 'i', 'like'],
    "timestamp": '2017',
})

# The only document carrying a 'weight' field, which the mapping does not
# declare.
_index_customer(3, {
    "name": "liumin",
    "age": 28,
    "about": jieba.lcut('i like cat'),
    "score": [80, 80, 80, 80],
    "company": 'jindong',
    "weight": 85,
    "interests": ['game'],
    "timestamp": '2016',
})

import time
# Make the documents indexed above visible to search.  An explicit refresh
# replaces the former fixed one-second sleep, which only worked when the
# periodic background refresh happened to run within that second.
es_ser.indices.refresh(index='customer')

# Query bodies tried during these experiments, keyed by query type.  Only the
# one selected into `body` below is sent; change the key to try another.
EXAMPLE_QUERIES = {
    # For 'best_fields' instead of 'most_fields', also add
    # 'tie_breaker': 0.2 next to 'type'.
    'multi_match': {'query': {'multi_match': {'query': 'i like cat',
                                              'fields': ['about', 'interests'],
                                              'type': 'most_fields'}}},
    'match_phrase': {'query': {'match_phrase': {'about': 'i like'}}},
    'range': {'query': {'range': {'age': {'gte': 18, 'lte': 35}}}},
    'match_all': {'query': {'match_all': {}}},
    'terms': {'query': {'terms': {'age': [22, 20]}}},
    'exists': {'query': {'exists': {'field': 'weight'}}},
}

body = EXAMPLE_QUERIES['exists']

# Dump every hit as "key value" lines, with a blank line between hits.
# print(...) with a single pre-formatted string behaves the same on
# Python 2 and Python 3.
for hit in es_ser.search(index='customer', doc_type='round_FAQ2', body=body)['hits']['hits']:
    for key, value in hit.items():
        print('%s %s' % (key, value))
    print('')


'''
http://www.tuicool.com/articles/uAbmuaU
match_phrase 以 about 字段为例：如果该字段是 string 且使用默认分词，查询串和字段内容都会先分词，要求查询串的所有词在字段中按相同顺序、相邻出现才算匹配；效果近似于 ‘query_str’ in ‘match_string’（按词而不是按字符比较）。
match        可以看about 字段，表示 query_str分词后中的每一个词，与match_string分词后中的所有词，看能匹配到几个（查询字符和匹配字符都进行分词匹配）。
term         与 match_phrase 的区别：查询串不分词，直接与索引中的词项做精确比较，近似于 ‘query_str’ == ‘match_string’（不进行分词的匹配；如果字段本身被分词，比较对象是分词后的单个词项）。
multi_match  如果搭配  most_fields 表示fields中的所有字段，分词后尽量匹配多的词的和（不要带tie_breaker）
             如果搭配  best_fields 表示完全匹配的分值最高 比如 i like cat 如果全部匹配到了则分高（带tie_breaker）
terms        与 term 类似，但可以同时给出多个值，字段匹配其中任意一个即可，例如 {'terms': {'age': [22, 20]}}
bool         当我们需要and  or 查询的时候，可以用 bool 查询，查询条件可以嵌套  { "bool" : { "must" : [], "should" : [], "must_not" : [], } }

def get_analyze_body(**kargs):
    """ Build the Elasticsearch search body for a question lookup.

    Keyword arguments read (each optional):
        question    -- query text; a byte str is decoded as UTF-8 before use.
        channel_num -- passed through int() when truthy.
        version     -- passed through int() when truthy.

    The question is normalised with replace_string, cut into phrases
    (jieba_cut -> filter_phrases -> get_right_phrases), and every phrase
    becomes a match_phrase clause on 'question' boosted by 10 times its rate
    from get_phrases_rate.

    Returns a dict holding a bool query -- the phrase clauses under 'should',
    one channel_num group and one version group under 'must' -- together
    with 'min_score': 1.
    """
    from faq.doc_idf import get_phrases_rate

    def field_clauses(field, wanted):
        # -1 is always accepted with boost 1 (presumably it marks entries
        # valid for every channel/version -- confirm against the indexed
        # data); a truthy requested value is accepted too, at boost 1.5.
        clauses = [{'match_phrase': {field: {'query': -1,
                                             'boost': 1}}}]
        if wanted:
            clauses.append({'match_phrase': {field: {'query': int(wanted),
                                                     'boost': 1.5}}})
        return clauses

    text = kargs.get('question')
    if text and isinstance(text, str):
        text = text.decode('utf-8')
    text = replace_string(text)

    phrases = get_right_phrases(filter_phrases(jieba_cut(text)))
    channel_num_arg = kargs.get('channel_num')
    version_arg = kargs.get('version')

    phrase_rates = get_phrases_rate(phrases)
    phrase_clauses = [
        {'match_phrase': {'question': {'query': phrase,
                                       'boost': 10 * rate}}}
        for phrase, rate in phrase_rates.items()
    ]

    channel_group = {'bool': {'should': field_clauses('channel_num', channel_num_arg)}}
    version_group = {'bool': {'should': field_clauses('version', version_arg)}}

    return {'query': {'bool': {'should': phrase_clauses,
                               'must': [channel_group, version_group]}},
            'min_score': 1}