# NLP gensim 相似度计算 (text-similarity matching with jieba + gensim)

from collections import defaultdict
from gensim import corpora
import jieba
from gensim import similarities
import re


class Similarity:
    """Pair "win" records with "bid" records whose titles are similar.

    Titles are tokenized with jieba, turned into bag-of-words vectors and
    compared with gensim's ``MatrixSimilarity``. When a win record has more
    than one candidate above the similarity threshold, the candidates are
    additionally filtered by how long a prefix their stop-word-free titles
    share with the win title.
    """

    def __init__(self, words=None, stop_words=None, degree=95, title_len=0.9):
        """
        :param words: extra words registered with jieba so they are kept as
            single tokens (e.g. 中标, 候选人, 公示, 招标). NOTE: this calls
            ``jieba.add_word`` and therefore changes jieba's module-level
            dictionary for the whole process.
        :param stop_words: literal strings dropped from the token lists and
            stripped from titles before the prefix comparison
            (e.g. 中标, 公告, 招标, 的, 是, _).
        :param degree: minimum similarity, in percent, for a bid to be
            considered a candidate for a win record.
        :param title_len: fraction (0..1) of the shorter stop-word-free title
            that must be a common prefix of both titles; only applied when a
            win record has several candidates.
        """
        words = set() if words is None else words
        for word in words:
            jieba.add_word(word)

        self.stop_words = set() if stop_words is None else set(stop_words)
        # Stop words are literal text, so escape regex metacharacters; put
        # longer words first so that "AB" wins over "A" deterministically
        # instead of depending on set iteration order.
        ordered = sorted(self.stop_words, key=lambda w: (-len(w), w))
        self.stop_params = re.compile("|".join(re.escape(w) for w in ordered))

        self.degree = degree
        self.title_len = title_len

    def docs(self, datas):
        """Build the bag-of-words corpus and similarity index for *datas*.

        :param datas: iterable of dicts that each carry a ``'title'`` string.
        :return: ``(corpus, index)`` where ``corpus[k]`` is the bag-of-words
            vector of ``datas[k]`` and ``index[vector]`` yields the similarity
            of ``vector`` against every document in ``corpus``.
        """
        token_lists = [
            [tok for tok in jieba.cut(doc['title']) if tok not in self.stop_words]
            for doc in datas
        ]
        dictionary = corpora.Dictionary(token_lists)
        corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
        # Passing num_features avoids gensim re-scanning the corpus to find it.
        index = similarities.MatrixSimilarity(corpus, num_features=len(dictionary))
        return corpus, index

    @staticmethod
    def _common_prefix_len(left, right):
        """Return the number of leading characters *left* and *right* share."""
        count = 0
        for a, b in zip(left, right):
            if a != b:
                break
            count += 1
        return count

    def handle(self, bid_datas, win_datas):
        """Match every win record against the bid records.

        :param bid_datas: [{"id": id, "title": title}]
        :param win_datas: [{"id": id, "title": title}]
        :return: mapping ``{win_id: [bid_id, ...]}`` (a ``defaultdict(list)``);
            win records without any match are absent.
        :raises ValueError: if an id appears in both ``bid_datas`` and
            ``win_datas``.
        """
        bid_ids = {item['id'] for item in bid_datas}
        win_ids = {item['id'] for item in win_datas}
        if bid_ids & win_ids:
            raise ValueError("bid 和 win 的id重复了")

        similary_data = defaultdict(list)
        if not bid_datas or not win_datas:
            # Nothing to compare against; skip building an index.
            return similary_data

        # Win records come first so corpus[:win_len] are the queries and the
        # similarity columns from win_len onwards belong to the bid records.
        corpus, index = self.docs([*win_datas, *bid_datas])
        win_len = len(win_datas)

        # Step 1: collect, per win record, the bids above the threshold.
        merge_data = defaultdict(list)
        for win_idx, bow in enumerate(corpus[:win_len]):
            scores = index[bow]
            for bid_idx, score in enumerate(scores[win_len:]):
                if score * 100 >= self.degree:
                    merge_data[win_idx].append((bid_idx, score))

        # Step 2: a single candidate is accepted as is; several candidates
        # must also share a long enough title prefix with the win record.
        for win_idx, candidates in merge_data.items():
            source = win_datas[win_idx]
            if len(candidates) == 1:
                similary_data[source['id']].append(bid_datas[candidates[0][0]]['id'])
                continue
            source_title = self.stop_params.sub("", source['title'])
            for bid_idx, _score in candidates:
                target = bid_datas[bid_idx]
                target_title = self.stop_params.sub("", target['title'])
                shortest = min(len(source_title), len(target_title))
                shared = self._common_prefix_len(source_title, target_title)
                if shared >= shortest * self.title_len:
                    similary_data[source['id']].append(target['id'])
        return similary_data
# Demo run: sample "bid" records that the "win" records are matched against.
bid_datas = [
    {
        "id": 1,
        "title": "中国电信股份有限公司曲靖分公司“三供一业”供水管网改造工程招标代理的结果-采购项目中标公示",
    },
    {
        "id": 2,
        "title": "项目视图四象限图坐标轴调至中间",
    },
    {
        "id": 3,
        "title": "12345654564564",
    },
]

# Sample "win" records; each one is used as a query against bid_datas.
win_datas = [
    {
        "id": 4,
        "title": "中国电信集团有限公司曲靖分公司“三供一业”供水管网改造工程招标代理的结果-采购项目招标公告",
    },
    {
        "id": 5,
        "title": "项目视图四象限图坐标轴调至中间",
    },
]

# Custom words registered with jieba so they are kept as single tokens.
words = {
    *"中国电信股份有限公司 中国电信集团有限公司 中标 候选人 股份 公示 结果 失败 成交供应商 询价 单一来源 采购 成交人 更正 比选 公告 招标".split(" ")
}

# Stop words ignored when comparing titles.
stop_words = {
    *"中标 候选人 公示 结果 失败 成交供应商 询价 单一来源 采购 成交人 更正 比选 公告 招标 的 是 _ -".split(" ")
}

# Match with a 90% similarity threshold and print the {win_id: [bid_id, ...]} result.
datas = Similarity(
    degree=90,
    stop_words=stop_words,
    words=words,
).handle(win_datas=win_datas, bid_datas=bid_datas)
print("datas = ", datas)