NLP (23): Getting sentence vectors with tf-idf and computing similarity
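
Both implementations below turn each sentence into a tf-idf weighted bag-of-words vector and then rank sentences by vector similarity. As a quick reminder (standard textbook form, not from the original post; gensim and sklearn each apply their own smoothing and normalization), the term weighting and the cosine similarity used for retrieval are:

\mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \log\frac{N}{\mathrm{df}(t)}, \qquad \mathrm{sim}(u, v) = \frac{u \cdot v}{\lVert u \rVert \, \lVert v \rVert}

where tf(t, d) is the count of term t in sentence d, df(t) is the number of sentences containing t, and N is the total number of sentences in the corpus.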

I. Based on gensim

1. Model class

import os
import heapq
import jieba
from pathlib import Path
from gensim import corpora, models, similarities

import utils.word_process as word_process
from root_path import root

class TfIdf(object):
    """tf-idf模型计算相似度"""
    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.dic_path = os.path.join(root_path, "bow.model")
        self.tfidf_model_path = os.path.join(root_path, "tfidf_model.model")
        self.tfidf_index_path = os.path.join(root_path, "tfidf_index.model")
        self.stop_list = word_process.get_stop_list()

        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")


    def del_stopwords(self, words):
        """删除一句话中的停用词"""
        word_list = []

        for word in words:
            if word not in self.stop_list:
                word_list.append(word)
        return word_list

    def _seg_word(self, words_list, jieba_flag=True, del_stopword=True):
        """对多句话进行分词或分字"""
        word_list = []
        if jieba_flag:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(list(jieba.cut(words))))
            else:
                for words in words_list:
                    word_list.append(list(jieba.cut(words)))
        else:
            if del_stopword:
                for words in words_list:
                    word_list.append(self.del_stopwords(words))
            else:
                for words in words_list:
                    word_list.append([word for word in words])
        return word_list

    def train(self, sentence_list):
        """训练模型"""
        #下面保存语料字典
        word_list = self._seg_word(sentence_list)
        dic = corpora.Dictionary(word_list, prune_at=2000000)
        dic.save(self.dic_path)

        # Build the tf-idf model
        tfidf_model_path = self.tfidf_model_path
        corpus_model = [dic.doc2bow(word) for word in word_list]
        tfidf_model = models.TfidfModel(corpus_model)
        tfidf_model.save(tfidf_model_path)

        # Build the retrieval (similarity) index
        tfidf_index_path = self.tfidf_index_path
        corpus_tfidf = tfidf_model[corpus_model]
        tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
        tfidf_index.save(tfidf_index_path)

    def predict(self, sentence):
        # Get the sentence's tf-idf vector and query the index directly
        # (retrieval runs over the training word_list).
        dic = corpora.Dictionary.load(self.dic_path)
        word_bow = dic.doc2bow(self._seg_word([sentence])[0])
        word_tfidf = models.TfidfModel.load(self.tfidf_model_path)[word_bow]
        tfidf_index = similarities.MatrixSimilarity.load(self.tfidf_index_path)
        score = tfidf_index[word_tfidf]
        return score

    def get_train_data(self):
        """得到句子数组和标签数组"""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split("  ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")
                sentences.append(sentence)
        return labels, sentences

    def main(self):
        labels, sentences = self.get_train_data()
        print(sentences)
        self.train(sentences)
        score_list = self.predict("我有困难还不了")

        # Indices of the training sentences most similar to the query, highest score first
        print(heapq.nlargest(30, range(len(score_list)), score_list.__getitem__))

        # The corresponding similarity scores, highest first
        print(heapq.nlargest(30, score_list))



if __name__ == '__main__':
    TfIdf().main()
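
The indices returned by predict refer to rows of the training corpus, so they can be mapped straight back to the original sentences. Below is a minimal sketch (not part of the original post) reusing the class above; the module name tf_idf_gensim is hypothetical, and it assumes train() has already been run on all.txt.

import heapq

from tf_idf_gensim import TfIdf  # hypothetical module holding the class above

model = TfIdf()
labels, sentences = model.get_train_data()
scores = model.predict("我有困难还不了")

# Print the three most similar training sentences with their labels and scores.
for idx in heapq.nlargest(3, range(len(scores)), scores.__getitem__):
    print(labels[idx], sentences[idx], scores[idx])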

2. Utility class

import os
from root_path import root
import tqdm


stop = os.path.join(root, "confusion_detection","data", "raw_data", "ChineseStopWords.txt")

def get_stop_list():
    """得到停用词列表"""
    stop_word_list = []
    with open(stop, "r", encoding="utf8") as f:
        data_lines = tqdm.tqdm(f.readlines(), smoothing=0, mininterval=0.1)
        data_lines.set_description('Processing stop words...')
        for line in data_lines:
            line = line.replace(" ", "").replace("\n", "").replace("\r", "")
            # Only single-character entries are kept as stop words.
            if len(line) == 1:
                stop_word_list.append(line)
    return stop_word_list

II. Based on sklearn

import os
import jieba
import pickle
from root_path import root
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer

class TfIdf(object):
    """tf-idf模型计算相似度"""
    def __init__(self):
        root_path = os.path.join(root, "confusion_detection", "checkpoints", "tf_idf")
        if not Path(root_path).is_dir():
            os.mkdir(root_path)
        self.data_path = os.path.join(root, "confusion_detection", "data", "raw_data", "all.txt")
        self.model_path = os.path.join(root_path, "tfidf.model")

    def get_train_data(self):
        """得到句子数组和标签数组"""
        labels = []
        sentences = []
        with open(self.data_path, "r", encoding="utf8") as f:
            for line in f.readlines():
                data_tuple = line.split("  ")
                label = data_tuple[0]
                labels.append(label)
                sentence = data_tuple[1].replace("\n", "").replace("\r", "")
                sentences.append(sentence)
        return labels, sentences

    def train(self):
        labels, sentences = self.get_train_data()
        sent_words = [list(jieba.cut(sent0)) for sent0 in sentences]
        document = [" ".join(sent0) for sent0 in sent_words]
        tfidf_vectorizer = TfidfVectorizer()
        feature = tfidf_vectorizer.fit_transform(document)
        # Save the fitted vectorizer
        with open(self.model_path, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)

    def predict(self, sentence):
        # Load the fitted vectorizer
        with open(self.model_path, 'rb') as f:
            tfidf_vectorizer = pickle.load(f)
        sentence = list(jieba.cut(sentence))
        sen = " ".join(sentence)
        res = tfidf_vectorizer.transform([sen]).toarray()
        return res[0]

    def main(self):
        sentence = "是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。"
        print(self.predict(sentence))

if __name__ == '__main__':
    TfIdf().main()
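
The sklearn version above only returns the raw tf-idf vector for a sentence; to actually score similarity, the vectors can be compared with cosine similarity. Below is a minimal sketch (not part of the original post) using sklearn's cosine_similarity, assuming train() has already been run so that tfidf.model exists.

from sklearn.metrics.pairwise import cosine_similarity

model = TfIdf()
model.train()

# Vectorize two sentences with the fitted tf-idf model and compare them.
vec_a = model.predict("我有困难还不了")
vec_b = model.predict("是的,我知道那就十五号没办法,因为这个,也可能是十二十号发工资的,因为遇见了超过了一点点。")
print(cosine_similarity([vec_a], [vec_b])[0][0])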
Original article: https://www.cnblogs.com/zhangxianrong/p/14899304.html