# Python 3: doc2vec text clustering implementation

import sys    #doc2vev
import gensim
import sklearn
import numpy as np
 
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
 
TaggededDocument = gensim.models.doc2vec.TaggedDocument
 
def get_datasest(path="ttt.txt", encoding=None):
    """Load a pre-tokenised corpus and wrap each line as a tagged document.

    Every line of the file is one document whose tokens are separated by
    single spaces. A document's tag is its zero-based line index.

    Args:
        path: File to read. Defaults to ``"ttt.txt"``, the location that
            used to be hard-coded.
        encoding: Text encoding handed to ``open``. ``None`` keeps the
            platform default, which matches the previous behaviour.

    Returns:
        A list of ``TaggededDocument`` objects, one per line of the file.
    """
    with open(path, 'r', encoding=encoding) as cf:
        docs = cf.readlines()
    print(len(docs))

    x_train = []
    for i, text in enumerate(docs):
        word_list = text.split(' ')
        # Only the final token can carry the line terminator, so just that
        # one is stripped; the other tokens are kept exactly as split.
        word_list[-1] = word_list[-1].strip()
        x_train.append(TaggededDocument(word_list, tags=[i]))

    return x_train
 
def getVecs(model, corpus, size):
    """Collect the stored vector of every document in ``corpus`` as one array.

    Args:
        model: Object whose ``docvecs`` can be indexed by a document tag.
        corpus: Iterable of documents exposing a ``tags`` sequence; only the
            first tag of each document is used for the lookup.
        size: Length of each vector; every vector is reshaped to
            ``(1, size)`` before stacking.

    Returns:
        A NumPy array with one row per document and ``size`` columns.
    """
    rows = []
    for doc in corpus:
        first_tag = doc.tags[0]
        vector = model.docvecs[first_tag]
        rows.append(np.array(vector.reshape(1, size)))
    # Default axis is 0, so the (1, size) rows are stacked vertically.
    return np.concatenate(rows)
 
def train(x_train, size=200, epoch_num=1):
    """Train a Doc2Vec model on the tagged corpus and save it to ``test/test``.

    Args:
        x_train: Tagged documents to train on.
        size: Value passed to ``Doc2Vec`` as its ``size`` argument.
        epoch_num: Currently unused; the explicit training call always runs
            70 epochs. The parameter is kept so existing callers keep working.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    import os  # local import: only needed to prepare the save directory

    model_dm = Doc2Vec(
        x_train,
        min_count=1,
        window=3,
        size=size,
        sample=1e-3,
        negative=5,
        workers=4,
    )
    # NOTE(review): handing the corpus to the constructor presumably already
    # builds the vocabulary and runs an initial training pass, so this call
    # adds further epochs on top -- confirm against the installed gensim.
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=70)

    # Saving into a directory that does not exist raises an error, so make
    # sure the target directory is there first.
    os.makedirs('test', exist_ok=True)
    model_dm.save('test/test')

    return model_dm
 
def test():
    """Load the saved model and look up the documents closest to a sample text.

    The model is read from ``test/test``, a vector is inferred for a fixed
    list of sample tokens, and that vector is compared against the stored
    document vectors.

    Returns:
        The value returned by ``docvecs.most_similar`` for the inferred
        vector with ``topn=10`` (the script entry point iterates it as
        ``(tag, similarity)`` pairs).
    """
    model_dm = Doc2Vec.load("test/test")
    print(model_dm)
    # One token per list element: leaving out the comma between two adjacent
    # string literals makes Python concatenate them into a single token.
    test_text = ['', '舞林', '争霸', '十强', '出炉', '复活', '舞者', '澳门', '踢馆']
    inferred_vector_dm = model_dm.infer_vector(test_text)
    print(inferred_vector_dm)
    sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10)

    return sims
 
if __name__ == '__main__':
    # Build the corpus, train and save the model, then query the saved model.
    x_train = get_datasest()
    model_dm = train(x_train)

    sims = test()
    for tag, score in sims:
        # Each result's tag is used as an index into the training corpus;
        # element 0 of a tagged document is its token list.
        tokens = x_train[tag][0]
        # Every token is followed by one space, including the last one.
        line = ''.join(token + ' ' for token in tokens)
        print(line, score, len(tokens))
# Runs at module level, i.e. also when the file is imported.
print('ok')
 
# Original article: https://www.cnblogs.com/oikoumene/p/9798265.html