根据语料计算词向量

根据语料计算词向量,有两种训练模式:CBOW 和 skip-gram(下面代码中的 sg=1 表示使用 skip-gram)。

# -*- coding:utf-8 -*-
import os

from gensim.models import word2vec


class MySentences(object):
    """Re-iterable stream of tokenised sentences read from one text file.

    Each line of the file is treated as one sentence and is split on
    whitespace.  Nothing is cached, so the file is re-read on every pass and
    the whole corpus never has to fit in memory.
    """

    def __init__(self, dirname):
        # Despite the name, this is the path of a single corpus file: it is
        # passed straight to open() in __iter__.
        self.dirname = dirname

    def __iter__(self):
        """Yield one list of whitespace-separated tokens per line of the file."""
        # The context manager closes the file when the pass finishes (or the
        # generator is discarded), instead of leaking one handle per pass.
        with open(self.dirname) as corpus:
            for line in corpus:
                yield line.split()


if __name__ == '__main__':
    # Train a word2vec model file by file over every corpus file in data_dir,
    # saving it to model_path after each file, then run a few sample queries.
    data_dir = ''  # directory holding the corpus files; fill in before running
    model_path = ''  # file the model is saved to and reloaded from
    files = os.listdir(data_dir)
    model = None
    for index, data_path in enumerate(files):
        print('index: %s \t%s' % (index, data_path))
        # os.path.join supplies the separator when data_dir has no trailing
        # one; plain concatenation would glue the directory and file names.
        sentences = MySentences(os.path.join(data_dir, data_path))  # a memory-friendly iterator
        if index == 0:
            # The first file creates the model (50-dimensional vectors,
            # words seen fewer than 5 times dropped, sg=1).
            model = word2vec.Word2Vec(sentences, size=50, min_count=5, sg=1)
        else:
            # Later files continue training the model saved by the previous
            # iteration.
            model = word2vec.Word2Vec.load(model_path)
            # NOTE(review): depending on the installed gensim version, train()
            # may require total_examples/epochs, and words that only occur in
            # later files may need build_vocab(sentences, update=True) first.
            # Confirm against the gensim release in use.
            model.train(sentences)
            print('load success')

        model.save(model_path)
        print('save success')

    if model is None:
        # Without this guard an empty directory surfaces as a NameError on
        # the first query below.
        raise SystemExit('no corpus files found in data_dir')

    # Print the vector of one word.
    print(model['安踏'])
    # Similarity between two words.
    print(model.similarity('直播', '电商'))
    print(model.similarity('淘宝', '电商'))
    # The ten words most similar to the query word.
    for word, score in model.most_similar(positive=['微博'], topn=10):
        print('%s %s' % (word, score))
    # The three top results when the query word is used as a negative example
    # (labelled "antonyms" in the original post).
    for word, score in model.most_similar(negative=['微博'], topn=3):
        print('%s %s' % (word, score))
    # The word that fits least well with the others.
    print(model.doesnt_match(['马云', '京东', '阿里', '小米', '百度', '美团']))
View Code

载入词向量:

def load_word_vec_model(word_vec_path='word2vec.txt', binary=False):
    """Load pre-trained word vectors stored in the word2vec format.

    Args:
        word_vec_path: Path of the vector file.  Defaults to 'word2vec.txt',
            the file the original snippet loaded.
        binary: Passed through to ``load_word2vec_format``.  Defaults to
            False, the value the original snippet used (text-format file).

    Returns:
        Whatever ``KeyedVectors.load_word2vec_format`` returns for the file.
    """
    # Imported here because no import of KeyedVectors is visible alongside
    # this snippet; gensim itself is already a dependency of the code above.
    from gensim.models import KeyedVectors

    return KeyedVectors.load_word2vec_format(word_vec_path, binary=binary)
View Code

txt内容格式:

89299 50
广义 6.7723665 14.601548 20.063915 13.727134 -11.497403 -9.687737 -13.661188 13.636487 12.514348 -11.927621 9.849327 3.869883 -4.835537 21.264105 -0.27862522 -1.8299553 -6.370595 16.223785 -8.902656 -6.1665072 14.767804 -13.545085 -0.26700944 18.797802 3.4140692 -23.615307 5.3606462 -9.613785 -14.123712 -8.143979 -2.0690963 2.955524 4.1582117 0.92726874 -4.3396864 -10.7997 -2.9653497 -11.553318 -3.0220852 20.548243 -5.2833705 25.26876 -6.0394297 -1.6494333 3.4560573 12.670779 -13.85315 -8.514223 18.071764 -7.490371
钟爱 17.984484 -4.8768287 16.716238 16.658224 -27.738024 -25.891703 -19.179977 6.6909623 37.56464 -13.521651 13.267926 10.216028 -0.19054835 35.493042 29.336407 18.562439 -7.4809074 17.904173 -13.844719 3.022259 14.995911 -22.58654 -9.87084 15.710427 -14.876169 14.388888 -14.6048155 -5.1577635 -5.2825193 -10.078579 -5.086235 -22.363726 2.9529414 0.7049978 -10.118969 -22.133059 27.744198 -22.186438 -3.2051985 37.520164 10.439255 20.471209 -23.874033 -35.268066 -4.6956215 32.274727 24.359287 -8.854247 1.094503 -25.306633

第一行 “89299 50” 的含义:词条数目为 89299,向量维度为 50;其后每一行是一个词及其 50 维向量,各项之间以空格分隔。

原文地址:https://www.cnblogs.com/tengpan-cn/p/8359805.html