Training word vectors with gensim Word2Vec

Jotting this down so I don't have to go hunting for it again.

The corpus is the ancient-text corpus from my NER work; the code is adapted from other blog posts.

I first extract the annotated entities and add them to jieba as a custom dictionary, then load the stopwords, segment the text, and finally train the word vectors. No idea yet how well it works; more on that later.
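The entity-extraction step itself isn't shown in the original code, so here is a minimal sketch of what it could look like, assuming a BIO-tagged NER corpus (one "character tag" pair per line, blank line between sentences). The file names and tag scheme are my assumptions, not the author's code.

# Hypothetical sketch: pull annotated entities out of a BIO-tagged file
# (one "character tag" pair per line) and write them to cidian.txt.
def extract_entities(ner_file="ner_corpus.txt", dict_file="cidian.txt"):
    entities, current = set(), []

    def flush():
        if current:
            entities.add("".join(current))
            current.clear()

    with open(ner_file, encoding="utf8") as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:         # blank line = sentence boundary
                flush()
                continue
            char, tag = parts[0], parts[1]
            if tag.startswith("B"):    # beginning of a new entity
                flush()
                current.append(char)
            elif tag.startswith("I"):  # inside an entity
                current.append(char)
            else:                      # "O": outside any entity
                flush()
    flush()
    with open(dict_file, "w", encoding="utf8") as f:
        f.write("\n".join(sorted(entities)) + "\n")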

import jieba

# Load the custom dictionary (see the format note below)
jieba.load_userdict("cidian.txt")
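Each line of cidian.txt follows jieba's user-dictionary format: the word, then an optional frequency and an optional part-of-speech tag, space-separated. Example entries (illustrative only, not from the actual dictionary):

司马迁 10 nr
太史令
丞相 5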

# Load stopwords into a set, so membership tests during segmentation are O(1)
def getStopwords():
    stopwords = set()
    with open("stopwords.txt", "r", encoding='utf8') as f:
        for line in f:
            stopwords.add(line.strip())
    return stopwords

stopwords = getStopwords()

# Segmentation: cut each line with jieba, drop stopwords, and write
# one space-separated sentence per line to fenci.txt
def segment():
    # 'w' instead of the original 'a', so re-running doesn't append duplicates;
    # the context manager also closes the output file, which the original never did
    with open('guwen.txt', encoding='utf8') as fin, \
         open('fenci.txt', 'w', encoding='utf8') as fout:
        for sentence in fin:
            words = jieba.cut(sentence.strip())
            sentence_segment = [word for word in words if word not in stopwords]
            fout.write(" ".join(sentence_segment) + "\n")

segment()
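After this step, fenci.txt should contain one sentence per line with tokens separated by single spaces, which is the line format PathLineSentences (used below) expects. Illustrative contents (these words are made up, not actual output):

太史公 曰 此 天下 之 大事
高祖 起 兵 于 沛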

# Train the word vectors
import logging
import os.path
import sys

from gensim.models import Word2Vec
from gensim.models.word2vec import PathLineSentences

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    input_file = 'fenci.txt'  # a single file; PathLineSentences also accepts a directory
    outp1 = 'guwen.model'
    outp2 = 'guwen_word2vec_format'
    # Train the model: embedding size 256, context window 10,
    # discard words occurring fewer than 5 times, 10 training epochs
    # (in gensim < 4.0 these parameters are named size= and iter= instead)
    model = Word2Vec(PathLineSentences(input_file),
                     vector_size=256, window=10, min_count=5,
                     epochs=10)
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)
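Once training finishes, a quick sanity check is to reload the model and inspect nearest neighbours. A minimal sketch; the query word below is just a placeholder, not from the original post:

# Reload the trained model and look at nearest neighbours
from gensim.models import Word2Vec

model = Word2Vec.load('guwen.model')
query = '天子'  # placeholder query word
if query in model.wv:
    print(model.wv.most_similar(query, topn=10))

# The plain-text vectors can also be loaded without the full model:
# from gensim.models import KeyedVectors
# wv = KeyedVectors.load_word2vec_format('guwen_word2vec_format', binary=False)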
Original post: https://www.cnblogs.com/harbin-ho/p/13311600.html