【机器学习】word2vec词向量,相似词,近义词,k-means文本聚类预处理,python

使用K-means做词聚类需要用到word2vec做词向量化预处理。

# @Author  : LinYimeng

代码传送门:

# -*- coding: utf-8 -*-
# @Author  : LinYimeng
import multiprocessing
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import word2vec,Word2Vec
from gensim.models import KeyedVectors
# import logging
import os
 
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Train a word2vec model on the corpus file 'one.txt'. LineSentence streams
# the file one sentence per line, with tokens separated by whitespace, so the
# text must already be word-segmented.
# sg=0 selects the CBOW training algorithm; min_count=1 keeps every token.
# NOTE: `size` is the gensim 3.x name of the embedding-dimension argument;
# gensim >= 4.0 renamed it to `vector_size`.
sentences = word2vec.LineSentence('one.txt')
model = Word2Vec(
    sentences,
    size=256,
    min_count=1,
    window=5,
    sg=0,
    workers=multiprocessing.cpu_count(),
)

# Persist the full model so it can be reloaded (or trained further) later.
model.save("w2v_model1.bin")
# Alternative: export only the vectors in the plain-text word2vec format.
# model.wv.save_word2vec_format('w2v_model1.txt', binary=False)

# Reload the saved model. The loaded object must be bound to a name;
# otherwise the queries below never exercise the file that was just written.
model = gensim.models.Word2Vec.load("w2v_model1.bin")

# The 10 words most similar to the query word, as (word, cosine similarity)
# tuples. Queries go through `model.wv` (the KeyedVectors); calling them on
# the model object itself is deprecated in gensim 3.x and removed in 4.x.
for key in model.wv.similar_by_word('广告', topn=10):
    print(key)

# Similarity between two words.
print("词汇美工和设计的相似度:")
a = model.wv.similarity("美工", "设计")
print(a, "\n")

# Raw embedding vector of a single word.
print("'美工'的词向量:")
b = model.wv['美工']
print(b)

运行结果:(结果的准确度和语料库训练出来的模型有关哈)

O1:与美工最近似的10个词及近似程度(注:上文代码中 similar_by_word 的查询词写的是“广告”,若要复现此结果,需将查询词改为“美工”)

('广告设计', 0.9999338984489441)
('专业', 0.999896764755249)
('艺术设计', 0.9998812675476074)
('淘宝美工', 0.999809205532074)
('以上学历', 0.9997988939285278)
('专业优先', 0.9997780323028564)
('专科', 0.9997620582580566)
('亚马逊', 0.9997476935386658)
('三年', 0.999673068523407)
('淘宝网', 0.9996634125709534)

O2:

词汇美工和设计的相似度:
0.9984635 

O3:

'美工'的词向量:
[-3.48687708e-01  1.46133468e-01 -1.44630894e-01 -9.87506658e-02
 -1.31191360e-02 -1.88716143e-01 -4.47646081e-01  1.49731040e-01
 -5.24004459e-01 -9.21018869e-02  2.10645497e-01  3.74629140e-01
 -2.56587386e-01  1.11217022e-01 -2.19115227e-01  1.19145982e-01
  7.22632334e-02  3.74799609e-01  2.44546384e-01  3.73683721e-01
 -2.28197843e-01  1.53873771e-01 -2.62950510e-01  1.50447646e-02
 -3.99097413e-01 -3.96443129e-01 -5.51293731e-01 -2.08152726e-01
  1.94999963e-01  1.29498601e-01 -2.42866322e-01  3.57480913e-01
 -2.01223403e-01 -3.19850177e-01  1.62804410e-01 -1.07733719e-01
  1.12713702e-01  7.96133697e-01  2.86799699e-01  4.27245438e-01
  1.00772537e-01  2.93486714e-01  4.93184403e-02 -2.66669482e-01
  3.64331871e-01  5.85242689e-01  1.74146473e-01 -3.93704474e-01
 -4.28898484e-01  3.98656696e-01  4.55863588e-02  3.28688651e-01
  5.26583552e-01 -4.76943761e-01 -1.55413464e-01  2.47356743e-01
 -8.14967453e-02 -1.70028687e-01  4.31064051e-03  3.76357615e-01
 -5.90099394e-01  1.77576676e-01  3.88448596e-01 -1.11230224e-01
  2.69643992e-01 -7.82909036e-01 -1.98717296e-01 -9.35947478e-01
  6.83441162e-02 -5.63184321e-01 -3.97667021e-01  2.68370777e-01
  2.63136953e-01  7.05484927e-01  8.96330625e-02 -5.24120629e-01
 -2.12599829e-01  2.29840080e-04 -5.73168576e-01 -6.45786345e-01
  3.66044998e-01  1.75751954e-01  7.15595856e-02  3.16502750e-01
  2.27089394e-02  6.02926195e-01 -2.42831558e-01  3.62302512e-01
  2.66589880e-01  1.07311830e-01 -4.75437641e-02  7.29983523e-02
 -6.58467054e-01 -2.25196555e-01 -1.49902210e-01  4.38490719e-01
 -7.52383396e-02  4.25105989e-01 -2.70483583e-01 -2.56693840e-01
 -4.75770295e-01 -3.31471801e-01 -1.22862287e-01 -5.76877110e-02
 -8.02784637e-02  7.69574270e-02  1.08504817e-01 -4.68369454e-01
 -2.75230199e-01 -2.93926775e-01 -6.06838986e-02  1.17807984e-01
 -3.85949165e-01 -4.76562411e-01  2.70589441e-01  3.21213417e-02
 -2.43952245e-01 -2.73433477e-02  3.20498526e-01  2.22012699e-02
  7.00642839e-02  4.17426050e-01 -8.49749371e-02  1.73902348e-01
  1.88933402e-01  8.13224837e-02  2.37543508e-01 -5.71050584e-01
  4.15591985e-01  2.71951646e-01 -2.30916366e-01 -4.98654425e-01
  1.91027045e-01  2.53007561e-01 -3.71333063e-01 -9.11553577e-02
  3.09580415e-01  3.57550770e-01 -2.21502915e-01 -1.25888035e-01
  1.97077528e-01 -5.63177228e-01 -4.32146847e-01 -4.75292541e-02
  4.57033277e-01 -2.78818011e-02 -2.65594780e-01 -2.95226574e-01
 -2.59728700e-01 -1.05398960e-01 -2.65379250e-01  6.60502255e-01
  4.46286768e-01  1.34611458e-01 -1.43338870e-02  3.83034289e-01
  2.66958773e-01  2.08140165e-02  6.13297224e-01  2.19453588e-01
 -2.32486799e-01  2.24392891e-01  2.88341671e-01 -2.75768310e-01
  6.81275606e-01 -2.02831313e-01 -7.40723163e-02 -1.66016743e-01
 -6.95668042e-01  5.25998890e-01 -1.13620602e-01 -3.82012874e-01
 -8.22404325e-02 -2.63661653e-01 -4.17716652e-01  2.05385938e-01
  2.15466067e-01 -1.07346162e-01 -1.15236171e-01  2.63738424e-01
 -1.36661410e-01  1.90679073e-01 -1.88163004e-03  3.70871603e-01
  2.03539170e-02  2.83421218e-01  4.02853429e-01 -2.21123248e-02
 -1.54945225e-01 -9.85328630e-02 -7.06284493e-02  1.31543204e-01
  3.75243634e-01 -1.22791380e-01  5.21236777e-01  4.91459191e-01
 -4.35250998e-01 -3.62768501e-01  1.24516673e-01 -4.28938717e-01
  2.76040763e-01  3.15282971e-01 -1.53081775e-01 -6.49789631e-01
  2.74431437e-01 -5.55682719e-01 -8.76603052e-02  4.11549360e-01
 -2.95616418e-01 -3.63209605e-01  2.95839936e-01  3.48195791e-01
 -2.38425374e-01 -2.08233610e-01 -1.75582811e-01 -3.28338861e-01
  5.12714326e-01 -1.87658891e-01 -2.33509511e-01  4.07605857e-01
 -2.99323261e-01 -8.34180117e-02  4.35251407e-02  3.75431739e-02
 -1.58620626e-02  4.56823647e-01  3.66483241e-01  1.54309824e-01
 -1.03329211e-01 -1.40446005e-02  4.41232733e-02  4.61825490e-01
  2.37411503e-02  6.56094402e-02 -6.57415241e-02 -1.13562010e-01
 -4.15926397e-01  5.27659595e-01  7.06111565e-02  1.34148777e-01
 -3.60802680e-01  1.17561802e-01 -3.98860484e-01  3.12916517e-01
  4.18521076e-01 -1.69889539e-01  2.68602937e-01 -3.33626688e-01
 -2.03481883e-01  2.07403064e-01 -3.03639412e-01 -1.66668415e-01
  2.40503717e-03  5.71818888e-01  6.53212294e-02  3.79095048e-01]

备注:本文代码并非原创。因为需要做聚类,我几乎把各博客里与这部分相关的代码都尝试了一遍,只有这份代码运行时没有报错,感谢 LinYimeng 大神。

原文地址:https://www.cnblogs.com/helenlee01/p/12617470.html