使用K-means做词聚类需要用到word2vec做词向量化预处理。
# @Author : LinYimeng
代码传送门:
# -*- coding: utf-8 -*-
# @Author : LinYimeng
import multiprocessing
import gensim
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import word2vec,Word2Vec
from gensim.models import KeyedVectors
# import logging
import os
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Train a word2vec model on a pre-tokenised corpus, save it, reload it and run
# three example queries: nearest neighbours, pairwise similarity, raw vector.

# LineSentence streams 'one.txt' as one sentence per line with
# whitespace-separated tokens, so the corpus must already be word-segmented.
sentences = word2vec.LineSentence('one.txt')

# sg=0 selects the CBOW training algorithm; min_count=1 keeps every token.
# NOTE(review): `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to
# `vector_size` -- confirm against the installed gensim version.
model = Word2Vec(
    sentences,
    size=256,
    min_count=1,
    window=5,
    sg=0,
    workers=multiprocessing.cpu_count(),
)
model.save("w2v_model1.bin")
# Optional plain-text export of the vectors only:
# model.wv.save_word2vec_format('w2v_model1.txt', binary=False)

# Saving and loading the model.
# Bind the loaded model back to `model`; the original call discarded the
# return value, so the reload had no effect.
model = gensim.models.Word2Vec.load("w2v_model1.bin")

# Words most similar to a given word.
# Queries go through `model.wv` (the KeyedVectors) because the model-level
# shortcuts are deprecated.
for key in model.wv.similar_by_word('广告', topn=10):
    print(key)

# Similarity between two words.
print("词汇美工和设计的相似度:")
a = model.wv.similarity("美工", "设计")
print(a)

# Raw embedding vector of a word.
print("'美工'的词向量:")
b = model.wv['美工']
print(b)
运行结果(结果的准确度取决于语料库以及用它训练出来的模型):
O1:与“美工”最近似的10个词及近似程度(注意:上面的示例代码查询的是“广告”,若要得到下面的结果,需把查询词改为“美工”)
('广告设计', 0.9999338984489441)
('专业', 0.999896764755249)
('艺术设计', 0.9998812675476074)
('淘宝美工', 0.999809205532074)
('以上学历', 0.9997988939285278)
('专业优先', 0.9997780323028564)
('专科', 0.9997620582580566)
('亚马逊', 0.9997476935386658)
('三年', 0.999673068523407)
('淘宝网', 0.9996634125709534)
O2:
词汇美工和设计的相似度:
0.9984635
O3:
'美工'的词向量:
[-3.48687708e-01 1.46133468e-01 -1.44630894e-01 -9.87506658e-02
-1.31191360e-02 -1.88716143e-01 -4.47646081e-01 1.49731040e-01
-5.24004459e-01 -9.21018869e-02 2.10645497e-01 3.74629140e-01
-2.56587386e-01 1.11217022e-01 -2.19115227e-01 1.19145982e-01
7.22632334e-02 3.74799609e-01 2.44546384e-01 3.73683721e-01
-2.28197843e-01 1.53873771e-01 -2.62950510e-01 1.50447646e-02
-3.99097413e-01 -3.96443129e-01 -5.51293731e-01 -2.08152726e-01
1.94999963e-01 1.29498601e-01 -2.42866322e-01 3.57480913e-01
-2.01223403e-01 -3.19850177e-01 1.62804410e-01 -1.07733719e-01
1.12713702e-01 7.96133697e-01 2.86799699e-01 4.27245438e-01
1.00772537e-01 2.93486714e-01 4.93184403e-02 -2.66669482e-01
3.64331871e-01 5.85242689e-01 1.74146473e-01 -3.93704474e-01
-4.28898484e-01 3.98656696e-01 4.55863588e-02 3.28688651e-01
5.26583552e-01 -4.76943761e-01 -1.55413464e-01 2.47356743e-01
-8.14967453e-02 -1.70028687e-01 4.31064051e-03 3.76357615e-01
-5.90099394e-01 1.77576676e-01 3.88448596e-01 -1.11230224e-01
2.69643992e-01 -7.82909036e-01 -1.98717296e-01 -9.35947478e-01
6.83441162e-02 -5.63184321e-01 -3.97667021e-01 2.68370777e-01
2.63136953e-01 7.05484927e-01 8.96330625e-02 -5.24120629e-01
-2.12599829e-01 2.29840080e-04 -5.73168576e-01 -6.45786345e-01
3.66044998e-01 1.75751954e-01 7.15595856e-02 3.16502750e-01
2.27089394e-02 6.02926195e-01 -2.42831558e-01 3.62302512e-01
2.66589880e-01 1.07311830e-01 -4.75437641e-02 7.29983523e-02
-6.58467054e-01 -2.25196555e-01 -1.49902210e-01 4.38490719e-01
-7.52383396e-02 4.25105989e-01 -2.70483583e-01 -2.56693840e-01
-4.75770295e-01 -3.31471801e-01 -1.22862287e-01 -5.76877110e-02
-8.02784637e-02 7.69574270e-02 1.08504817e-01 -4.68369454e-01
-2.75230199e-01 -2.93926775e-01 -6.06838986e-02 1.17807984e-01
-3.85949165e-01 -4.76562411e-01 2.70589441e-01 3.21213417e-02
-2.43952245e-01 -2.73433477e-02 3.20498526e-01 2.22012699e-02
7.00642839e-02 4.17426050e-01 -8.49749371e-02 1.73902348e-01
1.88933402e-01 8.13224837e-02 2.37543508e-01 -5.71050584e-01
4.15591985e-01 2.71951646e-01 -2.30916366e-01 -4.98654425e-01
1.91027045e-01 2.53007561e-01 -3.71333063e-01 -9.11553577e-02
3.09580415e-01 3.57550770e-01 -2.21502915e-01 -1.25888035e-01
1.97077528e-01 -5.63177228e-01 -4.32146847e-01 -4.75292541e-02
4.57033277e-01 -2.78818011e-02 -2.65594780e-01 -2.95226574e-01
-2.59728700e-01 -1.05398960e-01 -2.65379250e-01 6.60502255e-01
4.46286768e-01 1.34611458e-01 -1.43338870e-02 3.83034289e-01
2.66958773e-01 2.08140165e-02 6.13297224e-01 2.19453588e-01
-2.32486799e-01 2.24392891e-01 2.88341671e-01 -2.75768310e-01
6.81275606e-01 -2.02831313e-01 -7.40723163e-02 -1.66016743e-01
-6.95668042e-01 5.25998890e-01 -1.13620602e-01 -3.82012874e-01
-8.22404325e-02 -2.63661653e-01 -4.17716652e-01 2.05385938e-01
2.15466067e-01 -1.07346162e-01 -1.15236171e-01 2.63738424e-01
-1.36661410e-01 1.90679073e-01 -1.88163004e-03 3.70871603e-01
2.03539170e-02 2.83421218e-01 4.02853429e-01 -2.21123248e-02
-1.54945225e-01 -9.85328630e-02 -7.06284493e-02 1.31543204e-01
3.75243634e-01 -1.22791380e-01 5.21236777e-01 4.91459191e-01
-4.35250998e-01 -3.62768501e-01 1.24516673e-01 -4.28938717e-01
2.76040763e-01 3.15282971e-01 -1.53081775e-01 -6.49789631e-01
2.74431437e-01 -5.55682719e-01 -8.76603052e-02 4.11549360e-01
-2.95616418e-01 -3.63209605e-01 2.95839936e-01 3.48195791e-01
-2.38425374e-01 -2.08233610e-01 -1.75582811e-01 -3.28338861e-01
5.12714326e-01 -1.87658891e-01 -2.33509511e-01 4.07605857e-01
-2.99323261e-01 -8.34180117e-02 4.35251407e-02 3.75431739e-02
-1.58620626e-02 4.56823647e-01 3.66483241e-01 1.54309824e-01
-1.03329211e-01 -1.40446005e-02 4.41232733e-02 4.61825490e-01
2.37411503e-02 6.56094402e-02 -6.57415241e-02 -1.13562010e-01
-4.15926397e-01 5.27659595e-01 7.06111565e-02 1.34148777e-01
-3.60802680e-01 1.17561802e-01 -3.98860484e-01 3.12916517e-01
4.18521076e-01 -1.69889539e-01 2.68602937e-01 -3.33626688e-01
-2.03481883e-01 2.07403064e-01 -3.03639412e-01 -1.66668415e-01
2.40503717e-03 5.71818888e-01 6.53212294e-02 3.79095048e-01]
备注:本文代码并非原创。因为需要做聚类,我把博客上与这部分相关的代码几乎都尝试了一遍,这一份运行时没有报错,感谢 LinYimeng 大神。