转自:https://radimrehurek.com/gensim/models/phrases.html
1. gensim.models.phrases.Phraser
//上面链接中的样例代码复制不行,绝了。。。
>>> from gensim.test.utils import datapath
>>> from gensim.models.word2vec import Text8Corpus
>>> from gensim.models.phrases import Phrases, Phraser
>>>
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))  # 首先加载作为分词的训练集
>>> phrases = Phrases(sentences, min_count=1, threshold=1)  # 从训练集中学习词组(搭配)统计
>>>
>>> bigram = Phraser(phrases)  # 这样训练出一个二元分词器
>>> sent = [u'trees', u'graph', u'minors']  # 在这个句子上测试
>>> print(bigram[sent])
[u'trees_graph', u'minors']  # 前两个word组合为了词组
其中参数 min_count 和 threshold 的取值越高,得到的词组就越少,也就是说单词被组合成二元词组(bigram)的难度越大。
下面是三词组:
https://blog.csdn.net/weixin_33826609/article/details/91472435
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)  # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  # 以双词组作为训练数据

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])