【502】Implementing Word2Vec with gensim

Reference: Word Embedding Tutorial: word2vec using Gensim [EXAMPLE]

Reference: NLP入门(三)词形还原(Lemmatization)

Reference: 文本分类实战(一)—— word2vec预训练词向量

Reference: Implementing Word2Vec with Gensim Library in Python

  Text preprocessing

  • Tokenization
  • Convert words to lowercase
  • Remove punctuation from words
  • Remove digits from words
  • Remove empty strings
  • Remove stop words
  • Remove empty lists
  • Lemmatization

  First, import the necessary libraries

import gensim
import nltk
from gensim.models import Word2Vec

# stop words (run nltk.download('stopwords') once if the corpus is not installed)
from nltk.corpus import stopwords
stop = stopwords.words('english')

# punctuation characters
import string
# string.punctuation

# lemmatization (run nltk.download('wordnet') once if the corpus is not installed)
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

  Load the data and display it

data = [{"tag": "welcome",
"patterns": ["Hi", "How are you", "Is any one to talk?", "Hello", "hi are you available"],
"responses": ["Hello, thanks for contacting us", "Good to see you here"," Hi there, how may I assist you?"]
        },
{"tag": "goodbye",
"patterns": ["Bye", "See you later", "Goodbye", "I will come back soon"],
"responses": ["See you later, thanks for visiting", "have a great day ahead", "Wish you Come back again soon."]
        },
{"tag": "thankful",
"patterns": ["Thanks for helping me", "Thank your guidance", "That's helpful and kind from you"],
"responses": ["Happy to help!", "Any time!", "My pleasure", "It is my duty to help you"]
        },
        {"tag": "hoursopening",
"patterns": ["What hours are you open?", "Tell your opening time?", "When are you open?", "Just your timing please"],
"responses": ["We're open every day 8am-7pm", "Our office hours are 8am-7pm every day", "We open office at 8 am and close at 7 pm"]
        },

{"tag": "payments",
"patterns": ["Can I pay using credit card?", " Can I pay using Mastercard?", " Can I pay using cash only?" ],
"responses": ["We accept VISA, Mastercard and credit card", "We accept credit card, debit cards and cash. Please don’t worry"]
        }
   ]

bigger_list = []

# tokenize: split each pattern on spaces, so every pattern becomes a list of words
for i in range(len(data)):
    for s in data[i]['patterns']:
        li = s.split(" ")
        bigger_list.append(li)

bigger_list

  The output is as follows:

[['Hi'],
 ['How', 'are', 'you'],
 ['Is', 'any', 'one', 'to', 'talk?'],
 ['Hello'],
 ['hi', 'are', 'you', 'available'],
 ['Bye'],
 ['See', 'you', 'later'],
 ['Goodbye'],
 ['I', 'will', 'come', 'back', 'soon'],
 ['Thanks', 'for', 'helping', 'me'],
 ['Thank', 'your', 'guidance'],
 ["That's", 'helpful', 'and', 'kind', 'from', 'you'],
 ['What', 'hours', 'are', 'you', 'open?'],
 ['Tell', 'your', 'opening', 'time?'],
 ['When', 'are', 'you', 'open?'],
 ['Just', 'your', 'timing', 'please'],
 ['Can', 'I', 'pay', 'using', 'credit', 'card?'],
 ['', 'Can', 'I', 'pay', 'using', 'Mastercard?'],
 ['', 'Can', 'I', 'pay', 'using', 'cash', 'only?']]
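
  Note the empty '' tokens in the last two lists: they come from the leading spaces in the payment patterns, because str.split(" ") keeps empty fields. As an aside, nltk's tokenizer handles this more gracefully; a minimal sketch (requires nltk.download('punkt') on first use):

from nltk import word_tokenize
word_tokenize(' Can I pay using Mastercard?')
# ['Can', 'I', 'pay', 'using', 'Mastercard', '?']

  The walkthrough below sticks with the simple split and removes the empty strings in a later step.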

  Convert all words to lowercase:

# convert every word to lowercase
bigger_list = [[w.lower() for w in s] for s in bigger_list]
bigger_list

  The output is as follows:

[['hi'],
 ['how', 'are', 'you'],
 ['is', 'any', 'one', 'to', 'talk?'],
 ['hello'],
 ['hi', 'are', 'you', 'available'],
 ['bye'],
 ['see', 'you', 'later'],
 ['goodbye'],
 ['i', 'will', 'come', 'back', 'soon'],
 ['thanks', 'for', 'helping', 'me'],
 ['thank', 'your', 'guidance'],
 ["that's", 'helpful', 'and', 'kind', 'from', 'you'],
 ['what', 'hours', 'are', 'you', 'open?'],
 ['tell', 'your', 'opening', 'time?'],
 ['when', 'are', 'you', 'open?'],
 ['just', 'your', 'timing', 'please'],
 ['can', 'i', 'pay', 'using', 'credit', 'card?'],
 ['', 'can', 'i', 'pay', 'using', 'mastercard?'],
 ['', 'can', 'i', 'pay', 'using', 'cash', 'only?']]

  Remove punctuation from inside each word

import string
# string.punctuation holds all ASCII punctuation characters
# string.punctuation

# example: strip the punctuation from a single word
# ''.join([x for x in 'alex?' if x not in string.punctuation])
# output: alex

# strip the punctuation from every word
bigger_list = [[''.join([x for x in w if x not in string.punctuation]) for w in s] for s in bigger_list]
bigger_list

  The output is as follows:

[['hi'],
 ['how', 'are', 'you'],
 ['is', 'any', 'one', 'to', 'talk'],
 ['hello'],
 ['hi', 'are', 'you', 'available'],
 ['bye'],
 ['see', 'you', 'later'],
 ['goodbye'],
 ['i', 'will', 'come', 'back', 'soon'],
 ['thanks', 'for', 'helping', 'me'],
 ['thank', 'your', 'guidance'],
 ['thats', 'helpful', 'and', 'kind', 'from', 'you'],
 ['what', 'hours', 'are', 'you', 'open'],
 ['tell', 'your', 'opening', 'time'],
 ['when', 'are', 'you', 'open'],
 ['just', 'your', 'timing', 'please'],
 ['can', 'i', 'pay', 'using', 'credit', 'card'],
 ['', 'can', 'i', 'pay', 'using', 'mastercard'],
 ['', 'can', 'i', 'pay', 'using', 'cash', 'only']]

  Remove empty strings

# drop the empty strings left over from splitting on spaces
bigger_list = [[w for w in s if w != ''] for s in bigger_list]
bigger_list

  The output is as follows:

[['hi'],
 ['how', 'are', 'you'],
 ['is', 'any', 'one', 'to', 'talk'],
 ['hello'],
 ['hi', 'are', 'you', 'available'],
 ['bye'],
 ['see', 'you', 'later'],
 ['goodbye'],
 ['i', 'will', 'come', 'back', 'soon'],
 ['thanks', 'for', 'helping', 'me'],
 ['thank', 'your', 'guidance'],
 ['thats', 'helpful', 'and', 'kind', 'from', 'you'],
 ['what', 'hours', 'are', 'you', 'open'],
 ['tell', 'your', 'opening', 'time'],
 ['when', 'are', 'you', 'open'],
 ['just', 'your', 'timing', 'please'],
 ['can', 'i', 'pay', 'using', 'credit', 'card'],
 ['can', 'i', 'pay', 'using', 'mastercard'],
 ['can', 'i', 'pay', 'using', 'cash', 'only']]

  Remove stop words

from nltk.corpus import stopwords
# the English stop-word list
stop = stopwords.words('english')

# drop the stop words
bigger_list = [[w for w in s if w not in stop] for s in bigger_list]
bigger_list

  The output is as follows:

[['hi'],
 [],
 ['one', 'talk'],
 ['hello'],
 ['hi', 'available'],
 ['bye'],
 ['see', 'later'],
 ['goodbye'],
 ['come', 'back', 'soon'],
 ['thanks', 'helping'],
 ['thank', 'guidance'],
 ['thats', 'helpful', 'kind'],
 ['hours', 'open'],
 ['tell', 'opening', 'time'],
 ['open'],
 ['timing', 'please'],
 ['pay', 'using', 'credit', 'card'],
 ['pay', 'using', 'mastercard'],
 ['pay', 'using', 'cash']]

  Remove empty lists

# drop any list that became empty after stop-word removal
bigger_list = [s for s in bigger_list if len(s) > 0]
bigger_list

  The output is as follows:

[['hi'],
 ['one', 'talk'],
 ['hello'],
 ['hi', 'available'],
 ['bye'],
 ['see', 'later'],
 ['goodbye'],
 ['come', 'back', 'soon'],
 ['thanks', 'helping'],
 ['thank', 'guidance'],
 ['thats', 'helpful', 'kind'],
 ['hours', 'open'],
 ['tell', 'opening', 'time'],
 ['open'],
 ['timing', 'please'],
 ['pay', 'using', 'credit', 'card'],
 ['pay', 'using', 'mastercard'],
 ['pay', 'using', 'cash']]

  Lemmatization

# lemmatize every word (WordNetLemmatizer defaults to noun lemmas, e.g. 'hours' -> 'hour')
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
bigger_list = [[wnl.lemmatize(w) for w in s] for s in bigger_list]
bigger_list

  The output is as follows:

[['hi'],
 ['one', 'talk'],
 ['hello'],
 ['hi', 'available'],
 ['bye'],
 ['see', 'later'],
 ['goodbye'],
 ['come', 'back', 'soon'],
 ['thanks', 'helping'],
 ['thank', 'guidance'],
 ['thats', 'helpful', 'kind'],
 ['hour', 'open'],
 ['tell', 'opening', 'time'],
 ['open'],
 ['timing', 'please'],
 ['pay', 'using', 'credit', 'card'],
 ['pay', 'using', 'mastercard'],
 ['pay', 'using', 'cash']]
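
  The steps above can also be folded into a single helper. A minimal sketch, assuming the imports already shown; it adds the digit-removal step from the checklist, which the walkthrough skips because this corpus happens to contain no digits:

def preprocess(patterns):
    """Tokenize, lowercase, strip punctuation/digits/stop words, lemmatize."""
    cleaned = []
    for sentence in patterns:
        words = [w.lower() for w in sentence.split(" ")]
        words = [''.join(c for c in w if c not in string.punctuation) for w in words]
        words = [w for w in words if w and not w.isdigit() and w not in stop]
        words = [wnl.lemmatize(w) for w in words]
        if words:  # drop sentences that became empty
            cleaned.append(words)
    return cleaned

bigger_list = preprocess(s for item in data for s in item['patterns'])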

  Train, save, and reload the model

# train the model (gensim 3.x parameter names; gensim 4.x renamed size to vector_size)
model = Word2Vec(bigger_list, min_count=1, size=300, workers=4)

# save the model (both calls use gensim's own format; only the file name differs)
model.save("word2vec.model")
model.save('word2vec.bin')

# load the model
model = Word2Vec.load('word2vec.bin')

# vocabulary (gensim 3.x; in gensim 4.x use list(model.wv.key_to_index))
list(model.wv.vocab)

# the vector for 'thanks'
model.wv.word_vec('thanks')
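
  Once trained, the model supports the usual similarity queries. A quick sanity check (gensim 3.x API; with a corpus this tiny the neighbours are essentially random):

# cosine similarity between two words
model.wv.similarity('thanks', 'thank')

# the three nearest neighbours of 'pay'
model.wv.most_similar('pay', topn=3)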

  

The word2vec API explained

  In gensim, the word2vec-related APIs live in the package gensim.models.word2vec, and the algorithm parameters are set on the class gensim.models.word2vec.Word2Vec. The parameters worth knowing are listed below (these are the gensim 3.x names; gensim 4.0 renamed size to vector_size and iter to epochs); a short sketch after the list shows them in use.

  1. sentences: the corpus to analyze. It can be a list, or read lazily from a file (word2vec.LineSentence(filename)).
  2. size: the dimensionality of the word vectors; the default is 100. A sensible value depends on the corpus size: for a modest corpus, say under 100 MB of text, the default is usually fine; for a very large corpus, a higher dimensionality is recommended.
  3. window: the maximum distance between the current word and a context word; the larger the window, the more distant words count as context. The default is 5. In practice, tune it to your needs: smaller values suit small corpora, and for typical corpora values in [5, 10] are recommended.
  4. sg: selects between the two word2vec architectures: 0 for CBOW, 1 for Skip-Gram. The default is 0, i.e. CBOW.
  5. hs: selects between the two training strategies: if 1, Hierarchical Softmax is used; if 0 and the negative-sample count negative is greater than 0, Negative Sampling is used. The default is 0, i.e. Negative Sampling.
  6. negative: the number of negative samples drawn for Negative Sampling; the default is 5, and values in [3, 10] are recommended. This is the quantity written as neg in the theory write-up the original post refers to.
  7. cbow_mean: only used by CBOW when projecting the context: if 0, x_w is the sum of the context word vectors; if 1, their mean. The theory write-up describes the mean, which is also the default (1); changing it is not recommended.
  8. min_count: the minimum word frequency required for a word to get a vector, which filters out very rare words; the default is 5. Lower it for small corpora.
  9. iter: the maximum number of passes (epochs) of stochastic gradient descent over the corpus; the default is 5. Increase it for large corpora.
  10. alpha: the initial learning rate for stochastic gradient descent, written as η in the theory write-up; the default is 0.025.
  11. min_alpha: since the algorithm decays the learning rate during training, min_alpha is its floor. The effective per-round learning rate is determined jointly by iter, alpha, and min_alpha.
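
  A minimal sketch that ties these parameters together (gensim 3.x names, as above):

from gensim.models import Word2Vec

sentences = [['hi'], ['hi', 'available'], ['pay', 'using', 'cash']]
model = Word2Vec(sentences,
                 size=100,        # dimensionality of the word vectors
                 window=5,        # maximum context distance
                 sg=0,            # 0 = CBOW, 1 = Skip-Gram
                 hs=0,            # 0 plus negative > 0 = Negative Sampling
                 negative=5,      # number of negative samples
                 cbow_mean=1,     # average (not sum) the context vectors
                 min_count=1,     # keep every word in this tiny corpus
                 iter=5,          # training epochs
                 alpha=0.025, min_alpha=0.0001)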

  Processing with json and pandas
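
  This variant reads the same data from a JSON file. intents.json is assumed to contain the data list shown earlier; it can be written out with, for example:

import json
with open('intents.json', 'w') as f:
    json.dump(data, f)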

# libraries used by the code
import string
from gensim.models import Word2Vec
import logging
from nltk.corpus import stopwords
from textblob import Word
import json
import pandas as pd

# data in json format
json_file = 'intents.json'
with open(json_file, 'r') as f:
    data = json.load(f)

# the list of stop words
stop = stopwords.words('english')

# dataframe
df = pd.DataFrame(data)

# join each tag's patterns into one string
df['patterns'] = df['patterns'].apply(', '.join)
# print(df['patterns'])

# clean the data using the NLP approach
print(df)
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x.lower() for x in x.split()))
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if x not in string.punctuation))
df['patterns'] = df['patterns'].str.replace(r'[^\w\s]', '', regex=True)  # strip any remaining non-word characters
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if not x.isdigit()))
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(x for x in x.split() if x not in stop))
df['patterns'] = df['patterns'].apply(lambda x: ' '.join(Word(word).lemmatize() for word in x.split()))

# build the outer list of token lists
bigger_list = []
for i in df['patterns']:
    li = list(i.split(" "))
    bigger_list.append(li)

# structure of the data to be fed to the word2vec model
print("Data format for the overall list:", bigger_list)

# the custom data is fed to the model for training
model = Word2Vec(bigger_list, min_count=1, size=300, workers=4)
# print(model)
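
  A quick check on the result, again assuming the gensim 3.x API:

# vocabulary learned from the cleaned patterns
print(list(model.wv.vocab))

# one vector; its shape is (300,)
print(model.wv['pay'].shape)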

  

Original article: https://www.cnblogs.com/alex-bn-lee/p/14111655.html