深度学习之NLP获取词向量

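1、代码(运行前需先导入 re、numpy(np)、pandas(pd)和 bs4 的 BeautifulSoup,并准备好停用词集合 eng_stopwords 以及已加载的 300 维词向量模型 model)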

def clean_text(text, remove_stopwords=False):
    """
    Clean raw review text and split it into lowercase word tokens.

    The text is first passed through BeautifulSoup's ``html.parser`` to
    drop markup, then every character that is not an ASCII letter is
    replaced by a space, and finally the result is lowercased and split
    on whitespace.

    Args:
        text: Raw (possibly HTML) text to clean.
        remove_stopwords: When true, tokens found in the module-level
            ``eng_stopwords`` collection are dropped.

    Returns:
        A list of lowercase word tokens.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    return [token for token in tokens if token not in eng_stopwords]

def to_review_vector(review):
    """
    Turn a review into a single vector by averaging its word vectors.

    The review is cleaned with ``clean_text`` (stop words removed), each
    remaining token that is present in the module-level ``model`` is
    looked up, and the element-wise mean of those vectors is returned.

    Args:
        review: Raw review text.

    Returns:
        A ``pd.Series`` of length 300 (float64). If no token of the
        review is found in ``model`` the series is all zeros.
    """
    # NOTE(review): `model` is presumably a gensim-style word-vector lookup
    # with 300-dimensional embeddings (supports `in` and indexing) - confirm.
    tokens = clean_text(review, remove_stopwords=True)

    # Local accumulator: the previous `global word_vec` leaked this state
    # into module scope on every call without anything reading it.
    total = np.zeros(300)
    matched = 0
    for token in tokens:
        if token in model:
            total += model[token]
            matched += 1

    # The old code called `.mean(axis=0)` on a (1, 300) array, which averages
    # over a single row and therefore returned the raw sum. Divide by the
    # number of matched words to get the actual mean; skip the division when
    # nothing matched so the result stays a zero vector instead of NaN.
    if matched:
        total /= matched
    return pd.Series(total)
原文地址:https://www.cnblogs.com/ywjfx/p/11041113.html