文本向量化

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# English stop words from NLTK; passing them through a set drops duplicate
# entries before they are turned back into a list.
stop_list = list(
    set(stopwords.words('english'))
)

# Toy corpus: four short documents used by every example below.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# -----------------------------------
# Plain bag-of-words model using CountVectorizer's default settings.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)    # learn the vocabulary and count terms per document

# One row per document, one column per vocabulary term.
print(X.toarray())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# .tolist() converts the returned array to a plain list of str so the printed
# form stays the same as before.
print(vectorizer.get_feature_names_out().tolist())

print()
# -----------------------------------

# N-gram bag-of-words model with stop-word filtering.
# NOTE: despite the variable name, ngram_range=(1, 3) extracts unigrams,
# bigrams and trigrams (e.g. 'second second document').
bigram_vectorizer = CountVectorizer(ngram_range=(1, 3),    # n-gram features, n = 1..3
                                    stop_words=stop_list)  # tokens to discard
X = bigram_vectorizer.fit_transform(corpus)

print(X.toarray())
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# .tolist() converts the returned array to a plain list of str so the printed
# form stays the same as before.
print(bigram_vectorizer.get_feature_names_out().tolist())

print()
# ------------------------------------

# The analyzer is the callable the vectorizer uses to turn raw text into
# tokens; calling it directly shows how a sentence is split up.
analyze = vectorizer.build_analyzer()
sample_tokens = analyze('This is a text document to analyze.')
print(sample_tokens)

# Encode documents that were not part of the fitted corpus, reusing the
# vocabulary already learned by `vectorizer` (transform, not fit_transform).
unseen_docs = [
    'something completely new.',
    'and this has something old.',
]
unseen_counts = vectorizer.transform(unseen_docs)
print(unseen_counts.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

[[1 1 1 0 0 0 0 0 0 0]
 [1 0 0 0 2 1 1 1 0 0]
 [0 0 0 1 0 0 0 0 1 1]
 [1 1 1 0 0 0 0 0 0 0]]
['document', 'first', 'first document', 'one', 'second', 'second document', 'second second', 'second second document', 'third', 'third one']

['this', 'is', 'text', 'document', 'to', 'analyze']
[[0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 1]]

CountVectorizer和TfidfVectorizer的参数：https://blog.csdn.net/du_qi/article/details/51564303
stopwords：https://www.cnblogs.com/webRobot/p/6079919.html