# Brief NLTK notes (nltk简要笔记)

import nltk
from nltk.corpus import stopwords
# from nltk.stem.lancaster import LancasterStemmer  # stemming
# ls = LancasterStemmer(); ls.stem(word)

from db_process import MyProcess

# Punctuation tokens to drop from the tokenized text. Kept as a list so the
# name keeps its original type for any code that reads it.
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']

# Sample sentence used to demonstrate tokenization and filtering.
s = 'attention window eyes users: if you are using internet explorer 9 or 10, you may not be able to log in to the chase site or other internet sites., I went to facebook with my students.'

words = nltk.word_tokenize(s)  # split the sentence into tokens

# tags = nltk.pos_tag(words)  # show the part-of-speech tag of each token

# Load the stopword list once and merge it with the punctuation into a set,
# instead of re-reading the corpus and scanning a list for every token.
_excluded_tokens = set(english_punctuations)
_excluded_tokens.update(stopwords.words('english'))

# Tokens that are neither punctuation nor English stopwords, in their original
# order. Built as a list so the result can be indexed and iterated repeatedly
# (a bare filter() object on Python 3 is lazy and single-use).
# NOTE(review): the comparison is case-sensitive; if the stopword list is all
# lowercase, capitalized tokens such as 'I' are kept — confirm this is intended.
filter_words = [token for token in words if token not in _excluded_tokens]