自然语言处理 NLTK

import re

from nltk.tokenize import MWETokenizer


# Build the multi-word-expression tokenizer. Each phrase below is split on
# whitespace into a tuple of words, e.g. "risk factor" -> ('risk', 'factor'),
# which is the form MWETokenizer expects. When a pre-split token list is passed
# to tokenizer.tokenize(), these word sequences are merged into single tokens.
tokenizer = MWETokenizer([
    tuple(phrase.split())
    for phrase in (
        'molecular pathogenesis',
        'molecular basis',
        'cognitive assessment',
        'clinical intervention',
        'clinical interventions',
        'risk factor',
        'risk factors',
        'assisted care',
    )
])

# Normalise the i-th title/abstract to lower case; the multi-word expressions
# registered with the tokenizer are all written in lower case.
all_the_text = titleandabstractList[i].lower()
# Remove double quotes, commas and periods before splitting into words.
# A character class is used so that "." is taken literally: as a bare
# alternative ("...|.") it would match, and therefore delete, every character.
all_the_text = re.sub(r'[",.]', "", all_the_text)
for word in tokenizer.tokenize(all_the_text.split()):

原文地址:https://www.cnblogs.com/lovely7/p/6144936.html