Document Classification

Natural Language Processing with Python

Chapter 6.1

由于nltk.FreqDist的排序问题，获取电影文本特征词的代码与书中略有不同：这里先把所有单词按词频从高到低排序（词频相同时按单词的字典序），再取前2000个词作为特征词。

import random

import nltk
from nltk.corpus import movie_reviews as mr

 4 def document_features(document,words_features):
 5     document_words=set(document)
 6     features={}
 7     for word in words_features:
 8         features['has(%s)' %word] = (word in document_words)
 9     return features   
10 
def test_doc_classification():
    """Train and evaluate a Naive Bayes classifier on the movie reviews.

    Every review becomes a word-presence feature dict over the 2000 most
    frequent lower-cased words of the corpus. After shuffling, the first
    100 reviews are held out as the test set and the remainder is used for
    training. Prints the test-set accuracy, then the 5 most informative
    features of the trained classifier.
    """
    documents = [(list(mr.words(fileid)), category)
                 for category in mr.categories()
                 for fileid in mr.fileids(categories=category)]
    # The list above is built one category at a time, so without shuffling
    # the leading 100-document test slice would come from a single category
    # and the accuracy would not reflect both classes.
    random.shuffle(documents)

    all_words_dist = nltk.FreqDist(w.lower() for w in mr.words())
    # Order by descending count and break ties alphabetically so that the
    # selected vocabulary does not depend on FreqDist's iteration order.
    words_freq = sorted(all_words_dist.items(),
                        key=lambda item: (-item[1], item[0]))[:2000]
    words_features = [word for word, _count in words_freq]

    featuresets = [(document_features(doc, words_features), category)
                   for doc, category in documents]

    train_set, test_set = featuresets[100:], featuresets[:100]
    classifier = nltk.NaiveBayesClassifier.train(train_set)

    # A parenthesised single argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call.
    print(nltk.classify.accuracy(classifier, test_set))

    classifier.show_most_informative_features(5)

结果如下，accuracy为0.86：

0.86
Most Informative Features
has(outstanding) = True pos : neg = 10.4 : 1.0
has(seagal) = True neg : pos = 8.7 : 1.0
has(mulan) = True pos : neg = 8.1 : 1.0
has(wonderfully) = True pos : neg = 6.3 : 1.0
has(damon) = True pos : neg = 5.7 : 1.0