Using a Naive Bayes Classifier to Reveal Regional Preferences from Personal Ads

Introduction

We train a classifier on personal ads from two different cities and then observe how it performs. The goal is not classification itself: by inspecting the words and their conditional probability values, we can discover content that is characteristic of each city.
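The code below leans on several helpers defined earlier in the chapter's bayes.py: textParse, createVocabList, bagOfWords2VecMN, trainNB0 and classifyNB. For reference, the training and classification routines look roughly like this (a sketch of the book's versions, which smooth the counts and work in log space):

from numpy import log, ones

def trainNB0(trainMatrix, trainCategory):
    # Estimate the class-1 prior and per-word log conditional probabilities,
    # with add-one (Laplace) smoothing so no probability is ever zero.
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pClass1 = sum(trainCategory) / float(numTrainDocs)
    p0Num = ones(numWords); p1Num = ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]; p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]; p0Denom += sum(trainMatrix[i])
    return log(p0Num / p0Denom), log(p1Num / p1Denom), pClass1

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    # Log probabilities turn products into sums; pick the larger posterior.
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0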

import operator
import random

from numpy import array

# textParse, createVocabList, bagOfWords2VecMN, trainNB0 and classifyNB
# are defined earlier in the chapter's bayes.py.

def calcMostFreq(vocabList, fullText):
    # Count how often each vocabulary word occurs in the full text
    # and return the 30 most frequent (word, count) pairs.
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def localWords(feed1, feed0):
    # Build a labeled corpus from two parsed RSS feeds, then train and
    # test a naive Bayes classifier on a random 20-document holdout.
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)                           # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)                           # SF is class 0
    vocabList = createVocabList(docList)              # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)    # remove the 30 most frequent words
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen)); testSet = []   # create test set
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:                      # train the classifier (get probs)
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:                          # classify the held-out items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V

The first function, calcMostFreq, iterates over every word in the vocabulary and counts how many times it appears in the full text, sorts the dictionary by count from high to low, and returns the 30 most frequent words.
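Counting with fullText.count makes calcMostFreq quadratic; an equivalent one-pass sketch using collections.Counter (my variant, not the book's code):

from collections import Counter

def calcMostFreq2(vocabList, fullText):
    # One pass over the text instead of one fullText.count() per word.
    vocab = set(vocabList)
    counts = Counter(tok for tok in fullText if tok in vocab)
    return counts.most_common(30)       # top 30 (word, count) pairs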

The second function, localWords, takes two RSS feeds as arguments. It is almost identical to the earlier spamTest function; the difference is that it reads entries from RSS feeds instead of from files.
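The textParse helper, assumed from earlier in the chapter, splits an entry's summary into lowercase tokens and drops very short ones; it looks roughly like this:

import re

def textParse(bigString):
    # Split on any run of non-word characters; keep tokens longer than
    # two characters, lowercased.
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]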

The bag-of-words model improves on the set-of-words model for document classification, because it records how many times each word occurs rather than merely whether it occurs.
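The difference lies in how the document vectors are built: a set-of-words vector only flags a word's presence, while the bag-of-words vector used above (bagOfWords2VecMN) counts occurrences. Roughly as in the chapter's bayes.py:

def setOfWords2Vec(vocabList, inputSet):
    # Set-of-words: 1 if the word appears at all, else 0.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
    return returnVec

def bagOfWords2VecMN(vocabList, inputSet):
    # Bag-of-words: count how many times each word appears.
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec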

def getTopWords(ny, sf):
    # Train via localWords, then print the words whose log conditional
    # probability exceeds -6.0 for each city, most probable first.
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
>>> import feedparser
>>> import bayes
>>> ny=feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
>>> sf=feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
>>> vocabList,psf,pny=bayes.localWords(ny,sf)
the error rate is:  0.25
>>> vocabList,psf,pny=bayes.localWords(ny,sf)
the error rate is:  0.45
>>> vocabList,psf,pny=bayes.localWords(ny,sf)
the error rate is:  0.4
>>> vocabList,psf,pny=bayes.localWords(ny,sf)
the error rate is:  0.3
>>> vocabList,psf,pny=bayes.localWords(ny,sf)
the error rate is:  0.25
>>> bayes.getTopWords(ny,sf)

getTopWords takes the two RSS feeds as input, trains and tests the naive Bayes classifier via localWords, and gets back the vocabulary and the two conditional probability vectors. It then builds two lists, keeps every word whose log-probability exceeds the -6.0 threshold, and prints each city's words sorted by conditional probability in descending order. Because the 20 test documents are drawn at random, the error rate printed by localWords varies from run to run, as the session above shows.
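For intuition about the -6.0 cutoff (a quick check, not from the original post): trainNB0 returns log probabilities, so the threshold keeps words whose estimated conditional probability exceeds about e^-6.

import math

# A log-probability of -6.0 corresponds to a conditional probability
# of about 0.25%, so only reasonably common words survive the cutoff.
print(math.exp(-6.0))   # ~0.00248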

Original post (Chinese): https://www.cnblogs.com/xiaoyingying/p/7524923.html