Naive Bayes

Computing probabilities from word vectors


import numpy as np

def loadDataSet():
    """
    实验样本
    :return: 第一个变量是进行词条切分后的文档集合,第二个变量是一个类别标签的集合
    """
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]  # manually labeled: 1 is abusive (posts containing 'stupid', 'worthless', 'garbage'), 0 is not
    return postingList, classVec

def createVocabList(dataSet):
    """
    创建一个包含所以文档中出现的不重复词的列表
    :param dataSet:
    :return:
    """
    vocabSet = set([])  #create empty set
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)

def setOfWords2Vec(vocabList, inputSet):
    """

    :param vocabList: 词汇表
    :param inputSet: 某个文档
    :return: 词汇表长度的列表,1表示出现,0没有出现
    """
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    """
    分类器训练
    :param trainMatrix: 文档矩阵 训练集
    :param trainCategory: 文档类别标签向量
    :return:
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = np.zeros(numWords); p1Num = np.zeros(numWords)      #change to np.ones()
    # p0Num = np.ones(numWords); p1Num = np.ones(numWords)      #change to np.ones()
    p0Denom = 0.0; p1Denom = 0.0                        #change to 2.0
    # p0Denom = 2.0; p1Denom = 2.0                        #change to 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num/p1Denom          #change to np.log()
    # p1Vect = np.log(p1Num/p1Denom)          #change to np.log()
    p0Vect = p0Num/p0Denom         #change to np.log()
    # p0Vect = np.log(p0Num/p0Denom)          #change to np.log()
    return p0Vect, p1Vect, pAbusive

if __name__ == '__main__':
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    print(myVocabList)

    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    print(trainMat)
    p0v, p1v, pAb = trainNB0(trainMat, listClasses)
    print(pAb)
    print(p0v)
    print(p1v)

'''
['love', 'take', 'cute', 'so', 'flea', 'posting', 'stop', 'help', 'mr', 'stupid', 'ate', 'garbage', 'has', 'I', 'problems', 'licks', 'worthless', 'is', 'how', 'not', 'maybe', 'dalmation', 'food', 'buying', 'please', 'him', 'park', 'quit', 'steak', 'my', 'dog', 'to']
[[0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1], [1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0]]
0.5
[0.04166667 0.         0.04166667 0.04166667 0.04166667 0.
 0.04166667 0.04166667 0.04166667 0.         0.04166667 0.
 0.04166667 0.04166667 0.04166667 0.04166667 0.         0.04166667
 0.04166667 0.         0.         0.04166667 0.         0.
 0.04166667 0.08333333 0.         0.         0.04166667 0.125
 0.04166667 0.04166667]
[0.         0.05263158 0.         0.         0.         0.05263158
 0.05263158 0.         0.         0.15789474 0.         0.05263158
 0.         0.         0.         0.         0.10526316 0.
 0.         0.05263158 0.05263158 0.         0.05263158 0.05263158
 0.         0.05263158 0.05263158 0.05263158 0.         0.
 0.10526316 0.05263158]


'cute' appears once in class 0 and zero times in class 1; its conditional probabilities are 0.04166667 and 0 respectively.

The largest probability in p1v is 0.15789474, corresponding to 'stupid', which means 'stupid' is the word most indicative of class 1.

'''
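
As a quick check of that claim, the index of the largest value in p1v can be looked up in the vocabulary (a small sketch reusing myVocabList and p1v from the __main__ block above):

idx = int(np.argmax(p1v))   # position of the largest conditional probability in class 1
print(myVocabList[idx])     # prints 'stupid'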

Modifying the classifier

When computing the product of multiple probabilities to obtain the probability that a document belongs to a class, if any single probability is 0, the final product is also 0. To reduce this effect, initialize the occurrence count of every word to 1 and the denominators to 2:
p0Num = np.ones(numWords); p1Num = np.ones(numWords)
p0Denom = 2.0; p1Denom = 2.0
There is also an underflow problem, caused by multiplying many very small numbers. Taking the logarithm avoids errors from underflow and floating-point rounding:
p1Vect = np.log(p1Num/p1Denom)
p0Vect = np.log(p0Num/p0Denom)
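
Putting both fixes into the trainer gives the following version (a sketch with the same interface as the trainNB0 above, matching the lines commented out in that listing):

def trainNB0(trainMatrix, trainCategory):
    """
    Train the classifier with smoothed counts and log probabilities.
    :param trainMatrix: document matrix (the training set)
    :param trainCategory: vector of document class labels
    :return: log conditional probability vectors for class 0 and class 1,
             and the prior probability of an abusive document
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = np.ones(numWords); p1Num = np.ones(numWords)  # every word starts with count 1
    p0Denom = 2.0; p1Denom = 2.0                          # denominators start at 2
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = np.log(p1Num/p1Denom)  # log probabilities avoid underflow later
    p0Vect = np.log(p0Num/p0Denom)
    return p0Vect, p1Vect, pAbusive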

The classification function

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """
    分类函数
    :param vec2Classify:
    :param p0Vec:
    :param p1Vec:
    :param pClass1:
    :return:
    """
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)    #element-wise mult
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
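
Why a sum plus a log prior is the right computation: by Bayes' rule, p(ci|w) = p(w|ci)p(ci)/p(w), and under the naive independence assumption p(w|ci) is the product of p(wj|ci) over the words in the document. Taking logs turns that product into a sum:

log(p(w|ci)p(ci)) = sum_j log p(wj|ci) + log p(ci)

which is exactly sum(vec2Classify * pVec) + np.log(prior) in the code. The denominator p(w) is identical for both classes, so it can be dropped and p1 compared with p0 directly.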

Testing

def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(thisDoc, p0V, p1V, pAb))
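
Calling testingNB() (this assumes trainNB0 has been switched to the smoothed, log version above; with the original zero-initialized version the log-probability sums are not meaningful) should print:

testingNB()
# ['love', 'my', 'dalmation'] classified as:  0
# ['stupid', 'garbage'] classified as:  1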
Original article: https://www.cnblogs.com/fly-book/p/14215301.html