(3) Machine Learning in Action Notes: Naive Bayes

Pros: still effective when data is scarce; can handle multi-class problems
Cons: sensitive to how the input data is prepared
Suitable data types: nominal (categorical) data
 
 
Convert a set of words into a set of numbers,
then use those numbers to compute probabilities.
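A quick refresher (standard theory, not specific to this post): the classifier picks the class c_i with the largest posterior probability, and "naive" refers to assuming the words w_k are conditionally independent given the class:

$$P(c_i \mid \mathbf{w}) = \frac{P(\mathbf{w} \mid c_i)\,P(c_i)}{P(\mathbf{w})}, \qquad P(\mathbf{w} \mid c_i) = \prod_k P(w_k \mid c_i)$$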
 
Famous application: filtering spam with naive Bayes
Workflow:
(1) Collect data: text files are provided
(2) Prepare data: parse the text files into token vectors
(3) Analyze data: inspect the tokens to make sure parsing is correct
(4) Train the algorithm: use the trainNB0() function we built earlier
(5) Test the algorithm: use classifyNB() and build a new test function that computes the document error rate
(6) Use the algorithm: build a complete program that classifies a set of documents and prints the misclassified ones to the screen
 
Tokenizing text: split with Python's str.split() method (a regular expression handles punctuation better; see textParse() below)
To estimate the classifier's error rate more reliably, run multiple iterations and average the error rates (a sketch of this appears after spamTest() below)

——————————————————————————————

A simple example: spam classification with a naive Bayes classifier.

An email text dataset is processed (converted into vectors),

and a naive Bayes classifier then decides whether each message is spam.

The code implements a simple naive Bayes classifier and a text-to-vector converter.

Detailed notes are in the code comments.

import numpy as np

#Prepare data: build word vectors from text
def loadDataSet():
    # Pre-tokenized posts (each inner list is one document)
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # Class label vector: 1 = abusive, 0 = not abusive
    classVec = [0, 1, 0, 1, 0, 1]
    # Return the tokenized documents and their class labels
    return postingList, classVec

def createVocabList(dataSet):
    #collect the unique words across all documents
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document) #take the union of the two sets

    return list(vocabSet)

#convert a document (the word list inputSet) into a 0/1 vector over vocabList (set-of-words model)
def setOfWords2Vec(vocabList,inputSet):
    #create a vector of zeros, one entry per vocabulary word
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1 #set-of-words: mark presence only (counts are handled by bagOfWords2VecMN below)
    # else: print("the word: %s is not in my Vocabulary!" % word)

    return returnVec

listOPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
# print(myVocabList)
# print(setOfWords2Vec(myVocabList,listOPosts[0]))

#Train the algorithm: compute probabilities from the word vectors
#Inputs: the document matrix trainMatrix and the vector of class labels trainCategory
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0]) #number of words in the (deduplicated) vocabulary

    #initialize the probabilities
    pAbusive=sum(trainCategory)/float(numTrainDocs) #prior probability that a document is abusive
    p0Num = np.ones(numWords) #word counts for the non-abusive class
    p1Num = np.ones(numWords) #word counts for the abusive class; both start at 1 (Laplace smoothing)
    #so that a single unseen word's zero probability cannot zero out the whole product
    p0Denom=2.0
    p1Denom=2.0

    for i in range(numTrainDocs):
        #compute the probability that a document is abusive (class=1), P(1)
        #for a binary problem, P(0) can be obtained as 1-P(1)
        #whenever a word appears in a document, its count is incremented,
        #and the total word count for that class is incremented accordingly

        if trainCategory[i]==1: #tally the words of abusive documents
            p1Num += trainMatrix[i] #add each word's occurrences at the matching positions
            p1Denom += sum(trainMatrix[i]) #running total of words seen in abusive documents
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])

    p1Vect=np.log(p1Num/p1Denom) #log conditional probabilities, so classifyNB can add instead of multiply
    p0Vect=np.log(p0Num/p0Denom) #this also avoids floating-point underflow

    return p0Vect,p1Vect,pAbusive
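Why trainNB0() returns log probabilities: multiplying many probabilities below 1 underflows to 0.0 in floating point, while summing their logs stays finite. A minimal sketch of the problem (my own illustration, not from the original post):

probs = np.full(1000, 0.01) #1000 small conditional probabilities, as for a long document
print(np.prod(probs)) #0.0 -- the product underflows
print(np.sum(np.log(probs))) #about -4605.2 -- the log of the same product, still finite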

trainMat=[]
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
    #for each document postinDoc, mark which vocabulary words appear in it (1 if present);
    #each returned vector has one entry per vocabulary word
# print(trainMat)

p0V,p1V,pAb=trainNB0(trainMat,listClasses)
# print("0")
# print(p0V)
# print("1")
# print(p1V)
# print("A")
# print(pAb)
# print(myVocabList)

#Naive Bayes classification function: takes the vector to classify and the three probabilities computed by trainNB0
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1 = sum(vec2Classify*p1Vec)+np.log(pClass1) #sum of log conditional probabilities plus log prior for class 1
    p0 = sum(vec2Classify*p0Vec)+np.log(1.0-pClass1) #same for class 0
    if p1>p0:
        return 1
    else:
        return 0
#a simple classification test
def testingNB():
    listOposts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOposts)
    trainMat=[]

    for postinDoc in listOposts:
        trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
    p0V,p1V,pAb = trainNB0(np.array(trainMat),np.array(listClasses))
    testEntry=['love','my','dalmation']

    thisDoc = np.array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb))
    testEntry=['stupid','garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList,testEntry))
    print(testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb))



testingNB()

#Bag-of-words model: each occurrence of a word increments the corresponding entry, rather than just setting it to 1
def bagOfWords2VecMN(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)]+=1
    return returnVec
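A quick check of how the two models differ on a document with repeated words (my own toy example, using a hypothetical four-word vocabulary):

vocab = ['my', 'dog', 'is', 'cute'] #hypothetical vocabulary
doc = ['my', 'dog', 'my', 'dog', 'is'] #'my' and 'dog' each appear twice
print(setOfWords2Vec(vocab, doc)) #[1, 1, 1, 0] -- presence only
print(bagOfWords2VecMN(vocab, doc)) #[2, 2, 1, 0] -- occurrence counts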

#Application: spam filtering

#Tokenizing the text

#test!
# mySent = 'This book is the best book on python or M.L. I have laid eyes upon.'
#
# import re
# regEx = re.compile(r'\W+')  # note: r'\W*' can match the empty string and misbehaves in Python 3
# listOfTokens = regEx.split(mySent)

#Test: hold-out cross-validation with naive Bayes
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+',bigString) #split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok)>2]
    #keep only tokens longer than two characters, all lowercased
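For instance (my own example), textParse() turns a raw string into lowercase tokens longer than two characters:

print(textParse('Hello, World! This is a simple TEST sentence.'))
#['hello', 'world', 'this', 'simple', 'test', 'sentence']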



#spamTest() automates naive Bayes spam classification: it loads the text files under
#spam and ham and parses each one into a word list. (*1)
#The probabilities the classifier needs are computed from the training documents only.
#The variable trainingSet is a list of integers ranging from 0 to 49. (*2)



def spamTest():
    docList=[]
    classList=[]
    fullText=[]
    main_email=[]
    for i in range(1,26):
        #(*1)
        main_e = open('email/spam/%d.txt'%i).read()
        wordList = textParse(main_e) #read each spam email once; parse it and keep the raw text
        main_email.append(main_e)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)

        main_e = open('email/ham/%d.txt'%i).read()
        wordList = textParse(main_e) #likewise for each ham email
        docList.append(wordList)
        main_email.append(main_e)
        fullText.extend(wordList)
        classList.append(0)



    #build the vocabulary
    vocabList = createVocabList(docList)
    # print("vocabList built:")
    # print(vocabList)
    # print("=========================================================")



    #split off a random test set (*2)
    trainingSet = list(range(50))
    testSet=[]
    for i in range(10): #randomly select 10 emails
        randIndex = int(np.random.uniform(0,len(trainingSet))) #draw a random index into the remaining pool
        testSet.append(trainingSet[randIndex]) #add that email's index to the test set
        del(trainingSet[randIndex]) #and remove it from the candidate pool


    trainMat=[]
    trainClasses =[]
    for docIndex in trainingSet:
        #build the word vector for each training document
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex]) #attach the corresponding label
    #train on the training documents only
    #
    # print(trainMat)
    # print(trainClasses)

    p0V,p1V,pSpam=trainNB0(np.array(trainMat),np.array(trainClasses))
    # print(p0V)

    errorCount = 0

    for docIndex in testSet:
        #for each test email, build its word vector: entries are 1 where a vocabList word appears in document docIndex
        # print("train")
        # print(docList[docIndex])
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        # print(wordVector)

        if classifyNB(np.array(wordVector),p0V,p1V,pSpam)!=classList[docIndex]:
            errorCount +=1
            print(main_email[docIndex])
            print(classifyNB(np.array(wordVector), p0V, p1V, pSpam))
            print(classList[docIndex])




    print('the error rate is :',float(errorCount)/len(testSet))

# spamTest()
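As noted at the top, a single random split gives a noisy error estimate; averaging over repeated runs is more reliable. A minimal sketch (my own addition, built on findthebest_Data_test() defined just below, which returns its error rate instead of printing it):

def averageErrorRate(numRuns=10):
    #average the hold-out error over repeated random train/test splits
    totalError = 0.0
    for _ in range(numRuns):
        _,_,_,errRate = findthebest_Data_test() #returns (p0V, p1V, pSpam, error_rate)
        totalError += errRate
    return totalError/numRuns

# print('average error rate:', averageErrorRate())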

#Searching for the best parameters


def findthebest_Data_test():
    docList = []
    classList = []
    fullText = []
    main_email = []
    for i in range(1, 26):
        # (*1)
        main_e = open('email/spam/%d.txt' % i).read()
        wordList = textParse(main_e) #read each spam email once; parse it and keep the raw text
        main_email.append(main_e)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)

        main_e = open('email/ham/%d.txt' % i).read()
        wordList = textParse(main_e) #likewise for each ham email
        docList.append(wordList)
        main_email.append(main_e)
        fullText.extend(wordList)
        classList.append(0)

    # build the vocabulary
    vocabList = createVocabList(docList)
    # print("vocabList built:")
    # print(vocabList)
    # print("=========================================================")

    # split off a random test set (*2)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # randomly select 10 emails
        randIndex = int(np.random.uniform(0, len(trainingSet)))  # draw a random index into the remaining pool
        testSet.append(trainingSet[randIndex])  # add that email's index to the test set
        del (trainingSet[randIndex])  # and remove it from the candidate pool

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        # build the word vector for each training document
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])  # attach the corresponding label
    # train on the training documents only
    #
    # print(trainMat)
    # print(trainClasses)

    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    # print(p0V)

    errorCount = 0

    for docIndex in testSet:
        # for each test email, build its word vector: entries are 1 where a vocabList word appears in document docIndex
        # print("train")
        # print(docList[docIndex])
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        # print(wordVector)

        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            # print(main_email[docIndex])
            # print(classifyNB(np.array(wordVector), p0V, p1V, pSpam))
            # print(classList[docIndex])

    # print('the error rate is :', float(errorCount) / len(testSet))
    error_rate=float(errorCount) / len(testSet)
    return p0V, p1V, pSpam,error_rate

def find_the_data():
    #run repeated random splits and keep the parameters from the run with the lowest hold-out error
    p0Num = np.ones(10) #placeholders; replaced by the first run that beats err
    p1Num = np.ones(10)
    PA = 0.0
    err=1
    for i in range(50):
        a,b,c,d=findthebest_Data_test()
        if d<err:
            err = d
            p0Num=a
            p1Num=b
            PA=c


    return p0Num,p1Num,PA



def final_test():
    #evaluate the selected parameters on all 50 emails (note that these include the training data)
    p0,p1,pA =find_the_data()



    docList = []
    classList = []
    fullText = []
    main_email = []
    for i in range(1, 26):
        # (*1)
        main_e = open('email/spam/%d.txt' % i).read()
        wordList = textParse(main_e) #read each spam email once; parse it and keep the raw text
        main_email.append(main_e)
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)

        main_e = open('email/ham/%d.txt' % i).read()
        wordList = textParse(main_e) #likewise for each ham email
        docList.append(wordList)
        main_email.append(main_e)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = createVocabList(docList)

    errorCount = 0

    for i in range(len(docList)):
        # build each document's word vector over vocabList
        # print("train")
        # print(docList[i])
        wordVector = setOfWords2Vec(vocabList, docList[i])
        # print(wordVector)


        if classifyNB(np.array(wordVector), p0, p1, pA) != classList[i]:
            errorCount += 1
            # print(main_email[i])
            # print(classifyNB(np.array(wordVector), p0, p1, pA))
            # print(classList[i])

    print('the error rate is :', float(errorCount) / len(docList))



final_test()

 

Original post: https://www.cnblogs.com/AKsnoopy/p/14085074.html