【Machine Learning】 Decision Trees

The Decision Tree Algorithm

As I understand it, the decision tree algorithm amounts to ranking features by importance. In this example the ranking criterion is Shannon entropy: the higher the entropy, the greater the uncertainty, and the feature whose split removes the most uncertainty (i.e., yields the largest information gain) is chosen first.
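
To make the formula concrete, here is a minimal standalone sketch (my addition, not part of trees.py below) that computes H = -Σ p·log2(p) for the five-sample data set used in this post, which has two 'yes' and three 'no' labels:

from math import log

class_labels = ['yes', 'yes', 'no', 'no', 'no']
# probability of each distinct class label
probs = [class_labels.count(c) / len(class_labels) for c in set(class_labels)]
entropy = -sum(p * log(p, 2) for p in probs)
print(entropy)  # 0.9709505944546686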

The choice of feature-scoring method is presumably the main source of variety among decision-tree algorithms and a major factor in classifier accuracy.

After that, classification is just a chain of if/else tests down the tree (see the classification sketch after the listing).

trees.py

from math import log
import operator

'''
    Compute the Shannon entropy of the data set: H = -sum(p * log2(p)),
    where p is the fraction of samples in each class (last column).
'''
def calcShannoEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannoEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries
        shannoEnt -= prob * log(prob, 2)
    return shannoEnt
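
# Quick check (values assume the sample dataSet from createDataSet below,
# which has two 'yes' and three 'no' labels):
#   myDat, labels = createDataSet()
#   calcShannoEnt(myDat)   # -> 0.9709505944546686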


def createDataSet():
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


'''
    Extract the rows whose value in column `axis` equals `value`; the
    returned rows omit that column (it would be `value` in every row).
    Note on list.extend: unlike append, extend unpacks its iterable;
    extending with a dict adds only the dict's keys.
'''
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reduceFeatVec = featVec[:axis]
            reduceFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reduceFeatVec)
    return retDataSet
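
# Example (with the sample dataSet above): keep the rows where feature 0
# has the given value, dropping that column:
#   splitDataSet(myDat, 0, 1)   # -> [[1, 'yes'], [1, 'yes'], [0, 'no']]
#   splitDataSet(myDat, 0, 0)   # -> [[1, 'no'], [1, 'no']]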

'''
    Try splitting on each feature in turn, compute the entropy of the
    resulting subsets, and return the index of the feature with the
    largest information gain.
'''
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannoEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        values = [val[i] for val in dataSet]
        uniqueValues = set(values)
        newEntropy = 0.0
        for value in uniqueValues:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannoEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
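
# Example (with the sample dataSet above): splitting on feature 0
# ('no surfacing') gives information gain ~0.420 versus ~0.171 for
# feature 1 ('flippers'), so the best feature index is 0:
#   chooseBestFeatureToSplit(myDat)   # -> 0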

'''
    Count the class labels and return the most common one.
'''
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
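
# Example: 'no' occurs twice, 'yes' once:
#   majorityCnt(['yes', 'no', 'no'])   # -> 'no'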

'''
    Build the decision tree recursively as nested dicts.
'''
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]    # class labels (last column)
    if classList.count(classList[0]) == len(classList):    # all labels identical: return that class
        return classList[0]
    if len(dataSet[0]) == 1:    # no features left: return the majority class
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)    # index of the best feature to split on
    bestFeatLabel = labels[bestFeat]    # name of that feature
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])    # drop the used feature's name (mutates the caller's list)
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)    # split on this value and recurse
    return myTree


if __name__ == '__main__':
    myDat, labels = createDataSet()
    mytree = createTree(myDat, labels)
    print(mytree)
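
With the sample data above, running the script prints the nested-dict tree {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}. To classify a new sample, you walk that dict with the sample's feature values until you reach a leaf; this is the if/else part mentioned at the start. A minimal sketch (the classify helper below is my addition, not part of the original trees.py; it only assumes the nested-dict shape that createTree returns):

def classify(inputTree, featLabels, testVec):
    featName = next(iter(inputTree))    # the single key is the feature to test
    featIndex = featLabels.index(featName)
    subtree = inputTree[featName][testVec[featIndex]]
    # an inner dict means another test; anything else is a class label
    if isinstance(subtree, dict):
        return classify(subtree, featLabels, testVec)
    return subtree

# Example:
#   myDat, labels = createDataSet()
#   tree = createTree(myDat, labels[:])    # pass a copy: createTree mutates labels
#   classify(tree, ['no surfacing', 'flippers'], [1, 0])    # -> 'no'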
Original post: https://www.cnblogs.com/yeyeck/p/9943668.html