Decision tree based on the ID3 algorithm


from math import log
import operator

"""
使用ID3算法划分数据集,ID3算法可以用于划分标称型数据集

决策树分类器就像带有终止块的流程图,终止块表示分类结果。
开始处理数据集时,首先需要测量集合中数据的不一致,
然后寻找最优方案划分数据集,直到数据集中的所有数据属于同一分类
"""
def createDataSet():
    """ 不浮出水面是否可以生存     是否有脚蹼     属于鱼类 """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    #change to discrete values
    return dataSet, labels

def calcShannonEnt(dataSet):
    """计算信息熵(香农熵)
    H(x) = -∑P(xi)log(2,P(xi)) (i=1,2,..n)
    H 表示信息熵
    P 表示某种语言文字的字符出现的概率
    LOG2是以二为底的对数,用的是二进制,信息熵的单位是比特(BIT,即二进制的0和1)

    熵也可以作为一个系统的混乱程度的标准

    另一种:基尼不纯度 也可以作为衡量系统混乱程度的标准
        基尼 = 1 − ∑P(xi)^2 (i=1,2,..n)

    主要的区别就是基尼中把logP(xi)换成了P(xi),相比于熵,基尼反而有计算量小的优势
    """
    numEntries = len(dataSet)  # 计算数据集中实例的总数
    labelCounts = {}  # 类别次数字典
    for featVec in dataSet:  # count how many times each class label occurs
        currentLabel = featVec[-1]  # the class label is the last column
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0   # information entropy
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries  # relative frequency of this class
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
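
# A minimal sketch of the Gini impurity mentioned in the docstring above. ID3
# itself splits on information gain (entropy); Gini is what CART uses. calcGini
# is a hypothetical helper added only for illustration.
def calcGini(dataSet):
    """Gini = 1 - ∑P(xi)^2; cheaper than entropy because it avoids log()."""
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        label = featVec[-1]  # class label is the last column
        labelCounts[label] = labelCounts.get(label, 0) + 1
    gini = 1.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        gini -= prob * prob
    return gini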

def splitDataSet(dataSet, axis, value):
    """
    按照给定特征划分数据集  第axis个特征是value的数据集
    :param dataSet: 待划分的数据集
    :param axis: 划分数据集的特征索引
    :param value: 需要返回的特征的值
    :return: 符合的数据集
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]     # chop out the feature used for splitting
            reducedFeatVec.extend(featVec[axis+1:])  # extend flattens: [a, b, c]
            retDataSet.append(reducedFeatVec)  # append nests: [a, b, [c]]
    return retDataSet
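
# Example on the sample data: splitting on feature 0 with value 1 keeps the rows
# whose first feature is 1, minus that column:
#   splitDataSet(createDataSet()[0], 0, 1)  ->  [[1, 'yes'], [1, 'yes'], [0, 'no']]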

def chooseBestFeatureToSplit(dataSet):
    """
    选择最好的数据集划分方式
    选择熵最小的,也就是数据最纯的
    :param dataSet:
    :return: 最好特征划分的索引值
    """
    numFeatures = len(dataSet[0]) - 1  # 特征数,最后一个为标签 #the last column is used for the labels
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):        #iterate over all the features
        featList = [example[i] for example in dataSet]#create a list of all the examples of this feature
        uniqueVals = set(featList)  # set of unique values of the current feature
        newEntropy = 0.0
        for value in uniqueVals:  # split the data set once for each unique value
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet)/float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy     #calculate the info gain; ie reduction in entropy
        if (infoGain > bestInfoGain):       #compare this to the best gain so far
            bestInfoGain = infoGain         #if better than current best, set to best
            bestFeature = i
    return bestFeature                      #returns an integer
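
# Worked numbers for the sample data (rounded): the base entropy is 0.971 bits.
# Splitting on feature 0 leaves 0.551 bits (gain 0.420); splitting on feature 1
# leaves 0.800 bits (gain 0.171), so index 0 is returned as the best feature.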

def majorityCnt(classList):
    """出现次数最多的分类名称"""
    classCount={}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
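
# For reference, the standard library offers an equivalent one-liner:
#   from collections import Counter; Counter(classList).most_common(1)[0][0]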

def createTree(dataSet, labels):
    """
    构建决策数
    :param dataSet: 数据集
    :param labels: 标签列表
    :return: 树
    """
    print(dataSet)
    classList = [example[-1] for example in dataSet]  # 包含数据集的所有类标签
    if classList.count(classList[0]) == len(classList):  # all class labels identical
        return classList[0]  # stop splitting when all of the classes are equal
    if len(dataSet[0]) == 1: #stop splitting when there are no more features in dataSet
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of the best feature to split on
    bestFeatLabel = labels[bestFeat]  # label of the best feature
    myTree = {bestFeatLabel:{}}
    del(labels[bestFeat])  # remove the used label (this mutates the caller's list)
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        subLabels = labels[:]  # copy the labels so recursive calls don't clobber the caller's list
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
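
# Note: createTree deletes entries from the labels argument as features are
# consumed, so pass a copy (labels[:]) if the original list is needed later,
# e.g. for classify() below.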


def classify(inputTree, featLabels, testVec):
    """
    决策树分类
    :param inputTree: 决策树
    :param featLabels: 标签
    :param testVec:
    :return:
    """
    firstStr = list(inputTree)[0]   # 相当于list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        classLabel = classify(valueOfFeat, featLabels, testVec)
    else: classLabel = valueOfFeat
    return classLabel
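
# Minimal usage sketch (assumes labels was copied before createTree, since
# createTree mutates its labels argument):
#   myTree = createTree(dataSet, labels[:])
#   classify(myTree, labels, [1, 0])  # -> 'no'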

"""
 存储决策树
"""
def storeTree(inputTree, filename):
    import pickle
    with open(filename, 'wb') as fw:  # binary mode, as pickle requires
        pickle.dump(inputTree, fw)

def grabTree(filename):
    import pickle
    with open(filename, 'rb') as fr:  # context manager closes the file handle automatically
        return pickle.load(fr)
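
# Round-trip sketch (the filename is illustrative):
#   storeTree(myTree, 'classifierStorage.txt')
#   grabTree('classifierStorage.txt') == myTree  # -> True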


if __name__ == '__main__':
    # The higher the entropy, the more mixed the data
    dataSet, labels = createDataSet()
    # dataSet[0][-1] = 'maybe'
    # print(calcShannonEnt(dataSet))
    # print(splitDataSet(dataSet,1,0))
    # print(splitDataSet(dataSet,0,0))
    # print(chooseBestFeatureToSplit(dataSet))  # feature 0 is the best feature to split on

    """ 不浮出水面是否可以生存   是否有脚蹼   属于鱼类
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    
    按照第一个特征属性划分数据
        特征是1的:两个鱼类,一个不是鱼类
        特征是0的:都是鱼类
    按照第二个特征属性划分数据
        特征是1的:两个鱼类,两个不是鱼类
        特征是0的:都不是鱼类   
    比较得出第一种分组的输出结果较好 
    """
    # print(createTree(dataSet, labels))  # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
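
    # End-to-end sketch (uncomment to run): build the tree, classify, persist.
    # myTree = createTree(dataSet, labels[:])  # copy labels; createTree mutates them
    # print(classify(myTree, labels, [1, 1]))  # -> 'yes'
    # storeTree(myTree, 'tree.pkl'); print(grabTree('tree.pkl'))  # filename is illustrative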

Original post: https://www.cnblogs.com/fly-book/p/14210792.html