The "Watermelon Book" (Zhou Zhihua, Machine Learning), Chapter 4: Decision Trees

▶ Use a decision tree to classify scatter points. Much of the code on other blogs is copied straight from Machine Learning in Action, e.g. 【https://www.cnblogs.com/jishengyu/p/8274991.html】: no test function, no generalization test of the tree, no default value on the nodes, and the tree itself is simply wrong — once you increase the number of attributes, values per attribute, or classes, some samples can no longer be decided at all. A toy illustration of that failure mode is sketched below.
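
A minimal sketch of why a missing node default breaks decision (the nested-dict tree here is hypothetical, but it has the same shape as the trees built later in this post): if a subtree only ever saw attribute values 0 and 1 during training, a test sample carrying value 2 has no branch to follow, so every node needs a majority-vote fallback.

# Hypothetical two-leaf tree: attribute 'A' only took values 0 and 1 during training.
tree = {'A': {0: 'False', 1: 'True'}}
sample = {'A': 2}                                        # value never seen in training
# tree['A'][sample['A']] raises KeyError: the sample cannot be decided.
# Storing a majority-vote default at every node (what the code below does) fixes it:
tree = {'A': {0: 'False', 1: 'True', 'default': 'True'}}
print(tree['A'].get(sample['A'], tree['A']['default']))  # -> 'True'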

● A simple decision tree; old code. It gets the job done, but there is no pre- or post-pruning and no handling of continuous or missing values, X and Y are not separated (rows are mixed-type lists), the plogp function is inefficient, plus assorted issues that are not outright bugs — and somehow the comments got lost too. Pretty poor overall.

import numpy as np
import operator
import warnings

warnings.filterwarnings("ignore")
dataSize = 10000
trainRatio = 0.3

def dataSplit(data, part):                              # split the dataset at a given index
    return data[0:part], data[part:]

def createData(dim, option, kind, len):                 # note: 'len' shadows the builtin
    np.random.seed(103)
    temp = np.zeros([len,dim+1])

    for i in range(dim):
        temp[:,i] = np.random.randint(option, size = len)
    if kind == 2:                                       # both branches are identical, so 'kind' is effectively ignored
        temp[:,dim] = list(map(lambda x : int(x > 1), (3 - 2 * dim)*temp[:,0] + 2 * np.sum(temp[:,1:dim], 1)))
    else:
        temp[:,dim] = list(map(lambda x : int(x > 1), (3 - 2 * dim)*temp[:,0] + 2 * np.sum(temp[:,1:dim], 1)))

    output = [ line[0:dim].tolist() + [str(line[-1] == 1.0)] for line in temp ]     # mixed-type rows: float attributes plus a class string
    label = [ chr(i + 65) for i in range(dim) ]                                     # attribute names 'A', 'B', 'C', ...

    print("dim = %d, option = %d, kind = %d, dataSize = %d, class1Ratio = %4f"%(dim, option, kind, dataSize, (np.sum(temp[:,-1]) / len)))
    return output, label

def plogp(x, isScalar):                                 # x·log2(x) with nan mapped to 0; the isScalar flag makes this clumsy and slow
    output = x * np.log2(x)
    if isScalar:
        return [0 if np.isnan(output) else output][0]
    output[np.isnan(output)] = 0.0
    return output

def calculateGain(table, alpha = 0):                    # split criterion from the value-by-class frequency table
    sumC = np.sum(table, 0)                             # column sums (per class)
    sumR = np.sum(table, 1)                             # row sums (per attribute value)
    sumA = np.sum(sumC)                                 # grand total
    temp = -( np.sum(plogp(sumC,False)) - plogp(sumA,True) - np.sum(plogp(table,False)) + np.sum(plogp(sumR,False)) ) / sumA
    if alpha == 0:                                      # information gain
        return temp
    elif alpha == 1:                                    # gain ratio
        return temp * (-1.0 / np.sum(plogp(sumR / sumA,False)))
    else:                                               # reciprocal of the weighted Gini index; sum over classes is axis 1 (axis 0 misbroadcasts when value and class counts differ)
        return sumA / ( np.sum(sumR * (1 - np.sum(table * table, 1) / (sumR * sumR))) )

def chooseFeature(data,label):                          # pick the attribute with the largest gain
    size, dim = np.shape(data)
    dim -= 1
    maxGain = 0
    maxi = -1
    kindTable = list(set([ data[j][-1] for j in range(size) ]))
    for i in range(dim):
        valueTable = list(set([ data[j][i] for j in range(size) ]))
        table = np.zeros([len(valueTable),len(kindTable)])
        for line in data:
            table[valueTable.index(line[i]),kindTable.index(line[-1])] += 1
        gain = calculateGain(table)
        if (gain > maxGain):
            maxGain = gain
            maxi = i
    valueTable = list(set([ data[j][maxi] for j in range(size) ]))
    return (maxi, valueTable)

def vote(kindList):                                     # majority vote over a class list
    kindCount = {}
    for i in kindList:
        if i not in kindCount.keys():
            kindCount[i] = 0
        kindCount[i] += 1
    output = sorted(kindCount.items(),key=operator.itemgetter(1),reverse = True)
    return output[0][0]

def createTree(data,label):                             # build a decision tree from the current data
    if data == []:
        return '?'
    if len(data[0]) == 1:                               # attributes exhausted: vote on the remaining data
        return vote([ line[-1] for line in data ])
    classList = set([i[-1] for i in data])
    if len(classList) == 1:                             # all samples in one class: return that class
        return list(classList)[0]

    bestFeature, valueTable = chooseFeature(data, label)
    bestLabel = label[bestFeature]
    myTree = {bestLabel:{}}
    myTree[bestLabel]['Fuck'] = vote([ line[-1] for line in data])      # node default for unseen values

    subLabel = label[:bestFeature]+label[bestFeature + 1:]
    for value in valueTable:
        childData = []
        for line in data:
            if line[bestFeature] == value:
                childData.append(line[:bestFeature] + line[bestFeature + 1:])
        myTree[bestLabel][value] = createTree(childData, subLabel)
    return myTree

def test(dim, option, kind):
    allData, labelName = createData(dim, option, kind, dataSize)
    trainData, testData = dataSplit(allData, int(dataSize * trainRatio))
    outputTree = createTree(trainData, labelName)

    myResult = []

    for line in testData:
        tempTree = outputTree
        while(True):
            judgeName = list(tempTree)[0]               # attribute name at the current node
            judgeValue = list(tempTree.values())[0]     # branch table of the current node
            value = line[labelName.index(judgeName)]
            if value not in judgeValue :                # unseen value: fall back to the node default
                myResult.append(judgeValue['Fuck'])
                break
            resultNow = judgeValue[value]
            if type(resultNow) == type(allData[0][-1]): # reached a leaf (class string)
                myResult.append(resultNow)
                break
            tempTree = resultNow

    print("errorRatio = %4f"%( sum(map(lambda x,y:int(x!=y[-1]), myResult, testData)) / (dataSize*(1-trainRatio)) ))

if __name__=='__main__':
    test(3, 3, 2)
    test(4, 3, 2)
    test(5, 4, 2)
    test(6, 5, 2)

● The refactored code, usable as a template from now on.

import numpy as np
import operator
import warnings

warnings.filterwarnings("ignore")
dataSize = 10000
trainRatio = 0.3

def dataSplit(x, y, part):                                      # split the dataset at a given index
    return x[:part], y[:part], x[part:], y[part:]

def myColor(x):                                                 # color map used to tint the scatter points
    r = np.select([x < 1/2, x < 3/4, x <= 1, True],[0, 4 * x - 2, 1, 0])
    g = np.select([x < 1/4, x < 3/4, x <= 1, True],[4 * x, 1, 4 - 4 * x, 0])
    b = np.select([x < 1/4, x < 1/2, x <= 1, True],[1, 2 - 4 * x, 0, 0])
    return [r,g,b]

def createData(dim, option, kind, count = dataSize):            # build the dataset: attribute dimension, values per attribute, class count, sample count
    np.random.seed(103)
    X = np.random.randint(option, size = [count,dim])
    if kind == 2:
        Y = ((3 - 2 * dim)*X[:,0] + 2 * np.sum(X[:,1:], 1) > 0.5).astype(int)       # kept separate for easier inspection; could be folded into the else branch
    else:
        randomVector = np.random.rand(dim)
        randomVector /= np.sum(randomVector)
        Y = (np.sum(X * randomVector,1) * kind / option).astype(int)                # the classes come out rather unbalanced
    label = [ chr(i + 65) for i in range(dim) ]                                     # attribute names 'A', 'B', 'C', ...

    print("dim = %d, option = %d, kind = %d, dataSize = %d"%(dim, option, kind, count))
    kindCount = np.zeros(kind ,dtype = int)                                         # proportion of each class
    for i in range(count):
        kindCount[Y[i]] += 1
    for i in range(kind):
        print("kind %d -> %4f"%(i, kindCount[i]/count))
    return X, Y, label

def plogp(x):                                                                       # x·log2(x) for the entropy, with nan mapped to 0
    return np.nan_to_num( x * np.log2(x))

def calculateGain(table, alpha = 0):                                                # evaluate a split from its value-by-class frequency table
    sumC = np.sum(table, 0)                                                         # column sums (per class)
    sumR = np.sum(table, 1)                                                         # row sums (per attribute value)
    sumA = np.sum(sumC)                                                             # grand total
    if alpha == 0:                                                                  # information gain, simplified expression (derivation below)
        return -( np.sum(plogp(sumC)) + np.sum(plogp(sumR)) - np.sum(plogp(table)) - plogp(sumA) ) / sumA
    elif alpha == 1:                                                                # gain ratio = gain / intrinsic value (the original expression dropped the table term; fixed here)
        return ( np.sum(plogp(sumC)) + np.sum(plogp(sumR)) - np.sum(plogp(table)) - plogp(sumA) ) / ( np.sum(plogp(sumR)) - plogp(sumA) )
    else:                                                                           # reciprocal of the weighted Gini index (smaller Gini is better)
        return sumA / ( np.sum(sumR * (1 - np.sum(table * table, 1) / (sumR * sumR))) )    # sum over classes is axis 1 (axis 0 misbroadcasts when value and class counts differ)

def chooseFeature(dataX, dataY, label):                                             # choose the best attribute
    count, dim = np.shape(dataX)
    maxGain = 0
    maxi = -1
    kindTable = list(set(dataY))                                                    # class table
    for i in range(dim):
        valueTable = list(set([ dataX[j][i] for j in range(count) ]))               # value table of attribute i
        table = np.zeros([len(valueTable),len(kindTable)])                          # frequency table used to compute the entropy
        for j in range(count):
            table[valueTable.index(dataX[j][i]),kindTable.index(dataY[j])] += 1
        gain = calculateGain(table)                                                 # compute the gain and keep the best attribute
        if (gain > maxGain):
            maxGain = gain
            maxi = i
    valueTable = list(set([ dataX[j][maxi] for j in range(count) ]))
    return (maxi, valueTable)

def vote(kindList):                                                                 # majority vote over a class list
    kindCount = {}
    for i in kindList:
        if i not in kindCount.keys():
            kindCount[i] = 0
        kindCount[i] += 1
    output = sorted(kindCount.items(),key=operator.itemgetter(1),reverse = True)
    return output[0][0]

def createTree(dataX, dataY, label):                            # build a decision tree from the current data
    #if dataX == []:                                            # empty-data case; 'value' and 'valueTable' below keep us out of empty subtrees, so this branch is never entered
    #    return '?'
    if len(dataX[0]) == 0:                                      # attributes exhausted: vote on the remaining data
        return vote(dataY)
    if len(set(dataY)) == 1:                                    # all samples in one class: return that class
        return dataY[0]

    bestFeature, valueTable = chooseFeature(dataX, dataY, label)    # index and value table of the best attribute
    bestLabel = label[bestFeature]
    myTree = {bestLabel:{}}                                         # current node
    myTree[bestLabel]['Fuck'] = vote(dataY)                         # node default, used when a sample's value matches no subtree

    subLabel = label[:bestFeature] + label[bestFeature + 1:]        # remove the chosen attribute from the label table
    for value in valueTable:
        childX = []
        childY = []
        for i in range(len(dataX)):
            line = dataX[i]
            if line[bestFeature] == value:                                                  # partition the data by the best attribute's values and recurse
                childX.append(np.concatenate([line[:bestFeature],line[bestFeature + 1:]]))
                childY.append(dataY[i])
        myTree[bestLabel][value] = createTree(childX, childY, subLabel)
    return myTree                                                                           # return the current node

def test(dim, option, kind):
    allX, allY, labelName = createData(dim, option, kind)
    trainX, trainY, testX, testY = dataSplit(allX, allY, int(dataSize * trainRatio))
    outputTree = createTree(trainX, trainY, labelName)

    myResult = []                                                   # test results

    for line in testX:
        tempTree = outputTree                                       # walk the decision tree from the root
        while(True):
            judgeName = list(tempTree)[0]                           # attribute name at the current node
            judgeValue = list(tempTree.values())[0]                 # branch table of the current node
            value = line[labelName.index(judgeName)]                # the sample's value for this attribute
            if value not in judgeValue :                            # value not in the table: use the node default
                myResult.append(judgeValue['Fuck'])
                break
            resultNow = judgeValue[value]
            if type(resultNow) == type(testY[0]):                   # reached a leaf: its class is the prediction
                myResult.append(resultNow)
                break
            tempTree = resultNow

    errorRatio  = np.sum((np.array(myResult) != testY).astype(int)) / (dataSize*(1 - trainRatio)) # classification error rate
    print("errorRatio = %4f"%errorRatio )

if __name__=='__main__':
    test(1, 2, 2)
    test(1, 3, 2)
    test(2, 2, 2)
    test(2, 3, 2)
    test(3, 3, 2)
    test(4, 3, 2)
    test(5, 4, 2)
    test(6, 5, 2)
    test(3, 3, 3)
    test(4, 4, 3)
    test(5, 4, 4)
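
A quick sanity check of the three split criteria (a minimal sketch; the 2×2 frequency table is made up, and calculateGain / plogp are assumed to be in scope from the listing above):

import numpy as np

table = np.array([[8., 2.],                 # rows = attribute values, columns = classes
                  [3., 7.]])

print(calculateGain(table))                 # alpha = 0: information gain, ≈ 0.1912
print(calculateGain(table, alpha = 1))      # alpha = 1: gain ratio, also ≈ 0.1912 here, since the 10/10 row split gives IV = 1
print(calculateGain(table, alpha = 2))      # otherwise: reciprocal of the weighted Gini index, ≈ 1/0.37 ≈ 2.703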

● The frequency table used to compute the entropy (the figure itself is omitted here): the subscript indexes attribute values and the superscript indexes classes, n is the sample count for a given attribute value and class; sumC and sumR are the column and row sums respectively, and sumA is the grand total.
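
For reference, here is how the simplified one-line gain in calculateGain (alpha = 0) falls out of the textbook definition over this table; the derivation is mine, writing plogp(x) = x log2(x) and n_v^k for the entry at value v and class k:

\[
H(D) = -\sum_k \frac{sumC^k}{sumA}\log_2\frac{sumC^k}{sumA}
     = -\frac{1}{sumA}\Big(\sum_k \mathrm{plogp}(sumC^k) - \mathrm{plogp}(sumA)\Big)
\]
\[
\sum_v \frac{sumR_v}{sumA}\,H(D_v)
     = -\frac{1}{sumA}\Big(\sum_{v,k} \mathrm{plogp}(n_v^k) - \sum_v \mathrm{plogp}(sumR_v)\Big)
\]
\[
\mathrm{Gain} = H(D) - \sum_v \frac{sumR_v}{sumA}\,H(D_v)
     = -\frac{1}{sumA}\Big(\sum_k \mathrm{plogp}(sumC^k) + \sum_v \mathrm{plogp}(sumR_v) - \sum_{v,k} \mathrm{plogp}(n_v^k) - \mathrm{plogp}(sumA)\Big)
\]

which is exactly the alpha = 0 expression in the code.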

● The decision tree output with a total data size of 100 (indentation adjusted by hand for readability); each internal node is a dict of the form {attribute: {value: subtree or class, 'Fuck': default class}}:

{'0': {'Fuck': 'False',
          0.0: {'1': {'Fuck': 'True',
                         0.0: {'2': {'Fuck': 'True',
                                        0.0: {'3': {'Fuck': 'True',
                                                       0.0: 'False',
                                                       2.0: 'True'
                                                   }
                                             },
                                        1.0: 'True'
                                    }
                              },
                         1.0: 'True',
                         2.0: 'True'
                     }
               },
          1.0: {'1': {'Fuck': 'False',
                         0.0: 'False', 
                         2.0: 'True'
                     }
               },
          2.0: 'False'
      }
}
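
Reading it off: a sample with attribute values (0, 0, 1, ...) follows the branches 0.0 → 0.0 → 1.0 and is classified 'True', while a sample like (0, 0, 0, 1) reaches node '3', whose branch table only contains 0.0 and 2.0, and therefore falls back to that node's default 'True'.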

● Output from running the script directly; the higher the dimensionality, the higher the error rate becomes. The plots are all scatter plots on integer grid points, so they are not included.

dim = 1, option = 2, kind = 2, dataSize = 10000
kind 0 -> 0.498600
kind 1 -> 0.501400
errorRatio = 0.000000
dim = 1, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.339500
kind 1 -> 0.660500
errorRatio = 0.000000
dim = 2, option = 2, kind = 2, dataSize = 10000
kind 0 -> 0.497400
kind 1 -> 0.502600
errorRatio = 0.000000
dim = 2, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.446700
kind 1 -> 0.553300
errorRatio = 0.000000
dim = 3, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.444400
kind 1 -> 0.555600
errorRatio = 0.000000
dim = 4, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.452500
kind 1 -> 0.547500
errorRatio = 0.000000
dim = 5, option = 4, kind = 2, dataSize = 10000
kind 0 -> 0.468000
kind 1 -> 0.532000
errorRatio = 0.005857
dim = 6, option = 5, kind = 2, dataSize = 10000
kind 0 -> 0.464400
kind 1 -> 0.535600
errorRatio = 0.075286
dim = 3, option = 3, kind = 3, dataSize = 10000
kind 0 -> 0.485300
kind 1 -> 0.476600
kind 2 -> 0.038100
errorRatio = 0.000000
dim = 4, option = 4, kind = 3, dataSize = 10000
kind 0 -> 0.405300
kind 1 -> 0.568100
kind 2 -> 0.026600
errorRatio = 0.000000
dim = 5, option = 4, kind = 4, dataSize = 10000
kind 0 -> 0.207800
kind 1 -> 0.584400
kind 2 -> 0.207200
kind 3 -> 0.000600
errorRatio = 0.007571

● Calling a library instead: sklearn's tree module, wrapped so many layers deep you can hardly recognize what is inside. The data is generated the same way as in the code above, except that the kind == 2 special case is dropped and the random-vector labelling is always used, so for some two-class configurations the class ratios differ from the earlier output.

import numpy as np
import warnings
from sklearn import tree
from sklearn.metrics import classification_report

warnings.filterwarnings("ignore")
dataSize = 10000
trainRatio = 0.3

def test(dim, option, kind, count = dataSize):
    np.random.seed(103)
    X = np.random.randint(option, size = [count,dim])
    randomVector = np.random.rand(dim)
    randomVector /= np.sum(randomVector)
    Y = (np.sum(X * randomVector,1) * kind / option).astype(int)    # the classes come out rather unbalanced
    print("dim = %d, option = %d, kind = %d, dataSize = %d"%(dim, option, kind, count))
    kindCount = np.zeros(kind ,dtype = int)                         # proportion of each class
    for i in range(count):
        kindCount[Y[i]] += 1
    for i in range(kind):
        print("kind %d -> %4f"%(i, kindCount[i]/count))

    trainSize = int(dataSize * trainRatio)
    testSize = dataSize - trainSize

    trainX = X[:trainSize]
    trainY = Y[:trainSize]
    clf = tree.DecisionTreeClassifier()                             # create the decision tree
    clf = clf.fit(trainX, trainY)                                   # train

    testX = X[trainSize:]
    testY = Y[trainSize:]
    myResult = clf.predict(testX)                                   # classify the test data

    print("errorRatio = %4f, accuracy = %4f"%( sum(map(lambda x,y:int(x!=y), myResult, testY)) / testSize, clf.score(testX,testY) ))
                                                                    # my error rate and sklearn's built-in accuracy; the two always sum to 1
    print(classification_report(myResult,testY,target_names = [ chr(i + 65) for i in range(kind) ]))
                                                                    # sklearn's built-in report; note the arguments here are (y_pred, y_true), while sklearn's convention is (y_true, y_pred)
if __name__=='__main__':
    test(1, 2, 2)
    test(1, 3, 2)
    test(2, 2, 2)
    test(2, 3, 2)
    test(3, 3, 2)
    test(4, 3, 2)
    test(5, 4, 2)
    test(6, 5, 2)
    test(3, 3, 3)
    test(4, 4, 3)
    test(5, 4, 4)
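
To inspect the fitted tree itself rather than just its scores, sklearn can also dump it as text (a small sketch; export_text requires sklearn 0.21+, and the feature names below are just the same 'A', 'B', ... labels used earlier):

from sklearn.tree import export_text

# after clf.fit(trainX, trainY) inside test():
print(export_text(clf, feature_names = [ chr(i + 65) for i in range(dim) ]))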

● The output; it feels faster than my implementation, and the accuracy is also somewhat higher.

dim = 1, option = 2, kind = 2, dataSize = 10000
kind 0 -> 0.498600
kind 1 -> 0.501400
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      3496
           B       1.00      1.00      1.00      3504

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 1, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.672700
kind 1 -> 0.327300
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      4739
           B       1.00      1.00      1.00      2261

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 2, option = 2, kind = 2, dataSize = 10000
kind 0 -> 0.747900
kind 1 -> 0.252100
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      5207
           B       1.00      1.00      1.00      1793

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 2, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.666800
kind 1 -> 0.333200
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      4652
           B       1.00      1.00      1.00      2348

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 3, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.776400
kind 1 -> 0.223600
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      5448
           B       1.00      1.00      1.00      1552

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 4, option = 3, kind = 2, dataSize = 10000
kind 0 -> 0.853400
kind 1 -> 0.146600
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      5950
           B       1.00      1.00      1.00      1050

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 5, option = 4, kind = 2, dataSize = 10000
kind 0 -> 0.792200
kind 1 -> 0.207800
errorRatio = 0.003429, accuracy = 0.996571
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      5560
           B       0.99      0.99      0.99      1440

    accuracy                           1.00      7000
   macro avg       0.99      1.00      0.99      7000
weighted avg       1.00      1.00      1.00      7000

dim = 6, option = 5, kind = 2, dataSize = 10000
kind 0 -> 0.775900
kind 1 -> 0.224100
errorRatio = 0.065000, accuracy = 0.935000
              precision    recall  f1-score   support

           A       0.96      0.96      0.96      5447
           B       0.85      0.86      0.85      1553

    accuracy                           0.94      7000
   macro avg       0.90      0.91      0.91      7000
weighted avg       0.94      0.94      0.94      7000

dim = 3, option = 3, kind = 3, dataSize = 10000
kind 0 -> 0.485300
kind 1 -> 0.476600
kind 2 -> 0.038100
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      3403
           B       1.00      1.00      1.00      3329
           C       1.00      1.00      1.00       268

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 4, option = 4, kind = 3, dataSize = 10000
kind 0 -> 0.405300
kind 1 -> 0.568100
kind 2 -> 0.026600
errorRatio = 0.000000, accuracy = 1.000000
              precision    recall  f1-score   support

           A       1.00      1.00      1.00      2840
           B       1.00      1.00      1.00      3967
           C       1.00      1.00      1.00       193

    accuracy                           1.00      7000
   macro avg       1.00      1.00      1.00      7000
weighted avg       1.00      1.00      1.00      7000

dim = 5, option = 4, kind = 4, dataSize = 10000
kind 0 -> 0.207800
kind 1 -> 0.584400
kind 2 -> 0.207200
kind 3 -> 0.000600
errorRatio = 0.005714, accuracy = 0.994286
              precision    recall  f1-score   support

           A       0.99      1.00      0.99      1447
           B       1.00      0.99      1.00      4113
           C       0.99      0.99      0.99      1438
           D       1.00      1.00      1.00         2

    accuracy                           0.99      7000
   macro avg       0.99      1.00      1.00      7000
weighted avg       0.99      0.99      0.99      7000
Source: https://www.cnblogs.com/cuancuancuanhao/p/11120912.html