Statistical Learning Methods (《统计学习方法》), Chapter 8: Boosting Methods, Part 2

▶ A boosting tree for regression fits a piecewise-constant function. In each round, pick a split point and the constant estimates on its two sides that minimize the total squared residual of the training data (the sum of the two sides' sums of squared residuals); then use the current residuals as the targets for the next round. The final regression function is the sum of the piecewise functions obtained in all rounds; a minimal sketch of one round follows below.
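
Before the full program, here is a minimal sketch of a single round on toy data (fitStump and the toy arrays are illustrative, not taken from the code below): try every split position on sorted data, keep the one that minimizes the sum of the two sides' squared residuals, then pass the residuals on as the next round's targets.

import numpy as np

x = np.array([0.1, 0.3, 0.5, 0.7, 0.9])                                         # toy data, already sorted by x
y = np.array([0.2, 0.1, 0.5, 0.8, 1.0])

def fitStump(x, y):                                                             # one boosting round: fit the best single-split stump
    best = None
    for j in range(len(x) - 1):                                                 # candidate split between x[j] and x[j+1]
        cL, cR = y[:j+1].mean(), y[j+1:].mean()                                 # side means minimize each side's squared residuals
        loss = ((y[:j+1] - cL)**2).sum() + ((y[j+1:] - cR)**2).sum()
        if best is None or loss < best[0]:
            best = (loss, x[j], cL, cR)
    return best

loss, split, cL, cR = fitStump(x, y)
residual = y - np.where(x <= split, cL, cR)                                     # targets for the next round
print(split, cL, cR, residual)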

● Code

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

trainDataRatio = 0.3
dataSize = 1000
defaultEpsilon = 0.05                                                           # minimum interval width required for a split
randomSeed = 103

def myColor(x):                                                                 # color helper: map a scalar in [0,1] to an RGB triple
    r = np.select([x < 1/2, x < 3/4, x <= 1, True], [0, 4 * x - 2, 1, 0])
    g = np.select([x < 1/4, x < 3/4, x <= 1, True], [4 * x, 1, 4 - 4 * x, 0])
    b = np.select([x < 1/4, x < 1/2, x <= 1, True], [1, 2 - 4 * x, 0, 0])
    return [r**2, g**2, b**2]

def dataSplit(dataX, dataY, part):                                              # split the dataset into training and test sets
    return dataX[:part], dataY[:part], dataX[part:], dataY[part:]

def judgeStrong(x, para):                                                       # strong learner: look up the summed value of the segment that x falls into
    return para[1][targetIndex(x, para[0])]

def targetIndex(x, xList):                                                      # binary search for the smallest index with xList[index] >= x
    lp = -1                                                                     # start at -1 so that index 0 is reachable when x <= xList[0]
    rp = len(xList) - 1                                                         # xList ends with np.inf, so xList[rp] >= x always holds
    while lp < rp - 1:
        mp = (lp + rp) >> 1
        if xList[mp] >= x:
            rp = mp
        else:
            lp = mp
    return rp

def createData(count = dataSize):                                               # create data; Y is a noise-free cubic function of X
    np.random.seed(randomSeed)
    X = np.random.rand(count)
    Y = X * (32 / 3 * (X - 1) * (X - 1/2) + 1)
    return X, Y

def adaBoost(dataX, dataY, weakCount):                                          # boosting-tree training for regression
    count = len(dataX)
    xSort, ySort = zip( *sorted(zip(dataX, dataY)) )
    xSort = np.array(xSort)
    ySort = np.array(ySort)

    result = np.zeros([weakCount, 3])
    for i in range(weakCount):
        table = np.zeros(count - 1)
        for j in range(count - 1):                                              # find the best split; the loss is the sum of the two sides' squared residuals
            #table[j] = np.sum( (ySort[:j+1] - np.mean(ySort[:j+1]))**2 ) + np.sum( (ySort[j+1:] - np.mean(ySort[j+1:]))**2 )
            table[j] = -np.sum(ySort[:j+1])**2 / (j+1) - np.sum(ySort[j+1:])**2 / (count - j - 1)    # equivalent to the line above but cheaper, see the note below
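            # why the two lines agree: with n points of mean m on one side,
            # sum((y - m)**2) = sum(y**2) - sum(y)**2 / n, and sum(y**2) over all
            # points does not depend on the split, so minimizing the total squared
            # residual is the same as minimizing -sum(yLeft)**2/nLeft - sum(yRight)**2/nRight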
        index = np.argmin(table)

        valueLeft = np.mean(ySort[:index+1])                                    # estimated values on the two sides
        valueRight = np.mean(ySort[index+1:])
        result[i] = np.array([ xSort[index], valueLeft, valueRight ])           # store this round: split at xSort[index], with valueLeft / valueRight on the two sides
        ySort[:index+1] -= valueLeft                                            # new residuals on the two sides
        ySort[index+1:] -= valueRight

    result = np.array(sorted(result, key=lambda x: x[0]))
    #L = np.triu( np.tile(result[:,1], [weakCount+1,1]), 0 )                         # summing the segments gives the final regressor; note there are weakCount + 1 segments
    #R = np.tril( np.tile(result[:,2], [weakCount+1,1]), -1)                         # row i equals r[0] + r[1] + ... + r[i-1] + l[i] + l[i+1] + ... + l[n-1], the value of the i-th segment
    #return np.concatenate(( result[:,0], np.array([np.inf]) )), np.sum(L + R, 1)    # row 0 is np.sum(result[:,1]) and row weakCount is np.sum(result[:,2])
    L = np.concatenate((                  np.cumsum(result[:,1][::-1])[::-1], np.array([0.0]) )) # the same result with less computation and memory
    R = np.concatenate(( np.array([0.0]), np.cumsum(result[:,2]) ))
    return np.concatenate(( result[:,0], np.array([np.inf]) )), L + R
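    # e.g. with weakCount = 2 and sorted splits s0 < s1, the returned value table is
    # [l0 + l1, r0 + l1, r0 + r1]: segment i sums the right values of the stumps whose
    # split lies to its left and the left values of all remaining stumps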

def test(weakCount):                                                            # single test run
    allX, allY = createData()
    trainX, trainY, testX, testY = dataSplit(allX, allY, int(dataSize * trainDataRatio))

    para = adaBoost(trainX, trainY, weakCount)

    testResult = [ judgeStrong(i, para) for i in testX ]
    errorRatio = np.sum( np.abs(np.array(testResult) - testY) ) / (dataSize * (1 - trainDataRatio))    # mean absolute error on the test set
    print( "weakCount = %d, errorRatio = %f" % (weakCount, round(errorRatio, 4)) )

    fig = plt.figure(figsize=(10, 8))                                           # plot
    plt.xlim(-0.1, 1.1)
    plt.ylim(-0.1, 1.1)
    XX = [0.0] + para[0][:-1].flatten().tolist() + [1.0]                        # segment boundaries
    YY = [ judgeStrong(i, para) for i in XX ]
    plt.scatter(testX, testY, color = myColor(1), s = 8, label = "originalData")
    for i in range(weakCount + 1):
        plt.plot(XX[i:i+2], [YY[i+1], YY[i+1]], color = myColor(0), label = "regressionData")
    plt.text(0.1, 1.0, "weakCount = " + str(weakCount) + "\nerrorRatio = " + str(round(errorRatio, 4)),
        size = 15, ha = "center", va = "center", bbox = dict(boxstyle = "round", ec = (1., 0.5, 0.5), fc = (1., 1., 1.)))
    R = [ Rectangle((0,0), 0, 0, color = myColor(1 - i)) for i in range(2) ]    # proxy patches for the legend
    plt.legend(R, ["originalData", "regressionData"], loc = [0.79, 0.012], ncol = 1, numpoints = 1, framealpha = 1)

    fig.savefig(r"R:\weakCount" + str(weakCount) + ".png")
    plt.close()

if __name__ == '__main__':
    test(1)
    test(2)
    test(3)
    test(4)
    test(20)
    test(100)
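
As a cross-check (an illustrative sketch, not part of the original program): the boosted model is just the sum of the weakCount stumps, so evaluating the stumps one by one must agree with the per-segment table that adaBoost precomputes. predictNaive and the two stumps below are hypothetical, made up for illustration:

import numpy as np

def predictNaive(x, stumps):                                                    # stumps: rows of (split, valueLeft, valueRight)
    return sum( (l if x <= s else r) for s, l, r in stumps )

stumps = [(0.4, 0.1, 0.6), (0.7, 0.2, 0.5)]                                     # two hypothetical stumps, splits sorted
splits = [0.4, 0.7, np.inf]                                                     # what adaBoost would return as para[0]
values = [0.1 + 0.2, 0.6 + 0.2, 0.6 + 0.5]                                      # what adaBoost would return as para[1]
for x in [0.2, 0.5, 0.9]:
    i = min( j for j in range(3) if splits[j] >= x )                            # same segment lookup as targetIndex
    assert abs( predictNaive(x, stumps) - values[i] ) < 1e-12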

● Output: the number of weak learners in each run and the resulting prediction error (mean absolute error over the test set)

weakCount = 1, errorRatio = 0.184200
weakCount = 2, errorRatio = 0.142200
weakCount = 3, errorRatio = 0.115000
weakCount = 4, errorRatio = 0.102500
weakCount = 20, errorRatio = 0.035300
weakCount = 100, errorRatio = 0.016000

● Plots: 1000 samples in total, 300 for training and 700 for testing

● Plots: when the total sample size drops to 500 (150 training samples), does overfitting appear? With so few training points, each constant segment is fit to sparse samples, so regions between training points can be estimated poorly, which shows up as apparent overfitting.

Original post: https://www.cnblogs.com/cuancuancuanhao/p/11302771.html