神经网络6：LSTM 神经网络

▶ 循环神经网络

● 代码，参考【https://zybuluo.com/hanbingtao/note/581764】，这里主要实现了一个单层 LSTM 神经网络类 LstmLayer，包含前向计算和后向计算。

■ 原代码有多处错误，包括但不限于：Line 106/110/130：传入的 activator 根本没用到；Line 254：delta_f_list 应为 delta_i_list；Line 256：delta_f_list 应为 delta_o_list。已在 GitHub 上反馈给原作者，不确定是否已经修正。

■ 原代码有大量低效的循环和函数调用，这里对数据结构进行了调整并做了向量化。例如：类 LstmLayer 的构造函数要求传入样本数，以便一次性申请所有状态向量和矩阵的内存，而不是每输入一个样本就在各状态向量和矩阵后追加一行；改进了几乎所有向量的存储方式，降低了各矩阵的维数，方便理解和调试；优化了训练部分 delta 和梯度的计算，使用向量外积和点乘来减少 for 循环和求和；添加了输出函数，方便查看所构建神经网络的各参数状态。

  1 import numpy as np
  2 
  3 global_epsilon = 1e-3
  4 global_ita = 0.2
  5 #np.random.seed(107)
  6 
  7 class SigmoidActivator(object):                                     # 两个激活函数
  8     def forward(self, weighted_input):
  9         return 1.0 / (1.0 + np.exp(-weighted_input))
 10     
 11     def backward(self, output):
 12         return output * (1 - output)
 13 
 14 class TanhActivator(object):
 15     def forward(self, weighted_input):
 16         return 2.0 / (1.0 + np.exp(-2 * weighted_input)) - 1.0
 17     
 18     def backward(self, output):
 19         return 1 - output * output
 20 
 21 class LstmLayer(object):
 22     def __init__(self, sCol, dCol, nSample, ita = global_ita):      # 构造函数要求传入样本数，以申请足够的内存
 23         self.sCol = sCol
 24         self.dCol = dCol
 25         self.ita = ita
 26         self.nSample = nSample       
 27         self.time = 0
 28         self.gActivator = SigmoidActivator()                        # 中间层激活函数
 29         self.dActivator = TanhActivator()                           # 输出层激活函数
 30         
 31         self.f =  np.zeros((self.nSample + 1, self.dCol))           # 初始化状态向量，第 0 行永远为 0，方便递推计算
 32         self.i =  np.zeros((self.nSample + 1, self.dCol))
 33         self.ct = np.zeros((self.nSample + 1, self.dCol))
 34         self.c =  np.zeros((self.nSample + 1, self.dCol))
 35         self.o =  np.zeros((self.nSample + 1, self.dCol))        
 36         self.h =  np.zeros((self.nSample + 1, self.dCol))        
 37         self.Wfh, self.Wfx, self.bf =  self.weightMatrix()        
 38         self.Wih, self.Wix, self.bi =  self.weightMatrix()        
 39         self.Wch, self.Wcx, self.bct = self.weightMatrix()
 40         self.Woh, self.Wox, self.bo =  self.weightMatrix()                
 41 
 42     def weightMatrix(self):                                         # 初始化权重矩阵
 43         Wh = np.random.uniform(-1, 1,(self.dCol, self.dCol))
 44         Wx = np.random.uniform(-1, 1,(self.dCol, self.sCol))
 45         b = np.zeros(self.dCol)                                    
 46         return Wh, Wx, b
 47 
 48     def forward(self, x):                                           # 前向计算
 49         self.time += 1
 50         tt = self.time
 51         self.f[tt] =  self.gActivator.forward(np.dot(self.Wfh, self.h[tt - 1]) + np.dot(self.Wfx, x) + self.bf)                        
 52         self.i[tt] =  self.gActivator.forward(np.dot(self.Wih, self.h[tt - 1]) + np.dot(self.Wix, x) + self.bi)
 53         self.ct[tt] = self.dActivator.forward(np.dot(self.Wch, self.h[tt - 1]) + np.dot(self.Wcx, x) + self.bct)     # 注意 ct 门不一样  
 54         self.c[tt] =  self.f[tt] * self.c[tt - 1] + self.i[tt] * self.ct[tt]
 55         self.o[tt] =  self.gActivator.forward(np.dot(self.Woh, self.h[tt - 1]) + np.dot(self.Wox, x) + self.bo)
 56         self.h[tt] =  self.o[tt] * self.dActivator.forward(self.c[tt])
 57 
 58     def backward(self, x, deltaNextLayer):                          # 后向计算        
 59         self.deltaF =  np.zeros((self.time + 1, self.dCol))         # 计算误差项部分，初始化各误差项        
 60         self.deltaI =  np.zeros((self.time + 1, self.dCol))  
 61         self.deltaO =  np.zeros((self.time + 1, self.dCol))  
 62         self.deltaCt = np.zeros((self.time + 1, self.dCol))         
 63         self.deltaH =  np.zeros((self.time + 1, self.dCol))         # deltaH 表示输出的误差项（用于分解给个状态向量）
 64         self.deltaH[-1] = deltaNextLayer                            # 上一层传递来的误差项                
 65         for tt in range(self.time, 0, -1):                          # 同层倒序传递
 66             f =  self.f[tt]
 67             i =  self.i[tt]            
 68             ct = self.ct[tt]
 69             o =  self.o[tt]                                   
 70             h =  self.deltaH[tt]
 71             cPre =         self.c[tt-1]
 72             tanhC =        self.dActivator.forward(self.c[tt])
 73             inverseTanhC = self.dActivator.backward(tanhC)                       
 74             
 75             self.deltaF[tt] =  h * o * inverseTanhC * cPre * self.gActivator.backward(f)                        # 用本层总 dealta 计算各状态向量的 delta
 76             self.deltaI[tt] =  h * o * inverseTanhC * ct   * self.gActivator.backward(i)
 77             self.deltaCt[tt] = h * o * inverseTanhC * i    * self.dActivator.backward(ct)
 78             self.deltaO[tt] =  h     * tanhC               * self.gActivator.backward(o)            
 79             self.deltaH[tt-1] = np.dot(self.deltaO[tt], self.Woh) + np.dot(self.deltaI[tt], self.Wih) + 
 80                                 np.dot(self.deltaF[tt], self.Wfh) + np.dot(self.deltaCt[tt], self.Wch)          # 用本层个状态向量 delta 计算上层总 delta，这里假设上一层输出即为本层输入，括号外没有偏导数项
 81                                         
 82         self.WfhGrad = np.sum(np.array([ np.outer(self.deltaF[1+i], self.h[i]) for i in range(self.time) ]), 0) # 求各状态向量梯度，使用外积完全向量化，类似于张量缩并
 83         self.WihGrad = np.sum(np.array([ np.outer(self.deltaI[1+i], self.h[i]) for i in range(self.time) ]), 0)
 84         self.WohGrad = np.sum(np.array([ np.outer(self.deltaO[1+i], self.h[i]) for i in range(self.time) ]), 0)
 85         self.WchGrad = np.sum(np.array([ np.outer(self.deltaCt[1+i],self.h[i]) for i in range(self.time) ]), 0)
 86                     
 87         self.bfGrad = np.sum(self.deltaF[1:1+self.time], 0)         # 求 b 的梯度，相当于上式没有乘法部分
 88         self.biGrad = np.sum(self.deltaI[1:1+self.time], 0)
 89         self.boGrad = np.sum(self.deltaO[1:1+self.time], 0)
 90         self.bcGrad = np.sum(self.deltaCt[1:1+self.time],0)
 91                                                        
 92         self.WfxGrad = np.outer(self.deltaF[-1], x)                 # 计算 Wx 的梯度         
 93         self.WixGrad = np.outer(self.deltaI[-1], x)
 94         self.WoxGrad = np.outer(self.deltaO[-1], x)
 95         self.WcxGrad = np.outer(self.deltaCt[-1],x)
 96     
 97     def update(self):                                               # 更新权重
 98         self.Wfh -= self.ita * self.WhfGrad
 99         self.Wfx -= self.ita * self.WhxGrad
100         self.bf -=  self.ita * self.bfGrad
101         self.Wih -= self.ita * self.WhiGrad
102         self.Wix -= self.ita * self.WhiGrad
103         self.bi -=  self.ita * self.biGrad
104         self.Woh -= self.ita * self.WofGrad
105         self.Wox -= self.ita * self.WoxGrad
106         self.bo -=  self.ita * self.boGrad
107         self.Wch -= self.ita * self.WcfGrad
108         self.Wcx -= self.ita * self.WcxGrad
109         self.bct -= self.ita * self.bcGrad            
110     
111     def reset(self):                                                # 重置各状态向量
112         self.time = 0                               
113         self.f = np.zeros((self.nSample + 1,self.dCol))
114         self.i = np.zeros((self.nSample + 1,self.dCol))        
115         self.ct = np.zeros((self.nSample + 1,self.dCol))
116         self.c = np.zeros((self.nSample + 1,self.dCol))
117         self.o = np.zeros((self.nSample + 1,self.dCol))
118         self.h = np.zeros((self.nSample + 1,self.dCol))
119                                                                     
120     def printLstmLayer(self):                                       # 输出本层神经网络的所有参数
121         print("sCol = %d, dCol = %d, ita = %d, nSample = %d, time = %d"%(self.sCol, self.dCol, self.ita, self.nSample, self.time))        
122         print("f=
", self.f, "
i=
", self.i, "
ct=
", self.ct, "
c=
", self.c, "
o=
", self.o, "
h=
", self.h)
123         print("Wfh=
", self.Wfh, "
Wfx=
", self.Wfx, "
bf=
", self.bf)
124         print("Wih=
", self.Wih, "
Wix=
", self.Wix, "
bi=
", self.bi)
125         print("Wch=
", self.Wch, "
Wcx=
", self.Wcx, "
bc=
", self.bct)
126         print("Woh=
", self.Woh, "
Wox=
", self.Wox, "
bo=
", self.bo)
127         
128         print("deltaF=
", self.deltaF, "
deltaI=
", self.deltaI, "
deltaO=
", self.deltaO, "
deltaCt=
", self.deltaCt, "
deltaH=
", self.deltaH)        
129         print("WfhGrad=
", self.WfhGrad, "
WfxGrad=
", self.WfxGrad, "
bfGrad=
", self.bfGrad)              
130         print("WihGrad=
", self.WihGrad, "
WixGrad=
", self.WixGrad, "
biGrad=
", self.biGrad)
131         print("WohGrad=
", self.WohGrad, "
WoxGrad=
", self.WoxGrad, "
boGrad=
", self.boGrad)
132         print("WchGrad=
", self.WchGrad, "
WcxGrad=
", self.WcxGrad, "
bcGrad=
", self.bcGrad)        
133         
def createTestData():
    """Return the fixed toy data: a list of two length-3 input vectors and one length-2 vector.

    test() passes the second return value to LstmLayer.backward() as the
    incoming error term.
    """
    rows = ([1, 2, 3], [2, 3, 4])
    samples = [np.array(row) for row in rows]
    errorTerm = np.array([1, 2])
    return samples, errorTerm
138 
def test():
    """Run both toy samples through a fresh layer, back-propagate once and print everything."""
    layer = LstmLayer(3, 2, 2)                                      # input size 3, output size 2, two time steps, default learning rate
    samples, errorTerm = createTestData()
    for sample in samples:                                          # feed the sequence in order
        layer.forward(sample)
    layer.backward(samples[1], errorTerm)
    layer.printLstmLayer()
145 
def gradCheck(epsilon = global_epsilon):
    """Compare the analytic Wfh gradient with a central finite difference and print both.

    The scalar being differentiated is the sum of the last output vector h,
    which is why backward() is given an all-ones error term.
    """
    lstm = LstmLayer(3, 2, 2, epsilon)
    s, d = createTestData()

    def runSequence():
        # Feed both samples in order into the current state of the layer.
        for sample in s:
            lstm.forward(sample)

    def summedOutput():
        # Restart from zero state, rerun the sequence, return sum(h) of the last step.
        lstm.reset()
        runSequence()
        return np.sum(lstm.h[-1])

    runSequence()
    lstm.backward(s[1], np.ones(lstm.h[-1].shape, dtype=np.float64))   # reference (analytic) gradient

    nRow, nCol = lstm.Wfh.shape
    for flat in range(nRow * nCol):                                 # row-major walk over every Wfh entry
        i, j = divmod(flat, nCol)
        lstm.Wfh[i, j] += epsilon
        err1 = summedOutput()
        lstm.Wfh[i, j] -= 2 * epsilon
        err2 = summedOutput()
        lstm.Wfh[i, j] += epsilon                                   # restore the original weight
        numeric = (err1 - err2) / (2 * epsilon)
        print('weights(%d,%d): expected <-> actural %.4e <-> %.4e' % (i, j, numeric, lstm.WfhGrad[i, j]))
163 
if __name__ == "__main__":
    # Script entry point: print the layer dump first, then the gradient check.
    for entryPoint in (test, gradCheck):
        entryPoint()

● 输出结果

sCol = 3, dCol = 2, ita = 0, nSample = 2, time = 2
f=
 [[0.         0.        ]
 [0.08364582 0.64637889]
 [0.04687823 0.75054254]]
i=
 [[0.         0.        ]
 [0.64994358 0.11231909]
 [0.71495984 0.07056953]]
ct=
 [[ 0.          0.        ]
 [-0.77851627 -0.99959223]
 [-0.75373868 -0.99999457]]
c=
 [[ 0.          0.        ]
 [-0.50599165 -0.11227329]
 [-0.56261288 -0.15483503]]
o=
 [[0.         0.        ]
 [0.05723445 0.06130245]
 [0.0173681  0.01644683]]
h=
 [[ 0.          0.        ]
 [-0.02671797 -0.00685385]
 [-0.00885623 -0.00252639]]
Wfh=
 [[-0.44650757 -0.34150997]
 [ 0.1461234   0.7320657 ]]
Wfx=
 [[ 0.61845573 -0.74104458 -0.51005937]
 [ 0.50410244 -0.08955573  0.09272295]]
bf=
 [0. 0.]
Wih=
 [[ 0.05587383 -0.25802153]
 [-0.73662134 -0.25832213]]
Wix=
 [[ 0.33294318 -0.38308928  0.35067554]
 [ 0.03109526  0.40860802 -0.97185992]]
bi=
 [0. 0.]
Wch=
 [[-0.16803787 -0.149016  ]
 [-0.68550217  0.24428858]]
Wcx=
 [[ 0.29142476  0.62232088 -0.85921977]
 [-0.81363189 -0.65205061 -0.71037887]]
bc=
 [0. 0.]
Woh=
 [[-0.09910883 -0.49439315]
 [-0.90781981  0.44788208]]
Wox=
 [[-0.23362093 -0.45101893 -0.55533428]
 [-0.88301662  0.34405375 -0.84458816]]
bo=
 [0. 0.]
deltaF=
 [[ 0.          0.        ]
 [ 0.          0.        ]
 [-0.00029056 -0.00067513]]
deltaI=
 [[ 0.00000000e+00  0.00000000e+00]
 [-4.89958523e-05 -1.29338011e-05]
 [-1.97417511e-03 -2.10655809e-03]]
deltaO=
 [[ 0.00000000e+00  0.00000000e+00]
 [-1.55659244e-04 -1.37923566e-05]
 [-8.70241385e-03 -4.96967325e-03]]
deltaCt=
 [[0.00000000e+00 0.00000000e+00]
 [7.08195651e-05 1.18852402e-08]
 [3.96844066e-03 2.46176389e-08]]
deltaH=
 [[2.28293924e-05 7.62122414e-05]
 [6.17970530e-03 2.14376899e-03]
 [1.00000000e+00 2.00000000e+00]]
WfhGrad=
 [[7.76324944e-06 1.99147518e-06]
 [1.80382086e-05 4.62726915e-06]]
WfxGrad=
 [[-0.00058113 -0.00087169 -0.00116225]
 [-0.00135027 -0.0020254  -0.00270054]]
bfGrad=
 [-0.00029056 -0.00067513]
WihGrad=
 [[5.27459502e-05 1.35307066e-05]
 [5.62829545e-05 1.44380401e-05]]
WixGrad=
 [[-0.00394835 -0.00592253 -0.0078967 ]
 [-0.00421312 -0.00631967 -0.00842623]]
biGrad=
 [-0.00202317 -0.00211949]
WohGrad=
 [[2.32510827e-04 5.96450681e-05]
 [1.32779578e-04 3.40614115e-05]]
WoxGrad=
 [[-0.01740483 -0.02610724 -0.03480966]
 [-0.00993935 -0.01490902 -0.01987869]]
boGrad=
 [-0.00885807 -0.00498347]
WchGrad=
 [[-1.06028676e-04 -2.71991102e-05]
 [-6.57733325e-10 -1.68725686e-10]]
WcxGrad=
 [[7.93688131e-03 1.19053220e-02 1.58737626e-02]
 [4.92352779e-08 7.38529168e-08 9.84705558e-08]]
bcGrad=
 [4.03926022e-03 3.65028791e-08]
weights(0,0): expected <-> actural 1.4570e-02 <-> 1.4570e-02
weights(0,1): expected <-> actural -2.4253e-02 <-> -2.4253e-02
weights(1,0): expected <-> actural -5.2460e-03 <-> -5.2460e-03
weights(1,1): expected <-> actural 8.7327e-03 <-> 8.7327e-03