博雅大数据机器学习十讲第七讲

\(batch\)和\(mini-batch\)梯度下降

\(batch\)梯度下降：使用全部训练集样本，计算代价太高（\(n\sim10^6\)）
\(mini-batch\)梯度下降：随机采样一个子集（\(m\sim10^2\) 或 \(10^3\)）

\[ \nabla J(\theta)=\frac{1}{m}\sum^m_{j=1}\nabla L(f(x_j;\theta_t),y_j) \]

\(mini-batch\)是无偏估计（\(Robbins\,\&\,Monro\,1951\)）
计算和统计原因：
- 更大的批量会减小梯度计算的方差
训练一个样本，更新一次参数
\(mini-batch\)是\(SGD\)的推广，通常所说的\(SGD\)即是\(mini-batch\)
学习率的设定是关键！
理论上保证\(SGD\)收敛的充分条件为

\[ \sum^\infty_{t=1}\eta_t=\infty,\quad \sum^\infty_{t=1}\eta^2_t<\infty \]

需要随着迭代次数的增加降低学习率
病态条件（\(ill\,\,conditioning\)）
- 不同方向有不同的梯度
- 学习率的选择困难
鞍点（\(saddle\,points\)）
- 梯度为0，\(Hessian\)矩阵同时存在正特征值和负特征值
- \(Hessian\)矩阵的所有特征值为正值的概率很低
- 对于高维情况，鞍点和局部最小点的数量多
使用二阶优化算法会有问题
平台（\(plateaus\)）
- 梯度为0，\(Hessian\)矩阵也为0
加入噪音使得从平台区域跳出
梯度爆炸和悬崖（\(explosion\,and\,cliffs\)）
- 在\(RNN\)中非常常见，参数不断相乘导致
- 长期时间依赖性
解决办法：梯度截断（\(gradient\,clipping\)），即通过启发式的梯度截断进行干预，以减小步长
在大数据场景下（样本量大，特征维度数大），一阶方法最实用（随机梯度）
自适应学习率算法族（以\(RMSProp\)为代表）表现相当鲁棒
\(Adam\)可能是最佳选择
使用者对算法的熟悉程度，以便于调节超参数

案例：

python实现梯度下降法

梯度下降法使用以下迭代公式进行参数的更新。

\[ \mathbf{x}^{(t+1)} \leftarrow \mathbf{x}^{(t)} - \eta\nabla f(\mathbf{x}^{(t)}) \]

其中 \(\eta\) 为学习率。
我们实现 gradient_descent 方法来进行参数的更新。

import matplotlib.pyplot as plt
import numpy as np

from mpl_toolkits.mplot3d import Axes3D
from matplotlib import animation
from IPython.display import HTML

from autograd import elementwise_grad, value_and_grad,grad
from scipy.optimize import minimize
from scipy import optimize
from collections import defaultdict
from itertools import zip_longest
# Make matplotlib render the minus sign correctly on axes.
plt.rcParams['axes.unicode_minus'] = False


# Objective function of the first experiment.
def f1(x1, x2):
    """Return ``x1**2 + 0.5 * x2**2``; also used element-wise on grid arrays."""
    return x1**2 + 0.5*x2**2


def _f1_packed(args):
    """Evaluate ``f1`` on one packed argument ``(x1, x2)``."""
    return f1(*args)


# Callable whose result is indexed as (value, gradient) by the optimisers below.
f1_grad = value_and_grad(_f1_packed)
def gradient_descent(func, func_grad, x0, learning_rate=0.1, max_iteration=20, tol=1e-4):
    """Minimise a function with fixed-step gradient descent.

    Args:
        func: Objective function. Not used by the algorithm; kept so that
            existing call sites keep working.
        func_grad: Callable taking the current point and returning a pair
            whose second item is the gradient at that point.
        x0: Starting point, a sequence of numbers.
        learning_rate: Step size multiplying the gradient.
        max_iteration: Maximum number of parameter updates.
        tol: Iteration stops as soon as the Euclidean norm of the proposed
            update falls below this value.

    Returns:
        A tuple ``(best_x, path)``: the final point as a float ndarray, and
        an array holding every visited point, starting point included.
    """
    # Copy into a float array so the caller's object is never aliased and the
    # returned point has the same type whether or not a step was taken.
    best_x = np.array(x0, dtype=float)
    path_list = [best_x]
    for _ in range(max_iteration):
        update = -learning_rate * np.asarray(func_grad(best_x)[1])
        if np.linalg.norm(update) < tol:
            break
        best_x = best_x + update
        path_list.append(best_x)
    return best_x, np.array(path_list)
# Solve the problem with the gradient-descent routine implemented above to
# obtain the optimisation path of the parameters.
best_x_gd, path_list_gd = gradient_descent(f1,f1_grad,[-4.0,4.0],0.1,30)
path_list_gd
# To draw the function surface, first build the grid coordinate matrices with
# np.meshgrid. Both dimensions span -5 to 5. The function values at the grid
# points are stored in z.
x1,x2 = np.meshgrid(np.linspace(-5.0,5.0,50), np.linspace(-5.0,5.0,50))
z = f1(x1,x2 )
minima = np.array([0, 0]) # the minimum of f1 is known to be at (0, 0)
ax.plot_surface?
# Matplotlib 中的 plot_surface 函数能够帮助我们绘制3D函数曲面图。函数的主要参数如下表所示。
%matplotlib inline
fig = plt.figure(figsize=(8, 8))
ax = plt.axes(projection='3d', elev=50, azim=-50)

ax.plot_surface(x1,x2, z, alpha=.8, cmap=plt.cm.jet)
ax.plot([minima[0]],[minima[1]],[f1(*minima)], 'r*', markersize=10)

ax.set_xlabel('$x1$')
ax.set_ylabel('$x2$')
ax.set_zlabel('$f$')

ax.set_xlim((-5, 5))
ax.set_ylim((-5, 5))

plt.show()

# The contour method draws contour lines and clabel prints the height
# (function value) of each line; two decimals are kept here (fmt='%.2f').
# NOTE(review): dz_dx1 / dz_dx2 are not used by any code visible in this file.
dz_dx1 = elementwise_grad(f1, argnum=0)(x1, x2)
dz_dx2 = elementwise_grad(f1, argnum=1)(x1, x2)
fig, ax = plt.subplots(figsize=(6, 6))

contour = ax.contour(x1, x2, z,levels=20,cmap=plt.cm.jet)
ax.clabel(contour,fontsize=10,colors='k',fmt='%.2f')
ax.plot(*minima, 'r*', markersize=18)

ax.set_xlabel('$x1$')
ax.set_ylabel('$x2$')

ax.set_xlim((-5, 5))
ax.set_ylim((-5, 5))

plt.show()

# With quiver, the optimisation path found by gradient descent is visualised
# as arrows joining consecutive points.
fig, ax = plt.subplots(figsize=(6, 6))

ax.contour(x1, x2, z, levels=20,cmap=plt.cm.jet)# contour lines
# Draw the trajectory arrows: each arrow starts at point k and its components
# are the difference between point k+1 and point k.
ax.quiver(path_list_gd[:-1,0], path_list_gd[:-1,1], path_list_gd[1:,0]-path_list_gd[:-1,0], path_list_gd[1:,1]-path_list_gd[:-1,1], scale_units='xy', angles='xy', scale=1, color='k')
# Mark the optimum.
ax.plot(*minima, 'r*', markersize=18)

ax.set_xlabel('$x1$')
ax.set_ylabel('$x2$')

ax.set_xlim((-5, 5))
ax.set_ylim((-5, 5))
plt.show()
# Show every step of the path as an animation: animation.FuncAnimation drives
# the animation and .to_jshtml renders it.
path = path_list_gd # optimisation path of gradient descent
fig, ax = plt.subplots(figsize=(6, 6))
line, = ax.plot([], [], 'b', label='Gradient Descent', lw=2) # artist holding the path
point, = ax.plot([], [], 'bo') # artist holding the latest point of the path

def init_draw():
    """Prepare the static background of the animation.

    Draws the contour lines of ``z`` and a red star at ``minima``, sets axis
    labels and limits, and returns the two artists that the frame callback
    updates.
    """
    ax.contour(x1, x2, z, levels=20, cmap=plt.cm.jet)
    # Red star marks the minimum.
    ax.plot(*minima, 'r*', markersize=18)
    ax.set(xlabel='$x$', ylabel='$y$', xlim=(-5, 5), ylim=(-5, 5))
    artists = (line, point)
    return artists

def update_draw(i):
    """Draw the path up to and including point ``i`` and return the artists.

    The animation is created with ``frames=path.shape[0]``, so ``i`` runs from
    0 to ``len(path) - 1``. Slicing with ``i + 1`` makes frame 0 show the
    starting point and the last frame show the final point; slicing with ``i``
    would leave frame 0 empty and never reach the final point.
    """
    line.set_data(path[:i + 1, 0], path[:i + 1, 1])
    point.set_data(path[i:i + 1, 0], path[i:i + 1, 1])
    # Presumably closes the pyplot figure so the notebook does not also show
    # a static copy next to the animation.
    plt.close()
    return line, point

# One frame per point of the path; the result is rendered as interactive HTML.
anim = animation.FuncAnimation(fig, update_draw, init_func=init_draw,frames=path.shape[0], interval=60, repeat_delay=5, blit=True)
HTML(anim.to_jshtml())

# Solve the optimisation problem with scipy.optimize. Because the path has to
# be visualised, minimize needs a callback function passed via `callback`.
# NOTE(review): this x0 is reassigned further down before the visible code uses it.
x0 = np.array([-4, 4])


def make_minimize_cb(path=None):
    """Build a callback that records every iterate it is called with.

    Args:
        path: List to which a copy of each iterate is appended. When omitted,
            a fresh list is created for this callback, so separate callbacks
            never share state through a mutable default.

    Returns:
        A one-argument function ``minimize_cb(xk)`` that appends
        ``np.copy(xk)`` to ``path``.
    """
    # `is None` rather than truthiness: an explicitly passed empty list must
    # be used as-is so the caller sees the recorded points.
    if path is None:
        path = []

    def minimize_cb(xk):
        # Copy so that later changes to xk do not alter the recorded point.
        path.append(np.copy(xk))

    return minimize_cb
# A selection of common optimisation methods implemented in scipy.optimize.
methods = [ "CG", "BFGS","Newton-CG","L-BFGS-B"]
import warnings
warnings.filterwarnings('ignore') # hide warning messages
x0 = [-4.0,4.0]
paths = []
zpaths = []
for method in methods:
    path = [x0]
    # Of the methods listed, only L-BFGS-B supports box bounds; the others
    # would just ignore them with a warning, so bounds are passed only there.
    method_bounds = [(-5, 5), (-5, 5)] if method == "L-BFGS-B" else None
    res = minimize(fun=f1_grad, x0=x0, jac=True, method=method,
                   callback=make_minimize_cb(path), bounds=method_bounds, tol=1e-20)
    paths.append(np.array(path))
# Add the result of our own gradient-descent implementation.
methods.append("GD")
paths.append(path_list_gd)
# Function value along every path.
zpaths = [f1(path[:,0],path[:,1]) for path in paths]
# 封装一个 TrajectoryAnimation 类 ,将不同算法得到的优化路径进行动画演示。
class TrajectoryAnimation(animation.FuncAnimation):
    """Animate several 2-D optimisation paths on the same axes.

    Each path gets one line artist (the trail) and one marker artist (the
    latest point), both in the same colour.

    Args:
        paths: Sequence of arrays indexed as ``path[k, 0]`` / ``path[k, 1]``.
        labels: Legend labels, paired with ``paths`` by position. Defaults to
            no labels.
        fig: Figure to animate. Derived from ``ax`` or newly created when
            omitted.
        ax: Axes to draw on. Taken from ``fig`` or newly created when omitted.
        frames: Number of frames. Defaults to the length of the longest path.
        interval, repeat_delay, blit, **kwargs: Forwarded to
            ``animation.FuncAnimation``.
    """

    def __init__(self, paths, labels=None, fig=None, ax=None, frames=None,
                 interval=60, repeat_delay=5, blit=True, **kwargs):
        # Avoid a mutable default argument.
        labels = [] if labels is None else labels
        # Create whichever of fig / ax was not supplied.
        if fig is None:
            if ax is None:
                fig, ax = plt.subplots()
            else:
                fig = ax.get_figure()
        elif ax is None:
            ax = fig.gca()
        self.fig = fig
        self.ax = ax
        self.paths = paths
        # One frame per point of the longest path.
        if frames is None:
            frames = max(path.shape[0] for path in paths)
        # zip_longest pads missing labels with None.
        self.lines = [ax.plot([], [], label=label, lw=2)[0]
                      for _, label in zip_longest(paths, labels)]
        self.points = [ax.plot([], [], 'o', color=line.get_color())[0]
                       for line in self.lines]
        super().__init__(fig, self.animate, init_func=self.init_anim,
                         frames=frames, interval=interval, blit=blit,
                         repeat_delay=repeat_delay, **kwargs)

    def init_anim(self):
        """Clear every trail and marker; return all animated artists."""
        for line, point in zip(self.lines, self.points):
            line.set_data([], [])
            point.set_data([], [])
        return self.lines + self.points

    def animate(self, i):
        """Show each path up to and including point ``i``.

        ``i`` is clamped to the last index of each path, so a path shorter
        than the longest one keeps its full trail and its marker on its final
        point instead of the marker vanishing once ``i`` passes its length.
        """
        for line, point, path in zip(self.lines, self.points, self.paths):
            last = min(i, path.shape[0] - 1)
            line.set_data(path[:last + 1, 0], path[:last + 1, 1])
            point.set_data(path[last:last + 1, 0], path[last:last + 1, 1])
        # Presumably closes the pyplot figure so the notebook does not also
        # show a static copy; done once per frame and only for this figure.
        plt.close(self.fig)
        return self.lines + self.points
# Contour background of the current grid with a red star at the minimum.
fig, ax = plt.subplots(figsize=(8, 8))

ax.contour(x1, x2, z, cmap=plt.cm.jet)
ax.plot(*minima, 'r*', markersize=10)

ax.set_xlabel('$x1$')
ax.set_ylabel('$x2$')

ax.set_xlim((-5, 5))
ax.set_ylim((-5, 5))

# Animate all recorded paths on these axes, labelled by method name.
anim = TrajectoryAnimation(paths, labels=methods, ax=ax)

ax.legend(loc='upper left')
HTML(anim.to_jshtml())

# A second objective, one with several local minima and saddle points.
def f2(x1, x2):
    """Return the second test objective at ``(x1, x2)``; works on arrays."""
    x1_part = (4 - 2.1*x1**2 + x1**4 / 3.) * x1**2
    cross_part = x1 * x2
    x2_part = (-4 + 4*x2**2) * x2 **2
    return x1_part + cross_part + x2_part


def _f2_packed(args):
    """Evaluate ``f2`` on one packed argument ``(x1, x2)``."""
    return f2(*args)


# Callable whose result is indexed as (value, gradient) by the optimisers below.
f2_grad = value_and_grad(_f2_packed)

# Grid over [-2, 2] x [-1, 1] and the function values on it.
x1, x2 = np.meshgrid(
    np.linspace(-2.0, 2.0, 50),
    np.linspace(-1.0, 1.0, 50),
)
z = f2(x1, x2)

%matplotlib inline
# 3D surface of the current grid values z.
fig = plt.figure(figsize=(6, 6))
ax = plt.axes(projection='3d', elev=50, azim=-50)

ax.plot_surface(x1,x2, z, alpha=.8, cmap=plt.cm.jet)

ax.set_xlabel('$x1$')
ax.set_ylabel('$x2$')
ax.set_zlabel('$f$')

# Axis limits match the range of the grid built for this function.
ax.set_xlim((-2.0, 2.0))
ax.set_ylim((-1.0, 1.0))

plt.show()
# Solve with the different optimisation methods implemented in SciPy and with
# the gradient-descent routine implemented in this notebook.
x02 = [-1.0, -0.5]  # starting point; try others: [-1.0,-0.5], [1.5,0.75], [-0.8,0.25]
_, path_list_gd2 = gradient_descent(f2, f2_grad, x02, 0.1, 30)  # solve with gradient descent

paths = []
zpaths = []
methods = ["CG", "BFGS", "Newton-CG", "L-BFGS-B"]
for method in methods:
    path = [x02]
    # Of the methods listed, only L-BFGS-B supports box bounds; the others
    # would just ignore them with a warning, so bounds are passed only there.
    method_bounds = [(-2.0, 2.0), (-1.0, 1.0)] if method == "L-BFGS-B" else None
    res = minimize(fun=f2_grad, x0=x02, jac=True, method=method, callback=make_minimize_cb(path),
                   bounds=method_bounds, tol=1e-20)
    paths.append(np.array(path))

methods.append("GD")
paths.append(path_list_gd2)
# Function value along every path.
zpaths = [f2(path[:, 0], path[:, 1]) for path in paths]
# Show the solution paths of the different methods as an animation.
%matplotlib inline
fig, ax = plt.subplots(figsize=(8, 8))

# Labelled contour lines (two decimals) as the background.
contour = ax.contour(x1, x2, z, levels=50, cmap=plt.cm.jet)
ax.clabel(contour,fontsize=10,colors='k',fmt='%.2f')
ax.set_xlabel('$x1$')
ax.set_ylabel('$x2$')

ax.set_xlim((-2.0, 2.0))
ax.set_ylim((-1.0, 1.0))

anim = TrajectoryAnimation(paths, labels=methods, ax=ax)
ax.legend(loc='upper left')
HTML(anim.to_jshtml())

# MNIST is a well-known handwritten-digit image data set in image processing
# and deep learning. It contains a training set of 60000 images and a test set
# of 10000 images. Every sample is a 28x28 image with a label from 0 to 9.
# Download address: http://yann.lecun.com/exdb/mnist/
import numpy as np
# The context manager closes the archive even if one of the key lookups fails.
with np.load("input/mnist.npz") as f:
    X_train, y_train, X_test, y_test = f['x_train'], f['y_train'],f['x_test'], f['y_test']
# Flatten every image to a vector of 28*28 values and scale by 1/255.
x_train = X_train.reshape((-1, 28*28)) / 255.0
x_test = X_test.reshape((-1, 28*28)) / 255.0
# Display some randomly chosen handwritten digits to inspect the data set.
rndperm = np.random.permutation(len(x_train))
%matplotlib inline
import matplotlib.pyplot as plt
plt.gray()
fig = plt.figure( figsize=(8,8) )
# 10 x 10 grid showing the first 100 samples of the random permutation.
for i in range(0,100):
    ax = fig.add_subplot(10,10,i+1)
    ax.matshow(x_train[rndperm[i]].reshape((28,28)))
    plt.box(False) # remove the frame
    plt.axis("off")# hide the axes
plt.show()
# One-hot encode the digit labels to make the later model training easier.
import pandas as pd
y_train_onehot = pd.get_dummies(y_train)
y_train_onehot.head()

构建一个简单的全连接神经网络，用于手写数字的分类，网络结构如下图所示：

import tensorflow as tf
import tensorflow.keras.layers as layers
# Fully connected classifier: 784 inputs -> 100 -> 100 -> 50 (ReLU) -> 10 (softmax).
inputs = layers.Input(shape=(28*28,), name='inputs')
hidden1 = layers.Dense(100, activation='relu', name='hidden1')(inputs)
hidden2 = layers.Dense(100, activation='relu', name='hidden2')(hidden1)
hidden3 = layers.Dense(50, activation='relu', name='hidden3')(hidden2)
outputs = layers.Dense(10, activation='softmax', name='outputs')(hidden3)
deep_networks = tf.keras.Model(inputs,outputs)
deep_networks.summary()
deep_networks.compile(optimizer='SGD',loss='categorical_crossentropy',metrics=['accuracy']) # set the loss and the optimiser; optimisers to try: SGD, RMSprop, Adam, Adagrad, Nadam
%time history = deep_networks.fit(x_train, y_train_onehot, batch_size=500, epochs=10,validation_split=0.5,verbose=1) # train the model
# Training loss per epoch.
fig, ax = plt.subplots(figsize=(20, 8))

ax.plot(history.epoch, history.history["loss"])

ax.set_xlabel('$epoch$')
ax.set_ylabel('$loss$')
# Evaluate on the test set with one-hot encoded test labels.
test_loss, test_acc = deep_networks.evaluate(x_test,  pd.get_dummies(y_test), verbose=2)

# The newline must be written as the escape sequence \n: a literal line break
# inside the quotes leaves the string unterminated.
print('\nTest accuracy:', test_acc)

本案例我们实现了梯度下降法，并借助 Scipy 的 optimize 模块，在两个不同的二维函数上比较了梯度下降法、共轭梯度法和拟牛顿法的优化路径，并使用 Matplotlib 进行了动画展示。
然后在手写数字数据集上，我们使用 TensorFlow 构建分类模型，使用不同的优化方法进行模型训练。本案例主要用到的 Python 包列举如下。

包或方法	版本	用途
Matplotlib	3.0.2	绘制三维曲面，绘制等高线，制作动画，绘制梯度场（箭头）
Scipy	1.0.0	scipy.optimize.minimize 求解最优化问题
TensorFlow	1.12.0	构建手写数字神经网络模型
Pandas	0.23.4	数据预处理，One-Hot 编码