导读
gradient descent
momentum
RMSProp
adam
鞍点
gradient descent根据每次迭代时计算梯度的样本大小,可以分为bath, mini-batch, SGD;对gd的优化,可通过修改迭代步长或alpha值改进 ;优化迭代步长的算法有:momentum, RMSProp, adam等; 修改alpha值:learning rate decay,learning rate的衰减有不同的方法,以下列举常用的几种
alpha = alpha_0 / (1 + decay_rate * t);
alpha = alpha_0 * 0.95 ** t;
alpha = alpha_0 * k / sqrt(t); ...
本文只讨论标准gd及其对迭代步长优化的算法,假设优化函数为:z = x ** 2 + 5y ** 2,
import matplotlib.pyplot as plt
import numpy as np
#椭圆等高线
x = y = np.linspace(-10, 10, 1000)
x,y = np.meshgrid(x,y)
z = x ** 2 + 5 * y ** 2
plt.contour(x,y,z)
plt.show()
gd算法:x := x - alpha * dx, y := y = alpha * dy
其中 dx=2x, dy = 10y,令初始点(x, y) = (8, 9)
plt.contour(x, y, z)
plt.scatter(0,0)
plt.xlabel("x")
plt.ylabel("y")
alpha = 0.01
w = np.array([2, 0, 0, 10]).reshape(2, 2)
t = np.array([9, 8]).reshape(2, 1)
plt.scatter(t[0], t[1])
for i in range(1, 100):
dt = np.dot(w, t)
t_step = dt
t = t - alpha * t_step
plt.scatter(t[0], t[1])
print("最终值")
print(t[0], t[1])
plt.show()
最终值
[1.2179347] [0.0002361]
alpha值的设置
def gd(alpha):
w = np.array([2, 0, 0, 10]).reshape(2, 2)
t = np.array([9, 8]).reshape(2, 1)
i = 1
iter_array = []
result_array = []
iter_array.append(i)
result = t[0] ** 2 + 5 * t[1] ** 2
result_array.append(result)
while i < 100:
dt = np.dot(w, t)
t_step = dt
t = t - alpha * t_step
iter_array.append(i)
result = t[0] ** 2 + 5 * t[1] ** 2
result_array.append(result)
i+=1
return iter_array, result_array
def get_convergence_iter(i_result_array, threshold):
i = 0
while i < len(i_result_array):
t = i_result_array[i]
if abs(t) < threshold:
return i+1
i+=1
return i+1
for i in range(1, 20):
i_new = i / 100
i_iter_array, i_result_array = gd(i_new)
print('alpha值:%.2f, 收敛次数:%d' %(i_new,get_convergence_iter(i_result_array, 0.0001)))
for i in [0.01, 0.1, 0.2005]:
i_iter_array, i_result_array = gd(i)
plt.plot(i_iter_array, i_result_array, label="alpha:%.4f" %(i))
plt.legend()
plt.show()
alpha值:0.01, 收敛次数:101
alpha值:0.02, 收敛次数:101
alpha值:0.03, 收敛次数:101
alpha值:0.04, 收敛次数:83
alpha值:0.05, 收敛次数:66
alpha值:0.06, 收敛次数:55
alpha值:0.07, 收敛次数:47
alpha值:0.08, 收敛次数:41
alpha值:0.09, 收敛次数:36
alpha值:0.10, 收敛次数:32
alpha值:0.11, 收敛次数:29
alpha值:0.12, 收敛次数:26
alpha值:0.13, 收敛次数:24
alpha值:0.14, 收敛次数:22
alpha值:0.15, 收敛次数:21
alpha值:0.16, 收敛次数:19
alpha值:0.17, 收敛次数:23
alpha值:0.18, 收敛次数:35
alpha值:0.19, 收敛次数:73
上述结果可知,随着alpha值增加,迭代次数先减小,后增大,最后发散;alpha值需要合理设置,否则优化算法不能收敛
gd的优化
momentum
momentum算法在梯度的基础上加上指数滑动平均;
gd算法:x := x - alpha * dx;
momentum算法:
vdx = beta * vdx + (1-beta) * dx,
x := x - alpha * vdx
采用SGD或mini-bath训练机器学习或神经网络模型的时候,dw受每次迭代的batch影响,造成步长的摆动,而采用滑动平均可以减少摆动的幅度,从而加快收敛速度。
plt.contour(x, y, z)
plt.scatter(0,0)
plt.xlabel("x")
plt.ylabel("y")
alpha = 0.2005
#gd
w = np.array([2, 0, 0, 10]).reshape(2, 2)
t = np.array([9, 8]).reshape(2, 1)
dy_value = []
for i in range(1, 100):
dt = np.dot(w, t)
t_step = dt
dy_value.append(round(t_step[1][0],2))
t = t - alpha * t_step
print("gd的更新步长(y值)")
print(dy_value)
#momentum
beta = 0.9
w = np.array([2, 0, 0, 10]).reshape(2, 2)
t = np.array([9, 8]).reshape(2, 1)
plt.scatter(t[0], t[1])
vdt = 0
dy_value = []
for i in range(1, 100):
dt = np.dot(w, t)
vdt = beta * vdt + (1-beta) * dt
t_step = vdt
dy_value.append(round(t_step[1][0],2))
t = t - alpha * t_step
plt.scatter(t[0], t[1])
print("momentum的更新步长(y值)")
print(dy_value)
print("最终值")
print(t[0], t[1])
plt.show()
gd的更新步长(y值)
[80, -80.4, 80.8, -81.21, 81.61, -82.02, 82.43, -82.84, 83.26, -83.67, 84.09, -84.51, 84.93, -85.36, 85.79, -86.21, 86.65, -87.08, 87.51, -87.95, 88.39, -88.83, 89.28, -89.72, 90.17, -90.62, 91.08, -91.53, 91.99, -92.45, 92.91, -93.38, 93.84, -94.31, 94.78, -95.26, 95.73, -96.21, 96.69, -97.18, 97.66, -98.15, 98.64, -99.14, 99.63, -100.13, 100.63, -101.13, 101.64, -102.15, 102.66, -103.17, 103.69, -104.21, 104.73, -105.25, 105.78, -106.31, 106.84, -107.37, 107.91, -108.45, 108.99, -109.53, 110.08, -110.63, 111.19, -111.74, 112.3, -112.86, 113.43, -113.99, 114.56, -115.14, 115.71, -116.29, 116.87, -117.46, 118.04, -118.63, 119.23, -119.82, 120.42, -121.02, 121.63, -122.24, 122.85, -123.46, 124.08, -124.7, 125.32, -125.95, 126.58, -127.21, 127.85, -128.49, 129.13, -129.78, 130.43]
momentum的更新步长(y值)
[8.0, 13.6, 15.91, 14.8, 10.83, 5.09, -1.1, -6.45, -9.97, -11.14, -9.96, -6.9, -2.76, 1.51, 5.06, 7.23, 7.74, 6.65, 4.33, 1.38, -1.56, -3.89, -5.2, -5.35, -4.4, -2.67, -0.57, 1.43, 2.94, 3.71, 3.66, 2.89, 1.61, 0.13, -1.22, -2.19, -2.63, -2.49, -1.87, -0.94, 0.09, 1.0, 1.62, 1.85, 1.69, 1.2, 0.52, -0.19, -0.79, -1.18, -1.29, -1.13, -0.76, -0.27, 0.22, 0.62, 0.85, 0.89, 0.75, 0.47, 0.13, -0.21, -0.47, -0.61, -0.61, -0.5, -0.29, -0.04, 0.18, 0.35, 0.43, 0.42, 0.32, 0.17, 0.0, -0.15, -0.26, -0.31, -0.28, -0.21, -0.1, 0.02, 0.12, 0.19, 0.21, 0.19, 0.13, 0.05, -0.03, -0.1, -0.14, -0.15, -0.13, -0.08, -0.03, 0.03, 0.07, 0.1, 0.1]
最终值
[0.03790624] [-0.00786362]
momentum算法的效率
def mom(alpha, beta):
w = np.array([2, 0, 0, 10]).reshape(2, 2)
t = np.array([9, 8]).reshape(2, 1)
vdt = 0
i = 1
iter_array = []
result_array = []
iter_array.append(i)
result = t[0] ** 2 + 5 * t[1] ** 2
result_array.append(result)
while i < 100:
dt = np.dot(w, t)
vdt = beta * vdt + (1-beta) * dt
t_step = vdt
t = t - alpha * t_step
iter_array.append(i)
result = t[0] ** 2 + 5 * t[1] ** 2
result_array.append(result)
i += 1
return iter_array, result_array
beta = 0.9
for i in range(1, 20):
i_new = i / 10
i_iter_array, i_result_array = mom(i_new, beta)
print('alpha值:%.2f, 收敛次数:%d' %(i_new,get_convergence_iter(i_result_array, 0.001)))
for i in [0.01, 0.1, 1]:
i_iter_array, i_result_array = mom(i, beta)
plt.plot(i_iter_array, i_result_array, label="alpha:%.2f" %(i))
plt.legend()
plt.show()
alpha值:0.10, 收敛次数:84
alpha值:0.20, 收敛次数:101
alpha值:0.30, 收敛次数:98
alpha值:0.40, 收敛次数:84
alpha值:0.50, 收敛次数:101
alpha值:0.60, 收敛次数:95
alpha值:0.70, 收敛次数:101
alpha值:0.80, 收敛次数:101
alpha值:0.90, 收敛次数:98
alpha值:1.00, 收敛次数:101
alpha值:1.10, 收敛次数:96
alpha值:1.20, 收敛次数:101
alpha值:1.30, 收敛次数:100
alpha值:1.40, 收敛次数:101
alpha值:1.50, 收敛次数:92
alpha值:1.60, 收敛次数:84
alpha值:1.70, 收敛次数:96
alpha值:1.80, 收敛次数:101
alpha值:1.90, 收敛次数:101
从物理角度理解momentum算法:vdx = beta * vdx + (1-beta) * dw, dw可看作加速度,beta小于1,可看作摩擦力,vdx可看作动量
RMSProp
gd算法的alpha值受限于y轴(y轴斜率大),rmsprop增加了微平方加权平均数,可以消除梯度大的维度的影响,设置更大的alpha值
gd算法:x := x - alpha * dx;
rmsprop算法:
sdx = beta * sdx + (1-beta) * dx ** 2,
x := x - alpha * dx / sqrt(sdx)
plt.contour(x, y, z)
plt.scatter(0,0)
plt.xlabel('x')
plt.ylabel('y')
#rmsprop
alpha = 0.2005
epsilon = 0.00000001
beta = 0.9
w = np.array([2, 0, 0, 10]).reshape(2, 2)
t = np.array([9, 8]).reshape(2, 1)
plt.scatter(t[0], t[1])
sdt = 0
for i in range(1, 100):
dt = np.dot(w, t)
sdt = beta * sdt + (1-beta) * dt ** 2
t_step = dt / (np.sqrt(sdt) + epsilon)
t = t - alpha * t_step
plt.scatter(t[0], t[1])
print("最终值")
print(t[0], t[1])
plt.show()
最终值
[-1.6552961e-19] [4.0731588e-20]
RMSProp算法的效率
def rmsprop(alpha, beta):
w = np.array([2, 0, 0, 10]).reshape(2, 2)
t = np.array([9, 8]).reshape(2, 1)
sdt = 0
i = 1
iter_array = []
result_array = []
iter_array.append(i)
result = t[0] ** 2 + 5 * t[1] ** 2
result_array.append(result)
while i < 100:
dt = np.dot(w, t)
sdt = beta * sdt + (1-beta) * dt ** 2
t_step = dt / (np.sqrt(sdt) + epsilon)
t = t - alpha * t_step
iter_array.append(i)
result = t[0] ** 2 + 5 * t[1] ** 2
result_array.append(result)
i += 1
return iter_array, result_array
beta = 0.9
for i in range(1, 20):
i_new = i / 10
i_iter_array, i_result_array = rmsprop(i_new, beta)
print('alpha值:%.2f, 收敛次数:%d' %(i_new,get_convergence_iter(i_result_array, 0.0001)))
for i in [0.01, 0.1, 1, 2]:
i_iter_array, i_result_array = rmsprop(i, beta)
plt.plot(i_iter_array, i_result_array, label="alpha:%.2f" %(i))
plt.legend()
plt.show()
alpha值:0.10, 收敛次数:101
alpha值:0.20, 收敛次数:70
alpha值:0.30, 收敛次数:51
alpha值:0.40, 收敛次数:40
alpha值:0.50, 收敛次数:33
alpha值:0.60, 收敛次数:27
alpha值:0.70, 收敛次数:23
alpha值:0.80, 收敛次数:20
alpha值:0.90, 收敛次数:18
alpha值:1.00, 收敛次数:16
alpha值:1.10, 收敛次数:14
alpha值:1.20, 收敛次数:13
alpha值:1.30, 收敛次数:12
alpha值:1.40, 收敛次数:11
alpha值:1.50, 收敛次数:10
alpha值:1.60, 收敛次数:9
alpha值:1.70, 收敛次数:8
alpha值:1.80, 收敛次数:7
alpha值:1.90, 收敛次数:7
adam
adam算法将momentum和rmsprop算法结合起来,
vdx = beta1 * vdx + (1-beta1) * dx,
vdxc = vdx / (1 - beta1 ** t),
sdx = beta2 * sdx + (1 - beta2) * dx ** 2,
sdxc = sdx / (1 - beta2 ** t),
dx := dx - alpha * vdxc / sqrt(sdxc)
plt.contour(x, y, z)
plt.scatter(0,0)
plt.xlabel('x')
plt.ylabel('y')
#adam
alpha = 0.2005
epsilon = 0.00000001
beta1 = 0.9
beta2 = 0.999
w = np.array([2, 0, 0, 10]).reshape(2, 2)
t = np.array([9, 8]).reshape(2, 1)
plt.scatter(t[0], t[1])
vdt = 0
sdt = 0
for i in range(1,100):
dt = np.dot(w, t)
vdt = beta1 * vdt + (1-beta1) * dt
vdtc = vdt / (1 - beta1 ** i)
sdt = beta2 * sdt + (1-beta2) * dt ** 2
sdtc = sdt / (1 - beta2 ** i)
t_step = vdtc / (np.sqrt(sdtc) + epsilon)
t = t - alpha * t_step
plt.scatter(t[0], t[1])
print("最终值")
print(t[0], t[1])
plt.show()
最终值
[-0.09618689] [-0.04753836]
adma算法的效率
def adam(alpha, beta1, beta2):
w = np.array([2, 0, 0, 10]).reshape(2, 2)
t = np.array([9, 8]).reshape(2, 1)
vdt = 0
sdt = 0
i = 1
iter_array = []
result_array = []
iter_array.append(i)
result = t[0] ** 2 + 5 * t[1] ** 2
result_array.append(result)
while i < 100:
dt = np.dot(w, t)
vdt = beta1 * vdt + (1-beta1) * dt
vdtc = vdt / (1 - beta1 ** i)
sdt = beta * sdt + (1-beta) * dt ** 2
sdtc = sdt / (1 - beta2 ** i)
t_step = dt / (np.sqrt(sdt) + epsilon)
t = t - alpha * t_step
iter_array.append(i)
result = t[0] ** 2 + 5 * t[1] ** 2
result_array.append(result)
i += 1
return iter_array, result_array
beta1 = 0.9
beta2 = 0.999
for i in range(1, 20):
i_new = i / 10
i_iter_array, i_result_array = adam(i_new, beta1, beta2)
print('alpha值:%.2f, 收敛次数:%d' %(i_new,get_convergence_iter(i_result_array, 0.0001)))
for i in [0.01, 0.1, 1, 2]:
i_iter_array, i_result_array = adam(i, beta1, beta2)
plt.plot(i_iter_array, i_result_array, label="alpha:%.2f" %(i))
plt.legend()
plt.show()
alpha值:0.10, 收敛次数:101
alpha值:0.20, 收敛次数:70
alpha值:0.30, 收敛次数:51
alpha值:0.40, 收敛次数:40
alpha值:0.50, 收敛次数:33
alpha值:0.60, 收敛次数:27
alpha值:0.70, 收敛次数:23
alpha值:0.80, 收敛次数:20
alpha值:0.90, 收敛次数:18
alpha值:1.00, 收敛次数:16
alpha值:1.10, 收敛次数:14
alpha值:1.20, 收敛次数:13
alpha值:1.30, 收敛次数:12
alpha值:1.40, 收敛次数:11
alpha值:1.50, 收敛次数:10
alpha值:1.60, 收敛次数:9
alpha值:1.70, 收敛次数:8
alpha值:1.80, 收敛次数:7
alpha值:1.90, 收敛次数:7
局部最小与鞍点
在高维的情况下,算法更可能碰到鞍点,而非局部最小点(由于局部最小点需要所有维度在当前点都为凸函数,在高维情况下出现此情况都概率非常小)
鞍点对算法的影响是在平缓区域迭代速度减慢,可用adam等算法增加优化速度
from mpl_toolkits.mplot3d import Axes3D
#鞍点
x = y = np.linspace(-10, 10, 1000)
x,y = np.meshgrid(x,y)
z = x ** 2 - y ** 2
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot_surface(x,y,z,cmap=plt.cm.coolwarm)
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
plt.show()
plt.contour(x,y,z)
plt.xlabel('x')
plt.ylabel('y')
plt.show()
gd在鞍点的表现
#gradient descent
alpha = 0.1
w = np.array([2, 0, 0, -2]).reshape(2, 2)
t_0 = t = np.array([-9, -0.2]).reshape(2, 1)
z_0 = t_0[0][0]**2 - t[1][0] ** 2
t_array = []
z_array = []
for i in range(1, 20):
dt = np.dot(w, t)
t_step = dt
t = t - alpha * t_step
t_array.append(t)
z_value = t[0][0] ** 2 - t[1][0] ** 2
z_array.append(z_value)
print("最终值")
print(t[0], t[1])
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot_surface(x,y,z,cmap=plt.cm.coolwarm)
ax.scatter(t_0[0][0], t_0[1][0], z_0)
for i in range(len(t_array)):
ax.scatter(t_array[i][0], t_array[i][1], z_array[i])
plt.show()
plt.contour(x, y, z)
plt.xlabel("x")
plt.ylabel("y")
plt.scatter(t_0[0], t_0[1])
for i in t_array:
plt.scatter(i[0], i[1])
plt.show()
最终值
[-0.12970367] [-6.38959999]
adam在鞍点的表现
gd算法中不能设置过大的alpha值,否则算法不能收敛,使用adam算法时,可以设置较大的alpha值,加快迭代速度;同时adam算法平衡了x和y方向的迭代步长,使算法无须在x方向耗费过长的时间
#adam
alpha = 2
epsilon = 0.00000001
beta1 = 0.9
beta2 = 0.999
w = np.array([2, 0, 0, -2]).reshape(2, 2)
t_0 = t = np.array([-9, -0.2]).reshape(2, 1)
vdt = 0
sdt = 0
t_array = []
z_array = []
for i in range(1,10):
dt = np.dot(w, t)
vdt = beta1 * vdt + (1-beta1) * dt
vdtc = vdt / (1 - beta1 ** i)
sdt = beta2 * sdt + (1-beta2) * dt ** 2
sdtc = sdt / (1 - beta2 ** i)
t_step = vdtc / (np.sqrt(sdtc) + epsilon)
t = t - alpha * t_step
t_array.append(t)
z_value = t[0][0] ** 2 - t[1][0] ** 2
z_array.append(z_value)
print("最终值")
print(t[0], t[1])
fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot_surface(x,y,z,cmap=plt.cm.coolwarm)
ax.scatter(t_0[0][0], t_0[1][0], z_0)
for i in range(len(t_array)):
ax.scatter(t_array[i][0], t_array[i][1], z_array[i])
plt.show()
plt.contour(x, y, z)
plt.xlabel("x")
plt.ylabel("y")
plt.scatter(t_0[0], t_0[1])
for i in t_array:
plt.scatter(i[0], i[1])
plt.show()
最终值
[3.81504286] [-16.860752]