Generally speaking, a learning curve is a method for assessing a model during training: by inspecting the learning curve, we can judge what state the model is in.
1. Bias and variance:
Bias measures the degree to which the learning algorithm's expected prediction deviates from the true result, i.e. $\mathrm{bias}^2(x) = (\bar{f}(x) - y)^2$, where $\bar{f}(x) = \mathbb{E}_D[f(x; D)]$ is the prediction averaged over training sets $D$.
Variance measures how much the learner's performance changes across different training sets of the same size, i.e. $\mathrm{var}(x) = \mathbb{E}_D\big[(f(x; D) - \bar{f}(x))^2\big]$.
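These are the standard textbook definitions; for squared-error loss they combine with the noise into the usual decomposition of the expected generalization error (stated here without derivation):

$$\mathbb{E}\big[(f(x; D) - y_D)^2\big] = \underbrace{(\bar{f}(x) - y)^2}_{\mathrm{bias}^2(x)} + \underbrace{\mathbb{E}_D\big[(f(x; D) - \bar{f}(x))^2\big]}_{\mathrm{var}(x)} + \underbrace{\mathbb{E}\big[(y_D - y)^2\big]}_{\text{irreducible noise}}$$

where $y_D$ is the (possibly noisy) label observed in the training data. A high-bias model loses on the first term; a high-variance model loses on the second.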
See the figure below:
The high-bias case is generally called underfitting: the model has not adapted well to the available data, and the fit is insufficient.
The high-variance case is generally called overfitting: the model fits the training data too closely and loses its ability to generalize.
2. Learning curve
How do we judge whether a model is reasonable?
A learning curve can help us decide.
The three typical shapes correspond to underfitting, overfitting, and a reasonable fit. One note on reading these plots: the x-axis is the training-set size and the y-axis is the metric score, as shown below:
In the high-bias case, the training-set and validation-set CV scores converge as the training set grows, but the score stays far below what we expect; this is underfitting. It can be addressed by training longer, adding features, increasing tree depth, reducing the regularization term, enlarging the network structure (e.g. adding hidden layers), or searching for a more suitable, larger NN architecture.
In the high-variance case, the CV score on the training set is very high, but the validation-set score is much less ideal; this is overfitting. It can be addressed with more data, regularization, a more suitable network architecture, or tree pruning.
In general, overfitting occurs rather more often; unless computing resources are really scarce, underfitting rarely appears.
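To make the two regimes concrete, here is a minimal sketch (the diagnose helper and its thresholds are hypothetical choices of mine, not a standard API) that classifies a model from the scores at the right end of its learning curve:

def diagnose(train_score, val_score, target=0.9, gap_tol=0.05):
    """Crude learning-curve diagnosis from the scores at the largest training size.

    target is the score we hope to reach; gap_tol is the acceptable
    train/validation gap. Both thresholds are illustrative, not universal.
    """
    if train_score < target:
        return 'high bias (underfitting): both curves converge but stay low'
    if train_score - val_score > gap_tol:
        return 'high variance (overfitting): large train/validation gap'
    return 'reasonable fit'

print(diagnose(0.72, 0.70))  # underfitting: even the training score is low
print(diagnose(0.99, 0.85))  # overfitting: training score high, big gap
print(diagnose(0.95, 0.93))  # both scores high and close together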
Here we can use sklearn's learning_curve function; see the resulting learning curve in the figure below:
In the figure above, the dark line is the mean of the CV metric and the lighter band shows its fluctuation (one standard deviation). That plot is a static matplotlib figure; I rewrote it with plotly so it is more convenient to inspect:
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: aaa
Description :
Author : Asdil
date: 2020/3/26
-------------------------------------------------
Change Activity:
2020/3/26:
-------------------------------------------------
"""
__author__ = 'Asdil'
import plotly
import plotly.io as pio
import plotly.offline as py  # offline mode
from plotly.graph_objs import Figure
import numpy as np  # used by add_trace below
plotly.offline.init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings('ignore')
def add_trace(percent, score_means, score_std, is_train=True):
    """Build the three traces for one curve: the mean line plus a ±1 std band.

    Parameters
    ----------
    percent : list
        Fractions of the training set used, e.g. [0.1, 0.25, 0.5, 0.75, 1.0]
    score_means : list
        Mean CV score after training on each fraction of the data
    score_std : list
        Standard deviation of the CV scores for each fraction
    is_train : bool
        True for the training curve, False for the validation curve

    Returns
    ----------
    list
        [mean - std trace, mean + std trace, mean trace]
    """
    score_means = np.array(score_means)
    score_std = np.array(score_std)
    name = 'Train score' if is_train else 'Test score'
    color_std = "rgba(211, 47, 47, 0.4)" if is_train else "rgba(56, 142, 60, 0.4)"
    color_mean = "#d32f2f" if is_train else "#388E3C"
    # solid line for the mean score
    trace_mean = {
        "line": {"color": color_mean},
        "name": name,
        "type": "scatter",
        "x": percent,
        "y": score_means,
        "showlegend": True
    }
    # upper edge of the std band; 'tonexty' fills down to the previous trace
    trace_mean_add_std = {
        "fill": "tonexty",
        "line": {
            "color": color_std,
            "width": 0.1
        },
        "mode": "lines",
        "name": "",
        "type": "scatter",
        "x": percent,
        "y": score_means + score_std,
        "hoverlabel": {"namelength": 20},
        "showlegend": False
    }
    # lower edge of the std band
    trace_mean_sub_std = {
        "line": {
            "color": color_std,
            "width": 0.1
        },
        "mode": "lines",
        "name": "",
        "type": "scatter",
        "x": percent,
        "y": score_means - score_std,
        "hoverlabel": {"namelength": 20},
        "showlegend": False
    }
    return [trace_mean_sub_std, trace_mean_add_std, trace_mean]
def create_lay_out(score_name, title='Learning curve'):
    """Build the plotly layout (title, axis labels, legend position).

    Parameters
    ----------
    title : str
        Figure title
    score_name : str
        Name of the metric, e.g. F1, ACC, etc.

    Returns
    ----------
    dict
        Layout dictionary for plotly
    """
    layout = {
        "title": {"text": f"{title}"},
        "xaxis": {"title": {"text": "Fraction of training set"}},
        "yaxis": {"title": {"text": f"{score_name}"}},
        "legend": {
            "x": 0.8,
            "y": 0
        },
        "autosize": True
    }
    return layout
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
# use ShuffleSplit to generate the CV splits of the training data
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
X, y = load_digits(return_X_y=True)  # sklearn's handwritten digits dataset
estimator = SVC(gamma=0.001)  # SVC model
percent = np.linspace(.1, 1.0, 5)  # 5 training-set fractions from 10% to 100%; these are the 5 x-coordinates in the result plot
scoring = 'f1_macro'  # f1_macro or accuracy
train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(estimator,
                                                                      X,
                                                                      y,
                                                                      cv=cv,
                                                                      n_jobs=-1,
                                                                      train_sizes=percent,
                                                                      scoring=scoring,
                                                                      return_times=True)
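# shape note: train_scores and test_scores are arrays of shape
# (len(percent), n_splits) = (5, 100) -- one row per training-set size,
# one column per ShuffleSplit fold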
# mean and std of the train/validation scores at each of the 5 training sizes
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
# curve for the training-set CV results
train_trace = add_trace(percent=percent,
                        score_means=train_scores_mean,
                        score_std=train_scores_std, is_train=True)
# curve for the validation-set CV results
test_trace = add_trace(percent=percent,
                       score_means=test_scores_mean,
                       score_std=test_scores_std, is_train=False)
layout = create_lay_out(score_name=scoring, title='Learning curve of SVC')
fig = Figure(data=[*train_trace, *test_trace], layout=layout)  # a plain list of traces works across plotly versions
py.iplot(fig)
# pio.write_html(fig, file='iris.html')
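If you want to share the figure outside a notebook, the commented-out pio.write_html line saves it as a standalone HTML file that opens in any browser.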
3. Visualizing the loss value:
The example above is worth trying on small datasets, but usually we do not have that much compute. If a model takes two days to train, splitting the training set into different fractions to validate it is clearly impractical, because the time cost is too high. More often we just want to watch how the training and validation curves evolve as training proceeds. Here is another package that shows the loss and metric in real time, called lrcurve; note that it requires tensorflow to be installed.
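(lrcurve is published on PyPI, so it can normally be installed with pip install lrcurve; tensorflow has to be installed separately.)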
Below is a PyTorch example of handwritten-digit recognition; it displays the loss and the metrics for both the training set and the test set.
from lrcurve import PlotLearningCurve
import torch
import torch.nn as nn
from torch import optim
from torchvision import datasets, transforms
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,)),
                                ])
# downloading the datasets may take a little while
trainset = datasets.MNIST('./mnist', download=True, train=True, transform=transform)
valset = datasets.MNIST('./mnist', download=True, train=False, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=64, shuffle=True)
images, labels = next(iter(trainloader))  # peek at one batch
input_size = 784
hidden_sizes = [64, 32]
output_size = 10
# define a simple fully connected model
model = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[0], hidden_sizes[1]),
                      nn.ReLU(),
                      nn.Linear(hidden_sizes[1], output_size),
                      nn.LogSoftmax(dim=1))
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.003, momentum=0.9)
epochs = 15  # train for 15 epochs
plot = PlotLearningCurve(
    # three panels: the loss curve, the accuracy curve and the f1-score curve
    facet_config={
        'loss': {'name': 'NLLLoss', 'limit': [0, None]},
        'accuracy': {'name': 'Accuracy', 'limit': [0, 1]},  # acc and f1 both lie in [0, 1]
        'f1_score': {'name': 'F1_score', 'limit': [0, 1]}
    },
    xaxis_config={'name': 'Epoch', 'limit': [0, 15]}  # x-axis range: we run epochs = 15, hence 0-15
)
# wrap the whole pytorch training loop in the plot context
with plot:
    for epoch in range(epochs):
        model.train()  # back to train mode (model.eval() is called below)
        train_loss = 0
        train_metric_1 = 0
        train_metric_2 = 0
        for images, labels in trainloader:
            images = images.view(images.shape[0], -1)
            optimizer.zero_grad()
            output = model(images)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            # accumulate acc and f1 for every batch in this epoch
            preds = np.argmax(output.detach().numpy(), axis=1)
            train_metric_1 += f1_score(labels.numpy(), preds, average='macro')
            train_metric_2 += accuracy_score(labels.numpy(), preds)
        model.eval()  # eval mode
        val_loss = 0
        val_metric_1 = 0
        val_metric_2 = 0
        with torch.no_grad():
            for images, labels in valloader:
                images = images.view(images.shape[0], -1)
                output = model(images)
                loss = criterion(output, labels)
                val_loss += loss.item()
                # accumulate acc and f1 for every validation batch
                preds = np.argmax(output.numpy(), axis=1)
                val_metric_1 += f1_score(labels.numpy(), preds, average='macro')
                val_metric_2 += accuracy_score(labels.numpy(), preds)
        # average over all batches
        train_metric_1 = train_metric_1 / len(trainloader)
        train_metric_2 = train_metric_2 / len(trainloader)
        val_metric_1 = val_metric_1 / len(valloader)
        val_metric_2 = val_metric_2 / len(valloader)
        # append the results to the learning curve for display
        plot.append(epoch, {
            'loss': {
                'train': train_loss / len(trainloader),  # mean loss over all batches in this epoch
                'validation': val_loss / len(valloader),
            },
            'accuracy': {
                'train': train_metric_2,  # mean training accuracy
                'validation': val_metric_2
            },
            'f1_score': {
                'train': train_metric_1,  # mean training f1
                'validation': val_metric_1
            },
        })
        plot.draw()  # redraw the curves after every epoch
This is actually an animated plot; the image below gives a sense of what it looks like.
4. Using TensorBoard
There is one more way to visualize the loss and metrics: TensorBoard. It is also simple to use. Continuing with the example above, all the earlier definitions stay the same:
# use tensorboard
from torch.utils.tensorboard import SummaryWriter
# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('runs/fashion_mnist_experiment_1')  # log directory
for epoch in range(epochs):
    model.train()
    train_loss = 0
    train_metric_1 = 0
    train_metric_2 = 0
    for images, labels in trainloader:
        images = images.view(images.shape[0], -1)
        optimizer.zero_grad()
        output = model(images)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        preds = np.argmax(output.detach().numpy(), axis=1)
        train_metric_1 += f1_score(labels.numpy(), preds, average='macro')
        train_metric_2 += accuracy_score(labels.numpy(), preds)
    model.eval()  # eval mode
    val_loss = 0
    val_metric_1 = 0
    val_metric_2 = 0
    with torch.no_grad():
        for images, labels in valloader:
            images = images.view(images.shape[0], -1)
            output = model(images)
            loss = criterion(output, labels)
            val_loss += loss.item()
            preds = np.argmax(output.numpy(), axis=1)
            val_metric_1 += f1_score(labels.numpy(), preds, average='macro')
            val_metric_2 += accuracy_score(labels.numpy(), preds)
    train_metric_1 = train_metric_1 / len(trainloader)
    train_metric_2 = train_metric_2 / len(trainloader)
    val_metric_1 = val_metric_1 / len(valloader)
    val_metric_2 = val_metric_2 / len(valloader)
    # write the results to tensorboard
    writer.add_scalars('loss',
                       {'train loss': train_loss / len(trainloader),
                        'val loss': val_loss / len(valloader)}, epoch)
    # also log the metrics we just computed
    writer.add_scalars('accuracy',
                       {'train acc': train_metric_2,
                        'val acc': val_metric_2}, epoch)
    writer.add_scalars('f1_score',
                       {'train f1': train_metric_1,
                        'val f1': val_metric_1}, epoch)
writer.close()
# display directly inside jupyter
%load_ext tensorboard
%tensorboard --logdir=runs/fashion_mnist_experiment_1
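Alternatively, TensorBoard can be started from a terminal with tensorboard --logdir=runs/fashion_mnist_experiment_1 and viewed in a browser at http://localhost:6006.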