Pytorch使用ReduceLROnPlateau来更新学习率

该文章创建(更新)于08/5/2020，请注意文章的时效性！

文章目录

自己之前写过一个Pytorch学习率更新,其中感觉依据是否loss升高或降低的次数来动态更新学习率，感觉是个挺好玩的东西，自己弄了好久都设置错误，今天算是搞出来了！
- torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=False, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08) 在发现loss不再降低或者acc不再提高之后，降低学习率。各参数意义如下：参数含义 mode 'min'模式检测metric是否不再减小，'max'模式检测metric是否不再增大； factor 触发条件后lr*=factor； patience 不再减小（或增大）的累计次数； verbose 触发条件后print； threshold 只关注超过阈值的显著变化； threshold_mode 有rel和abs两种阈值计算模式，rel规则：max模式下如果超过best(1+threshold)为显著，min模式下如果低于best(1-threshold)为显著；abs规则：max模式下如果超过best+threshold为显著，min模式下如果低于best-threshold为显著； cooldown 触发一次条件后，等待一定epoch再进行检测，避免lr下降过速； min_lr 最小的允许lr； eps 如果新旧lr之间的差异小与1e-8，则忽略此次更新。例子，如图所示的y轴为lr,x为调整的次序，初始的学习率为0.0009575 则学习率的方程为：lr = 0.0009575 * (0.35)^x import math import matplotlib.pyplot as plt #%matplotlib inline x = 0 o = [] p = [] o.append(0) p.append(0.0009575) while(x < 8): x += 1 y = 0.0009575 * math.pow(0.35,x) o.append(x) p.append(y) print('%d: %.50f' %(x,y)) plt.plot(o,p,c='red',label='test') #分别为x,y轴对应数据,c:color,label plt.legend(loc='best') # 显示label,loc为显示位置(best为系统认为最好的位置) plt.show()
- 我感觉这里面最难的是这几个参数的选择：第一个是初始的学习率（我目前接触的MNIST和下面的图像分类貌似都是0.001，我这里训练调整时才发现自己设置的为0.0009575，这个值是上一个实验忘更改了，但发现结果不错，第一次运行该代码就接近到0.001这么小的损失值）；这里面的乘积系数，以及判断多少次没有减少（增加）后才变换学习率，都是难以估计的。我自己的最好方法是先按默认不变的0.001来训练一下（结合TensorBoard），观察从哪里开始出现问题，就可以从这里来确定次数；而乘积系数，个人感觉还是用上面的代码来获取一个较为平滑且变化极小的数字来作为选择。建议在做这种测试时可以把模型先备份一下，以免浪费过多的时间！
该例子初始学习率为0.0009575，乘积项系数为：0.35，在我的例子中x变化的条件是：累计125次没有减小则x加1；自己训练在第一次lr变化后（从0.0009575变化到0.00011729）损失值慢慢取向于0.001（如第一张图所示），准确率达到69%； import torch import torchvision import torchvision.transforms as transforms import matplotlib.pyplot as plt import numpy as np import torch.nn as nn import torch.nn.functional as F import torch.optim as optim from datetime import datetime from torch.utils.tensorboard import SummaryWriter from torch.optim import * PATH = './cifar_net_tensorboard_net_width_200_and_chang_lr_by_decrease_0_35^x.pth' # 保存模型地址 transform = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform) trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=0) testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform) testloader = torch.utils.data.DataLoader(testset, batch_size=4, shuffle=False, num_workers=0) classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Assuming that we are on a CUDA machine, this should print a CUDA device: print(device) print("获取一些随机训练数据") # get some random training images dataiter = iter(trainloader) images, labels = dataiter.next() # functions to show an image def imshow(img): img = img / 2 + 0.5 # unnormalize npimg = img.numpy() plt.imshow(np.transpose(npimg, (1, 2, 0))) plt.show() # show images imshow(torchvision.utils.make_grid(images)) # print labels print(' '.join('%5s' % classes[labels[j]] for j in range(4))) print("**********************") # 设置一个tensorborad # helper function to show an image # (used in the `plot_classes_preds` function below) def matplotlib_imshow(img, one_channel=False): if one_channel: img = img.mean(dim=0) img = img / 2 + 0.5 # unnormalize npimg = img.cpu().numpy() if 
one_channel: plt.imshow(npimg, cmap="Greys") else: plt.imshow(np.transpose(npimg, (1, 2, 0))) # 设置tensorBoard # default `log_dir` is "runs" - we'll be more specific here writer = SummaryWriter('runs/train') # get some random training images dataiter = iter(trainloader) images, labels = dataiter.next() # create grid of images img_grid = torchvision.utils.make_grid(images) # show images # matplotlib_imshow(img_grid, one_channel=True) imshow(img_grid) # write to tensorboard # writer.add_image('imag_classify', img_grid) # Tracking model training with TensorBoard # helper functions def images_to_probs(net, images): ''' Generates predictions and corresponding probabilities from a trained network and a list of images ''' output = net(images) # convert output probabilities to predicted class _, preds_tensor = torch.max(output, 1) # preds = np.squeeze(preds_tensor.numpy()) preds = np.squeeze(preds_tensor.cpu().numpy()) return preds, [F.softmax(el, dim=0)[i].item() for i, el in zip(preds, output)] def plot_classes_preds(net, images, labels): preds, probs = images_to_probs(net, images) # plot the images in the batch, along with predicted and true labels fig = plt.figure(figsize=(12, 48)) for idx in np.arange(4): ax = fig.add_subplot(1, 4, idx+1, xticks=[], yticks=[]) matplotlib_imshow(images[idx], one_channel=True) ax.set_title("{0}, {1:.1f}%\n(label: {2})".format( classes[preds[idx]], probs[idx] * 100.0, classes[labels[idx]]), color=("green" if preds[idx]==labels[idx].item() else "red")) return fig # class Net(nn.Module): def __init__(self): super(Net, self).__init__() self.conv1 = nn.Conv2d(3, 200, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(200, 16, 5) self.fc1 = nn.Linear(16 * 5 * 5, 120) self.fc2 = nn.Linear(120, 84) self.fc3 = nn.Linear(84, 10) def forward(self, x): x = self.pool(F.relu(self.conv1(x))) x = self.pool(F.relu(self.conv2(x))) x = x.view(-1, 16 * 5 * 5) x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return x net = Net() # # 
把net结构可视化出来 writer.add_graph(net, images) net.to(device) try: net.load_state_dict(torch.load(PATH)) print("Modle file load successful !") except: print("no model file,it will creat a new file!") # 训练 print("训练") criterion = nn.CrossEntropyLoss() # optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9) #在发现loss不再降低或者acc不再提高之后，降低学习率。 optimizer = torch.optim.SGD(net.parameters(), lr=0.0009575, momentum=0.9) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',factor=0.35,verbose=1,min_lr=0.0001,patience=125) startTime = datetime.now() for epoch in range(200): # loop over the dataset multiple times running_loss = 0.0 for i, data in enumerate(trainloader, 0): # get the inputs; data is a list of [inputs, labels] # inputs, labels = data inputs, labels = data[0].to(device), data[1].to(device) # zero the parameter gradients optimizer.zero_grad() #将参数的grad值初始化为0 # forward + backward + optimize outputs = net(inputs) loss = criterion(outputs, labels) #计算损失 loss.backward() # 反向传播 optimizer.step() # 反向传播求梯度 # print statistics running_loss += loss.item() if i % 2000 == 1999: # print every 2000 mini-batches now_loss = running_loss / 2000 # 2000mini-batches 的平均损失率 print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, now_loss)) # now_loss = running_loss / 2000 scheduler.step(now_loss) # 把数据写入tensorflow # ...log the running loss writer.add_scalar('image training loss on net width 200 chang_lr_by_decrease', now_loss, epoch * len(trainloader) + i) writer.add_scalar('learning rate on net width 200 chang_lr_by_decrease', optimizer.state_dict()['param_groups'][0]['lr'], epoch * len(trainloader) + i) running_loss = 0.0 torch.save(net.state_dict(), PATH) print('Finished Training') print("***************************") print("***************************") print("***************************") print("Time taken:", datetime.now() - startTime) print("***************************") print("***************************") print("***************************") #获取一些随机测试数据 
print("获取一些随机测试数据") dataiter = iter(testloader) images, labels = dataiter.next() # print images imshow(torchvision.utils.make_grid(images)) print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4))) # 恢复模型并测试 net = Net() net.load_state_dict(torch.load(PATH)) outputs = net(images) _, predicted = torch.max(outputs, 1) print('Predicted: ', ' '.join('%5s' % classes[predicted[j]] for j in range(4))) print("**********************") print("输出训练得到的准确度") # 输出训练得到的准确度 correct = 0 total = 0 with torch.no_grad(): for data in testloader: images, labels = data outputs = net(images) _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += (predicted == labels).sum().item() print('Accuracy of the network on the 10000 test images: %d %%' % ( 100 * correct / total)) class_correct = list(0. for i in range(10)) class_total = list(0. for i in range(10)) with torch.no_grad(): for data in testloader: images, labels = data outputs = net(images) _, predicted = torch.max(outputs, 1) c = (predicted == labels).squeeze() for i in range(4): label = labels[i] class_correct[label] += c[i].item() class_total[label] += 1 for i in range(10): print('Accuracy of %5s : %2d %%' % ( classes[i], 100 * class_correct[i] / class_total[i]))

自己之前写过一篇关于Pytorch学习率更新的文章。其中依据loss是否升高或降低的次数来动态更新学习率的做法，感觉是个挺好玩的东西；自己弄了好久都设置错误，今天算是搞出来了！

torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=False, threshold=0.0001, threshold_mode='rel', cooldown=0, min_lr=0, eps=1e-08)

在发现loss不再降低或者acc不再提高之后，降低学习率。各参数意义如下：

参数	含义
mode	'min'模式检测metric是否不再减小，'max'模式检测metric是否不再增大；
factor	触发条件后lr*=factor；
patience	允许metric相对当前最优值连续不再减小（或增大）的次数，超过该次数后才降低lr；
verbose	触发条件后print；
threshold	只关注超过阈值的显著变化；
threshold_mode	有rel和abs两种阈值计算模式，rel规则：max模式下如果超过best×(1+threshold)为显著，min模式下如果低于best×(1-threshold)为显著；abs规则：max模式下如果超过best+threshold为显著，min模式下如果低于best-threshold为显著；
cooldown	触发一次条件后，等待一定epoch再进行检测，避免lr下降过速；
min_lr	最小的允许lr；
eps	如果新旧lr之间的差异小于eps（默认1e-8），则忽略此次更新。

例子，如图所示的y轴为lr,x为调整的次序，初始的学习率为0.0009575
则学习率的方程为：lr = 0.0009575 * (0.35)^x

import math 
import matplotlib.pyplot as plt
#%matplotlib inline

# Tabulate lr = 0.0009575 * 0.35**x for x = 0..8 and plot the decay curve.
o = [0]          # x axis: how many times the learning rate has been reduced
p = [0.0009575]  # y axis: learning rate after that many reductions
for x in range(1, 9):
    y = 0.0009575 * math.pow(0.35, x)
    o.append(x)
    p.append(y)
    print('%d:   %.50f' % (x, y))

# Red line labelled 'test'; the legend is placed wherever matplotlib judges best.
plt.plot(o, p, c='red', label='test')
plt.legend(loc='best')
plt.show()

我感觉这里面最难的是这几个参数的选择：第一个是初始的学习率（我目前接触的MNIST和下面的图像分类貌似都是0.001，我这里训练调整时才发现自己设置的为0.0009575，这个值是上一个实验忘更改了，但发现结果不错，第一次运行该代码就接近到0.001这么小的损失值）；这里面的乘积系数，以及判断多少次没有减少（增加）后才变换学习率，都是难以估计的。我自己的最好方法是先按默认不变的0.001来训练一下（结合TensorBoard），观察从哪里开始出现问题，就可以从这里来确定次数；而乘积系数，个人感觉还是用上面的代码来获取一个较为平滑且变化极小的数字来作为选择。建议在做这种测试时可以把模型先备份一下，以免浪费过多的时间！

该例子初始学习率为0.0009575，乘积项系数为：0.35，在我的例子中x变化的条件是：累计125次没有减小则x加1；自己训练在第一次lr变化后（从0.0009575变化到0.00011729）损失值慢慢趋向于0.001（如第一张图所示），准确率达到69%；

import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
from torch.optim import *


# Checkpoint file the trained weights are saved to and restored from.
PATH = './cifar_net_tensorboard_net_width_200_and_chang_lr_by_decrease_0_35^x.pth'

# Convert images to tensors, then normalize every channel with mean 0.5 and
# std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# CIFAR-10 training split, downloaded into ./data on first use and served in
# shuffled mini-batches of 4.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=4, shuffle=True, num_workers=0)

# CIFAR-10 test split, served in a fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=4, shuffle=False, num_workers=0)

# Human-readable class names, indexed by label id.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

# Train on the first CUDA device when one is available, otherwise on the CPU,
# and report which one was picked.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

print("获取一些随机训练数据")
# Draw one shuffled mini-batch from the training loader.
dataiter = iter(trainloader)
# Advance the iterator with the built-in next(): DataLoader iterators in
# current PyTorch releases no longer provide a .next() method.
images, labels = next(dataiter)


# functions to show an image
def imshow(img):
    """Display a normalized image tensor in a matplotlib window.

    Args:
        img: image tensor; it is transposed with axes (1, 2, 0) before
            plotting, i.e. a channel-first layout is assumed.

    The tensor is un-normalized with ``img / 2 + 0.5`` (the inverse of a
    mean-0.5 / std-0.5 normalization), copied to host memory, transposed and
    shown. ``plt.show()`` is called, so the call blocks until the window is
    closed when an interactive backend is in use.
    """
    img = img / 2 + 0.5     # unnormalize
    # .cpu() is a no-op for CPU tensors and lets this helper also accept CUDA
    # tensors, the same way matplotlib_imshow does.
    npimg = img.cpu().numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()


# Show the sampled batch as a single image grid ...
batch_grid = torchvision.utils.make_grid(images)
imshow(batch_grid)
# ... and print the class name of each of its 4 images, right-aligned to 5 chars.
label_names = ['%5s' % classes[labels[j]] for j in range(4)]
print(' '.join(label_names))
print("**********************")

# Helper for the TensorBoard figures: draws an image on the current axes
# (used in the `plot_classes_preds` function below).
def matplotlib_imshow(img, one_channel=False):
    """Draw an image tensor on the current matplotlib axes without showing it.

    Args:
        img: normalized image tensor; un-normalized here with ``img / 2 + 0.5``.
        one_channel: when true, the tensor is averaged over dim 0 and drawn
            with the "Greys" colormap; otherwise it is transposed with axes
            (1, 2, 0) and drawn as-is.
    """
    if not one_channel:
        pixels = (img / 2 + 0.5).cpu().numpy()
        plt.imshow(np.transpose(pixels, (1, 2, 0)))
        return

    # Collapse dim 0 to a single plane before un-normalizing.
    gray = img.mean(dim=0) / 2 + 0.5
    plt.imshow(gray.cpu().numpy(), cmap="Greys")

# Set up TensorBoard.
# default `log_dir` is "runs" - we'll be more specific here
writer = SummaryWriter('runs/train')

# get some random training images
dataiter = iter(trainloader)
# Advance the iterator with the built-in next(): DataLoader iterators in
# current PyTorch releases no longer provide a .next() method.
images, labels = next(dataiter)

# create grid of images
img_grid = torchvision.utils.make_grid(images)

# show images
# (alternative, single-channel rendering: matplotlib_imshow(img_grid, one_channel=True))
imshow(img_grid)

# To also store the grid in TensorBoard:
# writer.add_image('imag_classify', img_grid)

# Tracking model training with TensorBoard
# helper functions

def images_to_probs(net, images):
    '''
    Return the predicted class and its softmax probability for each image.

    Args:
        net: callable that maps the batch ``images`` to a 2-D tensor of
            per-class scores (one row per image).
        images: batch passed unchanged to ``net``.

    Returns:
        A tuple ``(preds, probs)``: ``preds`` is a 1-D numpy array holding the
        arg-max class index of every row, and ``probs`` is a list of floats
        with the softmax probability of that predicted class.
    '''
    output = net(images)
    # the index of the largest score along dim 1 is the predicted class
    _, preds_tensor = torch.max(output, 1)
    # np.squeeze collapses a batch of one into a 0-d array, which zip() below
    # cannot iterate; atleast_1d restores the batch axis in that case and
    # leaves larger batches untouched.
    preds = np.atleast_1d(np.squeeze(preds_tensor.cpu().numpy()))
    return preds, [F.softmax(el, dim=0)[i].item() for i, el in zip(preds, output)]


def plot_classes_preds(net, images, labels):
    """Build a figure showing the first 4 images with predicted and true class.

    Each subplot title reads "<predicted>, <probability>%" over
    "(label: <true>)" and is colored green when the prediction matches the
    label, red otherwise.

    Args:
        net: model forwarded to ``images_to_probs``.
        images: batch of image tensors; entries 0..3 are drawn.
        labels: tensor of true class ids for the batch.

    Returns:
        The matplotlib figure containing the 1 x 4 row of subplots.
    """
    preds, probs = images_to_probs(net, images)
    fig = plt.figure(figsize=(12, 48))
    for idx in range(4):
        ax = fig.add_subplot(1, 4, idx + 1, xticks=[], yticks=[])
        matplotlib_imshow(images[idx], one_channel=True)
        title = "{0}, {1:.1f}%\n(label: {2})".format(
            classes[preds[idx]],
            probs[idx] * 100.0,
            classes[labels[idx]])
        is_correct = preds[idx] == labels[idx].item()
        ax.set_title(title, color="green" if is_correct else "red")
    return fig

#

class Net(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    The first convolution is 200 channels wide. The flatten step hard-codes
    16 * 5 * 5 features per sample, which is what the two 5x5 convolutions
    and two 2x2 poolings leave for a 3 x 32 x 32 input. The final layer
    produces 10 raw class scores.
    """

    def __init__(self):
        super().__init__()
        # Layers are created in a fixed order so that seeded initialization
        # is reproducible.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=200, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=200, out_channels=16, kernel_size=5)
        self.fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Return the raw class scores for the image batch ``x``."""
        features = self.pool(F.relu(self.conv1(x)))
        features = self.pool(F.relu(self.conv2(features)))
        # Flatten everything but the batch axis for the linear layers.
        flat = features.view(-1, 16 * 5 * 5)
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

net = Net()
# Log the network structure to TensorBoard, traced with the sample batch
# (done before the model is moved to `device`).
writer.add_graph(net, images)
net.to(device)

# Resume from an earlier checkpoint when one exists; otherwise start fresh.
try:
    # map_location remaps the stored tensors onto the device in use, so a
    # checkpoint written on a GPU machine can still be restored on CPU.
    net.load_state_dict(torch.load(PATH, map_location=device))
    print("Modle file load successful !")
except FileNotFoundError:
    print("no model file,it will creat a new file!")
except Exception as err:
    # Still best-effort, but say why the checkpoint was rejected instead of
    # silently ignoring it (it will be overwritten when training finishes).
    print("model file could not be loaded (%s), training from scratch" % err)

# Training setup
print("训练")
criterion = nn.CrossEntropyLoss()

# Fixed-rate baseline for comparison:
#   optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

# Lower the learning rate once the monitored loss stops decreasing:
# lr is multiplied by 0.35 after the patience of 125 scheduler steps is
# exhausted, and is never reduced below 0.0001.
optimizer = torch.optim.SGD(net.parameters(), lr=0.0009575, momentum=0.9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.35,
    patience=125,
    min_lr=0.0001,
    verbose=1,
)

# Train for 200 epochs. Every 2000 mini-batches the mean loss of that window
# is printed, fed to the plateau scheduler and logged to TensorBoard together
# with the current learning rate.
startTime = datetime.now()
for epoch in range(200):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # data is a list of [inputs, labels]; move both to the training device
        inputs, labels = data[0].to(device), data[1].to(device)

        # reset the gradients accumulated by the previous step
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)  # compute the loss
        loss.backward()                    # back-propagate to get gradients
        optimizer.step()                   # update the parameters

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # every 2000 mini-batches
            now_loss = running_loss / 2000  # mean loss over the window
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, now_loss))

            # The scheduler is stepped once per window, not once per epoch,
            # so its patience is counted in 2000-batch windows.
            scheduler.step(now_loss)

            # Log loss and learning rate at the same global mini-batch index.
            global_step = epoch * len(trainloader) + i
            writer.add_scalar('image training loss on net width 200 chang_lr_by_decrease',
                              now_loss,
                              global_step)
            writer.add_scalar('learning rate on net width 200 chang_lr_by_decrease',
                              optimizer.param_groups[0]['lr'],
                              global_step)

            running_loss = 0.0

# Push any buffered TensorBoard events to disk so the last points are not
# lost, then save the trained weights.
writer.flush()
torch.save(net.state_dict(), PATH)

# Report completion and the wall-clock training time, framed by two
# three-line banners.
print('Finished Training')
banner = "***************************"
for _ in range(3):
    print(banner)
print("Time taken:", datetime.now() - startTime)
for _ in range(3):
    print(banner)



# Take the first batch of the test loader.
print("获取一些随机测试数据")
dataiter = iter(testloader)
# Advance the iterator with the built-in next(): DataLoader iterators in
# current PyTorch releases no longer provide a .next() method.
images, labels = next(dataiter)

# show the images and their true labels
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]] for j in range(4)))

# Restore the trained weights into a fresh model and test it. The new model
# lives on the CPU, so the checkpoint is mapped there explicitly; this also
# works when the weights were saved from a CUDA model.
net = Net()
net.load_state_dict(torch.load(PATH, map_location='cpu'))

outputs = net(images)

# the class with the highest score is the prediction
_, predicted = torch.max(outputs, 1)

print('Predicted: ', ' '.join('%5s' % classes[predicted[j]]
                              for j in range(4)))

print("**********************")
print("输出训练得到的准确度")
# Overall accuracy: count correct predictions over the whole test loader.
correct = 0
total = 0
with torch.no_grad():  # inference only, no gradients needed
    for images, labels in testloader:
        outputs = net(images)
        # index of the highest score per row is the predicted class
        _, predicted = outputs.data.max(1)
        matches = (predicted == labels).sum().item()
        total += labels.size(0)
        correct += matches

accuracy_percent = 100 * correct / total
print('Accuracy of the network on the 10000 test images: %d %%' % (
    accuracy_percent))

# Per-class accuracy: tally hits and totals per true label over the test set.
class_correct = [0.0] * len(classes)
class_total = [0.0] * len(classes)
with torch.no_grad():  # inference only, no gradients needed
    for data in testloader:
        images, labels = data
        outputs = net(images)
        _, predicted = torch.max(outputs, 1)
        hits = predicted == labels
        # Walk over every sample actually present in the batch rather than a
        # hard-coded 4, so no sample is skipped with a larger batch size and
        # nothing is indexed out of range with a smaller one.
        for label, hit in zip(labels.tolist(), hits.tolist()):
            class_correct[label] += hit
            class_total[label] += 1

for i in range(10):
    print('Accuracy of %5s : %2d %%' % (
        classes[i], 100 * class_correct[i] / class_total[i]))

要不赞赏一下?

微信

支付宝

PayPal

Bitcoin

除非特别说明，本博客所有作品均采用知识共享署名-非商业性使用-禁止演绎 4.0 国际许可协议进行许可。转载请注明转自-
https://www.emperinter.info/2020/08/05/change-leaning-rate-by-reducelronplateau-in-pytorch/

阿里云国际版	20美元
Vultr	10美元
搬瓦工 \| Bandwagon	应该有折扣吧？
Just My Socks	JMS9272283 【注意手动复制去跳转】
域名 \| namesilo	`emperinter`(1美元)
币安	币安

自己之前写过一个Pytorch学习率更新,其中感觉依据是否loss升高或降低的次数来动态更新学习率，感觉是个挺好玩的东西，自己弄了好久都设置错误，今天算是搞出来了！

要不赞赏一下?

要不聊聊？

YouTube | B站

微信公众号

My Project

My Github Contributions

优惠码

近期文章

自己之前写过一个Pytorch学习率更新,其中感觉依据是否loss升高或降低的次数来动态更新学习率，感觉是个挺好玩的东西，自己弄了好久都设置错误，今天算是搞出来了！

相关文章：

要不赞赏一下?

要不聊聊？

YouTube | B站

微信公众号

My Project

My Github Contributions

优惠码

近期文章