DAY 45 Tensorboard使用介绍-EW帮帮网

DAY 45 Tensorboard使用介绍

1.tensorboard的发展历史和原理

2.tensorboard的常见操作

log_dir = 'runs/cifar10_mlp_experiment'
if os.path.exists(log_dir):
    i = 1
    while os.path.exists(f"{log_dir}_{i}"):
        i += 1
    log_dir = f"{log_dir}_{i}"
writer = SummaryWriter(log_dir) #关键入口，用于写入数据到日志目录

# 记录每个 Batch 的损失和准确率
writer.add_scalar('Train/Batch_Loss', batch_loss, global_step)
writer.add_scalar('Train/Batch_Accuracy', batch_acc, global_step)

# 记录每个 Epoch 的训练指标
writer.add_scalar('Train/Epoch_Loss', epoch_train_loss, epoch)
writer.add_scalar('Train/Epoch_Accuracy', epoch_train_acc, epoch)

dataiter = iter(train_loader)
images, labels = next(dataiter)
images = images.to(device)
writer.add_graph(model, images)  # 通过真实输入样本生成模型计算图

# 可视化原始训练图像
img_grid = torchvision.utils.make_grid(images[:8].cpu()) # 将多张图像拼接成网格状（方便可视化），将前8张图像拼接成一个网格
writer.add_image('原始训练图像', img_grid)

# 可视化错误预测样本（训练结束后）
wrong_img_grid = torchvision.utils.make_grid(wrong_images[:display_count])
writer.add_image('错误预测样本', wrong_img_grid)

if (batch_idx + 1) % 500 == 0:
    for name, param in model.named_parameters():
        writer.add_histogram(f'weights/{name}', param, global_step)  # 权重分布
        if param.grad is not None:
            writer.add_histogram(f'grads/{name}', param.grad, global_step)  # 梯度分布

3.tensorboard在cifar上的实战：MLP和CNN模型

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import matplotlib.pyplot as plt
import os

# 设置随机种子以确保结果可复现
torch.manual_seed(42)
np.random.seed(42)

# 1. 数据预处理
transform = transforms.Compose([
    transforms.ToTensor(),                # 转换为张量
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # 标准化处理
])

# 2. 加载CIFAR-10数据集
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform
)

test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    transform=transform
)

# 3. 创建数据加载器
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# CIFAR-10的类别名称
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# 4. 定义MLP模型（适应CIFAR-10的输入尺寸）
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.flatten = nn.Flatten()  # 将3x32x32的图像展平为3072维向量
        self.layer1 = nn.Linear(3072, 512)  # 第一层：3072个输入，512个神经元
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)  # 添加Dropout防止过拟合
        self.layer2 = nn.Linear(512, 256)  # 第二层：512个输入，256个神经元
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        self.layer3 = nn.Linear(256, 10)  # 输出层：10个类别
        
    def forward(self, x):
        # 第一步：将输入图像展平为一维向量
        x = self.flatten(x)  # 输入尺寸: [batch_size, 3, 32, 32] → [batch_size, 3072]
        
        # 第一层全连接 + 激活 + Dropout
        x = self.layer1(x)   # 线性变换: [batch_size, 3072] → [batch_size, 512]
        x = self.relu1(x)    # 应用ReLU激活函数
        x = self.dropout1(x) # 训练时随机丢弃部分神经元输出
        
        # 第二层全连接 + 激活 + Dropout
        x = self.layer2(x)   # 线性变换: [batch_size, 512] → [batch_size, 256]
        x = self.relu2(x)    # 应用ReLU激活函数
        x = self.dropout2(x) # 训练时随机丢弃部分神经元输出
        
        # 第三层（输出层）全连接
        x = self.layer3(x)   # 线性变换: [batch_size, 256] → [batch_size, 10]
        
        return x  # 返回未经过Softmax的logits

# 检查GPU是否可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 初始化模型
model = MLP()
model = model.to(device)  # 将模型移至GPU（如果可用）

criterion = nn.CrossEntropyLoss()  # 交叉熵损失函数
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam优化器

# 创建TensorBoard的SummaryWriter，指定日志保存目录
log_dir = 'runs/cifar10_mlp_experiment'
# 如果目录已存在，添加后缀避免覆盖
if os.path.exists(log_dir):
    i = 1
    while os.path.exists(f"{log_dir}_{i}"):
        i += 1
    log_dir = f"{log_dir}_{i}"
writer = SummaryWriter(log_dir)

# 5. 训练模型（使用TensorBoard记录各种信息）
def train(model, train_loader, test_loader, criterion, optimizer, device, epochs, writer):
    model.train()  # 设置为训练模式
    
    # 记录训练开始时间，用于计算训练速度
    global_step = 0
    
    # 可视化模型结构
    dataiter = iter(train_loader)
    images, labels = next(dataiter)
    images = images.to(device)
    writer.add_graph(model, images)  # 添加模型图
    
    # 可视化原始图像样本
    img_grid = torchvision.utils.make_grid(images[:8].cpu())
    writer.add_image('原始训练图像', img_grid)
    
    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)  # 移至GPU
            
            optimizer.zero_grad()  # 梯度清零
            output = model(data)  # 前向传播
            loss = criterion(output, target)  # 计算损失
            loss.backward()  # 反向传播
            optimizer.step()  # 更新参数
            
            # 统计准确率和损失
            running_loss += loss.item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
            
            # 每100个批次记录一次信息到TensorBoard
            if (batch_idx + 1) % 100 == 0:
                batch_loss = loss.item()
                batch_acc = 100. * correct / total
                
                # 记录标量数据（损失、准确率）
                writer.add_scalar('Train/Batch_Loss', batch_loss, global_step)
                writer.add_scalar('Train/Batch_Accuracy', batch_acc, global_step)
                
                # 记录学习率
                writer.add_scalar('Train/Learning_Rate', optimizer.param_groups[0]['lr'], global_step)
                
                # 每500个批次记录一次直方图（权重和梯度）
                if (batch_idx + 1) % 500 == 0:
                    for name, param in model.named_parameters():
                        writer.add_histogram(f'weights/{name}', param, global_step)
                        if param.grad is not None:
                            writer.add_histogram(f'grads/{name}', param.grad, global_step)
                
                print(f'Epoch: {epoch+1}/{epochs} | Batch: {batch_idx+1}/{len(train_loader)} '
                      f'| 单Batch损失: {batch_loss:.4f} | 累计平均损失: {running_loss/(batch_idx+1):.4f}')
            
            global_step += 1
        
        # 计算当前epoch的平均训练损失和准确率
        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_acc = 100. * correct / total
        
        # 记录每个epoch的训练损失和准确率
        writer.add_scalar('Train/Epoch_Loss', epoch_train_loss, epoch)
        writer.add_scalar('Train/Epoch_Accuracy', epoch_train_acc, epoch)
        
        # 测试阶段
        model.eval()  # 设置为评估模式
        test_loss = 0
        correct_test = 0
        total_test = 0
        
        # 用于存储预测错误的样本
        wrong_images = []
        wrong_labels = []
        wrong_preds = []
        
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += criterion(output, target).item()
                _, predicted = output.max(1)
                total_test += target.size(0)
                correct_test += predicted.eq(target).sum().item()
                
                # 收集预测错误的样本
                wrong_mask = (predicted != target).cpu()
                if wrong_mask.sum() > 0:
                    wrong_batch_images = data[wrong_mask].cpu()
                    wrong_batch_labels = target[wrong_mask].cpu()
                    wrong_batch_preds = predicted[wrong_mask].cpu()
                    
                    wrong_images.extend(wrong_batch_images)
                    wrong_labels.extend(wrong_batch_labels)
                    wrong_preds.extend(wrong_batch_preds)
        
        epoch_test_loss = test_loss / len(test_loader)
        epoch_test_acc = 100. * correct_test / total_test
        
        # 记录每个epoch的测试损失和准确率
        writer.add_scalar('Test/Loss', epoch_test_loss, epoch)
        writer.add_scalar('Test/Accuracy', epoch_test_acc, epoch)
        
        # 计算并记录训练速度（每秒处理的样本数）
        # 这里简化处理，假设每个epoch的时间相同
        samples_per_epoch = len(train_loader.dataset)
        # 实际应用中应该使用time.time()来计算真实时间
        
        print(f'Epoch {epoch+1}/{epochs} 完成 | 训练准确率: {epoch_train_acc:.2f}% | 测试准确率: {epoch_test_acc:.2f}%')
        
        # 可视化预测错误的样本（只在最后一个epoch进行）
        if epoch == epochs - 1 and len(wrong_images) > 0:
            # 最多显示8个错误样本
            display_count = min(8, len(wrong_images))
            wrong_img_grid = torchvision.utils.make_grid(wrong_images[:display_count])
            
            # 创建错误预测的标签文本
            wrong_text = []
            for i in range(display_count):
                true_label = classes[wrong_labels[i]]
                pred_label = classes[wrong_preds[i]]
                wrong_text.append(f'True: {true_label}, Pred: {pred_label}')
            
            writer.add_image('错误预测样本', wrong_img_grid)
            writer.add_text('错误预测标签', '\n'.join(wrong_text), epoch)
    
    # 关闭TensorBoard写入器
    writer.close()
    
    return epoch_test_acc  # 返回最终测试准确率

# 6. 执行训练和测试
epochs = 20  # 训练轮次
print("开始训练模型...")
print(f"TensorBoard日志保存在: {log_dir}")
print("训练完成后，使用命令 `tensorboard --logdir=runs` 启动TensorBoard查看可视化结果")

final_accuracy = train(model, train_loader, test_loader, criterion, optimizer, device, epochs, writer)
print(f"训练完成！最终测试准确率: {final_accuracy:.2f}%")

Files already downloaded and verified
开始训练模型...
TensorBoard日志保存在: runs/cifar10_mlp_experiment_1
训练完成后，使用命令 `tensorboard --logdir=runs` 启动TensorBoard查看可视化结果
Epoch: 1/20 | Batch: 100/782 | 单Batch损失: 1.8327 | 累计平均损失: 1.9410
Epoch: 1/20 | Batch: 200/782 | 单Batch损失: 1.8588 | 累计平均损失: 1.8519
Epoch: 1/20 | Batch: 300/782 | 单Batch损失: 1.6719 | 累计平均损失: 1.8029
Epoch: 1/20 | Batch: 400/782 | 单Batch损失: 1.7609 | 累计平均损失: 1.7754
Epoch: 1/20 | Batch: 500/782 | 单Batch损失: 1.6642 | 累计平均损失: 1.7508
Epoch: 1/20 | Batch: 600/782 | 单Batch损失: 1.6564 | 累计平均损失: 1.7330
Epoch: 1/20 | Batch: 700/782 | 单Batch损失: 1.5870 | 累计平均损失: 1.7199
Epoch 1/20 完成 | 训练准确率: 39.23% | 测试准确率: 45.11%
Epoch: 2/20 | Batch: 100/782 | 单Batch损失: 1.4987 | 累计平均损失: 1.5227
Epoch: 2/20 | Batch: 200/782 | 单Batch损失: 1.3297 | 累计平均损失: 1.4918
Epoch: 2/20 | Batch: 300/782 | 单Batch损失: 1.3329 | 累计平均损失: 1.4820
Epoch: 2/20 | Batch: 400/782 | 单Batch损失: 1.5894 | 累计平均损失: 1.4701
Epoch: 2/20 | Batch: 500/782 | 单Batch损失: 1.3843 | 累计平均损失: 1.4710
Epoch: 2/20 | Batch: 600/782 | 单Batch损失: 1.3671 | 累计平均损失: 1.4662
Epoch: 2/20 | Batch: 700/782 | 单Batch损失: 1.4408 | 累计平均损失: 1.4614
Epoch 2/20 完成 | 训练准确率: 48.51% | 测试准确率: 49.87%
Epoch: 3/20 | Batch: 100/782 | 单Batch损失: 1.3722 | 累计平均损失: 1.3401
Epoch: 3/20 | Batch: 200/782 | 单Batch损失: 1.8139 | 累计平均损失: 1.3486
Epoch: 3/20 | Batch: 300/782 | 单Batch损失: 1.1994 | 累计平均损失: 1.3457
Epoch: 3/20 | Batch: 400/782 | 单Batch损失: 1.1896 | 累计平均损失: 1.3403
Epoch: 3/20 | Batch: 500/782 | 单Batch损失: 1.4191 | 累计平均损失: 1.3419
Epoch: 3/20 | Batch: 600/782 | 单Batch损失: 1.4218 | 累计平均损失: 1.3475
Epoch: 3/20 | Batch: 700/782 | 单Batch损失: 1.4627 | 累计平均损失: 1.3441
Epoch 3/20 完成 | 训练准确率: 52.43% | 测试准确率: 51.27%
Epoch: 4/20 | Batch: 100/782 | 单Batch损失: 1.3596 | 累计平均损失: 1.2346
Epoch: 4/20 | Batch: 200/782 | 单Batch损失: 1.3270 | 累计平均损失: 1.2381
Epoch: 4/20 | Batch: 300/782 | 单Batch损失: 1.2478 | 累计平均损失: 1.2434
Epoch: 4/20 | Batch: 400/782 | 单Batch损失: 1.3861 | 累计平均损失: 1.2422
Epoch: 4/20 | Batch: 500/782 | 单Batch损失: 1.3478 | 累计平均损失: 1.2422
Epoch: 4/20 | Batch: 600/782 | 单Batch损失: 1.1521 | 累计平均损失: 1.2447
Epoch: 4/20 | Batch: 700/782 | 单Batch损失: 1.2833 | 累计平均损失: 1.2469
Epoch 4/20 完成 | 训练准确率: 55.63% | 测试准确率: 51.32%
Epoch: 5/20 | Batch: 100/782 | 单Batch损失: 0.9809 | 累计平均损失: 1.1235
Epoch: 5/20 | Batch: 200/782 | 单Batch损失: 1.0800 | 累计平均损失: 1.1295
Epoch: 5/20 | Batch: 300/782 | 单Batch损失: 1.0129 | 累计平均损失: 1.1372
Epoch: 5/20 | Batch: 400/782 | 单Batch损失: 1.0918 | 累计平均损失: 1.1459
Epoch: 5/20 | Batch: 500/782 | 单Batch损失: 1.3155 | 累计平均损失: 1.1532
Epoch: 5/20 | Batch: 600/782 | 单Batch损失: 1.1727 | 累计平均损失: 1.1588
Epoch: 5/20 | Batch: 700/782 | 单Batch损失: 1.2888 | 累计平均损失: 1.1649
Epoch 5/20 完成 | 训练准确率: 58.74% | 测试准确率: 52.74%
Epoch: 6/20 | Batch: 100/782 | 单Batch损失: 1.1855 | 累计平均损失: 1.0499
Epoch: 6/20 | Batch: 200/782 | 单Batch损失: 0.8994 | 累计平均损失: 1.0567
Epoch: 6/20 | Batch: 300/782 | 单Batch损失: 1.2460 | 累计平均损失: 1.0602
Epoch: 6/20 | Batch: 400/782 | 单Batch损失: 1.1033 | 累计平均损失: 1.0660
Epoch: 6/20 | Batch: 500/782 | 单Batch损失: 0.9182 | 累计平均损失: 1.0679
Epoch: 6/20 | Batch: 600/782 | 单Batch损失: 1.4116 | 累计平均损失: 1.0745
Epoch: 6/20 | Batch: 700/782 | 单Batch损失: 1.0211 | 累计平均损失: 1.0814
Epoch 6/20 完成 | 训练准确率: 61.37% | 测试准确率: 52.98%
Epoch: 7/20 | Batch: 100/782 | 单Batch损失: 1.0082 | 累计平均损失: 0.9592
Epoch: 7/20 | Batch: 200/782 | 单Batch损失: 1.0255 | 累计平均损失: 0.9742
Epoch: 7/20 | Batch: 300/782 | 单Batch损失: 1.1416 | 累计平均损失: 0.9837
Epoch: 7/20 | Batch: 400/782 | 单Batch损失: 0.9732 | 累计平均损失: 0.9875
Epoch: 7/20 | Batch: 500/782 | 单Batch损失: 1.1387 | 累计平均损失: 0.9947
Epoch: 7/20 | Batch: 600/782 | 单Batch损失: 0.8657 | 累计平均损失: 0.9994
Epoch: 7/20 | Batch: 700/782 | 单Batch损失: 0.9666 | 累计平均损失: 1.0046
Epoch 7/20 完成 | 训练准确率: 64.09% | 测试准确率: 52.69%
Epoch: 8/20 | Batch: 100/782 | 单Batch损失: 0.6081 | 累计平均损失: 0.8927
Epoch: 8/20 | Batch: 200/782 | 单Batch损失: 0.6484 | 累计平均损失: 0.8922
Epoch: 8/20 | Batch: 300/782 | 单Batch损失: 0.8360 | 累计平均损失: 0.9001
Epoch: 8/20 | Batch: 400/782 | 单Batch损失: 1.1883 | 累计平均损失: 0.9150
Epoch: 8/20 | Batch: 500/782 | 单Batch损失: 0.9597 | 累计平均损失: 0.9244
Epoch: 8/20 | Batch: 600/782 | 单Batch损失: 0.8802 | 累计平均损失: 0.9273
Epoch: 8/20 | Batch: 700/782 | 单Batch损失: 0.9168 | 累计平均损失: 0.9295
Epoch 8/20 完成 | 训练准确率: 66.68% | 测试准确率: 52.01%
Epoch: 9/20 | Batch: 100/782 | 单Batch损失: 0.8491 | 累计平均损失: 0.7973
Epoch: 9/20 | Batch: 200/782 | 单Batch损失: 0.8207 | 累计平均损失: 0.8219
Epoch: 9/20 | Batch: 300/782 | 单Batch损失: 0.9952 | 累计平均损失: 0.8260
Epoch: 9/20 | Batch: 400/782 | 单Batch损失: 0.8664 | 累计平均损失: 0.8395
Epoch: 9/20 | Batch: 500/782 | 单Batch损失: 0.8573 | 累计平均损失: 0.8478
Epoch: 9/20 | Batch: 600/782 | 单Batch损失: 1.2844 | 累计平均损失: 0.8503
Epoch: 9/20 | Batch: 700/782 | 单Batch损失: 0.7931 | 累计平均损失: 0.8556
Epoch 9/20 完成 | 训练准确率: 69.11% | 测试准确率: 53.24%
Epoch: 10/20 | Batch: 100/782 | 单Batch损失: 0.6661 | 累计平均损失: 0.7471
Epoch: 10/20 | Batch: 200/782 | 单Batch损失: 0.7758 | 累计平均损失: 0.7521
Epoch: 10/20 | Batch: 300/782 | 单Batch损失: 1.1638 | 累计平均损失: 0.7680
Epoch: 10/20 | Batch: 400/782 | 单Batch损失: 0.7825 | 累计平均损失: 0.7754
Epoch: 10/20 | Batch: 500/782 | 单Batch损失: 0.6984 | 累计平均损失: 0.7834
Epoch: 10/20 | Batch: 600/782 | 单Batch损失: 0.7199 | 累计平均损失: 0.7880
Epoch: 10/20 | Batch: 700/782 | 单Batch损失: 0.9765 | 累计平均损失: 0.7918
Epoch 10/20 完成 | 训练准确率: 71.70% | 测试准确率: 53.59%
Epoch: 11/20 | Batch: 100/782 | 单Batch损失: 0.7485 | 累计平均损失: 0.6873
Epoch: 11/20 | Batch: 200/782 | 单Batch损失: 0.6853 | 累计平均损失: 0.6817
Epoch: 11/20 | Batch: 300/782 | 单Batch损失: 0.7594 | 累计平均损失: 0.6880
Epoch: 11/20 | Batch: 400/782 | 单Batch损失: 0.9249 | 累计平均损失: 0.7001
Epoch: 11/20 | Batch: 500/782 | 单Batch损失: 0.5742 | 累计平均损失: 0.7060
Epoch: 11/20 | Batch: 600/782 | 单Batch损失: 0.7716 | 累计平均损失: 0.7190
Epoch: 11/20 | Batch: 700/782 | 单Batch损失: 0.6123 | 累计平均损失: 0.7273
Epoch 11/20 完成 | 训练准确率: 73.83% | 测试准确率: 52.58%
Epoch: 12/20 | Batch: 100/782 | 单Batch损失: 0.6315 | 累计平均损失: 0.6275
Epoch: 12/20 | Batch: 200/782 | 单Batch损失: 0.5326 | 累计平均损失: 0.6286
Epoch: 12/20 | Batch: 300/782 | 单Batch损失: 0.5623 | 累计平均损失: 0.6369
Epoch: 12/20 | Batch: 400/782 | 单Batch损失: 0.7911 | 累计平均损失: 0.6473
Epoch: 12/20 | Batch: 500/782 | 单Batch损失: 0.6620 | 累计平均损失: 0.6545
Epoch: 12/20 | Batch: 600/782 | 单Batch损失: 0.5583 | 累计平均损失: 0.6637
Epoch: 12/20 | Batch: 700/782 | 单Batch损失: 0.6010 | 累计平均损失: 0.6709
Epoch 12/20 完成 | 训练准确率: 75.82% | 测试准确率: 52.88%
Epoch: 13/20 | Batch: 100/782 | 单Batch损失: 0.7061 | 累计平均损失: 0.5733
Epoch: 13/20 | Batch: 200/782 | 单Batch损失: 0.5555 | 累计平均损失: 0.5713
Epoch: 13/20 | Batch: 300/782 | 单Batch损失: 0.3972 | 累计平均损失: 0.5712
Epoch: 13/20 | Batch: 400/782 | 单Batch损失: 0.8246 | 累计平均损失: 0.5824
Epoch: 13/20 | Batch: 500/782 | 单Batch损失: 0.4577 | 累计平均损失: 0.5935
Epoch: 13/20 | Batch: 600/782 | 单Batch损失: 0.7397 | 累计平均损失: 0.5992
Epoch: 13/20 | Batch: 700/782 | 单Batch损失: 0.6297 | 累计平均损失: 0.6090
Epoch 13/20 完成 | 训练准确率: 78.18% | 测试准确率: 53.19%
Epoch: 14/20 | Batch: 100/782 | 单Batch损失: 0.5944 | 累计平均损失: 0.5333
Epoch: 14/20 | Batch: 200/782 | 单Batch损失: 0.5172 | 累计平均损失: 0.5252
Epoch: 14/20 | Batch: 300/782 | 单Batch损失: 0.5107 | 累计平均损失: 0.5313
Epoch: 14/20 | Batch: 400/782 | 单Batch损失: 0.4882 | 累计平均损失: 0.5414
Epoch: 14/20 | Batch: 500/782 | 单Batch损失: 0.4880 | 累计平均损失: 0.5560
Epoch: 14/20 | Batch: 600/782 | 单Batch损失: 0.6760 | 累计平均损失: 0.5617
Epoch: 14/20 | Batch: 700/782 | 单Batch损失: 0.5190 | 累计平均损失: 0.5651
Epoch 14/20 完成 | 训练准确率: 79.50% | 测试准确率: 53.00%
Epoch: 15/20 | Batch: 100/782 | 单Batch损失: 0.3614 | 累计平均损失: 0.4667
Epoch: 15/20 | Batch: 200/782 | 单Batch损失: 0.5322 | 累计平均损失: 0.4657
Epoch: 15/20 | Batch: 300/782 | 单Batch损失: 0.5792 | 累计平均损失: 0.4838
Epoch: 15/20 | Batch: 400/782 | 单Batch损失: 0.6562 | 累计平均损失: 0.4975
Epoch: 15/20 | Batch: 500/782 | 单Batch损失: 0.5755 | 累计平均损失: 0.5062
Epoch: 15/20 | Batch: 600/782 | 单Batch损失: 0.8258 | 累计平均损失: 0.5142
Epoch: 15/20 | Batch: 700/782 | 单Batch损失: 0.4823 | 累计平均损失: 0.5194
Epoch 15/20 完成 | 训练准确率: 81.21% | 测试准确率: 52.39%
Epoch: 16/20 | Batch: 100/782 | 单Batch损失: 0.3308 | 累计平均损失: 0.4314
Epoch: 16/20 | Batch: 200/782 | 单Batch损失: 0.3376 | 累计平均损失: 0.4463
Epoch: 16/20 | Batch: 300/782 | 单Batch损失: 0.5752 | 累计平均损失: 0.4539
Epoch: 16/20 | Batch: 400/782 | 单Batch损失: 0.4853 | 累计平均损失: 0.4700
Epoch: 16/20 | Batch: 500/782 | 单Batch损失: 0.5356 | 累计平均损失: 0.4794
Epoch: 16/20 | Batch: 600/782 | 单Batch损失: 0.6754 | 累计平均损失: 0.4817
Epoch: 16/20 | Batch: 700/782 | 单Batch损失: 0.4735 | 累计平均损失: 0.4875
Epoch 16/20 完成 | 训练准确率: 82.41% | 测试准确率: 53.40%
Epoch: 17/20 | Batch: 100/782 | 单Batch损失: 0.3944 | 累计平均损失: 0.4055
Epoch: 17/20 | Batch: 200/782 | 单Batch损失: 0.3707 | 累计平均损失: 0.4074
Epoch: 17/20 | Batch: 300/782 | 单Batch损失: 0.5363 | 累计平均损失: 0.4122
Epoch: 17/20 | Batch: 400/782 | 单Batch损失: 0.3647 | 累计平均损失: 0.4147
Epoch: 17/20 | Batch: 500/782 | 单Batch损失: 0.4949 | 累计平均损失: 0.4241
Epoch: 17/20 | Batch: 600/782 | 单Batch损失: 0.2563 | 累计平均损失: 0.4316
Epoch: 17/20 | Batch: 700/782 | 单Batch损失: 0.3814 | 累计平均损失: 0.4394
Epoch 17/20 完成 | 训练准确率: 84.10% | 测试准确率: 51.73%
Epoch: 18/20 | Batch: 100/782 | 单Batch损失: 0.4645 | 累计平均损失: 0.3851
Epoch: 18/20 | Batch: 200/782 | 单Batch损失: 0.2752 | 累计平均损失: 0.3906
Epoch: 18/20 | Batch: 300/782 | 单Batch损失: 0.4404 | 累计平均损失: 0.3927
Epoch: 18/20 | Batch: 400/782 | 单Batch损失: 0.4450 | 累计平均损失: 0.4015
Epoch: 18/20 | Batch: 500/782 | 单Batch损失: 0.4082 | 累计平均损失: 0.4158
Epoch: 18/20 | Batch: 600/782 | 单Batch损失: 0.3982 | 累计平均损失: 0.4203
Epoch: 18/20 | Batch: 700/782 | 单Batch损失: 0.5168 | 累计平均损失: 0.4263
Epoch 18/20 完成 | 训练准确率: 84.83% | 测试准确率: 51.31%
Epoch: 19/20 | Batch: 100/782 | 单Batch损失: 0.2534 | 累计平均损失: 0.3471
Epoch: 19/20 | Batch: 200/782 | 单Batch损失: 0.3286 | 累计平均损失: 0.3488
Epoch: 19/20 | Batch: 300/782 | 单Batch损失: 0.2713 | 累计平均损失: 0.3563
Epoch: 19/20 | Batch: 400/782 | 单Batch损失: 0.4733 | 累计平均损失: 0.3728
Epoch: 19/20 | Batch: 500/782 | 单Batch损失: 0.3166 | 累计平均损失: 0.3756
Epoch: 19/20 | Batch: 600/782 | 单Batch损失: 0.4382 | 累计平均损失: 0.3798
Epoch: 19/20 | Batch: 700/782 | 单Batch损失: 0.3680 | 累计平均损失: 0.3888
Epoch 19/20 完成 | 训练准确率: 86.05% | 测试准确率: 52.17%
Epoch: 20/20 | Batch: 100/782 | 单Batch损失: 0.2334 | 累计平均损失: 0.3316
Epoch: 20/20 | Batch: 200/782 | 单Batch损失: 0.3335 | 累计平均损失: 0.3274
Epoch: 20/20 | Batch: 300/782 | 单Batch损失: 0.4049 | 累计平均损失: 0.3455
Epoch: 20/20 | Batch: 400/782 | 单Batch损失: 0.5196 | 累计平均损失: 0.3514
Epoch: 20/20 | Batch: 500/782 | 单Batch损失: 0.3912 | 累计平均损失: 0.3619
Epoch: 20/20 | Batch: 600/782 | 单Batch损失: 0.2988 | 累计平均损失: 0.3776
Epoch: 20/20 | Batch: 700/782 | 单Batch损失: 0.5925 | 累计平均损失: 0.3867
Epoch 20/20 完成 | 训练准确率: 86.26% | 测试准确率: 51.46%
训练完成！最终测试准确率: 51.46%

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter  
import matplotlib.pyplot as plt
import numpy as np
import os
import torchvision  # 记得导入 torchvision，之前代码里用到了其功能但没导入

# 设置中文字体支持
plt.rcParams["font.family"] = ["SimHei"]
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

# 检查GPU是否可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用设备: {device}")

# 1. 数据预处理 
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# 2. 加载CIFAR-10数据集
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=train_transform
)

test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    transform=test_transform
)

# 3. 创建数据加载器
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



# 4. 定义CNN模型的定义（替代原MLP）
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()  # 继承父类初始化
        
        # ---------------------- 第一个卷积块 ----------------------
        # 卷积层1：输入3通道（RGB），输出32个特征图，卷积核3x3，边缘填充1像素
        self.conv1 = nn.Conv2d(
            in_channels=3,       # 输入通道数（图像的RGB通道）
            out_channels=32,     # 输出通道数（生成32个新特征图）
            kernel_size=3,       # 卷积核尺寸（3x3像素）
            padding=1            # 边缘填充1像素，保持输出尺寸与输入相同
        )
        # 批量归一化层：对32个输出通道进行归一化，加速训练
        self.bn1 = nn.BatchNorm2d(num_features=32)
        # ReLU激活函数：引入非线性，公式：max(0, x)
        self.relu1 = nn.ReLU()
        # 最大池化层：窗口2x2，步长2，特征图尺寸减半（32x32→16x16）
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)  # stride默认等于kernel_size
        
        # ---------------------- 第二个卷积块 ----------------------
        # 卷积层2：输入32通道（来自conv1的输出），输出64通道
        self.conv2 = nn.Conv2d(
            in_channels=32,      # 输入通道数（前一层的输出通道数）
            out_channels=64,     # 输出通道数（特征图数量翻倍）
            kernel_size=3,       # 卷积核尺寸不变
            padding=1            # 保持尺寸：16x16→16x16（卷积后）→8x8（池化后）
        )
        self.bn2 = nn.BatchNorm2d(num_features=64)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2)  # 尺寸减半：16x16→8x8
        
        # ---------------------- 第三个卷积块 ----------------------
        # 卷积层3：输入64通道，输出128通道
        self.conv3 = nn.Conv2d(
            in_channels=64,      # 输入通道数（前一层的输出通道数）
            out_channels=128,    # 输出通道数（特征图数量再次翻倍）
            kernel_size=3,
            padding=1            # 保持尺寸：8x8→8x8（卷积后）→4x4（池化后）
        )
        self.bn3 = nn.BatchNorm2d(num_features=128)
        self.relu3 = nn.ReLU()  # 复用激活函数对象（节省内存）
        self.pool3 = nn.MaxPool2d(kernel_size=2)  # 尺寸减半：8x8→4x4
        
        # ---------------------- 全连接层（分类器） ----------------------
        # 计算展平后的特征维度：128通道 × 4x4尺寸 = 128×16=2048维
        self.fc1 = nn.Linear(
            in_features=128 * 4 * 4,  # 输入维度（卷积层输出的特征数）
            out_features=512          # 输出维度（隐藏层神经元数）
        )
        # Dropout层：训练时随机丢弃50%神经元，防止过拟合
        self.dropout = nn.Dropout(p=0.5)
        # 输出层：将512维特征映射到10个类别（CIFAR-10的类别数）
        self.fc2 = nn.Linear(in_features=512, out_features=10)

    def forward(self, x):
        # 输入尺寸：[batch_size, 3, 32, 32]（batch_size=批量大小，3=通道数，32x32=图像尺寸）
        
        # ---------- 卷积块1处理 ----------
        x = self.conv1(x)       # 卷积后尺寸：[batch_size, 32, 32, 32]（padding=1保持尺寸）
        x = self.bn1(x)         # 批量归一化，不改变尺寸
        x = self.relu1(x)       # 激活函数，不改变尺寸
        x = self.pool1(x)       # 池化后尺寸：[batch_size, 32, 16, 16]（32→16是因为池化窗口2x2）
        
        # ---------- 卷积块2处理 ----------
        x = self.conv2(x)       # 卷积后尺寸：[batch_size, 64, 16, 16]（padding=1保持尺寸）
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.pool2(x)       # 池化后尺寸：[batch_size, 64, 8, 8]
        
        # ---------- 卷积块3处理 ----------
        x = self.conv3(x)       # 卷积后尺寸：[batch_size, 128, 8, 8]（padding=1保持尺寸）
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.pool3(x)       # 池化后尺寸：[batch_size, 128, 4, 4]
        
        # ---------- 展平与全连接层 ----------
        # 将多维特征图展平为一维向量：[batch_size, 128*4*4] = [batch_size, 2048]
        x = x.view(-1, 128 * 4 * 4)  # -1自动计算批量维度，保持批量大小不变
        
        x = self.fc1(x)           # 全连接层：2048→512，尺寸变为[batch_size, 512]
        x = self.relu3(x)         # 激活函数（复用relu3，与卷积块3共用）
        x = self.dropout(x)       # Dropout随机丢弃神经元，不改变尺寸
        x = self.fc2(x)           # 全连接层：512→10，尺寸变为[batch_size, 10]（未激活，直接输出logits）
        
        return x  # 输出未经过Softmax的logits，适用于交叉熵损失函数



# 初始化模型
model = CNN()
model = model.to(device)  # 将模型移至GPU（如果可用）


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,        # 指定要控制的优化器（这里是Adam）
    mode='min',       # 监测的指标是"最小化"（如损失函数）
    patience=3,       # 如果连续3个epoch指标没有改善，才降低LR
    factor=0.5,       # 降低LR的比例（新LR = 旧LR × 0.5）
    verbose=True      # 打印学习率调整信息
)

# ======================== TensorBoard 核心配置 ========================
# 创建 TensorBoard 日志目录（自动避免重复）
log_dir = "runs/cifar10_cnn_exp"
if os.path.exists(log_dir):
    version = 1
    while os.path.exists(f"{log_dir}_v{version}"):
        version += 1
    log_dir = f"{log_dir}_v{version}"
writer = SummaryWriter(log_dir)  # 初始化 SummaryWriter

# 5. 训练模型（整合 TensorBoard 记录）
def train(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs, writer):
    model.train()
    all_iter_losses = []  
    iter_indices = []     
    global_step = 0       # 全局步骤，用于 TensorBoard 标量记录

    # （可选）记录模型结构：用一个真实样本走一遍前向传播，让 TensorBoard 解析计算图
    dataiter = iter(train_loader)
    images, labels = next(dataiter)
    images = images.to(device)
    writer.add_graph(model, images)  # 写入模型结构到 TensorBoard

    # （可选）记录原始训练图像：可视化数据增强前/后效果
    img_grid = torchvision.utils.make_grid(images[:8].cpu())  # 取前8张
    writer.add_image('原始训练图像（增强前）', img_grid, global_step=0)

    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # 记录迭代级损失
            iter_loss = loss.item()
            all_iter_losses.append(iter_loss)
            iter_indices.append(global_step + 1)  # 用 global_step 对齐

            # 统计准确率
            running_loss += iter_loss
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

            # ======================== TensorBoard 标量记录 ========================
            # 记录每个 batch 的损失、准确率
            batch_acc = 100. * correct / total
            writer.add_scalar('Train/Batch Loss', iter_loss, global_step)
            writer.add_scalar('Train/Batch Accuracy', batch_acc, global_step)

            # 记录学习率（可选）
            writer.add_scalar('Train/Learning Rate', optimizer.param_groups[0]['lr'], global_step)

            # 每 200 个 batch 记录一次参数直方图（可选，耗时稍高）
            if (batch_idx + 1) % 200 == 0:
                for name, param in model.named_parameters():
                    writer.add_histogram(f'Weights/{name}', param, global_step)
                    if param.grad is not None:
                        writer.add_histogram(f'Gradients/{name}', param.grad, global_step)

            # 每 100 个 batch 打印控制台日志（同原代码）
            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch: {epoch+1}/{epochs} | Batch: {batch_idx+1}/{len(train_loader)} '
                      f'| 单Batch损失: {iter_loss:.4f} | 累计平均损失: {running_loss/(batch_idx+1):.4f}')

            global_step += 1  # 全局步骤递增

        # 计算 epoch 级训练指标
        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_acc = 100. * correct / total

        # ======================== TensorBoard  epoch 标量记录 ========================
        writer.add_scalar('Train/Epoch Loss', epoch_train_loss, epoch)
        writer.add_scalar('Train/Epoch Accuracy', epoch_train_acc, epoch)

        # 测试阶段
        model.eval()
        test_loss = 0
        correct_test = 0
        total_test = 0
        wrong_images = []  # 存储错误预测样本（用于可视化）
        wrong_labels = []
        wrong_preds = []

        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += criterion(output, target).item()
                _, predicted = output.max(1)
                total_test += target.size(0)
                correct_test += predicted.eq(target).sum().item()

                # 收集错误预测样本（用于可视化）
                wrong_mask = (predicted != target)
                if wrong_mask.sum() > 0:
                    wrong_batch_images = data[wrong_mask][:8].cpu()  # 最多存8张
                    wrong_batch_labels = target[wrong_mask][:8].cpu()
                    wrong_batch_preds = predicted[wrong_mask][:8].cpu()
                    wrong_images.extend(wrong_batch_images)
                    wrong_labels.extend(wrong_batch_labels)
                    wrong_preds.extend(wrong_batch_preds)

        # 计算 epoch 级测试指标
        epoch_test_loss = test_loss / len(test_loader)
        epoch_test_acc = 100. * correct_test / total_test

        # ======================== TensorBoard 测试集记录 ========================
        writer.add_scalar('Test/Epoch Loss', epoch_test_loss, epoch)
        writer.add_scalar('Test/Epoch Accuracy', epoch_test_acc, epoch)

        # （可选）可视化错误预测样本
        if wrong_images:
            wrong_img_grid = torchvision.utils.make_grid(wrong_images)
            writer.add_image('错误预测样本', wrong_img_grid, epoch)
            # 写入错误标签文本（可选）
            wrong_text = [f"真实: {classes[wl]}, 预测: {classes[wp]}" 
                         for wl, wp in zip(wrong_labels, wrong_preds)]
            writer.add_text('错误预测标签', '\n'.join(wrong_text), epoch)

        # 更新学习率调度器
        scheduler.step(epoch_test_loss)

        print(f'Epoch {epoch+1}/{epochs} 完成 | 训练准确率: {epoch_train_acc:.2f}% | 测试准确率: {epoch_test_acc:.2f}%')

    # 关闭 TensorBoard 写入器
    writer.close()

    # 绘制迭代级损失曲线（同原代码）
    plot_iter_losses(all_iter_losses, iter_indices)
    return epoch_test_acc

# 6. 绘制迭代级损失曲线（同原代码，略）
def plot_iter_losses(losses, indices):
    plt.figure(figsize=(10, 4))
    plt.plot(indices, losses, 'b-', alpha=0.7, label='Iteration Loss')
    plt.xlabel('Iteration（Batch序号）')
    plt.ylabel('损失值')
    plt.title('每个 Iteration 的训练损失')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# （可选）CIFAR-10 类别名
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# 7. 执行训练（传入 TensorBoard writer）
epochs = 20
print("开始使用CNN训练模型...")
print(f"TensorBoard 日志目录: {log_dir}")
print("训练后执行: tensorboard --logdir=runs 查看可视化")

final_accuracy = train(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs, writer)
print(f"训练完成！最终测试准确率: {final_accuracy:.2f}%")

使用设备: cuda
Files already downloaded and verified
开始使用CNN训练模型...
TensorBoard 日志目录: runs/cifar10_cnn_exp
训练后执行: tensorboard --logdir=runs 查看可视化
Epoch: 1/20 | Batch: 100/782 | 单Batch损失: 1.8809 | 累计平均损失: 2.0134
Epoch: 1/20 | Batch: 200/782 | 单Batch损失: 1.7645 | 累计平均损失: 1.8838
Epoch: 1/20 | Batch: 300/782 | 单Batch损失: 1.6334 | 累计平均损失: 1.8246
Epoch: 1/20 | Batch: 400/782 | 单Batch损失: 1.6380 | 累计平均损失: 1.7784
Epoch: 1/20 | Batch: 500/782 | 单Batch损失: 1.5500 | 累计平均损失: 1.7435
Epoch: 1/20 | Batch: 600/782 | 单Batch损失: 1.5527 | 累计平均损失: 1.7107
Epoch: 1/20 | Batch: 700/782 | 单Batch损失: 1.4984 | 累计平均损失: 1.6852
Epoch 1/20 完成 | 训练准确率: 38.11% | 测试准确率: 52.47%
Epoch: 2/20 | Batch: 100/782 | 单Batch损失: 1.3814 | 累计平均损失: 1.4373
Epoch: 2/20 | Batch: 200/782 | 单Batch损失: 1.2911 | 累计平均损失: 1.3985
Epoch: 2/20 | Batch: 300/782 | 单Batch损失: 1.1904 | 累计平均损失: 1.3747
Epoch: 2/20 | Batch: 400/782 | 单Batch损失: 1.4026 | 累计平均损失: 1.3556
Epoch: 2/20 | Batch: 500/782 | 单Batch损失: 1.0859 | 累计平均损失: 1.3323
Epoch: 2/20 | Batch: 600/782 | 单Batch损失: 1.0579 | 累计平均损失: 1.3118
Epoch: 2/20 | Batch: 700/782 | 单Batch损失: 1.0614 | 累计平均损失: 1.2968
Epoch 2/20 完成 | 训练准确率: 53.28% | 测试准确率: 64.18%
Epoch: 3/20 | Batch: 100/782 | 单Batch损失: 1.2669 | 累计平均损失: 1.1595
Epoch: 3/20 | Batch: 200/782 | 单Batch损失: 1.1426 | 累计平均损失: 1.1423
Epoch: 3/20 | Batch: 300/782 | 单Batch损失: 0.9215 | 累计平均损失: 1.1318
Epoch: 3/20 | Batch: 400/782 | 单Batch损失: 0.9795 | 累计平均损失: 1.1233
Epoch: 3/20 | Batch: 500/782 | 单Batch损失: 1.2100 | 累计平均损失: 1.1204
Epoch: 3/20 | Batch: 600/782 | 单Batch损失: 1.1693 | 累计平均损失: 1.1098
Epoch: 3/20 | Batch: 700/782 | 单Batch损失: 1.0973 | 累计平均损失: 1.1007
Epoch 3/20 完成 | 训练准确率: 61.37% | 测试准确率: 67.55%
Epoch: 4/20 | Batch: 100/782 | 单Batch损失: 0.8795 | 累计平均损失: 1.0080
Epoch: 4/20 | Batch: 200/782 | 单Batch损失: 1.0070 | 累计平均损失: 1.0122
Epoch: 4/20 | Batch: 300/782 | 单Batch损失: 1.1206 | 累计平均损失: 1.0071
Epoch: 4/20 | Batch: 400/782 | 单Batch损失: 1.0918 | 累计平均损失: 1.0017
Epoch: 4/20 | Batch: 500/782 | 单Batch损失: 0.8132 | 累计平均损失: 0.9982
Epoch: 4/20 | Batch: 600/782 | 单Batch损失: 1.1464 | 累计平均损失: 0.9895
Epoch: 4/20 | Batch: 700/782 | 单Batch损失: 0.9950 | 累计平均损失: 0.9883
Epoch 4/20 完成 | 训练准确率: 65.12% | 测试准确率: 70.56%
Epoch: 5/20 | Batch: 100/782 | 单Batch损失: 0.9320 | 累计平均损失: 0.9707
Epoch: 5/20 | Batch: 200/782 | 单Batch损失: 0.9041 | 累计平均损失: 0.9490
Epoch: 5/20 | Batch: 300/782 | 单Batch损失: 0.7707 | 累计平均损失: 0.9494
Epoch: 5/20 | Batch: 400/782 | 单Batch损失: 0.8947 | 累计平均损失: 0.9423
Epoch: 5/20 | Batch: 500/782 | 单Batch损失: 0.8728 | 累计平均损失: 0.9352
Epoch: 5/20 | Batch: 600/782 | 单Batch损失: 0.9779 | 累计平均损失: 0.9290
Epoch: 5/20 | Batch: 700/782 | 单Batch损失: 0.9652 | 累计平均损失: 0.9266
Epoch 5/20 完成 | 训练准确率: 67.76% | 测试准确率: 74.09%
Epoch: 6/20 | Batch: 100/782 | 单Batch损失: 0.8804 | 累计平均损失: 0.8748
Epoch: 6/20 | Batch: 200/782 | 单Batch损失: 0.9413 | 累计平均损失: 0.8779
Epoch: 6/20 | Batch: 300/782 | 单Batch损失: 0.9451 | 累计平均损失: 0.8813
Epoch: 6/20 | Batch: 400/782 | 单Batch损失: 0.9844 | 累计平均损失: 0.8811
Epoch: 6/20 | Batch: 500/782 | 单Batch损失: 0.9123 | 累计平均损失: 0.8804
Epoch: 6/20 | Batch: 600/782 | 单Batch损失: 0.7724 | 累计平均损失: 0.8747
Epoch: 6/20 | Batch: 700/782 | 单Batch损失: 0.9191 | 累计平均损失: 0.8738
Epoch 6/20 完成 | 训练准确率: 69.24% | 测试准确率: 74.44%
Epoch: 7/20 | Batch: 100/782 | 单Batch损失: 0.4618 | 累计平均损失: 0.8522
Epoch: 7/20 | Batch: 200/782 | 单Batch损失: 1.0956 | 累计平均损失: 0.8398
Epoch: 7/20 | Batch: 300/782 | 单Batch损失: 0.7080 | 累计平均损失: 0.8442
Epoch: 7/20 | Batch: 400/782 | 单Batch损失: 0.8755 | 累计平均损失: 0.8423
Epoch: 7/20 | Batch: 500/782 | 单Batch损失: 1.0161 | 累计平均损失: 0.8451
Epoch: 7/20 | Batch: 600/782 | 单Batch损失: 0.9611 | 累计平均损失: 0.8436
Epoch: 7/20 | Batch: 700/782 | 单Batch损失: 0.9344 | 累计平均损失: 0.8433
Epoch 7/20 完成 | 训练准确率: 70.60% | 测试准确率: 75.97%
Epoch: 8/20 | Batch: 100/782 | 单Batch损失: 0.5846 | 累计平均损失: 0.7982
Epoch: 8/20 | Batch: 200/782 | 单Batch损失: 1.1336 | 累计平均损失: 0.8046
Epoch: 8/20 | Batch: 300/782 | 单Batch损失: 0.7393 | 累计平均损失: 0.8122
Epoch: 8/20 | Batch: 400/782 | 单Batch损失: 0.8892 | 累计平均损失: 0.8108
Epoch: 8/20 | Batch: 500/782 | 单Batch损失: 0.9932 | 累计平均损失: 0.8128
Epoch: 8/20 | Batch: 600/782 | 单Batch损失: 0.8610 | 累计平均损失: 0.8154
Epoch: 8/20 | Batch: 700/782 | 单Batch损失: 1.0081 | 累计平均损失: 0.8130
Epoch 8/20 完成 | 训练准确率: 71.44% | 测试准确率: 76.04%
Epoch: 9/20 | Batch: 100/782 | 单Batch损失: 0.8448 | 累计平均损失: 0.8206
Epoch: 9/20 | Batch: 200/782 | 单Batch损失: 0.6494 | 累计平均损失: 0.8086
Epoch: 9/20 | Batch: 300/782 | 单Batch损失: 0.8203 | 累计平均损失: 0.8021
Epoch: 9/20 | Batch: 400/782 | 单Batch损失: 0.6053 | 累计平均损失: 0.7929
Epoch: 9/20 | Batch: 500/782 | 单Batch损失: 0.8298 | 累计平均损失: 0.7890
Epoch: 9/20 | Batch: 600/782 | 单Batch损失: 0.9492 | 累计平均损失: 0.7873
Epoch: 9/20 | Batch: 700/782 | 单Batch损失: 0.7991 | 累计平均损失: 0.7889
Epoch 9/20 完成 | 训练准确率: 72.75% | 测试准确率: 77.43%
Epoch: 10/20 | Batch: 100/782 | 单Batch损失: 0.7773 | 累计平均损失: 0.7684
Epoch: 10/20 | Batch: 200/782 | 单Batch损失: 0.7030 | 累计平均损失: 0.7681
Epoch: 10/20 | Batch: 300/782 | 单Batch损失: 0.7726 | 累计平均损失: 0.7708
Epoch: 10/20 | Batch: 400/782 | 单Batch损失: 0.7785 | 累计平均损失: 0.7681
Epoch: 10/20 | Batch: 500/782 | 单Batch损失: 0.8096 | 累计平均损失: 0.7653
Epoch: 10/20 | Batch: 600/782 | 单Batch损失: 0.6069 | 累计平均损失: 0.7635
Epoch: 10/20 | Batch: 700/782 | 单Batch损失: 0.5608 | 累计平均损失: 0.7630
Epoch 10/20 完成 | 训练准确率: 73.20% | 测试准确率: 76.64%
Epoch: 11/20 | Batch: 100/782 | 单Batch损失: 0.7491 | 累计平均损失: 0.7709
Epoch: 11/20 | Batch: 200/782 | 单Batch损失: 0.8199 | 累计平均损失: 0.7523
Epoch: 11/20 | Batch: 300/782 | 单Batch损失: 1.0428 | 累计平均损失: 0.7427
Epoch: 11/20 | Batch: 400/782 | 单Batch损失: 0.7862 | 累计平均损失: 0.7416
Epoch: 11/20 | Batch: 500/782 | 单Batch损失: 0.7416 | 累计平均损失: 0.7450
Epoch: 11/20 | Batch: 600/782 | 单Batch损失: 0.8239 | 累计平均损失: 0.7390
Epoch: 11/20 | Batch: 700/782 | 单Batch损失: 0.5744 | 累计平均损失: 0.7427
Epoch 11/20 完成 | 训练准确率: 74.04% | 测试准确率: 77.92%
Epoch: 12/20 | Batch: 100/782 | 单Batch损失: 0.7772 | 累计平均损失: 0.7281
Epoch: 12/20 | Batch: 200/782 | 单Batch损失: 0.6939 | 累计平均损失: 0.7296
Epoch: 12/20 | Batch: 300/782 | 单Batch损失: 0.6478 | 累计平均损失: 0.7348
Epoch: 12/20 | Batch: 400/782 | 单Batch损失: 0.6809 | 累计平均损失: 0.7306
Epoch: 12/20 | Batch: 500/782 | 单Batch损失: 0.7887 | 累计平均损失: 0.7308
Epoch: 12/20 | Batch: 600/782 | 单Batch损失: 0.9312 | 累计平均损失: 0.7293
Epoch: 12/20 | Batch: 700/782 | 单Batch损失: 0.8912 | 累计平均损失: 0.7249
Epoch 12/20 完成 | 训练准确率: 74.57% | 测试准确率: 78.34%
Epoch: 13/20 | Batch: 100/782 | 单Batch损失: 0.7660 | 累计平均损失: 0.7202
Epoch: 13/20 | Batch: 200/782 | 单Batch损失: 0.8096 | 累计平均损失: 0.7066
Epoch: 13/20 | Batch: 300/782 | 单Batch损失: 0.6760 | 累计平均损失: 0.7041
Epoch: 13/20 | Batch: 400/782 | 单Batch损失: 0.9175 | 累计平均损失: 0.7036
Epoch: 13/20 | Batch: 500/782 | 单Batch损失: 0.7499 | 累计平均损失: 0.7067
Epoch: 13/20 | Batch: 600/782 | 单Batch损失: 0.5950 | 累计平均损失: 0.7061
Epoch: 13/20 | Batch: 700/782 | 单Batch损失: 0.5243 | 累计平均损失: 0.7101
Epoch 13/20 完成 | 训练准确率: 75.14% | 测试准确率: 78.82%
Epoch: 14/20 | Batch: 100/782 | 单Batch损失: 0.4825 | 累计平均损失: 0.6837
Epoch: 14/20 | Batch: 200/782 | 单Batch损失: 0.6175 | 累计平均损失: 0.6888
Epoch: 14/20 | Batch: 300/782 | 单Batch损失: 0.7952 | 累计平均损失: 0.6866
Epoch: 14/20 | Batch: 400/782 | 单Batch损失: 0.5896 | 累计平均损失: 0.6942
Epoch: 14/20 | Batch: 500/782 | 单Batch损失: 0.6090 | 累计平均损失: 0.6938
Epoch: 14/20 | Batch: 600/782 | 单Batch损失: 0.7104 | 累计平均损失: 0.6953
Epoch: 14/20 | Batch: 700/782 | 单Batch损失: 0.4085 | 累计平均损失: 0.6972
Epoch 14/20 完成 | 训练准确率: 75.52% | 测试准确率: 78.96%
Epoch: 15/20 | Batch: 100/782 | 单Batch损失: 0.6176 | 累计平均损失: 0.6590
Epoch: 15/20 | Batch: 200/782 | 单Batch损失: 0.4711 | 累计平均损失: 0.6702
Epoch: 15/20 | Batch: 300/782 | 单Batch损失: 0.6192 | 累计平均损失: 0.6718
Epoch: 15/20 | Batch: 400/782 | 单Batch损失: 0.9684 | 累计平均损失: 0.6750
Epoch: 15/20 | Batch: 500/782 | 单Batch损失: 0.5928 | 累计平均损失: 0.6773
Epoch: 15/20 | Batch: 600/782 | 单Batch损失: 0.7290 | 累计平均损失: 0.6795
Epoch: 15/20 | Batch: 700/782 | 单Batch损失: 0.6996 | 累计平均损失: 0.6792
Epoch 15/20 完成 | 训练准确率: 76.17% | 测试准确率: 79.87%
Epoch: 16/20 | Batch: 100/782 | 单Batch损失: 0.4865 | 累计平均损失: 0.6782
Epoch: 16/20 | Batch: 200/782 | 单Batch损失: 0.5581 | 累计平均损失: 0.6671
Epoch: 16/20 | Batch: 300/782 | 单Batch损失: 0.6508 | 累计平均损失: 0.6648
Epoch: 16/20 | Batch: 400/782 | 单Batch损失: 0.8125 | 累计平均损失: 0.6715
Epoch: 16/20 | Batch: 500/782 | 单Batch损失: 0.5303 | 累计平均损失: 0.6735
Epoch: 16/20 | Batch: 600/782 | 单Batch损失: 0.6881 | 累计平均损失: 0.6737
Epoch: 16/20 | Batch: 700/782 | 单Batch损失: 0.9869 | 累计平均损失: 0.6726
Epoch 16/20 完成 | 训练准确率: 76.63% | 测试准确率: 79.73%
Epoch: 17/20 | Batch: 100/782 | 单Batch损失: 0.5943 | 累计平均损失: 0.6603
Epoch: 17/20 | Batch: 200/782 | 单Batch损失: 0.8486 | 累计平均损失: 0.6590
Epoch: 17/20 | Batch: 300/782 | 单Batch损失: 0.5727 | 累计平均损失: 0.6586
Epoch: 17/20 | Batch: 400/782 | 单Batch损失: 0.6489 | 累计平均损失: 0.6592
Epoch: 17/20 | Batch: 500/782 | 单Batch损失: 0.7211 | 累计平均损失: 0.6612
Epoch: 17/20 | Batch: 600/782 | 单Batch损失: 0.5552 | 累计平均损失: 0.6615
Epoch: 17/20 | Batch: 700/782 | 单Batch损失: 0.5500 | 累计平均损失: 0.6617
Epoch 17/20 完成 | 训练准确率: 76.85% | 测试准确率: 80.07%
Epoch: 18/20 | Batch: 100/782 | 单Batch损失: 0.6643 | 累计平均损失: 0.6195
Epoch: 18/20 | Batch: 200/782 | 单Batch损失: 0.5175 | 累计平均损失: 0.6383
Epoch: 18/20 | Batch: 300/782 | 单Batch损失: 0.8941 | 累计平均损失: 0.6423
Epoch: 18/20 | Batch: 400/782 | 单Batch损失: 0.5957 | 累计平均损失: 0.6494
Epoch: 18/20 | Batch: 500/782 | 单Batch损失: 0.6997 | 累计平均损失: 0.6515
Epoch: 18/20 | Batch: 600/782 | 单Batch损失: 0.8387 | 累计平均损失: 0.6526
Epoch: 18/20 | Batch: 700/782 | 单Batch损失: 0.6168 | 累计平均损失: 0.6502
Epoch 18/20 完成 | 训练准确率: 77.20% | 测试准确率: 80.18%
Epoch: 19/20 | Batch: 100/782 | 单Batch损失: 0.5368 | 累计平均损失: 0.6308
Epoch: 19/20 | Batch: 200/782 | 单Batch损失: 0.4568 | 累计平均损失: 0.6476
Epoch: 19/20 | Batch: 300/782 | 单Batch损失: 0.4435 | 累计平均损失: 0.6396
Epoch: 19/20 | Batch: 400/782 | 单Batch损失: 0.5895 | 累计平均损失: 0.6407
Epoch: 19/20 | Batch: 500/782 | 单Batch损失: 0.9253 | 累计平均损失: 0.6424
Epoch: 19/20 | Batch: 600/782 | 单Batch损失: 0.5735 | 累计平均损失: 0.6417
Epoch: 19/20 | Batch: 700/782 | 单Batch损失: 0.6145 | 累计平均损失: 0.6418
Epoch    19: reducing learning rate of group 0 to 5.0000e-04.
Epoch 19/20 完成 | 训练准确率: 77.65% | 测试准确率: 79.72%
Epoch: 20/20 | Batch: 100/782 | 单Batch损失: 0.5315 | 累计平均损失: 0.5793
Epoch: 20/20 | Batch: 200/782 | 单Batch损失: 0.5769 | 累计平均损失: 0.5837
Epoch: 20/20 | Batch: 300/782 | 单Batch损失: 0.4930 | 累计平均损失: 0.5806
Epoch: 20/20 | Batch: 400/782 | 单Batch损失: 0.5806 | 累计平均损失: 0.5821
Epoch: 20/20 | Batch: 500/782 | 单Batch损失: 0.8352 | 累计平均损失: 0.5847
Epoch: 20/20 | Batch: 600/782 | 单Batch损失: 0.5066 | 累计平均损失: 0.5831
Epoch: 20/20 | Batch: 700/782 | 单Batch损失: 0.5890 | 累计平均损失: 0.5816
Epoch 20/20 完成 | 训练准确率: 79.72% | 测试准确率: 81.85%

在这里插入图片描述

训练完成！最终测试准确率: 81.85%

# 省略预处理、模型定义代码

# ======================== TensorBoard 核心配置 ========================
# 在使用tensorboard前需要先指定日志保存路径
log_dir = "runs/cifar10_cnn_exp" # 指定日志保存路径
if os.path.exists(log_dir): #检查刚才定义的路径是否存在
    version = 1 
    while os.path.exists(f"{log_dir}_v{version}"): # 如果路径存在且版本号一致
        version += 1 # 版本号加1
    log_dir = f"{log_dir}_v{version}" # 如果路径存在，则创建一个新版本
writer = SummaryWriter(log_dir) # 初始化SummaryWriter
print(f"TensorBoard 日志目录: {log_dir}") # 所以第一次是cifar10_cnn_exp、第二次是cifar10_cnn_exp_v1

# 5. 训练模型（整合 TensorBoard 记录）
def train(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs, writer):
    model.train()
    global_step = 0  # 全局步骤，用于 TensorBoard 标量记录

    # 记录模型结构和训练图像
    dataiter = iter(train_loader)
    images, labels = next(dataiter)
    images = images.to(device)
    writer.add_graph(model, images)
    
    img_grid = torchvision.utils.make_grid(images[:8].cpu())
    writer.add_image('原始训练图像（增强前）', img_grid, global_step=0)

    for epoch in range(epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # 统计准确率
            running_loss += loss.item()
            _, predicted = output.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()

            # 记录每个 batch 的损失、准确率和学习率
            batch_acc = 100. * correct / total
            writer.add_scalar('Train/Batch Loss', loss.item(), global_step)
            writer.add_scalar('Train/Batch Accuracy', batch_acc, global_step)
            writer.add_scalar('Train/Learning Rate', optimizer.param_groups[0]['lr'], global_step)

            # 每 200 个 batch 记录一次参数直方图
            if (batch_idx + 1) % 200 == 0:
                for name, param in model.named_parameters():
                    writer.add_histogram(f'Weights/{name}', param, global_step)
                    if param.grad is not None:
                        writer.add_histogram(f'Gradients/{name}', param.grad, global_step)

            global_step += 1

        # 计算 epoch 级训练指标
        epoch_train_loss = running_loss / len(train_loader)
        epoch_train_acc = 100. * correct / total
        writer.add_scalar('Train/Epoch Loss', epoch_train_loss, epoch)
        writer.add_scalar('Train/Epoch Accuracy', epoch_train_acc, epoch)

        # 测试阶段
        model.eval()
        test_loss = 0
        correct_test = 0
        total_test = 0
        wrong_images = []
        wrong_labels = []
        wrong_preds = []

        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                test_loss += criterion(output, target).item()
                _, predicted = output.max(1)
                total_test += target.size(0)
                correct_test += predicted.eq(target).sum().item()

                # 收集错误预测样本
                wrong_mask = (predicted != target)
                if wrong_mask.sum() > 0:
                    wrong_batch_images = data[wrong_mask][:8].cpu()
                    wrong_batch_labels = target[wrong_mask][:8].cpu()
                    wrong_batch_preds = predicted[wrong_mask][:8].cpu()
                    wrong_images.extend(wrong_batch_images)
                    wrong_labels.extend(wrong_batch_labels)
                    wrong_preds.extend(wrong_batch_preds)

        # 计算 epoch 级测试指标
        epoch_test_loss = test_loss / len(test_loader)
        epoch_test_acc = 100. * correct_test / total_test
        writer.add_scalar('Test/Epoch Loss', epoch_test_loss, epoch)
        writer.add_scalar('Test/Epoch Accuracy', epoch_test_acc, epoch)

        # 可视化错误预测样本
        if wrong_images:
            wrong_img_grid = torchvision.utils.make_grid(wrong_images)
            writer.add_image('错误预测样本', wrong_img_grid, epoch)
            wrong_text = [f"真实: {classes[wl]}, 预测: {classes[wp]}" 
                         for wl, wp in zip(wrong_labels, wrong_preds)]
            writer.add_text('错误预测标签', '\n'.join(wrong_text), epoch)

        # 更新学习率调度器
        scheduler.step(epoch_test_loss)
        print(f'Epoch {epoch+1}/{epochs} 完成 | 测试准确率: {epoch_test_acc:.2f}%')

    writer.close()
    return epoch_test_acc

# （可选）CIFAR-10 类别名
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

# 执行训练
epochs = 20
print("开始使用CNN训练模型...")
print("训练后执行: tensorboard --logdir=runs 查看可视化")

final_accuracy = train(model, train_loader, test_loader, criterion, optimizer, scheduler, device, epochs, writer)
print(f"训练完成！最终测试准确率: {final_accuracy:.2f}%")

TensorBoard 日志目录: runs/cifar10_cnn_exp_v2
开始使用CNN训练模型...
训练后执行: tensorboard --logdir=runs 查看可视化
Epoch    21: reducing learning rate of group 0 to 5.0000e-04.
Epoch 1/20 完成 | 测试准确率: 75.81%
Epoch 2/20 完成 | 测试准确率: 80.88%
Epoch 3/20 完成 | 测试准确率: 80.36%
Epoch 4/20 完成 | 测试准确率: 82.32%
Epoch 5/20 完成 | 测试准确率: 80.98%
Epoch 6/20 完成 | 测试准确率: 81.43%
Epoch 7/20 完成 | 测试准确率: 81.86%
Epoch    28: reducing learning rate of group 0 to 2.5000e-04.
Epoch 8/20 完成 | 测试准确率: 81.89%
Epoch 9/20 完成 | 测试准确率: 82.69%
Epoch 10/20 完成 | 测试准确率: 83.66%
Epoch 11/20 完成 | 测试准确率: 83.29%
Epoch 12/20 完成 | 测试准确率: 82.99%
Epoch 13/20 完成 | 测试准确率: 83.11%
Epoch    34: reducing learning rate of group 0 to 1.2500e-04.
Epoch 14/20 完成 | 测试准确率: 83.58%
Epoch 15/20 完成 | 测试准确率: 83.79%
Epoch 16/20 完成 | 测试准确率: 83.88%
Epoch 17/20 完成 | 测试准确率: 83.89%
Epoch 18/20 完成 | 测试准确率: 84.08%
Epoch 19/20 完成 | 测试准确率: 84.25%
Epoch 20/20 完成 | 测试准确率: 84.21%
训练完成！最终测试准确率: 84.21%

作业：对resnet18在cifar10上采用微调策略下，用tensorboard监控训练过程。

@浙大疏锦行

DAY 45 Tensorboard使用介绍

目录

DAY 45 Tensorboard使用介绍

1.tensorboard的发展历史和原理

2.tensorboard的常见操作

3.tensorboard在cifar上的实战：MLP和CNN模型

作业：对resnet18在cifar10上采用微调策略下，用tensorboard监控训练过程。

网站公告

今日签到

热门文章

最新发布