本节课你将学到
- 理解语义分割的概念和应用场景
- 掌握UNet网络架构原理
- 使用PyTorch实现图像分割模型
- 完成医学图像分割实战项目
开始之前
环境要求
- Python 3.8+
- PyTorch 2.0+
- OpenCV
- Matplotlib
- 推荐使用GPU加速(非必须)
前置知识
- 基本Python编程能力(第1-8讲)
- PyTorch基础(第22讲)
- 卷积神经网络(第24讲)
核心概念
什么是语义分割?
语义分割就像是给图像的每个像素"贴标签":
- 传统分类:整张图片是"狗"或"猫"
- 目标检测:用方框标出狗和猫的位置
- 语义分割:精确到每个像素是"狗毛"还是"猫毛"
典型应用场景
- 医学影像:肿瘤区域分割
- 自动驾驶:道路和行人识别
- 遥感图像:地表覆盖分析
- 工业质检:缺陷区域定位
UNet网络架构
UNet就像是一个"编码器-解码器"系统:
- 编码器:不断压缩图像提取特征(下采样)
- 解码器:逐步恢复空间信息(上采样)
- 跳跃连接:保留细节信息的关键设计
# 类比解释
假设你要画一幅精细的素描:
1. 先画大体轮廓(编码器提取主要特征)
2. 再添加细节(解码器恢复空间信息)
3. 时不时参考原图(跳跃连接保持精度)
代码实战
1. 数据准备
我们使用公开的医学影像数据集(皮肤病变分割)
import os
import cv2
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
class MedicalDataset(Dataset):
def __init__(self, image_dir, mask_dir, transform=None):
self.image_dir = image_dir
self.mask_dir = mask_dir
self.transform = transform
self.images = os.listdir(image_dir)
def __len__(self):
return len(self.images)
def __getitem__(self, idx):
img_path = os.path.join(self.image_dir, self.images[idx])
mask_path = os.path.join(self.mask_dir, self.images[idx].replace(".jpg", "_mask.gif"))
# 读取图像和掩码
image = cv2.imread(img_path)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
# 归一化处理
image = image / 255.0
mask = mask / 255.0
mask = np.expand_dims(mask, axis=-1)
if self.transform:
augmented = self.transform(image=image, mask=mask)
image = augmented["image"]
mask = augmented["mask"]
# 调整维度顺序 (H,W,C) -> (C,H,W)
image = image.transpose((2, 0, 1))
mask = mask.transpose((2, 0, 1))
return torch.tensor(image, dtype=torch.float), torch.tensor(mask, dtype=torch.float)
# ⚠️ 常见错误1:文件路径问题
# 确保:
# 1. 图像和掩码文件名对应正确
# 2. 文件扩展名匹配实际格式
# 3. 图像和掩码尺寸相同
2. UNet模型实现
import torch.nn as nn
import torch.nn.functional as F
class DoubleConv(nn.Module):
"""(卷积 => BN => ReLU) * 2"""
def __init__(self, in_channels, out_channels):
super().__init__()
self.double_conv = nn.Sequential(
nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True),
nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True)
)
def forward(self, x):
return self.double_conv(x)
class Down(nn.Module):
"""下采样:MaxPool + DoubleConv"""
def __init__(self, in_channels, out_channels):
super().__init__()
self.maxpool_conv = nn.Sequential(
nn.MaxPool2d(2),
DoubleConv(in_channels, out_channels)
)
def forward(self, x):
return self.maxpool_conv(x)
class Up(nn.Module):
"""上采样:转置卷积 + 跳跃连接"""
def __init__(self, in_channels, out_channels):
super().__init__()
self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
self.conv = DoubleConv(in_channels, out_channels)
def forward(self, x1, x2):
x1 = self.up(x1)
# 处理尺寸不匹配问题
diffY = x2.size()[2] - x1.size()[2]
diffX = x2.size()[3] - x1.size()[3]
x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
diffY // 2, diffY - diffY // 2])
x = torch.cat([x2, x1], dim=1)
return self.conv(x)
class UNet(nn.Module):
def __init__(self, n_channels=3, n_classes=1):
super(UNet, self).__init__()
self.n_channels = n_channels
self.n_classes = n_classes
# 编码器
self.inc = DoubleConv(n_channels, 64)
self.down1 = Down(64, 128)
self.down2 = Down(128, 256)
self.down3 = Down(256, 512)
self.down4 = Down(512, 1024)
# 解码器
self.up1 = Up(1024, 512)
self.up2 = Up(512, 256)
self.up3 = Up(256, 128)
self.up4 = Up(128, 64)
# 输出层
self.outc = nn.Conv2d(64, n_classes, kernel_size=1)
def forward(self, x):
# 编码器
x1 = self.inc(x)
x2 = self.down1(x1)
x3 = self.down2(x2)
x4 = self.down3(x3)
x5 = self.down4(x4)
# 解码器
x = self.up1(x5, x4)
x = self.up2(x, x3)
x = self.up3(x, x2)
x = self.up4(x, x1)
# 输出
logits = self.outc(x)
return torch.sigmoid(logits)
# ⚠️ 常见错误2:尺寸不匹配
# 解决方案:
# 1. 使用F.pad进行填充
# 2. 确保输入图像尺寸是16的倍数(因为经历4次2倍下采样)
3. 训练流程
import torch.optim as optim
from torchvision.transforms import transforms
from sklearn.model_selection import train_test_split
# 数据增强
transform = transforms.Compose([
transforms.ToPILImage(),
transforms.RandomHorizontalFlip(),
transforms.RandomRotation(15),
transforms.ToTensor()
])
# 准备数据集
image_dir = "data/images"
mask_dir = "data/masks"
dataset = MedicalDataset(image_dir, mask_dir, transform=transform)
# 分割训练集和验证集
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
# 创建数据加载器
batch_size = 4
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
# 初始化模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UNet().to(device)
# 损失函数和优化器
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# 训练循环
num_epochs = 20
for epoch in range(num_epochs):
model.train()
running_loss = 0.0
for images, masks in train_loader:
images = images.to(device)
masks = masks.to(device)
# 前向传播
outputs = model(images)
loss = criterion(outputs, masks)
# 反向传播
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()
# 验证阶段
model.eval()
val_loss = 0.0
with torch.no_grad():
for images, masks in val_loader:
images = images.to(device)
masks = masks.to(device)
outputs = model(images)
val_loss += criterion(outputs, masks).item()
print(f"Epoch {epoch+1}/{num_epochs}, "
f"Train Loss: {running_loss/len(train_loader):.4f}, "
f"Val Loss: {val_loss/len(val_loader):.4f}")
# 保存模型
torch.save(model.state_dict(), "unet_medical.pth")
4. 评估与可视化
import matplotlib.pyplot as plt
def show_results(image, true_mask, pred_mask):
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
plt.imshow(image.permute(1, 2, 0))
plt.title("Input Image")
plt.subplot(1, 3, 2)
plt.imshow(true_mask.squeeze(), cmap="gray")
plt.title("Ground Truth")
plt.subplot(1, 3, 3)
plt.imshow(pred_mask.squeeze(), cmap="gray")
plt.title("Prediction")
plt.show()
# 加载测试样本
test_image, test_mask = val_dataset[0]
test_image = test_image.unsqueeze(0).to(device)
# 预测
model.eval()
with torch.no_grad():
pred_mask = model(test_image)
# 可视化
show_results(test_image.cpu().squeeze(),
test_mask.cpu(),
pred_mask.cpu().squeeze())
完整项目
项目结构:
lesson_32_semantic_segmentation/
├── data/
│ ├── images/ # 原始医学图像
│ └── masks/ # 对应的分割掩码
├── models/
│ └── unet.py # UNet模型实现
├── utils/
│ └── dataset.py # 数据加载工具
├── train.py # 训练脚本
├── predict.py # 预测脚本
├── requirements.txt # 依赖列表
└── README.md # 项目说明
requirements.txt
torch==2.0.1
torchvision==0.15.2
opencv-python==4.7.0.72
matplotlib==3.7.1
numpy==1.24.3
scikit-image==0.20.0
train.py 主程序
import torch
from models.unet import UNet
from utils.dataset import MedicalDataset
from torch.utils.data import DataLoader
def main():
# 初始化
device = "cuda" if torch.cuda.is_available() else "cpu"
model = UNet().to(device)
# 数据加载
dataset = MedicalDataset("data/images", "data/masks")
loader = DataLoader(dataset, batch_size=4, shuffle=True)
# 训练代码(同上文训练流程)
# ...
if __name__ == "__main__":
main()
运行效果
控制台输出
Epoch 1/20, Train Loss: 0.4521, Val Loss: 0.3812
Epoch 2/20, Train Loss: 0.3502, Val Loss: 0.3215
...
Epoch 20/20, Train Loss: 0.1125, Val Loss: 0.1258
模型训练完成!保存到 unet_medical.pth
可视化结果
- 左:原始医学图像
- 中:医生标注的真实病变区域
- 右:模型预测的分割结果
常见问题
Q1: 训练时出现CUDA内存不足错误
解决方案:
- 减小batch_size(如从4改为2)
- 降低图像分辨率
- 使用更小的模型
Q2: 预测结果全是黑色/白色
可能原因:
- 学习率不合适(尝试调整lr=0.0001)
- 数据标注有问题(检查mask是否有效)
- 模型没有收敛(增加训练轮次)
Q3: 如何应用到自己的数据集?
步骤:
- 准备图像和对应的mask(PNG格式)
- mask应为单通道,前景255,背景0
- 修改dataset.py中的文件读取逻辑
课后练习
数据增强实验
尝试添加更多数据增强方法(如随机裁剪、颜色抖动),观察对模型效果的影响模型改进
在UNet中添加注意力机制,比较改进前后的分割精度应用迁移
使用本模型对卫星图像进行地表分类(需重新标注数据)指标计算
实现Dice系数和IoU指标的计算,量化评估模型性能
扩展阅读
下节预告:第33讲将学习强化学习基础,实现游戏AI智能体!