PyTorch: A Detailed Knowledge Summary
1. PyTorch Basics
1.1 Tensors
Tensors are the fundamental data structure in PyTorch.
They are similar to multi-dimensional arrays and support GPU acceleration.
Key operations:
import torch

x = torch.tensor([1.0, 2.0, 3.0])  # tensor from a Python list (float, so mean() works)
y = torch.zeros(2, 3)              # 2x3 tensor of zeros
z = torch.randn(3, 4)              # 3x4 tensor of standard-normal samples
a = x + y                          # element-wise add; x broadcasts over y's rows
b = torch.matmul(y, z)             # matrix multiplication: (2,3) @ (3,4) -> (2,4)
c = x.mean()                       # reduction to a scalar tensor
1.2 Automatic Differentiation (Autograd)
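Autograd records operations on tensors created with requires_grad=True and computes gradients when backward() is called. A minimal sketch:
import torch

x = torch.tensor([2.0, 3.0], requires_grad=True)
y = (x ** 2).sum()   # y = x1^2 + x2^2
y.backward()         # computes dy/dx and stores it in x.grad
print(x.grad)        # tensor([4., 6.])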
2. Building Neural Networks
2.1 nn.Module
nn.Module is the base class for building neural networks in PyTorch.
A subclass defines the network's layers and the forward pass.
Example:
import torch.nn as nn

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x
2.2 Common Layers
Linear: fully connected layer
Conv2d: 2D convolution layer
MaxPool2d: max pooling layer
BatchNorm2d: batch normalization
Dropout: randomly zeroes activations to prevent overfitting
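A minimal CNN sketch chaining the layers above; the layer sizes are illustrative assumptions, not from the original:
import torch.nn as nn

cnn = nn.Sequential(
    nn.Conv2d(3, 16, kernel_size=3, padding=1),  # 2D convolution
    nn.BatchNorm2d(16),                          # batch normalization
    nn.ReLU(),
    nn.MaxPool2d(2),                             # halves spatial dimensions
    nn.Flatten(),
    nn.Dropout(0.5),                             # regularization
    nn.Linear(16 * 16 * 16, 10),                 # assumes 3x32x32 inputs
)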
3. Data Handling
3.1 Dataset and DataLoader
Dataset: defines a dataset
DataLoader: loads data in batches
Example:
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

dataset = CustomDataset(data, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
4. Model Training
4.1 Optimizers and Loss Functions
Common optimizers: SGD, Adam
Common loss functions: CrossEntropyLoss, MSELoss
Example:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    for data, labels in dataloader:
        optimizer.zero_grad()          # clear gradients from the previous step
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()                # backpropagate
        optimizer.step()               # update parameters
4.2 Saving and Loading Models
torch.save(model.state_dict(), 'model.pth')
model.load_state_dict(torch.load('model.pth'))
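Beyond bare weights, it is common to save a full checkpoint so training can resume; a sketch (the epoch and loss fields are illustrative):
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
}
torch.save(checkpoint, 'checkpoint.pth')

checkpoint = torch.load('checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])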
5. GPU Acceleration
5.1 Device Management
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
data = data.to(device)
6. Advanced Features
6.1 Distributed Training
DistributedDataParallel: multi-GPU training (the recommended approach)
DataParallel: simple single-process data parallelism
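A minimal DistributedDataParallel sketch, assuming the script is launched with torchrun (which sets the RANK/LOCAL_RANK/WORLD_SIZE environment variables):
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend='nccl')      # one process per GPU
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)
model = Net().cuda(local_rank)               # Net from section 2.1
model = DDP(model, device_ids=[local_rank])  # gradients sync automatically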
6.2 TorchScript
Converts a PyTorch model into an optimizable, serializable format
Supports deployment in production environments
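A short scripting sketch (tracing is shown later, in section 9.6):
import torch

scripted = torch.jit.script(model)    # compile the model to TorchScript
scripted.save('model_scripted.pt')    # self-contained, Python-free artifact
loaded = torch.jit.load('model_scripted.pt')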
6.3 Model Quantization
Reduces model size and speeds up inference by computing with lower-precision arithmetic.
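A minimal sketch of post-training dynamic quantization, which converts Linear layers to int8 for CPU inference:
import torch
import torch.nn as nn

quantized_model = torch.quantization.quantize_dynamic(
    model, {nn.Linear}, dtype=torch.qint8)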
7. Debugging and Optimization
7.1 Memory Optimization
Use del to release tensors that are no longer needed
Use torch.no_grad() to reduce memory usage during inference
Use gradient accumulation to handle large effective batch sizes (see 10.1)
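A sketch of the first two points (intermediate is a hypothetical tensor name):
with torch.no_grad():      # no graph is built, so activations are freed eagerly
    features = model(inputs)

del intermediate           # drop the Python reference
torch.cuda.empty_cache()   # return cached GPU memory to the driver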
7.2 Performance Profiling
torch.autograd.profiler (superseded by the newer torch.profiler)
nvprof for GPU-level profiling
Memory-leak detection
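A minimal profiling sketch using the autograd profiler:
import torch

with torch.autograd.profiler.profile(use_cuda=torch.cuda.is_available()) as prof:
    output = model(inputs)
print(prof.key_averages().table(sort_by='self_cpu_time_total'))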
8. Best Practices
8.1 Code Conventions
Use nn.Sequential to organize layers
Use nn.ModuleList and nn.ParameterList where appropriate
Handle the batch dimension correctly
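A sketch contrasting nn.Sequential and nn.ModuleList (dim and depth are illustrative):
import torch.nn as nn

# nn.Sequential: a fixed, linear pipeline of layers
block = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))

# nn.ModuleList: layers applied with custom logic in forward();
# unlike a plain Python list, its parameters are registered with the module
class Tower(nn.Module):
    def __init__(self, dim, depth):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(depth))

    def forward(self, x):  # x: (batch, dim) -- keep the batch dimension first
        for layer in self.layers:
            x = layer(x)
        return x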
8.2 Training Tips
Advanced techniques such as gradient accumulation, learning-rate scheduling, distillation, adversarial training, and mixed precision are covered in Section 10.
8.3 Deployment Considerations
Model export (ONNX)
Serving deployment
Mobile deployment
Edge-device deployment
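A minimal ONNX export sketch (the input shape is an assumption for an image model):
import torch

dummy_input = torch.randn(1, 3, 224, 224)
torch.onnx.export(model, dummy_input, 'model.onnx',
                  input_names=['input'], output_names=['output'])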
9. Solutions to Common Problems
9.1 Data Preprocessing
from torchvision import transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])
from torch.nn.utils.rnn import pad_sequence

def text_preprocess(text_list, vocab):
    # map each token to its vocabulary index
    indices = [[vocab[word] for word in text.split()] for text in text_list]
    # pad variable-length sequences into a (batch, max_len) tensor
    padded = pad_sequence([torch.tensor(x) for x in indices], batch_first=True)
    return padded
9.2 Model Evaluation
def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            total_loss += criterion(output, target).item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += target.size(0)
    avg_loss = total_loss / len(test_loader)
    accuracy = 100. * correct / total
    return avg_loss, accuracy
9.3 Early Stopping Implementation
class EarlyStopping:
    def __init__(self, patience=7, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None:
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.counter = 0
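A hypothetical usage sketch inside a training loop (train_one_epoch and validate are assumed helpers, the latter returning validation loss):
early_stopping = EarlyStopping(patience=5)
for epoch in range(num_epochs):
    train_one_epoch(model, train_loader)    # assumed helper
    val_loss = validate(model, val_loader)  # assumed helper
    early_stopping(val_loss)
    if early_stopping.early_stop:
        print('Early stopping triggered')
        break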
9.4 Training Monitoring
import matplotlib.pyplot as plt

class TrainingMonitor:
    def __init__(self):
        self.history = {
            'train_loss': [],
            'val_loss': [],
            'accuracy': []
        }

    def update(self, metrics):
        for k, v in metrics.items():
            self.history[k].append(v)

    def plot_metrics(self):
        epochs = range(1, len(self.history['train_loss']) + 1)
        plt.figure(figsize=(12, 4))
        plt.subplot(1, 2, 1)
        plt.plot(epochs, self.history['train_loss'], 'b-', label='Training Loss')
        plt.plot(epochs, self.history['val_loss'], 'r-', label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.subplot(1, 2, 2)
        plt.plot(epochs, self.history['accuracy'], 'g-', label='Accuracy')
        plt.title('Model Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.tight_layout()
        plt.show()
9.5 Common Error Handling
# 1. Handling CUDA out-of-memory errors
try:
    output = model(large_input)          # large batch
except RuntimeError as e:
    if "out of memory" in str(e):
        torch.cuda.empty_cache()         # release cached GPU memory
        # retry in smaller chunks and recombine the results
        outputs = [model(chunk) for chunk in large_input.split(2)]
        output = torch.cat(outputs)
    else:
        raise

# 2. Clipping gradients to handle explosion
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

# 3. Handling model-parallelization errors
if torch.cuda.device_count() > 1:
    try:
        model = nn.DataParallel(model)
    except RuntimeError as e:
        print(f"Parallelization failed: {e}")
        model = model.to(device)  # fall back to a single GPU
9.6 Performance Optimization Tips
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
for data, targets in train_loader:
    optimizer.zero_grad()                # clear gradients before each step
    with autocast():                     # run the forward pass in mixed precision
        output = model(data)
        loss = criterion(output, targets)
    scaler.scale(loss).backward()        # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)
    scaler.update()
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,      # parallel data-loading workers
    pin_memory=True     # faster host-to-GPU transfers
)
model.eval()
with torch.no_grad():
    traced_model = torch.jit.trace(model, torch.randn(1, 3, 224, 224))
    output = traced_model(input_data)
9.7 Practical Application Examples
import math
import numpy as np
import torch.nn.functional as F
from torchvision.models import resnet50

# transfer learning: fine-tune a pretrained ResNet-50 on a new task
def create_transfer_model(num_classes):
    model = resnet50(pretrained=True)
    for param in model.parameters():
        param.requires_grad = False      # freeze the backbone
    model.fc = nn.Linear(model.fc.in_features, num_classes)  # new trainable head
    return model
# model ensembling: average the predictions of several models
class EnsembleModel(nn.Module):
    def __init__(self, models):
        super().__init__()
        self.models = nn.ModuleList(models)

    def forward(self, x):
        outputs = [model(x) for model in self.models]
        return torch.stack(outputs).mean(0)
# focal loss: down-weights easy examples to address class imbalance
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)   # model's probability for the true class
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()
# scaled dot-product self-attention
class SelfAttention(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.query = nn.Linear(dim, dim)
        self.key = nn.Linear(dim, dim)
        self.value = nn.Linear(dim, dim)

    def forward(self, x):
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
        attention = F.softmax(scores, dim=-1)
        return torch.matmul(attention, v)
# a minimal GAN: MLP generator and discriminator
class Generator(nn.Module):
    def __init__(self, latent_dim, img_shape):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, int(np.prod(img_shape))),
            nn.Tanh()
        )
        self.img_shape = img_shape

    def forward(self, z):
        img = self.model(z)
        return img.view(img.size(0), *self.img_shape)

class Discriminator(nn.Module):
    def __init__(self, img_shape):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(int(np.prod(img_shape)), 256),
            nn.LeakyReLU(0.2),
            nn.Linear(256, 128),
            nn.LeakyReLU(0.2),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, img):
        img_flat = img.view(img.size(0), -1)
        return self.model(img_flat)
10. Advanced Training Techniques
10.1 Gradient Accumulation
accumulation_steps = 4
optimizer.zero_grad()
for i, (data, target) in enumerate(train_loader):
    output = model(data)
    loss = criterion(output, target) / accumulation_steps  # normalize per micro-batch
    loss.backward()                                        # gradients accumulate
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
10.2 Learning-Rate Scheduling
# cosine annealing over T_max steps
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=200)

# cosine annealing with warm restarts (cycle length doubles each restart)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=50, T_mult=2)

# one-cycle policy, stepped once per batch
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=0.1,
    steps_per_epoch=len(train_loader),
    epochs=num_epochs
)
10.3 Knowledge Distillation
class DistillationLoss(nn.Module):
    def __init__(self, alpha=0.5, temperature=2.0):
        super().__init__()
        self.alpha = alpha
        self.T = temperature

    def forward(self, student_outputs, teacher_outputs, targets):
        # hard loss: student vs. ground-truth labels
        hard_loss = F.cross_entropy(student_outputs, targets)
        # soft loss: student vs. temperature-softened teacher distribution
        soft_loss = nn.KLDivLoss(reduction='batchmean')(
            F.log_softmax(student_outputs / self.T, dim=1),
            F.softmax(teacher_outputs / self.T, dim=1)
        ) * (self.T * self.T)
        return self.alpha * hard_loss + (1 - self.alpha) * soft_loss
10.4 Adversarial Training
def fgsm_attack(data, epsilon, data_grad):
    # FGSM: perturb the input in the direction of the gradient sign
    sign_data_grad = data_grad.sign()
    perturbed_data = data + epsilon * sign_data_grad
    perturbed_data = torch.clamp(perturbed_data, 0, 1)
    return perturbed_data

def train_with_adversarial(model, train_loader, optimizer, epsilon):
    for data, target in train_loader:
        data.requires_grad_(True)   # required so data.grad is populated
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        data_grad = data.grad.data
        # second pass on the adversarial example; its gradients are
        # accumulated on top of the clean-pass gradients before the step
        perturbed_data = fgsm_attack(data.detach(), epsilon, data_grad)
        output = model(perturbed_data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
10.5 Half-Precision and Mixed-Precision Training
# automatic mixed precision (AMP): numerically safer than pure fp16
scaler = torch.cuda.amp.GradScaler()
optimizer = torch.optim.Adam(model.parameters())

for data, target in train_loader:
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        output = model(data)
        loss = criterion(output, target)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
# pure fp16: casts the whole model and inputs to half precision
# (simpler but less numerically stable than AMP; typically used for inference)
model.half()
for data, target in train_loader:
    data = data.half()
    output = model(data)
    loss = criterion(output, target)