Analyzing PyTorch and Triton Operators with the Roofline Model

Published: 2024-07-04

This post demonstrates how to use the Roofline model to analyze PyTorch and Triton operators.
Open question: how does NVIDIA Nsight Compute arrive at its "Peak Work" value? It is not the theoretical peak FLOPS -> so how does Nsight Compute construct its roofline?
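A hedged note on that question: Nsight Compute appears to derive its roofline ceilings from the hardware counters' sustained peak rates (the peak_sustained metric suffix, e.g. the peak FFMA issue rate per cycle times the clock) rather than from a datasheet FLOPS number, which is why its "Peak Work" need not match the theoretical peak computed in step 5 below. The suffixes available for the FFMA counter can be queried with ncu itself:

/usr/local/cuda/bin/ncu --query-metrics-mode suffix --metrics sm__sass_thread_inst_executed_op_ffma_pred_on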

1.Reference links

2.Test environment

+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.07             Driver Version: 535.161.07   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 3060        On  | 00000000:03:00.0 Off |                  N/A |
|  0%   48C    P5              29W / 170W |     18MiB / 12288MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
torch==2.3.1+cu121

3.Install dependencies

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
export PATH=$PATH:/usr/local/cuda/bin
export CUDA_HOME=/usr/local/cuda
pip install pycuda fvcore matplotlib
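A quick way to confirm the Python-side dependencies are importable (triton itself ships with the PyTorch 2.x wheels):

python -c "import pycuda.autoinit, fvcore, triton; print('deps ok')"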

4.Lock the GPU clocks

# highest supported graphics / memory clocks reported by the driver
MAX_Graphics_CLOCK=`nvidia-smi -q -d SUPPORTED_CLOCKS | grep 'Graphics' | sed 's/[^0-9]//g' | sort -n | uniq | tail -n 1`
MAX_Memory_CLOCK=`nvidia-smi -q -d SUPPORTED_CLOCKS | grep 'Memory' | sed 's/[^0-9]//g' | sort -n | uniq | tail -n 1`
nvidia-smi -pm 1                                           # enable persistence mode
nvidia-smi -lgc $MAX_Graphics_CLOCK,$MAX_Graphics_CLOCK    # lock the graphics clock
nvidia-smi -i 0 -ac $MAX_Memory_CLOCK,$MAX_Graphics_CLOCK  # set application clocks
nvidia-smi -q -d CLOCK                                     # verify
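When benchmarking is done, the locks can be released again with nvidia-smi's reset flags:

nvidia-smi -rgc
nvidia-smi -i 0 -rac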

5.Get the theoretical peak FLOPS

tee Theoretical_FLOPS.py <<-'EOF'
import pycuda.driver as cuda
import pycuda.autoinit

def get_gpu_compute_capability_and_clock_rate():
    device = cuda.Device(0)
    compute_capability = device.compute_capability()
    clock_rate = device.get_attribute(cuda.device_attribute.CLOCK_RATE)  # in kHz
    sm_count = device.get_attribute(cuda.device_attribute.MULTIPROCESSOR_COUNT)
    cores_per_sm = get_cuda_cores_per_sm(compute_capability)
    return compute_capability, clock_rate, sm_count, cores_per_sm

def get_cuda_cores_per_sm(compute_capability):
    # FP32 CUDA cores per SM for each architecture
    major, minor = compute_capability
    if major == 2:    # Fermi
        return 32
    elif major == 3:  # Kepler
        return 192
    elif major == 5:  # Maxwell
        return 128
    elif major == 6 and minor == 0:          # Pascal GP100
        return 64
    elif major == 6 and minor in (1, 2):     # Pascal GP10x
        return 128
    elif major == 7:                         # Volta / Xavier / Turing
        return 64
    elif major == 8 and minor == 0:          # Ampere GA100 (A100)
        return 64
    elif major == 8 and minor in (6, 7, 9):  # Ampere GA10x / Orin / Ada
        return 128
    else:
        raise ValueError("Unknown compute capability")

def calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm):
    clock_rate_hz = clock_rate * 1e3  # Convert kHz to Hz
    flops = clock_rate_hz * sm_count * cores_per_sm * 2  # 2 FLOPs per clock per core (FMA)
    return flops

compute_capability, clock_rate, sm_count, cores_per_sm = get_gpu_compute_capability_and_clock_rate()
theoretical_flops = calculate_theoretical_flops(clock_rate, sm_count, cores_per_sm)
print(f"GPU compute capability: {compute_capability}")
print(f"Clock rate (kHz): {clock_rate}")
print(f"Number of SMs: {sm_count}")
print(f"Cores per SM: {cores_per_sm}")
print(f"Theoretical FLOPS for float32: {theoretical_flops / 1e12} TFLOPS")
EOF
python Theoretical_FLOPS.py

Output

GPU compute capability: (8, 6)
Clock rate (kHz): 1852000
Number of SMs: 28
Cores per SM: 128
Theoretical FLOPS for float32: 13.275136 TFLOPS
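The printed peak is simply SM clock × number of SMs × FP32 cores per SM × 2 (one FMA counts as two FLOPs): 1.852 GHz × 28 × 128 × 2 ≈ 13.275 TFLOPS. The script in step 6 also needs a memory-bandwidth roof, which it hard-codes as 360 GB/s; below is a minimal sketch of deriving that figure from the same pycuda attributes, assuming the driver reports the double-data-rate base clock (on this RTX 3060: 7501000 kHz × 2 transfers/clock × 192-bit bus / 8 ≈ 360 GB/s):

import pycuda.driver as cuda
import pycuda.autoinit

dev = cuda.Device(0)
mem_clock_khz = dev.get_attribute(cuda.device_attribute.MEMORY_CLOCK_RATE)        # in kHz
bus_width_bits = dev.get_attribute(cuda.device_attribute.GLOBAL_MEMORY_BUS_WIDTH)
bandwidth = mem_clock_khz * 1e3 * 2 * bus_width_bits / 8  # assumes 2 transfers per clock (DDR)
print(f"Theoretical memory bandwidth: {bandwidth / 1e9:.1f} GB/s")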

6.Create the test script

tee roofline_model.py <<-'EOF'
import math
import argparse
import torch
import torch.nn as nn
import triton
import triton.language as tl
from fvcore.nn import FlopCountAnalysis, ActivationCountAnalysis
import matplotlib.pyplot as plt
import numpy as np

# A simple test model: a single bias-free Linear layer
class SimpleModel(nn.Module):
    def __init__(self,input_features,output_features):
        super(SimpleModel, self).__init__()
        self.fc1 = nn.Linear(input_features,output_features,bias=False)
    def forward(self, x):
        x = self.fc1(x)
        return x

@triton.jit
def sgemm_kernel(
    A, B, C,
    M, N, K,
    stride_am, stride_ak,
    stride_bk, stride_bn,
    stride_cm, stride_cn,
    BLOCK_SIZE: tl.constexpr
):
    """ Kernel for computing C = A @ B """
    # Define the program ids
    pid_m = tl.program_id(0)
    pid_n = tl.program_id(1)

    # Create base pointers for A and B and C
    offs_am = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_bn = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    offs_ak = tl.arange(0, BLOCK_SIZE)
    
    a_ptrs = A + (stride_am * offs_am[:, None] + stride_ak * offs_ak[None, :])
    b_ptrs = B + (stride_bk * offs_ak[:, None] + stride_bn * offs_bn[None, :])
    
    # Initialize accumulator
    acc = tl.zeros((BLOCK_SIZE, BLOCK_SIZE), dtype=tl.float32)
    
    # Loop over the K dimension. Note: the masks only guard the M/N edges,
    # so K is assumed to be a multiple of BLOCK_SIZE (true here: 8192 / 64)
    for k in range(0, K, BLOCK_SIZE):
        a = tl.load(a_ptrs, mask=offs_am[:, None] < M)
        b = tl.load(b_ptrs, mask=offs_bn[None, :] < N)
        acc += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE * stride_ak
        b_ptrs += BLOCK_SIZE * stride_bk

    # Write back results
    c_ptrs = C + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
    tl.store(c_ptrs, acc, mask=(offs_am[:, None] < M) & (offs_bn[None, :] < N))

class TritonLinear(nn.Module):
    def __init__(self, in_features, out_features):
        super(TritonLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        # create the parameter on the GPU directly; nn.Parameter(...).cuda()
        # would return a plain tensor instead of a registered parameter
        self.weight = nn.Parameter(torch.randn(out_features, in_features, device='cuda'))

    def forward(self, x):
        assert x.shape[1] == self.in_features
        out = torch.empty((x.shape[0], self.out_features), device=x.device, dtype=x.dtype)
        grid = lambda META: (math.ceil(x.shape[0] / META['BLOCK_SIZE']), math.ceil(self.out_features / META['BLOCK_SIZE']))
        # Note: weight is stored as (out_features, in_features) but is passed with its
        # natural strides, so the kernel computes x @ weight viewed as a (K, N) matrix
        # rather than nn.Linear's x @ weight.T. For this square, random-weight benchmark
        # the FLOP count and memory traffic are identical either way.
        sgemm_kernel[grid](
            x, self.weight, out,
            x.shape[0], self.out_features, self.in_features,
            x.stride(0), x.stride(1),
            self.weight.stride(0), self.weight.stride(1),
            out.stride(0), out.stride(1),
            BLOCK_SIZE=64
        )
        return out
        
def main(args):
    # Model and input data
    input_features = 8192
    output_features = 8192
    batch_size = 8192

    model = SimpleModel(input_features, output_features)
    input_data = torch.randn(batch_size, input_features)

    test_count = 10
    # Count FLOPs and activations (note: fvcore counts one multiply-add as a single FLOP)
    flops = FlopCountAnalysis(model, input_data).total() * test_count
    activations = ActivationCountAnalysis(model, input_data).total() + input_data.numel()
    print("activations:", activations)
    # Parameter count
    params = sum(p.numel() for p in model.parameters())

    # Estimated memory traffic: activations and params times 4 bytes (both are float32)
    activation_memory_access = activations * 4
    params_memory_access = params * 4
    memory_access = (activation_memory_access + params_memory_access) * test_count
    
    if args.triton_kernel:
        model = TritonLinear(in_features=input_features, out_features=output_features)
    else:
        model = model.cuda()

    input_data = input_data.float().cuda()

    # Warm up (the first Triton call also triggers JIT compilation)
    for i in range(5):
        output = model(input_data)
        torch.cuda.synchronize()
    
    if args.warmup_only:
        return

    if False:  # flip to True to time with CUDA events instead of the profiler
        # CUDA events for measuring execution time
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
        for _ in range(test_count):
            output = model(input_data)
        end_event.record()
        torch.cuda.synchronize()
        total_cuda_time = start_event.elapsed_time(end_event) / 1000  # ms -> s
    else:
        # Use the PyTorch profiler to get per-kernel CUDA time
        with torch.profiler.profile(
            activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
            for _ in range(test_count):
                output = model(input_data)
        key_averages = prof.key_averages()
        for ev in key_averages:
            print(ev)
        # both the cuBLAS kernel (ampere_sgemm_*) and our Triton kernel (sgemm_kernel)
        # contain "sgemm" in their names
        total_cuda_time = sum([event.self_cuda_time_total for event in key_averages if event.key.find("sgemm")>=0]) / 1e6  # us -> s

    # FLOPs -> GFLOPs
    flops_measured_glops = flops / 1e9

    # Measured bandwidth
    memory_access_gb = memory_access / 1e9
    bandwidth_measured = memory_access_gb / total_cuda_time  # GB/s
    print("bandwidth_measured:", bandwidth_measured)

    # GPU peak performance (from step 5) and theoretical memory bandwidth
    peak_performance = 13.275136 * 1e3  # GFLOPs
    memory_bandwidth = 360.0  # GB/s

    # Roofline curve
    Io = np.logspace(-2, 4, 100)  # operational intensity, GFLOPs/GB
    performance = np.minimum(peak_performance, Io * memory_bandwidth)  # attainable FLOP/s at each intensity, capped at peak

    # Draw the Roofline model
    plt.figure(figsize=(10, 6))

    threshold = 0.75

    # Bandwidth-bound region
    x = Io[Io < (peak_performance / memory_bandwidth)]
    plt.fill_between(x, np.minimum(peak_performance, x * memory_bandwidth) * threshold,
                np.minimum(peak_performance, x * memory_bandwidth),
                color='lightblue', alpha=0.6, label='Bandwidth Bound')

    # Compute-bound region
    x2 = Io[Io >= (peak_performance / memory_bandwidth)]
    plt.fill_between(x2, np.minimum(peak_performance, x2 * memory_bandwidth) * threshold,
                 np.minimum(peak_performance, x2 * memory_bandwidth),
                 color='green', alpha=0.6, label='Compute Bound')

    # Everything below 75% of the roof counts as poor performance
    plt.fill_between(Io, 0, np.minimum(peak_performance, Io * memory_bandwidth) * threshold,
                 color='gray', alpha=0.6, label='poor performance')

    plt.axhline(y=peak_performance, color='b', linestyle='--',
            label=f'Peak compute: {peak_performance/1e3:.2f} TFLOPs')

    plt.axvline(x=peak_performance / memory_bandwidth, color='g', linestyle='--',
            label=f'Ridge point: {peak_performance / memory_bandwidth:.2f} GFLOPs/GB')

    plt.loglog(Io, performance, label='Roofline')

    # Operational intensity is a static property of the algorithm
    arithmetic_intensity_measured = flops_measured_glops / memory_access_gb  # GFLOPs/GB
    point_y = arithmetic_intensity_measured * bandwidth_measured

    plt.scatter(arithmetic_intensity_measured, point_y, c='r',
            label=f'Measured Point {point_y/1e3:.2f} TFLOPs/sec {point_y*100/peak_performance:.2f}%')

    plt.xlabel('Operational intensity [GFLOPs/GB]')
    plt.ylabel('Performance [GFLOPs/sec]')
    plt.title('Roofline Model')
    plt.legend()

    # Save the figure without displaying it
    plt.savefig('roofline_model.png')
    plt.close()

    print(f"FLOPs: {flops} FLOPs")
    print(f"Memory traffic: {memory_access} bytes")
    print(f"Execution time: {total_cuda_time:.4f} s")
    print(f"Percent of theoretical peak: {point_y*100/peak_performance:.2f}%")


parser = argparse.ArgumentParser(description='Roofline analysis of PyTorch and Triton GEMM operators')
parser.add_argument("--warmup_only", action="store_true", help="only run the warmup iterations (useful under ncu)")
parser.add_argument("--triton_kernel", action="store_true", help="benchmark the Triton kernel instead of nn.Linear")

args = parser.parse_args()
main(args)
EOF

7.Run the test program to generate the Roofline plot

python roofline_model.py
python roofline_model.py --triton_kernel
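Note that each run saves its plot to roofline_model.png, so the second command overwrites the first plot; rename the file between runs to keep both, e.g.:

python roofline_model.py && mv roofline_model.png roofline_linear.png
python roofline_model.py --triton_kernel && mv roofline_model.png roofline_triton.png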

Output

nn.Linear:

FLOPs: 5497558138880 FLOPs
Memory traffic: 8053063680 bytes
Execution time: 1.3862 s
Percent of theoretical peak: 29.87%

Triton kernel:

FLOPs: 5497558138880 FLOPs
Memory traffic: 8053063680 bytes
Execution time: 1.0957 s
Percent of theoretical peak: 37.80%
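These numbers can be checked by hand. With M = N = K = 8192 and 10 timed iterations, and remembering that fvcore counts one multiply-add as a single FLOP (so the percent-of-peak figures above are roughly half of what a 2-FLOPs-per-FMA convention would report), a quick sketch:

M = N = K = 8192
test_count = 10
flops = M * N * K * test_count          # 5497558138880, as printed
elems = M*K + M*N + K*N                 # input + output activations + weight
bytes_moved = elems * 4 * test_count    # 8053063680 bytes, as printed
print(flops / bytes_moved)              # ≈ 682.7 FLOPs/byte = GFLOPs/GB
print(13275.136 / 360.0)                # ridge point ≈ 36.88 GFLOPs/GB

The measured operational intensity (≈683 GFLOPs/GB) sits far to the right of the ridge point, so both kernels are firmly compute-bound.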

8.Generate a Roofline with NVIDIA Nsight Compute

/usr/local/cuda/bin/ncu -f --section SpeedOfLight_HierarchicalSingleRooflineChart \
        --section ComputeWorkloadAnalysis --section MemoryWorkloadAnalysis \
        --target-processes all --export roofline_report python roofline_model.py --warmup_only


/usr/local/cuda/bin/ncu -f --section SpeedOfLight_HierarchicalSingleRooflineChart \
        --section ComputeWorkloadAnalysis --section MemoryWorkloadAnalysis \
        --target-processes all --export roofline_triton_kernel_report python roofline_model.py --warmup_only --triton_kernel
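The exported reports can then be opened in the Nsight Compute GUI to inspect the roofline charts shown in section 9:

ncu-ui roofline_report.ncu-rep
ncu-ui roofline_triton_kernel_report.ncu-rep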

9.Result plots

A.nn.Linear

(figures: Roofline charts for nn.Linear)

B.Triton implementation

(figures: Roofline charts for the Triton kernel)