从hip cuda kernel 的汇编语言来感受 AMD GPU内部工作方式

发布于:2024-06-17 ⋅ 阅读:(139) ⋅ 点赞:(0)


0, 无参数 kernel 汇编语言示例

./param_00.hip

__global__ void WWWWW()
{
    ((int*)0x8888888)[3] = 0x77777;
}

../../../local_amdgpu/bin/clang++ -O1  --cuda-device-only --offload-arch=gfx906 -S ./param_00.hip -o param_00.s


_Z5WWWWWv:                              ; @_Z5WWWWWv
; %bb.0:                                ; %entry
    v_mov_b32_e32 v0, 0x8888894
    v_mov_b32_e32 v1, 0
    v_mov_b32_e32 v2, 0x77777
    flat_store_dword v[0:1], v2
    s_endpgm
    .section    .rodata,"a",@progbits
    .p2align    6, 0x0
    .amdhsa_kernel _Z5WWWWWv


1,一个参数的 kernel 的汇编语言示例

param_01.hip

__global__ void MMMMM(int* A)
{
    A[10] = 0x77777;

}


    
../../../local_amdgpu/bin/clang++ -O1  --cuda-device-only --offload-arch=gfx906 -S ./param_01.hip -o param_01.s


_Z5MMMMMPi:                             ; @_Z5MMMMMPi
; %bb.0:                                ; %entry
    s_load_dwordx2 s[0:1], s[4:5], 0x0
    v_mov_b32_e32 v0, 0
    v_mov_b32_e32 v1, 0x77777
    s_waitcnt lgkmcnt(0)
    global_store_dword v0, v1, s[0:1] offset:40
    s_endpgm
    .section    .rodata,"a",@progbits
    .p2align    6, 0x0
    .amdhsa_kernel _Z5MMMMMPi


2, 两个参数的kernel 的汇编语言

param_02.hip

__global__ void MMMMM(int* AA, int *BB)
{
    AA[10] = 0x77777;
    BB[20] = 0x33333;

}

../../../local_amdgpu/bin/clang++ -O1  --cuda-device-only --offload-arch=gfx906 -S ./param_02.hip -o param_02.s


_Z5MMMMMPiS_:                           ; @_Z5MMMMMPiS_
; %bb.0:                                ; %entry
    s_load_dwordx4 s[0:3], s[4:5], 0x0
    v_mov_b32_e32 v0, 0
    v_mov_b32_e32 v1, 0x77777
    v_mov_b32_e32 v2, 0x33333
    s_waitcnt lgkmcnt(0)
    global_store_dword v0, v1, s[0:1] offset:40
    global_store_dword v0, v2, s[2:3] offset:80
    s_endpgm
    .section    .rodata,"a",@progbits
    .p2align    6, 0x0
    .amdhsa_kernel _Z5MMMMMPiS_


3, 三个参数的kernel 的汇编语言


param_03.hip

__global__ void MMMMM(int* AA, int* BB, int* CC)
{
    AA[10] = 0x77777;
    BB[20] = 0x33333;
    CC[30] = 0x12121;

}


    ../../../local_amdgpu/bin/clang++ -O1  --cuda-device-only --offload-arch=gfx906 -S ./param_03.hip -o param_03.s


_Z5MMMMMPiS_S_:                         ; @_Z5MMMMMPiS_S_
; %bb.0:                                ; %entry
    s_load_dwordx4 s[0:3], s[4:5], 0x0
    v_mov_b32_e32 v0, 0
    s_load_dwordx2 s[4:5], s[4:5], 0x10
    v_mov_b32_e32 v1, 0x77777
    v_mov_b32_e32 v2, 0x33333
    s_waitcnt lgkmcnt(0)
    global_store_dword v0, v1, s[0:1] offset:40
    global_store_dword v0, v2, s[2:3] offset:80
    v_mov_b32_e32 v1, 0x12121
    global_store_dword v0, v1, s[4:5] offset:120
    s_endpgm
    .section    .rodata,"a",@progbits
    .p2align    6, 0x0
    .amdhsa_kernel _Z5MMMMMPiS_S_


4,四个参数的kernel 的汇编语言


param_04.hip

__global__ void MMMMM(int* AA, int* BB, int* CC, int* DD)
{
    AA[10] = 0x77777;
    BB[20] = 0x33333;
    CC[30] = 0x22222;
    DD[40] = 0x44444;

}

../../../local_amdgpu/bin/clang++ -O1  --cuda-device-only --offload-arch=gfx906 -S ./param_04.hip -o param_04.s

_Z5MMMMMPiS_S_S_:                       ; @_Z5MMMMMPiS_S_S_
; %bb.0:                                ; %entry
    s_load_dwordx8 s[0:7], s[4:5], 0x0
    v_mov_b32_e32 v0, 0
    v_mov_b32_e32 v1, 0x77777
    v_mov_b32_e32 v2, 0x33333
    s_waitcnt lgkmcnt(0)
    global_store_dword v0, v1, s[0:1] offset:40
    global_store_dword v0, v2, s[2:3] offset:80
    v_mov_b32_e32 v1, 0x22222
    global_store_dword v0, v1, s[4:5] offset:120
    v_mov_b32_e32 v1, 0x44444
    global_store_dword v0, v1, s[6:7] offset:160
    s_endpgm
    .section    .rodata,"a",@progbits
    .p2align    6, 0x0
    .amdhsa_kernel _Z5MMMMMPiS_S_S_

5, 带线程索引的 kernel 的汇编语言

 param_threadid.hip

#include <hip/hip_runtime.h>

__global__ void MMMMM(int* AA)
{
    AA[threadIdx.x + 10] = 0x77777;

}


../../../local_amdgpu/bin/clang++ -O1  --cuda-device-only --offload-arch=gfx906 -S ./param_threadid.hip -o param_threadid.s

_Z5MMMMMPi:                             ; @_Z5MMMMMPi
; %bb.0:                                ; %entry
    s_load_dwordx2 s[0:1], s[4:5], 0x0
    v_lshlrev_b32_e32 v0, 2, v0
    v_mov_b32_e32 v1, 0x77777
    s_waitcnt lgkmcnt(0)
    global_store_dword v0, v1, s[0:1] offset:40
    s_endpgm
    .section    .rodata,"a",@progbits
    .p2align    6, 0x0
    .amdhsa_kernel _Z5MMMMMPi


网站公告

今日签到

点亮在社区的每一天
去签到