; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s

; LDS is allocated per-kernel. Module scope variables are gathered into a struct which is
; allocated at address zero, if used by the kernel. Kernel scope variables are gathered into
; a per-kernel struct and allocated immediately after the module scope.
; This test checks that the module and kernel scope variables are allocated in deterministic
; order without spurious alignment padding between the two.

; External LDS is checked because it influences LDS padding in general and because it will
; not be moved into either the module or the kernel struct.

; How to read the expected assembly below: every LDS store is a ds_write whose address
; register holds a constant (plus an optional "offset:N"), so the LDS address assigned to
; each variable can be read directly from the immediates in the assertions.

; 2-byte variable that ends up in the module scope struct (see @use_module).
@module_variable = addrspace(3) global i16 undef

; Variables are allocated into module scope block when used by a non-kernel function.
; The expected assembly stores through a zero address register with no offset, i.e. the
; module variable is at LDS address 0. The function is marked noinline (#0) and the
; module_1_* kernels below reach it through a real call (s_swappc_b64).
define void @use_module() #0 {
; CHECK-LABEL: use_module:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    s_waitcnt_vscnt null, 0x0
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    ds_write_b16 v0, v0
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
  store i16 0, ptr addrspace(3) @module_variable
  ret void
}

; Variables only used by kernels are specialised and allocated per-kernel.
; Both are 2 bytes wide; @kernel_overalign additionally requests 4-byte alignment.
@kernel_normal = addrspace(3) global i16 undef
@kernel_overalign = addrspace(3) global i16 undef, align 4

; External LDS shall not introduce padding between module and kernel scope variables.
; Both are zero-length float arrays indexed by the kernel argument; @extern_overalign
; additionally requests 8-byte alignment.
@extern_normal = external addrspace(3) global [0 x float]
@extern_overalign = external addrspace(3) global [0 x float], align 8

; 2^3 cases encoded into function names:
;   module_{0,1}               - 1 if the kernel calls @use_module and writes @module_variable
;   kernel_{normal,overalign}  - which kernel scope variable the kernel writes
;   extern_{normal,overalign}  - which external array the kernel writes
; In every kernel the external array element address is computed as (idx << 2) + base,
; where base is the s_add_i32 immediate in the expected assembly.

; Expected layout: @kernel_normal at 0, external array base at 4.
define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_normal_extern_normal:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
; CHECK-NEXT:    s_add_i32 s0, s0, 4
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    ds_write_b16 v0, v1
; CHECK-NEXT:    ds_write_b32 v2, v0
; CHECK-NEXT:    s_endpgm
  store i16 2, ptr addrspace(3) @kernel_normal

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}

; Expected layout: @module_variable at 0, @kernel_normal at 2 (no padding in between),
; external array base at 4.
define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
; CHECK-LABEL: module_1_kernel_normal_extern_normal:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s8, s8, s11
; CHECK-NEXT:    s_mov_b32 s32, 0
; CHECK-NEXT:    s_addc_u32 s9, s9, 0
; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT:    s_add_u32 s0, s0, s11
; CHECK-NEXT:    s_addc_u32 s1, s1, 0
; CHECK-NEXT:    s_getpc_b64 s[8:9]
; CHECK-NEXT:    s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT:    s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT:    s_lshl_b32 s4, s12, 2
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_add_i32 s4, s4, 4
; CHECK-NEXT:    v_mov_b32_e32 v2, 2
; CHECK-NEXT:    v_mov_b32_e32 v3, s4
; CHECK-NEXT:    ds_write_b16 v0, v1
; CHECK-NEXT:    ds_write_b16 v0, v2 offset:2
; CHECK-NEXT:    ds_write_b32 v3, v0
; CHECK-NEXT:    s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_normal

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}

; Expected layout: @kernel_overalign at 0, external array base at 4.
define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_overalign_extern_normal:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
; CHECK-NEXT:    s_add_i32 s0, s0, 4
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    ds_write_b16 v0, v1
; CHECK-NEXT:    ds_write_b32 v2, v0
; CHECK-NEXT:    s_endpgm
  store i16 2, ptr addrspace(3) @kernel_overalign

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}

; Expected layout: @module_variable at 0, @kernel_overalign at 4 (its requested
; alignment), external array base at 8.
define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s8, s8, s11
; CHECK-NEXT:    s_mov_b32 s32, 0
; CHECK-NEXT:    s_addc_u32 s9, s9, 0
; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT:    s_add_u32 s0, s0, s11
; CHECK-NEXT:    s_addc_u32 s1, s1, 0
; CHECK-NEXT:    s_getpc_b64 s[8:9]
; CHECK-NEXT:    s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT:    s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT:    s_lshl_b32 s4, s12, 2
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_add_i32 s4, s4, 8
; CHECK-NEXT:    v_mov_b32_e32 v2, 2
; CHECK-NEXT:    v_mov_b32_e32 v3, s4
; CHECK-NEXT:    ds_write_b16 v0, v1
; CHECK-NEXT:    ds_write_b16 v0, v2 offset:4
; CHECK-NEXT:    ds_write_b32 v3, v0
; CHECK-NEXT:    s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_overalign

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_normal, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}

; Expected layout: @kernel_normal at 0, external array base at 8 (its requested
; alignment).
define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_normal_extern_overalign:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
; CHECK-NEXT:    s_add_i32 s0, s0, 8
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    ds_write_b16 v0, v1
; CHECK-NEXT:    ds_write_b32 v2, v0
; CHECK-NEXT:    s_endpgm
  store i16 2, ptr addrspace(3) @kernel_normal

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}

; Expected layout: @module_variable at 0, @kernel_normal at 2 (the over-aligned
; external array does not push it away from the module variable), external array base
; at 8.
define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s8, s8, s11
; CHECK-NEXT:    s_mov_b32 s32, 0
; CHECK-NEXT:    s_addc_u32 s9, s9, 0
; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT:    s_add_u32 s0, s0, s11
; CHECK-NEXT:    s_addc_u32 s1, s1, 0
; CHECK-NEXT:    s_getpc_b64 s[8:9]
; CHECK-NEXT:    s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT:    s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT:    s_lshl_b32 s4, s12, 2
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_add_i32 s4, s4, 8
; CHECK-NEXT:    v_mov_b32_e32 v2, 2
; CHECK-NEXT:    v_mov_b32_e32 v3, s4
; CHECK-NEXT:    ds_write_b16 v0, v1
; CHECK-NEXT:    ds_write_b16 v0, v2 offset:2
; CHECK-NEXT:    ds_write_b32 v3, v0
; CHECK-NEXT:    s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_normal

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}

; Expected layout: @kernel_overalign at 0, external array base at 8.
define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx) #1 {
; CHECK-LABEL: module_0_kernel_overalign_extern_overalign:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_lshl_b32 s0, s0, 2
; CHECK-NEXT:    s_add_i32 s0, s0, 8
; CHECK-NEXT:    v_mov_b32_e32 v2, s0
; CHECK-NEXT:    ds_write_b16 v0, v1
; CHECK-NEXT:    ds_write_b32 v2, v0
; CHECK-NEXT:    s_endpgm
  store i16 2, ptr addrspace(3) @kernel_overalign

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}

; Expected layout: @module_variable at 0, @kernel_overalign at 4, external array base
; at 8.
define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    s_add_u32 s8, s8, s11
; CHECK-NEXT:    s_mov_b32 s32, 0
; CHECK-NEXT:    s_addc_u32 s9, s9, 0
; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
; CHECK-NEXT:    s_add_u32 s0, s0, s11
; CHECK-NEXT:    s_addc_u32 s1, s1, 0
; CHECK-NEXT:    s_getpc_b64 s[8:9]
; CHECK-NEXT:    s_add_u32 s8, s8, use_module@gotpcrel32@lo+4
; CHECK-NEXT:    s_addc_u32 s9, s9, use_module@gotpcrel32@hi+12
; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x0
; CHECK-NEXT:    s_load_dword s12, s[6:7], 0x0
; CHECK-NEXT:    s_mov_b64 s[6:7], s[4:5]
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_swappc_b64 s[30:31], s[10:11]
; CHECK-NEXT:    s_lshl_b32 s4, s12, 2
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    v_mov_b32_e32 v1, 1
; CHECK-NEXT:    s_add_i32 s4, s4, 8
; CHECK-NEXT:    v_mov_b32_e32 v2, 2
; CHECK-NEXT:    v_mov_b32_e32 v3, s4
; CHECK-NEXT:    ds_write_b16 v0, v1
; CHECK-NEXT:    ds_write_b16 v0, v2 offset:4
; CHECK-NEXT:    ds_write_b32 v3, v0
; CHECK-NEXT:    s_endpgm
  call void @use_module()
  store i16 1, ptr addrspace(3) @module_variable

  store i16 2, ptr addrspace(3) @kernel_overalign

  %arrayidx1 = getelementptr inbounds [0 x float], ptr addrspace(3) @extern_overalign, i32 0, i32 %idx
  store float 0.0, ptr addrspace(3) %arrayidx1
  ret void
}

; #0 is applied to @use_module so that it stays a separate function that kernels call.
attributes #0 = { noinline }
; #1 is applied only to the module_0_* kernels, i.e. the ones that never call @use_module.
; NOTE(review): judging by its name, this attribute presumably tells the backend not to
; reserve the module scope struct in the marked kernel - confirm against the AMDGPU LDS
; lowering pass.
attributes #1 = { "amdgpu-elide-module-lds" }