diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 319b9e14fc21..ab8e979e7b40 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1092,7 +1092,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 // is transformed to:
 //
 // addr2 = gep float, float* p, i64 a ; inbounds removed
- // addr = gep inbounds float, float* addr2, i64 5
+ // addr = gep float, float* addr2, i64 5 ; inbounds removed
 //
 // If a is -4, although the old index b is in bounds, the new index a is
 // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
@@ -1103,7 +1103,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 // TODO(jingyue): do some range analysis to keep as many inbounds as
 // possible. GEPs with inbounds are more friendly to alias analysis.
 // TODO(gep_nowrap): Preserve nuw at least.
- bool GEPWasInBounds = GEP->isInBounds();
+ GEPNoWrapFlags NewGEPFlags = GEPNoWrapFlags::none();
 GEP->setNoWrapFlags(GEPNoWrapFlags::none());

 // Lowers a GEP to either GEPs with a single index or arithmetic operations.
@@ -1153,7 +1153,7 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
 IRBuilder<> Builder(GEP);
 NewGEP = cast<Instruction>(Builder.CreatePtrAdd(
 NewGEP, ConstantInt::get(PtrIdxTy, AccumulativeByteOffset, true),
- GEP->getName(), GEPWasInBounds));
+ GEP->getName(), NewGEPFlags));
 NewGEP->copyMetadata(*GEP);

 GEP->replaceAllUsesWith(NewGEP);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
index ebb345f56168..8a80afd4a768 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll
@@ -173,17 +173,22 @@ bb:
 ret void
 }

-define amdgpu_kernel void @store_load_vindex_kernel() {
+define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) {
 ; GFX9-LABEL: store_load_vindex_kernel:
 ; GFX9: ; %bb.0: ; %bb
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13
 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: v_mov_b32_e32 v2, 15
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
-; GFX9-NEXT: scratch_store_dword v1, v2, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_lshl_b32 s0, s0, 7
+; GFX9-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-NEXT: v_add_u32_e32 v2, s0, v2
 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: scratch_store_dword v2, v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_endpgm
@@ -194,10 +199,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-NEXT: s_addc_u32 s9, s9, 0
 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
+; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0
 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_lshl_b32 s0, s0, 7
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
 ; GFX10-NEXT: scratch_store_dword v0, v2, off
 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT: 
scratch_load_dword v0, v1, off offset:124 glc dlc @@ -206,25 +216,37 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX942-LABEL: store_load_vindex_kernel: ; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; GFX942-NEXT: v_mov_b32_e32 v2, 15 +; GFX942-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 -; GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, 15 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 7 +; GFX942-NEXT: v_add_u32_e32 v2, s0, v2 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX942-NEXT: scratch_store_dword v2, v1, off sc0 sc1 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s0, 7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc @@ -233,11 +255,19 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v2, 15 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b32 s0, s0, 7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS @@ -246,14 +276,19 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; UNALIGNED_GFX9-LABEL: store_load_vindex_kernel: ; UNALIGNED_GFX9: ; %bb.0: ; %bb +; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; UNALIGNED_GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, 15 +; UNALIGNED_GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; UNALIGNED_GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 -; UNALIGNED_GFX9-NEXT: scratch_store_dword v1, v2, off -; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: s_waitcnt 
lgkmcnt(0) +; UNALIGNED_GFX9-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 15 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, s0, v2 ; UNALIGNED_GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_dword v2, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_endpgm @@ -264,10 +299,15 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 ; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 ; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX10-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v2, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc @@ -276,25 +316,37 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; UNALIGNED_GFX942-LABEL: store_load_vindex_kernel: ; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 -; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v2, 15 +; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; UNALIGNED_GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 -; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, off sc0 sc1 -; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v1, 15 +; UNALIGNED_GFX942-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v2, s0, v2 ; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v2, v1, off sc0 sc1 +; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, s0, v0 ; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_vindex_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb +; UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; UNALIGNED_GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; UNALIGNED_GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX11-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; UNALIGNED_GFX11-NEXT: scratch_store_b32 v0, v2, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc @@ -303,24 +355,34 
@@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; UNALIGNED_GFX12-LABEL: store_load_vindex_kernel: ; UNALIGNED_GFX12: ; %bb.0: ; %bb +; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15 +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 +; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 +; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_endpgm bb: - %i = alloca [32 x float], align 4, addrspace(5) + %i.alloca = alloca [2 x [32 x float]], align 4, addrspace(5) + %i = getelementptr inbounds [2 x [32 x float]], ptr addrspace (5) %i.alloca, i32 0, i32 %n %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() %i3 = zext i32 %i2 to i64 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2 store volatile i32 15, ptr addrspace(5) %i7, align 4 - %i9 = sub nsw i32 31, %i2 - %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i9 = sub nsw i32 0, %i2 + %i9.1 = getelementptr [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i9.1, i32 0, i32 31 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 ret void } @@ -779,22 +841,26 @@ bb: ret void } -define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { +define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX9-LABEL: store_load_vindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 glc +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: scratch_load_dword v1, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s0, 7 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x100, v1 +; GFX9-NEXT: s_add_u32 s0, 0x100, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v2, off +; GFX9-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 0x100, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -805,6 +871,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 ; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -812,76 +879,99 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 -; GFX10-NEXT: scratch_store_dword v0, v2, off +; GFX10-NEXT: scratch_store_dword v0, v2, off offset:128 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s0, s0, 7 +; GFX10-NEXT: s_add_u32 s0, 0x100, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX942-LABEL: store_load_vindex_small_offset_kernel: ; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 7 +; GFX942-NEXT: s_add_u32 s0, 0x100, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, 15 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX942-NEXT: scratch_store_dword v1, v2, off offset:256 sc0 sc1 +; GFX942-NEXT: scratch_store_dword v1, v2, off offset:384 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v0, 0x100, v0 +; GFX942-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:256 dlc +; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:384 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s0, 7 +; GFX11-NEXT: s_add_u32 s0, 0x100, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:256 scope:SCOPE_SYS +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:380 scope:SCOPE_SYS +; GFX12-NEXT: s_lshl_b32 s0, s0, 7 +; GFX12-NEXT: s_add_co_u32 s0, 0x100, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; UNALIGNED_GFX9-LABEL: store_load_vindex_small_offset_kernel: ; UNALIGNED_GFX9: ; %bb.0: ; %bb +; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0 -; UNALIGNED_GFX9-NEXT: scratch_load_dword v1, off, s0 glc +; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 0 +; UNALIGNED_GFX9-NEXT: scratch_load_dword v1, off, s1 glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; UNALIGNED_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX9-NEXT: s_lshl_b32 s0, s0, 7 ; UNALIGNED_GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 0x100, v1 +; UNALIGNED_GFX9-NEXT: s_add_u32 s0, 0x100, s0 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX9-NEXT: scratch_store_dword v1, v2, off +; UNALIGNED_GFX9-NEXT: scratch_store_dword v1, v2, off offset:128 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v0, 0x100, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_endpgm @@ -892,6 +982,7 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 ; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 ; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -899,71 +990,93 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, 0x100, v0 -; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 -; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v2, off +; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v2, off offset:128 ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX10-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX10-NEXT: s_add_u32 s0, 0x100, s0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; UNALIGNED_GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_endpgm ; ; UNALIGNED_GFX942-LABEL: store_load_vindex_small_offset_kernel: ; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX942-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; UNALIGNED_GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; UNALIGNED_GFX942-NEXT: s_waitcnt lgkmcnt(0) 
+; UNALIGNED_GFX942-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX942-NEXT: s_add_u32 s0, 0x100, s0 ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, off offset:256 sc0 sc1 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, off offset:384 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, 0x100, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, s0, v0 ; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_vindex_small_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb +; UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; UNALIGNED_GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; UNALIGNED_GFX11-NEXT: scratch_store_b32 v0, v2, off offset:256 dlc +; UNALIGNED_GFX11-NEXT: scratch_store_b32 v0, v2, off offset:384 dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 0x100, v1 +; UNALIGNED_GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX11-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX11-NEXT: s_add_u32 s0, 0x100, s0 +; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_endpgm ; ; UNALIGNED_GFX12-LABEL: store_load_vindex_small_offset_kernel: ; UNALIGNED_GFX12: ; %bb.0: ; %bb +; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:256 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:384 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:380 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x100, s0 +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 +; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) - %i = alloca [32 x float], align 4, addrspace(5) + %i.alloca = alloca [2 x [32 
x float]], align 4, addrspace(5) + %i = getelementptr inbounds [2 x [32 x float]], ptr addrspace (5) %i.alloca, i32 0, i32 %n + %i.1 = getelementptr inbounds [2 x [32 x float]], ptr addrspace (5) %i.alloca, i32 0, i32 1 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() %i3 = zext i32 %i2 to i64 - %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2 + %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i.1, i32 0, i32 %i2 store volatile i32 15, ptr addrspace(5) %i7, align 4 - %i9 = sub nsw i32 31, %i2 - %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i9 = sub nsw i32 0, %i2 + %i9.1 = getelementptr [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i9.1, i32 0, i32 31 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 ret void } @@ -1368,22 +1481,26 @@ bb: ret void } -define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { +define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX9-LABEL: store_load_vindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: scratch_load_dword v1, off, s1 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s0, 7 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v1 +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: scratch_store_dword v1, v2, off +; GFX9-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 0x4004, v0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1394,6 +1511,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -1401,78 +1519,101 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 -; GFX10-NEXT: scratch_store_dword v0, v2, off +; GFX10-NEXT: scratch_store_dword v0, v2, off offset:128 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s0, s0, 7 +; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX942-LABEL: store_load_vindex_large_offset_kernel: ; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX942-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; 
GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 7 +; GFX942-NEXT: s_add_u32 s0, 0x4004, s0 ; GFX942-NEXT: v_mov_b32_e32 v2, 15 -; GFX942-NEXT: s_movk_i32 s0, 0x4004 +; GFX942-NEXT: s_movk_i32 s1, 0x4004 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX942-NEXT: scratch_store_dword v1, v2, s0 sc0 sc1 +; GFX942-NEXT: scratch_store_dword v1, v2, s1 offset:128 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_add_u32_e32 v0, 0x4004, v0 +; GFX942-NEXT: v_add_u32_e32 v0, s0, v0 ; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s0, 7 +; GFX11-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1 +; GFX11-NEXT: s_movk_i32 s0, 0x4004 +; GFX11-NEXT: scratch_store_b32 v0, v2, s0 offset:128 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_load_vindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v2, 15 ; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16384 scope:SCOPE_SYS +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16508 scope:SCOPE_SYS +; GFX12-NEXT: s_lshl_b32 s0, s0, 7 +; GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 +; GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm ; ; UNALIGNED_GFX9-LABEL: store_load_vindex_large_offset_kernel: ; UNALIGNED_GFX9: ; %bb.0: ; %bb +; UNALIGNED_GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; UNALIGNED_GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 0 -; UNALIGNED_GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc +; 
UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 0 +; UNALIGNED_GFX9-NEXT: scratch_load_dword v1, off, s1 offset:4 glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; UNALIGNED_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX9-NEXT: s_lshl_b32 s0, s0, 7 ; UNALIGNED_GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 ; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v1 +; UNALIGNED_GFX9-NEXT: s_add_u32 s0, 0x4004, s0 ; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX9-NEXT: scratch_store_dword v1, v2, off +; UNALIGNED_GFX9-NEXT: scratch_store_dword v1, v2, off offset:128 ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v0, 0x4004, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v0, s0, v0 ; UNALIGNED_GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_endpgm @@ -1483,6 +1624,7 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; UNALIGNED_GFX10-NEXT: s_addc_u32 s9, s9, 0 ; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 ; UNALIGNED_GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; UNALIGNED_GFX10-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 15 @@ -1490,73 +1632,95 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0 -; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 -; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v2, off +; UNALIGNED_GFX10-NEXT: scratch_store_dword v0, v2, off offset:128 ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX10-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX10-NEXT: s_add_u32 s0, 0x4004, s0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1 ; UNALIGNED_GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_endpgm ; ; UNALIGNED_GFX942-LABEL: store_load_vindex_large_offset_kernel: ; UNALIGNED_GFX942: ; %bb.0: ; %bb +; UNALIGNED_GFX942-NEXT: s_load_dword s0, s[4:5], 0x0 ; UNALIGNED_GFX942-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; UNALIGNED_GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; UNALIGNED_GFX942-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX942-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX942-NEXT: s_add_u32 s0, 0x4004, s0 ; UNALIGNED_GFX942-NEXT: v_mov_b32_e32 v2, 15 -; UNALIGNED_GFX942-NEXT: s_movk_i32 s0, 0x4004 +; UNALIGNED_GFX942-NEXT: s_movk_i32 s1, 0x4004 ; UNALIGNED_GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, s0 sc0 sc1 +; UNALIGNED_GFX942-NEXT: scratch_store_dword v1, v2, s1 offset:128 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, 0x4004, v0 +; UNALIGNED_GFX942-NEXT: v_add_u32_e32 v0, s0, v0 ; UNALIGNED_GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; UNALIGNED_GFX942-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX942-NEXT: s_endpgm ; ; UNALIGNED_GFX11-LABEL: store_load_vindex_large_offset_kernel: ; UNALIGNED_GFX11: ; %bb.0: ; %bb +; 
UNALIGNED_GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x4004 ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) ; UNALIGNED_GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 -; UNALIGNED_GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt lgkmcnt(0) +; UNALIGNED_GFX11-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX11-NEXT: s_add_u32 s0, 0x4004, s0 +; UNALIGNED_GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1 +; UNALIGNED_GFX11-NEXT: s_movk_i32 s0, 0x4004 +; UNALIGNED_GFX11-NEXT: scratch_store_b32 v0, v2, s0 offset:128 dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 0x4004, v1 ; UNALIGNED_GFX11-NEXT: scratch_load_b32 v0, v1, off offset:124 glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_endpgm ; ; UNALIGNED_GFX12-LABEL: store_load_vindex_large_offset_kernel: ; UNALIGNED_GFX12: ; %bb.0: ; %bb +; UNALIGNED_GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0 ; UNALIGNED_GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; UNALIGNED_GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 15 ; UNALIGNED_GFX12-NEXT: v_sub_nc_u32_e32 v1, 0, v0 ; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 -; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16384 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; UNALIGNED_GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b32 v0, v2, off offset:16512 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:16508 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_lshl_b32 s0, s0, 7 +; UNALIGNED_GFX12-NEXT: s_add_co_u32 s0, 0x4000, s0 +; UNALIGNED_GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; UNALIGNED_GFX12-NEXT: v_add_nc_u32_e32 v1, s0, v1 +; UNALIGNED_GFX12-NEXT: scratch_load_b32 v0, v1, off offset:124 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) - %i = alloca [32 x float], align 4, addrspace(5) + %i.alloca = alloca [2 x [32 x float]], align 4, addrspace(5) + %i = getelementptr inbounds [2 x [32 x float]], ptr addrspace (5) %i.alloca, i32 0, i32 %n + %i.1 = getelementptr inbounds [2 x [32 x float]], ptr addrspace (5) %i.alloca, i32 0, i32 1 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() %i3 = zext i32 %i2 to i64 - %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2 + %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i.1, i32 0, i32 %i2 store volatile i32 
15, ptr addrspace(5) %i7, align 4 - %i9 = sub nsw i32 31, %i2 - %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i9 = sub nsw i32 0, %i2 + %i9.1 = getelementptr [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i9.1, i32 0, i32 31 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 ret void } @@ -2206,8 +2370,8 @@ bb: %alloca = alloca [32 x i32], align 4, addrspace(5) %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() %add1 = add nsw i32 %sidx, %vidx - %add2 = add nsw i32 %add1, 256 - %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %add2 + %gep1 = getelementptr [32 x float], ptr addrspace(5) %alloca, i32 0, i32 %add1 + %gep = getelementptr inbounds [32 x float], ptr addrspace(5) %gep1, i32 0, i32 256 store volatile i32 15, ptr addrspace(5) %gep, align 4 %load = load volatile i32, ptr addrspace(5) %gep, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll index d8fb2641c819..52ccfe8ba3bf 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -238,8 +238,8 @@ main_body: %25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24, !amdgpu.uniform !0 %26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0 %27 = shl i32 %23, 2 - %28 = or disjoint i32 %27, 3 - %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28, !amdgpu.uniform !0 + %28 = getelementptr [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %27, !amdgpu.uniform !0 + %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %28, i32 0, i32 3, !amdgpu.uniform !0 %30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0 %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8 %32 = extractelement <4 x float> %31, i32 0 @@ -270,8 +270,8 @@ main_body: %25 = getelementptr inbounds [0 x <8 x i32>], ptr addrspace(6) %1, i32 0, i32 %24 %26 = load <8 x i32>, ptr addrspace(6) %25, align 32, !invariant.load !0 %27 = shl i32 %23, 2 - %28 = or disjoint i32 %27, 3 - %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %28 + %28 = getelementptr [0 x <4 x i32>], ptr addrspace(6) %1, i32 0, i32 %27 + %29 = getelementptr inbounds [0 x <4 x i32>], ptr addrspace(6) %28, i32 0, i32 3 %30 = load <4 x i32>, ptr addrspace(6) %29, align 16, !invariant.load !0 %31 = call nsz <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> %26, <4 x i32> %30, i1 0, i32 0, i32 0) #8 %32 = extractelement <4 x float> %31, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index dd423b5ce5a7..b25d9b245f5f 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -674,17 +674,20 @@ bb: ret void } -define amdgpu_kernel void @store_load_vindex_kernel() { +define amdgpu_kernel void @store_load_vindex_kernel(i32 %n) { ; GFX9-LABEL: store_load_vindex_kernel: ; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 15 -; GFX9-NEXT: scratch_store_dword v1, v2, 
off +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s0, 7 +; GFX9-NEXT: v_add_u32_e32 v2, s0, v0 +; GFX9-NEXT: scratch_store_dword v2, v1, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -695,10 +698,13 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s0, s0, 7 +; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: scratch_store_dword v1, v2, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc @@ -707,23 +713,33 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0, v0 -; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s0, 7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v0 +; GFX11-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11-NEXT: scratch_store_b32 v1, v2, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc +; GFX11-NEXT: scratch_load_b32 v0, v0, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_load_vindex_kernel: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b32 s0, s0, 7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_sub_nc_u32_e32 v2, s0, v0 +; GFX12-NEXT: scratch_store_b32 v0, v1, s0 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -735,27 +751,33 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX9-PAL-NEXT: s_mov_b32 s12, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15 ; 
GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX9-PAL-NEXT: v_add_u32_e32 v2, s0, v0 +; GFX9-PAL-NEXT: scratch_store_dword v2, v1, off ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX942-LABEL: store_load_vindex_kernel: ; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 -; GFX942-NEXT: scratch_store_dword v0, v1, off sc0 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 7 +; GFX942-NEXT: v_add_u32_e32 v2, s0, v0 +; GFX942-NEXT: scratch_store_dword v2, v1, off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_sub_u32_e32 v0, 0, v0 +; GFX942-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_endpgm @@ -771,10 +793,13 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; GFX10-PAL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX10-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, 0, v0 +; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, s0, v0 +; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX10-PAL-NEXT: scratch_store_dword v1, v2, off ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc @@ -783,35 +808,47 @@ define amdgpu_kernel void @store_load_vindex_kernel() { ; ; GFX11-PAL-LABEL: store_load_vindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc +; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-PAL-NEXT: v_add_nc_u32_e32 v1, s0, v0 +; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX11-PAL-NEXT: scratch_store_b32 v1, v2, off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc +; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm ; ; GFX12-PAL-LABEL: store_load_vindex_kernel: ; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0, v0 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 +; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, s0, v0 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, s0 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: - %i = alloca [32 x float], align 4, addrspace(5) + %i.alloca = alloca [2 x [32 x float]], align 4, addrspace(5) + %i = getelementptr inbounds [2 x [32 x float]], ptr addrspace (5) %i.alloca, i32 0, i32 %n %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() %i3 = zext i32 %i2 to i64 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2 store volatile i32 15, ptr addrspace(5) %i7, align 4 - %i9 = sub nsw i32 31, %i2 - %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i9 = sub nsw i32 0, %i2 + %i9.1 = getelementptr [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i9.1, i32 0, i32 31 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 ret void } @@ -1874,20 +1911,24 @@ bb: ret void } -define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { +define amdgpu_kernel void @store_load_vindex_small_offset_kernel(i32 %n) { ; GFX9-LABEL: store_load_vindex_small_offset_kernel: ; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 glc +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: scratch_load_dword v1, off, s1 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s0, 7 ; GFX9-NEXT: v_add_u32_e32 v1, 0x100, v0 +; GFX9-NEXT: s_addk_i32 s0, 0x100 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 -; GFX9-NEXT: scratch_store_dword v1, v2, off +; GFX9-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 0x100, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1898,42 +1939,54 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX10-NEXT: s_addc_u32 s9, s9, 0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x100, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0 -; GFX10-NEXT: scratch_store_dword v1, v2, off +; GFX10-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s0, s0, 7 +; GFX10-NEXT: s_addk_i32 s0, 0x100 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: 
store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 -; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc +; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s0, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s0, 0x100 +; GFX11-NEXT: v_sub_nc_u32_e32 v2, s0, v0 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: store_load_vindex_small_offset_kernel: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:384 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_lshl_b32 s0, s0, 7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_addk_co_i32 s0, 0x100 +; GFX12-NEXT: v_sub_nc_u32_e32 v2, s0, v0 ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm @@ -1943,33 +1996,40 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] ; GFX9-PAL-NEXT: s_mov_b32 s12, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, 0 +; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 glc +; GFX9-PAL-NEXT: scratch_load_dword v1, off, s1 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x100, v0 -; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off +; GFX9-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x100, v0 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX942-LABEL: store_load_vindex_small_offset_kernel: ; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-NEXT: scratch_load_dword v1, off, off sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 -; GFX942-NEXT: scratch_store_dword v0, v1, off offset:256 sc0 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX942-NEXT: s_lshl_b32 s0, s0, 7 +; GFX942-NEXT: s_addk_i32 s0, 0x100 +; GFX942-NEXT: scratch_store_dword v0, v1, off offset:384 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_sub_u32_e32 v0, 0x100, v0 +; GFX942-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_endpgm @@ -1985,14 +2045,18 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1010-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x100, v0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x100, v0 -; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0 -; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off +; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -2009,56 +2073,71 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { ; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1030-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x100, v0 -; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x100, v0 -; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off +; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb +; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 dlc +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-PAL-NEXT: s_addk_i32 s0, 0x100 +; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, s0, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; 
GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_endpgm ; ; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x100, v0 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:384 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x100 +; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, s0, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [64 x i32], align 4, addrspace(5) - %i = alloca [32 x float], align 4, addrspace(5) + %i.alloca = alloca [2 x [32 x float]], align 4, addrspace(5) + %i = getelementptr inbounds [2 x [32 x float]], ptr addrspace (5) %i.alloca, i32 0, i32 %n + %i.1 = getelementptr inbounds [2 x [32 x float]], ptr addrspace (5) %i.alloca, i32 0, i32 1 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 0 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() %i3 = zext i32 %i2 to i64 - %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2 + %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i.1, i32 0, i32 %i2 store volatile i32 15, ptr addrspace(5) %i7, align 4 - %i9 = sub nsw i32 31, %i2 - %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i9 = sub nsw i32 0, %i2 + %i9.1 = getelementptr [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i9.1, i32 0, i32 31 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 ret void } @@ -3128,20 +3207,24 @@ bb: ret void } -define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { +define amdgpu_kernel void @store_load_vindex_large_offset_kernel(i32 %n) { ; GFX9-LABEL: store_load_vindex_large_offset_kernel: ; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s8, s13 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s9, 0 -; GFX9-NEXT: s_mov_b32 s0, 0 -; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc +; GFX9-NEXT: s_mov_b32 s1, 0 +; GFX9-NEXT: scratch_load_dword v1, off, s1 offset:4 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s0, s0, 7 ; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v0 +; GFX9-NEXT: s_addk_i32 s0, 0x4004 ; GFX9-NEXT: v_mov_b32_e32 v2, 15 -; GFX9-NEXT: scratch_store_dword v1, v2, off +; GFX9-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -3152,28 +3235,37 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX10-NEXT: s_addc_u32 s9, s9, 
0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9 +; GFX10-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, 15 ; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 -; GFX10-NEXT: scratch_store_dword v1, v2, off +; GFX10-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s0, s0, 7 +; GFX10-NEXT: s_addk_i32 s0, 0x4004 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb +; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 -; GFX11-NEXT: scratch_store_b32 v0, v1, s0 dlc +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s0, s0, 7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_addk_i32 s0, 0x4004 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_sub_nc_u32_e32 v2, s0, v0 +; GFX11-NEXT: s_movk_i32 s0, 0x4004 +; GFX11-NEXT: scratch_store_b32 v0, v1, s0 offset:128 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3181,14 +3273,18 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX12-LABEL: store_load_vindex_large_offset_kernel: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x24 ; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0 -; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: scratch_store_b32 v0, v1, off offset:16512 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_lshl_b32 s0, s0, 7 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_addk_co_i32 s0, 0x4000 +; GFX12-NEXT: v_sub_nc_u32_e32 v2, s0, v0 ; GFX12-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_endpgm @@ -3198,34 +3294,41 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX9-PAL-NEXT: s_getpc_b64 s[12:13] ; GFX9-PAL-NEXT: s_mov_b32 s12, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[12:13], s[12:13], 0x0 -; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: s_mov_b32 s1, 0 +; GFX9-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s13, s13, 0xffff ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s12, s11 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-PAL-NEXT: 
scratch_load_dword v1, off, s0 offset:4 glc +; GFX9-PAL-NEXT: scratch_load_dword v1, off, s1 offset:4 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 7 ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0 -; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off +; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004 +; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0 +; GFX9-PAL-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: s_endpgm ; ; GFX942-LABEL: store_load_vindex_large_offset_kernel: ; GFX942: ; %bb.0: ; %bb +; GFX942-NEXT: s_load_dword s0, s[4:5], 0x24 ; GFX942-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX942-NEXT: v_and_b32_e32 v0, 0xffc, v0 ; GFX942-NEXT: v_mov_b32_e32 v1, 15 -; GFX942-NEXT: s_movk_i32 s0, 0x4004 -; GFX942-NEXT: scratch_store_dword v0, v1, s0 sc0 sc1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 7 +; GFX942-NEXT: s_addk_i32 s0, 0x4004 +; GFX942-NEXT: s_movk_i32 s1, 0x4004 +; GFX942-NEXT: scratch_store_dword v0, v1, s1 offset:128 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) -; GFX942-NEXT: v_sub_u32_e32 v0, 0x4004, v0 +; GFX942-NEXT: v_sub_u32_e32 v0, s0, v0 ; GFX942-NEXT: scratch_load_dword v0, v0, off offset:124 sc0 sc1 ; GFX942-NEXT: s_waitcnt vmcnt(0) ; GFX942-NEXT: s_endpgm @@ -3241,14 +3344,18 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX1010-PAL-NEXT: s_addc_u32 s13, s13, 0 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1010-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15 +; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004 +; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0 ; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 offset:4 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 -; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 -; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off +; GFX1010-PAL-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3265,28 +3372,37 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; GFX1030-PAL-NEXT: s_addc_u32 s13, s13, 0 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; GFX1030-PAL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x4004, v0 -; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, 0x4004, v0 -; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off +; GFX1030-PAL-NEXT: scratch_store_dword v1, v2, off offset:128 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; 
GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004 +; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v0, s0, v0 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v0, off offset:124 glc dlc ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb +; GFX11-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 -; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4004, v0 -; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, s0 dlc +; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, s0, v0 +; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004 +; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, s0 offset:128 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3294,28 +3410,35 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { ; ; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX12-PAL: ; %bb.0: ; %bb +; GFX12-PAL-NEXT: s_load_b32 s0, s[4:5], 0x0 ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v3, off, off scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: v_and_b32_e32 v0, 0xffc, v0 -; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x4000, v0 -; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS +; GFX12-PAL-NEXT: s_wait_kmcnt 0x0 +; GFX12-PAL-NEXT: scratch_store_b32 v0, v1, off offset:16512 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 +; GFX12-PAL-NEXT: s_lshl_b32 s0, s0, 7 +; GFX12-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-PAL-NEXT: s_addk_co_i32 s0, 0x4000 +; GFX12-PAL-NEXT: v_sub_nc_u32_e32 v2, s0, v0 ; GFX12-PAL-NEXT: scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 ; GFX12-PAL-NEXT: s_endpgm bb: %padding = alloca [4096 x i32], align 4, addrspace(5) - %i = alloca [32 x float], align 4, addrspace(5) + %i.alloca = alloca [2 x [32 x float]], align 4, addrspace(5) + %i = getelementptr inbounds [2 x [32 x float]], ptr addrspace (5) %i.alloca, i32 0, i32 %n + %i.1 = getelementptr inbounds [2 x [32 x float]], ptr addrspace (5) %i.alloca, i32 0, i32 1 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 0 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() %i3 = zext i32 %i2 to i64 - %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2 + %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i.1, i32 0, i32 %i2 store volatile i32 15, ptr addrspace(5) %i7, align 4 - %i9 = sub nsw i32 31, %i2 - %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i9 = sub nsw i32 0, %i2 + %i9.1 = getelementptr [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9 + %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i9.1, i32 
0, i32 31 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4 ret void } @@ -3947,8 +4070,8 @@ bb: %alloca = alloca [32 x i32], align 4, addrspace(5) %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() %add1 = add nsw i32 %sidx, %vidx - %add2 = add nsw i32 %add1, 256 - %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %add2 + %gep1 = getelementptr [32 x float], ptr addrspace(5) %alloca, i32 0, i32 %add1 + %gep = getelementptr inbounds [32 x float], ptr addrspace(5) %gep1, i32 0, i32 256 store volatile i32 15, ptr addrspace(5) %gep, align 4 %load = load volatile i32, ptr addrspace(5) %gep, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll new file mode 100644 index 000000000000..88cc4b1c96b4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold-gep-offset.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py + +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-MUBUF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX90A,GFX90A-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-MUBUF %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10,GFX10-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GFX942 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12 %s + +; This test checks memory addresses with constant offset components that should +; not be folded into memory accesses with immediate offsets. +; SeparateConstOffsetsFromGEP transforms the GEPs in a way that can lead to +; out-of-bounds or negative intermediate results in the address computation, +; which are problematic for flat and scratch instructions: +; gep[inbounds](p, i + 3) -> gep(gep(p, i), 3) + + +; FIXME the offset here should not be folded: if %p points to the beginning of +; scratch or LDS and %i is -1, a folded offset crashes the program. 
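+; Sketch of the problematic arithmetic (hypothetical value %i = -1): the source
+; GEP addresses %p + 4*(%i + 3) = %p + 8, which is in bounds, but the split
+; base GEP computes %p + 4*%i = %p - 4, an address before the start of the
+; object; with the remaining +12 folded into the instruction's immediate
+; offset, the address register holds that out-of-bounds intermediate value.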
+define i32 @flat_offset_maybe_oob(ptr %p, i32 %i) { +; GFX90A-LABEL: flat_offset_maybe_oob: +; GFX90A: ; %bb.0: +; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX90A-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX90A-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: flat_offset_maybe_oob: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX10-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: flat_offset_maybe_oob: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 2, v[0:1] +; GFX942-NEXT: flat_load_dword v0, v[0:1] offset:12 +; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: flat_offset_maybe_oob: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 2, v[2:3] +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX11-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: flat_offset_maybe_oob: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; GFX12-NEXT: flat_load_b32 v0, v[0:1] offset:12 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_setpc_b64 s[30:31] + %idx = add nsw i32 %i, 3 + %arrayidx = getelementptr inbounds i32, ptr %p, i32 %idx + %l = load i32, ptr %arrayidx + ret i32 %l +} + +; For MUBUF and for GFX12, folding the offset is okay. 
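+; The checks below reflect this: the MUBUF runs and GFX12 keep the +12 folded
+; as an immediate offset on the load, while the GFX90A/GFX10 flat-scratch
+; runs, GFX942 and GFX11 first add the 12 into the address register
+; (v_add3_u32) and load without an immediate offset.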
+define i32 @private_offset_maybe_oob(ptr addrspace(5) %p, i32 %i) { +; GFX90A-MUBUF-LABEL: private_offset_maybe_oob: +; GFX90A-MUBUF: ; %bb.0: +; GFX90A-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0 +; GFX90A-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 +; GFX90A-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX90A-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX90A-FLATSCR-LABEL: private_offset_maybe_oob: +; GFX90A-FLATSCR: ; %bb.0: +; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX90A-FLATSCR-NEXT: v_add3_u32 v0, v0, v1, 12 +; GFX90A-FLATSCR-NEXT: scratch_load_dword v0, v0, off +; GFX90A-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX90A-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-MUBUF-LABEL: private_offset_maybe_oob: +; GFX10-MUBUF: ; %bb.0: +; GFX10-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-MUBUF-NEXT: v_lshl_add_u32 v0, v1, 2, v0 +; GFX10-MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:12 +; GFX10-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX10-MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-FLATSCR-LABEL: private_offset_maybe_oob: +; GFX10-FLATSCR: ; %bb.0: +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX10-FLATSCR-NEXT: v_add3_u32 v0, v0, v1, 12 +; GFX10-FLATSCR-NEXT: scratch_load_dword v0, v0, off +; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX10-FLATSCR-NEXT: s_setpc_b64 s[30:31] +; +; GFX942-LABEL: private_offset_maybe_oob: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX942-NEXT: v_add3_u32 v0, v0, v1, 12 +; GFX942-NEXT: scratch_load_dword v0, v0, off +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: private_offset_maybe_oob: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v0, v0, v1, 12 +; GFX11-NEXT: scratch_load_b32 v0, v0, off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: private_offset_maybe_oob: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v1, 2, v0 +; GFX12-NEXT: scratch_load_b32 v0, v0, off offset:12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %idx = add nsw i32 %i, 3 + %arrayidx = getelementptr inbounds i32, ptr addrspace(5) %p, i32 %idx + %l = load i32, ptr addrspace(5) %arrayidx + ret i32 %l +} diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index b7a6aa52e5de..4e1d2a754fa6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -225,22 +225,19 @@ define void @mubuf_clause(ptr addrspace(5) noalias nocapture readonly %arg, ptr ; GCN-SCRATCH-NEXT: s_setpc_b64 s[30:31] bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() - %tmp2 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp - %tmp3 = load <4 x i32>, ptr addrspace(5) %tmp2, align 16 - %tmp4 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp - %tmp5 = add nuw nsw i32 %tmp, 1 - %tmp6 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp5 + %base = 
getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp + %tmp3 = load <4 x i32>, ptr addrspace(5) %base, align 16 + %base1 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp + %tmp6 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base, i32 1 %tmp7 = load <4 x i32>, ptr addrspace(5) %tmp6, align 16 - %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp5 - %tmp9 = add nuw nsw i32 %tmp, 2 - %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp9 + %tmp8 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base1, i32 1 + %tmp10 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base, i32 2 %tmp11 = load <4 x i32>, ptr addrspace(5) %tmp10, align 16 - %tmp12 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp9 - %tmp13 = add nuw nsw i32 %tmp, 3 - %tmp14 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg, i32 %tmp13 + %tmp12 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base1, i32 2 + %tmp14 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base, i32 3 %tmp15 = load <4 x i32>, ptr addrspace(5) %tmp14, align 16 - %tmp16 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %arg1, i32 %tmp13 - store <4 x i32> %tmp3, ptr addrspace(5) %tmp4, align 16 + %tmp16 = getelementptr inbounds <4 x i32>, ptr addrspace(5) %base1, i32 3 + store <4 x i32> %tmp3, ptr addrspace(5) %base1, align 16 store <4 x i32> %tmp7, ptr addrspace(5) %tmp8, align 16 store <4 x i32> %tmp11, ptr addrspace(5) %tmp12, align 16 store <4 x i32> %tmp15, ptr addrspace(5) %tmp16, align 16 diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll new file mode 100644 index 000000000000..422e5d821550 --- /dev/null +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/preserve-inbounds.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -passes=separate-const-offset-from-gep -S | FileCheck %s + +; The inbounds flags cannot be preserved here: If the pointers point to the +; beginning of an object and %i is 1, the intermediate GEPs are out of bounds. 
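+; Sketch with a hypothetical %i = 1: the original index is 1 - %i = 0, so the
+; GEP addresses %p itself, but the split base GEP uses index 0 - %i = -1 and
+; computes %p - 4, which is out of bounds; the rewritten GEPs therefore cannot
+; carry the inbounds flag.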
+define ptr @maybe_oob(ptr %p, i64 %i) { +; CHECK-LABEL: @maybe_oob( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[IDX1:%.*]] = sub i64 0, [[I:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[IDX1]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4 +; CHECK-NEXT: ret ptr [[ARRAYIDX2]] +; +entry: + %idx = sub nsw i64 1, %i + %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idx + ret ptr %arrayidx +} diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll index 2ee36d712a94..89a82ba35d08 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll @@ -11,9 +11,9 @@ define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, ptr addrspace(1) nocaptu ; IR-NEXT: [[TMP:%.*]] = sext i32 [[Y]] to i64 ; IR-NEXT: [[TMP1:%.*]] = sext i32 [[X]] to i64 ; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [32 x float]], ptr addrspace(4) @array, i64 0, i64 [[TMP1]], i64 [[TMP]] -; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 4 -; IR-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 128 -; IR-NEXT: [[TMP187:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 132 +; IR-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 4 +; IR-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 128 +; IR-NEXT: [[TMP187:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 132 ; IR-NEXT: store float 0.000000e+00, ptr addrspace(1) [[OUTPUT]], align 4 ; IR-NEXT: ret void ; @@ -51,7 +51,7 @@ define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, pt ; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP1]], i64 [[TMP]] ; IR-NEXT: [[TMP6:%.*]] = add i32 [[Y]], 255 ; IR-NEXT: [[TMP7:%.*]] = sext i32 [[TMP6]] to i64 -; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP2]], i64 1020 +; IR-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i64 1020 ; IR-NEXT: [[TMP12:%.*]] = add i32 [[X]], 256 ; IR-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 ; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4096 x [4 x float]], ptr addrspace(4) @array2, i64 0, i64 [[TMP13]], i64 [[TMP]] @@ -91,13 +91,13 @@ define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y ; IR-NEXT: [[TMP2:%.*]] = getelementptr [4096 x [4 x float]], ptr addrspace(3) @lds_array, i32 0, i32 [[X]], i32 [[Y]] ; IR-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(3) [[TMP2]], align 4 ; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00 -; IR-NEXT: [[TMP82:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i32 1020 +; IR-NEXT: [[TMP82:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 1020 ; IR-NEXT: [[TMP10:%.*]] = load float, ptr addrspace(3) [[TMP82]], align 4 ; IR-NEXT: [[TMP11:%.*]] = fadd float [[TMP5]], [[TMP10]] -; IR-NEXT: [[TMP144:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i32 64512 +; IR-NEXT: [[TMP144:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 64512 ; IR-NEXT: [[TMP16:%.*]] = load float, ptr addrspace(3) [[TMP144]], align 4 ; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP11]], [[TMP16]] -; IR-NEXT: 
[[TMP187:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i32 65532 +; IR-NEXT: [[TMP187:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 65532 ; IR-NEXT: [[TMP20:%.*]] = load float, ptr addrspace(3) [[TMP187]], align 4 ; IR-NEXT: [[TMP21:%.*]] = fadd float [[TMP17]], [[TMP20]] ; IR-NEXT: store float [[TMP21]], ptr addrspace(1) [[OUTPUT]], align 4 diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll index 79940041ed5f..cea1d112628a 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn.ll @@ -26,15 +26,15 @@ define void @sum_of_array(i32 %x, i32 %y, ptr nocapture %output) { ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I87:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 4 +; IR-NEXT: [[I87:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 4 ; IR-NEXT: [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I87]] to ptr ; IR-NEXT: [[I10:%.*]] = load float, ptr [[I9]], align 4 ; IR-NEXT: [[I11:%.*]] = fadd float [[I5]], [[I10]] -; IR-NEXT: [[I1412:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 128 +; IR-NEXT: [[I1412:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 128 ; IR-NEXT: [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I1412]] to ptr ; IR-NEXT: [[I16:%.*]] = load float, ptr [[I15]], align 4 ; IR-NEXT: [[I17:%.*]] = fadd float [[I11]], [[I16]] -; IR-NEXT: [[I1818:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 132 +; IR-NEXT: [[I1818:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 132 ; IR-NEXT: [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I1818]] to ptr ; IR-NEXT: [[I20:%.*]] = load float, ptr [[I19]], align 4 ; IR-NEXT: [[I21:%.*]] = fadd float [[I17]], [[I20]] @@ -88,15 +88,15 @@ define void @sum_of_array2(i32 %x, i32 %y, ptr nocapture %output) { ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I77:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 4 +; IR-NEXT: [[I77:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 4 ; IR-NEXT: [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I77]] to ptr ; IR-NEXT: [[I9:%.*]] = load float, ptr [[I8]], align 4 ; IR-NEXT: [[I10:%.*]] = fadd float [[I5]], [[I9]] -; IR-NEXT: [[I1212:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 128 +; IR-NEXT: [[I1212:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 128 ; IR-NEXT: [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I1212]] to ptr ; IR-NEXT: [[I14:%.*]] = load float, ptr [[I13]], align 4 ; IR-NEXT: [[I15:%.*]] = fadd float [[I10]], [[I14]] -; IR-NEXT: [[I1618:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 132 +; IR-NEXT: [[I1618:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 132 ; IR-NEXT: [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I1618]] to ptr ; IR-NEXT: [[I18:%.*]] = load float, ptr [[I17]], align 4 ; IR-NEXT: [[I19:%.*]] = fadd float [[I15]], [[I18]] @@ -149,15 +149,15 @@ define void @sum_of_array3(i32 %x, i32 %y, ptr nocapture %output) { ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, 
ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I87:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 4 +; IR-NEXT: [[I87:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 4 ; IR-NEXT: [[I9:%.*]] = addrspacecast ptr addrspace(3) [[I87]] to ptr ; IR-NEXT: [[I10:%.*]] = load float, ptr [[I9]], align 4 ; IR-NEXT: [[I11:%.*]] = fadd float [[I5]], [[I10]] -; IR-NEXT: [[I1412:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 128 +; IR-NEXT: [[I1412:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 128 ; IR-NEXT: [[I15:%.*]] = addrspacecast ptr addrspace(3) [[I1412]] to ptr ; IR-NEXT: [[I16:%.*]] = load float, ptr [[I15]], align 4 ; IR-NEXT: [[I17:%.*]] = fadd float [[I11]], [[I16]] -; IR-NEXT: [[I1818:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 132 +; IR-NEXT: [[I1818:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 132 ; IR-NEXT: [[I19:%.*]] = addrspacecast ptr addrspace(3) [[I1818]] to ptr ; IR-NEXT: [[I20:%.*]] = load float, ptr [[I19]], align 4 ; IR-NEXT: [[I21:%.*]] = fadd float [[I17]], [[I20]] @@ -209,15 +209,15 @@ define void @sum_of_array4(i32 %x, i32 %y, ptr nocapture %output) { ; IR-NEXT: [[I3:%.*]] = addrspacecast ptr addrspace(3) [[I2]] to ptr ; IR-NEXT: [[I4:%.*]] = load float, ptr [[I3]], align 4 ; IR-NEXT: [[I5:%.*]] = fadd float [[I4]], 0.000000e+00 -; IR-NEXT: [[I77:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 4 +; IR-NEXT: [[I77:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 4 ; IR-NEXT: [[I8:%.*]] = addrspacecast ptr addrspace(3) [[I77]] to ptr ; IR-NEXT: [[I9:%.*]] = load float, ptr [[I8]], align 4 ; IR-NEXT: [[I10:%.*]] = fadd float [[I5]], [[I9]] -; IR-NEXT: [[I1212:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 128 +; IR-NEXT: [[I1212:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 128 ; IR-NEXT: [[I13:%.*]] = addrspacecast ptr addrspace(3) [[I1212]] to ptr ; IR-NEXT: [[I14:%.*]] = load float, ptr [[I13]], align 4 ; IR-NEXT: [[I15:%.*]] = fadd float [[I10]], [[I14]] -; IR-NEXT: [[I1618:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[I2]], i32 132 +; IR-NEXT: [[I1618:%.*]] = getelementptr i8, ptr addrspace(3) [[I2]], i32 132 ; IR-NEXT: [[I17:%.*]] = addrspacecast ptr addrspace(3) [[I1618]] to ptr ; IR-NEXT: [[I18:%.*]] = load float, ptr [[I17]], align 4 ; IR-NEXT: [[I19:%.*]] = fadd float [[I15]], [[I18]] @@ -270,7 +270,7 @@ define void @reunion(i32 %x, i32 %y, ptr %input) { ; IR-NEXT: [[P0:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[I]] ; IR-NEXT: [[V0:%.*]] = load float, ptr [[P0]], align 4 ; IR-NEXT: call void @use(float [[V0]]) -; IR-NEXT: [[P13:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 20 +; IR-NEXT: [[P13:%.*]] = getelementptr i8, ptr [[P0]], i64 20 ; IR-NEXT: [[V1:%.*]] = load float, ptr [[P13]], align 4 ; IR-NEXT: call void @use(float [[V1]]) ; IR-NEXT: ret void diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll index 1b220ceac351..a80823c138c1 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep.ll @@ -14,7 +14,7 @@ define ptr addrspace(3) @packed_struct(i32 %i, i32 %j) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDXPROM:%.*]] = trunc i64 0 to i32 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [1024 x %struct.Packed], ptr addrspace(3) @packed_struct_array, i32 [[IDXPROM]], i32 [[I:%.*]], i32 1, i32 
[[J:%.*]] -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP0]], i32 100 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 100 ; CHECK-NEXT: ret ptr addrspace(3) [[UGLYGEP]] ; entry: diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll index e03f5cc6ad24..9a73feb2c4b5 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll @@ -28,15 +28,15 @@ define void @sum_of_array(i32 %x, i32 %y, ptr nocapture %output) { ; IR-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) [[TMP2]] to ptr ; IR-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4 ; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00 -; IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 4 +; IR-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 4 ; IR-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) [[TMP6]] to ptr ; IR-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 ; IR-NEXT: [[TMP9:%.*]] = fadd float [[TMP5]], [[TMP8]] -; IR-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 128 +; IR-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 128 ; IR-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(3) [[TMP10]] to ptr ; IR-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4 ; IR-NEXT: [[TMP13:%.*]] = fadd float [[TMP9]], [[TMP12]] -; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 132 +; IR-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 132 ; IR-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(3) [[TMP14]] to ptr ; IR-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4 ; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP13]], [[TMP16]] @@ -94,15 +94,15 @@ define void @sum_of_array2(i32 %x, i32 %y, ptr nocapture %output) { ; IR-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) [[TMP2]] to ptr ; IR-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4 ; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00 -; IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 4 +; IR-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 4 ; IR-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) [[TMP6]] to ptr ; IR-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 ; IR-NEXT: [[TMP9:%.*]] = fadd float [[TMP5]], [[TMP8]] -; IR-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 128 +; IR-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 128 ; IR-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(3) [[TMP10]] to ptr ; IR-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4 ; IR-NEXT: [[TMP13:%.*]] = fadd float [[TMP9]], [[TMP12]] -; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 132 +; IR-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 132 ; IR-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(3) [[TMP14]] to ptr ; IR-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4 ; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP13]], [[TMP16]] @@ -161,15 +161,15 @@ define void @sum_of_array3(i32 %x, i32 %y, ptr nocapture %output) { ; IR-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) [[TMP2]] to ptr ; IR-NEXT: [[TMP4:%.*]] = load float, ptr 
[[TMP3]], align 4 ; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00 -; IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 4 +; IR-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 4 ; IR-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) [[TMP6]] to ptr ; IR-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 ; IR-NEXT: [[TMP9:%.*]] = fadd float [[TMP5]], [[TMP8]] -; IR-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 128 +; IR-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 128 ; IR-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(3) [[TMP10]] to ptr ; IR-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4 ; IR-NEXT: [[TMP13:%.*]] = fadd float [[TMP9]], [[TMP12]] -; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 132 +; IR-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 132 ; IR-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(3) [[TMP14]] to ptr ; IR-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4 ; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP13]], [[TMP16]] @@ -228,15 +228,15 @@ define void @sum_of_array4(i32 %x, i32 %y, ptr nocapture %output) { ; IR-NEXT: [[TMP3:%.*]] = addrspacecast ptr addrspace(3) [[TMP2]] to ptr ; IR-NEXT: [[TMP4:%.*]] = load float, ptr [[TMP3]], align 4 ; IR-NEXT: [[TMP5:%.*]] = fadd float [[TMP4]], 0.000000e+00 -; IR-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 4 +; IR-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 4 ; IR-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(3) [[TMP6]] to ptr ; IR-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4 ; IR-NEXT: [[TMP9:%.*]] = fadd float [[TMP5]], [[TMP8]] -; IR-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 128 +; IR-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 128 ; IR-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(3) [[TMP10]] to ptr ; IR-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4 ; IR-NEXT: [[TMP13:%.*]] = fadd float [[TMP9]], [[TMP12]] -; IR-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i64 132 +; IR-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i64 132 ; IR-NEXT: [[TMP15:%.*]] = addrspacecast ptr addrspace(3) [[TMP14]] to ptr ; IR-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4 ; IR-NEXT: [[TMP17:%.*]] = fadd float [[TMP13]], [[TMP16]] @@ -296,7 +296,7 @@ define void @reunion(i32 %x, i32 %y, ptr %input) { ; IR-NEXT: [[P0:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[TMP0]] ; IR-NEXT: [[V0:%.*]] = load float, ptr [[P0]], align 4 ; IR-NEXT: call void @use(float [[V0]]) -; IR-NEXT: [[P13:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 20 +; IR-NEXT: [[P13:%.*]] = getelementptr i8, ptr [[P0]], i64 20 ; IR-NEXT: [[V1:%.*]] = load float, ptr [[P13]], align 4 ; IR-NEXT: call void @use(float [[V1]]) ; IR-NEXT: ret void diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll index 16e9e5eeb614..77b3434f4f15 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll @@ -19,7 +19,7 @@ define ptr @struct(i32 %i) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [1024 x %struct.S], ptr 
@struct_array, i64 0, i64 [[TMP0]], i32 1 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 80 ; CHECK-NEXT: ret ptr [[P2]] ; entry: @@ -40,7 +40,7 @@ define ptr @sext_add(i32 %i, i32 %j) { ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[TMP2]], i64 [[TMP1]] -; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 128 +; CHECK-NEXT: [[P1:%.*]] = getelementptr i8, ptr [[TMP3]], i64 128 ; CHECK-NEXT: ret ptr [[P1]] ; entry: @@ -68,7 +68,7 @@ define ptr @ext_add_no_overflow(i64 %a, i32 %b, i64 %c, i32 %d) { ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[D]] to i64 ; CHECK-NEXT: [[J4:%.*]] = add i64 [[C]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I2]], i64 [[J4]] -; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 132 +; CHECK-NEXT: [[P5:%.*]] = getelementptr i8, ptr [[TMP3]], i64 132 ; CHECK-NEXT: ret ptr [[P5]] ; %b1 = add nsw i32 %b, 1 @@ -137,7 +137,7 @@ define ptr @sext_or(i64 %a, i32 %b) { ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[B1]] to i64 ; CHECK-NEXT: [[I2:%.*]] = add i64 [[A]], [[TMP0]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I2]], i64 [[J]] -; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 128 +; CHECK-NEXT: [[P3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 128 ; CHECK-NEXT: ret ptr [[P3]] ; entry: @@ -162,7 +162,7 @@ define ptr @expr(i64 %a, i64 %b, ptr %out) { ; CHECK-NEXT: [[B5:%.*]] = add i64 [[B]], 5 ; CHECK-NEXT: [[I2:%.*]] = add i64 [[B]], [[A]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I2]], i64 0 -; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 640 +; CHECK-NEXT: [[P3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 640 ; CHECK-NEXT: store i64 [[B5]], ptr [[OUT]], align 8 ; CHECK-NEXT: ret ptr [[P3]] ; @@ -186,7 +186,7 @@ define ptr @sext_expr(i32 %a, i32 %b, i32 %c, i64 %d) { ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], [[TMP3]] ; CHECK-NEXT: [[I1:%.*]] = add i64 [[D]], [[TMP4]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 0, i64 [[I1]] -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 32 +; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[TMP5]], i64 32 ; CHECK-NEXT: ret ptr [[P2]] ; entry: @@ -205,7 +205,7 @@ define ptr @sub(i64 %i, i64 %j) { ; CHECK-SAME: i64 [[I:%.*]], i64 [[J:%.*]]) { ; CHECK-NEXT: [[J22:%.*]] = sub i64 0, [[J]] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [32 x [32 x float]], ptr @float_2d_array, i64 0, i64 [[I]], i64 [[J22]] -; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 -620 +; CHECK-NEXT: [[P3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 -620 ; CHECK-NEXT: ret ptr [[P3]] ; %i2 = sub i64 %i, 5 ; i - 5 @@ -225,7 +225,7 @@ define ptr @packed_struct(i32 %i, i32 %j) { ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[J]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [1024 x %struct.Packed], ptr [[S]], i64 0, i64 [[TMP0]], i32 1, i64 [[TMP1]] -; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 100 +; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr i8, ptr [[TMP2]], i64 100 ; CHECK-NEXT: ret ptr 
[[ARRAYIDX33]] ; entry: @@ -358,7 +358,7 @@ define ptr @sign_mod_unsign(ptr %ptr, i64 %idx) { ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR22:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 -64 +; CHECK-NEXT: [[PTR22:%.*]] = getelementptr i8, ptr [[TMP0]], i64 -64 ; CHECK-NEXT: ret ptr [[PTR22]] ; entry: @@ -373,7 +373,7 @@ define ptr @trunk_explicit(ptr %ptr, i64 %idx) { ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 3216 +; CHECK-NEXT: [[PTR21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3216 ; CHECK-NEXT: ret ptr [[PTR21]] ; entry: @@ -390,7 +390,7 @@ define ptr @trunk_long_idx(ptr %ptr, i64 %idx) { ; CHECK-SAME: ptr [[PTR:%.*]], i64 [[IDX:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT0:%.*]], ptr [[PTR]], i64 0, i32 3, i64 [[IDX]], i32 1 -; CHECK-NEXT: [[PTR21:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 3216 +; CHECK-NEXT: [[PTR21:%.*]] = getelementptr i8, ptr [[TMP0]], i64 3216 ; CHECK-NEXT: ret ptr [[PTR21]] ; entry: diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll index 3742ea7fb0c2..deaffc88117d 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/RISCV/split-gep.ll @@ -12,11 +12,11 @@ define i64 @test1(ptr %array, i64 %i, i64 %j) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[I:%.*]], 5 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[ARRAY:%.*]], i64 [[I]] -; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 40 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40 ; CHECK-NEXT: store i64 [[J:%.*]], ptr [[GEP4]], align 8 -; CHECK-NEXT: [[GEP26:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 48 +; CHECK-NEXT: [[GEP26:%.*]] = getelementptr i8, ptr [[TMP0]], i64 48 ; CHECK-NEXT: store i64 [[J]], ptr [[GEP26]], align 8 -; CHECK-NEXT: [[GEP38:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 280 +; CHECK-NEXT: [[GEP38:%.*]] = getelementptr i8, ptr [[TMP0]], i64 280 ; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP38]], align 8 ; CHECK-NEXT: ret i64 undef ; @@ -40,11 +40,11 @@ define i32 @test2(ptr %array, i32 %i, i32 %j) { ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I:%.*]], 5 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 20 ; CHECK-NEXT: store i32 [[J:%.*]], ptr [[GEP2]], align 4 -; CHECK-NEXT: [[GEP54:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 +; CHECK-NEXT: [[GEP54:%.*]] = getelementptr i8, ptr [[TMP1]], i64 24 ; CHECK-NEXT: store i32 [[J]], ptr [[GEP54]], align 4 -; CHECK-NEXT: [[GEP86:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 140 +; CHECK-NEXT: [[GEP86:%.*]] = getelementptr i8, ptr [[TMP1]], i64 140 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP86]], align 4 ; CHECK-NEXT: ret i32 undef ; @@ -72,13 +72,13 @@ define i32 @test3(ptr %array, i32 %i) { ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I:%.*]], 5 ; 
CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 20 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP2]], align 4 ; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[I]], 6 -; CHECK-NEXT: [[GEP54:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 +; CHECK-NEXT: [[GEP54:%.*]] = getelementptr i8, ptr [[TMP1]], i64 24 ; CHECK-NEXT: store i32 [[ADD3]], ptr [[GEP54]], align 4 ; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[I]], 35 -; CHECK-NEXT: [[GEP86:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 140 +; CHECK-NEXT: [[GEP86:%.*]] = getelementptr i8, ptr [[TMP1]], i64 140 ; CHECK-NEXT: store i32 [[ADD6]], ptr [[GEP86]], align 4 ; CHECK-NEXT: ret i32 undef ; @@ -105,11 +105,11 @@ define i32 @test4(ptr %array2, i32 %i) { ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I:%.*]], 5 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [50 x i32], ptr [[ARRAY2:%.*]], i64 [[TMP0]], i64 [[TMP0]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 1020 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 1020 ; CHECK-NEXT: store i32 [[I]], ptr [[GEP3]], align 4 -; CHECK-NEXT: [[GEP56:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 1024 +; CHECK-NEXT: [[GEP56:%.*]] = getelementptr i8, ptr [[TMP1]], i64 1024 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP56]], align 4 -; CHECK-NEXT: [[GEP89:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 1140 +; CHECK-NEXT: [[GEP89:%.*]] = getelementptr i8, ptr [[TMP1]], i64 1140 ; CHECK-NEXT: store i32 [[I]], ptr [[GEP89]], align 4 ; CHECK-NEXT: ret i32 undef ; @@ -136,10 +136,10 @@ define i32 @test5(ptr %array2, i32 %i, i64 %j) { ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I:%.*]], 5 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [50 x i32], ptr [[ARRAY2:%.*]], i64 [[TMP0]], i64 [[TMP0]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 1020 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 1020 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP3]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [50 x i32], ptr [[ARRAY2]], i64 [[TMP0]], i64 [[J:%.*]] -; CHECK-NEXT: [[GEP55:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 1200 +; CHECK-NEXT: [[GEP55:%.*]] = getelementptr i8, ptr [[TMP2]], i64 1200 ; CHECK-NEXT: store i32 [[I]], ptr [[GEP55]], align 4 ; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[I]], 35 ; CHECK-NEXT: [[SEXT7:%.*]] = sext i32 [[ADD6]] to i64 @@ -171,7 +171,7 @@ define i64 @test6(ptr %array, i64 %i, i64 %j) { ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[ARRAY:%.*]], i64 [[J:%.*]] ; CHECK-NEXT: store i64 [[ADD]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[ARRAY]], i64 [[I]] -; CHECK-NEXT: [[GEP52:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 48 +; CHECK-NEXT: [[GEP52:%.*]] = getelementptr i8, ptr [[TMP0]], i64 48 ; CHECK-NEXT: store i64 [[I]], ptr [[GEP52]], align 8 ; CHECK-NEXT: store i64 [[I]], ptr [[TMP0]], align 8 ; CHECK-NEXT: ret i64 undef @@ -196,15 +196,15 @@ define i32 @test7(ptr %array, i32 %i, i32 %j, i32 %k) { ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I:%.*]], 5 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[GEP2:%.*]] = 
getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 20
 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP2]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[K:%.*]] to i64
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY]], i64 [[TMP2]]
-; CHECK-NEXT: [[GEP54:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 24
+; CHECK-NEXT: [[GEP54:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24
 ; CHECK-NEXT: store i32 [[I]], ptr [[GEP54]], align 4
 ; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[J:%.*]] to i64
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[ARRAY]], i64 [[TMP4]]
-; CHECK-NEXT: [[GEP86:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 140
+; CHECK-NEXT: [[GEP86:%.*]] = getelementptr i8, ptr [[TMP5]], i64 140
 ; CHECK-NEXT: store i32 [[I]], ptr [[GEP86]], align 4
 ; CHECK-NEXT: ret i32 undef
 ;
@@ -231,13 +231,13 @@ define i32 @test8(ptr %array, ptr %array2, ptr %array3, i32 %i) {
 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I:%.*]], 5
 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[ARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 20
 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP2]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[ARRAY2:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[GEP54:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 24
+; CHECK-NEXT: [[GEP54:%.*]] = getelementptr i8, ptr [[TMP2]], i64 24
 ; CHECK-NEXT: store i32 [[I]], ptr [[GEP54]], align 4
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[ARRAY3:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[GEP86:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 140
+; CHECK-NEXT: [[GEP86:%.*]] = getelementptr i8, ptr [[TMP3]], i64 140
 ; CHECK-NEXT: store i32 [[I]], ptr [[GEP86]], align 4
 ; CHECK-NEXT: ret i32 undef
 ;
@@ -264,12 +264,12 @@ define i32 @test9(ptr %array, i32 %i) {
 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[I:%.*]], 5
 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [50 x i32], ptr [[ARRAY:%.*]], i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 20
 ; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP2]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [50 x i32], ptr [[ARRAY]], i64 [[TMP0]], i64 [[TMP0]]
-; CHECK-NEXT: [[GEP54:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 24
+; CHECK-NEXT: [[GEP54:%.*]] = getelementptr i8, ptr [[TMP2]], i64 24
 ; CHECK-NEXT: store i32 [[I]], ptr [[GEP54]], align 4
-; CHECK-NEXT: [[GEP87:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 1340
+; CHECK-NEXT: [[GEP87:%.*]] = getelementptr i8, ptr [[TMP2]], i64 1340
 ; CHECK-NEXT: store i32 [[I]], ptr [[GEP87]], align 4
 ; CHECK-NEXT: ret i32 undef
 ;
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
index 0af4093c184e..03edfdceab32 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
@@ -10,12 +10,12 @@ define amdgpu_kernel void @slsr_after_reassociate_global_geps_mubuf_max_offset(p
 ; CHECK-NEXT: bb:
 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[I]] to i64
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr float, ptr addrspace(1) [[ARR]], i64 [[TMP0]]
-; CHECK-NEXT: [[P12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1]], i64 4092
+; CHECK-NEXT: [[P12:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP1]], i64 4092
 ; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(1) [[P12]], align 4
 ; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0]], 2
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP1]], i64 [[TMP2]]
-; CHECK-NEXT: [[P24:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP3]], i64 4092
+; CHECK-NEXT: [[P24:%.*]] = getelementptr i8, ptr addrspace(1) [[TMP3]], i64 4092
 ; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(1) [[P24]], align 4
 ; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT: ret void
@@ -76,12 +76,12 @@ define amdgpu_kernel void @slsr_after_reassociate_lds_geps_ds_max_offset(ptr add
 ; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], ptr addrspace(3) noalias [[ARR:%.*]], i32 [[I:%.*]]) {
 ; CHECK-NEXT: bb:
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr float, ptr addrspace(3) [[ARR]], i32 [[I]]
-; CHECK-NEXT: [[P12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP0]], i32 65532
+; CHECK-NEXT: [[P12:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 65532
 ; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(3) [[P12]], align 4
 ; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[I]], 2
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP0]], i32 [[TMP1]]
-; CHECK-NEXT: [[P24:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP2]], i32 65532
+; CHECK-NEXT: [[P24:%.*]] = getelementptr i8, ptr addrspace(3) [[TMP2]], i32 65532
 ; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(3) [[P24]], align 4
 ; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4
 ; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll
index 9fb50f39dc0e..edaeef8c87b6 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll
@@ -32,20 +32,20 @@ define void @slsr_after_reassociate_geps(ptr %arr, i32 %i) {
 ; CHECK-SAME: ptr [[ARR:%.*]], i32 [[I:%.*]]) {
 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[I]] to i64
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARR]], i64 [[TMP1]]
-; CHECK-NEXT: [[P12:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 20
+; CHECK-NEXT: [[P12:%.*]] = getelementptr i8, ptr [[TMP2]], i64 20
 ; CHECK-NEXT: [[V1:%.*]] = load float, ptr [[P12]], align 4
 ; CHECK-NEXT: call void @foo(float [[V1]])
 ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP1]], 2
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP2]], i64 [[TMP3]]
-; CHECK-NEXT: [[P24:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 20
+; CHECK-NEXT: [[P24:%.*]] = getelementptr i8, ptr [[TMP4]], i64 20
 ; CHECK-NEXT: [[V2:%.*]] = load float, ptr [[P24]], align 4
 ; CHECK-NEXT: call void @foo(float [[V2]])
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i64 [[TMP3]]
-; CHECK-NEXT: [[P36:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 20
+; CHECK-NEXT: [[P36:%.*]] = getelementptr i8, ptr [[TMP5]], i64 20
 ; CHECK-NEXT: [[V3:%.*]] = load float, ptr [[P36]], align 4
 ; CHECK-NEXT: call void @foo(float [[V3]])
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i64 [[TMP3]]
-; CHECK-NEXT: [[P48:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 20
+; CHECK-NEXT: [[P48:%.*]] = getelementptr i8, ptr [[TMP6]], i64 20
 ; CHECK-NEXT: [[V4:%.*]] = load float, ptr [[P48]], align 4
 ; CHECK-NEXT: call void @foo(float [[V4]])
 ; CHECK-NEXT: ret void