From 170de19a5b2dc2b9855d73794f139d1ff84bce47 Mon Sep 17 00:00:00 2001 From: LU-JOHN Date: Wed, 21 Jan 2026 12:19:55 -0600 Subject: [PATCH] [AMDGPU] Latency calculation must be independent of meta insts (#177052) Debug and other meta instructions in bundles must not affect latency calculation. Ensure that code compiled with and without debug instructions is identical. --------- Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 4 ++ ...ebug-independence-adjustSchedDependency.ll | 44 +++++++++++++++++++ .../AMDGPU/gfx-callable-argument-types.ll | 22 +++++----- 3 files changed, 59 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/debug-independence-adjustSchedDependency.ll diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index c8bbcbbd7692..0488968a1a2e 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -644,6 +644,8 @@ void GCNSubtarget::adjustSchedDependency( MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); unsigned Lat = 0; for (++I; I != E && I->isBundledWithPred(); ++I) { + if (I->isMetaInstruction()) + continue; if (I->modifiesRegister(Reg, TRI)) Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I); else if (Lat) @@ -657,6 +659,8 @@ void GCNSubtarget::adjustSchedDependency( MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI); for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { + if (I->isMetaInstruction()) + continue; if (I->readsRegister(Reg, TRI)) break; --Lat; diff --git a/llvm/test/CodeGen/AMDGPU/debug-independence-adjustSchedDependency.ll b/llvm/test/CodeGen/AMDGPU/debug-independence-adjustSchedDependency.ll new file mode 100644 index 000000000000..0ea51b2b3e21 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/debug-independence-adjustSchedDependency.ll @@ -0,0 +1,44 @@ +; RUN: opt %s -strip-debug -o %t.no_debug.ll -S +; RUN: llc -mcpu=gfx1250 < %s -filetype=obj -o %t.with_debug.o +; RUN: llc -mcpu=gfx1250 < %t.no_debug.ll -filetype=obj -o %t.no_debug.o +; RUN: llvm-strip %t.with_debug.o %t.no_debug.o +; RUN: cmp %t.with_debug.o %t.no_debug.o +; Ensure that compiling with and without debug generates identical code. +; Test that adjustSchedDependency does not count debug instructions in bundles. + +target triple = "amdgcn-amd-amdhsa" + +define amdgpu_kernel void @_test_adjustSchedDependency(ptr addrspace(1) %AA.coerce, i64 %shiftA, i32 %lda, ptr addrspace(3) %stPtr) !dbg !4 { +entry: + #dbg_value(i32 0, !10, !DIExpression(), !13) + #dbg_value(ptr addrspace(1) %AA.coerce, !14, !DIExpression(), !13) + %add.ptr1.i = getelementptr float, ptr addrspace(1) %AA.coerce, i64 %shiftA + %mul15.13 = mul i32 %lda, 13 + %idxprom.13 = sext i32 %mul15.13 to i64 + %arrayidx.13 = getelementptr float, ptr addrspace(1) %add.ptr1.i, i64 %idxprom.13 + %floatval = load float, ptr addrspace(1) %arrayidx.13, align 4 + %floatpair = insertelement <2 x float> zeroinitializer, float %floatval, i64 0 + store <2 x float> %floatpair, ptr addrspace(3) %stPtr, align 4 + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "AMD clang version 22.0.0git (ssh://github-emu/AMD-Lightning-Internal/llvm-project 25425 c51a87b7a53a3e8f308402aaffa3ecbc2953305a)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, imports: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.cpp", directory: "/tmp", checksumkind: CSK_MD5, checksum: "cc205700bf3536fe4ff21a07daf7e01d") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = distinct !DISubprogram(name: "test_adjustSchedDependency", linkageName: "_test_adjustSchedDependency", scope: !6, file: !5, line: 142, type: !8, scopeLine: 150, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, templateParams: !2, retainedNodes: !2) +!5 = !DIFile(filename: "kernels.hpp", directory: "/tmp") +!6 = !DINamespace(name: "v33200", scope: !7, exportSymbols: true) +!7 = !DINamespace(name: "solve", scope: null) +!8 = distinct !DISubroutineType(types: !9) +!9 = !{null} +!10 = !DILocalVariable(name: "m", arg: 1, scope: !4, file: !5, line: 142, type: !11) +!11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12) +!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!13 = !DILocation(line: 0, scope: !4) +!14 = !DILocalVariable(name: "AA", arg: 2, scope: !4, file: !5, line: 143, type: !15) +!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64) +!16 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 2bcc5df70d94..5c26ff6e916c 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -15960,24 +15960,24 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_writelane_b32 v40, s22, 18 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s52 -; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_writelane_b32 v40, s23, 19 -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 -; GFX10-NEXT: v_mov_b32_e32 v0, s46 +; GFX10-NEXT: v_mov_b32_e32 v1, s47 ; GFX10-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX10-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-NEXT: s_mov_b32 s20, s36 ; GFX10-NEXT: s_mov_b32 s21, s37 ; GFX10-NEXT: s_mov_b32 s22, s38 -; GFX10-NEXT: s_mov_b32 s23, s39 ; GFX10-NEXT: v_writelane_b32 v40, s25, 21 +; GFX10-NEXT: s_mov_b32 s23, s39 ; GFX10-NEXT: s_mov_b32 s24, s40 ; GFX10-NEXT: s_mov_b32 s25, s41 ; GFX10-NEXT: v_mov_b32_e32 v4, s50 -; GFX10-NEXT: v_mov_b32_e32 v5, s51 ; GFX10-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-NEXT: s_mov_b32 s26, s42 +; GFX10-NEXT: v_mov_b32_e32 v5, s51 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 @@ -16180,27 +16180,27 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s22, 18 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, s2 -; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s23, 19 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, s50 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, s51 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s46 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, s47 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s24, 20 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, s48 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, s49 +; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 24 ; GFX10-SCRATCH-NEXT: s_mov_b32 s20, s36 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 ; GFX10-SCRATCH-NEXT: s_mov_b32 s21, s37 ; GFX10-SCRATCH-NEXT: s_mov_b32 s22, s38 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s25, 21 ; GFX10-SCRATCH-NEXT: s_mov_b32 s23, s39 ; GFX10-SCRATCH-NEXT: s_mov_b32 s24, s40 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 ; GFX10-SCRATCH-NEXT: s_mov_b32 s25, s41 +; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v6, s2 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[4:5], s3 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s26, 22 -; GFX10-SCRATCH-NEXT: s_mov_b32 s26, s42 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s27, 23 ; GFX10-SCRATCH-NEXT: s_mov_b32 s27, s43 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s28, 24