Ryotaro Kasuga 3cd6b86cc1
[MachinePipeliner] Use AliasAnalysis properly when analyzing loop-carried dependencies (#136691)
MachinePipeliner uses AliasAnalysis to collect loop-carried memory
dependencies. To analyze loop-carried dependencies, we need to
explicitly tell AliasAnalysis that the values may come from different
iterations. Before this patch, MachinePipeliner didn't do this, so some
loop-carried dependencies might be missed. For example, in the following
case, there is a loop-carried dependency from the load to the store, but
it wasn't considered.

```
def @f(ptr noalias %p0, ptr noalias %p1) {
entry:
  br label %body

body:
  %idx0 = phi ptr [ %p0, %entry ], [ %p1, %body ]
  %idx1 = phi ptr [ %p1, %entry ], [ %p0, %body ]
  %v0 = load %idx0
  ...
  store %v1, %idx1
  ...
}
```

Further, the handling of the underlying objects was not sound. If there
is no information about memory operands (i.e., `memoperands()` is
empty), it must be handled conservatively. However, MachinePipeliner
used a dummy value (namely `UnknownValue`) in that case. The dummy value
was treated as distinct from every other "known" object, so necessary
dependencies were missed.
(NOTE: in such cases, `buildSchedGraph` adds non-loop-carried
dependencies correctly, so perhaps a critical problem has not occurred.)

This patch fixes the above problems. Handling these cases conservatively
introduces false dependencies that didn't exist before. To reduce them,
this patch also introduces additional alias checks on the underlying
objects.

Split off from #135148
2025-04-23 18:11:34 +09:00

152 lines
5.6 KiB
YAML

# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
# REQUIRES: asserts
# Test that no loop-carried dependencies are added between any pair of memory instructions.
# CHECK: SU(0): %8:intregs = PHI %1:intregs, %bb.1, %9:intregs, %bb.2
# CHECK-NEXT: # preds left
# CHECK-NEXT: # succs left
# CHECK-NEXT: # rdefs left
# CHECK-NEXT: Latency
# CHECK-NEXT: Depth
# CHECK-NEXT: Height
# CHECK-NEXT: Successors:
# CHECK-DAG: SU(6): Data Latency=0 Reg=%8
# CHECK-DAG: SU(5): Data Latency=0 Reg=%8
# CHECK-DAG: SU(3): Data Latency=0 Reg=%8
# CHECK-DAG: SU(6): Anti Latency=1
# CHECK-NEXT: SU(1): %10:intregs = PHI %2:intregs, %bb.1, %11:intregs, %bb.2
# CHECK-NEXT: # preds left
# CHECK-NEXT: # succs left
# CHECK-NEXT: # rdefs left
# CHECK-NEXT: Latency
# CHECK-NEXT: Depth
# CHECK-NEXT: Height
# CHECK-NEXT: Successors:
# CHECK-DAG: SU(7): Data Latency=0 Reg=%10
# CHECK-DAG: SU(4): Data Latency=0 Reg=%10
# CHECK-DAG: SU(2): Data Latency=0 Reg=%10
# CHECK-DAG: SU(7): Anti Latency=1
# CHECK-NEXT: SU(2): %12:hvxvr = V6_vL32b_ai %10:intregs, 0 :: (load (s1024) from %ir.iptr.09, !tbaa !4)
# CHECK-NEXT: # preds left
# CHECK-NEXT: # succs left
# CHECK-NEXT: # rdefs left
# CHECK-NEXT: Latency
# CHECK-NEXT: Depth
# CHECK-NEXT: Height
# CHECK-NEXT: Predecessors:
# CHECK-NEXT: SU(1): Data Latency=0 Reg=%10
# CHECK-NEXT: Successors:
# CHECK-NEXT: SU(3): Data Latency=0 Reg=%12
# CHECK-NEXT: SU(3): V6_vS32b_ai %8:intregs, 0, %12:hvxvr :: (store (s1024) into %ir.optr.010, !tbaa !4)
# CHECK-NEXT: # preds left
# CHECK-NEXT: # succs left
# CHECK-NEXT: # rdefs left
# CHECK-NEXT: Latency
# CHECK-NEXT: Depth
# CHECK-NEXT: Height
# CHECK-NEXT: Predecessors:
# CHECK-DAG: SU(2): Data Latency=0 Reg=%12
# CHECK-DAG: SU(0): Data Latency=0 Reg=%8
# CHECK-NEXT: SU(4): %13:hvxvr = V6_vL32b_ai %10:intregs, 128 :: (load (s1024) from %ir.cgep, !tbaa !4)
# CHECK-NEXT: # preds left
# CHECK-NEXT: # succs left
# CHECK-NEXT: # rdefs left
# CHECK-NEXT: Latency
# CHECK-NEXT: Depth
# CHECK-NEXT: Height
# CHECK-NEXT: Predecessors:
# CHECK-NEXT: SU(1): Data Latency=0 Reg=%10
# CHECK-NEXT: Successors:
# CHECK-NEXT: SU(5): Data Latency=0 Reg=%13
# CHECK-NEXT: SU(5): V6_vS32b_ai %8:intregs, 128, %13:hvxvr :: (store (s1024) into %ir.cgep3, !tbaa !4)
--- |
  ; LLVM IR module that the MIR memory operands (%ir.* names) refer to.
  ; Each iteration of %for.body loads two <128 x i8> vectors through
  ; %iptr.09 and stores them through %optr.010; both pointers advance by
  ; 256 bytes per iteration. %out is declared noalias, %in is not.
  define dso_local void @foo(ptr noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out, i32 noundef %width) local_unnamed_addr #0 {
  entry:
    %cmp7 = icmp sgt i32 %width, 0
    br i1 %cmp7, label %for.body.preheader, label %for.end

  for.body.preheader:                               ; preds = %entry
    %0 = add i32 %width, 128
    br label %for.body

  for.body:                                         ; preds = %for.body.preheader, %for.body
    %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
    %optr.010 = phi ptr [ %cgep4, %for.body ], [ %out, %for.body.preheader ]
    %iptr.09 = phi ptr [ %cgep5, %for.body ], [ %in, %for.body.preheader ]
    ; First load/store pair, at offset 0.
    %ald = load <128 x i8>, ptr %iptr.09, align 128, !tbaa !4
    %cst = bitcast <128 x i8> %ald to <32 x i32>
    store <32 x i32> %cst, ptr %optr.010, align 128, !tbaa !4
    ; Second load/store pair, at offset 128.
    %cgep = getelementptr i8, ptr %iptr.09, i32 128
    %ald1 = load <128 x i8>, ptr %cgep, align 128, !tbaa !4
    %cst2 = bitcast <128 x i8> %ald1 to <32 x i32>
    %cgep3 = getelementptr i8, ptr %optr.010, i32 128
    store <32 x i32> %cst2, ptr %cgep3, align 128, !tbaa !4
    %lsr.iv.next = add i32 %lsr.iv, -128
    %cmp = icmp samesign ugt i32 %lsr.iv.next, 128
    %cgep4 = getelementptr i8, ptr %optr.010, i32 256
    %cgep5 = getelementptr i8, ptr %iptr.09, i32 256
    br i1 %cmp, label %for.body, label %for.end

  for.end:                                          ; preds = %for.body, %entry
    ret void
  }

  attributes #0 = { "target-cpu"="hexagonv60" "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" }

  !llvm.module.flags = !{!0, !1, !2, !3}

  !0 = !{i32 1, !"wchar_size", i32 4}
  !1 = !{i32 8, !"PIC Level", i32 2}
  !2 = !{i32 7, !"PIE Level", i32 2}
  !3 = !{i32 7, !"frame-pointer", i32 2}
  !4 = !{!5, !5, i64 0}
  !5 = !{!"omnipotent char", !6, i64 0}
  !6 = !{!"Simple C/C++ TBAA"}
...
---
# Machine function for @foo. bb.2 is the self-looping block set up by
# J2_loop0r and closed by ENDLOOP0; it holds the two load/store pairs whose
# dependencies the FileCheck lines at the top of this file inspect.
name:            foo
tracksRegLiveness: true
body:             |
  bb.0.entry:
    successors: %bb.1(0x50000000), %bb.3(0x30000000)
    liveins: $r0, $r1, $r2

    %9:intregs = COPY $r2
    %8:intregs = COPY $r1
    %7:intregs = COPY $r0
    %10:predregs = C2_cmpgti %9, 0
    J2_jumpf %10, %bb.3, implicit-def dead $pc
    J2_jump %bb.1, implicit-def dead $pc

  bb.1.for.body.preheader:
    successors: %bb.2(0x80000000)

    %0:intregs = A2_addi %9, 128
    %15:intregs = A2_addi %0, -1
    %16:intregs = S2_lsr_i_r %15, 7
    %17:intregs = COPY %16
    J2_loop0r %bb.2, %17, implicit-def $lc0, implicit-def $sa0, implicit-def $usr

  bb.2.for.body (machine-block-address-taken):
    successors: %bb.2(0x7c000000), %bb.3(0x04000000)

    %2:intregs = PHI %8, %bb.1, %5, %bb.2
    %3:intregs = PHI %7, %bb.1, %6, %bb.2
    %12:hvxvr = V6_vL32b_ai %3, 0 :: (load (s1024) from %ir.iptr.09, !tbaa !4)
    V6_vS32b_ai %2, 0, killed %12 :: (store (s1024) into %ir.optr.010, !tbaa !4)
    %13:hvxvr = V6_vL32b_ai %3, 128 :: (load (s1024) from %ir.cgep, !tbaa !4)
    V6_vS32b_ai %2, 128, killed %13 :: (store (s1024) into %ir.cgep3, !tbaa !4)
    %5:intregs = A2_addi %2, 256
    %6:intregs = A2_addi %3, 256
    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
    J2_jump %bb.3, implicit-def dead $pc

  bb.3.for.end:
    PS_jmpret $r31, implicit-def dead $pc
...