Usman Nadeem b167ada896
[DFAJumpThreading] Rewrite the way paths are enumerated (#96127)
I tried to add a limit to number of blocks visited in the paths()
function but even with a very high limit the transformation coverage was
being reduced.

After looking at the code it seemed that the function was trying to
create paths of the form
`SwitchBB...DeterminatorBB...SwitchPredecessor`. This is inefficient
because a lot of nodes in those paths (nodes before DeterminatorBB)
would be irrelevant to the optimization. We only care about paths of the
form `DeterminatorBB_Pred DeterminatorBB...SwitchBB`. This weeds out a
lot of visited nodes.

In this patch I have added a hard limit to the number of nodes visited
and changed the algorithm for path calculation. Primarily I am
traversing the use-def chain for the PHI nodes that define the state. If
we have a hole in the use-def chain (no immediate predecessors) then I
call the paths() function.

I also had to the change the select instruction unfolding code to insert
redundant one input PHIs to allow the use of the use-def chain in
calculating the paths.

The test suite coverage with this patch (including a limit on nodes
visited) is as follows:

    Geomean diff:
      dfa-jump-threading.NumTransforms: +13.4%
      dfa-jump-threading.NumCloned: +34.1%
      dfa-jump-threading.NumPaths: -80.7%

Compile time effect vs baseline (pass enabled by default) is mostly
positive:
https://llvm-compile-time-tracker.com/compare.php?from=ad8705fda25f64dcfeb6264ac4d6bac36bee91ab&to=5a3af6ce7e852f0736f706b4a8663efad5bce6ea&stat=instructions:u

Change-Id: I0fba9e0f8aa079706f633089a8ccd4ecf57547ed
2024-08-10 12:13:53 -07:00

57 lines
1.5 KiB
LLVM

; REQUIRES: asserts
; RUN: opt -S -passes=dfa-jump-threading -dfa-max-path-length=3 %s \
; RUN: 2>&1 -disable-output -debug-only=dfa-jump-threading | FileCheck %s
; RUN: opt -S -passes=dfa-jump-threading -dfa-max-num-visited-paths=3 %s \
; RUN: 2>&1 -disable-output -debug-only=dfa-jump-threading | FileCheck %s
; Make the path
; < case1 case1.1 case1.2 case1.3 case1.4 for.inc for.body > [ 3, case1 ]
; too long so that it is not jump-threaded.
define i32 @max_path_length(i32 %num) {
; CHECK-NOT: 3, case1
; CHECK: < case2 for.inc for.body > [ 1, for.inc ]
; CHECK-NEXT: < for.inc for.body > [ 1, for.inc ]
; CHECK-NEXT: < case2 sel.si.unfold.false for.inc for.body > [ 2, sel.si.unfold.false ]
; CHECK-NEXT: DFA-JT: Renaming non-local uses of:
entry:
br label %for.body
for.body:
%count = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
%state = phi i32 [ 1, %entry ], [ %state.next, %for.inc ]
switch i32 %state, label %for.inc [
i32 1, label %case1
i32 2, label %case2
]
case1:
%case1.state.next = phi i32 [ 3, %for.body ]
br label %case1.1
case1.1:
br label %case1.2
case1.2:
br label %case1.3
case1.3:
br label %case1.4
case1.4:
br label %for.inc
case2:
%cmp = icmp eq i32 %count, 50
%sel = select i1 %cmp, i32 1, i32 2
br label %for.inc
for.inc:
%state.next = phi i32 [ %sel, %case2 ], [ 1, %for.body ], [ %case1.state.next, %case1.4 ]
%inc = add nsw i32 %count, 1
%cmp.exit = icmp slt i32 %inc, %num
br i1 %cmp.exit, label %for.body, label %for.end
for.end:
ret i32 0
}