
LoopPeel currently considers PHI nodes that become loop invariants through peeling. However, in some cases, peeling transforms PHI nodes into induction variables (IVs), potentially enabling further optimizations such as loop vectorization. For example: ```c // TSVC s292 int im = N-1; for (int i=0; i<N; i++) { a[i] = b[i] + b[im]; im = i; } ``` In this case, peeling one iteration converts `im` into an IV, allowing it to be handled by the loop vectorizer. This patch adds a new feature to peel loops when to convert PHIs into IVs. At the moment this feature is disabled by default. Enabling it allows to vectorize the above example. I have measured on neoverse-v2 and observed a speedup of more than 60% (options: `-O3 -ffast-math -mcpu=neoverse-v2 -mllvm -enable-peeling-for-iv`). This PR is taken over from #94900 Related #81851
151 lines
6.2 KiB
LLVM
151 lines
6.2 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: opt < %s -S -passes=loop-unroll -enable-peeling-for-iv | FileCheck %s
|
|
; RUN: opt < %s -S -passes=loop-unroll-full -enable-peeling-for-iv | FileCheck %s
|
|
|
|
; Check that unnecessary peeling doesn't occur if for a comparison instruction
|
|
; between two instructions. The original code is as below. Both i and j are
|
|
; inductions, but the comparison i < j is not an induction.
|
|
;
|
|
; val = 42;
|
|
; for (i=0,j=100; i<10000; i+=2,j+=1) {
|
|
; a[i] = val;
|
|
; val = i < j;
|
|
; }
|
|
;
|
|
define void @dont_peel_cmp_ind_ind(ptr %a) {
|
|
; CHECK-LABEL: define void @dont_peel_cmp_ind_ind(
|
|
; CHECK-SAME: ptr [[A:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*]]:
|
|
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
|
|
; CHECK: [[FOR_BODY]]:
|
|
; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[J:%.*]] = phi i32 [ 100, %[[ENTRY]] ], [ [[J_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[VAL:%.*]] = phi i32 [ 42, %[[ENTRY]] ], [ [[VAL_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i32 [[I]]
|
|
; CHECK-NEXT: store i32 10, ptr [[ARRAYIDX]], align 4
|
|
; CHECK-NEXT: [[VAL_NEXT_CMP:%.*]] = icmp slt i32 [[I]], [[J]]
|
|
; CHECK-NEXT: [[VAL_NEXT]] = zext i1 [[VAL_NEXT_CMP]] to i32
|
|
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 2
|
|
; CHECK-NEXT: [[J_NEXT]] = add i32 [[J]], 1
|
|
; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[I_NEXT]], 10000
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_NEXT]], 10000
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[EXIT:.*]]
|
|
; CHECK: [[EXIT]]:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
br label %for.body
|
|
|
|
for.body:
|
|
%i = phi i32 [ 0, %entry ], [ %i.next, %for.body ]
|
|
%j = phi i32 [ 100, %entry ] , [ %j.next, %for.body ]
|
|
%val = phi i32 [ 42, %entry ], [ %val.next, %for.body ]
|
|
%arrayidx = getelementptr inbounds nuw i32, ptr %a, i32 %i
|
|
store i32 10, ptr %arrayidx, align 4
|
|
%val.next.cmp = icmp slt i32 %i, %j
|
|
%val.next = zext i1 %val.next.cmp to i32
|
|
%i.next = add i32 %i, 2
|
|
%j.next = add i32 %j, 1
|
|
%cmp = icmp ne i32 %i.next, 10000
|
|
%exitcond = icmp slt i32 %i.next, 10000
|
|
br i1 %exitcond, label %for.body, label %exit
|
|
|
|
exit:
|
|
ret void
|
|
}
|
|
|
|
|
|
; Check that unnecessary peeling doesn't occur if for a bitwise instructions
|
|
; between IVs. The original code is as below. The variable i is an induction,
|
|
; but vals (val0 through val4) are not.
|
|
;
|
|
; val0 = 42;
|
|
; val1 = 42;
|
|
; val2 = 42;
|
|
; val3 = 42;
|
|
; val4 = 42;
|
|
; for (i=0,j=100; i<10000; i+=2,j+=1) {
|
|
; a[i] = val0;
|
|
; b[i] = val1;
|
|
; c[i] = val2;
|
|
; d[i] = val3;
|
|
; e[i] = val4;
|
|
; val0 = i & j;
|
|
; val1 = i | j;
|
|
; val2 = i ^ j;
|
|
; val3 = i >> j;
|
|
; val4 = i << j;
|
|
; }
|
|
;
|
|
define void @dont_peel_bitwise_op_iv_iv(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) {
|
|
; CHECK-LABEL: define void @dont_peel_bitwise_op_iv_iv(
|
|
; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], ptr [[D:%.*]], ptr [[E:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*]]:
|
|
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
|
|
; CHECK: [[FOR_BODY]]:
|
|
; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[J:%.*]] = phi i32 [ 100, %[[ENTRY]] ], [ [[J_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[VAL0:%.*]] = phi i32 [ 42, %[[ENTRY]] ], [ [[VAL0_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[VAL1:%.*]] = phi i32 [ 42, %[[ENTRY]] ], [ [[VAL1_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[VAL2:%.*]] = phi i32 [ 42, %[[ENTRY]] ], [ [[VAL2_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[VAL3:%.*]] = phi i32 [ 42, %[[ENTRY]] ], [ [[VAL3_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[VAL4:%.*]] = phi i32 [ 42, %[[ENTRY]] ], [ [[VAL4_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[IDX_0:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i32 [[I]]
|
|
; CHECK-NEXT: [[IDX_1:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i32 [[I]]
|
|
; CHECK-NEXT: [[IDX_2:%.*]] = getelementptr inbounds nuw i32, ptr [[C]], i32 [[I]]
|
|
; CHECK-NEXT: [[IDX_3:%.*]] = getelementptr inbounds nuw i32, ptr [[D]], i32 [[I]]
|
|
; CHECK-NEXT: [[IDX_4:%.*]] = getelementptr inbounds nuw i32, ptr [[E]], i32 [[I]]
|
|
; CHECK-NEXT: store i32 [[VAL0]], ptr [[IDX_0]], align 4
|
|
; CHECK-NEXT: store i32 [[VAL1]], ptr [[IDX_1]], align 4
|
|
; CHECK-NEXT: store i32 [[VAL2]], ptr [[IDX_2]], align 4
|
|
; CHECK-NEXT: store i32 [[VAL3]], ptr [[IDX_3]], align 4
|
|
; CHECK-NEXT: store i32 [[VAL4]], ptr [[IDX_4]], align 4
|
|
; CHECK-NEXT: [[VAL0_NEXT]] = and i32 [[I]], [[J]]
|
|
; CHECK-NEXT: [[VAL1_NEXT]] = or i32 [[I]], [[J]]
|
|
; CHECK-NEXT: [[VAL2_NEXT]] = xor i32 [[I]], [[J]]
|
|
; CHECK-NEXT: [[VAL3_NEXT]] = shl i32 [[I]], [[J]]
|
|
; CHECK-NEXT: [[VAL4_NEXT]] = lshr i32 [[I]], [[J]]
|
|
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 2
|
|
; CHECK-NEXT: [[J_NEXT]] = add i32 [[J]], 1
|
|
; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[I_NEXT]], 10000
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_NEXT]], 10000
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[EXIT:.*]]
|
|
; CHECK: [[EXIT]]:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
br label %for.body
|
|
|
|
for.body:
|
|
%i = phi i32 [ 0, %entry ], [ %i.next, %for.body ]
|
|
%j = phi i32 [ 100, %entry ] , [ %j.next, %for.body ]
|
|
%val0 = phi i32 [ 42, %entry ], [ %val0.next, %for.body ]
|
|
%val1 = phi i32 [ 42, %entry ], [ %val1.next, %for.body ]
|
|
%val2 = phi i32 [ 42, %entry ], [ %val2.next, %for.body ]
|
|
%val3 = phi i32 [ 42, %entry ], [ %val3.next, %for.body ]
|
|
%val4 = phi i32 [ 42, %entry ], [ %val4.next, %for.body ]
|
|
%idx.0 = getelementptr inbounds nuw i32, ptr %a, i32 %i
|
|
%idx.1 = getelementptr inbounds nuw i32, ptr %b, i32 %i
|
|
%idx.2 = getelementptr inbounds nuw i32, ptr %c, i32 %i
|
|
%idx.3 = getelementptr inbounds nuw i32, ptr %d, i32 %i
|
|
%idx.4 = getelementptr inbounds nuw i32, ptr %e, i32 %i
|
|
store i32 %val0, ptr %idx.0, align 4
|
|
store i32 %val1, ptr %idx.1, align 4
|
|
store i32 %val2, ptr %idx.2, align 4
|
|
store i32 %val3, ptr %idx.3, align 4
|
|
store i32 %val4, ptr %idx.4, align 4
|
|
%val0.next = and i32 %i, %j
|
|
%val1.next = or i32 %i, %j
|
|
%val2.next = xor i32 %i, %j
|
|
%val3.next = shl i32 %i, %j
|
|
%val4.next = lshr i32 %i, %j
|
|
%i.next = add i32 %i, 2
|
|
%j.next = add i32 %j, 1
|
|
%cmp = icmp ne i32 %i.next, 10000
|
|
%exitcond = icmp slt i32 %i.next, 10000
|
|
br i1 %exitcond, label %for.body, label %exit
|
|
|
|
exit:
|
|
ret void
|
|
}
|