
Motivating example: https://godbolt.org/z/eb97zrxhx

Here we have two induction variables in the loop: one corresponds to the variable i (add rdx, 4), the other to res (add rax, 2). The second induction variable can be removed by the rewriteLoopExitValues() method (the final value of res at loop exit is unroll_iter * -2); however, this does not happen because there are duplicated LCSSA phi nodes at the loop exit:

```
; Preheader:
for.body.preheader.new:                           ; preds = %for.body.preheader
  %unroll_iter = and i64 %N, -4
  br label %for.body

; Loop:
for.body:                                         ; preds = %for.body, %for.body.preheader.new
  %lsr.iv = phi i64 [ %lsr.iv.next, %for.body ], [ 0, %for.body.preheader.new ]
  %i.07 = phi i64 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %inc.3 = add nuw i64 %i.07, 4
  %lsr.iv.next = add nsw i64 %lsr.iv, -2
  %niter.ncmp.3.not = icmp eq i64 %unroll_iter, %inc.3
  br i1 %niter.ncmp.3.not, label %for.end.loopexit.unr-lcssa.loopexit, label %for.body, !llvm.loop !7

; Exit blocks
for.end.loopexit.unr-lcssa.loopexit:              ; preds = %for.body
  %inc.3.lcssa = phi i64 [ %inc.3, %for.body ]
  %lsr.iv.next.lcssa11 = phi i64 [ %lsr.iv.next, %for.body ]
  %lsr.iv.next.lcssa = phi i64 [ %lsr.iv.next, %for.body ]
  br label %for.end.loopexit.unr-lcssa
```

rewriteLoopExitValues requires the %lsr.iv.next value to have only two uses: one in an LCSSA phi node and the other in the induction phi node. Here the value has three uses because of the duplicated LCSSA nodes, so the transform does not apply, which leaves an extra add operation inside the loop.

The proposed solution is to accumulate the inserted instructions that require an LCSSA form update in a SetVector and then call formLCSSAForInstructions on that SetVector once, so that the same instruction is not processed twice.

The reland fixes the issue with the preserve-lcssa.ll test: it failed when the x86_64-unknown-linux-gnu target was unavailable in opt. The changes are moved into a separate duplicated-phis.ll test with an explicit x86 target requirement, to fix bots that do not build this target.
114 lines
5.0 KiB
LLVM
114 lines
5.0 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn -loop-reduce -S < %s | FileCheck %s
; REQUIRES: asserts

; Test that LSR does not attempt to extend a pointer type to an integer type,
; which causes a SCEV analysis assertion.
;
; The failure mode is an assertion, so the test is restricted to builds that
; have assertions enabled (see the asserts requirement above).

; Pointer widths differ per address space in this layout: the default address
; space and addrspace(1)/(4) use 64-bit pointers (p:64:64, p1:64:64, p4:64:64),
; while addrspace(2)/(3)/(5)/(6) use 32-bit pointers (p2/p3/p5/p6:32:32).
; The functions below mix 64-bit and 32-bit pointer induction variables.
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"

target triple = "amdgcn-amd-amdhsa"

; External 1024-element double array in addrspace(3); stored to by
; @baseregtest.
@gVar = external hidden local_unnamed_addr addrspace(3) global [1024 x double], align 16

; @scaledregtest: two loops chained through %loopexit.
;
;  * for.body steps the i32 counter %I by 8. Its exit branch condition is the
;    constant false, so the back edge is the successor that is taken.
;  * loopexit zero-extends the last %inc to i64 and enters for.body.1.
;  * for.body.1 has only an unconditional back edge. Each iteration it loads a
;    ptr from addrspace(5) at null + trunc(%conv.1) and stores it through a
;    default-address-space pointer at null + %conv.1, then bumps the i32
;    counter by 1 and re-extends it.
;
; Expected LSR output (per the assertions below): each loop keeps one pointer
; induction variable per address space -- ptr addrspace(5) advanced with i32
; byte offsets and ptr advanced with i64 byte offsets -- and loopexit carries
; one LCSSA phi per pointer IV. No integer IV derived from a pointer appears.
define amdgpu_kernel void @scaledregtest() local_unnamed_addr {
; CHECK-LABEL: @scaledregtest(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       loopexit:
; CHECK-NEXT:    [[SCEVGEP11_LCSSA:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[SCEVGEP13_LCSSA:%.*]] = phi ptr [ [[SCEVGEP13:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    br label [[FOR_BODY_1:%.*]]
; CHECK:       for.body.1:
; CHECK-NEXT:    [[LSR_IV5:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP6:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP11_LCSSA]], [[LOOPEXIT:%.*]] ]
; CHECK-NEXT:    [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[FOR_BODY_1]] ], [ [[SCEVGEP13_LCSSA]], [[LOOPEXIT]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[LSR_IV5]], align 8
; CHECK-NEXT:    store ptr [[TMP0]], ptr [[LSR_IV1]], align 8
; CHECK-NEXT:    [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 8
; CHECK-NEXT:    [[SCEVGEP6]] = getelementptr i8, ptr addrspace(5) [[LSR_IV5]], i32 8
; CHECK-NEXT:    br label [[FOR_BODY_1]]
; CHECK:       for.body:
; CHECK-NEXT:    [[LSR_IV12:%.*]] = phi ptr [ [[SCEVGEP13]], [[FOR_BODY]] ], [ null, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV10:%.*]] = phi ptr addrspace(5) [ [[SCEVGEP11]], [[FOR_BODY]] ], [ null, [[ENTRY]] ]
; CHECK-NEXT:    [[SCEVGEP11]] = getelementptr i8, ptr addrspace(5) [[LSR_IV10]], i32 64
; CHECK-NEXT:    [[SCEVGEP13]] = getelementptr i8, ptr [[LSR_IV12]], i64 64
; CHECK-NEXT:    br i1 false, label [[LOOPEXIT]], label [[FOR_BODY]]
;
entry:
  br label %for.body

loopexit:
  %conv = zext i32 %inc to i64
  br label %for.body.1

for.body.1:
  %conv.1 = phi i64 [ %conv.2, %for.body.1 ], [ %conv, %loopexit ]
  %I.1 = phi i32 [ %inc.1, %for.body.1 ], [ %inc, %loopexit ]
  %idxprom = trunc i64 %conv.1 to i32
  %arrayidx = getelementptr inbounds ptr, ptr addrspace(5) null, i32 %idxprom
  %0 = load ptr, ptr addrspace(5) %arrayidx, align 8
  %arrayidx.1 = getelementptr inbounds ptr, ptr null, i64 %conv.1
  store ptr %0, ptr %arrayidx.1, align 8
  %inc.1 = add nuw nsw i32 %I.1, 1
  %conv.2 = zext i32 %inc.1 to i64
  br label %for.body.1

for.body:
  %I = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %inc = add nuw nsw i32 %I, 8
  br i1 false, label %loopexit, label %for.body
}

; @baseregtest(i32 %n, i32 %lda): a single loop copying doubles between two
; address spaces with different pointer widths.
;
;  * entry branches on undef to either exit or if.end.
;  * if.end calls @foo once; the i32 result is added to both indices below.
;  * for.body has only an unconditional back edge. With %i counting up by 1
;    from 0, each iteration loads a double from addrspace(1) at
;    null[sext(%i * %lda + foo)] and stores it to @gVar[%i * %n + foo] in
;    addrspace(3).
;
; Expected LSR output (per the assertions below): the loop-invariant parts
; (shl by 3, sext) are computed in if.end, and the loop keeps two pointer IVs:
; ptr addrspace(1) stepped by the i64 value (sext %lda) << 3 and
; ptr addrspace(3) stepped by the i32 value %n << 3.
;
; NOTE(review): the entry branch condition is undef. Current LLVM guidance
; discourages undef in tests -- if this test is ever rewritten, consider a
; function argument instead (the assertions would need regenerating).
define protected amdgpu_kernel void @baseregtest(i32 %n, i32 %lda) local_unnamed_addr {
; CHECK-LABEL: @baseregtest(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 undef, label [[EXIT:%.*]], label [[IF_END:%.*]]
; CHECK:       if.end:
; CHECK-NEXT:    [[TMP0:%.*]] = tail call i32 @foo()
; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[TMP0]], 3
; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(3) @gVar, i32 [[TMP1]]
; CHECK-NEXT:    [[TMP2:%.*]] = shl i32 [[N:%.*]], 3
; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[TMP4:%.*]] = shl nsw i64 [[TMP3]], 3
; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr addrspace(1) null, i64 [[TMP4]]
; CHECK-NEXT:    [[TMP5:%.*]] = sext i32 [[LDA:%.*]] to i64
; CHECK-NEXT:    [[TMP6:%.*]] = shl nsw i64 [[TMP5]], 3
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[LSR_IV3:%.*]] = phi ptr addrspace(1) [ [[SCEVGEP4:%.*]], [[FOR_BODY]] ], [ [[SCEVGEP2]], [[IF_END]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr addrspace(3) [ [[SCEVGEP1:%.*]], [[FOR_BODY]] ], [ [[SCEVGEP]], [[IF_END]] ]
; CHECK-NEXT:    [[TMP7:%.*]] = load double, ptr addrspace(1) [[LSR_IV3]], align 8
; CHECK-NEXT:    store double [[TMP7]], ptr addrspace(3) [[LSR_IV]], align 8
; CHECK-NEXT:    [[SCEVGEP1]] = getelementptr i8, ptr addrspace(3) [[LSR_IV]], i32 [[TMP2]]
; CHECK-NEXT:    [[SCEVGEP4]] = getelementptr i8, ptr addrspace(1) [[LSR_IV3]], i64 [[TMP6]]
; CHECK-NEXT:    br label [[FOR_BODY]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  br i1 undef, label %exit, label %if.end

if.end:
  %0 = tail call i32 @foo()
  br label %for.body

for.body:
  %i = phi i32 [ %inc, %for.body ], [ 0, %if.end ]
  %mul1 = mul nsw i32 %i, %lda
  %add1 = add nsw i32 %mul1, %0
  %idxprom = sext i32 %add1 to i64
  %arrayidx = getelementptr inbounds double, ptr addrspace(1) null, i64 %idxprom
  %1 = load double, ptr addrspace(1) %arrayidx, align 8
  %mul2 = mul nsw i32 %i, %n
  %add2 = add nsw i32 %mul2, %0
  %arrayidx9110 = getelementptr inbounds [1024 x double], ptr addrspace(3) @gVar, i32 0, i32 %add2
  store double %1, ptr addrspace(3) %arrayidx9110, align 8
  %inc = add nuw nsw i32 %i, 1
  br label %for.body

exit:
  ret void
}

; External function with no body in this file. @baseregtest calls it once in
; if.end and adds the i32 result to both of its array indices.
declare i32 @foo()