Loop headers frequently consume the loop-carried value in the header block itself via non-lookthrough ops (e.g. byte-wise vector binops). LiveRegOptimizer's same-BB filter currently prunes these users, so the loop-carried PHI is not coerced to i32 and the intended packed form is lost. This change relaxes the filter: when the def is a PHI, same-BB non-lookthrough users are allowed. It also fixes the check to inspect the user (CII) rather than the def (II), so the walk does not terminate prematurely.
LLVM IR test file (47 lines, 1.7 KiB)
; Runs only the amdgpu-late-codegenprepare pass (targeting gfx90a) over this
; file and pipes the resulting textual IR into FileCheck.
; RUN: opt -S -passes=amdgpu-late-codegenprepare \
; RUN:   -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s

; Goal: given a loop-header PHI of an illegal vector type (<4 x i8>) with a
; same-BB non-lookthrough user (a vector add) in the header, LRO should still
; coerce the PHI to i32 because a profitable sink (a store) exists in another
; basic block.

; Kernel under test. %acc is a loop-carried <4 x i8> PHI defined in %loop. Its
; user inside %loop is a vector add (which also feeds the PHI's back edge); its
; user outside %loop is the bitcast-to-i32 that feeds the store in %store.
define amdgpu_kernel void @phi_samebb_nonlookthrough_store(
    ptr addrspace(1) %out, <4 x i8> %v, i1 %exit) {
; CHECK-LABEL: @phi_samebb_nonlookthrough_store(
entry:
  br label %loop

loop:                                             ; preds = %entry, %loop
  ; Loop-carried PHI in illegal vector type.
  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]

  ; Same-BB non-lookthrough use in header.
  %acc.next = add <4 x i8> %acc, %v

  ; Make it a real loop: either iterate or exit to the sink block.
  br i1 %exit, label %store, label %loop

store:                                            ; preds = %loop
  ; The across-BB sink: the PHI value is bitcast to i32 and stored to %out.
  %acc.bc = bitcast <4 x i8> %acc to i32
  store i32 %acc.bc, ptr addrspace(1) %out, align 4
  ret void
}

; After AMDGPULateCodeGenPrepare we expect:
;   - the PHI is coerced to i32;
;   - a bitcast back to <4 x i8> materializes in the header to feed the add.
; This proves the same-BB non-lookthrough user (add) did not get pruned
; when the def is a PHI.
;
; In the store block the checks below expect the i32 PHI to be bitcast back to
; <4 x i8> first, and that vector value to be bitcast to i32 for the store.

; CHECK: loop:
; CHECK: %[[ACC_TC:[^ ]+]] = phi i32
; CHECK: %[[ACC_TC_BC:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8>
; CHECK: %[[ACC_NEXT:[^ ]+]] = add <4 x i8> %[[ACC_TC_BC]], %v
; CHECK: br i1 %exit, label %store, label %loop
; CHECK: store:
; CHECK: %[[ACC_TC_BC2:[^ ]+]] = bitcast i32 %[[ACC_TC]] to <4 x i8>
; CHECK: %[[ST_I32:[^ ]+]] = bitcast <4 x i8> %[[ACC_TC_BC2]] to i32
; CHECK: store i32 %[[ST_I32]],
|
|
|