The PHI-node part was merged in PR #160909. This change extends `isOpLegal` to treat 8/16-bit vector add/sub/and/or/xor as profitable on SDWA targets (stores and intrinsics remain profitable). As a result, loop-carried values are repacked to i32 across basic blocks, which restores SDWA lowering instead of scattered lshr/lshl/or sequences. Testing: - Local: `check-llvm-codegen-amdgpu` is green (4314/4320 passed, 6 XFAIL). - Additional: validated in AMD internal CI.
64 lines
2.1 KiB
LLVM
64 lines
2.1 KiB
LLVM
; REQUIRES: amdgpu-registered-target
|
|
; RUN: opt -S -passes=amdgpu-late-codegenprepare \
|
|
; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
|
|
|
|
; Purpose:
|
|
; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
|
|
; loop header (same basic block as the PHI).
|
|
; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
|
|
; the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
|
|
; placed in the header (enabling SDWA-friendly lowering later).
|
|
;
|
|
; What we check:
|
|
; - PHI is i32 (no loop-carried <4 x i8> PHI remains).
|
|
; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
|
|
; - The `loop` block is its own latch (there is no separate loopexit block);
;   it produces a bitcast <4 x i8> -> i32 that feeds the backedge.
|
|
|
|
; Test input: a single-block loop that carries a <4 x i8> accumulator through a
; PHI and updates it with a byte-wise vector add in the same block. Nothing is
; loaded or stored; the four byte lanes are synthesized from the induction
; variable. %p is not referenced by the body.
define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
entry:
  br label %loop

loop:
  ; Induction variable, advanced by 4 each trip.
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]

  ; The loop-carried vector-of-bytes PHI that the pass is expected to rewrite.
  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]

  ; i, i+1, i+2, i+3 as 32-bit values ...
  %i.plus1 = add i32 %i, 1
  %i.plus2 = add i32 %i, 2
  %i.plus3 = add i32 %i, 3

  ; ... each truncated to one byte lane.
  %b0 = trunc i32 %i to i8
  %b1 = trunc i32 %i.plus1 to i8
  %b2 = trunc i32 %i.plus2 to i8
  %b3 = trunc i32 %i.plus3 to i8

  ; Assemble the four lanes into a <4 x i8> value.
  %pack0 = insertelement <4 x i8> zeroinitializer, i8 %b0, i32 0
  %pack1 = insertelement <4 x i8> %pack0, i8 %b1, i32 1
  %pack2 = insertelement <4 x i8> %pack1, i8 %b2, i32 2
  %bytes = insertelement <4 x i8> %pack2, i8 %b3, i32 3

  ; Byte-wise add that consumes the PHI in the PHI's own block; this is the
  ; only user of %acc, so it alone has to make the coercion profitable.
  %acc.next = add <4 x i8> %acc, %bytes

  ; Trip control: continue while i + 4 < n (signed).
  %i.next = add i32 %i, 4
  %continue = icmp slt i32 %i.next, %n
  br i1 %continue, label %loop, label %exit

exit:
  ret void
}
|
|
|
|
; Expected shape of the loop block after amdgpu-late-codegenprepare:
;   1. the induction PHI %i is still an i32 PHI;
;   2. no <4 x i8> PHI is left between it and the coerced accumulator PHI;
;   3. the accumulator is carried as an i32 PHI and is immediately bitcast back
;      to <4 x i8> for the vector add;
;   4. the add result is bitcast to i32 again before the branch, so that the
;      value travelling along the backedge is a scalar.
; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
; CHECK: loop:
; CHECK: %i = phi i32
; CHECK-NOT: phi <4 x i8>
; CHECK: %[[ACCI32:[^ ]+]] = phi i32
; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
; CHECK: %[[ACCNEXT:[^ ]+]] = add <4 x i8> %[[HDRCAST]],
; CHECK: bitcast <4 x i8> %[[ACCNEXT]] to i32
; CHECK: br i1
|
|
|