llvm-project/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll
michaelselehov 3645cef1ef
[AMDGPU] LiveRegOptimizer: consider i8/i16 binops on SDWA (#155800)
PHI-node part was merged with PR#160909.

Extend `isOpLegal` to treat 8/16-bit vector add/sub/and/or/xor as
profitable on SDWA targets (stores and intrinsics remain profitable).
This repacks loop-carried values to i32 across BBs and restores SDWA
lowering instead of scattered lshr/lshl/or sequences.

Testing:
- Local: `check-llvm-codegen-amdgpu` is green (4314/4320 passed, 6
XFAIL).
- Additional: validated in AMD internal CI
2025-12-15 12:04:33 -05:00

64 lines
2.1 KiB
LLVM

; REQUIRES: amdgpu-registered-target
; RUN: opt -S -passes=amdgpu-late-codegenprepare \
; RUN: -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s
; Purpose:
; - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
; loop header (same basic block as the PHI).
; - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
; the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
; placed in the header (enabling SDWA-friendly lowering later).
;
; What we check:
; - PHI is i32 (no loop-carried <4 x i8> PHI remains).
; - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
; - The loop block repacks the add result with a bitcast <4 x i8> -> i32 that
;   feeds the PHI's backedge.
; Test input: a single-block loop whose only loop-carried vector value is a
; <4 x i8> accumulator that is updated by a byte-wise add inside the same block.
; %p is not referenced by the body; %n is the (signed) loop bound.
define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
entry:
  br label %loop

loop:
  ; Induction variable; stepped by 4 at the bottom of the block.
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  ; <4 x i8> accumulator carried around the backedge, starting at all-zero.
  ; This is the PHI the test expects to be rewritten.
  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]

  ; Four lane values: the low bytes of %i, %i+1, %i+2 and %i+3. They are
  ; computed from the index so that the loop needs no loads or stores.
  %b0 = trunc i32 %i to i8
  %i.plus1 = add i32 %i, 1
  %b1 = trunc i32 %i.plus1 to i8
  %i.plus2 = add i32 %i, 2
  %b2 = trunc i32 %i.plus2 to i8
  %i.plus3 = add i32 %i, 3
  %b3 = trunc i32 %i.plus3 to i8

  ; Assemble the bytes into one vector, one lane per insertelement.
  %vec.l0 = insertelement <4 x i8> zeroinitializer, i8 %b0, i32 0
  %vec.l1 = insertelement <4 x i8> %vec.l0, i8 %b1, i32 1
  %vec.l2 = insertelement <4 x i8> %vec.l1, i8 %b2, i32 2
  %vec = insertelement <4 x i8> %vec.l2, i8 %b3, i32 3

  ; Vector add located in the PHI's own block; the test relies on this use to
  ; make coercing the PHI profitable.
  %acc.next = add <4 x i8> %acc, %vec

  ; Stay in the loop while the stepped index is (signed) less than %n.
  %i.next = add i32 %i, 4
  %continue = icmp slt i32 %i.next, %n
  br i1 %continue, label %loop, label %exit

exit:
  ret void
}
; Expected shape of the loop block after amdgpu-late-codegenprepare:
;  1. the accumulator PHI is i32 and no <4 x i8> PHI precedes it;
;  2. the line right after that PHI unpacks it with a bitcast to <4 x i8>;
;  3. the byte-wise add consumes the unpacked value;
;  4. the add result is repacked to i32 before the branch, so the value
;     carried on the backedge is i32.
; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
; CHECK: loop:
; CHECK: %i = phi i32
; CHECK-NOT: phi <4 x i8>
; CHECK: %[[ACCI32:[^ ]+]] = phi i32
; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
; CHECK: %[[ACCNEXT:[^ ]+]] = add <4 x i8> %[[HDRCAST]],
; The repack below is listed in the test's header comment but was previously
; not verified by any directive.
; CHECK: bitcast <4 x i8> %[[ACCNEXT]] to i32
; CHECK: br i1