; llvm-project/llvm/test/CodeGen/AMDGPU/lro-coerce-v4i8-phi-loop.ll

; REQUIRES: amdgpu-registered-target
; RUN: opt -S -passes=amdgpu-late-codegenprepare \
; RUN:   -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a %s | FileCheck %s

; Purpose:
;  - Input has a loop-carried PHI of type <4 x i8> and byte-wise adds in the
;    loop header (same basic block as the PHI).
;  - After amdgpu-late-codegenprepare, the PHI must be coerced to i32 across
;    the backedge, and a single dominating "bitcast i32 -> <4 x i8>" must be
;    placed in the header (enabling SDWA-friendly lowering later).
;
; What we check:
;  - PHI is i32 (no loop-carried <4 x i8> PHI remains).
;  - A header-local bitcast i32 -> <4 x i8> exists and feeds the vector add.
;  - The loop block (which is its own latch) produces a bitcast
;    <4 x i8> -> i32 that feeds the PHI's backedge operand.

define amdgpu_kernel void @lro_coerce_v4i8_phi(ptr nocapture %p, i32 %n) {
entry:
  br label %loop

loop:
  ; Induction variable: starts at 0 and advances by 4 on every iteration.
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]

  ; Running <4 x i8> sum carried around the backedge. This is the PHI the
  ; test is about.
  %acc = phi <4 x i8> [ zeroinitializer, %entry ], [ %acc.next, %loop ]

  ; Scalar offsets %i+1 .. %i+3. Everything is derived from %i, so the loop
  ; body contains no loads or stores.
  %off1 = add i32 %i, 1
  %off2 = add i32 %i, 2
  %off3 = add i32 %i, 3

  ; Narrow %i and each offset down to a single byte.
  %b0 = trunc i32 %i to i8
  %b1 = trunc i32 %off1 to i8
  %b2 = trunc i32 %off2 to i8
  %b3 = trunc i32 %off3 to i8

  ; Assemble the four bytes into lanes 0..3 of one <4 x i8> value.
  %pack0 = insertelement <4 x i8> zeroinitializer, i8 %b0, i32 0
  %pack1 = insertelement <4 x i8> %pack0, i8 %b1, i32 1
  %pack2 = insertelement <4 x i8> %pack1, i8 %b2, i32 2
  %bytes = insertelement <4 x i8> %pack2, i8 %b3, i32 3

  ; Lane-wise accumulate. The only non-PHI user of %acc lives in the same
  ; block as the PHI itself; the test expects coercion to happen anyway.
  %acc.next = add <4 x i8> %acc, %bytes

  ; Step the induction variable and keep looping while %i.next < %n (signed).
  %i.next = add i32 %i, 4
  %again = icmp slt i32 %i.next, %n
  br i1 %again, label %loop, label %exit

exit:
  ret void
}

; Expected shape of the loop block after the pass:
;  - the accumulator PHI has type i32, with 0 coming from the entry block and
;    the backedge operand coming from a cast emitted later in the block;
;  - the PHI is immediately followed by the cast back to <4 x i8>;
;  - the vector add consumes that cast, and the add's result is cast to i32
;    to form the backedge operand captured from the PHI;
;  - no <4 x i8> PHI survives anywhere between the first PHI and the add.
; CHECK-LABEL: define amdgpu_kernel void @lro_coerce_v4i8_phi(
; CHECK: loop:
; CHECK: %i = phi i32
; CHECK-NOT: phi <4 x i8>
; CHECK: %[[ACCI32:[^ ]+]] = phi i32 [ 0, %entry ], [ %[[BACKEDGE:[^ ,]+]], %loop ]
; CHECK-NEXT: %[[HDRCAST:[^ ]+]] = bitcast i32 %[[ACCI32]] to <4 x i8>
; CHECK-NOT: phi <4 x i8>
; CHECK: %[[ACCNEXT:[^ ]+]] = add <4 x i8> %[[HDRCAST]],
; CHECK: %[[BACKEDGE]] = bitcast <4 x i8> %[[ACCNEXT]] to i32
; CHECK: br i1