llvm-project/llvm/test/CodeGen/AMDGPU/loop-header-align-gfx950.mir

# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -start-before=block-placement -o - %s | FileCheck %s

# Test that a loop header is aligned to 32 bytes on GFX950 when the first
# instruction of the header block is 8 bytes long, to prevent that instruction
# from being split by a 32-byte fetch window boundary.
# The second test case verifies that a 4-byte first instruction does NOT
# trigger alignment (its CHECK-NEXT chain would break if a .p2align were
# inserted before the loop label).

# Positive case: the loop header (bb.1) begins with V_LSHRREV_B64_e64, the
# 8-byte first instruction this test is named after, so the expected assembly
# contains a `.p2align 5, , 4` directive directly before the .LBB0_1 label.
---
name:            loop_with_8byte_first_inst
tracksRegLiveness: true
body:             |
  ; CHECK-LABEL: loop_with_8byte_first_inst:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
  ; CHECK-NEXT:    .p2align 5, , 4
  ; CHECK-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
  ; CHECK-NEXT:    v_lshrrev_b64 v[0:1], 1, v[0:1]
  ;
  ; The directive above requests 2^5 = 32-byte alignment with at most 4 bytes
  ; of padding. NOTE(review): the 4-byte cap presumably reflects that a
  ; 4-byte-aligned 8-byte instruction can only straddle a 32-byte boundary
  ; when it starts 4 bytes before it -- confirm against the target code.

  ; Preheader: clears the s[0:1] mask that the loop accumulates into, then
  ; falls through to the loop header.
  bb.0:
    successors: %bb.1(0x80000000)
    liveins: $vgpr0_vgpr1

    renamable $sgpr0_sgpr1 = S_MOV_B64 0

  ; Single-block loop: the back edge to bb.1 carries almost all of the branch
  ; weight (0x7c000000 vs. 0x04000000 for the exit to bb.2).
  bb.1:
    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
    liveins: $sgpr0_sgpr1, $vgpr0_vgpr1

    ; First instruction of the loop header; its size is what the test keys on.
    renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 1, killed renamable $vgpr0_vgpr1, implicit $exec
    ; Lanes whose shifted value reached zero are OR-ed into s[0:1] and then
    ; removed from exec; the loop repeats while exec is still non-zero.
    V_CMP_EQ_U64_e32 0, $vgpr0_vgpr1, implicit-def $vcc, implicit $exec
    renamable $sgpr0_sgpr1 = S_OR_B64 killed renamable $vcc, killed renamable $sgpr0_sgpr1, implicit-def $scc
    $exec = S_ANDN2_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
    S_CBRANCH_EXECNZ %bb.1, implicit $exec

  ; Exit block: ORs the accumulated s[0:1] mask back into exec and returns.
  bb.2:
    liveins: $sgpr0_sgpr1

    $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
    S_SETPC_B64_return undef $sgpr30_sgpr31
...

# Negative case: same loop shape as the 8-byte test, but the loop header
# (bb.1) begins with V_ADD_U32_e32, the 4-byte first instruction this test is
# named after. The expected assembly goes straight from s_mov_b64 to the
# .LBB1_1 label, so any alignment directive emitted in between fails the test.
---
name:            loop_with_4byte_first_inst
tracksRegLiveness: true
body:             |
  ; CHECK-LABEL: loop_with_4byte_first_inst:
  ; CHECK:       ; %bb.0:
  ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
  ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
  ; CHECK-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
  ; CHECK-NEXT:    v_add_u32_e32 v0, 1, v0

  ; Preheader: clears the s[0:1] mask that the loop accumulates into, then
  ; falls through to the loop header.
  bb.0:
    successors: %bb.1(0x80000000)
    liveins: $vgpr0

    renamable $sgpr0_sgpr1 = S_MOV_B64 0

  ; Single-block loop: the back edge to bb.1 carries almost all of the branch
  ; weight (0x7c000000 vs. 0x04000000 for the exit to bb.2).
  bb.1:
    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
    liveins: $sgpr0_sgpr1, $vgpr0

    ; First instruction of the loop header; its size is what the test keys on.
    renamable $vgpr0 = V_ADD_U32_e32 1, killed $vgpr0, implicit $exec
    ; Lanes where the V_CMP_LT_U32 compare of 10 against v0 sets vcc are OR-ed
    ; into s[0:1] and then removed from exec; the loop repeats while exec is
    ; still non-zero.
    V_CMP_LT_U32_e32 10, $vgpr0, implicit-def $vcc, implicit $exec
    renamable $sgpr0_sgpr1 = S_OR_B64 killed renamable $vcc, killed renamable $sgpr0_sgpr1, implicit-def $scc
    $exec = S_ANDN2_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
    S_CBRANCH_EXECNZ %bb.1, implicit $exec

  ; Exit block: ORs the accumulated s[0:1] mask back into exec and returns.
  bb.2:
    liveins: $sgpr0_sgpr1

    $exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
    S_SETPC_B64_return undef $sgpr30_sgpr31
...