[AMDGPU] Allow bank conflicts on src0 for V_DUAL_MOV_B32 for gfx1170 (#186100)

This commit is contained in:
Mirko Brkušanin 2026-03-12 13:49:52 +01:00 committed by GitHub
parent ef8db55ed6
commit e78c797780
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 397 additions and 454 deletions

View File

@ -3967,9 +3967,10 @@ AMDGPUAsmParser::checkVOPDRegBankConstraints(const MCInst &Inst, bool AsVOPD3) {
: MCRegister();
};
// On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2
// On GFX1170+ if both OpX and OpY are V_MOV_B32 then OpY uses SRC2
// source-cache.
bool SkipSrc =
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 ||
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 ||
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 ||
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx13 ||

View File

@ -142,9 +142,9 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
return false;
// On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2
// On GFX1170+ if both OpX and OpY are V_MOV_B32 then OpY uses SRC2
// source-cache.
bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 &&
bool SkipSrc = (ST.hasGFX11_7Insts() || ST.hasGFX12Insts()) &&
MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
MIY.getOpcode() == AMDGPU::V_MOV_B32_e32;
bool AllowSameVGPR = ST.hasGFX1250Insts();

View File

@ -1,6 +1,8 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX11 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX1100 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX1170 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX12 %s
@ -43,12 +45,19 @@ body: |
; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_schedule
; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_schedule
; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_schedule
; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_schedule
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
@ -80,12 +89,19 @@ body: |
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_fmamk
; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_fmamk
; PAIR-GFX1100: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_fmamk
; PAIR-GFX1170: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1170 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_fmamk
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
@ -155,19 +171,33 @@ body: |
; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
;
; PAIR-GFX11-LABEL: name: vopd_cndmask
; PAIR-GFX11: liveins: $vcc_lo
; PAIR-GFX11-NEXT: {{ $}}
; PAIR-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $sgpr20 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
; PAIR-GFX1100-LABEL: name: vopd_cndmask
; PAIR-GFX1100: liveins: $vcc_lo
; PAIR-GFX1100-NEXT: {{ $}}
; PAIR-GFX1100-NEXT: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $sgpr20 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1100-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1100-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
;
; PAIR-GFX1170-LABEL: name: vopd_cndmask
; PAIR-GFX1170: liveins: $vcc_lo
; PAIR-GFX1170-NEXT: {{ $}}
; PAIR-GFX1170-NEXT: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $sgpr20 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx1170 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1170-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1170-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo
;
; PAIR-GFX12-LABEL: name: vopd_cndmask
; PAIR-GFX12: liveins: $vcc_lo
@ -211,10 +241,15 @@ body: |
; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec
; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_mov
; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_mov
; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_mov
; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1170 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_mov
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
@ -239,10 +274,15 @@ body: |
; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec
; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_mov_mov
; PAIR-GFX11: $sgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $sgpr7 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_mov_mov
; PAIR-GFX1100: $sgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $sgpr7 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_mov_mov
; PAIR-GFX1170: $sgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $sgpr7 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_mov_mov
; PAIR-GFX12: $sgpr0 = IMPLICIT_DEF
@ -300,12 +340,19 @@ body: |
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_constants_inlinable
; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_constants_inlinable
; PAIR-GFX1100: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_constants_inlinable
; PAIR-GFX1170: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1170 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_constants_inlinable
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
@ -338,12 +385,19 @@ body: |
; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_constants_same
; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_constants_same
; PAIR-GFX1100: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_constants_same
; PAIR-GFX1170: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1170 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_constants_same
; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF
@ -373,10 +427,15 @@ body: |
; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 981467136, implicit $exec
; SCHED-NEXT: $vgpr2 = V_FMAAK_F32 killed $sgpr0, killed $vgpr0, 981467136, implicit $mode, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_mov_fmaak_constants_same
; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $sgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx11 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_mov_fmaak_constants_same
; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $sgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx11 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_mov_fmaak_constants_same
; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $sgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx1170 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_mov_fmaak_constants_same
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
@ -403,11 +462,17 @@ body: |
; SCHED-NEXT: DBG_VALUE $vgpr0, 0, 0
; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_debug
; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: DBG_VALUE $vgpr0, 0, 0
; PAIR-GFX1100-LABEL: name: vopd_debug
; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: DBG_VALUE $vgpr0, 0, 0
;
; PAIR-GFX1170-LABEL: name: vopd_debug
; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: DBG_VALUE $vgpr0, 0, 0
;
; PAIR-GFX12-LABEL: name: vopd_debug
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
@ -451,23 +516,41 @@ body: |
; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_schedule_unconstrained
; PAIR-GFX11: liveins: $vcc_lo
; PAIR-GFX11-NEXT: {{ $}}
; PAIR-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr12, $vgpr11 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
; PAIR-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_schedule_unconstrained
; PAIR-GFX1100: liveins: $vcc_lo
; PAIR-GFX1100-NEXT: {{ $}}
; PAIR-GFX1100-NEXT: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr12, $vgpr11 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1100-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1100-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1100-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
; PAIR-GFX1100-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_schedule_unconstrained
; PAIR-GFX1170: liveins: $vcc_lo
; PAIR-GFX1170-NEXT: {{ $}}
; PAIR-GFX1170-NEXT: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr12, $vgpr11 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1170-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1170-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx1170 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1170-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
; PAIR-GFX1170-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained
; PAIR-GFX12: liveins: $vcc_lo
@ -551,32 +634,59 @@ body: |
; SCHED-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; SCHED-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_schedule_unconstrained_2
; PAIR-GFX11: liveins: $vcc_lo
; PAIR-GFX11-NEXT: {{ $}}
; PAIR-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr20 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX11-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
; PAIR-GFX11-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_schedule_unconstrained_2
; PAIR-GFX1100: liveins: $vcc_lo
; PAIR-GFX1100-NEXT: {{ $}}
; PAIR-GFX1100-NEXT: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr20 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1100-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1100-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1100-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
; PAIR-GFX1100-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_schedule_unconstrained_2
; PAIR-GFX1170: liveins: $vcc_lo
; PAIR-GFX1170-NEXT: {{ $}}
; PAIR-GFX1170-NEXT: $vgpr2 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr20 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1170-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx1170 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1170-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx1170 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx1170 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo
; PAIR-GFX1170-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo
; PAIR-GFX1170-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained_2
; PAIR-GFX12: liveins: $vcc_lo
@ -657,11 +767,17 @@ body: |
; SCHED-NEXT: $vgpr4 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec
; SCHED-NEXT: $vgpr5 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_mov_fixup
; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX11-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_mov_fixup
; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_mov_fixup
; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1170 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1170-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_mov_fixup
; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF
@ -726,11 +842,16 @@ body: |
; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_mov_mov_same_src_bank
; PAIR-GFX11: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr5 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
; PAIR-GFX11-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_mov_mov_same_src_bank
; PAIR-GFX1100: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr5 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
; PAIR-GFX1100-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_mov_mov_same_src_bank
; PAIR-GFX1170: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr5 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 killed $vgpr1, killed $vgpr5, implicit $exec, implicit $exec, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_mov_mov_same_src_bank
; PAIR-GFX12: $vgpr1 = IMPLICIT_DEF
@ -754,10 +875,15 @@ body: |
; SCHED-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec
; SCHED-NEXT: $vgpr1 = V_ADD_F32_e32 killed $vgpr3, $vgpr3, implicit $mode, implicit $exec
;
; PAIR-GFX11-LABEL: name: vopd_combine_opy_overwrites_opx
; PAIR-GFX11: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX11-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr1, killed $vgpr3, $vgpr3, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
; PAIR-GFX1100-LABEL: name: vopd_combine_opy_overwrites_opx
; PAIR-GFX1100: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1100-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr1, killed $vgpr3, $vgpr3, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX1170-LABEL: name: vopd_combine_opy_overwrites_opx
; PAIR-GFX1170: $vgpr1 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF
; PAIR-GFX1170-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1170 killed $vgpr1, killed $vgpr3, $vgpr3, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec
;
; PAIR-GFX12-LABEL: name: vopd_combine_opy_overwrites_opx
; PAIR-GFX12: $vgpr1 = IMPLICIT_DEF

View File

@ -17,37 +17,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v11, v10
; GFX1170-NEXT: v_mov_b32_e32 v12, v10
; GFX1170-NEXT: v_mov_b32_e32 v13, v10
; GFX1170-NEXT: v_mov_b32_e32 v14, v10
; GFX1170-NEXT: v_mov_b32_e32 v15, v10
; GFX1170-NEXT: v_mov_b32_e32 v16, v10
; GFX1170-NEXT: v_mov_b32_e32 v17, v10
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX1170-NEXT: s_clause 0x1
; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
; GFX12-NEXT: v_mov_b32_e32 v17, v10
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v10, 0x40400000
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GCN-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
; GCN-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
; GCN-NEXT: v_mov_b32_e32 v17, v10
; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17]
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@ -69,37 +51,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v11, v10
; GFX1170-NEXT: v_mov_b32_e32 v12, v10
; GFX1170-NEXT: v_mov_b32_e32 v13, v10
; GFX1170-NEXT: v_mov_b32_e32 v14, v10
; GFX1170-NEXT: v_mov_b32_e32 v15, v10
; GFX1170-NEXT: v_mov_b32_e32 v16, v10
; GFX1170-NEXT: v_mov_b32_e32 v17, v10
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX1170-NEXT: s_clause 0x1
; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
; GFX12-NEXT: v_mov_b32_e32 v17, v10
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v10, 0x40400000
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GCN-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10
; GCN-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10
; GCN-NEXT: v_mov_b32_e32 v17, v10
; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@ -119,26 +83,15 @@ bb:
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v10, 0x42004200
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v11, v10
; GFX1170-NEXT: v_mov_b32_e32 v12, v10
; GFX1170-NEXT: v_mov_b32_e32 v13, v10
; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GFX12-NEXT: v_mov_b32_e32 v13, v10
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v10, 0x42004200
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GCN-NEXT: v_mov_b32_e32 v13, v10
; GCN-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
store <8 x half> %res, ptr addrspace(1) %out
@ -146,26 +99,15 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3f803f80
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v11, v10
; GFX1170-NEXT: v_mov_b32_e32 v12, v10
; GFX1170-NEXT: v_mov_b32_e32 v13, v10
; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x3f803f80
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GFX12-NEXT: v_mov_b32_e32 v13, v10
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v10, 0x3f803f80
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GCN-NEXT: v_mov_b32_e32 v13, v10
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
@ -173,26 +115,15 @@ bb:
}
define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v11, v10
; GFX1170-NEXT: v_mov_b32_e32 v12, v10
; GFX1170-NEXT: v_mov_b32_e32 v13, v10
; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GFX12-NEXT: v_mov_b32_e32 v13, v10
; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10
; GCN-NEXT: v_mov_b32_e32 v13, v10
; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
store <8 x i16> %res, ptr addrspace(1) %out
@ -214,37 +145,19 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX1170-NEXT: s_clause 0x1
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v6, 0x80
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GCN-NEXT: v_mov_b32_e32 v13, v6
; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@ -266,37 +179,19 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v4, 0x80
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v5, v4
; GFX1170-NEXT: v_mov_b32_e32 v6, v4
; GFX1170-NEXT: v_mov_b32_e32 v7, v4
; GFX1170-NEXT: v_mov_b32_e32 v8, v4
; GFX1170-NEXT: v_mov_b32_e32 v9, v4
; GFX1170-NEXT: v_mov_b32_e32 v10, v4
; GFX1170-NEXT: v_mov_b32_e32 v11, v4
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
; GFX1170-NEXT: s_clause 0x1
; GFX1170-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
; GFX1170-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4
; GFX12-NEXT: v_mov_b32_e32 v11, v4
; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v4, 0x80
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
; GCN-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
; GCN-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4
; GCN-NEXT: v_mov_b32_e32 v11, v4
; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@ -318,37 +213,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX1170-NEXT: s_clause 0x1
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GCN-NEXT: v_mov_b32_e32 v13, v6
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@ -370,37 +247,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX1170-NEXT: s_clause 0x1
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GCN-NEXT: v_mov_b32_e32 v13, v6
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@ -422,37 +281,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX1170-NEXT: s_clause 0x1
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GCN-NEXT: v_mov_b32_e32 v13, v6
; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@ -474,37 +315,19 @@ bb:
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX1170-NEXT: s_clause 0x1
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GCN-NEXT: v_mov_b32_e32 v13, v6
; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
store <8 x float> %res, ptr addrspace(1) %out
@ -526,37 +349,19 @@ bb:
}
define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
; GFX1170-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GFX1170: ; %bb.0: ; %bb
; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_mov_b32_e32 v7, v6
; GFX1170-NEXT: v_mov_b32_e32 v8, v6
; GFX1170-NEXT: v_mov_b32_e32 v9, v6
; GFX1170-NEXT: v_mov_b32_e32 v10, v6
; GFX1170-NEXT: v_mov_b32_e32 v11, v6
; GFX1170-NEXT: v_mov_b32_e32 v12, v6
; GFX1170-NEXT: v_mov_b32_e32 v13, v6
; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1170-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX1170-NEXT: s_clause 0x1
; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX1170-NEXT: s_endpgm
;
; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v6, 0x80
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GFX12-NEXT: v_mov_b32_e32 v13, v6
; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX12-NEXT: s_endpgm
; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: v_mov_b32_e32 v6, 0x80
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
; GCN-NEXT: v_mov_b32_e32 v13, v6
; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off
; GCN-NEXT: s_endpgm
bb:
%res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
store <8 x i32> %res, ptr addrspace(1) %out
@ -574,3 +379,6 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32>
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>)
declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX1170: {{.*}}
; GFX12: {{.*}}

View File

@ -0,0 +1,8 @@
// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1170 -show-encoding %s | FileCheck -check-prefix=GFX1170 %s
//===----------------------------------------------------------------------===//
// A VOPD OpY mov_b32 instruction uses SRC2 source-cache if OpX is also mov_b32
//===----------------------------------------------------------------------===//
// Both halves of the dual instruction below are mov_b32, reading v5 (OpX) and
// v1 (OpY) as their source operands. These are the same two source registers
// used by the vopd_mov_mov_same_src_bank MIR test, i.e. presumably a pair that
// would otherwise collide on the src0 register bank (confirm against the VOPD
// bank-constraint rules). The test passes when the assembler accepts the pair
// for this CPU and emits the encoding checked on the last line.
v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v1
// GFX1170: encoding: [0x05,0x01,0x10,0xca,0x01,0x01,0x02,0x02]