diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index bbf1e2be8695..34c9ba58fd6b 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -3967,9 +3967,10 @@ AMDGPUAsmParser::checkVOPDRegBankConstraints(const MCInst &Inst, bool AsVOPD3) { : MCRegister(); }; - // On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 + // On GFX1170+ if both OpX and OpY are V_MOV_B32 then OpY uses SRC2 // source-cache. bool SkipSrc = + Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 || Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 || Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 || Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx13 || diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp index 800d5bfa2314..b17cabf37d53 100644 --- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -142,9 +142,9 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2) return false; - // On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2 + // On GFX1170+ if both OpX and OpY are V_MOV_B32 then OpY uses SRC2 // source-cache. 
- bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 && + bool SkipSrc = (ST.hasGFX11_7Insts() || ST.hasGFX12Insts()) && MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 && MIY.getOpcode() == AMDGPU::V_MOV_B32_e32; bool AllowSameVGPR = ST.hasGFX1250Insts(); diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir index 3a2b0996edac..b5c25f862ba2 100644 --- a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir +++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX1100 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1170 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX1170 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefixes=PAIR,PAIR-GFX12 %s @@ -43,12 +45,19 @@ body: | ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_schedule - ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, 
implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_schedule + ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_schedule + ; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_schedule ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF @@ -80,12 +89,19 @@ body: | ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr3, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_fmamk - ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = 
V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_fmamk + ; PAIR-GFX1100: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_fmamk + ; PAIR-GFX1170: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1170 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_fmamk ; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF @@ -155,19 +171,33 @@ body: | ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo ; - ; PAIR-GFX11-LABEL: name: vopd_cndmask - ; PAIR-GFX11: liveins: $vcc_lo - ; PAIR-GFX11-NEXT: {{ $}} - ; PAIR-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $sgpr20 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, 
$vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo + ; PAIR-GFX1100-LABEL: name: vopd_cndmask + ; PAIR-GFX1100: liveins: $vcc_lo + ; PAIR-GFX1100-NEXT: {{ $}} + ; PAIR-GFX1100-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $sgpr20 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1100-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1100-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo + ; + ; PAIR-GFX1170-LABEL: name: vopd_cndmask + ; PAIR-GFX1170: liveins: $vcc_lo + ; PAIR-GFX1170-NEXT: {{ $}} + ; PAIR-GFX1170-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $sgpr20 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec + ; 
PAIR-GFX1170-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx1170 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1170-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1170-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo ; ; PAIR-GFX12-LABEL: name: vopd_cndmask ; PAIR-GFX12: liveins: $vcc_lo @@ -211,10 +241,15 @@ body: | ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec ; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_mov - ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_mov + ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_mov + ; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1170 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_mov ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF @@ -239,10 +274,15 @@ body: | ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr0, 
implicit $exec ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_mov_mov - ; PAIR-GFX11: $sgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $sgpr7 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_mov_mov + ; PAIR-GFX1100: $sgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $sgpr7 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_mov_mov + ; PAIR-GFX1170: $sgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $sgpr7 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_mov_mov ; PAIR-GFX12: $sgpr0 = IMPLICIT_DEF @@ -300,12 +340,19 @@ body: | ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_constants_inlinable - ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_constants_inlinable + ; PAIR-GFX1100: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr5, $vgpr2 = 
V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_constants_inlinable + ; PAIR-GFX1170: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1170 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_constants_inlinable ; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF @@ -338,12 +385,19 @@ body: | ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_constants_same - ; PAIR-GFX11: $vgpr2 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_constants_same + ; PAIR-GFX1100: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx11 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_constants_same + ; PAIR-GFX1170: 
$vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32_gfx1170 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_constants_same ; PAIR-GFX12: $vgpr2 = IMPLICIT_DEF @@ -373,10 +427,15 @@ body: | ; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 981467136, implicit $exec ; SCHED-NEXT: $vgpr2 = V_FMAAK_F32 killed $sgpr0, killed $vgpr0, 981467136, implicit $mode, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_mov_fmaak_constants_same - ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $sgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx11 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_mov_fmaak_constants_same + ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx11 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_mov_fmaak_constants_same + ; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32_gfx1170 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_mov_fmaak_constants_same ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF @@ -403,11 +462,17 @@ body: | ; SCHED-NEXT: DBG_VALUE $vgpr0, 0, 0 ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec ; 
- ; PAIR-GFX11-LABEL: name: vopd_debug - ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: DBG_VALUE $vgpr0, 0, 0 + ; PAIR-GFX1100-LABEL: name: vopd_debug + ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: DBG_VALUE $vgpr0, 0, 0 + ; + ; PAIR-GFX1170-LABEL: name: vopd_debug + ; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: DBG_VALUE $vgpr0, 0, 0 ; ; PAIR-GFX12-LABEL: name: vopd_debug ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF @@ -451,23 +516,41 @@ body: | ; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_schedule_unconstrained - ; PAIR-GFX11: liveins: $vcc_lo - ; PAIR-GFX11-NEXT: {{ $}} - ; PAIR-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, 
implicit $exec, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr12, $vgpr11 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_schedule_unconstrained + ; PAIR-GFX1100: liveins: $vcc_lo + ; PAIR-GFX1100-NEXT: {{ $}} + ; PAIR-GFX1100-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; 
PAIR-GFX1100-NEXT: $vgpr12, $vgpr11 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1100-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1100-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1100-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo + ; PAIR-GFX1100-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_schedule_unconstrained + ; PAIR-GFX1170: liveins: $vcc_lo + ; PAIR-GFX1170-NEXT: {{ $}} + ; PAIR-GFX1170-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr12, $vgpr11 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit 
$vcc_lo + ; PAIR-GFX1170-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1170-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx1170 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1170-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo + ; PAIR-GFX1170-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained ; PAIR-GFX12: liveins: $vcc_lo @@ -551,32 +634,59 @@ body: | ; SCHED-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; SCHED-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_schedule_unconstrained_2 - ; PAIR-GFX11: liveins: $vcc_lo - ; PAIR-GFX11-NEXT: {{ $}} - ; PAIR-GFX11-NEXT: $vgpr2 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr20 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = 
V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, 
implicit $mode, implicit $exec, implicit killed $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_schedule_unconstrained_2 + ; PAIR-GFX1100: liveins: $vcc_lo + ; PAIR-GFX1100-NEXT: {{ $}} + ; PAIR-GFX1100-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr20 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1100-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1100-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit 
$mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1100-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo + ; PAIR-GFX1100-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_schedule_unconstrained_2 + ; PAIR-GFX1170: liveins: $vcc_lo + ; PAIR-GFX1170-NEXT: {{ $}} + ; PAIR-GFX1170-NEXT: $vgpr2 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr20 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr0, 
implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx1170 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1170-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx1170 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1170-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx1170 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx1170 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; 
PAIR-GFX1170-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx1170 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX1170-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo + ; PAIR-GFX1170-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_schedule_unconstrained_2 ; PAIR-GFX12: liveins: $vcc_lo @@ -657,11 +767,17 @@ body: | ; SCHED-NEXT: $vgpr4 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec ; SCHED-NEXT: $vgpr5 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_mov_fixup - ; PAIR-GFX11: $vgpr0 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_mov_fixup + ; PAIR-GFX1100: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit 
$exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx11 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_mov_fixup + ; PAIR-GFX1170: $vgpr0 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1170 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1170-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_mov_fixup ; PAIR-GFX12: $vgpr0 = IMPLICIT_DEF @@ -726,11 +842,16 @@ body: | ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_mov_mov_same_src_bank - ; PAIR-GFX11: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr5 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_mov_mov_same_src_bank + ; PAIR-GFX1100: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr1, implicit $exec + ; PAIR-GFX1100-NEXT: $vgpr3 = V_MOV_B32_e32 killed $vgpr5, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_mov_mov_same_src_bank + ; PAIR-GFX1170: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr5 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1170 killed $vgpr1, killed $vgpr5, implicit $exec, implicit $exec, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_mov_mov_same_src_bank ; PAIR-GFX12: $vgpr1 
= IMPLICIT_DEF @@ -754,10 +875,15 @@ body: | ; SCHED-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr1, implicit $exec ; SCHED-NEXT: $vgpr1 = V_ADD_F32_e32 killed $vgpr3, $vgpr3, implicit $mode, implicit $exec ; - ; PAIR-GFX11-LABEL: name: vopd_combine_opy_overwrites_opx - ; PAIR-GFX11: $vgpr1 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF - ; PAIR-GFX11-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr1, killed $vgpr3, $vgpr3, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-GFX1100-LABEL: name: vopd_combine_opy_overwrites_opx + ; PAIR-GFX1100: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1100-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx11 killed $vgpr1, killed $vgpr3, $vgpr3, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; + ; PAIR-GFX1170-LABEL: name: vopd_combine_opy_overwrites_opx + ; PAIR-GFX1170: $vgpr1 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-GFX1170-NEXT: $vgpr0, $vgpr1 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32_gfx1170 killed $vgpr1, killed $vgpr3, $vgpr3, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; ; PAIR-GFX12-LABEL: name: vopd_combine_opy_overwrites_opx ; PAIR-GFX12: $vgpr1 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll index 2558dc390364..7148f3d61465 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll @@ -17,37 +17,19 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v11, v10 -; GFX1170-NEXT: v_mov_b32_e32 v12, 
v10 -; GFX1170-NEXT: v_mov_b32_e32 v13, v10 -; GFX1170-NEXT: v_mov_b32_e32 v14, v10 -; GFX1170-NEXT: v_mov_b32_e32 v15, v10 -; GFX1170-NEXT: v_mov_b32_e32 v16, v10 -; GFX1170-NEXT: v_mov_b32_e32 v17, v10 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] -; GFX1170-NEXT: s_clause 0x1 -; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 -; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 -; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 -; GFX12-NEXT: v_mov_b32_e32 v17, v10 -; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GCN-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GCN-NEXT: v_mov_b32_e32 v17, v10 +; GCN-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 
x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -69,37 +51,19 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v10, 0x40400000 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v11, v10 -; GFX1170-NEXT: v_mov_b32_e32 v12, v10 -; GFX1170-NEXT: v_mov_b32_e32 v13, v10 -; GFX1170-NEXT: v_mov_b32_e32 v14, v10 -; GFX1170-NEXT: v_mov_b32_e32 v15, v10 -; GFX1170-NEXT: v_mov_b32_e32 v16, v10 -; GFX1170-NEXT: v_mov_b32_e32 v17, v10 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] -; GFX1170-NEXT: s_clause 0x1 -; GFX1170-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 -; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 -; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 -; GFX12-NEXT: v_mov_b32_e32 v17, v10 -; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GCN-NEXT: v_dual_mov_b32 v13, 
v10 :: v_dual_mov_b32 v14, v10 +; GCN-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GCN-NEXT: v_mov_b32_e32 v17, v10 +; GCN-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -119,26 +83,15 @@ bb: } define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v10, 0x42004200 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v11, v10 -; GFX1170-NEXT: v_mov_b32_e32 v12, v10 -; GFX1170-NEXT: v_mov_b32_e32 v13, v10 -; GFX1170-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] -; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 -; GFX12-NEXT: v_mov_b32_e32 v13, v10 -; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v10, 0x42004200 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GCN-NEXT: v_mov_b32_e32 v13, v10 +; GCN-NEXT: 
v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> , i1 0) store <8 x half> %res, ptr addrspace(1) %out @@ -146,26 +99,15 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3f803f80 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v11, v10 -; GFX1170-NEXT: v_mov_b32_e32 v12, v10 -; GFX1170-NEXT: v_mov_b32_e32 v13, v10 -; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] -; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v10, 0x3f803f80 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 -; GFX12-NEXT: v_mov_b32_e32 v13, v10 -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v10, 0x3f803f80 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GCN-NEXT: v_mov_b32_e32 v13, v10 +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) store <8 x i16> %res, 
ptr addrspace(1) %out @@ -173,26 +115,15 @@ bb: } define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v11, v10 -; GFX1170-NEXT: v_mov_b32_e32 v12, v10 -; GFX1170-NEXT: v_mov_b32_e32 v13, v10 -; GFX1170-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] -; GFX1170-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 -; GFX12-NEXT: v_mov_b32_e32 v13, v10 -; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] -; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GCN-NEXT: v_mov_b32_e32 v13, v10 +; GCN-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GCN-NEXT: global_store_b128 v[8:9], v[10:13], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> , i1 0) store <8 x i16> %res, ptr addrspace(1) %out @@ -214,37 +145,19 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: 
test_wmma_i32_16x16x16_iu8_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v7, v6 -; GFX1170-NEXT: v_mov_b32_e32 v8, v6 -; GFX1170-NEXT: v_mov_b32_e32 v9, v6 -; GFX1170-NEXT: v_mov_b32_e32 v10, v6 -; GFX1170-NEXT: v_mov_b32_e32 v11, v6 -; GFX1170-NEXT: v_mov_b32_e32 v12, v6 -; GFX1170-NEXT: v_mov_b32_e32 v13, v6 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX1170-NEXT: s_clause 0x1 -; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 -; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 -; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 -; GFX12-NEXT: v_mov_b32_e32 v13, v6 -; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x80 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v13, v6 +; GCN-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off 
offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -266,37 +179,19 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v4, 0x80 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v5, v4 -; GFX1170-NEXT: v_mov_b32_e32 v6, v4 -; GFX1170-NEXT: v_mov_b32_e32 v7, v4 -; GFX1170-NEXT: v_mov_b32_e32 v8, v4 -; GFX1170-NEXT: v_mov_b32_e32 v9, v4 -; GFX1170-NEXT: v_mov_b32_e32 v10, v4 -; GFX1170-NEXT: v_mov_b32_e32 v11, v4 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] -; GFX1170-NEXT: s_clause 0x1 -; GFX1170-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 -; GFX1170-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 -; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 -; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4 -; GFX12-NEXT: v_mov_b32_e32 v11, v4 -; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 -; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v4, 0x80 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | 
instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GCN-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GCN-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4 +; GCN-NEXT: v_mov_b32_e32 v11, v4 +; GCN-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GCN-NEXT: global_store_b128 v[2:3], v[4:7], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -318,37 +213,19 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v7, v6 -; GFX1170-NEXT: v_mov_b32_e32 v8, v6 -; GFX1170-NEXT: v_mov_b32_e32 v9, v6 -; GFX1170-NEXT: v_mov_b32_e32 v10, v6 -; GFX1170-NEXT: v_mov_b32_e32 v11, v6 -; GFX1170-NEXT: v_mov_b32_e32 v12, v6 -; GFX1170-NEXT: v_mov_b32_e32 v13, v6 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX1170-NEXT: s_clause 0x1 -; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 -; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 -; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 -; GFX12-NEXT: v_mov_b32_e32 v13, v6 -; 
GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v13, v6 +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -370,37 +247,19 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v7, v6 -; GFX1170-NEXT: v_mov_b32_e32 v8, v6 -; GFX1170-NEXT: v_mov_b32_e32 v9, v6 -; GFX1170-NEXT: v_mov_b32_e32 v10, v6 -; GFX1170-NEXT: v_mov_b32_e32 v11, v6 -; GFX1170-NEXT: v_mov_b32_e32 v12, v6 -; GFX1170-NEXT: v_mov_b32_e32 v13, v6 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX1170-NEXT: s_clause 0x1 -; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX1170-NEXT: s_endpgm 
-; -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 -; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 -; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 -; GFX12-NEXT: v_mov_b32_e32 v13, v6 -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v13, v6 +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -422,37 +281,19 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v7, v6 -; GFX1170-NEXT: v_mov_b32_e32 v8, v6 -; GFX1170-NEXT: v_mov_b32_e32 v9, v6 -; 
GFX1170-NEXT: v_mov_b32_e32 v10, v6 -; GFX1170-NEXT: v_mov_b32_e32 v11, v6 -; GFX1170-NEXT: v_mov_b32_e32 v12, v6 -; GFX1170-NEXT: v_mov_b32_e32 v13, v6 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX1170-NEXT: s_clause 0x1 -; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 -; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 -; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 -; GFX12-NEXT: v_mov_b32_e32 v13, v6 -; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v13, v6 +; GCN-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ 
-474,37 +315,19 @@ bb: } define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v7, v6 -; GFX1170-NEXT: v_mov_b32_e32 v8, v6 -; GFX1170-NEXT: v_mov_b32_e32 v9, v6 -; GFX1170-NEXT: v_mov_b32_e32 v10, v6 -; GFX1170-NEXT: v_mov_b32_e32 v11, v6 -; GFX1170-NEXT: v_mov_b32_e32 v12, v6 -; GFX1170-NEXT: v_mov_b32_e32 v13, v6 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX1170-NEXT: s_clause 0x1 -; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 -; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 -; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 -; GFX12-NEXT: v_mov_b32_e32 v13, v6 -; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GCN-NEXT: v_dual_mov_b32 v11, v6 :: 
v_dual_mov_b32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v13, v6 +; GCN-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> ) store <8 x float> %res, ptr addrspace(1) %out @@ -526,37 +349,19 @@ bb: } define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { -; GFX1170-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: -; GFX1170: ; %bb.0: ; %bb -; GFX1170-NEXT: v_mov_b32_e32 v6, 0x80 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_mov_b32_e32 v7, v6 -; GFX1170-NEXT: v_mov_b32_e32 v8, v6 -; GFX1170-NEXT: v_mov_b32_e32 v9, v6 -; GFX1170-NEXT: v_mov_b32_e32 v10, v6 -; GFX1170-NEXT: v_mov_b32_e32 v11, v6 -; GFX1170-NEXT: v_mov_b32_e32 v12, v6 -; GFX1170-NEXT: v_mov_b32_e32 v13, v6 -; GFX1170-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1170-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX1170-NEXT: s_clause 0x1 -; GFX1170-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX1170-NEXT: global_store_b128 v[4:5], v[6:9], off -; GFX1170-NEXT: s_endpgm -; -; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: -; GFX12: ; %bb.0: ; %bb -; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 -; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 -; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 -; GFX12-NEXT: v_mov_b32_e32 v13, v6 -; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] -; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 -; GFX12-NEXT: 
global_store_b128 v[4:5], v[6:9], off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_mov_b32_e32 v6, 0x80 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GCN-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GCN-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GCN-NEXT: v_mov_b32_e32 v13, v6 +; GCN-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GCN-NEXT: s_clause 0x1 +; GCN-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GCN-NEXT: global_store_b128 v[4:5], v[6:9], off +; GCN-NEXT: s_endpgm bb: %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> , i1 0) store <8 x i32> %res, ptr addrspace(1) %out @@ -574,3 +379,6 @@ declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>) declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32>, <2 x i32>, <8 x float>) declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX1170: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/MC/AMDGPU/gfx1170_asm_features.s b/llvm/test/MC/AMDGPU/gfx1170_asm_features.s new file mode 100644 index 000000000000..3cc0cc9b74cf --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1170_asm_features.s @@ -0,0 +1,8 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1170 -show-encoding %s | FileCheck -check-prefix=GFX1170 %s + +//===----------------------------------------------------------------------===// +// A VOPD OpY mov_b32 instruction uses SRC2 source-cache if OpX is also mov_b32 +//===----------------------------------------------------------------------===// + +v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v1 +// GFX1170: encoding: [0x05,0x01,0x10,0xca,0x01,0x01,0x02,0x02]