diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 94cc1d90e0ca..e13c13913d4e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2045,38 +2045,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic( if (BaseOpcode->HasD16) MIB.addImm(IsD16 ? -1 : 0); - if (IsTexFail) { - // An image load instruction with TFE/LWE only conditionally writes to its - // result registers. Initialize them to zero so that we always get well - // defined result values. - assert(VDataOut && !VDataIn); - Register Tied = MRI->cloneVirtualRegister(VDataOut); - Register Zero = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::V_MOV_B32_e32), Zero) - .addImm(0); - auto Parts = TRI.getRegSplitParts(MRI->getRegClass(Tied), 4); - if (STI.usePRTStrictNull()) { - // With enable-prt-strict-null enabled, initialize all result registers to - // zero. - auto RegSeq = - BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied); - for (auto Sub : Parts) - RegSeq.addReg(Zero).addImm(Sub); - } else { - // With enable-prt-strict-null disabled, only initialize the extra TFE/LWE - // result register. - Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef); - auto RegSeq = - BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), Tied); - for (auto Sub : Parts.drop_back(1)) - RegSeq.addReg(Undef).addImm(Sub); - RegSeq.addReg(Zero).addImm(Parts.back()); - } - MIB.addReg(Tied, RegState::Implicit); - MIB->tieOperands(0, MIB->getNumOperands() - 1); - } - MI.eraseFromParent(); constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI); TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr); diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 4ae514ffcf78..273f92abf354 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -86,7 +86,7 @@ class BUF_Pseudo has_soffset = 1; bits<1> has_offset = 1; bits<1> has_slc = 1; - bits<1> tfe = ?; + bits<1> tfe = 0; bits<4> elements = 0; bits<1> has_sccb = 1; bits<1> sccb_value = 0; @@ -323,6 +323,7 @@ class MUBUF_Pseudo (MUBUFGetBaseOpcode.ret); let MUBUF = 1; let AsmMatchConverter = "cvtMubuf"; + let usesCustomInserter = 1; } class MUBUF_Real : @@ -3369,7 +3370,7 @@ def MUBUFInfoTable : GenericTable { let CppTypeName = "MUBUFInfo"; let Fields = [ "Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset", - "IsBufferInv" + "IsBufferInv", "tfe" ]; let PrimaryKey = ["Opcode"]; diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td index 595ef39ce03e..23e8be0d5e45 100644 --- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td +++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td @@ -210,6 +210,7 @@ class MIMG : MIMG_Base { let hasPostISelHook = 1; + let usesCustomInserter = 1; Instruction Opcode = !cast(NAME); MIMGBaseOpcode BaseOpcode; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d437f339a687..81a231f0cade 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -5410,6 +5410,11 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( return SplitBB; } default: + if (TII->isImage(MI) || TII->isMUBUF(MI)) { + if (!MI.mayStore()) + AddMemOpInit(MI); + return BB; + } return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } } @@ -15034,60 +15039,67 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, // result register that will be written in the case of a memory access failure. // The required code is also added to tie this init code to the result of the // img instruction. -void SITargetLowering::AddIMGInit(MachineInstr &MI) const { +void SITargetLowering::AddMemOpInit(MachineInstr &MI) const { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo(); MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); MachineBasicBlock &MBB = *MI.getParent(); - MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe); - MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe); - MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16); + int DstIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); + unsigned InitIdx = 0; - if (!TFE && !LWE) // intersect_ray + if (TII->isImage(MI)) { + MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe); + MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe); + MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16); + + if (!TFE && !LWE) // intersect_ray + return; + + unsigned TFEVal = TFE ? TFE->getImm() : 0; + unsigned LWEVal = LWE ? LWE->getImm() : 0; + unsigned D16Val = D16 ? D16->getImm() : 0; + + if (!TFEVal && !LWEVal) + return; + + // At least one of TFE or LWE are non-zero + // We have to insert a suitable initialization of the result value and + // tie this to the dest of the image instruction. + + // Calculate which dword we have to initialize to 0. + MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask); + + // check that dmask operand is found. + assert(MO_Dmask && "Expected dmask operand in instruction"); + + unsigned dmask = MO_Dmask->getImm(); + // Determine the number of active lanes taking into account the + // Gather4 special case + unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask); + + bool Packed = !Subtarget->hasUnpackedD16VMem(); + + InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1; + + // Abandon attempt if the dst size isn't large enough + // - this is in fact an error but this is picked up elsewhere and + // reported correctly. + uint32_t DstSize = + TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + if (DstSize < InitIdx) + return; + } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) { + InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; + } else { return; - - unsigned TFEVal = TFE ? TFE->getImm() : 0; - unsigned LWEVal = LWE ? LWE->getImm() : 0; - unsigned D16Val = D16 ? D16->getImm() : 0; - - if (!TFEVal && !LWEVal) - return; - - // At least one of TFE or LWE are non-zero - // We have to insert a suitable initialization of the result value and - // tie this to the dest of the image instruction. + } const DebugLoc &DL = MI.getDebugLoc(); - int DstIdx = - AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata); - - // Calculate which dword we have to initialize to 0. - MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask); - - // check that dmask operand is found. - assert(MO_Dmask && "Expected dmask operand in instruction"); - - unsigned dmask = MO_Dmask->getImm(); - // Determine the number of active lanes taking into account the - // Gather4 special case - unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask); - - bool Packed = !Subtarget->hasUnpackedD16VMem(); - - unsigned InitIdx = - D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1; - - // Abandon attempt if the dst size isn't large enough - // - this is in fact an error but this is picked up elsewhere and - // reported correctly. - uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; - if (DstSize < InitIdx) - return; - // Create a register for the initialization value. - Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); + Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg()); unsigned NewDst = 0; // Final initialized value will be in here // If PRTStrictNull feature is enabled (the default) then initialize @@ -15185,11 +15197,8 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, return; } - if (TII->isImage(MI)) { - if (!MI.mayStore()) - AddIMGInit(MI); + if (TII->isImage(MI)) TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr); - } } static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h index 89da4428e3ab..9856a2923d38 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -466,7 +466,7 @@ public: SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override; - void AddIMGInit(MachineInstr &MI) const; + void AddMemOpInit(MachineInstr &MI) const; void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 6d53f68ace70..a90dc32d396f 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -318,6 +318,7 @@ struct MUBUFInfo { bool has_srsrc; bool has_soffset; bool IsBufferInv; + bool tfe; }; struct MTBUFInfo { @@ -466,6 +467,11 @@ bool getMUBUFIsBufferInv(unsigned Opc) { return Info ? Info->IsBufferInv : false; } +bool getMUBUFTfe(unsigned Opc) { + const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc); + return Info ? Info->tfe : false; +} + bool getSMEMIsBuffer(unsigned Opc) { const SMInfo *Info = getSMEMOpcodeHelper(Opc); return Info ? Info->IsBuffer : false; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 29ac402d9535..f4f9a787100b 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -525,6 +525,9 @@ bool getMUBUFHasSoffset(unsigned Opc); LLVM_READONLY bool getMUBUFIsBufferInv(unsigned Opc); +LLVM_READONLY +bool getMUBUFTfe(unsigned Opc); + LLVM_READONLY bool getSMEMIsBuffer(unsigned Opc); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll index 686b849ff58f..06bd45a45cce 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s +; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; GFX8-LABEL: name: struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll index 9edc24554911..1e3f94a5e39c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s +; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted define amdgpu_ps float @struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { ; CHECK-LABEL: name: struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll index 1348315e72e7..7b1f55e7eeba 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll @@ -22,18 +22,36 @@ main_body: define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) { ; GFX11-LABEL: load_2dmsaa_both: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:4], v[0:2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x00,0x00,0x60,0x00] -; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x08,0x05] +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 ; encoding: [0x08,0x01,0x10,0xca,0x09,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02] +; GFX11-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] +; GFX11-NEXT: image_msaa_load v[0:4], v[5:7], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x05,0x00,0x60,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x05,0x04,0x08,0x00] +; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x08,0x04,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2dmsaa_both: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:4], [v0, v1, v2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x08,0x07] +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x05] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v9, v8 :: v_dual_mov_b32 v10, v8 ; encoding: [0x08,0x01,0x10,0xca,0x08,0x01,0x0a,0x09] +; GFX12-NEXT: v_dual_mov_b32 v11, v8 :: v_dual_mov_b32 v12, v8 ; encoding: [0x08,0x01,0x10,0xca,0x08,0x01,0x0c,0x0b] +; GFX12-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 ; encoding: [0x08,0x01,0x10,0xca,0x09,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x92,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02] +; GFX12-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e] +; GFX12-NEXT: image_msaa_load v[0:4], [v7, v6, v5], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x07,0x06,0x05,0x00] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x08,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32 2, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) @@ -63,18 +81,37 @@ main_body: define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { ; GFX11-LABEL: load_2darraymsaa_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:4], v[0:3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x00,0x00,0x20,0x00] -; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3 ; encoding: [0x80,0x00,0x10,0xca,0x03,0x01,0x08,0x09] +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, v9 ; encoding: [0x00,0x01,0x10,0xca,0x09,0x01,0x0a,0x05] +; GFX11-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; encoding: [0x09,0x03,0x1a,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf] +; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02] +; GFX11-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] +; GFX11-NEXT: image_msaa_load v[0:4], v[5:8], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x05,0x00,0x20,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x05,0x04,0x08,0x00] +; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x09,0x04,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2darraymsaa_tfe: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:4], [v0, v1, v2, v3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03] -; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e] +; GFX12-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v6, v2 ; encoding: [0x03,0x01,0x10,0xca,0x02,0x01,0x06,0x05] +; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v0 ; encoding: [0x01,0x01,0x10,0xca,0x00,0x01,0x08,0x07] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x23,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0a,0x0a] +; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0c,0x0c] +; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x92,0x01,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02] +; GFX12-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e] +; GFX12-NEXT: image_msaa_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x08,0x07,0x06,0x05] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x09,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32 8, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -155,18 +192,31 @@ main_body: define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) { ; GFX11-LABEL: load_2dmsaa_tfe_d16: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:2], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x00,0x00,0x20,0x00] -; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x03] +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x05] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; encoding: [0x06,0x01,0x10,0xca,0x07,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX11-NEXT: image_msaa_load v[0:2], v[3:5], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x03,0x00,0x20,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x03,0x02,0x08,0x00] +; GFX11-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x06,0x02,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2dmsaa_tfe_d16: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:2], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00] -; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x05] +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x03] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x92,0x00,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 ; encoding: [0x06,0x01,0x10,0xca,0x06,0x01,0x08,0x07] +; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; encoding: [0x06,0x01,0x10,0xca,0x07,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX12-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e] +; GFX12-NEXT: image_msaa_load v[0:2], [v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x05,0x04,0x03,0x00] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x06,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) @@ -196,18 +246,31 @@ main_body: define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { ; GFX11-LABEL: load_2darraymsaa_tfe_d16: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: image_msaa_load v[0:2], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x9c,0x01,0x62,0xf0,0x00,0x00,0x20,0x00] -; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x06] +; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x04] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e] +; GFX11-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e] +; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; encoding: [0x07,0x01,0x10,0xca,0x08,0x01,0x00,0x00] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX11-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e] +; GFX11-NEXT: image_msaa_load v[0:2], [v6, v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x9d,0x01,0x62,0xf0,0x06,0x00,0x20,0x00,0x05,0x04,0x03,0x00] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf] -; GFX11-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x03,0x02,0x08,0x00] +; GFX11-NEXT: global_store_b32 v7, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x07,0x02,0x08,0x00] ; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: load_2darraymsaa_tfe_d16: ; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: image_msaa_load v[0:2], [v0, v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03] -; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e] +; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x06] +; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x04] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x92,0x00,0x87,0xbf] +; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 ; encoding: [0x07,0x01,0x10,0xca,0x07,0x01,0x08,0x08] +; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; encoding: [0x07,0x01,0x10,0xca,0x08,0x01,0x00,0x00] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf] +; GFX12-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e] +; GFX12-NEXT: image_msaa_load v[0:2], [v6, v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x06,0x05,0x04,0x03] ; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf] -; GFX12-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00] +; GFX12-NEXT: global_store_b32 v7, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x07,0x00,0x00,0x00] ; GFX12-NEXT: ; return to shader part epilog main_body: %v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll index 00be32b06de0..ba3d306cc0cf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s +;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-enable-prt-strict-null -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s ;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s @@ -34,6 +35,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_clause 0x2 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v8, 0 @@ -75,6 +86,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_immoffs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_immoffs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -146,6 +164,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { ; GFX11-NEXT: v_add_f32_e32 v2, v10, v2 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_immoffs_large: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_movk_i32 s4, 0x7ffc +; NOPRT-NEXT: s_clause 0x1 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092 +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092 +; NOPRT-NEXT: s_mov_b32 s4, 0x8ffc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_add_f32_e32 v1, v1, v5 +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4 +; NOPRT-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1 +; NOPRT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; NOPRT-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3 +; NOPRT-NEXT: v_add_f32_e32 v2, v10, v2 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_immoffs_large: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v8, 0 @@ -196,6 +233,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_12bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_12bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -235,6 +279,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_13bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_13bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -274,6 +327,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_16bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_16bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -313,6 +375,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_23bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_voffset_large_23bit: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -352,6 +423,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_voffset_large_24bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-SDAG-LABEL: buffer_load_voffset_large_24bit: ; GFX12-SDAG: ; %bb.0: ; %main_body ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x800000 :: v_dual_mov_b32 v0, 0 @@ -389,6 +469,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_idx: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_idx: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen @@ -427,6 +513,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_ofs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_ofs: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 @@ -466,6 +561,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_ofs_imm: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_ofs_imm: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0 @@ -497,6 +601,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_both: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_both: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen @@ -529,6 +639,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_both_reversed: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v2, v0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_both_reversed: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v2, v0 @@ -562,6 +679,13 @@ define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_x: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_x: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -595,6 +719,13 @@ define amdgpu_ps float @buffer_load_x_i32(<4 x i32> inreg %rsrc) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_x_i32: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_x_i32: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -629,6 +760,13 @@ define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_xy: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_xy: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: v_mov_b32_e32 v0, 0 @@ -644,7 +782,12 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v4i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 -; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; GFX6-NEXT: v_mov_b32_e32 v7, 2 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 +; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s0, s2 @@ -658,7 +801,12 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v4i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 -; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; GFX8PLUS-NEXT: v_mov_b32_e32 v7, 2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 +; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; GFX8PLUS-NEXT: v_mov_b32_e32 v0, v6 @@ -667,22 +815,40 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; ; GFX11-LABEL: buffer_load_v4i32_tfe: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v7, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 +; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v4i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v2, 2 +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v4i32_tfe: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v7, 2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2 +; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: v_mov_b32_e32 v0, v6 ; GFX12-NEXT: ; return to shader part epilog - %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) + %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 2, i32 0, i32 0, i32 0) %data = extractvalue { <4 x i32>, i32 } %load, 0 store <4 x i32> %data, ptr addrspace(1) %out %status = extractvalue { <4 x i32>, i32 } %load, 1 @@ -694,6 +860,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v4f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -708,6 +878,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v4f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -718,15 +892,32 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v4f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 ; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v4f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v4f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2 ; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off @@ -744,6 +935,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v3i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -759,6 +953,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v3i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -769,15 +966,31 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v3i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v3i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v3i32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off @@ -795,6 +1008,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v3f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -810,6 +1026,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v3f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -820,15 +1039,31 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v3f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v3f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v3f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: v_mov_b32_e32 v5, v2 ; GFX12-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off @@ -846,6 +1081,9 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v2i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -860,6 +1098,8 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v2i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -870,15 +1110,29 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v2i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v2i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v2i32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -896,6 +1150,9 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX6-LABEL: buffer_load_v2f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -910,6 +1167,8 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX8PLUS-LABEL: buffer_load_v2f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -920,15 +1179,29 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa ; GFX11-LABEL: buffer_load_v2f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_v2f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_v2f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2 ; GFX12-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off @@ -946,6 +1219,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX6-LABEL: buffer_load_i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -960,6 +1234,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX8PLUS-LABEL: buffer_load_i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -970,15 +1245,28 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX11-LABEL: buffer_load_i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_i32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off @@ -996,6 +1284,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX6-LABEL: buffer_load_f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -1010,6 +1299,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX8PLUS-LABEL: buffer_load_f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -1020,15 +1310,28 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace ; GFX11-LABEL: buffer_load_f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog ; +; NOPRT-LABEL: buffer_load_f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog +; ; GFX12-LABEL: buffer_load_f32_tfe: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v3, v2 ; GFX12-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll index b0bd4e428ef2..c5202b84fa1e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll @@ -2,6 +2,7 @@ ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s +;RUN: llc < %s -mtriple=amdgcn -mattr=-enable-prt-strict-null -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) { ; GFX6-LABEL: buffer_load: @@ -31,6 +32,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrsp ; GFX11-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_clause 0x2 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0) %data_glc = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1) @@ -62,6 +73,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_immoffs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0) ret <4 x float> %data @@ -126,6 +144,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg) ; GFX11-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3 ; GFX11-NEXT: v_add_f32_e32 v2, v10, v2 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_immoffs_large: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v8, 0 +; NOPRT-NEXT: s_movk_i32 s4, 0x7ffc +; NOPRT-NEXT: s_clause 0x1 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092 +; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092 +; NOPRT-NEXT: s_mov_b32 s4, 0x8ffc +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_add_f32_e32 v1, v1, v5 +; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4 +; NOPRT-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1 +; NOPRT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; NOPRT-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3 +; NOPRT-NEXT: v_add_f32_e32 v2, v10, v2 +; NOPRT-NEXT: ; return to shader part epilog main_body: %d.0 = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 60, i32 0) %d.1 = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 32764, i32 0) @@ -156,6 +193,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_12bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 0, i32 0) ret <4 x float> %data @@ -188,6 +232,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_13bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8188, i32 0, i32 0) ret <4 x float> %data @@ -220,6 +273,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_16bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 65532, i32 0, i32 0) ret <4 x float> %data @@ -252,6 +314,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_23bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8388604, i32 0, i32 0) ret <4 x float> %data @@ -284,6 +355,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(ptr addrspace(8) i ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_voffset_large_24bit: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 16777212, i32 0, i32 0) ret <4 x float> %data @@ -307,6 +387,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_idx: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0) ret <4 x float> %data @@ -339,6 +425,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_ofs: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0) ret <4 x float> %data @@ -371,6 +466,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) { ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_ofs_imm: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: s_mov_b32 s4, 0 +; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60 +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %ofs = add i32 %1, 60 %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0) @@ -395,6 +499,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32) ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_both: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0) ret <4 x float> %data @@ -421,6 +531,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg, ; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_both_reversed: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v2, v0 +; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0) ret <4 x float> %data @@ -447,6 +564,13 @@ define amdgpu_ps float @buffer_load_x(ptr addrspace(8) inreg %rsrc) { ; GFX11-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_x: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %data @@ -473,6 +597,13 @@ define amdgpu_ps float @buffer_load_x_i32(ptr addrspace(8) inreg %rsrc) { ; GFX11-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_x_i32: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call i32 @llvm.amdgcn.struct.ptr.buffer.load.format.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %fdata = bitcast i32 %data to float @@ -500,6 +631,13 @@ define amdgpu_ps <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) { ; GFX11-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_xy: +; NOPRT: ; %bb.0: ; %main_body +; NOPRT-NEXT: v_mov_b32_e32 v0, 0 +; NOPRT-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: ; return to shader part epilog main_body: %data = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret <2 x float> %data @@ -509,6 +647,10 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v4i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -523,6 +665,10 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v4i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -533,11 +679,25 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v4i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 ; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v4i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x i32>, i32 } %load, 0 store <4 x i32> %data, ptr addrspace(1) %out @@ -550,6 +710,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v4f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 +; GFX6-NEXT: v_mov_b32_e32 v6, v2 ; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -564,6 +728,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v4f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5] @@ -574,11 +742,25 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v4f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 +; GFX11-NEXT: v_mov_b32_e32 v6, v2 ; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX11-NEXT: v_mov_b32_e32 v0, v6 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v4f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v6, 0 +; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v6 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <4 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <4 x float>, i32 } %load, 0 store <4 x float> %data, ptr addrspace(1) %out @@ -591,6 +773,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v3i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -606,6 +791,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v3i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -616,11 +804,24 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v3i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v3i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <3 x i32>, i32 } %load, 0 store <3 x i32> %data, ptr addrspace(1) %out @@ -633,6 +834,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v3f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -648,6 +852,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v3f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2 ; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4] @@ -658,11 +865,24 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v3f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 +; GFX11-NEXT: v_mov_b32_e32 v5, v2 ; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v3f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v5, 0 +; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v5 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <3 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <3 x float>, i32 } %load, 0 store <3 x float> %data, ptr addrspace(1) %out @@ -675,6 +895,9 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v2i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -689,6 +912,8 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v2i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -699,11 +924,23 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v2i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v2i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <2 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v2i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <2 x i32>, i32 } %load, 0 store <2 x i32> %data, ptr addrspace(1) %out @@ -716,6 +953,9 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX6-LABEL: buffer_load_v2f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 +; GFX6-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-NEXT: v_mov_b32_e32 v5, v2 ; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -730,6 +970,8 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX8PLUS-LABEL: buffer_load_v2f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 +; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2 ; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3] @@ -740,11 +982,23 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ; GFX11-LABEL: buffer_load_v2f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: v_mov_b32_e32 v4, v2 ; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: v_mov_b32_e32 v0, v4 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_v2f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v4, 0 +; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off +; NOPRT-NEXT: v_mov_b32_e32 v0, v4 +; NOPRT-NEXT: ; return to shader part epilog %load = call { <2 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v2f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { <2 x float>, i32 } %load, 0 store <2 x float> %data, ptr addrspace(1) %out @@ -757,6 +1011,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX6-LABEL: buffer_load_i32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -771,6 +1026,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX8PLUS-LABEL: buffer_load_i32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -781,11 +1037,22 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX11-LABEL: buffer_load_i32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_i32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog %load = call { i32, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { i32, i32 } %load, 0 store i32 %data, ptr addrspace(1) %out @@ -798,6 +1065,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX6-LABEL: buffer_load_f32_tfe: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, v2 ; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 @@ -812,6 +1080,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX8PLUS-LABEL: buffer_load_f32_tfe: ; GFX8PLUS: ; %bb.0: ; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0 +; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2 ; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX8PLUS-NEXT: s_waitcnt vmcnt(0) ; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2 @@ -822,11 +1091,22 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad ; GFX11-LABEL: buffer_load_f32_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 ; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[0:1], v2, off ; GFX11-NEXT: v_mov_b32_e32 v0, v3 ; GFX11-NEXT: ; return to shader part epilog +; +; NOPRT-LABEL: buffer_load_f32_tfe: +; NOPRT: ; %bb.0: +; NOPRT-NEXT: v_mov_b32_e32 v3, 0 +; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe +; NOPRT-NEXT: s_waitcnt vmcnt(0) +; NOPRT-NEXT: global_store_b32 v[0:1], v2, off +; NOPRT-NEXT: v_mov_b32_e32 v0, v3 +; NOPRT-NEXT: ; return to shader part epilog %load = call { float, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) %data = extractvalue { float, i32 } %load, 0 store float %data, ptr addrspace(1) %out