[AMDGPU] Support cluster load instructions for gfx1250 (#156548)
This commit is contained in:
parent
0dc1b168a6
commit
d3d1d8ff21
@ -667,6 +667,9 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "g
|
||||
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b32, "ii*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b64, "V2iV2i*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b128, "V4iV4i*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
|
||||
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")
|
||||
|
||||
@ -675,6 +675,27 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
|
||||
return Builder.CreateCall(F, {Addr, Val});
|
||||
}
|
||||
case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
|
||||
case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
|
||||
case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: {
|
||||
Intrinsic::ID IID;
|
||||
switch (BuiltinID) {
|
||||
case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
|
||||
IID = Intrinsic::amdgcn_cluster_load_b32;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
|
||||
IID = Intrinsic::amdgcn_cluster_load_b64;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_cluster_load_b128:
|
||||
IID = Intrinsic::amdgcn_cluster_load_b128;
|
||||
break;
|
||||
}
|
||||
SmallVector<Value *, 3> Args;
|
||||
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
|
||||
Args.push_back(EmitScalarExpr(E->getArg(i)));
|
||||
llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())});
|
||||
return Builder.CreateCall(F, {Args});
|
||||
}
|
||||
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
|
||||
// Should this have asan instrumentation?
|
||||
return emitBuiltinWithOneOverloadedType<5>(*this, E,
|
||||
|
||||
@ -0,0 +1,36 @@
|
||||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
|
||||
// REQUIRES: amdgpu-registered-target
|
||||
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
|
||||
|
||||
typedef int v2i __attribute__((ext_vector_type(2)));
|
||||
typedef int v4i __attribute__((ext_vector_type(4)));
|
||||
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b32(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.cluster.load.b32.i32(ptr addrspace(1) [[INPTR:%.*]], i32 10, i32 [[MASK:%.*]])
|
||||
// CHECK-GFX1250-NEXT: ret i32 [[TMP0]]
|
||||
//
|
||||
int test_amdgcn_cluster_load_b32(global int* inptr, int mask)
|
||||
{
|
||||
return __builtin_amdgcn_cluster_load_b32(inptr, 10, mask);
|
||||
}
|
||||
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b64(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]], i32 22, i32 [[MASK:%.*]])
|
||||
// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
|
||||
//
|
||||
v2i test_amdgcn_cluster_load_b64(global v2i* inptr, int mask)
|
||||
{
|
||||
return __builtin_amdgcn_cluster_load_b64(inptr, 22, mask);
|
||||
}
|
||||
|
||||
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b128(
|
||||
// CHECK-GFX1250-NEXT: entry:
|
||||
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32(ptr addrspace(1) [[INPTR:%.*]], i32 27, i32 [[MASK:%.*]])
|
||||
// CHECK-GFX1250-NEXT: ret <4 x i32> [[TMP0]]
|
||||
//
|
||||
v4i test_amdgcn_cluster_load_b128(global v4i* inptr, int mask)
|
||||
{
|
||||
return __builtin_amdgcn_cluster_load_b128(inptr, 27, mask);
|
||||
}
|
||||
@ -103,6 +103,13 @@ void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global
|
||||
*b128out = __builtin_amdgcn_flat_load_monitor_b128(b128faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b128' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_amdgcn_cluster_load(global int* addr32, global v2i* addr64, global v4i* addr128, global int* b32out, global v2i* b64out, global v4i* b128out, int cpol, int mask)
|
||||
{
|
||||
*b32out = __builtin_amdgcn_cluster_load_b32(addr32, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b32' must be a constant integer}}
|
||||
*b64out = __builtin_amdgcn_cluster_load_b64(addr64, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b64' must be a constant integer}}
|
||||
*b128out = __builtin_amdgcn_cluster_load_b128(addr128, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b128' must be a constant integer}}
|
||||
}
|
||||
|
||||
void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8,
|
||||
local int *laddr32, local v2i* laddr64, local v4i* laddr128, int offset, int mask)
|
||||
{
|
||||
|
||||
@ -4113,6 +4113,23 @@ def int_amdgcn_tensor_load_to_lds_d2 :
|
||||
def int_amdgcn_tensor_store_from_lds_d2 :
|
||||
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;
|
||||
|
||||
// Shared signature for the cluster load intrinsics defined from this class.
//   result : overloaded value type (llvm_any_ty).
//   arg 0  : pointer of type `ptr_ty`; marked read-only and no-capture, and the
//            intrinsic only reads memory reachable through its arguments.
//   arg 1  : i32 cache policy; must be an immediate (ImmArg).
//   arg 2  : i32 workgroup broadcast mask (passed in M0).
// The intrinsic is convergent and carries a memory operand (SDNPMemOperand).
class AMDGPUClusterLoad<LLVMType ptr_ty>:
|
||||
Intrinsic<
|
||||
[llvm_any_ty],
|
||||
[ptr_ty,
|
||||
llvm_i32_ty, // gfx12+ cachepolicy:
|
||||
// bits [0-2] = th
|
||||
// bits [3-4] = scope
|
||||
llvm_i32_ty], // workgroup broadcast mask (in M0)
|
||||
[IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, IntrWillReturn, IntrConvergent,
|
||||
NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree],
|
||||
"", [SDNPMemOperand]
|
||||
>;
|
||||
|
||||
def int_amdgcn_cluster_load_b32 : AMDGPUClusterLoad<global_ptr_ty>;
|
||||
def int_amdgcn_cluster_load_b64 : AMDGPUClusterLoad<global_ptr_ty>;
|
||||
def int_amdgcn_cluster_load_b128 : AMDGPUClusterLoad<global_ptr_ty>;
|
||||
|
||||
class AMDGPULoadMonitor<LLVMType ptr_ty>:
|
||||
Intrinsic<
|
||||
[llvm_any_ty],
|
||||
|
||||
@ -128,6 +128,9 @@ def gi_global_saddr :
|
||||
def gi_global_saddr_cpol :
|
||||
GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
|
||||
GIComplexPatternEquiv<GlobalSAddrCPol>;
|
||||
// GlobalISel equivalent of the GlobalSAddrCPolM0 complex pattern; the matching
// is implemented by the selector method named "selectGlobalSAddrCPolM0".
def gi_global_saddr_cpol_m0 :
|
||||
GIComplexOperandMatcher<s64, "selectGlobalSAddrCPolM0">,
|
||||
GIComplexPatternEquiv<GlobalSAddrCPolM0>;
|
||||
def gi_global_saddr_glc :
|
||||
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
|
||||
GIComplexPatternEquiv<GlobalSAddrGLC>;
|
||||
|
||||
@ -2089,6 +2089,23 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Variant of the SADDR + cache-policy selector for intrinsics that carry one
/// extra trailing operand after the cache policy (the cluster load patterns
/// pass M0 there).
///
/// Address matching is delegated to SelectGlobalSAddr, which fills \p SAddr,
/// \p VOffset and \p Offset and reports whether a scaled offset was used.
/// \p CPol receives the intrinsic's cache-policy immediate with its SCAL bit
/// replaced by that scaled-offset result.
///
/// \returns false (leaving \p CPol untouched) if SelectGlobalSAddr fails.
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
                                                 SDValue &SAddr,
                                                 SDValue &VOffset,
                                                 SDValue &Offset,
                                                 SDValue &CPol) const {
  bool UseScaledOffset;
  const bool Matched =
      SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, UseScaledOffset);
  if (!Matched)
    return false;

  // The cache policy is assumed to be the second-to-last operand of the
  // intrinsic node. Any SCAL bit supplied by the caller is discarded; the bit
  // is set only when the address match asked for a scaled offset.
  const unsigned CPolOpIdx = N->getNumOperands() - 2;
  auto UserCPol = N->getConstantOperandVal(CPolOpIdx) & ~AMDGPU::CPol::SCAL;
  if (UseScaledOffset)
    UserCPol |= AMDGPU::CPol::SCAL;

  CPol = CurDAG->getTargetConstant(UserCPol, SDLoc(), MVT::i32);
  return true;
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
|
||||
SDValue &SAddr, SDValue &VOffset,
|
||||
SDValue &Offset,
|
||||
|
||||
@ -171,6 +171,9 @@ private:
|
||||
bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
|
||||
SDValue &VOffset, SDValue &Offset,
|
||||
SDValue &CPol) const;
|
||||
bool SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr, SDValue &SAddr,
|
||||
SDValue &VOffset, SDValue &Offset,
|
||||
SDValue &CPol) const;
|
||||
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
|
||||
SDValue &VOffset, SDValue &Offset,
|
||||
SDValue &CPol) const;
|
||||
|
||||
@ -5708,6 +5708,16 @@ AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
|
||||
return selectGlobalSAddr(Root, PassedCPol);
|
||||
}
|
||||
|
||||
/// GlobalISel counterpart of the SADDR + cache-policy matcher for intrinsics
/// with one extra trailing operand after the cache policy (the cluster load
/// patterns pass M0 there).
///
/// Reads the cache-policy immediate from the instruction that owns \p Root,
/// clears its SCAL bit, and forwards the result to selectGlobalSAddr.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
  const MachineInstr &MI = *Root.getParent();

  // The cache policy is assumed to be the second-to-last operand of the
  // intrinsic instruction.
  const unsigned CPolOpIdx = MI.getNumOperands() - 2;
  const MachineOperand &CPolOp = MI.getOperand(CPolOpIdx);
  const auto UserCPol = CPolOp.getImm() & ~AMDGPU::CPol::SCAL;
  return selectGlobalSAddr(Root, UserCPol);
}
|
||||
|
||||
InstructionSelector::ComplexRendererFns
|
||||
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
|
||||
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);
|
||||
|
||||
@ -256,6 +256,8 @@ private:
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectGlobalSAddrCPol(MachineOperand &Root) const;
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectGlobalSAddrCPolM0(MachineOperand &Root) const;
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectGlobalSAddrGLC(MachineOperand &Root) const;
|
||||
InstructionSelector::ComplexRendererFns
|
||||
selectGlobalSAddrNoIOffset(MachineOperand &Root) const;
|
||||
|
||||
@ -3338,6 +3338,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
|
||||
applyDefaultMapping(OpdMapper);
|
||||
constrainOpWithReadfirstlane(B, MI, 8); // M0
|
||||
return;
|
||||
case Intrinsic::amdgcn_cluster_load_b32:
|
||||
case Intrinsic::amdgcn_cluster_load_b64:
|
||||
case Intrinsic::amdgcn_cluster_load_b128: {
|
||||
applyDefaultMapping(OpdMapper);
|
||||
constrainOpWithReadfirstlane(B, MI, 4); // M0
|
||||
return;
|
||||
}
|
||||
case Intrinsic::amdgcn_s_sleep_var:
|
||||
assert(OpdMapper.getVRegs(1).empty());
|
||||
constrainOpWithReadfirstlane(B, MI, 1);
|
||||
@ -5466,6 +5473,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_cluster_load_b32:
|
||||
case Intrinsic::amdgcn_cluster_load_b64:
|
||||
case Intrinsic::amdgcn_cluster_load_b128: {
|
||||
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
|
||||
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
|
||||
unsigned M0Bank =
|
||||
getRegBankID(MI.getOperand(4).getReg(), MRI, AMDGPU::SGPRRegBankID);
|
||||
OpdsMapping[4] = AMDGPU::getValueMapping(M0Bank, 32);
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
|
||||
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
|
||||
case Intrinsic::amdgcn_global_store_async_from_lds_b64:
|
||||
|
||||
@ -15,6 +15,7 @@ let WantsRoot = true in {
|
||||
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
|
||||
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
|
||||
def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
|
||||
def GlobalSAddrCPolM0 : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPolM0", [], [], -10>;
|
||||
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
|
||||
def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
|
||||
}
|
||||
@ -1248,6 +1249,14 @@ defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b6
|
||||
defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>;
|
||||
} // End SubtargetPredicate = isGFX125xOnly
|
||||
|
||||
let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in {
|
||||
let Uses = [M0, EXEC] in { // Use M0 for broadcast workgroup mask.
|
||||
defm CLUSTER_LOAD_B32 : FLAT_Global_Load_Pseudo <"cluster_load_b32", VGPR_32>;
|
||||
defm CLUSTER_LOAD_B64 : FLAT_Global_Load_Pseudo <"cluster_load_b64", VReg_64>;
|
||||
defm CLUSTER_LOAD_B128 : FLAT_Global_Load_Pseudo <"cluster_load_b128", VReg_128>;
|
||||
} // End Uses = [M0, EXEC]
|
||||
} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32
|
||||
|
||||
let SubtargetPredicate = isGFX12Plus in {
|
||||
let Uses = [EXEC, M0] in {
|
||||
defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
|
||||
@ -1394,6 +1403,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
|
||||
(inst $saddr, $voffset, $offset, $cpol)
|
||||
>;
|
||||
|
||||
// VGPR-address selection pattern for a load `node` whose operands are
// (address, immediate cache policy, M0). The address is split by GlobalOffset
// into $vaddr and $offset; the cache policy is forwarded as $cpol. M0 is
// matched in the source pattern but is not an explicit operand of `inst`.
class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
|
||||
(vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol), M0)),
|
||||
(inst $vaddr, $offset, $cpol)
|
||||
>;
|
||||
|
||||
// SGPR-base (SADDR) selection pattern for a load `node` whose operands are
// (address, immediate cache policy, M0). The GlobalSAddrCPolM0 complex pattern
// yields $saddr, $voffset, $offset and $cpol. The node's own cache-policy
// immediate is matched unnamed, so the $cpol given to `inst` is the one
// produced by the complex pattern.
class GlobalLoadSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
|
||||
(vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)),
|
||||
(inst $saddr, $voffset, $offset, $cpol)
|
||||
>;
|
||||
|
||||
class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
|
||||
(vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))),
|
||||
(inst $vaddr, $offset, $cpol)
|
||||
@ -1619,6 +1638,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
|
||||
}
|
||||
}
|
||||
|
||||
// Instantiates both selection patterns for a global load that also takes M0:
//   - FlatLoadSignedPat_M0 on `inst` itself (AddedComplexity 10), and
//   - GlobalLoadSaddrPat_M0 on the pseudo named `inst` # "_SADDR"
//     (AddedComplexity 11).
// NOTE(review): the higher complexity on the SADDR form presumably makes it
// the preferred match when both apply -- confirm against sibling multiclasses.
multiclass GlobalFLATLoadPats_M0<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
|
||||
def : FlatLoadSignedPat_M0 <inst, node, vt> {
|
||||
let AddedComplexity = 10;
|
||||
}
|
||||
|
||||
def : GlobalLoadSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
|
||||
let AddedComplexity = 11;
|
||||
}
|
||||
}
|
||||
|
||||
multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
|
||||
def : FlatLoadSignedPat_CPOL<inst, node, vt> {
|
||||
let AddedComplexity = 10;
|
||||
@ -2176,6 +2205,10 @@ let OtherPredicates = [isGFX125xOnly] in {
|
||||
} // End SubtargetPredicate = isGFX125xOnly
|
||||
|
||||
let OtherPredicates = [isGFX1250Plus] in {
|
||||
defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B32, int_amdgcn_cluster_load_b32, i32>;
|
||||
defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B64, int_amdgcn_cluster_load_b64, v2i32>;
|
||||
defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B128, int_amdgcn_cluster_load_b128, v4i32>;
|
||||
|
||||
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
|
||||
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
|
||||
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
|
||||
@ -3470,6 +3503,10 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
|
||||
defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
|
||||
defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
|
||||
|
||||
defm CLUSTER_LOAD_B32 : VFLAT_Real_AllAddr_gfx1250<0x067>;
|
||||
defm CLUSTER_LOAD_B64 : VFLAT_Real_AllAddr_gfx1250<0x068>;
|
||||
defm CLUSTER_LOAD_B128 : VFLAT_Real_AllAddr_gfx1250<0x069>;
|
||||
|
||||
defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>;
|
||||
defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>;
|
||||
defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>;
|
||||
|
||||
@ -1506,6 +1506,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
||||
case Intrinsic::amdgcn_global_load_monitor_b32:
|
||||
case Intrinsic::amdgcn_global_load_monitor_b64:
|
||||
case Intrinsic::amdgcn_global_load_monitor_b128:
|
||||
case Intrinsic::amdgcn_cluster_load_b32:
|
||||
case Intrinsic::amdgcn_cluster_load_b64:
|
||||
case Intrinsic::amdgcn_cluster_load_b128:
|
||||
case Intrinsic::amdgcn_ds_load_tr6_b96:
|
||||
case Intrinsic::amdgcn_ds_load_tr4_b64:
|
||||
case Intrinsic::amdgcn_ds_load_tr8_b64:
|
||||
@ -1636,6 +1639,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
|
||||
Value *Ptr = nullptr;
|
||||
switch (II->getIntrinsicID()) {
|
||||
case Intrinsic::amdgcn_atomic_cond_sub_u32:
|
||||
case Intrinsic::amdgcn_cluster_load_b128:
|
||||
case Intrinsic::amdgcn_cluster_load_b64:
|
||||
case Intrinsic::amdgcn_cluster_load_b32:
|
||||
case Intrinsic::amdgcn_ds_append:
|
||||
case Intrinsic::amdgcn_ds_consume:
|
||||
case Intrinsic::amdgcn_ds_load_tr8_b64:
|
||||
|
||||
183
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll
Normal file
183
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll
Normal file
@ -0,0 +1,183 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -O3 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -O3 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
|
||||
|
||||
declare i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask)
|
||||
declare <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask)
|
||||
declare <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask)
|
||||
|
||||
define amdgpu_ps void @cluster_load_b32_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) {
|
||||
; GFX1250-LABEL: cluster_load_b32_vaddr:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: v_readfirstlane_b32 s0, v4
|
||||
; GFX1250-NEXT: s_mov_b32 m0, s0
|
||||
; GFX1250-NEXT: cluster_load_b32 v0, v[0:1], off offset:32 th:TH_LOAD_NT
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
|
||||
%val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask)
|
||||
store i32 %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @cluster_load_b32_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
|
||||
; GFX1250-LABEL: cluster_load_b32_vaddr_imm_mask:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_mov_b32 m0, 7
|
||||
; GFX1250-NEXT: cluster_load_b32 v0, v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
|
||||
%val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 10, i32 7)
|
||||
store i32 %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @cluster_load_b32_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) {
|
||||
; GFX1250-LABEL: cluster_load_b32_saddr:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1250-NEXT: s_mov_b32 m0, s2
|
||||
; GFX1250-NEXT: cluster_load_b32 v2, v2, s[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
|
||||
%val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 22, i32 %mask)
|
||||
store i32 %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @cluster_load_monitor_b32_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask, i32 %idx) {
|
||||
; GFX1250-LABEL: cluster_load_monitor_b32_saddr_scale_offset:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_mov_b32 m0, s2
|
||||
; GFX1250-NEXT: cluster_load_b32 v2, v2, s[0:1] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%idxprom = sext i32 %idx to i64
|
||||
%gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom
|
||||
%val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 27, i32 inreg %mask)
|
||||
store i32 %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @cluster_load_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) {
|
||||
; GFX1250-LABEL: cluster_load_b64_vaddr:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: v_readfirstlane_b32 s0, v4
|
||||
; GFX1250-NEXT: s_mov_b32 m0, s0
|
||||
; GFX1250-NEXT: cluster_load_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_NT
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
|
||||
%val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask)
|
||||
store <2 x i32> %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @cluster_load_b64_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
|
||||
; GFX1250-LABEL: cluster_load_b64_vaddr_imm_mask:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_mov_b32 m0, 0x10007
|
||||
; GFX1250-NEXT: cluster_load_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
|
||||
%val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 10, i32 65543)
|
||||
store <2 x i32> %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @cluster_load_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) {
|
||||
; GFX1250-LABEL: cluster_load_b64_saddr:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1250-NEXT: s_mov_b32 m0, s2
|
||||
; GFX1250-NEXT: cluster_load_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
|
||||
%val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 22, i32 %mask)
|
||||
store <2 x i32> %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @cluster_load_monitor_b64_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask, i32 %idx) {
|
||||
; GFX1250-LABEL: cluster_load_monitor_b64_saddr_scale_offset:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_mov_b32 m0, s2
|
||||
; GFX1250-NEXT: cluster_load_b64 v[2:3], v2, s[0:1] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%idxprom = sext i32 %idx to i64
|
||||
%gep = getelementptr i64, ptr addrspace(1) %addr, i64 %idxprom
|
||||
%val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 27, i32 inreg %mask)
|
||||
store <2 x i32> %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @cluster_load_b128_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) {
|
||||
; GFX1250-LABEL: cluster_load_b128_vaddr:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: v_readfirstlane_b32 s0, v4
|
||||
; GFX1250-NEXT: s_mov_b32 m0, s0
|
||||
; GFX1250-NEXT: cluster_load_b128 v[4:7], v[0:1], off offset:32 th:TH_LOAD_NT
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
|
||||
%val = call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask)
|
||||
store <4 x i32> %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @cluster_load_b128_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
|
||||
; GFX1250-LABEL: cluster_load_b128_vaddr_imm_mask:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_mov_b32 m0, 15
|
||||
; GFX1250-NEXT: cluster_load_b128 v[4:7], v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
|
||||
%val = call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1) %gep, i32 10, i32 15)
|
||||
store <4 x i32> %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @cluster_load_b128_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) {
|
||||
; GFX1250-LABEL: cluster_load_b128_saddr:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX1250-NEXT: s_mov_b32 m0, s2
|
||||
; GFX1250-NEXT: cluster_load_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
|
||||
%val = call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1) %gep, i32 27, i32 inreg %mask)
|
||||
store <4 x i32> %val, ptr addrspace(1) %use
|
||||
ret void
|
||||
}
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GFX1250-GISEL: {{.*}}
|
||||
; GFX1250-SDAG: {{.*}}
|
||||
@ -3096,6 +3096,87 @@ flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset
|
||||
// GFX1250: flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
// GFX1250: cluster_load_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b32 v1, v[2:3], off offset:64
|
||||
// GFX1250: cluster_load_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b32 v1, v[2:3], off offset:-64
|
||||
// GFX1250: cluster_load_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b32 v1, v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV
|
||||
// GFX1250: cluster_load_b32 v1, v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b32 v1, v2, s[0:1] offset:64
|
||||
// GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b32 v1, v2, s[0:1] offset:-64
|
||||
// GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b32 v1, v2, s[4:5] offset:64 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
// GFX1250: cluster_load_b32 v1, v2, s[4:5] offset:64 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0xc0,0x19,0xee,0x01,0x00,0x3d,0x00,0x02,0x40,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b64 v[0:1], v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
// GFX1250: cluster_load_b64 v[0:1], v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b64 v[0:1], v[2:3], off offset:64
|
||||
// GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b64 v[0:1], v[2:3], off offset:-64
|
||||
// GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV
|
||||
// GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b64 v[0:1], v2, s[0:1] offset:64
|
||||
// GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b64 v[0:1], v2, s[0:1] offset:-64
|
||||
// GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b64 v[2:3], v2, s[4:5] offset:64 scale_offset th:TH_LOAD_NT_HT scope:SCOPE_DEV
|
||||
// GFX1250: cluster_load_b64 v[2:3], v2, s[4:5] offset:64 scale_offset th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x04,0x00,0x1a,0xee,0x02,0x00,0x69,0x00,0x02,0x40,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b128 v[0:3], v[4:5], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
// GFX1250: cluster_load_b128 v[0:3], v[4:5], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x3c,0x00,0x04,0x00,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b128 v[0:3], v[4:5], off offset:64
|
||||
// GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b128 v[0:3], v[4:5], off offset:-64
|
||||
// GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:-64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b128 v[0:3], v4, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV
|
||||
// GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x68,0x00,0x04,0x00,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b128 v[0:3], v4, s[0:1] offset:64
|
||||
// GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
cluster_load_b128 v[0:3], v4, s[0:1] offset:-64
|
||||
// GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
|
||||
flat_atomic_add_f64 v[0:1], v[2:3] offset:4095
|
||||
// GFX1250: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0x7c,0x40,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xff,0x0f,0x00]
|
||||
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
|
||||
|
||||
@ -3183,6 +3183,66 @@
|
||||
# GFX1250: global_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
|
||||
0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
|
||||
0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:-64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
|
||||
0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff
|
||||
|
||||
# GFX1250: cluster_load_b128 v[0:3], v[4:5], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x3c,0x00,0x04,0x00,0x00,0x00]
|
||||
0x7c,0x40,0x1a,0xee,0x00,0x00,0x3c,0x00,0x04,0x00,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
|
||||
0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
|
||||
0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff
|
||||
|
||||
# GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x68,0x00,0x04,0x00,0x00,0x00]
|
||||
0x00,0x40,0x1a,0xee,0x00,0x00,0x68,0x00,0x04,0x00,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
|
||||
0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
|
||||
0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
|
||||
|
||||
# GFX1250: cluster_load_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
|
||||
0x7c,0xc0,0x19,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
|
||||
0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
|
||||
0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
|
||||
|
||||
# GFX1250: cluster_load_b32 v1, v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
|
||||
0x00,0xc0,0x19,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b32 v1, v2, s[4:5] offset:64 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0xc0,0x19,0xee,0x01,0x00,0x3d,0x00,0x02,0x40,0x00,0x00]
|
||||
0x04,0xc0,0x19,0xee,0x01,0x00,0x3d,0x00,0x02,0x40,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
|
||||
0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
|
||||
0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
|
||||
|
||||
# GFX1250: cluster_load_b64 v[0:1], v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
|
||||
0x7c,0x00,0x1a,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
|
||||
0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
|
||||
0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
|
||||
|
||||
# GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
|
||||
0x00,0x00,0x1a,0xee,0x00,0x00,0x68,0x00,0x02,0x00,0x00,0x00
|
||||
|
||||
# GFX1250: cluster_load_b64 v[2:3], v2, s[4:5] offset:64 scale_offset th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x04,0x00,0x1a,0xee,0x02,0x00,0x69,0x00,0x02,0x40,0x00,0x00]
|
||||
0x04,0x00,0x1a,0xee,0x02,0x00,0x69,0x00,0x02,0x40,0x00,0x00
|
||||
|
||||
# GFX1250: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0x7c,0x40,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xff,0x0f,0x00]
|
||||
0x7c,0x40,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xff,0x0f,0x00
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user