[AMDGPU] Support cluster load instructions for gfx1250 (#156548)

This commit is contained in:
Changpeng Fang 2025-09-02 16:34:20 -07:00 committed by GitHub
parent 0dc1b168a6
commit d3d1d8ff21
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
16 changed files with 503 additions and 0 deletions

View File

@ -667,6 +667,9 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_monitor_b128, "V4iV4i*1Ii", "nc", "g
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b32, "ii*0Ii", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b64, "V2iV2i*0Ii", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_flat_load_monitor_b128, "V4iV4i*0Ii", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b32, "ii*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b64, "V2iV2i*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_cluster_load_b128, "V4iV4i*1Iii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b8, "vc*1c*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b32, "vi*1i*3IiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_async_to_lds_b64, "vV2i*1V2i*3IiIi", "nc", "gfx1250-insts")

View File

@ -675,6 +675,27 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
return Builder.CreateCall(F, {Addr, Val});
}
case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
case AMDGPU::BI__builtin_amdgcn_cluster_load_b128: {
Intrinsic::ID IID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_cluster_load_b32:
IID = Intrinsic::amdgcn_cluster_load_b32;
break;
case AMDGPU::BI__builtin_amdgcn_cluster_load_b64:
IID = Intrinsic::amdgcn_cluster_load_b64;
break;
case AMDGPU::BI__builtin_amdgcn_cluster_load_b128:
IID = Intrinsic::amdgcn_cluster_load_b128;
break;
}
SmallVector<Value *, 3> Args;
for (int i = 0, e = E->getNumArgs(); i != e; ++i)
Args.push_back(EmitScalarExpr(E->getArg(i)));
llvm::Function *F = CGM.getIntrinsic(IID, {ConvertType(E->getType())});
return Builder.CreateCall(F, {Args});
}
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
// Should this have asan instrumentation?
return emitBuiltinWithOneOverloadedType<5>(*this, E,

View File

@ -0,0 +1,36 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
typedef int v2i __attribute__((ext_vector_type(2)));
typedef int v4i __attribute__((ext_vector_type(4)));
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b32(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.cluster.load.b32.i32(ptr addrspace(1) [[INPTR:%.*]], i32 10, i32 [[MASK:%.*]])
// CHECK-GFX1250-NEXT: ret i32 [[TMP0]]
//
int test_amdgcn_cluster_load_b32(global int* inptr, int mask)
{
return __builtin_amdgcn_cluster_load_b32(inptr, 10, mask);
}
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b64(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]], i32 22, i32 [[MASK:%.*]])
// CHECK-GFX1250-NEXT: ret <2 x i32> [[TMP0]]
//
v2i test_amdgcn_cluster_load_b64(global v2i* inptr, int mask)
{
return __builtin_amdgcn_cluster_load_b64(inptr, 22, mask);
}
// CHECK-GFX1250-LABEL: @test_amdgcn_cluster_load_b128(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32(ptr addrspace(1) [[INPTR:%.*]], i32 27, i32 [[MASK:%.*]])
// CHECK-GFX1250-NEXT: ret <4 x i32> [[TMP0]]
//
v4i test_amdgcn_cluster_load_b128(global v4i* inptr, int mask)
{
return __builtin_amdgcn_cluster_load_b128(inptr, 27, mask);
}

View File

@ -103,6 +103,13 @@ void test_amdgcn_load_monitor(global int* b32gaddr, global v2i* b64gaddr, global
*b128out = __builtin_amdgcn_flat_load_monitor_b128(b128faddr, cpol); // expected-error {{'__builtin_amdgcn_flat_load_monitor_b128' must be a constant integer}}
}
void test_amdgcn_cluster_load(global int* addr32, global v2i* addr64, global v4i* addr128, global int* b32out, global v2i* b64out, global v4i* b128out, int cpol, int mask)
{
*b32out = __builtin_amdgcn_cluster_load_b32(addr32, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b32' must be a constant integer}}
*b64out = __builtin_amdgcn_cluster_load_b64(addr64, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b64' must be a constant integer}}
*b128out = __builtin_amdgcn_cluster_load_b128(addr128, cpol, mask); // expected-error {{'__builtin_amdgcn_cluster_load_b128' must be a constant integer}}
}
void test_amdgcn_async_load_store_lds_offset(global char* gaddr8, global int *gaddr32, global v2i* gaddr64, global v4i* gaddr128, local char* laddr8,
local int *laddr32, local v2i* laddr64, local v4i* laddr128, int offset, int mask)
{

View File

@ -4113,6 +4113,23 @@ def int_amdgcn_tensor_load_to_lds_d2 :
def int_amdgcn_tensor_store_from_lds_d2 :
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;
class AMDGPUClusterLoad<LLVMType ptr_ty>:
Intrinsic<
[llvm_any_ty],
[ptr_ty,
llvm_i32_ty, // gfx12+ cachepolicy:
// bits [0-2] = th
// bits [3-4] = scope
llvm_i32_ty], // workgroup broadcast mask (in M0)
[IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, IntrWillReturn, IntrConvergent,
NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]
>;
def int_amdgcn_cluster_load_b32 : AMDGPUClusterLoad<global_ptr_ty>;
def int_amdgcn_cluster_load_b64 : AMDGPUClusterLoad<global_ptr_ty>;
def int_amdgcn_cluster_load_b128 : AMDGPUClusterLoad<global_ptr_ty>;
class AMDGPULoadMonitor<LLVMType ptr_ty>:
Intrinsic<
[llvm_any_ty],

View File

@ -128,6 +128,9 @@ def gi_global_saddr :
def gi_global_saddr_cpol :
GIComplexOperandMatcher<s64, "selectGlobalSAddrCPol">,
GIComplexPatternEquiv<GlobalSAddrCPol>;
def gi_global_saddr_cpol_m0 :
GIComplexOperandMatcher<s64, "selectGlobalSAddrCPolM0">,
GIComplexPatternEquiv<GlobalSAddrCPolM0>;
def gi_global_saddr_glc :
GIComplexOperandMatcher<s64, "selectGlobalSAddrGLC">,
GIComplexPatternEquiv<GlobalSAddrGLC>;

View File

@ -2089,6 +2089,23 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPol(SDNode *N, SDValue Addr,
return true;
}
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr,
SDValue &SAddr,
SDValue &VOffset,
SDValue &Offset,
SDValue &CPol) const {
bool ScaleOffset;
if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
return false;
// We are assuming CPol is second from last operand of the intrinsic.
auto PassedCPol =
N->getConstantOperandVal(N->getNumOperands() - 2) & ~AMDGPU::CPol::SCAL;
CPol = CurDAG->getTargetConstant(
(ScaleOffset ? AMDGPU::CPol::SCAL : 0) | PassedCPol, SDLoc(), MVT::i32);
return true;
}
bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,

View File

@ -171,6 +171,9 @@ private:
bool SelectGlobalSAddrCPol(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
bool SelectGlobalSAddrCPolM0(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;

View File

@ -5708,6 +5708,16 @@ AMDGPUInstructionSelector::selectGlobalSAddrCPol(MachineOperand &Root) const {
return selectGlobalSAddr(Root, PassedCPol);
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrCPolM0(MachineOperand &Root) const {
const MachineInstr &I = *Root.getParent();
// We are assuming CPol is second from last operand of the intrinsic.
auto PassedCPol =
I.getOperand(I.getNumOperands() - 2).getImm() & ~AMDGPU::CPol::SCAL;
return selectGlobalSAddr(Root, PassedCPol);
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const {
return selectGlobalSAddr(Root, AMDGPU::CPol::GLC);

View File

@ -256,6 +256,8 @@ private:
InstructionSelector::ComplexRendererFns
selectGlobalSAddrCPol(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrCPolM0(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrGLC(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddrNoIOffset(MachineOperand &Root) const;

View File

@ -3338,6 +3338,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 8); // M0
return;
case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b128: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(B, MI, 4); // M0
return;
}
case Intrinsic::amdgcn_s_sleep_var:
assert(OpdMapper.getVRegs(1).empty());
constrainOpWithReadfirstlane(B, MI, 1);
@ -5466,6 +5473,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
break;
}
case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b128: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
unsigned M0Bank =
getRegBankID(MI.getOperand(4).getReg(), MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[4] = AMDGPU::getValueMapping(M0Bank, 32);
break;
}
case Intrinsic::amdgcn_global_store_async_from_lds_b8:
case Intrinsic::amdgcn_global_store_async_from_lds_b32:
case Intrinsic::amdgcn_global_store_async_from_lds_b64:

View File

@ -15,6 +15,7 @@ let WantsRoot = true in {
def GlobalSAddr : ComplexPattern<iPTR, 4, "SelectGlobalSAddr", [], [], -10>;
def GlobalSAddrGLC : ComplexPattern<iPTR, 4, "SelectGlobalSAddrGLC", [], [], -10>;
def GlobalSAddrCPol : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPol", [], [], -10>;
def GlobalSAddrCPolM0 : ComplexPattern<iPTR, 4, "SelectGlobalSAddrCPolM0", [], [], -10>;
def ScratchSAddr : ComplexPattern<iPTR, 2, "SelectScratchSAddr", [], [], -10>;
def ScratchSVAddr : ComplexPattern<iPTR, 4, "SelectScratchSVAddr", [], [], -10>;
}
@ -1248,6 +1249,14 @@ defm GLOBAL_LOAD_MONITOR_B64 : FLAT_Global_Load_Pseudo <"global_load_monitor_b6
defm GLOBAL_LOAD_MONITOR_B128 : FLAT_Global_Load_Pseudo <"global_load_monitor_b128", VReg_128>;
} // End SubtargetPredicate = isGFX125xOnly
let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in {
let Uses = [M0, EXEC] in { // Use M0 for broadcast workgroup mask.
defm CLUSTER_LOAD_B32 : FLAT_Global_Load_Pseudo <"cluster_load_b32", VGPR_32>;
defm CLUSTER_LOAD_B64 : FLAT_Global_Load_Pseudo <"cluster_load_b64", VReg_64>;
defm CLUSTER_LOAD_B128 : FLAT_Global_Load_Pseudo <"cluster_load_b128", VReg_128>;
} // End Uses = [M0, EXEC]
} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32
let SubtargetPredicate = isGFX12Plus in {
let Uses = [EXEC, M0] in {
defm GLOBAL_LOAD_BLOCK : FLAT_Global_Load_Pseudo <"global_load_block", VReg_1024>;
@ -1394,6 +1403,16 @@ class FlatLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt>
(inst $saddr, $voffset, $offset, $cpol)
>;
class FlatLoadSignedPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol), M0)),
(inst $vaddr, $offset, $cpol)
>;
class GlobalLoadSaddrPat_M0 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalSAddrCPolM0 (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), (i32 timm), M0)),
(inst $saddr, $voffset, $offset, $cpol)
>;
class FlatLoadSignedPat_CPOL <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
(vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), (i32 timm:$cpol))),
(inst $vaddr, $offset, $cpol)
@ -1619,6 +1638,16 @@ multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueTyp
}
}
multiclass GlobalFLATLoadPats_M0<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat_M0 <inst, node, vt> {
let AddedComplexity = 10;
}
def : GlobalLoadSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
}
}
multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat_CPOL<inst, node, vt> {
let AddedComplexity = 10;
@ -2176,6 +2205,10 @@ let OtherPredicates = [isGFX125xOnly] in {
} // End SubtargetPredicate = isGFX125xOnly
let OtherPredicates = [isGFX1250Plus] in {
defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B32, int_amdgcn_cluster_load_b32, i32>;
defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B64, int_amdgcn_cluster_load_b64, v2i32>;
defm : GlobalFLATLoadPats_M0 <CLUSTER_LOAD_B128, int_amdgcn_cluster_load_b128, v4i32>;
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B8, int_amdgcn_global_load_async_to_lds_b8>;
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B32, int_amdgcn_global_load_async_to_lds_b32>;
defm : GlobalLoadLDSPats <GLOBAL_LOAD_ASYNC_TO_LDS_B64, int_amdgcn_global_load_async_to_lds_b64>;
@ -3470,6 +3503,10 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
defm CLUSTER_LOAD_B32 : VFLAT_Real_AllAddr_gfx1250<0x067>;
defm CLUSTER_LOAD_B64 : VFLAT_Real_AllAddr_gfx1250<0x068>;
defm CLUSTER_LOAD_B128 : VFLAT_Real_AllAddr_gfx1250<0x069>;
defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>;
defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>;
defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>;

View File

@ -1506,6 +1506,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_load_monitor_b32:
case Intrinsic::amdgcn_global_load_monitor_b64:
case Intrinsic::amdgcn_global_load_monitor_b128:
case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b128:
case Intrinsic::amdgcn_ds_load_tr6_b96:
case Intrinsic::amdgcn_ds_load_tr4_b64:
case Intrinsic::amdgcn_ds_load_tr8_b64:
@ -1636,6 +1639,9 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
Value *Ptr = nullptr;
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_cluster_load_b128:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_load_tr8_b64:

View File

@ -0,0 +1,183 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 -O3 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 -O3 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
declare i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask)
declare <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask)
declare <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1), i32 %cpol, i32 %mask)
define amdgpu_ps void @cluster_load_b32_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) {
; GFX1250-LABEL: cluster_load_b32_vaddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_readfirstlane_b32 s0, v4
; GFX1250-NEXT: s_mov_b32 m0, s0
; GFX1250-NEXT: cluster_load_b32 v0, v[0:1], off offset:32 th:TH_LOAD_NT
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask)
store i32 %val, ptr addrspace(1) %use
ret void
}
define amdgpu_ps void @cluster_load_b32_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: cluster_load_b32_vaddr_imm_mask:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_mov_b32 m0, 7
; GFX1250-NEXT: cluster_load_b32 v0, v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 10, i32 7)
store i32 %val, ptr addrspace(1) %use
ret void
}
define amdgpu_ps void @cluster_load_b32_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) {
; GFX1250-LABEL: cluster_load_b32_saddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_mov_b32 m0, s2
; GFX1250-NEXT: cluster_load_b32 v2, v2, s[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 22, i32 %mask)
store i32 %val, ptr addrspace(1) %use
ret void
}
define amdgpu_ps void @cluster_load_monitor_b32_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask, i32 %idx) {
; GFX1250-LABEL: cluster_load_monitor_b32_saddr_scale_offset:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_mov_b32 m0, s2
; GFX1250-NEXT: cluster_load_b32 v2, v2, s[0:1] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b32 v[0:1], v2, off
; GFX1250-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
%gep = getelementptr i32, ptr addrspace(1) %addr, i64 %idxprom
%val = call i32 @llvm.amdgcn.cluster.load.b32.i32.p1(ptr addrspace(1) %gep, i32 27, i32 inreg %mask)
store i32 %val, ptr addrspace(1) %use
ret void
}
define amdgpu_ps void @cluster_load_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) {
; GFX1250-LABEL: cluster_load_b64_vaddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_readfirstlane_b32 s0, v4
; GFX1250-NEXT: s_mov_b32 m0, s0
; GFX1250-NEXT: cluster_load_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_NT
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
define amdgpu_ps void @cluster_load_b64_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: cluster_load_b64_vaddr_imm_mask:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_mov_b32 m0, 0x10007
; GFX1250-NEXT: cluster_load_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b64 v[2:3], v[0:1], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 10, i32 65543)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
define amdgpu_ps void @cluster_load_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) {
; GFX1250-LABEL: cluster_load_b64_saddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_mov_b32 m0, s2
; GFX1250-NEXT: cluster_load_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 22, i32 %mask)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
define amdgpu_ps void @cluster_load_monitor_b64_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask, i32 %idx) {
; GFX1250-LABEL: cluster_load_monitor_b64_saddr_scale_offset:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_mov_b32 m0, s2
; GFX1250-NEXT: cluster_load_b64 v[2:3], v2, s[0:1] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX1250-NEXT: s_endpgm
entry:
%idxprom = sext i32 %idx to i64
%gep = getelementptr i64, ptr addrspace(1) %addr, i64 %idxprom
%val = call <2 x i32> @llvm.amdgcn.cluster.load.b64.v2i32.p1(ptr addrspace(1) %gep, i32 27, i32 inreg %mask)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
define amdgpu_ps void @cluster_load_b128_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) {
; GFX1250-LABEL: cluster_load_b128_vaddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_readfirstlane_b32 s0, v4
; GFX1250-NEXT: s_mov_b32 m0, s0
; GFX1250-NEXT: cluster_load_b128 v[4:7], v[0:1], off offset:32 th:TH_LOAD_NT
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1) %gep, i32 1, i32 %mask)
store <4 x i32> %val, ptr addrspace(1) %use
ret void
}
define amdgpu_ps void @cluster_load_b128_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX1250-LABEL: cluster_load_b128_vaddr_imm_mask:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: s_mov_b32 m0, 15
; GFX1250-NEXT: cluster_load_b128 v[4:7], v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b128 v[2:3], v[4:7], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1) %gep, i32 10, i32 15)
store <4 x i32> %val, ptr addrspace(1) %use
ret void
}
define amdgpu_ps void @cluster_load_b128_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) {
; GFX1250-LABEL: cluster_load_b128_saddr:
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
; GFX1250-NEXT: s_mov_b32 m0, s2
; GFX1250-NEXT: cluster_load_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX1250-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
%val = call <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1) %gep, i32 27, i32 inreg %mask)
store <4 x i32> %val, ptr addrspace(1) %use
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX1250-GISEL: {{.*}}
; GFX1250-SDAG: {{.*}}

View File

@ -3096,6 +3096,87 @@ flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset
// GFX1250: flat_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xec,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
// GFX1250: cluster_load_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b32 v1, v[2:3], off offset:64
// GFX1250: cluster_load_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b32 v1, v[2:3], off offset:-64
// GFX1250: cluster_load_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b32 v1, v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV
// GFX1250: cluster_load_b32 v1, v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b32 v1, v2, s[0:1] offset:64
// GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b32 v1, v2, s[0:1] offset:-64
// GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b32 v1, v2, s[4:5] offset:64 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
// GFX1250: cluster_load_b32 v1, v2, s[4:5] offset:64 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0xc0,0x19,0xee,0x01,0x00,0x3d,0x00,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b64 v[0:1], v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
// GFX1250: cluster_load_b64 v[0:1], v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b64 v[0:1], v[2:3], off offset:64
// GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b64 v[0:1], v[2:3], off offset:-64
// GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV
// GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b64 v[0:1], v2, s[0:1] offset:64
// GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b64 v[0:1], v2, s[0:1] offset:-64
// GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b64 v[2:3], v2, s[4:5] offset:64 scale_offset th:TH_LOAD_NT_HT scope:SCOPE_DEV
// GFX1250: cluster_load_b64 v[2:3], v2, s[4:5] offset:64 scale_offset th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x04,0x00,0x1a,0xee,0x02,0x00,0x69,0x00,0x02,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b128 v[0:3], v[4:5], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
// GFX1250: cluster_load_b128 v[0:3], v[4:5], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x3c,0x00,0x04,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b128 v[0:3], v[4:5], off offset:64
// GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b128 v[0:3], v[4:5], off offset:-64
// GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:-64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b128 v[0:3], v4, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV
// GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x68,0x00,0x04,0x00,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b128 v[0:3], v4, s[0:1] offset:64
// GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
cluster_load_b128 v[0:3], v4, s[0:1] offset:-64
// GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU
flat_atomic_add_f64 v[0:1], v[2:3] offset:4095
// GFX1250: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0x7c,0x40,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xff,0x0f,0x00]
// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU

View File

@ -3183,6 +3183,66 @@
# GFX1250: global_load_monitor_b64 v[2:3], v2, s[4:5] offset:64 scale_offset ; encoding: [0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00]
0x04,0x40,0x1c,0xee,0x02,0x00,0x01,0x00,0x02,0x40,0x00,0x00
# GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00
# GFX1250: cluster_load_b128 v[0:3], v[4:5], off offset:-64 ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
0x7c,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff
# GFX1250: cluster_load_b128 v[0:3], v[4:5], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x40,0x1a,0xee,0x00,0x00,0x3c,0x00,0x04,0x00,0x00,0x00]
0x7c,0x40,0x1a,0xee,0x00,0x00,0x3c,0x00,0x04,0x00,0x00,0x00
# GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00]
0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0x40,0x00,0x00
# GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff]
0x00,0x40,0x1a,0xee,0x00,0x00,0x00,0x00,0x04,0xc0,0xff,0xff
# GFX1250: cluster_load_b128 v[0:3], v4, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x40,0x1a,0xee,0x00,0x00,0x68,0x00,0x04,0x00,0x00,0x00]
0x00,0x40,0x1a,0xee,0x00,0x00,0x68,0x00,0x04,0x00,0x00,0x00
# GFX1250: cluster_load_b32 v1, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
# GFX1250: cluster_load_b32 v1, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
0x7c,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
# GFX1250: cluster_load_b32 v1, v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x19,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
0x7c,0xc0,0x19,0xee,0x01,0x00,0x3c,0x00,0x02,0x00,0x00,0x00
# GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0x40,0x00,0x00
# GFX1250: cluster_load_b32 v1, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
0x00,0xc0,0x19,0xee,0x01,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
# GFX1250: cluster_load_b32 v1, v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0xc0,0x19,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
0x00,0xc0,0x19,0xee,0x01,0x00,0x68,0x00,0x02,0x00,0x00,0x00
# GFX1250: cluster_load_b32 v1, v2, s[4:5] offset:64 scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x04,0xc0,0x19,0xee,0x01,0x00,0x3d,0x00,0x02,0x40,0x00,0x00]
0x04,0xc0,0x19,0xee,0x01,0x00,0x3d,0x00,0x02,0x40,0x00,0x00
# GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00
# GFX1250: cluster_load_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
0x7c,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
# GFX1250: cluster_load_b64 v[0:1], v[2:3], off th:TH_LOAD_BYPASS scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x1a,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00]
0x7c,0x00,0x1a,0xee,0x00,0x00,0x3c,0x00,0x02,0x00,0x00,0x00
# GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00]
0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0x40,0x00,0x00
# GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff]
0x00,0x00,0x1a,0xee,0x00,0x00,0x00,0x00,0x02,0xc0,0xff,0xff
# GFX1250: cluster_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x00,0x00,0x1a,0xee,0x00,0x00,0x68,0x00,0x02,0x00,0x00,0x00]
0x00,0x00,0x1a,0xee,0x00,0x00,0x68,0x00,0x02,0x00,0x00,0x00
# GFX1250: cluster_load_b64 v[2:3], v2, s[4:5] offset:64 scale_offset th:TH_LOAD_NT_HT scope:SCOPE_DEV ; encoding: [0x04,0x00,0x1a,0xee,0x02,0x00,0x69,0x00,0x02,0x40,0x00,0x00]
0x04,0x00,0x1a,0xee,0x02,0x00,0x69,0x00,0x02,0x40,0x00,0x00
# GFX1250: flat_atomic_add_f64 v[0:1], v[2:3] offset:4095 ; encoding: [0x7c,0x40,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xff,0x0f,0x00]
0x7c,0x40,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xff,0x0f,0x00