[AMDGPU] gfx1250 v_permlane_* instructions (#151749)

This commit is contained in:
Stanislav Mekhanoshin 2025-08-01 16:14:19 -07:00 committed by GitHub
parent c3b5f1cfe9
commit 33abf05af4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 1064 additions and 8 deletions

View File

@ -721,6 +721,12 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_fp8_f32_e5m3, "ifiiIi", "nc", "fp8e5m3-in
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts")
// GFX1250 permlane builtins. All are gated on the gfx1250-insts and
// wavefrontsize32 features. The type strings "iiii" / "iii" declare
// int(int, int, int) / int(int, int); "nc" marks them nothrow and const.
TARGET_BUILTIN(__builtin_amdgcn_permlane_bcast, "iiii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_permlane_up, "iiii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_permlane_down, "iiii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_permlane_xor, "iiii", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_permlane_idx_gen, "iii", "nc", "gfx1250-insts,wavefrontsize32")
// GFX1250 WMMA builtins
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")

View File

@ -744,6 +744,132 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) {
*out = __builtin_amdgcn_permlane16_swap(old, src, false, true);
}
// CHECK-LABEL: @test_permlane_bcast(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_permlane_bcast(global uint* out, uint src0, uint src1, uint src2) {
// The builtin is expected to lower to one @llvm.amdgcn.permlane.bcast call
// whose result is stored through `out` (see the FileCheck assertions above).
*out = __builtin_amdgcn_permlane_bcast(src0, src1, src2);
}
// CHECK-LABEL: @test_permlane_down(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_permlane_down(global uint* out, uint src0, uint src1, uint src2) {
// The builtin is expected to lower to one @llvm.amdgcn.permlane.down call
// whose result is stored through `out` (see the FileCheck assertions above).
*out = __builtin_amdgcn_permlane_down(src0, src1, src2);
}
// CHECK-LABEL: @test_permlane_up(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_permlane_up(global uint* out, uint src0, uint src1, uint src2) {
// The builtin is expected to lower to one @llvm.amdgcn.permlane.up call
// whose result is stored through `out` (see the FileCheck assertions above).
*out = __builtin_amdgcn_permlane_up(src0, src1, src2);
}
// CHECK-LABEL: @test_permlane_xor(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC2_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
// CHECK-NEXT: [[SRC2_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC2_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC2:%.*]], ptr [[SRC2_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SRC2_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4
// CHECK-NEXT: ret void
//
void test_permlane_xor(global uint* out, uint src0, uint src1, uint src2) {
// The builtin is expected to lower to one @llvm.amdgcn.permlane.xor call
// whose result is stored through `out` (see the FileCheck assertions above).
*out = __builtin_amdgcn_permlane_xor(src0, src1, src2);
}
// CHECK-LABEL: @test_permlane_idx_gen(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[SRC0_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[SRC1_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[SRC0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC0_ADDR]] to ptr
// CHECK-NEXT: [[SRC1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC1_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[SRC0:%.*]], ptr [[SRC0_ADDR_ASCAST]], align 4
// CHECK-NEXT: store i32 [[SRC1:%.*]], ptr [[SRC1_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC0_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC1_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.permlane.idx.gen(i32 [[TMP0]], i32 [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[TMP3]], align 4
// CHECK-NEXT: ret void
//
void test_permlane_idx_gen(global uint* out, uint src0, uint src1) {
// Two-operand form: expected to lower to one @llvm.amdgcn.permlane.idx.gen
// call whose result is stored through `out` (see the FileCheck assertions above).
*out = __builtin_amdgcn_permlane_idx_gen(src0, src1);
}
// CHECK-LABEL: @test_prefetch(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[FPTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)

View File

@ -3656,6 +3656,36 @@ def int_amdgcn_sat_pk4_i4_i8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_i4_i8">,
def int_amdgcn_sat_pk4_u4_u8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_u4_u8">,
DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>;
// llvm.amdgcn.permlane.bcast <src0> <src1> <src2>
// Takes three i32 operands and returns an i32. Convergent, reads/writes no
// memory. Exposed to Clang as __builtin_amdgcn_permlane_bcast.
def int_amdgcn_permlane_bcast : ClangBuiltin<"__builtin_amdgcn_permlane_bcast">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// llvm.amdgcn.permlane.up <src0> <src1> <src2>
// Takes three i32 operands and returns an i32. Convergent, reads/writes no
// memory. Exposed to Clang as __builtin_amdgcn_permlane_up.
def int_amdgcn_permlane_up : ClangBuiltin<"__builtin_amdgcn_permlane_up">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// llvm.amdgcn.permlane.down <src0> <src1> <src2>
// Takes three i32 operands and returns an i32. Convergent, reads/writes no
// memory. Exposed to Clang as __builtin_amdgcn_permlane_down.
def int_amdgcn_permlane_down : ClangBuiltin<"__builtin_amdgcn_permlane_down">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// llvm.amdgcn.permlane.xor <src0> <src1> <src2>
// Takes three i32 operands and returns an i32. Convergent, reads/writes no
// memory. Exposed to Clang as __builtin_amdgcn_permlane_xor.
def int_amdgcn_permlane_xor : ClangBuiltin<"__builtin_amdgcn_permlane_xor">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
// llvm.amdgcn.permlane.idx.gen <src0> <src1>
// Takes two i32 operands and returns an i32. Convergent, reads/writes no
// memory. Exposed to Clang as __builtin_amdgcn_permlane_idx_gen.
def int_amdgcn_permlane_idx_gen : ClangBuiltin<"__builtin_amdgcn_permlane_idx_gen">,
Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.

View File

@ -3204,6 +3204,18 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(B, MI, 5);
return;
}
case Intrinsic::amdgcn_permlane_bcast:
case Intrinsic::amdgcn_permlane_up:
case Intrinsic::amdgcn_permlane_down:
case Intrinsic::amdgcn_permlane_xor:
// Operands 3 and 4 (the second and third intrinsic sources) are constrained
// through readfirstlane rather than a waterfall loop: doing a waterfall loop
// over these wouldn't make any sense.
constrainOpWithReadfirstlane(B, MI, 3);
constrainOpWithReadfirstlane(B, MI, 4);
return;
case Intrinsic::amdgcn_permlane_idx_gen: {
// Two-source form: only operand 3 (the second intrinsic source) is
// constrained through readfirstlane.
constrainOpWithReadfirstlane(B, MI, 3);
return;
}
case Intrinsic::amdgcn_sbfe:
applyMappingBFE(B, OpdMapper, true);
return;
@ -4902,6 +4914,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
case Intrinsic::amdgcn_permlane_bcast:
case Intrinsic::amdgcn_permlane_up:
case Intrinsic::amdgcn_permlane_down:
case Intrinsic::amdgcn_permlane_xor: {
// Operand 0 is the result and operand 2 is src0; both are mapped to VGPRs
// with the result's size.
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
// Operands 3 and 4 (src1, src2) are the ones applyMappingImpl constrains
// with readfirstlane. Build each mapping from that operand's own register
// so the reported bank/size describe the operand being mapped, not its
// left-hand neighbour.
OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
case Intrinsic::amdgcn_permlane_idx_gen: {
// Operand 0 is the result and operand 2 is src0; both are mapped to VGPRs
// with the result's size.
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
// Operand 3 (src1) is the one applyMappingImpl constrains with
// readfirstlane. Build its mapping from its own register rather than from
// operand 2.
OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
break;
}
case Intrinsic::amdgcn_permlane16_var:
case Intrinsic::amdgcn_permlanex16_var: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);

View File

@ -321,6 +321,11 @@ def : SourceOfDivergence<int_amdgcn_permlane16>;
def : SourceOfDivergence<int_amdgcn_permlanex16>;
def : SourceOfDivergence<int_amdgcn_permlane16_var>;
def : SourceOfDivergence<int_amdgcn_permlanex16_var>;
// GFX1250 permlane intrinsics: their results are always treated as divergent.
def : SourceOfDivergence<int_amdgcn_permlane_bcast>;
def : SourceOfDivergence<int_amdgcn_permlane_up>;
def : SourceOfDivergence<int_amdgcn_permlane_down>;
def : SourceOfDivergence<int_amdgcn_permlane_xor>;
def : SourceOfDivergence<int_amdgcn_permlane_idx_gen>;
def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;

View File

@ -152,7 +152,12 @@ static bool isPermlane(const MachineInstr &MI) {
Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e32 ||
Opcode == AMDGPU::V_PERMLANE16_SWAP_B32_e64 ||
Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e32 ||
Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64;
Opcode == AMDGPU::V_PERMLANE32_SWAP_B32_e64 ||
Opcode == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
Opcode == AMDGPU::V_PERMLANE_UP_B32_e64 ||
Opcode == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
Opcode == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
Opcode == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64;
}
static bool isLdsDma(const MachineInstr &MI) {

View File

@ -6304,10 +6304,14 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
};
if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
Opc == AMDGPU::V_PERMLANEX16_B32_e64 ||
Opc == AMDGPU::V_PERMLANE_BCAST_B32_e64 ||
Opc == AMDGPU::V_PERMLANE_UP_B32_e64 ||
Opc == AMDGPU::V_PERMLANE_DOWN_B32_e64 ||
Opc == AMDGPU::V_PERMLANE_XOR_B32_e64 ||
Opc == AMDGPU::V_PERMLANE_IDX_GEN_B32_e64) {
// src1 and src2 must be scalar
MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
const DebugLoc &DL = MI.getDebugLoc();
if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
@ -6315,11 +6319,14 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
.add(Src1);
Src1.ChangeToRegister(Reg, false);
}
if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src2);
Src2.ChangeToRegister(Reg, false);
if (VOP3Idx[2] != -1) {
MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src2);
Src2.ChangeToRegister(Reg, false);
}
}
}

View File

@ -1053,6 +1053,14 @@ def VOP3_PERMLANE_VAR_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, untyped
let HasExtDPP = 0;
}
// VOP3 profile for permlane instructions whose 64-bit operand list is just
// the sources: src0 as VRegSrc_32, src1 as SSrc_b32, and src2 as SSrc_b32
// only when the wrapped profile P has a third source. Clamp and both DPP
// extensions are turned off.
class VOP3_PERMLANE_NOOPSEL_Profile<VOPProfile P> : VOP3_Profile<P> {
let Ins64 = !con((ins VRegSrc_32:$src0, SSrc_b32:$src1),
!if(P.HasSrc2, (ins SSrc_b32:$src2), (ins)));
let HasClamp = 0;
let HasExtVOP3DPP = 0;
let HasExtDPP = 0;
}
def opsel_i1timm : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(
N->getZExtValue() ? SISrcMods::OP_SEL_0 : SISrcMods::NONE,
@ -1136,6 +1144,18 @@ class PermlaneVarPat<SDPatternOperator permlane,
VGPR_32:$src1, VGPR_32:$vdst_in)
>;
// Selects a three-source i32 permlane operator to `inst`, passing src0 as a
// VGPR_32 and src1/src2 as SCSrc_b32 operands.
class PermlaneNoDppPat3Src<SDPatternOperator permlane,
Instruction inst> : GCNPat<
(permlane i32:$src0, i32:$src1, i32:$src2),
(inst VGPR_32:$src0, SCSrc_b32:$src1, SCSrc_b32:$src2)
>;
// Two-source counterpart of the three-source permlane pattern: src0 is passed
// as a VGPR_32 and src1 as an SCSrc_b32 operand.
class PermlaneNoDppPat2Src<SDPatternOperator permlane,
Instruction inst> : GCNPat<
(permlane i32:$src0, i32:$src1),
(inst VGPR_32:$src0, SCSrc_b32:$src1)
>;
class VOP3_BITOP3_Profile<VOPProfile pfl, VOP3Features f> : VOP3_Profile<pfl, f> {
let HasClamp = 0;
let HasOMod = 0;
@ -1522,6 +1542,20 @@ let SubtargetPredicate = isGFX12Plus in {
} // End SubtargetPredicate = isGFX12Plus
// GFX1250+ wave32-only permlane instructions and the patterns that select
// the matching llvm.amdgcn.permlane.* intrinsics to their _e64 forms.
// V_PERMLANE_IDX_GEN_B32 is the only two-source variant.
let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in {
defm V_PERMLANE_BCAST_B32 : VOP3Inst<"v_permlane_bcast_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
defm V_PERMLANE_UP_B32 : VOP3Inst<"v_permlane_up_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
defm V_PERMLANE_DOWN_B32 : VOP3Inst<"v_permlane_down_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
defm V_PERMLANE_XOR_B32 : VOP3Inst<"v_permlane_xor_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32_I32>>;
defm V_PERMLANE_IDX_GEN_B32 : VOP3Inst<"v_permlane_idx_gen_b32", VOP3_PERMLANE_NOOPSEL_Profile<VOP_I32_I32_I32>>;
def : PermlaneNoDppPat3Src<int_amdgcn_permlane_bcast, V_PERMLANE_BCAST_B32_e64>;
def : PermlaneNoDppPat3Src<int_amdgcn_permlane_up, V_PERMLANE_UP_B32_e64>;
def : PermlaneNoDppPat3Src<int_amdgcn_permlane_down, V_PERMLANE_DOWN_B32_e64>;
def : PermlaneNoDppPat3Src<int_amdgcn_permlane_xor, V_PERMLANE_XOR_B32_e64>;
def : PermlaneNoDppPat2Src<int_amdgcn_permlane_idx_gen, V_PERMLANE_IDX_GEN_B32_e64>;
} // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32
let HasClamp = 0, HasModifiers = 1 in {
def BitOp3_B16_Profile : VOP3_BITOP3_Profile<VOPProfile <[i16, i16, i16, i16, i32]>, VOP3_OPSEL>;
def BitOp3_B16_t16_Profile : VOP3_Profile_True16<BitOp3_B16_Profile>;
@ -1973,6 +2007,11 @@ defm V_ADD_MAX_I32 : VOP3Only_Realtriple_gfx1250<0x25e>;
defm V_ADD_MAX_U32 : VOP3Only_Realtriple_gfx1250<0x25f>;
defm V_ADD_MIN_I32 : VOP3Only_Realtriple_gfx1250<0x260>;
defm V_ADD_MIN_U32 : VOP3Only_Realtriple_gfx1250<0x261>;
// Real (encoded) VOP3-only definitions for the permlane bcast/up/down/xor and
// idx_gen instructions. Note idx_gen's opcode (0x314) is not contiguous with
// the other four (0x270-0x273).
defm V_PERMLANE_BCAST_B32 : VOP3Only_Real_Base_gfx12<0x270>;
defm V_PERMLANE_UP_B32 : VOP3Only_Real_Base_gfx12<0x271>;
defm V_PERMLANE_DOWN_B32 : VOP3Only_Real_Base_gfx12<0x272>;
defm V_PERMLANE_XOR_B32 : VOP3Only_Real_Base_gfx12<0x273>;
defm V_PERMLANE_IDX_GEN_B32 : VOP3Only_Real_Base_gfx12<0x314>;
//===----------------------------------------------------------------------===//
// GFX11, GFX12

View File

@ -796,6 +796,41 @@ define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i
ret void
}
; All inputs are kernel arguments, yet the permlane.bcast result must still be
; reported as divergent.
; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2)
define amdgpu_kernel void @v_permlane_bcast_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
%result= call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2)
store i32 %result, ptr addrspace(1) %out
ret void
}
; All inputs are kernel arguments, yet the permlane.up result must still be
; reported as divergent.
; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2)
define amdgpu_kernel void @v_permlane_up_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
%result= call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2)
store i32 %result, ptr addrspace(1) %out
ret void
}
; All inputs are kernel arguments, yet the permlane.down result must still be
; reported as divergent.
; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2)
define amdgpu_kernel void @v_permlane_down_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
%result= call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2)
store i32 %result, ptr addrspace(1) %out
ret void
}
; All inputs are kernel arguments, yet the permlane.xor result must still be
; reported as divergent.
; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2)
define amdgpu_kernel void @v_permlane_xor_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
%result= call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2)
store i32 %result, ptr addrspace(1) %out
ret void
}
; Both inputs are kernel arguments, yet the permlane.idx.gen result must still
; be reported as divergent.
; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 %src1)
define amdgpu_kernel void @v_permlane_idx_gen_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
%result= call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 %src1)
store i32 %result, ptr addrspace(1) %out
ret void
}
; CHECK: DIVERGENT: %v = call i32 @llvm.amdgcn.dead.i32()
define amdgpu_cs_chain void @dead(ptr addrspace(1) %out) {
%v = call i32 @llvm.amdgcn.dead.i32()

View File

@ -0,0 +1,416 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
; src1 and src2 are kernel arguments; the assertions expect them to be used
; directly as SGPR operands of v_permlane_bcast_b32.
define amdgpu_kernel void @v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; GFX1250-LABEL: v_permlane_bcast_b32_vss:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s3, s4
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 are the constants 1 and 2; the assertions expect them to
; appear as immediate operands of v_permlane_bcast_b32.
define amdgpu_kernel void @v_permlane_bcast_b32_vii(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-LABEL: v_permlane_bcast_b32_vii:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 1, i32 2)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 are the constants 100 and 102; the assertions expect 100 to be
; moved into an SGPR first and 0x66 to be used as a literal operand.
define amdgpu_kernel void @v_permlane_bcast_b32_vll(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-LABEL: v_permlane_bcast_b32_vll:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_movk_i32 s2, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s2, 0x66
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 100, i32 102)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 come from the workitem IDs; the assertions expect a
; v_readfirstlane_b32 for each of them before v_permlane_bcast_b32, under both
; SelectionDAG and GlobalISel.
define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-SDAG-LABEL: v_permlane_bcast_b32_vvv:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v1, v1, s3, s2
; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: v_permlane_bcast_b32_vvv:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v0, v0, s3, s4
; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-GISEL-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
%v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %tidx, i32 %tidy)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 are kernel arguments; the assertions expect them to be used
; directly as SGPR operands of v_permlane_down_b32.
define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; GFX1250-LABEL: v_permlane_down_b32_vss:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s3, s4
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 are the constants 1 and 2; the assertions expect them to
; appear as immediate operands of v_permlane_down_b32.
define amdgpu_kernel void @v_permlane_down_b32_vii(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-LABEL: v_permlane_down_b32_vii:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 1, i32 2)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 are the constants 100 and 102; the assertions expect 100 to be
; moved into an SGPR first and 0x66 to be used as a literal operand.
define amdgpu_kernel void @v_permlane_down_b32_vll(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-LABEL: v_permlane_down_b32_vll:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_movk_i32 s2, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s2, 0x66
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 100, i32 102)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 come from the workitem IDs; the assertions expect a
; v_readfirstlane_b32 for each of them before v_permlane_down_b32, under both
; SelectionDAG and GlobalISel.
define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-SDAG-LABEL: v_permlane_down_b32_vvv:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_permlane_down_b32 v1, v1, s3, s2
; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: v_permlane_down_b32_vvv:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_permlane_down_b32 v0, v0, s3, s4
; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-GISEL-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
%v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %tidx, i32 %tidy)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 are kernel arguments; the assertions expect them to be used
; directly as SGPR operands of v_permlane_up_b32.
define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; GFX1250-LABEL: v_permlane_up_b32_vss:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s3, s4
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 are the constants 1 and 2; the assertions expect them to
; appear as immediate operands of v_permlane_up_b32.
define amdgpu_kernel void @v_permlane_up_b32_vii(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-LABEL: v_permlane_up_b32_vii:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 1, i32 2)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 are the constants 100 and 102; the assertions expect 100 to be
; moved into an SGPR first and 0x66 to be used as a literal operand.
define amdgpu_kernel void @v_permlane_up_b32_vll(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-LABEL: v_permlane_up_b32_vll:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_movk_i32 s2, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s2, 0x66
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 100, i32 102)
store i32 %v, ptr addrspace(1) %out
ret void
}
; src1 and src2 come from the workitem IDs; the assertions expect a
; v_readfirstlane_b32 for each of them before v_permlane_up_b32, under both
; SelectionDAG and GlobalISel.
define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-SDAG-LABEL: v_permlane_up_b32_vvv:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_permlane_up_b32 v1, v1, s3, s2
; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: v_permlane_up_b32_vvv:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_permlane_up_b32 v0, v0, s3, s4
; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-GISEL-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
%v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %tidx, i32 %tidy)
store i32 %v, ptr addrspace(1) %out
ret void
}
; permlane.xor with all three operands supplied as kernel arguments. The
; expected code copies %src0 into a VGPR and passes %src1 and %src2 straight
; from the SGPRs they were loaded into (s3 and s4).
define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
; GFX1250-LABEL: v_permlane_xor_b32_vss:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_xcnt 0x0
; GFX1250-NEXT: s_load_b32 s4, s[4:5], 0x34
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s3, s4
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2)
store i32 %v, ptr addrspace(1) %out
ret void
}
; permlane.xor with the small constants 1 and 2 as the second and third
; operands. The expected code encodes both constants directly as operands of
; v_permlane_xor_b32, with no extra scalar move.
define amdgpu_kernel void @v_permlane_xor_b32_vii(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-LABEL: v_permlane_xor_b32_vii:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 1, i32 2)
store i32 %v, ptr addrspace(1) %out
ret void
}
; permlane.xor with the constants 100 and 102 as the second and third operands
; (the "ll" suffix presumably stands for two literal operands). The expected
; code moves 100 into an SGPR with s_movk_i32 and encodes 102 directly in the
; instruction as 0x66.
define amdgpu_kernel void @v_permlane_xor_b32_vll(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-LABEL: v_permlane_xor_b32_vll:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_movk_i32 s2, 0x64
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s2, 0x66
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 100, i32 102)
store i32 %v, ptr addrspace(1) %out
ret void
}
; permlane.xor where the second and third operands are the workitem ids in x
; and y, i.e. values that start out in a VGPR. Both sets of expectations below
; extract each id from v0 and copy it into an SGPR with v_readfirstlane_b32
; before issuing v_permlane_xor_b32; the two sets differ in scheduling,
; register assignment and wait instructions.
define amdgpu_kernel void @v_permlane_xor_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-SDAG-LABEL: v_permlane_xor_b32_vvv:
; GFX1250-SDAG: ; %bb.0:
; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0
; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v1, v1, s3, s2
; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX1250-SDAG-NEXT: s_endpgm
;
; GFX1250-GISEL-LABEL: v_permlane_xor_b32_vvv:
; GFX1250-GISEL: ; %bb.0:
; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1
; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v0, v0, s3, s4
; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-GISEL-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
%v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %tidx, i32 %tidy)
store i32 %v, ptr addrspace(1) %out
ret void
}
; permlane.idx.gen (two-operand intrinsic) with both operands supplied as
; kernel arguments. The expected code copies %src0 into a VGPR and passes
; %src1 straight from the SGPR it was loaded into (s3).
define amdgpu_kernel void @v_permlane_idx_gen_b32_vs(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
; GFX1250-LABEL: v_permlane_idx_gen_b32_vs:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, s3
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 %src1)
store i32 %v, ptr addrspace(1) %out
ret void
}
; permlane.idx.gen with the small constant 1 as the second operand. The
; expected code encodes the constant directly as an operand of
; v_permlane_idx_gen_b32.
define amdgpu_kernel void @v_permlane_idx_gen_b32_vi(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-LABEL: v_permlane_idx_gen_b32_vi:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 1
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 1)
store i32 %v, ptr addrspace(1) %out
ret void
}
; permlane.idx.gen with the constant 100 as the second operand (the "l" suffix
; presumably stands for a literal operand). The expected code encodes it
; directly in the instruction as 0x64, with no scalar move.
define amdgpu_kernel void @v_permlane_idx_gen_b32_vl(ptr addrspace(1) %out, i32 %src0) {
; GFX1250-LABEL: v_permlane_idx_gen_b32_vl:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 0x64
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 100)
store i32 %v, ptr addrspace(1) %out
ret void
}
; permlane.idx.gen with both operands taken from workitem ids (x for the first
; operand, y for the second). In the expected code the x id stays in a VGPR,
; while the y id is extracted from v0 and copied into an SGPR with
; v_readfirstlane_b32 before being used as the second source.
define amdgpu_kernel void @v_permlane_idx_gen_b32_vv(ptr addrspace(1) %out) {
; GFX1250-LABEL: v_permlane_idx_gen_b32_vv:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1250-NEXT: v_bfe_u32 v1, v0, 10, 10
; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_readfirstlane_b32 s2, v1
; GFX1250-NEXT: v_mov_b32_e32 v1, 0
; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, s2
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX1250-NEXT: s_endpgm
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
%tidy = call i32 @llvm.amdgcn.workitem.id.y()
%v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %tidx, i32 %tidy)
store i32 %v, ptr addrspace(1) %out
ret void
}

View File

@ -766,3 +766,123 @@ v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00
v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1
// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00]
// v_permlane_bcast_b32 encoding tests: dst v5 and src0 v1 are fixed while
// src1/src2 cycle through SGPRs (s2, s3, s105), ttmp15, vcc_lo/vcc_hi, m0,
// exec_lo/exec_hi and src_scc. Every expected encoding in this group carries
// 0x70,0xd6 in bytes 2-3.
v_permlane_bcast_b32 v5, v1, s2, s3
// GFX1250: v_permlane_bcast_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0x05,0x0c,0x00]
v_permlane_bcast_b32 v5, v1, s105, s105
// GFX1250: v_permlane_bcast_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd3,0xa4,0x01]
v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15
// GFX1250: v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xf7,0xec,0x01]
v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo
// GFX1250: v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd7,0xf8,0x01]
v_permlane_bcast_b32 v5, v1, vcc_lo, m0
// GFX1250: v_permlane_bcast_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd5,0xf4,0x01]
v_permlane_bcast_b32 v5, v1, m0, vcc_hi
// GFX1250: v_permlane_bcast_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfb,0xac,0x01]
v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo
// GFX1250: v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xff,0xa8,0x01]
v_permlane_bcast_b32 v5, v1, exec_lo, src_scc
// GFX1250: v_permlane_bcast_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfd,0xf4,0x03]
// v_permlane_down_b32 encoding tests: dst v5 and src0 v1 are fixed while
// src1/src2 cycle through SGPRs (s2, s3, s105), ttmp15, vcc_lo/vcc_hi, m0,
// exec_lo/exec_hi and src_scc. Every expected encoding in this group carries
// 0x72,0xd6 in bytes 2-3.
v_permlane_down_b32 v5, v1, s2, s3
// GFX1250: v_permlane_down_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0x05,0x0c,0x00]
v_permlane_down_b32 v5, v1, s105, s105
// GFX1250: v_permlane_down_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd3,0xa4,0x01]
v_permlane_down_b32 v5, v1, ttmp15, ttmp15
// GFX1250: v_permlane_down_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xf7,0xec,0x01]
v_permlane_down_b32 v5, v1, vcc_hi, exec_lo
// GFX1250: v_permlane_down_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd7,0xf8,0x01]
v_permlane_down_b32 v5, v1, vcc_lo, m0
// GFX1250: v_permlane_down_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd5,0xf4,0x01]
v_permlane_down_b32 v5, v1, m0, vcc_hi
// GFX1250: v_permlane_down_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfb,0xac,0x01]
v_permlane_down_b32 v5, v1, exec_hi, vcc_lo
// GFX1250: v_permlane_down_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xff,0xa8,0x01]
v_permlane_down_b32 v5, v1, exec_lo, src_scc
// GFX1250: v_permlane_down_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfd,0xf4,0x03]
// v_permlane_up_b32 encoding tests: dst v5 and src0 v1 are fixed while
// src1/src2 cycle through SGPRs (s2, s3, s105), ttmp15, vcc_lo/vcc_hi, m0,
// exec_lo/exec_hi and src_scc. Every expected encoding in this group carries
// 0x71,0xd6 in bytes 2-3.
v_permlane_up_b32 v5, v1, s2, s3
// GFX1250: v_permlane_up_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0x05,0x0c,0x00]
v_permlane_up_b32 v5, v1, s105, s105
// GFX1250: v_permlane_up_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd3,0xa4,0x01]
v_permlane_up_b32 v5, v1, ttmp15, ttmp15
// GFX1250: v_permlane_up_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xf7,0xec,0x01]
v_permlane_up_b32 v5, v1, vcc_hi, exec_lo
// GFX1250: v_permlane_up_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd7,0xf8,0x01]
v_permlane_up_b32 v5, v1, vcc_lo, m0
// GFX1250: v_permlane_up_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd5,0xf4,0x01]
v_permlane_up_b32 v5, v1, m0, vcc_hi
// GFX1250: v_permlane_up_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfb,0xac,0x01]
v_permlane_up_b32 v5, v1, exec_hi, vcc_lo
// GFX1250: v_permlane_up_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xff,0xa8,0x01]
v_permlane_up_b32 v5, v1, exec_lo, src_scc
// GFX1250: v_permlane_up_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfd,0xf4,0x03]
// v_permlane_xor_b32 encoding tests: dst v5 and src0 v1 are fixed while
// src1/src2 cycle through SGPRs (s2, s3, s105), ttmp15, vcc_lo/vcc_hi, m0,
// exec_lo/exec_hi and src_scc. Every expected encoding in this group carries
// 0x73,0xd6 in bytes 2-3.
v_permlane_xor_b32 v5, v1, s2, s3
// GFX1250: v_permlane_xor_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0x05,0x0c,0x00]
v_permlane_xor_b32 v5, v1, s105, s105
// GFX1250: v_permlane_xor_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd3,0xa4,0x01]
v_permlane_xor_b32 v5, v1, ttmp15, ttmp15
// GFX1250: v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xf7,0xec,0x01]
v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo
// GFX1250: v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd7,0xf8,0x01]
v_permlane_xor_b32 v5, v1, vcc_lo, m0
// GFX1250: v_permlane_xor_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd5,0xf4,0x01]
v_permlane_xor_b32 v5, v1, m0, vcc_hi
// GFX1250: v_permlane_xor_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfb,0xac,0x01]
v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo
// GFX1250: v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xff,0xa8,0x01]
v_permlane_xor_b32 v5, v1, exec_lo, src_scc
// GFX1250: v_permlane_xor_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfd,0xf4,0x03]
// v_permlane_idx_gen_b32 encoding tests (two-source form): dst v5 and src0 v1
// are fixed while src1 cycles through s2, s105, ttmp15, vcc_hi, vcc_lo, m0,
// exec_hi and exec_lo. Every expected encoding in this group carries
// 0x14,0xd7 in bytes 2-3 and zeros in the last two bytes.
v_permlane_idx_gen_b32 v5, v1, s2
// GFX1250: v_permlane_idx_gen_b32 v5, v1, s2 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0x05,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, s105
// GFX1250: v_permlane_idx_gen_b32 v5, v1, s105 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd3,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, ttmp15
// GFX1250: v_permlane_idx_gen_b32 v5, v1, ttmp15 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xf7,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, vcc_hi
// GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd7,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, vcc_lo
// GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd5,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, m0
// GFX1250: v_permlane_idx_gen_b32 v5, v1, m0 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfb,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, exec_hi
// GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xff,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, exec_lo
// GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfd,0x00,0x00]

View File

@ -766,3 +766,123 @@ v_cvt_scale_pk8_f32_fp4 v[10:17], v20, 0xcf00
v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1
// GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00]
// v_permlane_bcast_b32 encoding tests: dst v5 and src0 v1 are fixed while
// src1/src2 cycle through SGPRs (s2, s3, s105), ttmp15, vcc_lo/vcc_hi, m0,
// exec_lo/exec_hi and src_scc. Every expected encoding in this group carries
// 0x70,0xd6 in bytes 2-3.
v_permlane_bcast_b32 v5, v1, s2, s3
// GFX1250: v_permlane_bcast_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0x05,0x0c,0x00]
v_permlane_bcast_b32 v5, v1, s105, s105
// GFX1250: v_permlane_bcast_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd3,0xa4,0x01]
v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15
// GFX1250: v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xf7,0xec,0x01]
v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo
// GFX1250: v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd7,0xf8,0x01]
v_permlane_bcast_b32 v5, v1, vcc_lo, m0
// GFX1250: v_permlane_bcast_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd5,0xf4,0x01]
v_permlane_bcast_b32 v5, v1, m0, vcc_hi
// GFX1250: v_permlane_bcast_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfb,0xac,0x01]
v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo
// GFX1250: v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xff,0xa8,0x01]
v_permlane_bcast_b32 v5, v1, exec_lo, src_scc
// GFX1250: v_permlane_bcast_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfd,0xf4,0x03]
// v_permlane_down_b32 encoding tests: dst v5 and src0 v1 are fixed while
// src1/src2 cycle through SGPRs (s2, s3, s105), ttmp15, vcc_lo/vcc_hi, m0,
// exec_lo/exec_hi and src_scc. Every expected encoding in this group carries
// 0x72,0xd6 in bytes 2-3.
v_permlane_down_b32 v5, v1, s2, s3
// GFX1250: v_permlane_down_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0x05,0x0c,0x00]
v_permlane_down_b32 v5, v1, s105, s105
// GFX1250: v_permlane_down_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd3,0xa4,0x01]
v_permlane_down_b32 v5, v1, ttmp15, ttmp15
// GFX1250: v_permlane_down_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xf7,0xec,0x01]
v_permlane_down_b32 v5, v1, vcc_hi, exec_lo
// GFX1250: v_permlane_down_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd7,0xf8,0x01]
v_permlane_down_b32 v5, v1, vcc_lo, m0
// GFX1250: v_permlane_down_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd5,0xf4,0x01]
v_permlane_down_b32 v5, v1, m0, vcc_hi
// GFX1250: v_permlane_down_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfb,0xac,0x01]
v_permlane_down_b32 v5, v1, exec_hi, vcc_lo
// GFX1250: v_permlane_down_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xff,0xa8,0x01]
v_permlane_down_b32 v5, v1, exec_lo, src_scc
// GFX1250: v_permlane_down_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfd,0xf4,0x03]
// v_permlane_up_b32 encoding tests: dst v5 and src0 v1 are fixed while
// src1/src2 cycle through SGPRs (s2, s3, s105), ttmp15, vcc_lo/vcc_hi, m0,
// exec_lo/exec_hi and src_scc. Every expected encoding in this group carries
// 0x71,0xd6 in bytes 2-3.
v_permlane_up_b32 v5, v1, s2, s3
// GFX1250: v_permlane_up_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0x05,0x0c,0x00]
v_permlane_up_b32 v5, v1, s105, s105
// GFX1250: v_permlane_up_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd3,0xa4,0x01]
v_permlane_up_b32 v5, v1, ttmp15, ttmp15
// GFX1250: v_permlane_up_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xf7,0xec,0x01]
v_permlane_up_b32 v5, v1, vcc_hi, exec_lo
// GFX1250: v_permlane_up_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd7,0xf8,0x01]
v_permlane_up_b32 v5, v1, vcc_lo, m0
// GFX1250: v_permlane_up_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd5,0xf4,0x01]
v_permlane_up_b32 v5, v1, m0, vcc_hi
// GFX1250: v_permlane_up_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfb,0xac,0x01]
v_permlane_up_b32 v5, v1, exec_hi, vcc_lo
// GFX1250: v_permlane_up_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xff,0xa8,0x01]
v_permlane_up_b32 v5, v1, exec_lo, src_scc
// GFX1250: v_permlane_up_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfd,0xf4,0x03]
// v_permlane_xor_b32 encoding tests: dst v5 and src0 v1 are fixed while
// src1/src2 cycle through SGPRs (s2, s3, s105), ttmp15, vcc_lo/vcc_hi, m0,
// exec_lo/exec_hi and src_scc. Every expected encoding in this group carries
// 0x73,0xd6 in bytes 2-3.
v_permlane_xor_b32 v5, v1, s2, s3
// GFX1250: v_permlane_xor_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0x05,0x0c,0x00]
v_permlane_xor_b32 v5, v1, s105, s105
// GFX1250: v_permlane_xor_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd3,0xa4,0x01]
v_permlane_xor_b32 v5, v1, ttmp15, ttmp15
// GFX1250: v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xf7,0xec,0x01]
v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo
// GFX1250: v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd7,0xf8,0x01]
v_permlane_xor_b32 v5, v1, vcc_lo, m0
// GFX1250: v_permlane_xor_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd5,0xf4,0x01]
v_permlane_xor_b32 v5, v1, m0, vcc_hi
// GFX1250: v_permlane_xor_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfb,0xac,0x01]
v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo
// GFX1250: v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xff,0xa8,0x01]
v_permlane_xor_b32 v5, v1, exec_lo, src_scc
// GFX1250: v_permlane_xor_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfd,0xf4,0x03]
// v_permlane_idx_gen_b32 encoding tests (two-source form): dst v5 and src0 v1
// are fixed while src1 cycles through s2, s105, ttmp15, vcc_hi, vcc_lo, m0,
// exec_hi and exec_lo. Every expected encoding in this group carries
// 0x14,0xd7 in bytes 2-3 and zeros in the last two bytes.
v_permlane_idx_gen_b32 v5, v1, s2
// GFX1250: v_permlane_idx_gen_b32 v5, v1, s2 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0x05,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, s105
// GFX1250: v_permlane_idx_gen_b32 v5, v1, s105 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd3,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, ttmp15
// GFX1250: v_permlane_idx_gen_b32 v5, v1, ttmp15 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xf7,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, vcc_hi
// GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd7,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, vcc_lo
// GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd5,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, m0
// GFX1250: v_permlane_idx_gen_b32 v5, v1, m0 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfb,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, exec_hi
// GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xff,0x00,0x00]
v_permlane_idx_gen_b32 v5, v1, exec_lo
// GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfd,0x00,0x00]

View File

@ -820,3 +820,120 @@
0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00
# GFX1250: v_cvt_scale_pk8_f32_fp4 v[10:17], v20, v8 scale_sel:1 ; encoding: [0x0a,0x08,0xa1,0xd6,0x14,0x11,0x02,0x00]
# v_permlane_bcast_b32 decode tests: each byte sequence below is expected to
# disassemble to the instruction on the following line. dst v5 and src0 v1 are
# fixed; src1/src2 cover exec, m0, SGPR, ttmp, vcc and src_scc operands.
0x05,0x00,0x70,0xd6,0x01,0xff,0xa8,0x01
# GFX1250: v_permlane_bcast_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xff,0xa8,0x01]
0x05,0x00,0x70,0xd6,0x01,0xfd,0xf4,0x03
# GFX1250: v_permlane_bcast_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfd,0xf4,0x03]
0x05,0x00,0x70,0xd6,0x01,0xfb,0xac,0x01
# GFX1250: v_permlane_bcast_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xfb,0xac,0x01]
0x05,0x00,0x70,0xd6,0x01,0xd3,0xa4,0x01
# GFX1250: v_permlane_bcast_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd3,0xa4,0x01]
0x05,0x00,0x70,0xd6,0x01,0x05,0x0c,0x00
# GFX1250: v_permlane_bcast_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0x05,0x0c,0x00]
0x05,0x00,0x70,0xd6,0x01,0xf7,0xec,0x01
# GFX1250: v_permlane_bcast_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xf7,0xec,0x01]
0x05,0x00,0x70,0xd6,0x01,0xd7,0xf8,0x01
# GFX1250: v_permlane_bcast_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd7,0xf8,0x01]
0x05,0x00,0x70,0xd6,0x01,0xd5,0xf4,0x01
# GFX1250: v_permlane_bcast_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x70,0xd6,0x01,0xd5,0xf4,0x01]
# v_permlane_down_b32 decode tests: each byte sequence below is expected to
# disassemble to the instruction on the following line. dst v5 and src0 v1 are
# fixed; src1/src2 cover exec, m0, SGPR, ttmp, vcc and src_scc operands.
0x05,0x00,0x72,0xd6,0x01,0xff,0xa8,0x01
# GFX1250: v_permlane_down_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xff,0xa8,0x01]
0x05,0x00,0x72,0xd6,0x01,0xfd,0xf4,0x03
# GFX1250: v_permlane_down_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfd,0xf4,0x03]
0x05,0x00,0x72,0xd6,0x01,0xfb,0xac,0x01
# GFX1250: v_permlane_down_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xfb,0xac,0x01]
0x05,0x00,0x72,0xd6,0x01,0xd3,0xa4,0x01
# GFX1250: v_permlane_down_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd3,0xa4,0x01]
0x05,0x00,0x72,0xd6,0x01,0x05,0x0c,0x00
# GFX1250: v_permlane_down_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0x05,0x0c,0x00]
0x05,0x00,0x72,0xd6,0x01,0xf7,0xec,0x01
# GFX1250: v_permlane_down_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xf7,0xec,0x01]
0x05,0x00,0x72,0xd6,0x01,0xd7,0xf8,0x01
# GFX1250: v_permlane_down_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd7,0xf8,0x01]
0x05,0x00,0x72,0xd6,0x01,0xd5,0xf4,0x01
# GFX1250: v_permlane_down_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x72,0xd6,0x01,0xd5,0xf4,0x01]
# v_permlane_up_b32 decode tests: each byte sequence below is expected to
# disassemble to the instruction on the following line. dst v5 and src0 v1 are
# fixed; src1/src2 cover exec, m0, SGPR, ttmp, vcc and src_scc operands.
# The exec_hi/vcc_lo case is included so this group covers the same eight
# operand pairs as the bcast, down and xor groups in this file.
0x05,0x00,0x71,0xd6,0x01,0xff,0xa8,0x01
# GFX1250: v_permlane_up_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xff,0xa8,0x01]
0x05,0x00,0x71,0xd6,0x01,0xfd,0xf4,0x03
# GFX1250: v_permlane_up_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfd,0xf4,0x03]
0x05,0x00,0x71,0xd6,0x01,0xfb,0xac,0x01
# GFX1250: v_permlane_up_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xfb,0xac,0x01]
0x05,0x00,0x71,0xd6,0x01,0xd3,0xa4,0x01
# GFX1250: v_permlane_up_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd3,0xa4,0x01]
0x05,0x00,0x71,0xd6,0x01,0x05,0x0c,0x00
# GFX1250: v_permlane_up_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0x05,0x0c,0x00]
0x05,0x00,0x71,0xd6,0x01,0xf7,0xec,0x01
# GFX1250: v_permlane_up_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xf7,0xec,0x01]
0x05,0x00,0x71,0xd6,0x01,0xd7,0xf8,0x01
# GFX1250: v_permlane_up_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd7,0xf8,0x01]
0x05,0x00,0x71,0xd6,0x01,0xd5,0xf4,0x01
# GFX1250: v_permlane_up_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x71,0xd6,0x01,0xd5,0xf4,0x01]
# v_permlane_xor_b32 decode tests: each byte sequence below is expected to
# disassemble to the instruction on the following line. dst v5 and src0 v1 are
# fixed; src1/src2 cover exec, m0, SGPR, ttmp, vcc and src_scc operands.
0x05,0x00,0x73,0xd6,0x01,0xff,0xa8,0x01
# GFX1250: v_permlane_xor_b32 v5, v1, exec_hi, vcc_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xff,0xa8,0x01]
0x05,0x00,0x73,0xd6,0x01,0xfd,0xf4,0x03
# GFX1250: v_permlane_xor_b32 v5, v1, exec_lo, src_scc ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfd,0xf4,0x03]
0x05,0x00,0x73,0xd6,0x01,0xfb,0xac,0x01
# GFX1250: v_permlane_xor_b32 v5, v1, m0, vcc_hi ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xfb,0xac,0x01]
0x05,0x00,0x73,0xd6,0x01,0xd3,0xa4,0x01
# GFX1250: v_permlane_xor_b32 v5, v1, s105, s105 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd3,0xa4,0x01]
0x05,0x00,0x73,0xd6,0x01,0x05,0x0c,0x00
# GFX1250: v_permlane_xor_b32 v5, v1, s2, s3 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0x05,0x0c,0x00]
0x05,0x00,0x73,0xd6,0x01,0xf7,0xec,0x01
# GFX1250: v_permlane_xor_b32 v5, v1, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xf7,0xec,0x01]
0x05,0x00,0x73,0xd6,0x01,0xd7,0xf8,0x01
# GFX1250: v_permlane_xor_b32 v5, v1, vcc_hi, exec_lo ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd7,0xf8,0x01]
0x05,0x00,0x73,0xd6,0x01,0xd5,0xf4,0x01
# GFX1250: v_permlane_xor_b32 v5, v1, vcc_lo, m0 ; encoding: [0x05,0x00,0x73,0xd6,0x01,0xd5,0xf4,0x01]
# v_permlane_idx_gen_b32 decode tests (two-source form): each byte sequence
# below is expected to disassemble to the instruction on the following line.
# dst v5 and src0 v1 are fixed; src1 covers exec, m0, SGPR, ttmp and vcc
# operands.
0x05,0x00,0x14,0xd7,0x01,0xff,0x00,0x00
# GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xff,0x00,0x00]
0x05,0x00,0x14,0xd7,0x01,0xfd,0x00,0x00
# GFX1250: v_permlane_idx_gen_b32 v5, v1, exec_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfd,0x00,0x00]
0x05,0x00,0x14,0xd7,0x01,0xfb,0x00,0x00
# GFX1250: v_permlane_idx_gen_b32 v5, v1, m0 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xfb,0x00,0x00]
0x05,0x00,0x14,0xd7,0x01,0xd3,0x00,0x00
# GFX1250: v_permlane_idx_gen_b32 v5, v1, s105 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd3,0x00,0x00]
0x05,0x00,0x14,0xd7,0x01,0x05,0x00,0x00
# GFX1250: v_permlane_idx_gen_b32 v5, v1, s2 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0x05,0x00,0x00]
0x05,0x00,0x14,0xd7,0x01,0xf7,0x00,0x00
# GFX1250: v_permlane_idx_gen_b32 v5, v1, ttmp15 ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xf7,0x00,0x00]
0x05,0x00,0x14,0xd7,0x01,0xd7,0x00,0x00
# GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_hi ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd7,0x00,0x00]
0x05,0x00,0x14,0xd7,0x01,0xd5,0x00,0x00
# GFX1250: v_permlane_idx_gen_b32 v5, v1, vcc_lo ; encoding: [0x05,0x00,0x14,0xd7,0x01,0xd5,0x00,0x00]