[AMDGPU] Make rotr illegal (#166558)
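fshr is already legal and is strictly more powerful than rotr (a rotate right is just a funnel shift right with both data operands equal: rotr x, n == fshr x, x, n), so we should only need selection patterns for fshr.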
This commit is contained in:
parent
740a3ad1f7
commit
b36f89faed
@ -504,9 +504,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
|
||||
// The hardware supports 32-bit FSHR, but not FSHL.
|
||||
setOperationAction(ISD::FSHR, MVT::i32, Legal);
|
||||
|
||||
// The hardware supports 32-bit ROTR, but not ROTL.
|
||||
setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
|
||||
setOperationAction(ISD::ROTR, MVT::i64, Expand);
|
||||
setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
|
||||
|
||||
setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
|
||||
|
||||
|
||||
@ -808,12 +808,6 @@ class DwordAddrPat<ValueType vt, RegisterClass rc> : AMDGPUPat <
|
||||
(vt rc:$addr)
|
||||
>;
|
||||
|
||||
// rotr pattern
|
||||
class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
|
||||
(rotr i32:$src0, i32:$src1),
|
||||
(BIT_ALIGN $src0, $src0, $src1)
|
||||
>;
|
||||
|
||||
// Special conversion patterns
|
||||
|
||||
def cvt_rpi_i32_f32 : PatFrag <
|
||||
|
||||
@ -505,7 +505,6 @@ def : AMDGPUPat <
|
||||
(fshr i32:$src0, i32:$src1, i32:$src2),
|
||||
(BIT_ALIGN_INT_eg $src0, $src1, $src2)
|
||||
>;
|
||||
def : ROTRPattern <BIT_ALIGN_INT_eg>;
|
||||
def MULADD_eg : MULADD_Common<0x14>;
|
||||
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
|
||||
def FMA_eg : FMA_Common<0x7>;
|
||||
|
||||
@ -14042,6 +14042,12 @@ static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
|
||||
assert(OtherOp.getValueSizeInBits() == 32);
|
||||
}
|
||||
|
||||
// Check that we haven't just recreated the same FSHR node.
|
||||
if (N->getOpcode() == ISD::FSHR &&
|
||||
(N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
|
||||
(N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
|
||||
return SDValue();
|
||||
|
||||
if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
|
||||
|
||||
assert(Op.getValueType().isByteSized() &&
|
||||
|
||||
@ -2685,8 +2685,6 @@ def : AMDGPUPat <
|
||||
|
||||
let True16Predicate = NotHasTrue16BitInsts in {
|
||||
let SubtargetPredicate = isNotGFX9Plus in {
|
||||
def : ROTRPattern <V_ALIGNBIT_B32_e64>;
|
||||
|
||||
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
|
||||
(V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
|
||||
(i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>;
|
||||
@ -2697,14 +2695,6 @@ def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:
|
||||
} // isNotGFX9Plus
|
||||
|
||||
let SubtargetPredicate = isGFX9GFX10 in {
|
||||
def : GCNPat <
|
||||
(rotr i32:$src0, i32:$src1),
|
||||
(V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0,
|
||||
/* src1_modifiers */ 0, $src0,
|
||||
/* src2_modifiers */ 0,
|
||||
$src1, /* clamp */ 0, /* op_sel */ 0)
|
||||
>;
|
||||
|
||||
foreach pat = [(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
|
||||
(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in
|
||||
def : GCNPat<pat,
|
||||
@ -2726,15 +2716,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
|
||||
} // end True16Predicate = NotHasTrue16BitInsts
|
||||
|
||||
let True16Predicate = UseRealTrue16Insts in {
|
||||
def : GCNPat <
|
||||
(rotr i32:$src0, i32:$src1),
|
||||
(V_ALIGNBIT_B32_t16_e64 /* src0_modifiers */ 0, $src0,
|
||||
/* src1_modifiers */ 0, $src0,
|
||||
/* src2_modifiers */ 0,
|
||||
(EXTRACT_SUBREG $src1, lo16),
|
||||
/* clamp */ 0, /* op_sel */ 0)
|
||||
>;
|
||||
|
||||
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))),
|
||||
(V_ALIGNBIT_B32_t16_e64 0, /* src0_modifiers */
|
||||
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
|
||||
@ -2753,14 +2734,6 @@ def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2),
|
||||
} // end True16Predicate = UseRealTrue16Insts
|
||||
|
||||
let True16Predicate = UseFakeTrue16Insts in {
|
||||
def : GCNPat <
|
||||
(rotr i32:$src0, i32:$src1),
|
||||
(V_ALIGNBIT_B32_fake16_e64 /* src0_modifiers */ 0, $src0,
|
||||
/* src1_modifiers */ 0, $src0,
|
||||
/* src2_modifiers */ 0,
|
||||
$src1, /* clamp */ 0, /* op_sel */ 0)
|
||||
>;
|
||||
|
||||
def : GCNPat<(i32 (DivergentUnaryFrag<trunc> (srl i64:$src0, (and i32:$src1, (i32 31))))),
|
||||
(V_ALIGNBIT_B32_fake16_e64 0, /* src0_modifiers */
|
||||
(i32 (EXTRACT_SUBREG (i64 $src0), sub1)),
|
||||
|
||||
@ -5,43 +5,37 @@
|
||||
define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
|
||||
; R600-LABEL: test:
|
||||
; R600: ; %bb.0: ; %entry
|
||||
; R600-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
|
||||
; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
||||
; R600-NEXT: CF_END
|
||||
; R600-NEXT: PAD
|
||||
; R600-NEXT: ALU clause starting at 4:
|
||||
; R600-NEXT: ADD_INT T0.Y, KC0[3].X, 1,
|
||||
; R600-NEXT: ADD_INT T0.Z, KC0[3].Y, 1,
|
||||
; R600-NEXT: ADD_INT T0.W, KC0[2].Z, 1,
|
||||
; R600-NEXT: ADD_INT * T1.W, KC0[2].W, 1,
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.X, PS, PS, KC0[3].Z,
|
||||
; R600-NEXT: BIT_ALIGN_INT T1.Y, PV.W, PV.W, KC0[3].Z,
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.Z, PV.Z, PV.Z, KC0[3].Z,
|
||||
; R600-NEXT: BIT_ALIGN_INT * T0.W, PV.Y, PV.Y, KC0[3].Z,
|
||||
; R600-NEXT: OR_INT T0.W, PV.W, PV.Z,
|
||||
; R600-NEXT: OR_INT * T1.W, PV.Y, PV.X,
|
||||
; R600-NEXT: OR_INT T0.X, PS, PV.W,
|
||||
; R600-NEXT: ADD_INT T0.Y, KC0[2].W, 1,
|
||||
; R600-NEXT: ADD_INT T0.Z, KC0[2].Z, 1,
|
||||
; R600-NEXT: ADD_INT T0.W, KC0[3].Y, 1,
|
||||
; R600-NEXT: ADD_INT * T1.W, KC0[3].X, 1,
|
||||
; R600-NEXT: OR_INT T0.W, PS, PV.W,
|
||||
; R600-NEXT: OR_INT * T1.W, PV.Z, PV.Y,
|
||||
; R600-NEXT: OR_INT * T0.W, PS, PV.W,
|
||||
; R600-NEXT: BIT_ALIGN_INT T0.X, PV.W, PV.W, KC0[3].Z,
|
||||
; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
;
|
||||
; CM-LABEL: test:
|
||||
; CM: ; %bb.0: ; %entry
|
||||
; CM-NEXT: ALU 12, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
|
||||
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
||||
; CM-NEXT: CF_END
|
||||
; CM-NEXT: PAD
|
||||
; CM-NEXT: ALU clause starting at 4:
|
||||
; CM-NEXT: ADD_INT T0.X, KC0[3].X, 1,
|
||||
; CM-NEXT: ADD_INT T0.Y, KC0[3].Y, 1,
|
||||
; CM-NEXT: ADD_INT T0.Z, KC0[2].Z, 1,
|
||||
; CM-NEXT: ADD_INT * T0.W, KC0[2].W, 1,
|
||||
; CM-NEXT: BIT_ALIGN_INT T1.X, PV.W, PV.W, KC0[3].Z,
|
||||
; CM-NEXT: BIT_ALIGN_INT T1.Y, PV.Z, PV.Z, KC0[3].Z,
|
||||
; CM-NEXT: BIT_ALIGN_INT T0.Z, PV.Y, PV.Y, KC0[3].Z,
|
||||
; CM-NEXT: BIT_ALIGN_INT * T0.W, PV.X, PV.X, KC0[3].Z,
|
||||
; CM-NEXT: ADD_INT T0.X, KC0[2].W, 1,
|
||||
; CM-NEXT: ADD_INT T0.Y, KC0[2].Z, 1,
|
||||
; CM-NEXT: ADD_INT T0.Z, KC0[3].Y, 1,
|
||||
; CM-NEXT: ADD_INT * T0.W, KC0[3].X, 1,
|
||||
; CM-NEXT: OR_INT T0.Z, PV.W, PV.Z,
|
||||
; CM-NEXT: OR_INT * T0.W, PV.Y, PV.X,
|
||||
; CM-NEXT: OR_INT * T0.X, PV.W, PV.Z,
|
||||
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
|
||||
; CM-NEXT: BIT_ALIGN_INT * T0.X, PV.W, PV.W, KC0[3].Z,
|
||||
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
entry:
|
||||
|
||||
@ -118,14 +118,13 @@ define amdgpu_kernel void @lsh8_or_lsr24(ptr addrspace(1) nocapture %arg, i32 %a
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
|
||||
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
|
||||
; GCN-NEXT: v_mov_b32_e32 v3, 0x2010007
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0
|
||||
; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
||||
; GCN-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_perm_b32 v2, s2, v2, v3
|
||||
; GCN-NEXT: v_alignbit_b32 v2, v2, s2, 24
|
||||
; GCN-NEXT: flat_store_dword v[0:1], v2
|
||||
; GCN-NEXT: s_endpgm
|
||||
bb:
|
||||
|
||||
@ -25,12 +25,14 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_sub_i32 s3, 32, s3
|
||||
; SI-NEXT: s_sub_i32 s4, 32, s3
|
||||
; SI-NEXT: s_mov_b32 s3, s2
|
||||
; SI-NEXT: s_and_b32 s4, s4, 31
|
||||
; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -38,11 +40,13 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_i32 s3, 32, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0
|
||||
; GFX8-NEXT: s_sub_i32 s4, 32, s3
|
||||
; GFX8-NEXT: s_mov_b32 s3, s2
|
||||
; GFX8-NEXT: s_and_b32 s4, s4, 31
|
||||
; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
@ -52,18 +56,24 @@ define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_sub_i32 s3, 32, s3
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3
|
||||
; GFX10-NEXT: s_and_b32 s4, s3, 31
|
||||
; GFX10-NEXT: s_mov_b32 s3, s2
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: rotl_i32:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_sub_i32 s3, 32, s3
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_b32 s4, s3, 31
|
||||
; GFX11-NEXT: s_mov_b32 s3, s2
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -97,14 +107,19 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_sub_i32 s3, 32, s3
|
||||
; SI-NEXT: s_sub_i32 s2, 32, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
|
||||
; SI-NEXT: s_sub_i32 s6, 32, s2
|
||||
; SI-NEXT: s_sub_i32 s8, 32, s3
|
||||
; SI-NEXT: s_mov_b32 s2, s1
|
||||
; SI-NEXT: s_mov_b32 s3, s1
|
||||
; SI-NEXT: s_mov_b32 s1, s0
|
||||
; SI-NEXT: s_and_b32 s8, s8, 31
|
||||
; SI-NEXT: s_and_b32 s6, s6, 31
|
||||
; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s6
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -113,13 +128,18 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_i32 s2, 32, s2
|
||||
; GFX8-NEXT: s_sub_i32 s3, 32, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
|
||||
; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
|
||||
; GFX8-NEXT: s_sub_i32 s7, 32, s3
|
||||
; GFX8-NEXT: s_sub_i32 s6, 32, s2
|
||||
; GFX8-NEXT: s_mov_b32 s2, s1
|
||||
; GFX8-NEXT: s_mov_b32 s3, s1
|
||||
; GFX8-NEXT: s_and_b32 s1, s7, 31
|
||||
; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s1
|
||||
; GFX8-NEXT: s_and_b32 s3, s6, 31
|
||||
; GFX8-NEXT: s_mov_b32 s1, s0
|
||||
; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
@ -131,10 +151,17 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_sub_i32 s3, 32, s3
|
||||
; GFX10-NEXT: s_sub_i32 s2, 32, s2
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
|
||||
; GFX10-NEXT: s_sub_i32 s4, 32, s2
|
||||
; GFX10-NEXT: s_sub_i32 s5, 32, s3
|
||||
; GFX10-NEXT: s_mov_b32 s2, s1
|
||||
; GFX10-NEXT: s_mov_b32 s3, s1
|
||||
; GFX10-NEXT: s_mov_b32 s1, s0
|
||||
; GFX10-NEXT: s_and_b32 s4, s4, 31
|
||||
; GFX10-NEXT: s_and_b32 s5, s5, 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
@ -143,12 +170,19 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_sub_i32 s3, 32, s3
|
||||
; GFX11-NEXT: s_sub_i32 s2, 32, s2
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
|
||||
; GFX11-NEXT: s_sub_i32 s6, 32, s2
|
||||
; GFX11-NEXT: s_sub_i32 s7, 32, s3
|
||||
; GFX11-NEXT: s_mov_b32 s2, s1
|
||||
; GFX11-NEXT: s_mov_b32 s3, s1
|
||||
; GFX11-NEXT: s_mov_b32 s1, s0
|
||||
; GFX11-NEXT: s_and_b32 s6, s6, 31
|
||||
; GFX11-NEXT: s_and_b32 s7, s7, 31
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s6
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -188,20 +222,30 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_sub_i32 s4, 32, s12
|
||||
; SI-NEXT: s_sub_i32 s5, 32, s13
|
||||
; SI-NEXT: s_sub_i32 s6, 32, s15
|
||||
; SI-NEXT: s_sub_i32 s7, 32, s14
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s6
|
||||
; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s5
|
||||
; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
|
||||
; SI-NEXT: s_sub_i32 s2, 32, s12
|
||||
; SI-NEXT: s_sub_i32 s12, 32, s13
|
||||
; SI-NEXT: s_sub_i32 s13, 32, s14
|
||||
; SI-NEXT: s_sub_i32 s14, 32, s15
|
||||
; SI-NEXT: s_mov_b32 s4, s11
|
||||
; SI-NEXT: s_mov_b32 s5, s11
|
||||
; SI-NEXT: s_mov_b32 s11, s10
|
||||
; SI-NEXT: s_mov_b32 s6, s9
|
||||
; SI-NEXT: s_mov_b32 s7, s9
|
||||
; SI-NEXT: s_mov_b32 s9, s8
|
||||
; SI-NEXT: s_and_b32 s14, s14, 31
|
||||
; SI-NEXT: s_and_b32 s13, s13, 31
|
||||
; SI-NEXT: s_and_b32 s12, s12, 31
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s14
|
||||
; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s13
|
||||
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s12
|
||||
; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s2
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s6
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SI-NEXT: v_mov_b32_e32 v3, s4
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -210,19 +254,29 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: s_sub_i32 s5, 32, s15
|
||||
; GFX8-NEXT: s_sub_i32 s2, 32, s15
|
||||
; GFX8-NEXT: s_and_b32 s5, s2, 31
|
||||
; GFX8-NEXT: s_mov_b32 s2, s11
|
||||
; GFX8-NEXT: s_mov_b32 s3, s11
|
||||
; GFX8-NEXT: s_sub_i32 s4, 32, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX8-NEXT: s_sub_i32 s3, 32, s13
|
||||
; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NEXT: s_sub_i32 s2, 32, s12
|
||||
; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s5
|
||||
; GFX8-NEXT: s_sub_i32 s6, 32, s13
|
||||
; GFX8-NEXT: s_and_b32 s3, s4, 31
|
||||
; GFX8-NEXT: s_mov_b32 s11, s10
|
||||
; GFX8-NEXT: s_sub_i32 s12, 32, s12
|
||||
; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s3
|
||||
; GFX8-NEXT: s_and_b32 s3, s6, 31
|
||||
; GFX8-NEXT: s_mov_b32 s6, s9
|
||||
; GFX8-NEXT: s_mov_b32 s7, s9
|
||||
; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3
|
||||
; GFX8-NEXT: s_and_b32 s3, s12, 31
|
||||
; GFX8-NEXT: s_mov_b32 s9, s8
|
||||
; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
@ -234,14 +288,28 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: s_sub_i32 s2, 32, s12
|
||||
; GFX10-NEXT: s_sub_i32 s3, 32, s13
|
||||
; GFX10-NEXT: s_sub_i32 s4, 32, s15
|
||||
; GFX10-NEXT: s_sub_i32 s5, 32, s14
|
||||
; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s4
|
||||
; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s5
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s3
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s2
|
||||
; GFX10-NEXT: s_sub_i32 s6, 32, s12
|
||||
; GFX10-NEXT: s_sub_i32 s7, 32, s13
|
||||
; GFX10-NEXT: s_sub_i32 s12, 32, s14
|
||||
; GFX10-NEXT: s_sub_i32 s13, 32, s15
|
||||
; GFX10-NEXT: s_mov_b32 s2, s11
|
||||
; GFX10-NEXT: s_mov_b32 s3, s11
|
||||
; GFX10-NEXT: s_mov_b32 s11, s10
|
||||
; GFX10-NEXT: s_mov_b32 s4, s9
|
||||
; GFX10-NEXT: s_mov_b32 s5, s9
|
||||
; GFX10-NEXT: s_mov_b32 s9, s8
|
||||
; GFX10-NEXT: s_and_b32 s13, s13, 31
|
||||
; GFX10-NEXT: s_and_b32 s12, s12, 31
|
||||
; GFX10-NEXT: s_and_b32 s14, s7, 31
|
||||
; GFX10-NEXT: s_and_b32 s15, s6, 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s13
|
||||
; GFX10-NEXT: s_lshr_b64 s[6:7], s[10:11], s12
|
||||
; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s15
|
||||
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s14
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
@ -250,16 +318,29 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_sub_i32 s2, 32, s12
|
||||
; GFX11-NEXT: s_sub_i32 s3, 32, s13
|
||||
; GFX11-NEXT: s_sub_i32 s4, 32, s15
|
||||
; GFX11-NEXT: s_sub_i32 s5, 32, s14
|
||||
; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s4
|
||||
; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s5
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s3
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s2
|
||||
; GFX11-NEXT: s_sub_i32 s6, 32, s12
|
||||
; GFX11-NEXT: s_sub_i32 s7, 32, s13
|
||||
; GFX11-NEXT: s_sub_i32 s12, 32, s14
|
||||
; GFX11-NEXT: s_sub_i32 s13, 32, s15
|
||||
; GFX11-NEXT: s_mov_b32 s2, s11
|
||||
; GFX11-NEXT: s_mov_b32 s3, s11
|
||||
; GFX11-NEXT: s_mov_b32 s11, s10
|
||||
; GFX11-NEXT: s_mov_b32 s4, s9
|
||||
; GFX11-NEXT: s_mov_b32 s5, s9
|
||||
; GFX11-NEXT: s_mov_b32 s9, s8
|
||||
; GFX11-NEXT: s_and_b32 s13, s13, 31
|
||||
; GFX11-NEXT: s_and_b32 s12, s12, 31
|
||||
; GFX11-NEXT: s_and_b32 s14, s7, 31
|
||||
; GFX11-NEXT: s_and_b32 s15, s6, 31
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s13
|
||||
; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s12
|
||||
; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s15
|
||||
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s14
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
|
||||
@ -22,12 +22,14 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_and_b32 s4, s3, 31
|
||||
; SI-NEXT: s_mov_b32 s3, s2
|
||||
; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: v_alignbit_b32 v0, s2, s2, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -35,10 +37,12 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
|
||||
; GFX8: ; %bb.0: ; %entry
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_alignbit_b32 v2, s2, s2, v0
|
||||
; GFX8-NEXT: s_and_b32 s4, s3, 31
|
||||
; GFX8-NEXT: s_mov_b32 s3, s2
|
||||
; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
@ -47,16 +51,22 @@ define amdgpu_kernel void @rotr_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s2, s2, s3
|
||||
; GFX10-NEXT: s_and_b32 s4, s3, 31
|
||||
; GFX10-NEXT: s_mov_b32 s3, s2
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: rotr_i32:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s2, s2, s3
|
||||
; GFX11-NEXT: s_and_b32 s4, s3, 31
|
||||
; GFX11-NEXT: s_mov_b32 s3, s2
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -86,12 +96,17 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb
|
||||
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: v_alignbit_b32 v1, s1, s1, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: v_alignbit_b32 v0, s0, s0, v0
|
||||
; SI-NEXT: s_and_b32 s3, s3, 31
|
||||
; SI-NEXT: s_mov_b32 s8, s1
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_and_b32 s6, s2, 31
|
||||
; SI-NEXT: s_mov_b32 s1, s0
|
||||
; SI-NEXT: s_lshr_b64 s[2:3], s[8:9], s3
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s6
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -100,11 +115,16 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NEXT: v_alignbit_b32 v1, s1, s1, v0
|
||||
; GFX8-NEXT: v_alignbit_b32 v0, s0, s0, v2
|
||||
; GFX8-NEXT: s_and_b32 s3, s3, 31
|
||||
; GFX8-NEXT: s_mov_b32 s6, s1
|
||||
; GFX8-NEXT: s_mov_b32 s7, s1
|
||||
; GFX8-NEXT: s_and_b32 s8, s2, 31
|
||||
; GFX8-NEXT: s_mov_b32 s1, s0
|
||||
; GFX8-NEXT: s_lshr_b64 s[2:3], s[6:7], s3
|
||||
; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
@ -116,8 +136,15 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s1, s1, s3
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s0, s0, s2
|
||||
; GFX10-NEXT: s_mov_b32 s4, s1
|
||||
; GFX10-NEXT: s_mov_b32 s5, s1
|
||||
; GFX10-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX10-NEXT: s_mov_b32 s1, s0
|
||||
; GFX10-NEXT: s_and_b32 s3, s3, 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], s3
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
@ -126,10 +153,17 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s1, s1, s3
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s0, s0, s2
|
||||
; GFX11-NEXT: s_mov_b32 s6, s1
|
||||
; GFX11-NEXT: s_mov_b32 s7, s1
|
||||
; GFX11-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX11-NEXT: s_mov_b32 s1, s0
|
||||
; GFX11-NEXT: s_and_b32 s3, s3, 31
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], s3
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -161,16 +195,26 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s15
|
||||
; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s14
|
||||
; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s13
|
||||
; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
|
||||
; SI-NEXT: s_and_b32 s2, s15, 31
|
||||
; SI-NEXT: s_mov_b32 s4, s11
|
||||
; SI-NEXT: s_mov_b32 s5, s11
|
||||
; SI-NEXT: s_and_b32 s14, s14, 31
|
||||
; SI-NEXT: s_mov_b32 s11, s10
|
||||
; SI-NEXT: s_and_b32 s13, s13, 31
|
||||
; SI-NEXT: s_mov_b32 s6, s9
|
||||
; SI-NEXT: s_mov_b32 s7, s9
|
||||
; SI-NEXT: s_and_b32 s12, s12, 31
|
||||
; SI-NEXT: s_mov_b32 s9, s8
|
||||
; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s2
|
||||
; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s14
|
||||
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s13
|
||||
; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s12
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s6
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SI-NEXT: v_mov_b32_e32 v3, s4
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -179,15 +223,25 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s15
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s13
|
||||
; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
|
||||
; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1
|
||||
; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GFX8-NEXT: s_and_b32 s4, s15, 31
|
||||
; GFX8-NEXT: s_mov_b32 s2, s11
|
||||
; GFX8-NEXT: s_mov_b32 s3, s11
|
||||
; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
|
||||
; GFX8-NEXT: s_and_b32 s3, s13, 31
|
||||
; GFX8-NEXT: s_mov_b32 s6, s9
|
||||
; GFX8-NEXT: s_mov_b32 s7, s9
|
||||
; GFX8-NEXT: s_and_b32 s5, s14, 31
|
||||
; GFX8-NEXT: s_mov_b32 s11, s10
|
||||
; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3
|
||||
; GFX8-NEXT: s_and_b32 s3, s12, 31
|
||||
; GFX8-NEXT: s_mov_b32 s9, s8
|
||||
; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s5
|
||||
; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
@ -199,10 +253,24 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s15
|
||||
; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s14
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s13
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s12
|
||||
; GFX10-NEXT: s_and_b32 s6, s15, 31
|
||||
; GFX10-NEXT: s_mov_b32 s2, s11
|
||||
; GFX10-NEXT: s_mov_b32 s3, s11
|
||||
; GFX10-NEXT: s_and_b32 s7, s14, 31
|
||||
; GFX10-NEXT: s_mov_b32 s11, s10
|
||||
; GFX10-NEXT: s_and_b32 s13, s13, 31
|
||||
; GFX10-NEXT: s_mov_b32 s4, s9
|
||||
; GFX10-NEXT: s_mov_b32 s5, s9
|
||||
; GFX10-NEXT: s_and_b32 s12, s12, 31
|
||||
; GFX10-NEXT: s_mov_b32 s9, s8
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
|
||||
; GFX10-NEXT: s_lshr_b64 s[6:7], s[10:11], s7
|
||||
; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s12
|
||||
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s13
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
@ -211,12 +279,25 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s15
|
||||
; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s14
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s13
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s12
|
||||
; GFX11-NEXT: s_and_b32 s6, s15, 31
|
||||
; GFX11-NEXT: s_mov_b32 s2, s11
|
||||
; GFX11-NEXT: s_mov_b32 s3, s11
|
||||
; GFX11-NEXT: s_and_b32 s7, s14, 31
|
||||
; GFX11-NEXT: s_mov_b32 s11, s10
|
||||
; GFX11-NEXT: s_and_b32 s13, s13, 31
|
||||
; GFX11-NEXT: s_mov_b32 s4, s9
|
||||
; GFX11-NEXT: s_mov_b32 s5, s9
|
||||
; GFX11-NEXT: s_and_b32 s12, s12, 31
|
||||
; GFX11-NEXT: s_mov_b32 s9, s8
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
|
||||
; GFX11-NEXT: s_lshr_b64 s[6:7], s[10:11], s7
|
||||
; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s12
|
||||
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s13
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -258,23 +339,44 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s19
|
||||
; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s18
|
||||
; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s17
|
||||
; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s16
|
||||
; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s23
|
||||
; SI-NEXT: v_alignbit_b32 v7, s15, s15, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s22
|
||||
; SI-NEXT: v_alignbit_b32 v6, s14, s14, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s21
|
||||
; SI-NEXT: v_alignbit_b32 v5, s13, s13, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s20
|
||||
; SI-NEXT: v_alignbit_b32 v4, s12, s12, v4
|
||||
; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
||||
; SI-NEXT: s_and_b32 s24, s19, 31
|
||||
; SI-NEXT: s_mov_b32 s4, s11
|
||||
; SI-NEXT: s_mov_b32 s5, s11
|
||||
; SI-NEXT: s_and_b32 s25, s18, 31
|
||||
; SI-NEXT: s_mov_b32 s11, s10
|
||||
; SI-NEXT: s_and_b32 s26, s17, 31
|
||||
; SI-NEXT: s_mov_b32 s6, s9
|
||||
; SI-NEXT: s_mov_b32 s7, s9
|
||||
; SI-NEXT: s_and_b32 s27, s16, 31
|
||||
; SI-NEXT: s_mov_b32 s9, s8
|
||||
; SI-NEXT: s_and_b32 s23, s23, 31
|
||||
; SI-NEXT: s_mov_b32 s16, s15
|
||||
; SI-NEXT: s_mov_b32 s17, s15
|
||||
; SI-NEXT: s_and_b32 s22, s22, 31
|
||||
; SI-NEXT: s_mov_b32 s15, s14
|
||||
; SI-NEXT: s_and_b32 s21, s21, 31
|
||||
; SI-NEXT: s_mov_b32 s18, s13
|
||||
; SI-NEXT: s_mov_b32 s19, s13
|
||||
; SI-NEXT: s_and_b32 s20, s20, 31
|
||||
; SI-NEXT: s_mov_b32 s13, s12
|
||||
; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s24
|
||||
; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s25
|
||||
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s26
|
||||
; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], s23
|
||||
; SI-NEXT: s_lshr_b64 s[14:15], s[14:15], s22
|
||||
; SI-NEXT: s_lshr_b64 s[18:19], s[18:19], s21
|
||||
; SI-NEXT: s_lshr_b64 s[12:13], s[12:13], s20
|
||||
; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], s27
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s18
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s14
|
||||
; SI-NEXT: v_mov_b32_e32 v3, s16
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
||||
; SI-NEXT: s_waitcnt expcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s6
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s10
|
||||
; SI-NEXT: v_mov_b32_e32 v3, s4
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -283,28 +385,48 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i
|
||||
; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44
|
||||
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s18
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s17
|
||||
; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1
|
||||
; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s23
|
||||
; GFX8-NEXT: v_alignbit_b32 v7, s15, s15, v4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s22
|
||||
; GFX8-NEXT: s_add_u32 s2, s0, 16
|
||||
; GFX8-NEXT: v_alignbit_b32 v6, s14, s14, v4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s21
|
||||
; GFX8-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GFX8-NEXT: v_alignbit_b32 v5, s13, s13, v4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s20
|
||||
; GFX8-NEXT: v_mov_b32_e32 v9, s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s19
|
||||
; GFX8-NEXT: v_alignbit_b32 v4, s12, s12, v4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
|
||||
; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0
|
||||
; GFX8-NEXT: s_and_b32 s4, s19, 31
|
||||
; GFX8-NEXT: s_mov_b32 s2, s11
|
||||
; GFX8-NEXT: s_mov_b32 s3, s11
|
||||
; GFX8-NEXT: s_lshr_b64 s[2:3], s[2:3], s4
|
||||
; GFX8-NEXT: s_and_b32 s3, s17, 31
|
||||
; GFX8-NEXT: s_mov_b32 s6, s9
|
||||
; GFX8-NEXT: s_mov_b32 s7, s9
|
||||
; GFX8-NEXT: s_and_b32 s5, s18, 31
|
||||
; GFX8-NEXT: s_mov_b32 s11, s10
|
||||
; GFX8-NEXT: s_lshr_b64 s[6:7], s[6:7], s3
|
||||
; GFX8-NEXT: s_and_b32 s3, s16, 31
|
||||
; GFX8-NEXT: s_mov_b32 s9, s8
|
||||
; GFX8-NEXT: s_lshr_b64 s[4:5], s[10:11], s5
|
||||
; GFX8-NEXT: s_lshr_b64 s[8:9], s[8:9], s3
|
||||
; GFX8-NEXT: s_and_b32 s3, s23, 31
|
||||
; GFX8-NEXT: s_mov_b32 s10, s15
|
||||
; GFX8-NEXT: s_mov_b32 s11, s15
|
||||
; GFX8-NEXT: s_lshr_b64 s[10:11], s[10:11], s3
|
||||
; GFX8-NEXT: s_and_b32 s3, s22, 31
|
||||
; GFX8-NEXT: s_mov_b32 s15, s14
|
||||
; GFX8-NEXT: s_lshr_b64 s[14:15], s[14:15], s3
|
||||
; GFX8-NEXT: s_and_b32 s3, s21, 31
|
||||
; GFX8-NEXT: s_mov_b32 s16, s13
|
||||
; GFX8-NEXT: s_mov_b32 s17, s13
|
||||
; GFX8-NEXT: s_lshr_b64 s[16:17], s[16:17], s3
|
||||
; GFX8-NEXT: s_and_b32 s3, s20, 31
|
||||
; GFX8-NEXT: s_mov_b32 s13, s12
|
||||
; GFX8-NEXT: s_lshr_b64 s[12:13], s[12:13], s3
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GFX8-NEXT: s_add_u32 s10, s0, 16
|
||||
; GFX8-NEXT: s_addc_u32 s11, s1, 0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s10
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s11
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NEXT: s_endpgm
|
||||
@ -316,16 +438,44 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v7, s15, s15, s23
|
||||
; GFX10-NEXT: v_alignbit_b32 v6, s14, s14, s22
|
||||
; GFX10-NEXT: v_alignbit_b32 v5, s13, s13, s21
|
||||
; GFX10-NEXT: v_alignbit_b32 v4, s12, s12, s20
|
||||
; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s19
|
||||
; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s18
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s17
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s16
|
||||
; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
|
||||
; GFX10-NEXT: s_and_b32 s19, s19, 31
|
||||
; GFX10-NEXT: s_mov_b32 s2, s11
|
||||
; GFX10-NEXT: s_mov_b32 s3, s11
|
||||
; GFX10-NEXT: s_and_b32 s17, s17, 31
|
||||
; GFX10-NEXT: s_mov_b32 s4, s9
|
||||
; GFX10-NEXT: s_mov_b32 s5, s9
|
||||
; GFX10-NEXT: s_and_b32 s16, s16, 31
|
||||
; GFX10-NEXT: s_mov_b32 s9, s8
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s19
|
||||
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s17
|
||||
; GFX10-NEXT: s_and_b32 s23, s23, 31
|
||||
; GFX10-NEXT: s_mov_b32 s6, s15
|
||||
; GFX10-NEXT: s_mov_b32 s7, s15
|
||||
; GFX10-NEXT: s_and_b32 s22, s22, 31
|
||||
; GFX10-NEXT: s_mov_b32 s15, s14
|
||||
; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s16
|
||||
; GFX10-NEXT: s_mov_b32 s16, s13
|
||||
; GFX10-NEXT: s_mov_b32 s17, s13
|
||||
; GFX10-NEXT: s_and_b32 s3, s20, 31
|
||||
; GFX10-NEXT: s_mov_b32 s13, s12
|
||||
; GFX10-NEXT: s_and_b32 s5, s21, 31
|
||||
; GFX10-NEXT: s_and_b32 s18, s18, 31
|
||||
; GFX10-NEXT: s_mov_b32 s11, s10
|
||||
; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], s23
|
||||
; GFX10-NEXT: s_lshr_b64 s[14:15], s[14:15], s22
|
||||
; GFX10-NEXT: s_lshr_b64 s[12:13], s[12:13], s3
|
||||
; GFX10-NEXT: s_lshr_b64 s[16:17], s[16:17], s5
|
||||
; GFX10-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s16
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s14
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, s8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, s10
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, s2
|
||||
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
|
||||
; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: rotr_v8i32:
|
||||
@ -333,19 +483,43 @@ define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v7, s15, s15, s23
|
||||
; GFX11-NEXT: v_alignbit_b32 v6, s14, s14, s22
|
||||
; GFX11-NEXT: v_alignbit_b32 v5, s13, s13, s21
|
||||
; GFX11-NEXT: v_alignbit_b32 v4, s12, s12, s20
|
||||
; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s19
|
||||
; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s18
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s17
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s16
|
||||
; GFX11-NEXT: s_and_b32 s19, s19, 31
|
||||
; GFX11-NEXT: s_mov_b32 s2, s11
|
||||
; GFX11-NEXT: s_mov_b32 s3, s11
|
||||
; GFX11-NEXT: s_and_b32 s17, s17, 31
|
||||
; GFX11-NEXT: s_mov_b32 s4, s9
|
||||
; GFX11-NEXT: s_mov_b32 s5, s9
|
||||
; GFX11-NEXT: s_and_b32 s16, s16, 31
|
||||
; GFX11-NEXT: s_mov_b32 s9, s8
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s19
|
||||
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], s17
|
||||
; GFX11-NEXT: s_and_b32 s23, s23, 31
|
||||
; GFX11-NEXT: s_mov_b32 s6, s15
|
||||
; GFX11-NEXT: s_mov_b32 s7, s15
|
||||
; GFX11-NEXT: s_and_b32 s22, s22, 31
|
||||
; GFX11-NEXT: s_mov_b32 s15, s14
|
||||
; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s16
|
||||
; GFX11-NEXT: s_mov_b32 s16, s13
|
||||
; GFX11-NEXT: s_mov_b32 s17, s13
|
||||
; GFX11-NEXT: s_and_b32 s3, s20, 31
|
||||
; GFX11-NEXT: s_mov_b32 s13, s12
|
||||
; GFX11-NEXT: s_and_b32 s5, s21, 31
|
||||
; GFX11-NEXT: s_and_b32 s18, s18, 31
|
||||
; GFX11-NEXT: s_mov_b32 s11, s10
|
||||
; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s23
|
||||
; GFX11-NEXT: s_lshr_b64 s[14:15], s[14:15], s22
|
||||
; GFX11-NEXT: s_lshr_b64 s[12:13], s[12:13], s3
|
||||
; GFX11-NEXT: s_lshr_b64 s[16:17], s[16:17], s5
|
||||
; GFX11-NEXT: s_lshr_b64 s[10:11], s[10:11], s18
|
||||
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s16
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v3, s6
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, s14 :: v_dual_mov_b32 v5, s4
|
||||
; GFX11-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v6, s10
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
|
||||
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
|
||||
; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
|
||||
; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
%tmp0 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %y
|
||||
|
||||
@ -1470,21 +1470,20 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
|
||||
;
|
||||
; EG-LABEL: s_shl_inline_imm_1_i64:
|
||||
; EG: ; %bb.0:
|
||||
; EG-NEXT: ALU 11, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
|
||||
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
||||
; EG-NEXT: CF_END
|
||||
; EG-NEXT: PAD
|
||||
; EG-NEXT: ALU clause starting at 4:
|
||||
; EG-NEXT: AND_INT T0.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: LSHL * T1.W, KC0[2].W, literal.y,
|
||||
; EG-NEXT: 31(4.344025e-44), 26(3.643376e-44)
|
||||
; EG-NEXT: ASHR T1.W, PS, literal.x,
|
||||
; EG-NEXT: LSHL * T0.W, 1, PV.W,
|
||||
; EG-NEXT: NOT_INT * T1.W, KC0[2].W,
|
||||
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: AND_INT T0.Y, PV.W, PS,
|
||||
; EG-NEXT: AND_INT * T1.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: BIT_ALIGN_INT T0.Z, 0.0, 0.0, PS,
|
||||
; EG-NEXT: AND_INT T1.W, KC0[2].W, literal.x,
|
||||
; EG-NEXT: LSHL * T0.W, 1, PV.W,
|
||||
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
||||
; EG-NEXT: CNDE_INT T0.X, PV.W, T0.W, 0.0,
|
||||
; EG-NEXT: CNDE_INT * T0.Y, PV.W, PV.Z, PS,
|
||||
; EG-NEXT: CNDE_INT T0.X, T1.W, T0.W, 0.0,
|
||||
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
||||
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
||||
%shl = shl i64 1, %a
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user