[RISCV] Rewrite deinterleave load as vlse optimization as DAG combine (#150049)
This reworks an existing optimization in the fixed-vector (shuffle-based) deinterleave lowering into a DAG combine. As a result, it kicks in much more widely: in particular on the deinterleave intrinsic (i.e. scalable) path and the deinterleaveN (without load) lowering, but also on the intrinsic lowering paths.
This commit is contained in:
parent
fa6965f722
commit
73245b06b3
@ -20843,6 +20843,62 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
}
|
||||
break;
|
||||
}
|
||||
case RISCVISD::TUPLE_EXTRACT: {
|
||||
EVT VT = N->getValueType(0);
|
||||
SDValue Tuple = N->getOperand(0);
|
||||
unsigned Idx = N->getConstantOperandVal(1);
|
||||
if (!Tuple.hasOneUse() || Tuple.getOpcode() != ISD::INTRINSIC_W_CHAIN)
|
||||
break;
|
||||
|
||||
unsigned NF = 0;
|
||||
switch (Tuple.getConstantOperandVal(1)) {
|
||||
default:
|
||||
break;
|
||||
case Intrinsic::riscv_vlseg2_mask:
|
||||
case Intrinsic::riscv_vlseg3_mask:
|
||||
case Intrinsic::riscv_vlseg4_mask:
|
||||
case Intrinsic::riscv_vlseg5_mask:
|
||||
case Intrinsic::riscv_vlseg6_mask:
|
||||
case Intrinsic::riscv_vlseg7_mask:
|
||||
case Intrinsic::riscv_vlseg8_mask:
|
||||
NF = Tuple.getValueType().getRISCVVectorTupleNumFields();
|
||||
break;
|
||||
}
|
||||
|
||||
if (!NF || Subtarget.hasOptimizedSegmentLoadStore(NF))
|
||||
break;
|
||||
|
||||
unsigned SEW = VT.getScalarSizeInBits();
|
||||
assert(Log2_64(SEW) == Tuple.getConstantOperandVal(7) &&
|
||||
"Type mismatch without bitcast?");
|
||||
unsigned Stride = SEW / 8 * NF;
|
||||
unsigned Offset = SEW / 8 * Idx;
|
||||
|
||||
SDValue Ops[] = {
|
||||
/*Chain=*/Tuple.getOperand(0),
|
||||
/*IntID=*/DAG.getTargetConstant(Intrinsic::riscv_vlse_mask, DL, XLenVT),
|
||||
/*Passthru=*/Tuple.getOperand(2),
|
||||
/*Ptr=*/
|
||||
DAG.getNode(ISD::ADD, DL, XLenVT, Tuple.getOperand(3),
|
||||
DAG.getConstant(Offset, DL, XLenVT)),
|
||||
/*Stride=*/DAG.getConstant(Stride, DL, XLenVT),
|
||||
/*Mask=*/Tuple.getOperand(4),
|
||||
/*VL=*/Tuple.getOperand(5),
|
||||
/*Policy=*/Tuple.getOperand(6)};
|
||||
|
||||
auto TupleMemSD = cast<MemIntrinsicSDNode>(Tuple);
|
||||
// Match getTgtMemIntrinsic for non-unit stride case
|
||||
EVT MemVT = TupleMemSD->getMemoryVT().getScalarType();
|
||||
MachineFunction &MF = DAG.getMachineFunction();
|
||||
MachineMemOperand *MMO = MF.getMachineMemOperand(
|
||||
TupleMemSD->getMemOperand(), Offset, MemoryLocation::UnknownSize);
|
||||
|
||||
SDVTList VTs = DAG.getVTList({VT, MVT::Other});
|
||||
SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs,
|
||||
Ops, MemVT, MMO);
|
||||
DAG.ReplaceAllUsesOfValueWith(Tuple.getValue(1), Result.getValue(1));
|
||||
return Result.getValue(0);
|
||||
}
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
|
||||
@ -216,29 +216,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
|
||||
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
|
||||
return false;
|
||||
|
||||
// If the segment load is going to be performed segment at a time anyways
|
||||
// and there's only one element used, use a strided load instead. This
|
||||
// will be equally fast, and create less vector register pressure.
|
||||
if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
|
||||
unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
|
||||
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
|
||||
Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
|
||||
Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset);
|
||||
// For rv64, need to truncate i64 to i32 to match signature. As VL is at most
|
||||
// the number of active lanes (which is bounded by i32) this is safe.
|
||||
VL = Builder.CreateTrunc(VL, Builder.getInt32Ty());
|
||||
|
||||
CallInst *CI =
|
||||
Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
|
||||
{VTy, BasePtr->getType(), Stride->getType()},
|
||||
{BasePtr, Stride, Mask, VL});
|
||||
Alignment = commonAlignment(Alignment, Indices[0] * ScalarSizeInBytes);
|
||||
CI->addParamAttr(0,
|
||||
Attribute::getWithAlignment(CI->getContext(), Alignment));
|
||||
Shuffles[0]->replaceAllUsesWith(CI);
|
||||
return true;
|
||||
};
|
||||
|
||||
CallInst *VlsegN = Builder.CreateIntrinsic(
|
||||
FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
|
||||
|
||||
|
||||
@ -9,27 +9,29 @@ define void @pr141907(ptr %0) nounwind {
|
||||
; CHECK-NEXT: slli a1, a1, 2
|
||||
; CHECK-NEXT: sub sp, sp, a1
|
||||
; CHECK-NEXT: vsetivli zero, 0, e32, m1, ta, ma
|
||||
; CHECK-NEXT: vmv.v.i v9, 0
|
||||
; CHECK-NEXT: vmv.v.i v8, 0
|
||||
; CHECK-NEXT: vmclr.m v0
|
||||
; CHECK-NEXT: li a1, 0
|
||||
; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
|
||||
; CHECK-NEXT: vmv.v.i v12, 0
|
||||
; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
|
||||
; CHECK-NEXT: vmv.v.i v10, 0
|
||||
; CHECK-NEXT: addi a2, sp, 16
|
||||
; CHECK-NEXT: addi a3, sp, 20
|
||||
; CHECK-NEXT: li a4, 12
|
||||
; CHECK-NEXT: .LBB0_1: # %vector.body
|
||||
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vs4r.v v8, (a2)
|
||||
; CHECK-NEXT: vsetvli a1, a1, e8, mf8, ta, ma
|
||||
; CHECK-NEXT: vsetivli zero, 0, e16, mf2, ta, ma
|
||||
; CHECK-NEXT: vnsrl.wi v11, v9, 0, v0.t
|
||||
; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
|
||||
; CHECK-NEXT: vlseg3e32.v v8, (a2)
|
||||
; CHECK-NEXT: vnsrl.wi v9, v8, 0, v0.t
|
||||
; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma
|
||||
; CHECK-NEXT: vlse32.v v8, (a3), a4
|
||||
; CHECK-NEXT: vsetivli zero, 0, e16, mf2, ta, ma
|
||||
; CHECK-NEXT: vsseg2e16.v v11, (zero)
|
||||
; CHECK-NEXT: vsseg2e16.v v9, (zero)
|
||||
; CHECK-NEXT: bnez a1, .LBB0_1
|
||||
; CHECK-NEXT: .LBB0_2: # %while.body5
|
||||
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
|
||||
; CHECK-NEXT: vse16.v v9, (a0)
|
||||
; CHECK-NEXT: vse16.v v8, (a0)
|
||||
; CHECK-NEXT: j .LBB0_2
|
||||
entry:
|
||||
br label %vector.body
|
||||
|
||||
@ -407,8 +407,9 @@ define { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x
|
||||
define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
|
||||
; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
|
||||
; CHECK-NEXT: vlseg4e8.v v8, (a0)
|
||||
; CHECK-NEXT: li a1, 4
|
||||
; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
|
||||
; CHECK-NEXT: vlse8.v v8, (a0), a1
|
||||
; CHECK-NEXT: ret
|
||||
%vec = load <vscale x 32 x i8>, ptr %p
|
||||
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
|
||||
@ -419,8 +420,10 @@ define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive(ptr %p) {
|
||||
define <vscale x 8 x i8> @vector_deinterleave_load_factor4_oneactive2(ptr %p) {
|
||||
; CHECK-LABEL: vector_deinterleave_load_factor4_oneactive2:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
|
||||
; CHECK-NEXT: vlseg4e8.v v5, (a0)
|
||||
; CHECK-NEXT: addi a0, a0, 3
|
||||
; CHECK-NEXT: li a1, 4
|
||||
; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
|
||||
; CHECK-NEXT: vlse8.v v8, (a0), a1
|
||||
; CHECK-NEXT: ret
|
||||
%vec = load <vscale x 32 x i8>, ptr %p
|
||||
%d0 = call { <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave4(<vscale x 32 x i8> %vec)
|
||||
|
||||
@ -3712,8 +3712,9 @@ define <vscale x 1 x float> @vector_deinterleave_nxv1f32_nxv8f32_oneactive(<vsca
|
||||
; CHECK-NEXT: sub sp, sp, a0
|
||||
; CHECK-NEXT: addi a0, sp, 16
|
||||
; CHECK-NEXT: vs4r.v v8, (a0)
|
||||
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
|
||||
; CHECK-NEXT: vlseg8e32.v v8, (a0)
|
||||
; CHECK-NEXT: li a1, 32
|
||||
; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
|
||||
; CHECK-NEXT: vlse32.v v8, (a0), a1
|
||||
; CHECK-NEXT: csrr a0, vlenb
|
||||
; CHECK-NEXT: slli a0, a0, 2
|
||||
; CHECK-NEXT: add sp, sp, a0
|
||||
@ -3732,9 +3733,11 @@ define <vscale x 1 x float> @vector_deinterleave_nxv1f32_nxv8f32_oneactive2(<vsc
|
||||
; CHECK-NEXT: slli a0, a0, 2
|
||||
; CHECK-NEXT: sub sp, sp, a0
|
||||
; CHECK-NEXT: addi a0, sp, 16
|
||||
; CHECK-NEXT: addi a1, sp, 36
|
||||
; CHECK-NEXT: vs4r.v v8, (a0)
|
||||
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
|
||||
; CHECK-NEXT: vlseg8e32.v v3, (a0)
|
||||
; CHECK-NEXT: li a0, 32
|
||||
; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
|
||||
; CHECK-NEXT: vlse32.v v8, (a1), a0
|
||||
; CHECK-NEXT: csrr a0, vlenb
|
||||
; CHECK-NEXT: slli a0, a0, 2
|
||||
; CHECK-NEXT: add sp, sp, a0
|
||||
|
||||
@ -674,16 +674,20 @@ define <vscale x 2 x i32> @load_factor2_oneactive(ptr %ptr, i32 %evl) {
|
||||
define <vscale x 2 x i32> @load_factor5_oneactive(ptr %ptr, i32 %evl) {
|
||||
; RV32-LABEL: load_factor5_oneactive:
|
||||
; RV32: # %bb.0:
|
||||
; RV32-NEXT: addi a0, a0, 12
|
||||
; RV32-NEXT: li a2, 20
|
||||
; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma
|
||||
; RV32-NEXT: vlseg5e32.v v5, (a0)
|
||||
; RV32-NEXT: vlse32.v v8, (a0), a2
|
||||
; RV32-NEXT: ret
|
||||
;
|
||||
; RV64-LABEL: load_factor5_oneactive:
|
||||
; RV64: # %bb.0:
|
||||
; RV64-NEXT: slli a1, a1, 32
|
||||
; RV64-NEXT: addi a0, a0, 12
|
||||
; RV64-NEXT: srli a1, a1, 32
|
||||
; RV64-NEXT: li a2, 20
|
||||
; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma
|
||||
; RV64-NEXT: vlseg5e32.v v5, (a0)
|
||||
; RV64-NEXT: vlse32.v v8, (a0), a2
|
||||
; RV64-NEXT: ret
|
||||
%rvl = mul nuw i32 %evl, 5
|
||||
%wide.masked.load = call <vscale x 10 x i32> @llvm.vp.load(ptr %ptr, <vscale x 10 x i1> splat (i1 true), i32 %rvl)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user