[X86] LowerVSELECT - split v16i16/v32i8 pre-AVX2 VSELECT ops if enough of the operands are free to split.

Often on AVX1 we're better off consistently using 128-bit instructions, so recognise when the operands are loads that can be freely/cheaply split. Ideally this functionality should move into isFreeToSplitVector, but that helper is currently used in a few places where we don't yet want to split loads.

Based off a regression reported after #92794
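For reference, a minimal hypothetical reduction of the kind of pattern this targets (the function and pointer names below are invented, assuming an x86-64 AVX1-only target, no AVX2):

define void @blend_of_loads(ptr %a0, ptr %a1, ptr %a2) {
  %a = load <16 x i16>, ptr %a0, align 32
  %b = load <16 x i16>, ptr %a1, align 32
  ; On AVX1 the 256-bit compare is itself lowered as two 128-bit compares, so
  ; the mask is cheap to split; %b is a one-use load, so it can be split too.
  %cmp = icmp uge <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b
  store <16 x i16> %sel, ptr %a2, align 32
  ret void
}

With this change such a select is split before expansion, so each half lowers to a single 128-bit vpblendvb instead of a 256-bit and/andn/or sequence, as the updated test checks below show.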
Simon Pilgrim 2024-05-31 14:43:00 +01:00
parent 654cd94629
commit b52962d1b8
2 changed files with 36 additions and 20 deletions


@@ -17846,6 +17846,22 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
   }
 
+  // v16i16/v32i8 selects without AVX2, if the condition and another operand
+  // are free to split, then better to split before expanding the
+  // select. Don't bother with XOP as it has the fast VPCMOV instruction.
+  // TODO: This is very similar to narrowVectorSelect.
+  // TODO: Add Load splitting to isFreeToSplitVector ?
+  if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
+      !Subtarget.hasXOP()) {
+    bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
+    bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
+                   (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
+    bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
+                   (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
+    if (FreeCond && (FreeLHS || FreeRHS))
+      return splitVectorOp(Op, DAG, dl);
+  }
+
   // Only some types will be legal on some subtargets. If we can emit a legal
   // VSELECT-matching blend, return Op, and but if we need to expand, return
   // a null value.
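A rough IR-level sketch of the effect of the splitVectorOp call above (the real transform rewrites SelectionDAG nodes via subvector extraction/insertion; the function and value names below are only illustrative): each 256-bit operand is halved and two 128-bit selects are emitted.

define <16 x i16> @split_select(<16 x i1> %c, <16 x i16> %a, <16 x i16> %b) {
  %c.lo = shufflevector <16 x i1> %c, <16 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <16 x i1> %c, <16 x i1> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %a.lo = shufflevector <16 x i16> %a, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %a.hi = shufflevector <16 x i16> %a, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.lo = shufflevector <16 x i16> %b, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b.hi = shufflevector <16 x i16> %b, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Each half is now a v8i16 select that AVX1 can lower to a single vpblendvb.
  %lo = select <8 x i1> %c.lo, <8 x i16> %a.lo, <8 x i16> %b.lo
  %hi = select <8 x i1> %c.hi, <8 x i16> %a.hi, <8 x i16> %b.hi
  %r = shufflevector <8 x i16> %lo, <8 x i16> %hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %r
}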


@@ -1509,16 +1509,16 @@ define void @store_blend_load_v16i16(ptr %a0, ptr %a1, ptr %a2) {
 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1
-; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rdx)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa (%rsi), %xmm4
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: store_blend_load_v16i16:
@@ -1578,16 +1578,16 @@ define void @store_blend_load_v32i8(ptr %a0, ptr %a1, ptr %a2) {
 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1
-; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rdx)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa (%rsi), %xmm4
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: store_blend_load_v32i8: