[X86] LowerVSELECT - split v16i16/v32i8 pre-AVX2 VSELECT ops if enough of the operands are free to split.

Often on AVX1 we're better off consistently using 128-bit instructions, so recognise when the operands are loads that can be freely/cheaply split. Ideally this functionality should move into isFreeToSplitVector, but that helper is currently used in a few places where we don't yet want to split loads.

Based off a regression reported after #92794
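For reference, a minimal hypothetical reduction of the kind of pattern this targets (the function and pointer names below are invented, assuming an x86-64 AVX1-only target, no AVX2):

define void @blend_of_loads(ptr %a0, ptr %a1, ptr %a2) {
  %a = load <16 x i16>, ptr %a0, align 32
  %b = load <16 x i16>, ptr %a1, align 32
  ; On AVX1 the 256-bit compare is itself lowered as two 128-bit compares, so
  ; the mask is cheap to split; %b is a one-use load, so it can be split too.
  %cmp = icmp uge <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %sel = select <16 x i1> %cmp, <16 x i16> %a, <16 x i16> %b
  store <16 x i16> %sel, ptr %a2, align 32
  ret void
}

With this change such a select is split before expansion, so each half lowers to a single 128-bit vpblendvb instead of a 256-bit and/andn/or sequence, as the updated test checks below show.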
Simon Pilgrim 2024-05-31 14:43:00 +01:00
parent 654cd94629
commit b52962d1b8
2 changed files with 36 additions and 20 deletions


@@ -17846,6 +17846,22 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
   }
 
+  // v16i16/v32i8 selects without AVX2, if the condition and another operand
+  // are free to split, then better to split before expanding the
+  // select. Don't bother with XOP as it has the fast VPCMOV instruction.
+  // TODO: This is very similar to narrowVectorSelect.
+  // TODO: Add Load splitting to isFreeToSplitVector ?
+  if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
+      !Subtarget.hasXOP()) {
+    bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
+    bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
+                   (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
+    bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
+                   (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
+    if (FreeCond && (FreeLHS || FreeRHS))
+      return splitVectorOp(Op, DAG, dl);
+  }
+
   // Only some types will be legal on some subtargets. If we can emit a legal
   // VSELECT-matching blend, return Op, and but if we need to expand, return
   // a null value.
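A rough IR-level sketch of the effect of the splitVectorOp call above (the real transform rewrites SelectionDAG nodes via subvector extraction/insertion; the function and value names below are only illustrative): each 256-bit operand is halved and two 128-bit selects are emitted.

define <16 x i16> @split_select(<16 x i1> %c, <16 x i16> %a, <16 x i16> %b) {
  %c.lo = shufflevector <16 x i1> %c, <16 x i1> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %c.hi = shufflevector <16 x i1> %c, <16 x i1> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %a.lo = shufflevector <16 x i16> %a, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %a.hi = shufflevector <16 x i16> %a, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %b.lo = shufflevector <16 x i16> %b, <16 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %b.hi = shufflevector <16 x i16> %b, <16 x i16> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ; Each half is now a v8i16 select that AVX1 can lower to a single vpblendvb.
  %lo = select <8 x i1> %c.lo, <8 x i16> %a.lo, <8 x i16> %b.lo
  %hi = select <8 x i1> %c.hi, <8 x i16> %a.hi, <8 x i16> %b.hi
  %r = shufflevector <8 x i16> %lo, <8 x i16> %hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i16> %r
}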


@@ -1509,16 +1509,16 @@ define void @store_blend_load_v16i16(ptr %a0, ptr %a1, ptr %a2) {
 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1
-; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rdx)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa (%rsi), %xmm4
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: store_blend_load_v16i16:
@@ -1578,16 +1578,16 @@ define void @store_blend_load_v32i8(ptr %a0, ptr %a1, ptr %a2) {
 ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1
-; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rdx)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
+; AVX1-NEXT: vmovdqa (%rsi), %xmm4
+; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5
+; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpblendvb %xmm3, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX1-NEXT: vmovdqa %xmm1, 16(%rdx)
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: store_blend_load_v32i8: