From 91dad7e7cab490693bbfeaf7a4c454bd779ae82f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 29 Jan 2026 15:52:09 +0000 Subject: [PATCH] [X86] canonicalizeLaneShuffleWithRepeatedOps - avoid folding vperm2x128(vpshufd(load()),undef) -> vpshufd(vperm2x128(load(),undef)) (#178675) There's no benefit to letting vperm2x128 handle the fold in a unary shuffle and llvm-mca assumes there's an extra register dependency, which confuses analysis. Fixes #178632 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 17 ++++++++++++----- .../CodeGen/X86/merge-consecutive-loads-256.ll | 16 ++++++++-------- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 88ee0c7cc531..144d6451b981 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -42646,9 +42646,10 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, } /// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()). -static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, - SelectionDAG &DAG, - const SDLoc &DL) { +static SDValue +canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, + const SDLoc &DL, + const X86Subtarget &Subtarget) { assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle"); MVT VT = V.getSimpleValueType(); @@ -42679,10 +42680,15 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, break; } [[fallthrough]]; + case X86ISD::PSHUFD: + // Don't prefer a folded vpermf128 to a folded pshufd/vpermilp. + if ((Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) && + X86::mayFoldLoad(Src0.getOperand(0), Subtarget)) + break; + [[fallthrough]]; case X86ISD::VSHLI: case X86ISD::VSRLI: case X86ISD::VSRAI: - case X86ISD::PSHUFD: if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) { SDValue LHS = Src0.getOperand(0); SDValue RHS = Src1.isUndef() ? 
Src1 : Src1.getOperand(0); @@ -43219,7 +43225,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, } // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()). - if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL)) + if (SDValue Res = + canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL, Subtarget)) return Res; // Fold vperm2x128 subvector shuffle with an inner concat pattern. diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll index 6ad306d2e656..30853582e0c2 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-256.ll @@ -129,8 +129,8 @@ define <4 x double> @merge_4f64_f64_45zz(ptr %ptr) nounwind uwtable noinline ssp define <4 x double> @merge_v4f64_f64_3210(ptr %ptr) nounwind uwtable noinline ssp { ; AVX1-LABEL: merge_v4f64_f64_3210: ; AVX1: # %bb.0: -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] -; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,3,2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: merge_v4f64_f64_3210: @@ -146,8 +146,8 @@ define <4 x double> @merge_v4f64_f64_3210(ptr %ptr) nounwind uwtable noinline ss ; X86-AVX-LABEL: merge_v4f64_f64_3210: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] -; X86-AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; X86-AVX-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,3,2] +; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; X86-AVX-NEXT: retl %ptr0 = getelementptr inbounds double, ptr %ptr, i64 3 %ptr1 = getelementptr inbounds double, ptr %ptr, i64 2 @@ -421,8 +421,8 @@ define <8 x float> @merge_8f32_f32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline define <8 x float> @merge_8f32_f32_76543210(ptr %ptr) nounwind uwtable noinline ssp { ; AVX1-LABEL: merge_8f32_f32_76543210: ; AVX1: # 
%bb.0: -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: merge_8f32_f32_76543210: @@ -440,8 +440,8 @@ define <8 x float> @merge_8f32_f32_76543210(ptr %ptr) nounwind uwtable noinline ; X86-AVX-LABEL: merge_8f32_f32_76543210: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1] -; X86-AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; X86-AVX-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4] +; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] ; X86-AVX-NEXT: retl %ptr0 = getelementptr inbounds float, ptr %ptr, i64 7 %ptr1 = getelementptr inbounds float, ptr %ptr, i64 6