[X86] canonicalizeLaneShuffleWithRepeatedOps - avoid folding vperm2x128(vpshufd(load()),undef) -> vpshufd(vperm2x128(load(),undef)) (#178675)
There's no benefit to letting vperm2x128 handle the fold in a unary shuffle, and llvm-mca assumes there's an extra register dependency, which confuses its analysis. Fixes #178632
This commit is contained in:
parent
7deea9db70
commit
91dad7e7ca
@ -42646,9 +42646,10 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
|
||||
}
|
||||
|
||||
/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
|
||||
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
|
||||
SelectionDAG &DAG,
|
||||
const SDLoc &DL) {
|
||||
static SDValue
|
||||
canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG,
|
||||
const SDLoc &DL,
|
||||
const X86Subtarget &Subtarget) {
|
||||
assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
|
||||
|
||||
MVT VT = V.getSimpleValueType();
|
||||
@ -42679,10 +42680,15 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
case X86ISD::PSHUFD:
|
||||
// Don't prefer a folded vpermf128 to a folded pshufd/vpermilp.
|
||||
if ((Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) &&
|
||||
X86::mayFoldLoad(Src0.getOperand(0), Subtarget))
|
||||
break;
|
||||
[[fallthrough]];
|
||||
case X86ISD::VSHLI:
|
||||
case X86ISD::VSRLI:
|
||||
case X86ISD::VSRAI:
|
||||
case X86ISD::PSHUFD:
|
||||
if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
|
||||
SDValue LHS = Src0.getOperand(0);
|
||||
SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
|
||||
@ -43219,7 +43225,8 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
|
||||
}
|
||||
|
||||
// Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
|
||||
if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
|
||||
if (SDValue Res =
|
||||
canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL, Subtarget))
|
||||
return Res;
|
||||
|
||||
// Fold vperm2x128 subvector shuffle with an inner concat pattern.
|
||||
|
||||
@ -129,8 +129,8 @@ define <4 x double> @merge_4f64_f64_45zz(ptr %ptr) nounwind uwtable noinline ssp
|
||||
define <4 x double> @merge_v4f64_f64_3210(ptr %ptr) nounwind uwtable noinline ssp {
|
||||
; AVX1-LABEL: merge_v4f64_f64_3210:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
|
||||
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,3,2]
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: merge_v4f64_f64_3210:
|
||||
@ -146,8 +146,8 @@ define <4 x double> @merge_v4f64_f64_3210(ptr %ptr) nounwind uwtable noinline ss
|
||||
; X86-AVX-LABEL: merge_v4f64_f64_3210:
|
||||
; X86-AVX: # %bb.0:
|
||||
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
|
||||
; X86-AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
|
||||
; X86-AVX-NEXT: vpermilpd {{.*#+}} ymm0 = mem[1,0,3,2]
|
||||
; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
; X86-AVX-NEXT: retl
|
||||
%ptr0 = getelementptr inbounds double, ptr %ptr, i64 3
|
||||
%ptr1 = getelementptr inbounds double, ptr %ptr, i64 2
|
||||
@ -421,8 +421,8 @@ define <8 x float> @merge_8f32_f32_1u3u5zu8(ptr %ptr) nounwind uwtable noinline
|
||||
define <8 x float> @merge_8f32_f32_76543210(ptr %ptr) nounwind uwtable noinline ssp {
|
||||
; AVX1-LABEL: merge_8f32_f32_76543210:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
|
||||
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
|
||||
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: merge_8f32_f32_76543210:
|
||||
@ -440,8 +440,8 @@ define <8 x float> @merge_8f32_f32_76543210(ptr %ptr) nounwind uwtable noinline
|
||||
; X86-AVX-LABEL: merge_8f32_f32_76543210:
|
||||
; X86-AVX: # %bb.0:
|
||||
; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3,0,1]
|
||||
; X86-AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
|
||||
; X86-AVX-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,1,0,7,6,5,4]
|
||||
; X86-AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
|
||||
; X86-AVX-NEXT: retl
|
||||
%ptr0 = getelementptr inbounds float, ptr %ptr, i64 7
|
||||
%ptr1 = getelementptr inbounds float, ptr %ptr, i64 6
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user