From 5ccfc9d6b9d65c352d348c4249d07da7eea8c8e4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 15 Mar 2026 16:41:49 +0000 Subject: [PATCH] [X86] combineConcatVectorOps - concat(vtruncs(x),vtruncs(y)) -> packss(shuffle(x,y),shuffle(x,y)) (#186678) Although at worst this isn't a reduction in instruction count, the shuffle/packss sequence is much easier for further folds / shuffle combining --- llvm/lib/Target/X86/X86ISelLowering.cpp | 19 +++++++++++++++++++ llvm/test/CodeGen/X86/masked_packss.ll | 18 ++---------------- llvm/test/CodeGen/X86/packss.ll | 16 ++-------------- 3 files changed, 23 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4fbbf63c3906..8c1c0cbd053a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -60065,6 +60065,25 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, } } break; + case X86ISD::VTRUNCS: + if (!IsSplat && NumOps == 2 && VT.is512BitVector() && + Subtarget.useBWIRegs()) { + MVT SrcVT = Ops[0].getOperand(0).getSimpleValueType(); + if (SrcVT.is512BitVector() && + SrcVT == Ops[1].getOperand(0).getSimpleValueType() && + SrcVT.getScalarSizeInBits() <= 32 && + (VT.getScalarSizeInBits() * 2 == SrcVT.getScalarSizeInBits())) { + SDValue N0 = DAG.getBitcast(MVT::v8i64, Ops[0].getOperand(0)); + SDValue N1 = DAG.getBitcast(MVT::v8i64, Ops[1].getOperand(0)); + SDValue LHS = DAG.getVectorShuffle(MVT::v8i64, DL, N0, N1, + {0, 1, 4, 5, 8, 9, 12, 13}); + SDValue RHS = DAG.getVectorShuffle(MVT::v8i64, DL, N0, N1, + {2, 3, 6, 7, 10, 11, 14, 15}); + return DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, LHS), + DAG.getBitcast(SrcVT, RHS)); + } + } + break; case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: diff --git a/llvm/test/CodeGen/X86/masked_packss.ll b/llvm/test/CodeGen/X86/masked_packss.ll index d84eaeaae60b..f4e9063a7baf 100644 --- a/llvm/test/CodeGen/X86/masked_packss.ll +++ b/llvm/test/CodeGen/X86/masked_packss.ll @@ -75,15 +75,8 @@ define <64 x i8> @_mm512_mask_packss_epi16_manual(<64 x i8> %src, i64 noundef %k ; ; AVX512-LABEL: _mm512_mask_packss_epi16_manual: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15] -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,9,2,3,10,11] -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vpmovswb %zmm4, %ymm1 -; AVX512-NEXT: vpmovswb %zmm3, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-NEXT: kmovq %rdi, %k1 -; AVX512-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vpacksswb %zmm2, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> %minv = tail call <64 x i16> @llvm.smax.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 -128)) @@ -167,15 +160,8 @@ define <32 x i16> @_mm512_mask_packss_epi32_manual(<32 x i16> %src, i32 noundef ; ; AVX512-LABEL: _mm512_mask_packss_epi32_manual: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15] -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,9,2,3,10,11] -; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm4 -; AVX512-NEXT: vpmovsdw %zmm4, %ymm1 -; AVX512-NEXT: vpmovsdw %zmm3, %ymm2 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512-NEXT: kmovd %edi, %k1 -; AVX512-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vpackssdw %zmm2, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> %minv = tail call <32 x i32> @llvm.smax.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 -32768)) diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll index da739dc277f6..91e4b9b463b0 100644 --- a/llvm/test/CodeGen/X86/packss.ll +++ b/llvm/test/CodeGen/X86/packss.ll @@ -535,13 +535,7 @@ define <64 x i8> @_mm512_packss_epi16_manual(<32 x i16> %a, <32 x i16> %b) nounw ; ; AVX512-LABEL: _mm512_packss_epi16_manual: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovswb %zmm3, %ymm0 -; AVX512-NEXT: vpmovswb %zmm2, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: ret{{[l|q]}} ; ; X64-SSE-LABEL: _mm512_packss_epi16_manual: @@ -688,13 +682,7 @@ define <32 x i16> @_mm512_packss_epi32_manual(<16 x i32> %a, <16 x i32> %b) noun ; ; AVX512-LABEL: _mm512_packss_epi32_manual: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11] -; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 -; AVX512-NEXT: vpmovsdw %zmm3, %ymm0 -; AVX512-NEXT: vpmovsdw %zmm2, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 ; AVX512-NEXT: ret{{[l|q]}} ; ; X64-SSE-LABEL: _mm512_packss_epi32_manual: