From dc1e3e5dbf7805ff060c84135a63fc7239c52814 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 3 Mar 2026 20:52:38 +0000 Subject: [PATCH] [X86] getFauxShuffleMask - add ISD::ROTL/ROTR handling (#184417) Very similar to the existing X86ISD::VROTLI/VROTRI handling Prep work for #184002 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 31 +++++++++++++++++++ .../X86/vector-shuffle-combining-xop.ll | 30 +++++------------- 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8c570aff23f5..ba5d1d4105a6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -6918,6 +6918,37 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, } return true; } + case ISD::ROTL: + case ISD::ROTR: { + APInt UndefElts; + SmallVector<APInt> EltBits; + if (!getTargetConstantBitsFromNode(N.getOperand(1), NumBitsPerElt, + UndefElts, EltBits, + /*AllowWholeUndefs*/ true, + /*AllowPartialUndefs*/ false)) + return false; + + // We can only decode 'whole byte' bit rotates as shuffles. + for (unsigned I = 0; I != NumElts; ++I) + if (DemandedElts[I] && !UndefElts[I] && + (EltBits[I].urem(NumBitsPerElt) % 8) != 0) + return false; + + Ops.push_back(N.getOperand(0)); + for (unsigned I = 0; I != NumElts; ++I) { + if (!DemandedElts[I] || UndefElts[I]) { + Mask.append(NumBytesPerElt, SM_SentinelUndef); + continue; + } + int Offset = EltBits[I].urem(NumBitsPerElt) / 8; + Offset = (ISD::ROTL == Opcode ? NumBytesPerElt - Offset : Offset); + int BaseIdx = I * NumBytesPerElt; + for (int J = 0; J != (int)NumBytesPerElt; ++J) { + Mask.push_back(BaseIdx + ((Offset + J) % NumBytesPerElt)); + } + } + return true; + } case X86ISD::VROTLI: case X86ISD::VROTRI: { // We can only decode 'whole byte' bit rotates as shuffles. 
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll index 61169af11e40..ee4a7e1413f6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -250,17 +250,10 @@ define <16 x i8> @combine_vpperm_as_proti_v8i16(<16 x i8> %a0, <16 x i8> %a1) { } define <16 x i8> @combine_shuffle_prot_v2i64(<2 x i64> %a0) { -; X86-LABEL: combine_shuffle_prot_v2i64: -; X86: # %bb.0: -; X86-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; X86-NEXT: retl -; -; X64-LABEL: combine_shuffle_prot_v2i64: -; X64: # %bb.0: -; X64-NEXT: vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; X64-NEXT: retq +; CHECK-LABEL: combine_shuffle_prot_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,8,15,14,13,12,11,10,6,5,4,3,2,1,0,7] +; CHECK-NEXT: ret{{[l|q]}} %1 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a0, <2 x i64> %a0, <2 x i64> <i64 56, i64 16>) %2 = bitcast <2 x i64> %1 to <16 x i8> %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> @@ -280,17 +273,10 @@ define <16 x i8> @combine_shuffle_proti_v2i64(<2 x i64> %a0) { declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) define <16 x i8> @combine_shuffle_prot_v4i32(<4 x i32> %a0) { -; X86-LABEL: combine_shuffle_prot_v4i32: -; X86: # %bb.0: -; X86-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; X86-NEXT: retl -; -; X64-LABEL: combine_shuffle_prot_v4i32: -; X64: # %bb.0: -; X64-NEXT: vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] -; X64-NEXT: retq +; CHECK-LABEL: combine_shuffle_prot_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb {{.*#+}} 
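xmm0 = xmm0[12,15,14,13,9,8,11,10,6,5,4,7,3,2,1,0] +; CHECK-NEXT: ret{{[l|q]}} %1 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a0, <4 x i32> %a0, <4 x i32> <i32 0, i32 8, i32 16, i32 24>) %2 = bitcast <4 x i32> %1 to <16 x i8> %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>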