From 04abe44524ce960186cc337cae44edb000b63e4c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 10 Mar 2026 10:08:02 +0000 Subject: [PATCH] [X86] Enable i256 fshl/fshr lowering on avx512 targets (#185455) If freely foldable to vector, concat the i256 halves as v4i64 -> v8i64 and use the existing i512 shift lowering Part of #184828 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 33 + llvm/test/CodeGen/X86/funnel-shift-i256.ll | 914 ++++++++------------- 2 files changed, 367 insertions(+), 580 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4ce343922a02..328e79eb07ca 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1891,6 +1891,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRL, MVT::i512, Custom); setOperationAction(ISD::SHL, MVT::i512, Custom); setOperationAction(ISD::SRA, MVT::i512, Custom); + setOperationAction(ISD::FSHR, MVT::i256, Custom); + setOperationAction(ISD::FSHL, MVT::i256, Custom); setOperationAction(ISD::SELECT, MVT::i512, Custom); for (MVT VT : { MVT::v16i1, MVT::v16i8 }) { @@ -2953,6 +2955,8 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG, case ISD::XOR: case ISD::ADD: case ISD::SUB: + case ISD::FSHL: + case ISD::FSHR: return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget) && mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget); case ISD::SELECT: @@ -34519,6 +34523,35 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getBitcast(VT, Res)); return; } + case ISD::FSHL: + case ISD::FSHR: { + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + SDValue Amt = N->getOperand(2); + assert(Subtarget.useAVX512Regs() && "AVX512F required"); + assert(VT == MVT::i256 && "Unexpected VT!"); + if (!mayFoldIntoVector(Op0, DAG, Subtarget) || + !mayFoldIntoVector(Op1, DAG, Subtarget)) + return; + + // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw. + // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))). + SDValue Res = DAG.getBitcast( + MVT::i512, concatSubVectors(DAG.getBitcast(MVT::v4i64, Op1), + DAG.getBitcast(MVT::v4i64, Op0), DAG, dl)); + Amt = DAG.getNode(ISD::AND, dl, Amt.getValueType(), Amt, + DAG.getConstant(255, dl, Amt.getValueType())); + if (Opc == ISD::FSHL) { + Res = DAG.getNode(ISD::SHL, dl, MVT::i512, Res, Amt); + Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res, + DAG.getShiftAmountConstant(256, MVT::i512, dl)); + } else { + Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res, Amt); + } + Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i256, Res)); + return; + } case ISD::CTPOP: { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); diff --git a/llvm/test/CodeGen/X86/funnel-shift-i256.ll b/llvm/test/CodeGen/X86/funnel-shift-i256.ll index 7b787ab6a7a5..549b6e3fc0dd 100644 --- a/llvm/test/CodeGen/X86/funnel-shift-i256.ll +++ b/llvm/test/CodeGen/X86/funnel-shift-i256.ll @@ -466,112 +466,76 @@ define i256 @fshl_i256_load(ptr %p0, ptr %p1, i256 %a2) nounwind { ; ; AVX512F-LABEL: fshl_i256_load: ; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: movq (%rsi), %r9 -; AVX512F-NEXT: movq 8(%rsi), %r8 -; AVX512F-NEXT: movq 24(%rdx), %r10 -; AVX512F-NEXT: testb $-128, %cl -; AVX512F-NEXT: movq 8(%rdx), %rbx -; AVX512F-NEXT: cmoveq %r10, %rbx -; AVX512F-NEXT: movq 16(%rdx), %r11 -; AVX512F-NEXT: movq (%rdx), %rdx -; AVX512F-NEXT: cmoveq %r11, %rdx -; AVX512F-NEXT: cmoveq %r9, %r11 -; AVX512F-NEXT: cmoveq %r8, %r10 -; AVX512F-NEXT: cmoveq 16(%rsi), %r9 -; AVX512F-NEXT: cmoveq 24(%rsi), %r8 ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: testb $64, %cl -; AVX512F-NEXT: cmovneq %r9, %r8 -; AVX512F-NEXT: cmovneq %r10, %r9 -; AVX512F-NEXT: cmovneq %r11, %r10 -; AVX512F-NEXT: cmoveq %rbx, %rdx -; AVX512F-NEXT: cmovneq %rbx, %r11 -; AVX512F-NEXT: movq %r11, %rsi -; AVX512F-NEXT: shldq %cl, %rdx, %rsi -; AVX512F-NEXT: movq %r10, %rdx -; AVX512F-NEXT: shldq %cl, %r11, %rdx -; AVX512F-NEXT: movq %r9, %rdi -; AVX512F-NEXT: shldq %cl, %r10, %rdi -; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512F-NEXT: shldq %cl, %r9, %r8 -; AVX512F-NEXT: movq %r8, 24(%rax) -; AVX512F-NEXT: movq %rdi, 16(%rax) -; AVX512F-NEXT: movq %rdx, 8(%rax) -; AVX512F-NEXT: movq %rsi, (%rax) -; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: vmovdqu (%rdx), %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512F-NEXT: movzbl %cl, %ecx +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: # kill: def $ecx killed $ecx killed $rcx +; AVX512F-NEXT: shrl $6, %ecx +; AVX512F-NEXT: movl $-1, %edx +; AVX512F-NEXT: shlxl %ecx, %edx, %ecx +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpsllq %xmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6] +; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fshl_i256_load: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movq (%rsi), %rax -; AVX512VL-NEXT: movq 8(%rsi), %r8 -; AVX512VL-NEXT: movq 16(%rdx), %r10 -; AVX512VL-NEXT: movq 24(%rdx), %r9 -; AVX512VL-NEXT: testb $-128, %cl -; AVX512VL-NEXT: movq 8(%rdx), %r11 -; AVX512VL-NEXT: cmoveq %r9, %r11 -; AVX512VL-NEXT: movq (%rdx), %rdx -; AVX512VL-NEXT: cmoveq %r10, %rdx -; AVX512VL-NEXT: cmoveq %rax, %r10 -; AVX512VL-NEXT: cmoveq %r8, %r9 -; AVX512VL-NEXT: cmoveq 16(%rsi), %rax -; AVX512VL-NEXT: cmoveq 24(%rsi), %r8 -; AVX512VL-NEXT: testb $64, %cl -; AVX512VL-NEXT: cmovneq %rax, %r8 -; AVX512VL-NEXT: cmovneq %r9, %rax -; AVX512VL-NEXT: cmovneq %r10, %r9 -; AVX512VL-NEXT: cmoveq %r11, %rdx -; AVX512VL-NEXT: cmovneq %r11, %r10 -; AVX512VL-NEXT: movq %r10, %rsi -; AVX512VL-NEXT: shldq %cl, %rdx, %rsi -; AVX512VL-NEXT: movq %r9, %rdx -; AVX512VL-NEXT: shldq %cl, %r10, %rdx -; AVX512VL-NEXT: movq %rax, %r10 -; AVX512VL-NEXT: shldq %cl, %r9, %r10 -; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VL-NEXT: shldq %cl, %rax, %r8 ; AVX512VL-NEXT: movq %rdi, %rax -; AVX512VL-NEXT: movq %r8, 24(%rdi) -; AVX512VL-NEXT: movq %r10, 16(%rdi) -; AVX512VL-NEXT: movq %rdx, 8(%rdi) -; AVX512VL-NEXT: movq %rsi, (%rdi) +; AVX512VL-NEXT: vmovdqu (%rdx), %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512VL-NEXT: movzbl %cl, %ecx +; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm1 +; AVX512VL-NEXT: # kill: def $ecx killed $ecx killed $rcx +; AVX512VL-NEXT: shrl $6, %ecx +; AVX512VL-NEXT: movl $-1, %edx +; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx +; AVX512VL-NEXT: kmovd %ecx, %k1 +; AVX512VL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VL-NEXT: vpsllq %xmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6] +; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512VBMI-LABEL: fshl_i256_load: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: movq (%rsi), %rax -; AVX512VBMI-NEXT: movq 8(%rsi), %r8 -; AVX512VBMI-NEXT: movq 16(%rdx), %r10 -; AVX512VBMI-NEXT: movq 24(%rdx), %r9 -; AVX512VBMI-NEXT: testb $-128, %cl -; AVX512VBMI-NEXT: movq 8(%rdx), %r11 -; AVX512VBMI-NEXT: cmoveq %r9, %r11 -; AVX512VBMI-NEXT: movq (%rdx), %rdx -; AVX512VBMI-NEXT: cmoveq %r10, %rdx -; AVX512VBMI-NEXT: cmoveq %rax, %r10 -; AVX512VBMI-NEXT: cmoveq %r8, %r9 -; AVX512VBMI-NEXT: cmoveq 16(%rsi), %rax -; AVX512VBMI-NEXT: cmoveq 24(%rsi), %r8 -; AVX512VBMI-NEXT: testb $64, %cl -; AVX512VBMI-NEXT: cmovneq %rax, %r8 -; AVX512VBMI-NEXT: cmovneq %r9, %rax -; AVX512VBMI-NEXT: cmovneq %r10, %r9 -; AVX512VBMI-NEXT: cmoveq %r11, %rdx -; AVX512VBMI-NEXT: cmovneq %r11, %r10 -; AVX512VBMI-NEXT: movq %r10, %rsi -; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi -; AVX512VBMI-NEXT: movq %r9, %rdx -; AVX512VBMI-NEXT: shldq %cl, %r10, %rdx -; AVX512VBMI-NEXT: movq %rax, %r10 -; AVX512VBMI-NEXT: shldq %cl, %r9, %r10 -; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VBMI-NEXT: shldq %cl, %rax, %r8 ; AVX512VBMI-NEXT: movq %rdi, %rax -; AVX512VBMI-NEXT: movq %r8, 24(%rdi) -; AVX512VBMI-NEXT: movq %r10, 16(%rdi) -; AVX512VBMI-NEXT: movq %rdx, 8(%rdi) -; AVX512VBMI-NEXT: movq %rsi, (%rdi) +; AVX512VBMI-NEXT: vmovdqu (%rdx), %ymm0 +; AVX512VBMI-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512VBMI-NEXT: movzbl %cl, %ecx +; AVX512VBMI-NEXT: movl %ecx, %edx +; AVX512VBMI-NEXT: shrl $6, %edx +; AVX512VBMI-NEXT: movl $-1, %esi +; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx +; AVX512VBMI-NEXT: kmovd %edx, %k1 +; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1 +; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm0[0,1,2,3,4,5,6] +; AVX512VBMI-NEXT: vpshldvq %zmm1, %zmm2, %zmm0 +; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, (%rdi) +; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq %a0 = load i256, ptr %p0 %a1 = load i256, ptr %p1 @@ -650,103 +614,72 @@ define i256 @fshr_i256_load(ptr %p0, ptr %p1, i256 %a2) nounwind { ; ; AVX512F-LABEL: fshr_i256_load: ; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: movq (%rsi), %r8 -; AVX512F-NEXT: movq 8(%rsi), %r10 -; AVX512F-NEXT: movq 16(%rdx), %r9 -; AVX512F-NEXT: testb %cl, %cl -; AVX512F-NEXT: movq (%rdx), %rbx -; AVX512F-NEXT: cmovsq %r9, %rbx -; AVX512F-NEXT: movq 24(%rdx), %r11 -; AVX512F-NEXT: movq 8(%rdx), %rdx -; AVX512F-NEXT: cmovsq %r11, %rdx -; AVX512F-NEXT: cmovsq %r8, %r9 -; AVX512F-NEXT: cmovsq %r10, %r11 -; AVX512F-NEXT: cmovsq 16(%rsi), %r8 -; AVX512F-NEXT: cmovsq 24(%rsi), %r10 +; AVX512F-NEXT: vmovdqu (%rdx), %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512F-NEXT: movzbl %cl, %eax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: shrl $6, %eax +; AVX512F-NEXT: movl $-1, %ecx +; AVX512F-NEXT: shlxl %eax, %ecx, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpsrlq %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0] +; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: testb $64, %cl -; AVX512F-NEXT: cmoveq %r8, %r10 -; AVX512F-NEXT: cmoveq %r11, %r8 -; AVX512F-NEXT: cmoveq %r9, %r11 -; AVX512F-NEXT: cmoveq %rdx, %r9 -; AVX512F-NEXT: cmoveq %rbx, %rdx -; AVX512F-NEXT: shrdq %cl, %r9, %rdx -; AVX512F-NEXT: shrdq %cl, %r11, %r9 -; AVX512F-NEXT: shrdq %cl, %r8, %r11 -; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512F-NEXT: shrdq %cl, %r10, %r8 -; AVX512F-NEXT: movq %r8, 24(%rdi) -; AVX512F-NEXT: movq %r11, 16(%rdi) -; AVX512F-NEXT: movq %r9, 8(%rdi) -; AVX512F-NEXT: movq %rdx, (%rdi) -; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fshr_i256_load: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movq (%rsi), %r8 -; AVX512VL-NEXT: movq 8(%rsi), %rax -; AVX512VL-NEXT: movq 16(%rdx), %r9 -; AVX512VL-NEXT: movq 24(%rdx), %r10 -; AVX512VL-NEXT: testb %cl, %cl -; AVX512VL-NEXT: movq (%rdx), %r11 -; AVX512VL-NEXT: cmovsq %r9, %r11 -; AVX512VL-NEXT: movq 8(%rdx), %rdx -; AVX512VL-NEXT: cmovsq %r10, %rdx -; AVX512VL-NEXT: cmovsq %r8, %r9 -; AVX512VL-NEXT: cmovsq %rax, %r10 -; AVX512VL-NEXT: cmovsq 16(%rsi), %r8 -; AVX512VL-NEXT: cmovsq 24(%rsi), %rax -; AVX512VL-NEXT: testb $64, %cl -; AVX512VL-NEXT: cmoveq %r8, %rax -; AVX512VL-NEXT: cmoveq %r10, %r8 -; AVX512VL-NEXT: cmoveq %r9, %r10 -; AVX512VL-NEXT: cmoveq %rdx, %r9 -; AVX512VL-NEXT: cmoveq %r11, %rdx -; AVX512VL-NEXT: shrdq %cl, %r9, %rdx -; AVX512VL-NEXT: shrdq %cl, %r10, %r9 -; AVX512VL-NEXT: shrdq %cl, %r8, %r10 -; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VL-NEXT: shrdq %cl, %rax, %r8 ; AVX512VL-NEXT: movq %rdi, %rax -; AVX512VL-NEXT: movq %r8, 24(%rdi) -; AVX512VL-NEXT: movq %r10, 16(%rdi) -; AVX512VL-NEXT: movq %r9, 8(%rdi) -; AVX512VL-NEXT: movq %rdx, (%rdi) +; AVX512VL-NEXT: vmovdqu (%rdx), %ymm0 +; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512VL-NEXT: movzbl %cl, %ecx +; AVX512VL-NEXT: movl %ecx, %edx +; AVX512VL-NEXT: shrl $6, %edx +; AVX512VL-NEXT: movl $-1, %esi +; AVX512VL-NEXT: shlxl %edx, %esi, %edx +; AVX512VL-NEXT: kmovd %edx, %k1 +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] +; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm2 +; AVX512VL-NEXT: vpand %xmm1, %xmm2, %xmm3 +; AVX512VL-NEXT: vpsrlq %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0] +; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vpandn %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512VBMI-LABEL: fshr_i256_load: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: movq (%rsi), %r8 -; AVX512VBMI-NEXT: movq 8(%rsi), %rax -; AVX512VBMI-NEXT: movq 16(%rdx), %r9 -; AVX512VBMI-NEXT: movq 24(%rdx), %r10 -; AVX512VBMI-NEXT: testb %cl, %cl -; AVX512VBMI-NEXT: movq (%rdx), %r11 -; AVX512VBMI-NEXT: cmovsq %r9, %r11 -; AVX512VBMI-NEXT: movq 8(%rdx), %rdx -; AVX512VBMI-NEXT: cmovsq %r10, %rdx -; AVX512VBMI-NEXT: cmovsq %r8, %r9 -; AVX512VBMI-NEXT: cmovsq %rax, %r10 -; AVX512VBMI-NEXT: cmovsq 16(%rsi), %r8 -; AVX512VBMI-NEXT: cmovsq 24(%rsi), %rax -; AVX512VBMI-NEXT: testb $64, %cl -; AVX512VBMI-NEXT: cmoveq %r8, %rax -; AVX512VBMI-NEXT: cmoveq %r10, %r8 -; AVX512VBMI-NEXT: cmoveq %r9, %r10 -; AVX512VBMI-NEXT: cmoveq %rdx, %r9 -; AVX512VBMI-NEXT: cmoveq %r11, %rdx -; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx -; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9 -; AVX512VBMI-NEXT: shrdq %cl, %r8, %r10 -; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VBMI-NEXT: shrdq %cl, %rax, %r8 ; AVX512VBMI-NEXT: movq %rdi, %rax -; AVX512VBMI-NEXT: movq %r8, 24(%rdi) -; AVX512VBMI-NEXT: movq %r10, 16(%rdi) -; AVX512VBMI-NEXT: movq %r9, 8(%rdi) -; AVX512VBMI-NEXT: movq %rdx, (%rdi) +; AVX512VBMI-NEXT: vmovdqu (%rdx), %ymm0 +; AVX512VBMI-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 +; AVX512VBMI-NEXT: movzbl %cl, %ecx +; AVX512VBMI-NEXT: movl %ecx, %edx +; AVX512VBMI-NEXT: shrl $6, %edx +; AVX512VBMI-NEXT: movl $-1, %esi +; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx +; AVX512VBMI-NEXT: kmovd %edx, %k1 +; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1 +; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0] +; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm2, %zmm0 +; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq %a0 = load i256, ptr %p0 %a1 = load i256, ptr %p1 @@ -1235,132 +1168,75 @@ define i256 @fshl_i256_vector(<4 x i64> %v0, <4 x i64> %v1, i256 %a2) nounwind { ; ; AVX512F-LABEL: fshl_i256_vector: ; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: movq %rsi, %rcx -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512F-NEXT: vmovq %xmm2, %rsi -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vmovq %xmm0, %r11 -; AVX512F-NEXT: vmovq %xmm1, %r9 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rbx -; AVX512F-NEXT: vmovq %xmm0, %r10 -; AVX512F-NEXT: vpextrq $1, %xmm0, %r8 -; AVX512F-NEXT: testb $-128, %cl -; AVX512F-NEXT: cmoveq %r8, %rbx -; AVX512F-NEXT: cmoveq %r10, %r9 -; AVX512F-NEXT: cmoveq %r11, %r10 -; AVX512F-NEXT: cmoveq %rax, %r8 -; AVX512F-NEXT: cmovneq %r11, %rsi -; AVX512F-NEXT: cmovneq %rax, %rdx +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: testb $64, %cl -; AVX512F-NEXT: cmovneq %rsi, %rdx -; AVX512F-NEXT: cmovneq %r8, %rsi -; AVX512F-NEXT: cmovneq %r10, %r8 -; AVX512F-NEXT: cmoveq %rbx, %r9 -; AVX512F-NEXT: cmovneq %rbx, %r10 -; AVX512F-NEXT: movq %r10, %rdi -; AVX512F-NEXT: shldq %cl, %r9, %rdi -; AVX512F-NEXT: movq %r8, %r9 -; AVX512F-NEXT: shldq %cl, %r10, %r9 -; AVX512F-NEXT: movq %rsi, %r10 -; AVX512F-NEXT: shldq %cl, %r8, %r10 -; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512F-NEXT: shldq %cl, %rsi, %rdx -; AVX512F-NEXT: movq %rdx, 24(%rax) -; AVX512F-NEXT: movq %r10, 16(%rax) -; AVX512F-NEXT: movq %r9, 8(%rax) -; AVX512F-NEXT: movq %rdi, (%rax) -; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: movzbl %sil, %ecx +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: # kill: def $ecx killed $ecx killed $rcx +; AVX512F-NEXT: shrl $6, %ecx +; AVX512F-NEXT: movl $-1, %edx +; AVX512F-NEXT: shlxl %ecx, %edx, %ecx +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpsllq %xmm3, %zmm0, %zmm3 +; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6] +; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0 +; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fshl_i256_vector: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: pushq %rbx -; AVX512VL-NEXT: movq %rsi, %rcx -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512VL-NEXT: vmovq %xmm2, %rax -; AVX512VL-NEXT: vpextrq $1, %xmm0, %r10 -; AVX512VL-NEXT: vmovq %xmm0, %r11 -; AVX512VL-NEXT: vmovq %xmm1, %r8 -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rbx -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512VL-NEXT: vmovq %xmm0, %r9 -; AVX512VL-NEXT: testb $-128, %cl -; AVX512VL-NEXT: cmoveq %rsi, %rbx -; AVX512VL-NEXT: cmoveq %r9, %r8 -; AVX512VL-NEXT: cmoveq %r11, %r9 -; AVX512VL-NEXT: cmoveq %r10, %rsi -; AVX512VL-NEXT: cmovneq %r11, %rax -; AVX512VL-NEXT: cmovneq %r10, %rdx -; AVX512VL-NEXT: testb $64, %cl -; AVX512VL-NEXT: cmovneq %rax, %rdx -; AVX512VL-NEXT: cmovneq %rsi, %rax -; AVX512VL-NEXT: cmovneq %r9, %rsi -; AVX512VL-NEXT: cmoveq %rbx, %r8 -; AVX512VL-NEXT: cmovneq %rbx, %r9 -; AVX512VL-NEXT: movq %r9, %r10 -; AVX512VL-NEXT: shldq %cl, %r8, %r10 -; AVX512VL-NEXT: movq %rsi, %r8 -; AVX512VL-NEXT: shldq %cl, %r9, %r8 -; AVX512VL-NEXT: movq %rax, %r9 -; AVX512VL-NEXT: shldq %cl, %rsi, %r9 -; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VL-NEXT: shldq %cl, %rax, %rdx +; AVX512VL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VL-NEXT: movq %rdi, %rax -; AVX512VL-NEXT: movq %rdx, 24(%rdi) -; AVX512VL-NEXT: movq %r9, 16(%rdi) -; AVX512VL-NEXT: movq %r8, 8(%rdi) -; AVX512VL-NEXT: movq %r10, (%rdi) -; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-NEXT: movzbl %sil, %ecx +; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm1 +; AVX512VL-NEXT: # kill: def $ecx killed $ecx killed $rcx +; AVX512VL-NEXT: shrl $6, %ecx +; AVX512VL-NEXT: movl $-1, %edx +; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx +; AVX512VL-NEXT: kmovd %ecx, %k1 +; AVX512VL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512VL-NEXT: vpsllq %xmm3, %zmm0, %zmm3 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm3 +; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6] +; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm0 +; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512VBMI-LABEL: fshl_i256_vector: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: pushq %rbx -; AVX512VBMI-NEXT: movq %rsi, %rcx -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VBMI-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512VBMI-NEXT: vmovq %xmm2, %rax -; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %r10 -; AVX512VBMI-NEXT: vmovq %xmm0, %r11 -; AVX512VBMI-NEXT: vmovq %xmm1, %r8 -; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rbx -; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512VBMI-NEXT: vmovq %xmm0, %r9 -; AVX512VBMI-NEXT: testb $-128, %cl -; AVX512VBMI-NEXT: cmoveq %rsi, %rbx -; AVX512VBMI-NEXT: cmoveq %r9, %r8 -; AVX512VBMI-NEXT: cmoveq %r11, %r9 -; AVX512VBMI-NEXT: cmoveq %r10, %rsi -; AVX512VBMI-NEXT: cmovneq %r11, %rax -; AVX512VBMI-NEXT: cmovneq %r10, %rdx -; AVX512VBMI-NEXT: testb $64, %cl -; AVX512VBMI-NEXT: cmovneq %rax, %rdx -; AVX512VBMI-NEXT: cmovneq %rsi, %rax -; AVX512VBMI-NEXT: cmovneq %r9, %rsi -; AVX512VBMI-NEXT: cmoveq %rbx, %r8 -; AVX512VBMI-NEXT: cmovneq %rbx, %r9 -; AVX512VBMI-NEXT: movq %r9, %r10 -; AVX512VBMI-NEXT: shldq %cl, %r8, %r10 -; AVX512VBMI-NEXT: movq %rsi, %r8 -; AVX512VBMI-NEXT: shldq %cl, %r9, %r8 -; AVX512VBMI-NEXT: movq %rax, %r9 -; AVX512VBMI-NEXT: shldq %cl, %rsi, %r9 -; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VBMI-NEXT: shldq %cl, %rax, %rdx +; AVX512VBMI-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI-NEXT: movq %rdi, %rax -; AVX512VBMI-NEXT: movq %rdx, 24(%rdi) -; AVX512VBMI-NEXT: movq %r9, 16(%rdi) -; AVX512VBMI-NEXT: movq %r8, 8(%rdi) -; AVX512VBMI-NEXT: movq %r10, (%rdi) -; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VBMI-NEXT: movzbl %sil, %ecx +; AVX512VBMI-NEXT: movl %ecx, %edx +; AVX512VBMI-NEXT: shrl $6, %edx +; AVX512VBMI-NEXT: movl $-1, %esi +; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx +; AVX512VBMI-NEXT: kmovd %edx, %k1 +; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1 +; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm0[0,1,2,3,4,5,6] +; AVX512VBMI-NEXT: vpshldvq %zmm1, %zmm2, %zmm0 +; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, (%rdi) ; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq %a0 = bitcast <4 x i64> %v0 to i256 @@ -1491,123 +1367,71 @@ define i256 @fshr_i256_vector(<4 x i64> %v0, <4 x i64> %v1, i256 %a2) nounwind { ; ; AVX512F-LABEL: fshr_i256_vector: ; AVX512F: # %bb.0: -; AVX512F-NEXT: pushq %rbx -; AVX512F-NEXT: movq %rsi, %rcx -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vpextrq $1, %xmm1, %rsi -; AVX512F-NEXT: vmovq %xmm2, %r8 -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpextrq $1, %xmm2, %r9 -; AVX512F-NEXT: vmovq %xmm0, %r11 -; AVX512F-NEXT: vmovq %xmm1, %rbx -; AVX512F-NEXT: vmovq %xmm2, %r10 -; AVX512F-NEXT: testb %cl, %cl -; AVX512F-NEXT: cmovsq %r10, %rbx -; AVX512F-NEXT: cmovsq %r9, %rsi -; AVX512F-NEXT: cmovsq %r11, %r10 -; AVX512F-NEXT: cmovsq %rax, %r9 -; AVX512F-NEXT: cmovnsq %r11, %r8 -; AVX512F-NEXT: cmovnsq %rax, %rdx +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: movzbl %sil, %eax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: shrl $6, %eax +; AVX512F-NEXT: movl $-1, %ecx +; AVX512F-NEXT: shlxl %eax, %ecx, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpsrlq %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0] +; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: testb $64, %cl -; AVX512F-NEXT: cmoveq %r8, %rdx -; AVX512F-NEXT: cmoveq %r9, %r8 -; AVX512F-NEXT: cmoveq %r10, %r9 -; AVX512F-NEXT: cmoveq %rsi, %r10 -; AVX512F-NEXT: cmoveq %rbx, %rsi -; AVX512F-NEXT: shrdq %cl, %r10, %rsi -; AVX512F-NEXT: shrdq %cl, %r9, %r10 -; AVX512F-NEXT: shrdq %cl, %r8, %r9 -; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512F-NEXT: shrdq %cl, %rdx, %r8 -; AVX512F-NEXT: movq %r8, 24(%rdi) -; AVX512F-NEXT: movq %r9, 16(%rdi) -; AVX512F-NEXT: movq %r10, 8(%rdi) -; AVX512F-NEXT: movq %rsi, (%rdi) -; AVX512F-NEXT: popq %rbx +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fshr_i256_vector: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: pushq %rbx -; AVX512VL-NEXT: movq %rsi, %rcx -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax -; AVX512VL-NEXT: vmovq %xmm2, %rdx -; AVX512VL-NEXT: vpextrq $1, %xmm0, %r10 -; AVX512VL-NEXT: vmovq %xmm0, %r11 -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rsi -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VL-NEXT: vpextrq $1, %xmm0, %r8 -; AVX512VL-NEXT: vmovq %xmm1, %rbx -; AVX512VL-NEXT: vmovq %xmm0, %r9 -; AVX512VL-NEXT: testb %cl, %cl -; AVX512VL-NEXT: cmovsq %r9, %rbx -; AVX512VL-NEXT: cmovsq %r8, %rsi -; AVX512VL-NEXT: cmovsq %r11, %r9 -; AVX512VL-NEXT: cmovsq %r10, %r8 -; AVX512VL-NEXT: cmovnsq %r11, %rdx -; AVX512VL-NEXT: cmovnsq %r10, %rax -; AVX512VL-NEXT: testb $64, %cl -; AVX512VL-NEXT: cmoveq %rdx, %rax -; AVX512VL-NEXT: cmoveq %r8, %rdx -; AVX512VL-NEXT: cmoveq %r9, %r8 -; AVX512VL-NEXT: cmoveq %rsi, %r9 -; AVX512VL-NEXT: cmoveq %rbx, %rsi -; AVX512VL-NEXT: shrdq %cl, %r9, %rsi -; AVX512VL-NEXT: shrdq %cl, %r8, %r9 -; AVX512VL-NEXT: shrdq %cl, %rdx, %r8 -; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VL-NEXT: shrdq %cl, %rax, %rdx +; AVX512VL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VL-NEXT: movq %rdi, %rax -; AVX512VL-NEXT: movq %rdx, 24(%rdi) -; AVX512VL-NEXT: movq %r8, 16(%rdi) -; AVX512VL-NEXT: movq %r9, 8(%rdi) -; AVX512VL-NEXT: movq %rsi, (%rdi) -; AVX512VL-NEXT: popq %rbx +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-NEXT: movzbl %sil, %ecx +; AVX512VL-NEXT: movl %ecx, %edx +; AVX512VL-NEXT: shrl $6, %edx +; AVX512VL-NEXT: movl $-1, %esi +; AVX512VL-NEXT: shlxl %edx, %esi, %edx +; AVX512VL-NEXT: kmovd %edx, %k1 +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] +; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm2 +; AVX512VL-NEXT: vpand %xmm1, %xmm2, %xmm3 +; AVX512VL-NEXT: vpsrlq %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0] +; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vpandn %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512VBMI-LABEL: fshr_i256_vector: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: pushq %rbx -; AVX512VBMI-NEXT: movq %rsi, %rcx -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VBMI-NEXT: vpextrq $1, %xmm2, %rax -; AVX512VBMI-NEXT: vmovq %xmm2, %rdx -; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %r10 -; AVX512VBMI-NEXT: vmovq %xmm0, %r11 -; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rsi -; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm0 -; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %r8 -; AVX512VBMI-NEXT: vmovq %xmm1, %rbx -; AVX512VBMI-NEXT: vmovq %xmm0, %r9 -; AVX512VBMI-NEXT: testb %cl, %cl -; AVX512VBMI-NEXT: cmovsq %r9, %rbx -; AVX512VBMI-NEXT: cmovsq %r8, %rsi -; AVX512VBMI-NEXT: cmovsq %r11, %r9 -; AVX512VBMI-NEXT: cmovsq %r10, %r8 -; AVX512VBMI-NEXT: cmovnsq %r11, %rdx -; AVX512VBMI-NEXT: cmovnsq %r10, %rax -; AVX512VBMI-NEXT: testb $64, %cl -; AVX512VBMI-NEXT: cmoveq %rdx, %rax -; AVX512VBMI-NEXT: cmoveq %r8, %rdx -; AVX512VBMI-NEXT: cmoveq %r9, %r8 -; AVX512VBMI-NEXT: cmoveq %rsi, %r9 -; AVX512VBMI-NEXT: cmoveq %rbx, %rsi -; AVX512VBMI-NEXT: shrdq %cl, %r9, %rsi -; AVX512VBMI-NEXT: shrdq %cl, %r8, %r9 -; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8 -; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VBMI-NEXT: shrdq %cl, %rax, %rdx +; AVX512VBMI-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512VBMI-NEXT: movq %rdi, %rax -; AVX512VBMI-NEXT: movq %rdx, 24(%rdi) -; AVX512VBMI-NEXT: movq %r8, 16(%rdi) -; AVX512VBMI-NEXT: movq %r9, 8(%rdi) -; AVX512VBMI-NEXT: movq %rsi, (%rdi) -; AVX512VBMI-NEXT: popq %rbx +; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VBMI-NEXT: movzbl %sil, %ecx +; AVX512VBMI-NEXT: movl %ecx, %edx +; AVX512VBMI-NEXT: shrl $6, %edx +; AVX512VBMI-NEXT: movl $-1, %esi +; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx +; AVX512VBMI-NEXT: kmovd %edx, %k1 +; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1 +; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0] +; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm2, %zmm0 +; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq %a0 = bitcast <4 x i64> %v0 to i256 @@ -1728,111 +1552,75 @@ define i256 @fshl_rot_i256_vector(<4 x i64> %v0, i256 %a2) nounwind { ; ; AVX512F-LABEL: fshl_rot_i256_vector: ; AVX512F: # %bb.0: -; AVX512F-NEXT: movq %rsi, %rcx -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovq %xmm1, %rdx -; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vmovq %xmm0, %r8 -; AVX512F-NEXT: testb $-128, %cl -; AVX512F-NEXT: movq %rax, %r9 -; AVX512F-NEXT: cmovneq %rsi, %r9 -; AVX512F-NEXT: movq %r8, %r10 -; AVX512F-NEXT: cmovneq %rdx, %r10 -; AVX512F-NEXT: cmovneq %rax, %rsi +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: cmovneq %r8, %rdx -; AVX512F-NEXT: testb $64, %cl -; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: cmovneq %rsi, %rdi -; AVX512F-NEXT: cmovneq %r10, %rsi -; AVX512F-NEXT: cmoveq %r9, %rdx -; AVX512F-NEXT: cmovneq %r9, %r10 -; AVX512F-NEXT: movq %r10, %r8 -; AVX512F-NEXT: shldq %cl, %rdx, %r8 -; AVX512F-NEXT: movq %rsi, %r9 -; AVX512F-NEXT: shldq %cl, %r10, %r9 -; AVX512F-NEXT: movq %rdi, %r10 -; AVX512F-NEXT: shldq %cl, %rsi, %r10 -; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512F-NEXT: shldq %cl, %rdi, %rdx -; AVX512F-NEXT: movq %rdx, 24(%rax) -; AVX512F-NEXT: movq %r10, 16(%rax) -; AVX512F-NEXT: movq %r9, 8(%rax) -; AVX512F-NEXT: movq %r8, (%rax) +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: movzbl %sil, %ecx +; AVX512F-NEXT: vmovq %rcx, %xmm1 +; AVX512F-NEXT: # kill: def $ecx killed $ecx killed $rcx +; AVX512F-NEXT: shrl $6, %ecx +; AVX512F-NEXT: movl $-1, %edx +; AVX512F-NEXT: shlxl %ecx, %edx, %ecx +; AVX512F-NEXT: kmovw %ecx, %k1 +; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm0[7,0,1,2,3,4,5,6] +; AVX512F-NEXT: vpsrlq $1, %zmm2, %zmm2 +; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpandn %xmm3, %xmm1, %xmm4 +; AVX512F-NEXT: vpsrlq %xmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX512F-NEXT: vpsllq %xmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fshl_rot_i256_vector: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movq %rsi, %rcx -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovq %xmm1, %rdx -; AVX512VL-NEXT: vmovq %xmm0, %rsi -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vpextrq $1, %xmm1, %r8 -; AVX512VL-NEXT: testb $-128, %cl -; AVX512VL-NEXT: movq %r8, %r9 -; AVX512VL-NEXT: cmovneq %rax, %r9 -; AVX512VL-NEXT: movq %rsi, %r10 -; AVX512VL-NEXT: cmovneq %rdx, %r10 -; AVX512VL-NEXT: cmovneq %r8, %rax -; AVX512VL-NEXT: cmovneq %rsi, %rdx -; AVX512VL-NEXT: testb $64, %cl -; AVX512VL-NEXT: movq %rdx, %rsi -; AVX512VL-NEXT: cmovneq %rax, %rsi -; AVX512VL-NEXT: cmovneq %r10, %rax -; AVX512VL-NEXT: cmoveq %r9, %rdx -; AVX512VL-NEXT: cmovneq %r9, %r10 -; AVX512VL-NEXT: movq %r10, %r8 -; AVX512VL-NEXT: shldq %cl, %rdx, %r8 -; AVX512VL-NEXT: movq %rax, %r9 -; AVX512VL-NEXT: shldq %cl, %r10, %r9 -; AVX512VL-NEXT: movq %rsi, %r10 -; AVX512VL-NEXT: shldq %cl, %rax, %r10 -; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VL-NEXT: shldq %cl, %rsi, %rdx +; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512VL-NEXT: movzbl %sil, %eax +; AVX512VL-NEXT: vpbroadcastq %rax, %xmm1 +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: shrl $6, %eax +; AVX512VL-NEXT: movl $-1, %ecx +; AVX512VL-NEXT: shlxl %eax, %ecx, %eax +; AVX512VL-NEXT: kmovd %eax, %k1 +; AVX512VL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: valignq {{.*#+}} zmm2 = zmm0[7,0,1,2,3,4,5,6] +; AVX512VL-NEXT: vpsrlq $1, %zmm2, %zmm2 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX512VL-NEXT: vpandn %xmm3, %xmm1, %xmm4 +; AVX512VL-NEXT: vpsrlq %xmm4, %zmm2, %zmm2 +; AVX512VL-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX512VL-NEXT: vpsllq %xmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: movq %rdi, %rax -; AVX512VL-NEXT: movq %rdx, 24(%rdi) -; AVX512VL-NEXT: movq %r10, 16(%rdi) -; AVX512VL-NEXT: movq %r9, 8(%rdi) -; AVX512VL-NEXT: movq %r8, (%rdi) +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm1 +; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512VBMI-LABEL: fshl_rot_i256_vector: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: movq %rsi, %rcx -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VBMI-NEXT: vmovq %xmm1, %rdx -; AVX512VBMI-NEXT: vmovq %xmm0, %rsi -; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %r8 -; AVX512VBMI-NEXT: testb $-128, %cl -; AVX512VBMI-NEXT: movq %r8, %r9 -; AVX512VBMI-NEXT: cmovneq %rax, %r9 -; AVX512VBMI-NEXT: movq %rsi, %r10 -; AVX512VBMI-NEXT: cmovneq %rdx, %r10 -; AVX512VBMI-NEXT: cmovneq %r8, %rax -; AVX512VBMI-NEXT: cmovneq %rsi, %rdx -; AVX512VBMI-NEXT: testb $64, %cl -; AVX512VBMI-NEXT: movq %rdx, %rsi -; AVX512VBMI-NEXT: cmovneq %rax, %rsi -; AVX512VBMI-NEXT: cmovneq %r10, %rax -; AVX512VBMI-NEXT: cmoveq %r9, %rdx -; AVX512VBMI-NEXT: cmovneq %r9, %r10 -; AVX512VBMI-NEXT: movq %r10, %r8 -; AVX512VBMI-NEXT: shldq %cl, %rdx, %r8 -; AVX512VBMI-NEXT: movq %rax, %r9 -; AVX512VBMI-NEXT: shldq %cl, %r10, %r9 -; AVX512VBMI-NEXT: movq %rsi, %r10 -; AVX512VBMI-NEXT: shldq %cl, %rax, %r10 -; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VBMI-NEXT: shldq %cl, %rsi, %rdx +; AVX512VBMI-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VBMI-NEXT: movq %rdi, %rax -; AVX512VBMI-NEXT: movq %rdx, 24(%rdi) -; AVX512VBMI-NEXT: movq %r10, 16(%rdi) -; AVX512VBMI-NEXT: movq %r9, 8(%rdi) -; AVX512VBMI-NEXT: movq %r8, (%rdi) +; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512VBMI-NEXT: movzbl %sil, %ecx +; AVX512VBMI-NEXT: movl %ecx, %edx +; AVX512VBMI-NEXT: shrl $6, %edx +; AVX512VBMI-NEXT: movl $-1, %esi +; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx +; AVX512VBMI-NEXT: kmovd %edx, %k1 +; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1 +; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm0[0,1,2,3,4,5,6] +; AVX512VBMI-NEXT: vpshldvq %zmm1, %zmm2, %zmm0 +; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, (%rdi) ; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq %a0 = bitcast <4 x i64> %v0 to i256 @@ -1946,105 +1734,71 @@ define i256 @fshr_rot_i256_vector(<4 x i64> %v0, i256 %a2) nounwind { ; ; AVX512F-LABEL: fshr_rot_i256_vector: ; AVX512F: # %bb.0: -; AVX512F-NEXT: movq %rsi, %rcx -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vmovq %xmm1, %rdx -; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vmovq %xmm0, %r9 -; AVX512F-NEXT: testb %cl, %cl -; AVX512F-NEXT: movq %rax, %r10 -; AVX512F-NEXT: cmovnsq %rsi, %r10 -; AVX512F-NEXT: movq %r9, %r8 -; AVX512F-NEXT: cmovnsq %rdx, %r8 -; AVX512F-NEXT: cmovnsq %rax, %rsi +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: movzbl %sil, %eax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: shrl $6, %eax +; AVX512F-NEXT: movl $-1, %ecx +; AVX512F-NEXT: shlxl %eax, %ecx, %eax +; AVX512F-NEXT: kmovw %eax, %k1 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX512F-NEXT: vpsrlq %xmm3, %ymm0, %ymm3 +; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0] +; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: movq %rdi, %rax -; AVX512F-NEXT: cmovnsq %r9, %rdx -; AVX512F-NEXT: testb $64, %cl -; AVX512F-NEXT: movq %rdx, %rdi -; AVX512F-NEXT: cmoveq %rsi, %rdi -; AVX512F-NEXT: cmoveq %r8, %rsi -; AVX512F-NEXT: cmoveq %r10, %r8 -; AVX512F-NEXT: cmovneq %r10, %rdx -; AVX512F-NEXT: movq %rdx, %r9 -; AVX512F-NEXT: shrdq %cl, %r8, %r9 -; AVX512F-NEXT: shrdq %cl, %rsi, %r8 -; AVX512F-NEXT: shrdq %cl, %rdi, %rsi -; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512F-NEXT: shrdq %cl, %rdx, %rdi -; AVX512F-NEXT: movq %rdi, 24(%rax) -; AVX512F-NEXT: movq %rsi, 16(%rax) -; AVX512F-NEXT: movq %r8, 8(%rax) -; AVX512F-NEXT: movq %r9, (%rax) +; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512F-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fshr_rot_i256_vector: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: movq %rsi, %rcx -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vmovq %xmm0, %r8 -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512VL-NEXT: vpextrq $1, %xmm1, %r9 -; AVX512VL-NEXT: testb %cl, %cl -; AVX512VL-NEXT: movq %r9, %r10 -; AVX512VL-NEXT: cmovnsq %rdx, %r10 -; AVX512VL-NEXT: movq %r8, %rsi -; AVX512VL-NEXT: cmovnsq %rax, %rsi -; AVX512VL-NEXT: cmovnsq %r9, %rdx -; AVX512VL-NEXT: cmovnsq %r8, %rax -; AVX512VL-NEXT: testb $64, %cl -; AVX512VL-NEXT: movq %rax, %r8 -; AVX512VL-NEXT: cmoveq %rdx, %r8 -; AVX512VL-NEXT: cmoveq %rsi, %rdx -; AVX512VL-NEXT: cmoveq %r10, %rsi -; AVX512VL-NEXT: cmovneq %r10, %rax -; AVX512VL-NEXT: movq %rax, %r9 -; AVX512VL-NEXT: shrdq %cl, %rsi, %r9 -; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi -; AVX512VL-NEXT: shrdq %cl, %r8, %rdx -; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VL-NEXT: shrdq %cl, %rax, %r8 +; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VL-NEXT: movq %rdi, %rax -; AVX512VL-NEXT: movq %r8, 24(%rdi) -; AVX512VL-NEXT: movq %rdx, 16(%rdi) -; AVX512VL-NEXT: movq %rsi, 8(%rdi) -; AVX512VL-NEXT: movq %r9, (%rdi) +; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512VL-NEXT: movzbl %sil, %ecx +; AVX512VL-NEXT: movl %ecx, %edx +; AVX512VL-NEXT: shrl $6, %edx +; AVX512VL-NEXT: movl $-1, %esi +; AVX512VL-NEXT: shlxl %edx, %esi, %edx +; AVX512VL-NEXT: kmovd %edx, %k1 +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] +; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm2 +; AVX512VL-NEXT: vpand %xmm1, %xmm2, %xmm3 +; AVX512VL-NEXT: vpsrlq %xmm3, %ymm0, %ymm3 +; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0] +; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vpandn %xmm1, %xmm2, %xmm1 +; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512VBMI-LABEL: fshr_rot_i256_vector: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: movq %rsi, %rcx -; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VBMI-NEXT: vmovq %xmm1, %rax -; AVX512VBMI-NEXT: vmovq %xmm0, %r8 -; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rdx -; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %r9 -; AVX512VBMI-NEXT: testb %cl, %cl -; AVX512VBMI-NEXT: movq %r9, %r10 -; AVX512VBMI-NEXT: cmovnsq %rdx, %r10 -; AVX512VBMI-NEXT: movq %r8, %rsi -; AVX512VBMI-NEXT: cmovnsq %rax, %rsi -; AVX512VBMI-NEXT: cmovnsq %r9, %rdx -; AVX512VBMI-NEXT: cmovnsq %r8, %rax -; AVX512VBMI-NEXT: testb $64, %cl -; AVX512VBMI-NEXT: movq %rax, %r8 -; AVX512VBMI-NEXT: cmoveq %rdx, %r8 -; AVX512VBMI-NEXT: cmoveq %rsi, %rdx -; AVX512VBMI-NEXT: cmoveq %r10, %rsi -; AVX512VBMI-NEXT: cmovneq %r10, %rax -; AVX512VBMI-NEXT: movq %rax, %r9 -; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r9 -; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi -; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx -; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512VBMI-NEXT: shrdq %cl, %rax, %r8 +; AVX512VBMI-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512VBMI-NEXT: movq %rdi, %rax -; AVX512VBMI-NEXT: movq %r8, 24(%rdi) -; AVX512VBMI-NEXT: movq %rdx, 16(%rdi) -; AVX512VBMI-NEXT: movq %rsi, 8(%rdi) -; AVX512VBMI-NEXT: movq %r9, (%rdi) +; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512VBMI-NEXT: movzbl %sil, %ecx +; AVX512VBMI-NEXT: movl %ecx, %edx +; AVX512VBMI-NEXT: shrl $6, %edx +; AVX512VBMI-NEXT: movl $-1, %esi +; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx +; AVX512VBMI-NEXT: kmovd %edx, %k1 +; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1 +; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0] +; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm2, %zmm0 +; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi) ; AVX512VBMI-NEXT: vzeroupper ; AVX512VBMI-NEXT: retq %a0 = bitcast <4 x i64> %v0 to i256