diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 651c6de8a35b..834db1df26de 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8186,8 +8186,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
 
   // If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
   // then add in the carry.
-  // TODO: If we can't split it in half, we might be able to split into 3 or
-  // more pieces using a smaller bit width.
   if (HalfMaxPlus1.urem(Divisor).isOne()) {
     assert(!LL == !LH && "Expected both input halves or no input halves!");
     if (!LL)
@@ -8239,6 +8237,67 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
                               DAG.getConstant(0, dl, HiLoVT));
       Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, Sum, Carry);
     }
+  } else {
+    // If we cannot split in two halves, look for a smaller chunk width W
+    // such that (1 << W) % Divisor == 1.
+    unsigned BitWidth = VT.getScalarSizeInBits();
+    unsigned BestChunkWidth = 0;
+
+    // Chunks are summed in the type that VT is transformed to by legalization.
+    EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
+    unsigned LegalWidth = LegalVT.getScalarSizeInBits();
+    unsigned MaxChunk = std::min(LegalWidth, BitWidth);
+
+    // Search downward from MaxChunk for the widest I with 2^I % Divisor == 1.
+    for (unsigned I = MaxChunk, E = MaxChunk / 2; I > E; --I) {
+      APInt Mod = APInt::getOneBitSet(Divisor.getBitWidth(), I).urem(Divisor);
+
+      if (Mod.isOne()) {
+        // Number of I-bit chunks needed to cover all BitWidth input bits.
+        unsigned NumChunks = divideCeil(BitWidth, I);
+
+        // Ensure the chunk sum cannot overflow LegalVT (LegalWidth bits).
+        // Summing N chunks needs at most ceil(log2(N)) extra carry bits.
+        // Require: chunk width (I) + carry bits <= LegalWidth.
+        if (I + llvm::bit_width(NumChunks - 1) <= LegalWidth) {
+          BestChunkWidth = I;
+          break;
+        }
+      }
+    }
+
+    if (!BestChunkWidth)
+      return false;
+
+    SDValue In =
+        LL ? DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH) : N->getOperand(0);
+    if (TrailingZeros) {
+      // Save the shifted off bits if we need the remainder.
+      if (Opcode != ISD::UDIV) {
+        APInt Mask = APInt::getLowBitsSet(BitWidth, TrailingZeros);
+        PartialRem =
+            DAG.getNode(ISD::AND, dl, VT, In, DAG.getConstant(Mask, dl, VT));
+      }
+      // getShiftAmountConstant() takes the type of the value being shifted.
+      In = DAG.getNode(ISD::SRL, dl, VT, In,
+                       DAG.getShiftAmountConstant(TrailingZeros, VT, dl));
+    }
+    SDValue TotalSum = DAG.getConstant(0, dl, LegalVT);
+    SDValue Mask = DAG.getConstant(
+        APInt::getLowBitsSet(LegalWidth, BestChunkWidth), dl, LegalVT);
+
+    for (unsigned I = 0; I < BitWidth; I += BestChunkWidth) {
+      SDValue Shift = DAG.getShiftAmountConstant(I, VT, dl);
+      SDValue Chunk = DAG.getNode(ISD::SRL, dl, VT, In, Shift);
+      // Only the low BestChunkWidth bits matter, so truncate to LegalVT.
+      SDValue TruncChunk = DAG.getNode(ISD::TRUNCATE, dl, LegalVT, Chunk);
+      // For the last chunk, we might not need a mask if it's smaller than
+      // BestChunkWidth, but applying it is always safe.
+      SDValue MaskedChunk =
+          DAG.getNode(ISD::AND, dl, LegalVT, TruncChunk, Mask);
+      TotalSum = DAG.getNode(ISD::ADD, dl, LegalVT, TotalSum, MaskedChunk);
+    }
+    Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, TotalSum);
   }
 
   // If we didn't find a sum, we can't do the expansion.
@@ -8278,7 +8337,9 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N, if (TrailingZeros) { RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL, DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)); - RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL, PartialRem); + + SDValue PartialRemLo = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, PartialRem); + RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL, PartialRemLo); } Result.push_back(RemL); Result.push_back(DAG.getConstant(0, dl, HiLoVT)); diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index eaed62961fc5..204001ede600 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -500,13 +500,20 @@ entry: define i128 @ui128_7(i128 %a, i128 %b) { ; CHECK-SD-LABEL: ui128_7: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: .cfi_offset w30, -16 -; CHECK-SD-NEXT: mov w2, #7 // =0x7 -; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: extr x8, x1, x0, #60 +; CHECK-SD-NEXT: and x9, x0, #0xfffffffffffffff +; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff +; CHECK-SD-NEXT: add x8, x9, x8 +; CHECK-SD-NEXT: mov x9, #18725 // =0x4925 +; CHECK-SD-NEXT: movk x9, #9362, lsl #16 +; CHECK-SD-NEXT: add x8, x8, x1, lsr #56 +; CHECK-SD-NEXT: mov x1, xzr +; CHECK-SD-NEXT: movk x9, #37449, lsl #32 +; CHECK-SD-NEXT: movk x9, #18724, lsl #48 +; CHECK-SD-NEXT: umulh x9, x8, x9 +; CHECK-SD-NEXT: lsr x9, x9, #1 +; CHECK-SD-NEXT: sub x9, x9, x9, lsl #3 +; CHECK-SD-NEXT: add x0, x8, x9 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: ui128_7: @@ -579,13 +586,23 @@ entry: define i128 @ui128_100(i128 %a, i128 %b) { ; CHECK-SD-LABEL: ui128_100: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 -; CHECK-SD-NEXT: .cfi_offset w30, -16 -; CHECK-SD-NEXT: mov w2, #100 // =0x64 -; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-SD-NEXT: extr x8, x1, x0, #2 +; CHECK-SD-NEXT: lsr x9, x1, #2 +; CHECK-SD-NEXT: mov w10, #25 // =0x19 +; CHECK-SD-NEXT: extr x9, x9, x8, #60 +; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff +; CHECK-SD-NEXT: and x9, x9, #0xfffffffffffffff +; CHECK-SD-NEXT: add x8, x8, x9 +; CHECK-SD-NEXT: mov x9, #62915 // =0xf5c3 +; CHECK-SD-NEXT: movk x9, #23592, lsl #16 +; CHECK-SD-NEXT: add x8, x8, x1, lsr #58 +; CHECK-SD-NEXT: mov x1, xzr +; CHECK-SD-NEXT: movk x9, #49807, lsl #32 +; CHECK-SD-NEXT: movk x9, #10485, lsl #48 +; CHECK-SD-NEXT: umulh x9, x8, x9 +; CHECK-SD-NEXT: lsr x9, x9, #2 +; CHECK-SD-NEXT: msub x8, x9, x10, x8 +; CHECK-SD-NEXT: bfi x0, x8, #2, #62 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: ui128_100: @@ -2556,7 +2573,8 @@ define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) { ; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 ; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3 ; CHECK-SD-NEXT: sub v2.2s, v0.2s, v1.2s -; CHECK-SD: add w8, w8, w9 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 def $q0 +; CHECK-SD-NEXT: add w8, w8, w9 ; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0 ; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #1 ; CHECK-SD-NEXT: add v1.2s, v2.2s, v1.2s @@ -3079,34 +3097,30 @@ entry: define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) { ; CHECK-SD-LABEL: uv2i128_7: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: str x30, [sp, #-48]! 
// 8-byte Folded Spill -; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 -; CHECK-SD-NEXT: .cfi_offset w19, -8 -; CHECK-SD-NEXT: .cfi_offset w20, -16 -; CHECK-SD-NEXT: .cfi_offset w21, -24 -; CHECK-SD-NEXT: .cfi_offset w22, -32 -; CHECK-SD-NEXT: .cfi_offset w30, -48 -; CHECK-SD-NEXT: mov x19, x3 -; CHECK-SD-NEXT: mov x20, x2 -; CHECK-SD-NEXT: mov w2, #7 // =0x7 +; CHECK-SD-NEXT: extr x9, x1, x0, #60 +; CHECK-SD-NEXT: extr x8, x3, x2, #60 +; CHECK-SD-NEXT: and x10, x0, #0xfffffffffffffff +; CHECK-SD-NEXT: mov x11, #18725 // =0x4925 +; CHECK-SD-NEXT: and x9, x9, #0xfffffffffffffff +; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff +; CHECK-SD-NEXT: movk x11, #9362, lsl #16 +; CHECK-SD-NEXT: add x9, x10, x9 +; CHECK-SD-NEXT: and x10, x2, #0xfffffffffffffff +; CHECK-SD-NEXT: movk x11, #37449, lsl #32 +; CHECK-SD-NEXT: add x8, x10, x8 +; CHECK-SD-NEXT: add x9, x9, x1, lsr #56 +; CHECK-SD-NEXT: movk x11, #18724, lsl #48 +; CHECK-SD-NEXT: add x8, x8, x3, lsr #56 +; CHECK-SD-NEXT: mov x1, xzr ; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: mov x21, x0 -; CHECK-SD-NEXT: mov x22, x1 -; CHECK-SD-NEXT: mov x0, x20 -; CHECK-SD-NEXT: mov x1, x19 -; CHECK-SD-NEXT: mov w2, #7 // =0x7 -; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: mov x2, x0 -; CHECK-SD-NEXT: mov x3, x1 -; CHECK-SD-NEXT: mov x0, x21 -; CHECK-SD-NEXT: mov x1, x22 -; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-SD-NEXT: umulh x10, x9, x11 +; CHECK-SD-NEXT: umulh x11, x8, x11 +; CHECK-SD-NEXT: lsr x10, x10, #1 +; CHECK-SD-NEXT: lsr x11, x11, #1 +; CHECK-SD-NEXT: sub x10, x10, x10, lsl #3 +; CHECK-SD-NEXT: sub x11, x11, x11, lsl #3 +; CHECK-SD-NEXT: add x0, x9, x10 +; CHECK-SD-NEXT: add 
x2, x8, x11 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: uv2i128_7: @@ -3228,34 +3242,35 @@ entry: define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-SD-LABEL: uv2i128_100: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill -; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT: .cfi_def_cfa_offset 48 -; CHECK-SD-NEXT: .cfi_offset w19, -8 -; CHECK-SD-NEXT: .cfi_offset w20, -16 -; CHECK-SD-NEXT: .cfi_offset w21, -24 -; CHECK-SD-NEXT: .cfi_offset w22, -32 -; CHECK-SD-NEXT: .cfi_offset w30, -48 -; CHECK-SD-NEXT: mov x19, x3 -; CHECK-SD-NEXT: mov x20, x2 -; CHECK-SD-NEXT: mov w2, #100 // =0x64 +; CHECK-SD-NEXT: lsr x8, x1, #2 +; CHECK-SD-NEXT: extr x9, x1, x0, #2 +; CHECK-SD-NEXT: extr x10, x3, x2, #2 +; CHECK-SD-NEXT: lsr x11, x3, #2 +; CHECK-SD-NEXT: mov w12, #25 // =0x19 +; CHECK-SD-NEXT: extr x8, x8, x9, #60 +; CHECK-SD-NEXT: and x9, x9, #0xfffffffffffffff +; CHECK-SD-NEXT: extr x11, x11, x10, #60 +; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff +; CHECK-SD-NEXT: add x8, x9, x8 +; CHECK-SD-NEXT: and x9, x10, #0xfffffffffffffff +; CHECK-SD-NEXT: and x10, x11, #0xfffffffffffffff +; CHECK-SD-NEXT: mov x11, #62915 // =0xf5c3 +; CHECK-SD-NEXT: add x9, x9, x10 +; CHECK-SD-NEXT: add x8, x8, x1, lsr #58 +; CHECK-SD-NEXT: movk x11, #23592, lsl #16 +; CHECK-SD-NEXT: add x9, x9, x3, lsr #58 +; CHECK-SD-NEXT: mov x1, xzr +; CHECK-SD-NEXT: movk x11, #49807, lsl #32 ; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: mov x21, x0 -; CHECK-SD-NEXT: mov x22, x1 -; CHECK-SD-NEXT: mov x0, x20 -; CHECK-SD-NEXT: mov x1, x19 -; CHECK-SD-NEXT: mov w2, #100 // =0x64 -; CHECK-SD-NEXT: mov x3, xzr -; CHECK-SD-NEXT: bl __umodti3 -; CHECK-SD-NEXT: mov x2, x0 -; CHECK-SD-NEXT: mov x3, x1 -; CHECK-SD-NEXT: mov x0, x21 -; CHECK-SD-NEXT: mov x1, x22 -; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload -; 
CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload -; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; CHECK-SD-NEXT: movk x11, #10485, lsl #48 +; CHECK-SD-NEXT: umulh x10, x8, x11 +; CHECK-SD-NEXT: umulh x11, x9, x11 +; CHECK-SD-NEXT: lsr x10, x10, #2 +; CHECK-SD-NEXT: lsr x11, x11, #2 +; CHECK-SD-NEXT: msub x8, x10, x12, x8 +; CHECK-SD-NEXT: msub x9, x11, x12, x9 +; CHECK-SD-NEXT: bfi x0, x8, #2, #62 +; CHECK-SD-NEXT: bfi x2, x9, #2, #62 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: uv2i128_100: diff --git a/llvm/test/CodeGen/PowerPC/urem-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-lkk.ll index 43a1e5a2faf6..17ba2d17a2c4 100644 --- a/llvm/test/CodeGen/PowerPC/urem-lkk.ll +++ b/llvm/test/CodeGen/PowerPC/urem-lkk.ll @@ -89,17 +89,19 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) { define i64 @dont_fold_urem_i64(i64 %x) { ; CHECK-LABEL: dont_fold_urem_i64: ; CHECK: # %bb.0: -; CHECK-NEXT: mflr 0 -; CHECK-NEXT: stwu 1, -16(1) -; CHECK-NEXT: stw 0, 20(1) -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset lr, 4 -; CHECK-NEXT: li 5, 0 -; CHECK-NEXT: li 6, 98 -; CHECK-NEXT: bl __umoddi3 -; CHECK-NEXT: lwz 0, 20(1) -; CHECK-NEXT: addi 1, 1, 16 -; CHECK-NEXT: mtlr 0 +; CHECK-NEXT: srwi 6, 4, 22 +; CHECK-NEXT: rlwinm 7, 4, 31, 11, 31 +; CHECK-NEXT: rlwimi 6, 3, 10, 11, 21 +; CHECK-NEXT: lis 5, 1337 +; CHECK-NEXT: add 6, 7, 6 +; CHECK-NEXT: srwi 3, 3, 11 +; CHECK-NEXT: ori 5, 5, 30762 +; CHECK-NEXT: add 3, 6, 3 +; CHECK-NEXT: mulhwu 5, 3, 5 +; CHECK-NEXT: mulli 5, 5, 49 +; CHECK-NEXT: sub 3, 3, 5 +; CHECK-NEXT: rlwimi 4, 3, 1, 0, 30 +; CHECK-NEXT: li 3, 0 ; CHECK-NEXT: blr %1 = urem i64 %x, 98 ret i64 %1 diff --git a/llvm/test/CodeGen/RISCV/div-by-constant.ll b/llvm/test/CodeGen/RISCV/div-by-constant.ll index 24c882daa113..1aa0cd053f3e 100644 --- a/llvm/test/CodeGen/RISCV/div-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/div-by-constant.ll @@ -111,16 +111,78 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind { } define i64 
@udiv64_constant_add(i64 %a) nounwind { -; RV32-LABEL: udiv64_constant_add: -; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 7 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 -; RV32-NEXT: ret +; RV32IM-LABEL: udiv64_constant_add: +; RV32IM: # %bb.0: +; RV32IM-NEXT: lui a2, 262144 +; RV32IM-NEXT: slli a3, a1, 2 +; RV32IM-NEXT: srli a4, a0, 30 +; RV32IM-NEXT: srli a5, a1, 28 +; RV32IM-NEXT: lui a6, 149797 +; RV32IM-NEXT: or a3, a4, a3 +; RV32IM-NEXT: lui a4, 449390 +; RV32IM-NEXT: addi a2, a2, -1 +; RV32IM-NEXT: and a3, a3, a2 +; RV32IM-NEXT: and a2, a0, a2 +; RV32IM-NEXT: add a2, a2, a3 +; RV32IM-NEXT: lui a3, 748983 +; RV32IM-NEXT: addi a6, a6, -1755 +; RV32IM-NEXT: addi a4, a4, -1171 +; RV32IM-NEXT: addi a3, a3, -585 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: mulhu a5, a2, a6 +; RV32IM-NEXT: sub a6, a2, a5 +; RV32IM-NEXT: srli a6, a6, 1 +; RV32IM-NEXT: add a5, a6, a5 +; RV32IM-NEXT: srli a5, a5, 2 +; RV32IM-NEXT: slli a6, a5, 3 +; RV32IM-NEXT: sub a5, a5, a6 +; RV32IM-NEXT: add a2, a2, a5 +; RV32IM-NEXT: sub a5, a0, a2 +; RV32IM-NEXT: sltu a0, a0, a2 +; RV32IM-NEXT: mul a2, a5, a4 +; RV32IM-NEXT: mulhu a4, a5, a3 +; RV32IM-NEXT: sub a1, a1, a0 +; RV32IM-NEXT: add a2, a4, a2 +; RV32IM-NEXT: mul a1, a1, a3 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: mul a0, a5, a3 +; RV32IM-NEXT: ret +; +; RV32IMZB-LABEL: udiv64_constant_add: +; RV32IMZB: # %bb.0: +; RV32IMZB-NEXT: srli a2, a0, 30 +; RV32IMZB-NEXT: lui a3, 786432 +; RV32IMZB-NEXT: slli a4, a0, 2 +; RV32IMZB-NEXT: srli a5, a1, 28 +; RV32IMZB-NEXT: lui a6, 149797 +; RV32IMZB-NEXT: sh2add a2, a1, a2 +; RV32IMZB-NEXT: andn a2, a2, a3 +; RV32IMZB-NEXT: lui a3, 449390 +; RV32IMZB-NEXT: srli a4, a4, 2 +; RV32IMZB-NEXT: add a4, a4, a5 +; RV32IMZB-NEXT: lui a5, 748983 +; RV32IMZB-NEXT: addi a6, a6, -1755 +; RV32IMZB-NEXT: addi a3, a3, -1171 +; RV32IMZB-NEXT: addi 
a5, a5, -585 +; RV32IMZB-NEXT: add a2, a4, a2 +; RV32IMZB-NEXT: mulhu a4, a2, a6 +; RV32IMZB-NEXT: sub a6, a2, a4 +; RV32IMZB-NEXT: srli a6, a6, 1 +; RV32IMZB-NEXT: add a4, a6, a4 +; RV32IMZB-NEXT: srli a4, a4, 2 +; RV32IMZB-NEXT: slli a6, a4, 3 +; RV32IMZB-NEXT: sub a4, a4, a6 +; RV32IMZB-NEXT: add a2, a2, a4 +; RV32IMZB-NEXT: sub a4, a0, a2 +; RV32IMZB-NEXT: sltu a0, a0, a2 +; RV32IMZB-NEXT: mul a2, a4, a3 +; RV32IMZB-NEXT: mulhu a3, a4, a5 +; RV32IMZB-NEXT: sub a1, a1, a0 +; RV32IMZB-NEXT: add a2, a3, a2 +; RV32IMZB-NEXT: mul a1, a1, a5 +; RV32IMZB-NEXT: add a1, a2, a1 +; RV32IMZB-NEXT: mul a0, a4, a5 +; RV32IMZB-NEXT: ret ; ; RV64-LABEL: udiv64_constant_add: ; RV64: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll index eb70d7f43c0e..3ded13cc31c7 100644 --- a/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/split-udiv-by-constant.ll @@ -117,24 +117,75 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind { define iXLen2 @test_udiv_7(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_7: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 7 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lui a2, 262144 +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: srli a4, a0, 30 +; RV32-NEXT: srli a5, a1, 28 +; RV32-NEXT: lui a6, 149797 +; RV32-NEXT: or a3, a4, a3 +; RV32-NEXT: lui a4, 449390 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a3, a3, a2 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: lui a3, 748983 +; RV32-NEXT: addi a6, a6, -1755 +; RV32-NEXT: addi a4, a4, -1171 +; RV32-NEXT: addi a3, a3, -585 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: mulhu a5, a2, a6 +; RV32-NEXT: sub a6, a2, a5 +; RV32-NEXT: srli a6, a6, 1 +; RV32-NEXT: add a5, a6, a5 +; RV32-NEXT: srli a5, a5, 2 +; RV32-NEXT: slli 
a6, a5, 3 +; RV32-NEXT: sub a5, a5, a6 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: sub a5, a0, a2 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mul a2, a5, a4 +; RV32-NEXT: mulhu a4, a5, a3 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: add a2, a4, a2 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: mul a0, a5, a3 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_7: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 7 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: slli a3, a1, 4 +; RV64-NEXT: srli a4, a0, 60 +; RV64-NEXT: srli a5, a1, 56 +; RV64-NEXT: lui a6, %hi(.LCPI2_0) +; RV64-NEXT: or a3, a4, a3 +; RV64-NEXT: lui a4, 748983 +; RV64-NEXT: srli a2, a2, 4 +; RV64-NEXT: ld a6, %lo(.LCPI2_0)(a6) +; RV64-NEXT: addi a4, a4, -585 +; RV64-NEXT: and a3, a3, a2 +; RV64-NEXT: and a2, a0, a2 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: slli a3, a4, 33 +; RV64-NEXT: add a3, a4, a3 +; RV64-NEXT: lui a4, %hi(.LCPI2_1) +; RV64-NEXT: ld a4, %lo(.LCPI2_1)(a4) +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: mulhu a5, a2, a6 +; RV64-NEXT: srli a5, a5, 1 +; RV64-NEXT: slli a6, a5, 3 +; RV64-NEXT: sub a5, a5, a6 +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: sub a5, a0, a2 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: mul a2, a5, a4 +; RV64-NEXT: mulhu a4, a5, a3 +; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: add a2, a4, a2 +; RV64-NEXT: mul a1, a1, a3 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: mul a0, a5, a3 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 7 ret iXLen2 %a @@ -143,24 +194,67 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind { define iXLen2 @test_udiv_9(iXLen2 %x) nounwind { ; RV32-LABEL: test_udiv_9: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 9 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __udivdi3 -; RV32-NEXT: lw ra, 12(sp) 
# 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lui a2, 262144 +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: srli a4, a0, 30 +; RV32-NEXT: srli a5, a1, 28 +; RV32-NEXT: lui a6, 233017 +; RV32-NEXT: or a3, a4, a3 +; RV32-NEXT: lui a4, 582542 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: addi a6, a6, -455 +; RV32-NEXT: addi a4, a4, 910 +; RV32-NEXT: and a3, a3, a2 +; RV32-NEXT: and a2, a0, a2 +; RV32-NEXT: add a2, a2, a3 +; RV32-NEXT: add a2, a2, a5 +; RV32-NEXT: mulhu a3, a2, a6 +; RV32-NEXT: srli a3, a3, 1 +; RV32-NEXT: slli a5, a3, 3 +; RV32-NEXT: add a3, a5, a3 +; RV32-NEXT: sub a2, a2, a3 +; RV32-NEXT: sub a3, a0, a2 +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: mul a2, a3, a4 +; RV32-NEXT: mulhu a4, a3, a6 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: add a2, a4, a2 +; RV32-NEXT: mul a1, a1, a6 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: mul a0, a3, a6 ; RV32-NEXT: ret ; ; RV64-LABEL: test_udiv_9: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 9 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __udivti3 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: slli a3, a1, 4 +; RV64-NEXT: srli a4, a0, 60 +; RV64-NEXT: srli a5, a1, 56 +; RV64-NEXT: lui a6, %hi(.LCPI3_0) +; RV64-NEXT: or a3, a4, a3 +; RV64-NEXT: lui a4, %hi(.LCPI3_1) +; RV64-NEXT: srli a2, a2, 4 +; RV64-NEXT: and a3, a3, a2 +; RV64-NEXT: and a2, a0, a2 +; RV64-NEXT: add a2, a2, a3 +; RV64-NEXT: lui a3, %hi(.LCPI3_2) +; RV64-NEXT: ld a6, %lo(.LCPI3_0)(a6) +; RV64-NEXT: ld a4, %lo(.LCPI3_1)(a4) +; RV64-NEXT: ld a3, %lo(.LCPI3_2)(a3) +; RV64-NEXT: add a2, a2, a5 +; RV64-NEXT: mulhu a5, a2, a6 +; RV64-NEXT: slli a6, a5, 3 +; RV64-NEXT: add a5, a6, a5 +; RV64-NEXT: sub a2, a2, a5 +; RV64-NEXT: sub a5, a0, a2 +; RV64-NEXT: sltu a0, a0, a2 +; RV64-NEXT: mul a2, a5, a4 +; RV64-NEXT: mulhu a4, a5, a3 +; RV64-NEXT: sub a1, a1, a0 +; RV64-NEXT: add a2, a4, a2 +; RV64-NEXT: mul 
a1, a1, a3 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: mul a0, a5, a3 ; RV64-NEXT: ret %a = udiv iXLen2 %x, 9 ret iXLen2 %a diff --git a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll index bc4a99a00ac6..2a890e8bb1aa 100644 --- a/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll +++ b/llvm/test/CodeGen/RISCV/split-urem-by-constant.ll @@ -79,24 +79,49 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind { define iXLen2 @test_urem_7(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_7: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 7 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lui a2, 262144 +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: srli a4, a0, 30 +; RV32-NEXT: srli a1, a1, 28 +; RV32-NEXT: or a3, a4, a3 +; RV32-NEXT: lui a4, 149797 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: addi a1, a4, -1755 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: sub a2, a0, a1 +; RV32-NEXT: srli a2, a2, 1 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: sub a1, a1, a2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_7: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 7 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: slli a3, a1, 4 +; RV64-NEXT: srli a4, a0, 60 +; RV64-NEXT: or a3, a4, a3 +; RV64-NEXT: lui a4, %hi(.LCPI2_0) +; RV64-NEXT: srli a2, a2, 4 +; RV64-NEXT: ld a4, %lo(.LCPI2_0)(a4) +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: add a0, a0, a2 
+; RV64-NEXT: srli a1, a1, 56 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: mulhu a1, a0, a4 +; RV64-NEXT: srli a1, a1, 1 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: sub a1, a1, a2 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 7 ret iXLen2 %a @@ -105,24 +130,45 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind { define iXLen2 @test_urem_9(iXLen2 %x) nounwind { ; RV32-LABEL: test_urem_9: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32-NEXT: li a2, 9 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __umoddi3 -; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: lui a2, 262144 +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: srli a4, a0, 30 +; RV32-NEXT: srli a1, a1, 28 +; RV32-NEXT: or a3, a4, a3 +; RV32-NEXT: lui a4, 233017 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a0, a2 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: add a0, a0, a2 +; RV32-NEXT: add a0, a0, a1 +; RV32-NEXT: addi a1, a4, -455 +; RV32-NEXT: mulhu a1, a0, a1 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: slli a2, a1, 3 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: li a1, 0 ; RV32-NEXT: ret ; ; RV64-LABEL: test_urem_9: ; RV64: # %bb.0: -; RV64-NEXT: addi sp, sp, -16 -; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill -; RV64-NEXT: li a2, 9 -; RV64-NEXT: li a3, 0 -; RV64-NEXT: call __umodti3 -; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload -; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: li a2, -1 +; RV64-NEXT: slli a3, a1, 4 +; RV64-NEXT: srli a4, a0, 60 +; RV64-NEXT: or a3, a4, a3 +; RV64-NEXT: lui a4, %hi(.LCPI3_0) +; RV64-NEXT: srli a2, a2, 4 +; RV64-NEXT: ld a4, %lo(.LCPI3_0)(a4) +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: and a2, a3, a2 +; RV64-NEXT: add a0, a0, a2 +; RV64-NEXT: srli a1, a1, 56 +; RV64-NEXT: add a0, a0, a1 +; RV64-NEXT: mulhu a1, a0, a4 +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: 
li a1, 0 ; RV64-NEXT: ret %a = urem iXLen2 %x, 9 ret iXLen2 %a diff --git a/llvm/test/CodeGen/RISCV/urem-lkk.ll b/llvm/test/CodeGen/RISCV/urem-lkk.ll index 449e56c82e74..00fe1a5eb111 100644 --- a/llvm/test/CodeGen/RISCV/urem-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-lkk.ll @@ -229,13 +229,28 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind { ; ; RV32IM-LABEL: dont_fold_urem_i64: ; RV32IM: # %bb.0: -; RV32IM-NEXT: addi sp, sp, -16 -; RV32IM-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32IM-NEXT: li a2, 98 -; RV32IM-NEXT: li a3, 0 -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload -; RV32IM-NEXT: addi sp, sp, 16 +; RV32IM-NEXT: slli a2, a1, 31 +; RV32IM-NEXT: srli a3, a0, 1 +; RV32IM-NEXT: andi a4, a1, 2046 +; RV32IM-NEXT: srli a1, a1, 11 +; RV32IM-NEXT: or a2, a3, a2 +; RV32IM-NEXT: slli a4, a4, 10 +; RV32IM-NEXT: srli a3, a2, 21 +; RV32IM-NEXT: or a3, a3, a4 +; RV32IM-NEXT: lui a4, 21400 +; RV32IM-NEXT: slli a2, a2, 11 +; RV32IM-NEXT: srli a2, a2, 11 +; RV32IM-NEXT: add a2, a2, a3 +; RV32IM-NEXT: li a3, 49 +; RV32IM-NEXT: addi a4, a4, -2006 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: mulhu a2, a1, a4 +; RV32IM-NEXT: mul a2, a2, a3 +; RV32IM-NEXT: sub a1, a1, a2 +; RV32IM-NEXT: slli a1, a1, 1 +; RV32IM-NEXT: andi a0, a0, 1 +; RV32IM-NEXT: or a0, a1, a0 +; RV32IM-NEXT: li a1, 0 ; RV32IM-NEXT: ret ; ; RV64I-LABEL: dont_fold_urem_i64: diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll index 7fb5ba5f7fc6..180fa6fd2b2f 100644 --- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll @@ -862,51 +862,58 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: sw s5, 20(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill ; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill -; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill -; RV32IM-NEXT: lw s1, 16(a1) -; RV32IM-NEXT: lw s2, 20(a1) -; RV32IM-NEXT: lw 
s3, 24(a1) -; RV32IM-NEXT: lw s4, 28(a1) -; RV32IM-NEXT: lw a3, 0(a1) -; RV32IM-NEXT: lw a4, 4(a1) -; RV32IM-NEXT: lw s5, 8(a1) -; RV32IM-NEXT: lw s6, 12(a1) ; RV32IM-NEXT: mv s0, a0 +; RV32IM-NEXT: lw a2, 16(a1) +; RV32IM-NEXT: lw a4, 20(a1) +; RV32IM-NEXT: lw s1, 24(a1) +; RV32IM-NEXT: lw s2, 28(a1) +; RV32IM-NEXT: lw a0, 0(a1) +; RV32IM-NEXT: lw a3, 4(a1) +; RV32IM-NEXT: lw s3, 8(a1) +; RV32IM-NEXT: lw s4, 12(a1) +; RV32IM-NEXT: lui a1, 1024 +; RV32IM-NEXT: slli a5, a4, 10 +; RV32IM-NEXT: srli a6, a2, 22 +; RV32IM-NEXT: or a5, a6, a5 +; RV32IM-NEXT: lui a6, 45590 +; RV32IM-NEXT: addi a1, a1, -1 +; RV32IM-NEXT: addi a6, a6, 1069 +; RV32IM-NEXT: and a2, a2, a1 +; RV32IM-NEXT: srli a4, a4, 12 +; RV32IM-NEXT: add a2, a2, a4 +; RV32IM-NEXT: and a1, a5, a1 +; RV32IM-NEXT: add a1, a2, a1 +; RV32IM-NEXT: mulhu a2, a1, a6 +; RV32IM-NEXT: li a4, 23 +; RV32IM-NEXT: mul a2, a2, a4 +; RV32IM-NEXT: sub s7, a1, a2 ; RV32IM-NEXT: li a2, 1 -; RV32IM-NEXT: mv a0, a3 -; RV32IM-NEXT: mv a1, a4 -; RV32IM-NEXT: li a3, 0 -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s7, a0 -; RV32IM-NEXT: mv s8, a1 -; RV32IM-NEXT: li a2, 654 -; RV32IM-NEXT: mv a0, s5 -; RV32IM-NEXT: mv a1, s6 +; RV32IM-NEXT: mv a1, a3 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3 ; RV32IM-NEXT: mv s5, a0 ; RV32IM-NEXT: mv s6, a1 -; RV32IM-NEXT: li a2, 23 -; RV32IM-NEXT: mv a0, s1 -; RV32IM-NEXT: mv a1, s2 -; RV32IM-NEXT: li a3, 0 -; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: mv s1, a0 -; RV32IM-NEXT: mv s2, a1 -; RV32IM-NEXT: lui a2, 1 -; RV32IM-NEXT: addi a2, a2, 1327 +; RV32IM-NEXT: li a2, 654 ; RV32IM-NEXT: mv a0, s3 ; RV32IM-NEXT: mv a1, s4 ; RV32IM-NEXT: li a3, 0 ; RV32IM-NEXT: call __umoddi3 -; RV32IM-NEXT: sw s1, 16(s0) -; RV32IM-NEXT: sw s2, 20(s0) +; RV32IM-NEXT: mv s3, a0 +; RV32IM-NEXT: mv s4, a1 +; RV32IM-NEXT: lui a2, 1 +; RV32IM-NEXT: addi a2, a2, 1327 +; RV32IM-NEXT: mv a0, s1 +; RV32IM-NEXT: mv a1, s2 +; RV32IM-NEXT: li a3, 0 +; RV32IM-NEXT: call __umoddi3 +; RV32IM-NEXT: sw s7, 16(s0) +; 
RV32IM-NEXT: sw zero, 20(s0) ; RV32IM-NEXT: sw a0, 24(s0) ; RV32IM-NEXT: sw a1, 28(s0) -; RV32IM-NEXT: sw s7, 0(s0) -; RV32IM-NEXT: sw s8, 4(s0) -; RV32IM-NEXT: sw s5, 8(s0) -; RV32IM-NEXT: sw s6, 12(s0) +; RV32IM-NEXT: sw s5, 0(s0) +; RV32IM-NEXT: sw s6, 4(s0) +; RV32IM-NEXT: sw s3, 8(s0) +; RV32IM-NEXT: sw s4, 12(s0) ; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload @@ -916,7 +923,6 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind { ; RV32IM-NEXT: lw s5, 20(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s6, 16(sp) # 4-byte Folded Reload ; RV32IM-NEXT: lw s7, 12(sp) # 4-byte Folded Reload -; RV32IM-NEXT: lw s8, 8(sp) # 4-byte Folded Reload ; RV32IM-NEXT: addi sp, sp, 48 ; RV32IM-NEXT: ret ; diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll index f15697aaf2df..e0bff50e2e2d 100644 --- a/llvm/test/CodeGen/X86/divide-by-constant.ll +++ b/llvm/test/CodeGen/X86/divide-by-constant.ll @@ -294,19 +294,48 @@ entry: define i64 @PR23590(i64 %x) nounwind { ; X86-LABEL: PR23590: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax ; X86-NEXT: pushl $0 ; X86-NEXT: pushl $12345 # imm = 0x3039 ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: pushl {{[0-9]+}}(%esp) ; X86-NEXT: calll __umoddi3 ; X86-NEXT: addl $16, %esp -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $7 -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %eax -; X86-NEXT: calll __udivdi3 -; X86-NEXT: addl $28, %esp +; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: andl $1073741823, %eax # imm = 0x3FFFFFFF +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shrdl $30, %ecx, %edx +; X86-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: shrl $28, %edi +; X86-NEXT: addl %eax, %edi +; X86-NEXT: addl %edx, %edi +; X86-NEXT: movl $613566757, %edx # imm 
= 0x24924925 +; X86-NEXT: movl %edi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: movl %edi, %eax +; X86-NEXT: subl %edx, %eax +; X86-NEXT: shrl %eax +; X86-NEXT: addl %edx, %eax +; X86-NEXT: shrl $2, %eax +; X86-NEXT: leal (,%eax,8), %edx +; X86-NEXT: subl %edx, %eax +; X86-NEXT: addl %edi, %eax +; X86-NEXT: subl %eax, %esi +; X86-NEXT: sbbl $0, %ecx +; X86-NEXT: movl $-1227133513, %edx # imm = 0xB6DB6DB7 +; X86-NEXT: movl %esi, %eax +; X86-NEXT: mull %edx +; X86-NEXT: imull $1840700269, %esi, %esi # imm = 0x6DB6DB6D +; X86-NEXT: addl %esi, %edx +; X86-NEXT: imull $-1227133513, %ecx, %ecx # imm = 0xB6DB6DB7 +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-FAST-LABEL: PR23590: diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll index af37be791d27..bac337a9c817 100644 --- a/llvm/test/CodeGen/X86/divmod128.ll +++ b/llvm/test/CodeGen/X86/divmod128.ll @@ -67,25 +67,42 @@ define i64 @div128(i128 %x) nounwind { define i64 @umod128(i128 %x) nounwind { ; X86-64-LABEL: umod128: ; X86-64: # %bb.0: -; X86-64-NEXT: pushq %rax -; X86-64-NEXT: movl $11, %edx -; X86-64-NEXT: xorl %ecx, %ecx -; X86-64-NEXT: callq __umodti3@PLT -; X86-64-NEXT: popq %rcx +; X86-64-NEXT: movabsq $1152921504606846975, %rcx # imm = 0xFFFFFFFFFFFFFFF +; X86-64-NEXT: movq %rdi, %rax +; X86-64-NEXT: andq %rcx, %rax +; X86-64-NEXT: shrdq $60, %rsi, %rdi +; X86-64-NEXT: andq %rdi, %rcx +; X86-64-NEXT: addq %rax, %rcx +; X86-64-NEXT: shrq $56, %rsi +; X86-64-NEXT: addq %rsi, %rcx +; X86-64-NEXT: movabsq $3353953467947191203, %rdx # imm = 0x2E8BA2E8BA2E8BA3 +; X86-64-NEXT: movq %rcx, %rax +; X86-64-NEXT: mulq %rdx +; X86-64-NEXT: shrq %rdx +; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax +; X86-64-NEXT: leaq (%rdx,%rax,2), %rax +; X86-64-NEXT: subq %rax, %rcx +; X86-64-NEXT: movq %rcx, %rax ; X86-64-NEXT: retq ; ; WIN64-LABEL: umod128: ; WIN64: # %bb.0: -; WIN64-NEXT: subq $72, %rsp -; WIN64-NEXT: movq %rdx, 
{{[0-9]+}}(%rsp) -; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp) -; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp) -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx -; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx -; WIN64-NEXT: callq __umodti3 -; WIN64-NEXT: movq %xmm0, %rax -; WIN64-NEXT: addq $72, %rsp +; WIN64-NEXT: movabsq $1152921504606846975, %r8 # imm = 0xFFFFFFFFFFFFFFF +; WIN64-NEXT: movq %rcx, %rax +; WIN64-NEXT: andq %r8, %rax +; WIN64-NEXT: shrdq $60, %rdx, %rcx +; WIN64-NEXT: andq %rcx, %r8 +; WIN64-NEXT: addq %rax, %r8 +; WIN64-NEXT: shrq $56, %rdx +; WIN64-NEXT: addq %rdx, %r8 +; WIN64-NEXT: movabsq $3353953467947191203, %rcx # imm = 0x2E8BA2E8BA2E8BA3 +; WIN64-NEXT: movq %r8, %rax +; WIN64-NEXT: mulq %rcx +; WIN64-NEXT: shrq %rdx +; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax +; WIN64-NEXT: leaq (%rdx,%rax,2), %rax +; WIN64-NEXT: subq %rax, %r8 +; WIN64-NEXT: movq %r8, %rax ; WIN64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll index 901183242132..8328ecddbebb 100644 --- a/llvm/test/CodeGen/X86/i128-udiv.ll +++ b/llvm/test/CodeGen/X86/i128-udiv.ll @@ -2,9 +2,6 @@ ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=X86 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64 -; Make sure none of these crash, and that the power-of-two transformations -; trigger correctly. - define i128 @test1(i128 %x) nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: @@ -35,10 +32,291 @@ define i128 @test1(i128 %x) nounwind { ret i128 %tmp } + +; X86 doesn't have __divti3, so the urem is expanded into a loop. define i128 @test2(i128 %x) nounwind { ; X86-LABEL: test2: -; X86 doesn't have __divti3, so the urem is expanded into a loop. 
-; X86: udiv-do-while +; X86: # %bb.0: # %_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $144, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: testl %edi, %edi +; X86-NEXT: jne .LBB1_1 +; X86-NEXT: # %bb.2: # %_udiv-special-cases +; X86-NEXT: bsrl %esi, %ebx +; X86-NEXT: xorl $31, %ebx +; X86-NEXT: orl $32, %ebx +; X86-NEXT: jmp .LBB1_3 +; X86-NEXT: .LBB1_1: +; X86-NEXT: bsrl %edi, %ebx +; X86-NEXT: xorl $31, %ebx +; X86-NEXT: .LBB1_3: # %_udiv-special-cases +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB1_4 +; X86-NEXT: # %bb.5: # %_udiv-special-cases +; X86-NEXT: bsrl %edx, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: orl $32, %eax +; X86-NEXT: jmp .LBB1_6 +; X86-NEXT: .LBB1_4: +; X86-NEXT: bsrl %ecx, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: .LBB1_6: # %_udiv-special-cases +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %edi, %esi +; X86-NEXT: jne .LBB1_8 +; X86-NEXT: # %bb.7: # %_udiv-special-cases +; X86-NEXT: orl $64, %eax +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: .LBB1_8: # %_udiv-special-cases +; X86-NEXT: negl %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB1_9 +; X86-NEXT: # %bb.10: # %select.false.sink +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl $127, %ecx +; X86-NEXT: cmpl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl $0, %ecx +; 
X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: setb %cl +; X86-NEXT: .LBB1_11: # %select.end +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NEXT: testb %cl, %cl +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, %edi +; X86-NEXT: movl $0, %ecx +; X86-NEXT: jne .LBB1_13 +; X86-NEXT: # %bb.12: # %select.end +; X86-NEXT: movl 28(%ebp), %ebx +; X86-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: .LBB1_13: # %select.end +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: jne .LBB1_14 +; X86-NEXT: # %bb.20: # %select.end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: xorl $127, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: je .LBB1_21 +; X86-NEXT: # %bb.18: # %udiv-bb1 +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax 
+; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 120(%esp,%eax), %edx +; X86-NEXT: movl 124(%esp,%eax), %edi +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 112(%esp,%eax), %edi +; X86-NEXT: movl 116(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jb .LBB1_19 +; X86-NEXT: # %bb.15: # %udiv-preheader +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %esi +; X86-NEXT: movl 76(%esp,%esi), %eax +; X86-NEXT: movl 72(%esp,%esi), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: shrdl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 64(%esp,%esi), %edx +; X86-NEXT: movl 68(%esp,%esi), %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: shrdl %cl, %edi, %ebx +; X86-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-4, %eax +; X86-NEXT: addl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-1, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB1_16: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl $1, %esi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl (%esp), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %eax +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: addl %eax, %eax +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-1, %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: movl $-1, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl $-1, %eax +; X86-NEXT: andl %eax, %esi +; X86-NEXT: movl $-4, %eax +; X86-NEXT: andl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl $-1, %esi +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %esi +; X86-NEXT: orl %ecx, %esi +; 
X86-NEXT: jne .LBB1_16 +; X86-NEXT: .LBB1_17: # %udiv-loop-exit +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %edx +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: leal (%esi,%eax,2), %esi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: .LBB1_21: # %udiv-end +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; X86-NEXT: .LBB1_9: +; X86-NEXT: movb $1, %cl +; X86-NEXT: jmp .LBB1_11 +; X86-NEXT: .LBB1_19: +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jmp .LBB1_17 +; X86-NEXT: .LBB1_14: +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: jmp .LBB1_21 ; ; X64-LABEL: test2: ; X64: # %bb.0: @@ -52,10 +330,304 @@ define i128 @test2(i128 %x) nounwind { ret i128 %tmp } +; X86 doesn't have __divti3, so the urem is expanded into a loop. define i128 @test3(i128 %x) nounwind { ; X86-LABEL: test3: -; X86 doesn't have __divti3, so the urem is expanded into a loop. 
-; X86: udiv-do-while +; X86: # %bb.0: # %_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $160, %esp +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB2_1 +; X86-NEXT: # %bb.2: # %_udiv-special-cases +; X86-NEXT: bsrl %edi, %ebx +; X86-NEXT: xorl $31, %ebx +; X86-NEXT: orl $32, %ebx +; X86-NEXT: jmp .LBB2_3 +; X86-NEXT: .LBB2_1: +; X86-NEXT: bsrl %edx, %ebx +; X86-NEXT: xorl $31, %ebx +; X86-NEXT: .LBB2_3: # %_udiv-special-cases +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: testl %esi, %esi +; X86-NEXT: jne .LBB2_4 +; X86-NEXT: # %bb.5: # %_udiv-special-cases +; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx +; X86-NEXT: jmp .LBB2_6 +; X86-NEXT: .LBB2_4: +; X86-NEXT: bsrl %esi, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB2_6: # %_udiv-special-cases +; X86-NEXT: orl %edx, %esi +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %edx, %edi +; X86-NEXT: jne .LBB2_8 +; X86-NEXT: # %bb.7: # %_udiv-special-cases +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: .LBB2_8: # %_udiv-special-cases +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: negl %ebx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB2_9 +; X86-NEXT: # %bb.10: # %select.false.sink +; X86-NEXT: movl $127, %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: sbbl %ecx, %eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %ebx, 
%eax +; X86-NEXT: setb %cl +; X86-NEXT: .LBB2_11: # %select.end +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: testb %cl, %cl +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: jne .LBB2_13 +; X86-NEXT: # %bb.12: # %select.end +; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: .LBB2_13: # %select.end +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: jne .LBB2_14 +; X86-NEXT: # %bb.20: # %select.end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: xorl $127, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: je .LBB2_21 +; X86-NEXT: # %bb.18: # %udiv-bb1 +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; 
X86-NEXT: movl 136(%esp,%eax), %edx +; X86-NEXT: movl 140(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esp,%eax), %esi +; X86-NEXT: movl 132(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, %ebx +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jb .LBB2_19 +; X86-NEXT: # %bb.15: # %udiv-preheader +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %edx +; X86-NEXT: movl 92(%esp,%edx), %edi +; X86-NEXT: movl 88(%esp,%edx), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shrdl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%edx), %eax +; X86-NEXT: movl 84(%esp,%edx), %ebx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: shrdl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrl %cl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-3, %eax +; X86-NEXT: addl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-1, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-5, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $-1, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB2_16: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: shldl $1, %ecx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-NEXT: shldl $1, %esi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl $1, %edx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edx, %edx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl $-1, %esi +; X86-NEXT: andl %esi, %edi +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl $-5, %edx +; X86-NEXT: andl %edx, %esi +; X86-NEXT: movl $-3, %edx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: subl %ecx, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: sbbl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl $-1, %esi +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ebx # 4-byte Reload +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB2_16 +; X86-NEXT: .LBB2_17: # %udiv-loop-exit +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %ebx +; X86-NEXT: shldl $1, %esi, %ecx +; X86-NEXT: shldl $1, %edx, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: leal (%eax,%edx,2), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: .LBB2_21: # %udiv-end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; X86-NEXT: .LBB2_9: +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movb $1, %cl +; X86-NEXT: jmp .LBB2_11 +; X86-NEXT: .LBB2_19: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jmp .LBB2_17 +; X86-NEXT: .LBB2_14: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: jmp 
.LBB2_21 ; ; X64-LABEL: test3: ; X64: # %bb.0: @@ -68,3 +640,2283 @@ define i128 @test3(i128 %x) nounwind { %tmp = udiv i128 %x, -73786976294838206467 ret i128 %tmp } + +define i128 @div_by_7(i128 %x) nounwind { +; X86-LABEL: div_by_7: +; X86: # %bb.0: # %entry_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $160, %esp +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: jne .LBB3_1 +; X86-NEXT: # %bb.2: # %entry_udiv-special-cases +; X86-NEXT: bsrl %edi, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi +; X86-NEXT: jmp .LBB3_3 +; X86-NEXT: .LBB3_1: +; X86-NEXT: bsrl %ebx, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: .LBB3_3: # %entry_udiv-special-cases +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB3_4 +; X86-NEXT: # %bb.5: # %entry_udiv-special-cases +; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx +; X86-NEXT: jmp .LBB3_6 +; X86-NEXT: .LBB3_4: +; X86-NEXT: bsrl %edx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB3_6: # %entry_udiv-special-cases +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: jne .LBB3_8 +; X86-NEXT: # %bb.7: # %entry_udiv-special-cases +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: .LBB3_8: # %entry_udiv-special-cases +; X86-NEXT: movl $125, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB3_9 +; X86-NEXT: # %bb.10: # %select.false.sink +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl $127, %ecx +; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: setb %cl +; X86-NEXT: .LBB3_11: # %select.end +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: testb %cl, %cl +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jne .LBB3_13 +; X86-NEXT: # %bb.12: # %select.end +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: .LBB3_13: # %select.end +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jne .LBB3_14 +; X86-NEXT: # %bb.20: # %select.end +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorl $127, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: je .LBB3_21 +; X86-NEXT: # %bb.18: # %udiv-bb1 +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), 
%eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 136(%esp,%eax), %edx +; X86-NEXT: movl 140(%esp,%eax), %edi +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esp,%eax), %edi +; X86-NEXT: movl 132(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jb .LBB3_19 +; X86-NEXT: # %bb.15: # %udiv-preheader +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %edi +; X86-NEXT: movl 92(%esp,%edi), %eax +; X86-NEXT: movl 88(%esp,%edi), 
%ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shrdl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%edi), %esi +; X86-NEXT: movl 84(%esp,%edi), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: shrdl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $7, %eax +; X86-NEXT: addl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB3_16: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ebx 
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $7, %eax +; X86-NEXT: andl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 
4-byte Reload +; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl $-1, %ebx +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB3_16 +; X86-NEXT: .LBB3_17: # %udiv-loop-exit +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: leal (%esi,%eax,2), %edi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: .LBB3_21: # %udiv-end +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; X86-NEXT: .LBB3_9: +; X86-NEXT: movb $1, %cl +; X86-NEXT: jmp .LBB3_11 +; X86-NEXT: .LBB3_19: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jmp .LBB3_17 +; X86-NEXT: .LBB3_14: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jmp .LBB3_21 +; +; X64-LABEL: div_by_7: +; X64: # %bb.0: # %entry +; X64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF +; X64-NEXT: movq %rdi, %rcx +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: movq %rdi, %rdx +; X64-NEXT: shrdq $60, %rsi, %rdx +; X64-NEXT: andq %rax, %rdx +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: shrq $56, %rcx +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rdx +; X64-NEXT: shrq %rdx +; X64-NEXT: leaq (,%rdx,8), %rax +; X64-NEXT: subq %rax, %rdx +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: sbbq $0, %rsi +; X64-NEXT: movabsq $-5270498306774157605, %rcx # imm = 0xB6DB6DB6DB6DB6DB +; X64-NEXT: imulq %rdi, %rcx +; X64-NEXT: movabsq $7905747460161236407, %r8 # imm = 0x6DB6DB6DB6DB6DB7 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: imulq %rsi, %r8 +; X64-NEXT: addq %r8, %rdx +; X64-NEXT: retq +entry: + %div = udiv i128 %x, 7 + ret i128 %div +} + +define i128 @div_by_11(i128 %x) nounwind { +; X86-LABEL: div_by_11: +; X86: # %bb.0: # %_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $160, %esp +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # %bb.2: # %_udiv-special-cases +; X86-NEXT: bsrl %edi, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: +; X86-NEXT: bsrl %ebx, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: .LBB4_3: # %_udiv-special-cases +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB4_4 +; X86-NEXT: # %bb.5: # 
%_udiv-special-cases +; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx +; X86-NEXT: jmp .LBB4_6 +; X86-NEXT: .LBB4_4: +; X86-NEXT: bsrl %edx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB4_6: # %_udiv-special-cases +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: jne .LBB4_8 +; X86-NEXT: # %bb.7: # %_udiv-special-cases +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: .LBB4_8: # %_udiv-special-cases +; X86-NEXT: movl $124, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB4_9 +; X86-NEXT: # %bb.10: # %select.false.sink +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl $127, %ecx +; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: setb %cl +; X86-NEXT: .LBB4_11: # %select.end +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: testb %cl, %cl +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jne .LBB4_13 +; X86-NEXT: # %bb.12: # %select.end +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: .LBB4_13: # %select.end +; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jne .LBB4_14 +; X86-NEXT: # %bb.20: # %select.end +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorl $127, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: je .LBB4_21 +; X86-NEXT: # %bb.18: # %udiv-bb1 +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 136(%esp,%eax), %edx +; X86-NEXT: movl 140(%esp,%eax), %edi +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esp,%eax), %edi +; X86-NEXT: movl 132(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, 
%ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jb .LBB4_19 +; X86-NEXT: # %bb.15: # %udiv-preheader +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %edi +; X86-NEXT: movl 92(%esp,%edi), %eax +; X86-NEXT: movl 88(%esp,%edi), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shrdl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%edi), %esi +; X86-NEXT: movl 84(%esp,%edi), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: shrdl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $11, %eax +; X86-NEXT: addl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; 
X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB4_16: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %edi, 
%ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $11, %eax +; X86-NEXT: andl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl $-1, %ebx +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB4_16 +; X86-NEXT: .LBB4_17: # %udiv-loop-exit +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: leal (%esi,%eax,2), %edi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: .LBB4_21: # %udiv-end +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; X86-NEXT: .LBB4_9: +; X86-NEXT: movb $1, %cl +; X86-NEXT: jmp .LBB4_11 +; X86-NEXT: .LBB4_19: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jmp .LBB4_17 +; X86-NEXT: .LBB4_14: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jmp .LBB4_21 +; +; X64-LABEL: div_by_11: +; X64: # %bb.0: +; X64-NEXT: movabsq $1152921504606846975, %rax # imm = 0xFFFFFFFFFFFFFFF +; X64-NEXT: movq %rdi, %rcx +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: movq %rdi, %rdx +; X64-NEXT: shrdq $60, %rsi, %rdx +; X64-NEXT: andq %rax, %rdx +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: shrq $56, %rcx +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movabsq $3353953467947191203, %r8 # imm = 0x2E8BA2E8BA2E8BA3 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: shrq %rdx +; X64-NEXT: leaq (%rdx,%rdx,4), %rax +; X64-NEXT: leaq (%rdx,%rax,2), %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: subq %rcx, %rdi +; X64-NEXT: sbbq $0, %rsi +; X64-NEXT: movabsq $-6707906935894382406, %rcx # imm = 0xA2E8BA2E8BA2E8BA +; X64-NEXT: imulq %rdi, %rcx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: imulq %rsi, %r8 +; X64-NEXT: addq %r8, %rdx +; 
X64-NEXT: retq + %div = udiv i128 %x, 11 + ret i128 %div +} + +define i128 @div_by_22(i128 %x) nounwind { +; X86-LABEL: div_by_22: +; X86: # %bb.0: # %entry_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $160, %esp +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # %bb.2: # %entry_udiv-special-cases +; X86-NEXT: bsrl %edi, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi +; X86-NEXT: jmp .LBB5_3 +; X86-NEXT: .LBB5_1: +; X86-NEXT: bsrl %ebx, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: .LBB5_3: # %entry_udiv-special-cases +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB5_4 +; X86-NEXT: # %bb.5: # %entry_udiv-special-cases +; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx +; X86-NEXT: jmp .LBB5_6 +; X86-NEXT: .LBB5_4: +; X86-NEXT: bsrl %edx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB5_6: # %entry_udiv-special-cases +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: jne .LBB5_8 +; X86-NEXT: # %bb.7: # %entry_udiv-special-cases +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: .LBB5_8: # %entry_udiv-special-cases +; X86-NEXT: movl $123, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB5_9 +; X86-NEXT: # %bb.10: # %select.false.sink 
+; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl $127, %ecx +; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: setb %cl +; X86-NEXT: .LBB5_11: # %select.end +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: testb %cl, %cl +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jne .LBB5_13 +; X86-NEXT: # %bb.12: # %select.end +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: .LBB5_13: # %select.end +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jne .LBB5_14 +; X86-NEXT: # %bb.20: # %select.end +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorl $127, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: je .LBB5_21 +; X86-NEXT: # %bb.18: # %udiv-bb1 +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 136(%esp,%eax), %edx +; X86-NEXT: movl 140(%esp,%eax), %edi +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esp,%eax), %edi +; X86-NEXT: movl 132(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jb .LBB5_19 +; X86-NEXT: # %bb.15: # %udiv-preheader +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %edi +; X86-NEXT: movl 92(%esp,%edi), %eax +; X86-NEXT: movl 88(%esp,%edi), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shrdl %cl, %eax, %esi +; X86-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%edi), %esi +; X86-NEXT: movl 84(%esp,%edi), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: shrdl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $22, %eax +; X86-NEXT: addl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB5_16: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, 
%edx, %edi +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $22, %eax +; X86-NEXT: andl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl $-1, %ebx +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB5_16 +; X86-NEXT: .LBB5_17: # %udiv-loop-exit +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: leal (%esi,%eax,2), %edi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: .LBB5_21: # %udiv-end +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; X86-NEXT: .LBB5_9: +; X86-NEXT: movb $1, %cl +; X86-NEXT: jmp .LBB5_11 +; X86-NEXT: .LBB5_19: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jmp .LBB5_17 +; X86-NEXT: .LBB5_14: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jmp .LBB5_21 +; +; 
X64-LABEL: div_by_22: +; X64: # %bb.0: # %entry +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrdq $1, %rsi, %rax +; X64-NEXT: movabsq $1152921504606846975, %rcx # imm = 0xFFFFFFFFFFFFFFF +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: shrq %rdx +; X64-NEXT: shldq $4, %rax, %rdx +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: shrq $57, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movabsq $3353953467947191203, %r8 # imm = 0x2E8BA2E8BA2E8BA3 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: shrq %rdx +; X64-NEXT: leaq (%rdx,%rdx,4), %rax +; X64-NEXT: leaq (%rdx,%rax,2), %rax +; X64-NEXT: subq %rax, %rcx +; X64-NEXT: subq %rcx, %rdi +; X64-NEXT: sbbq $0, %rsi +; X64-NEXT: movabsq $-6707906935894382406, %rcx # imm = 0xA2E8BA2E8BA2E8BA +; X64-NEXT: imulq %rdi, %rcx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: imulq %rsi, %r8 +; X64-NEXT: addq %r8, %rdx +; X64-NEXT: retq +entry: + %div = udiv i128 %x, 22 + ret i128 %div +} + +define i128 @div_by_56(i128 %x) nounwind { +; X86-LABEL: div_by_56: +; X86: # %bb.0: # %_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $160, %esp +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: jne .LBB6_1 +; X86-NEXT: # %bb.2: # %_udiv-special-cases +; X86-NEXT: bsrl %edi, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi +; X86-NEXT: jmp .LBB6_3 +; X86-NEXT: .LBB6_1: +; X86-NEXT: bsrl %ebx, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: .LBB6_3: # %_udiv-special-cases +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB6_4 +; X86-NEXT: # %bb.5: # %_udiv-special-cases +; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl 
$32, %ecx +; X86-NEXT: jmp .LBB6_6 +; X86-NEXT: .LBB6_4: +; X86-NEXT: bsrl %edx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB6_6: # %_udiv-special-cases +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: jne .LBB6_8 +; X86-NEXT: # %bb.7: # %_udiv-special-cases +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: .LBB6_8: # %_udiv-special-cases +; X86-NEXT: movl $122, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB6_9 +; X86-NEXT: # %bb.10: # %select.false.sink +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl $127, %ecx +; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: setb %cl +; X86-NEXT: .LBB6_11: # %select.end +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: testb %cl, %cl +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jne .LBB6_13 +; X86-NEXT: # %bb.12: # %select.end +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: .LBB6_13: # %select.end +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jne .LBB6_14 +; X86-NEXT: # %bb.20: # %select.end +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorl $127, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: je .LBB6_21 +; X86-NEXT: # %bb.18: # %udiv-bb1 +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 136(%esp,%eax), %edx +; X86-NEXT: movl 140(%esp,%eax), %edi +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esp,%eax), %edi +; X86-NEXT: movl 132(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jb .LBB6_19 +; X86-NEXT: # %bb.15: # %udiv-preheader +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %edi +; X86-NEXT: movl 92(%esp,%edi), %eax +; X86-NEXT: movl 88(%esp,%edi), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shrdl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%edi), %esi +; X86-NEXT: movl 84(%esp,%edi), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: shrdl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $56, %eax +; X86-NEXT: addl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: 
movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB6_16: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx +; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $56, %eax +; X86-NEXT: andl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl $-1, %ebx +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB6_16 +; X86-NEXT: .LBB6_17: # %udiv-loop-exit +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: leal (%esi,%eax,2), %edi +; X86-NEXT: movl 8(%ebp), %eax +; 
X86-NEXT: .LBB6_21: # %udiv-end +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; X86-NEXT: .LBB6_9: +; X86-NEXT: movb $1, %cl +; X86-NEXT: jmp .LBB6_11 +; X86-NEXT: .LBB6_19: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jmp .LBB6_17 +; X86-NEXT: .LBB6_14: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jmp .LBB6_21 +; +; X64-LABEL: div_by_56: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrdq $3, %rsi, %rax +; X64-NEXT: movabsq $1152921504606846975, %rcx # imm = 0xFFFFFFFFFFFFFFF +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: shrq $3, %rdx +; X64-NEXT: shldq $4, %rax, %rdx +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: shrq $59, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rdx +; X64-NEXT: shrq %rdx +; X64-NEXT: leaq (,%rdx,8), %rax +; X64-NEXT: subq %rax, %rdx +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: subq %rdx, %rdi +; X64-NEXT: sbbq $0, %rsi +; X64-NEXT: movabsq $-5270498306774157605, %rcx # imm = 0xB6DB6DB6DB6DB6DB +; X64-NEXT: imulq %rdi, %rcx +; X64-NEXT: movabsq $7905747460161236407, %r8 # imm = 0x6DB6DB6DB6DB6DB7 +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: mulq %r8 +; X64-NEXT: addq %rcx, %rdx +; X64-NEXT: imulq %rsi, %r8 +; X64-NEXT: addq %r8, %rdx 
+; X64-NEXT: retq + %div = udiv i128 %x, 56 ; 8 * 7 + ret i128 %div +} + +define i128 @rem_by_7(i128 %x) nounwind { +; X86-LABEL: rem_by_7: +; X86: # %bb.0: # %_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $160, %esp +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: jne .LBB7_1 +; X86-NEXT: # %bb.2: # %_udiv-special-cases +; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: orl $32, %edx +; X86-NEXT: jmp .LBB7_3 +; X86-NEXT: .LBB7_1: +; X86-NEXT: bsrl %ebx, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: .LBB7_3: # %_udiv-special-cases +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: testl %eax, %eax +; X86-NEXT: jne .LBB7_4 +; X86-NEXT: # %bb.5: # %_udiv-special-cases +; X86-NEXT: bsrl %esi, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx +; X86-NEXT: jmp .LBB7_6 +; X86-NEXT: .LBB7_4: +; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB7_6: # %_udiv-special-cases +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %esi +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: jne .LBB7_8 +; X86-NEXT: # %bb.7: # %_udiv-special-cases +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: .LBB7_8: # %_udiv-special-cases +; X86-NEXT: movl $125, %ebx +; X86-NEXT: subl %edx, %ebx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB7_9 +; 
X86-NEXT: # %bb.10: # %select.false.sink +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl $127, %ecx +; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: setb %al +; X86-NEXT: .LBB7_11: # %select.end +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: testb %al, %al +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: movl $0, %esi +; X86-NEXT: jne .LBB7_13 +; X86-NEXT: # %bb.12: # %select.end +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: .LBB7_13: # %select.end +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB7_14 +; X86-NEXT: # %bb.20: # %select.end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: xorl $127, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: je .LBB7_21 +; X86-NEXT: # %bb.18: # %udiv-bb1 +; X86-NEXT: movl 24(%ebp), %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 136(%esp,%eax), %esi +; X86-NEXT: movl 140(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esp,%eax), %edx +; X86-NEXT: movl 132(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, %ebx +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: jb .LBB7_19 +; X86-NEXT: # %bb.15: # %udiv-preheader +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 92(%esp,%eax), %esi +; X86-NEXT: movl 88(%esp,%eax), %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: shrdl %cl, %esi, %edi +; X86-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %edi +; X86-NEXT: movl 84(%esp,%eax), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $7, %edx +; X86-NEXT: addl $-1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %edx +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %edx +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %edx +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB7_16: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, %esi +; X86-NEXT: shldl $1, %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl $1, %ecx, %eax 
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $7, %edx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: adcl $-1, %esi +; 
X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl $0, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB7_16 +; X86-NEXT: .LBB7_17: # %udiv-loop-exit +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %eax +; X86-NEXT: shldl $1, %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: leal (%esi,%edi,2), %ebx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: .LBB7_21: # %udiv-end +; X86-NEXT: movl $7, %ecx +; X86-NEXT: imull %ecx, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $7, %eax +; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $7, %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl 24(%ebp), %ebx +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; X86-NEXT: .LBB7_9: +; X86-NEXT: movb $1, %al +; X86-NEXT: jmp .LBB7_11 +; X86-NEXT: .LBB7_19: +; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jmp .LBB7_17 +; X86-NEXT: .LBB7_14: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: jmp .LBB7_21 +; +; X64-LABEL: rem_by_7: +; X64: # %bb.0: +; X64-NEXT: movabsq $1152921504606846975, %rcx # imm = 0xFFFFFFFFFFFFFFF +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: shrdq $60, %rsi, %rdi +; X64-NEXT: andq %rdi, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: shrq $56, %rsi +; X64-NEXT: addq %rsi, %rcx +; X64-NEXT: movabsq $5270498306774157605, %rdx # imm = 0x4924924924924925 +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: mulq %rdx +; X64-NEXT: shrq %rdx +; X64-NEXT: leaq (,%rdx,8), %rax +; X64-NEXT: subq %rax, %rdx +; X64-NEXT: addq %rdx, %rcx +; X64-NEXT: movq %rcx, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: retq + %rem = urem i128 %x, 7 + ret i128 %rem +} + +define i128 @rem_by_14(i128 %x) nounwind { +; X86-LABEL: rem_by_14: +; X86: # %bb.0: # %_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $160, %esp +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: jne .LBB8_1 +; X86-NEXT: # %bb.2: # %_udiv-special-cases +; X86-NEXT: bsrl %edi, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: orl $32, %edx +; X86-NEXT: jmp .LBB8_3 +; X86-NEXT: .LBB8_1: +; X86-NEXT: bsrl %ebx, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: .LBB8_3: # %_udiv-special-cases +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: testl %eax, %eax +; X86-NEXT: jne .LBB8_4 +; X86-NEXT: # %bb.5: # %_udiv-special-cases +; X86-NEXT: bsrl %esi, %ecx +; X86-NEXT: xorl $31, 
%ecx +; X86-NEXT: orl $32, %ecx +; X86-NEXT: jmp .LBB8_6 +; X86-NEXT: .LBB8_4: +; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB8_6: # %_udiv-special-cases +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %esi +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: jne .LBB8_8 +; X86-NEXT: # %bb.7: # %_udiv-special-cases +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: .LBB8_8: # %_udiv-special-cases +; X86-NEXT: movl $124, %ebx +; X86-NEXT: subl %edx, %ebx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB8_9 +; X86-NEXT: # %bb.10: # %select.false.sink +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl $127, %ecx +; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: sbbl %edi, %esi +; X86-NEXT: setb %al +; X86-NEXT: .LBB8_11: # %select.end +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: testb %al, %al +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: movl $0, %esi +; X86-NEXT: jne .LBB8_13 +; X86-NEXT: # %bb.12: # %select.end +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: .LBB8_13: # %select.end +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Reload +; X86-NEXT: jne .LBB8_14 +; X86-NEXT: # %bb.20: # %select.end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: xorl $127, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: orl %edi, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: je .LBB8_21 +; X86-NEXT: # %bb.18: # %udiv-bb1 +; X86-NEXT: movl 24(%ebp), %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 136(%esp,%eax), %esi +; X86-NEXT: movl 140(%esp,%eax), %edx +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esp,%eax), %edx +; X86-NEXT: movl 132(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, %ebx +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Folded Spill +; X86-NEXT: adcl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: jb .LBB8_19 +; X86-NEXT: # %bb.15: # %udiv-preheader +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 92(%esp,%eax), %esi +; X86-NEXT: movl 88(%esp,%eax), %edx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: shrdl %cl, %esi, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %edi +; X86-NEXT: movl 84(%esp,%eax), %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: shrdl %cl, %edx, %eax +; X86-NEXT: shrl %cl, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %ebx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $14, %edx +; X86-NEXT: addl $-1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %edx +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %edx +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %edx +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB8_16: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, %esi +; X86-NEXT: shldl $1, %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ecx +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %edi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmpl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $1, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $14, %edx +; X86-NEXT: andl %edx, %ecx +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: sbbl $0, %eax +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: addl $-1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: adcl $-1, %ecx +; X86-NEXT: adcl $-1, %ebx +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl $0, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB8_16 +; X86-NEXT: .LBB8_17: # %udiv-loop-exit +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %eax +; X86-NEXT: shldl $1, %edi, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: leal (%esi,%edi,2), %ebx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: .LBB8_21: # %udiv-end +; X86-NEXT: movl $14, %ecx +; X86-NEXT: imull %ecx, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: mull %ecx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $14, %eax 
+; X86-NEXT: mull %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $14, %eax +; X86-NEXT: mull %edi +; X86-NEXT: addl %ebx, %eax +; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: adcl %esi, %ecx +; X86-NEXT: movl 24(%ebp), %ebx +; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; X86-NEXT: .LBB8_9: +; X86-NEXT: movb $1, %al +; X86-NEXT: jmp .LBB8_11 +; X86-NEXT: .LBB8_19: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jmp .LBB8_17 +; X86-NEXT: .LBB8_14: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: jmp .LBB8_21 +; +; X64-LABEL: rem_by_14: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: shrdq $1, %rsi, %rax +; X64-NEXT: movabsq $1152921504606846975, %rcx # imm = 0xFFFFFFFFFFFFFFF +; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: shrq %rdx +; X64-NEXT: shldq $4, %rax, %rdx +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: shrq $57, %rsi +; X64-NEXT: addq %rax, %rsi +; X64-NEXT: addq %rdx, %rsi +; X64-NEXT: movabsq $5270498306774157605, %rcx # imm = 0x4924924924924925 +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: mulq %rcx +; X64-NEXT: shrq %rdx +; X64-NEXT: leal (,%rdx,8), %eax +; 
X64-NEXT: subl %eax, %edx +; X64-NEXT: addl %edx, %esi +; X64-NEXT: andl $1, %edi +; X64-NEXT: leaq (%rdi,%rsi,2), %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: retq + %rem = urem i128 %x, 14 + ret i128 %rem +} + +define i128 @div_by_67(i128 %x) nounwind { +; X86-LABEL: div_by_67: +; X86: # %bb.0: # %entry_udiv-special-cases +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $160, %esp +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # %bb.2: # %entry_udiv-special-cases +; X86-NEXT: bsrl %edi, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: +; X86-NEXT: bsrl %ebx, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: .LBB9_3: # %entry_udiv-special-cases +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB9_4 +; X86-NEXT: # %bb.5: # %entry_udiv-special-cases +; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx +; X86-NEXT: jmp .LBB9_6 +; X86-NEXT: .LBB9_4: +; X86-NEXT: bsrl %edx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB9_6: # %entry_udiv-special-cases +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edi, %eax +; X86-NEXT: orl %ebx, %edi +; X86-NEXT: jne .LBB9_8 +; X86-NEXT: # %bb.7: # %entry_udiv-special-cases +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: .LBB9_8: # %entry_udiv-special-cases +; X86-NEXT: movl $121, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: je .LBB9_9 +; X86-NEXT: # %bb.10: # %select.false.sink +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl $127, %ecx +; X86-NEXT: cmpl %ebx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %esi, %ecx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: setb %cl +; X86-NEXT: .LBB9_11: # %select.end +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: testb %cl, %cl +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $0, %ecx +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jne .LBB9_13 +; X86-NEXT: # %bb.12: # %select.end +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: .LBB9_13: # %select.end +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jne .LBB9_14 +; X86-NEXT: # %bb.20: # %select.end +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorl $127, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: je .LBB9_21 +; X86-NEXT: # %bb.18: # %udiv-bb1 +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; 
X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: xorb $127, %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 136(%esp,%eax), %edx +; X86-NEXT: movl 140(%esp,%eax), %edi +; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 128(%esp,%eax), %edi +; X86-NEXT: movl 132(%esp,%eax), %eax +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shll %cl, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl $1, %ebx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jb .LBB9_19 +; X86-NEXT: # %bb.15: # %udiv-preheader +; X86-NEXT: movl %ebx, %ecx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: movzbl %al, %edi +; 
X86-NEXT: movl 92(%esp,%edi), %eax +; X86-NEXT: movl 88(%esp,%edi), %ebx +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: shrdl %cl, %eax, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%edi), %esi +; X86-NEXT: movl 84(%esp,%edi), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: shrdl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shrl %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shrdl %cl, %edi, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $67, %eax +; X86-NEXT: addl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, %eax +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB9_16: # %udiv-do-while +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: shldl $1, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: shldl $1, %edi, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %edi +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl %ecx, %ecx +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmpl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: andl $1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl $67, %eax +; X86-NEXT: andl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: addl $-1, %ebx +; X86-NEXT: adcl $-1, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: adcl $-1, %edx +; X86-NEXT: adcl $-1, %esi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: orl %esi, %ecx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: orl %ecx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jne .LBB9_16 +; X86-NEXT: .LBB9_17: # %udiv-loop-exit +; X86-NEXT: shldl $1, %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: shldl $1, %edx, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: shldl $1, %eax, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: leal (%esi,%eax,2), %edi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: .LBB9_21: # %udiv-end +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; X86-NEXT: .LBB9_9: +; X86-NEXT: movb $1, %cl +; X86-NEXT: jmp .LBB9_11 +; X86-NEXT: .LBB9_19: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-NEXT: jmp .LBB9_17 +; X86-NEXT: .LBB9_14: +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: jmp .LBB9_21 +; +; X64-LABEL: div_by_67: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: movl $67, %edx +; X64-NEXT: xorl %ecx, %ecx +; X64-NEXT: callq __udivti3@PLT +; X64-NEXT: popq %rcx +; X64-NEXT: retq +entry: + %div = udiv i128 %x, 67 + ret i128 %div +} diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll index cbc2b968eec7..ccb957a81025 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -940,3 +940,477 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind { %res = urem <16 x i8> %a, ret <16 x i8> %res } + +define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) nounwind { +; SSE-LABEL: v2i128_div_by_7: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %rcx, %r9 +; SSE-NEXT: movq %rdx, %rcx +; SSE-NEXT: movabsq $1152921504606846975, %r14 # imm = 0xFFFFFFFFFFFFFFF +; SSE-NEXT: movq %rsi, %rax +; SSE-NEXT: andq %r14, %rax +; SSE-NEXT: movq %rsi, %rdx +; SSE-NEXT: shrdq $60, %rcx, %rdx +; SSE-NEXT: andq %r14, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: movq %rcx, %r10 +; SSE-NEXT: shrq $56, %r10 +; SSE-NEXT: addq %rdx, %r10 +; SSE-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925 +; SSE-NEXT: movq %r10, %rax +; SSE-NEXT: mulq %r15 +; SSE-NEXT: shrq %rdx +; SSE-NEXT: leaq (,%rdx,8), %rax +; SSE-NEXT: subq %rax, %rdx +; SSE-NEXT: addq %r10, %rdx +; SSE-NEXT: subq %rdx, %rsi +; SSE-NEXT: sbbq $0, %rcx +; SSE-NEXT: movabsq $-5270498306774157605, %r11 # imm = 0xB6DB6DB6DB6DB6DB +; SSE-NEXT: movq %rsi, %r10 +; SSE-NEXT: imulq %r11, %r10 +; SSE-NEXT: movabsq $7905747460161236407, %rbx # imm = 0x6DB6DB6DB6DB6DB7 +; SSE-NEXT: movq %rsi, %rax +; SSE-NEXT: mulq %rbx +; SSE-NEXT: movq %rax, %rsi +; 
SSE-NEXT: addq %r10, %rdx +; SSE-NEXT: imulq %rbx, %rcx +; SSE-NEXT: addq %rdx, %rcx +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: andq %r14, %rax +; SSE-NEXT: movq %r9, %rdx +; SSE-NEXT: shrdq $60, %r8, %rdx +; SSE-NEXT: andq %r14, %rdx +; SSE-NEXT: addq %rax, %rdx +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: shrq $56, %r10 +; SSE-NEXT: addq %rdx, %r10 +; SSE-NEXT: movq %r10, %rax +; SSE-NEXT: mulq %r15 +; SSE-NEXT: shrq %rdx +; SSE-NEXT: leaq (,%rdx,8), %rax +; SSE-NEXT: subq %rax, %rdx +; SSE-NEXT: addq %r10, %rdx +; SSE-NEXT: subq %rdx, %r9 +; SSE-NEXT: sbbq $0, %r8 +; SSE-NEXT: imulq %r9, %r11 +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: mulq %rbx +; SSE-NEXT: addq %r11, %rdx +; SSE-NEXT: imulq %rbx, %r8 +; SSE-NEXT: addq %rdx, %r8 +; SSE-NEXT: movq %rax, 16(%rdi) +; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rcx, 8(%rdi) +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX-LABEL: v2i128_div_by_7: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rcx, %r9 +; AVX-NEXT: movq %rdx, %rcx +; AVX-NEXT: movabsq $1152921504606846975, %r14 # imm = 0xFFFFFFFFFFFFFFF +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: andq %r14, %rax +; AVX-NEXT: movq %rsi, %rdx +; AVX-NEXT: shrdq $60, %rcx, %rdx +; AVX-NEXT: andq %r14, %rdx +; AVX-NEXT: addq %rax, %rdx +; AVX-NEXT: movq %rcx, %r10 +; AVX-NEXT: shrq $56, %r10 +; AVX-NEXT: addq %rdx, %r10 +; AVX-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925 +; AVX-NEXT: movq %r10, %rax +; AVX-NEXT: mulq %r15 +; AVX-NEXT: shrq %rdx +; AVX-NEXT: leaq (,%rdx,8), %rax +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %r10, %rdx +; AVX-NEXT: subq %rdx, %rsi +; AVX-NEXT: sbbq $0, %rcx +; AVX-NEXT: movabsq $-5270498306774157605, %r11 # imm = 0xB6DB6DB6DB6DB6DB +; AVX-NEXT: movq %rsi, %r10 +; AVX-NEXT: imulq %r11, %r10 +; AVX-NEXT: movabsq $7905747460161236407, %rbx # imm = 
0x6DB6DB6DB6DB6DB7 +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: mulq %rbx +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: addq %r10, %rdx +; AVX-NEXT: imulq %rbx, %rcx +; AVX-NEXT: addq %rdx, %rcx +; AVX-NEXT: movq %r9, %rax +; AVX-NEXT: andq %r14, %rax +; AVX-NEXT: movq %r9, %rdx +; AVX-NEXT: shrdq $60, %r8, %rdx +; AVX-NEXT: andq %r14, %rdx +; AVX-NEXT: addq %rax, %rdx +; AVX-NEXT: movq %r8, %r10 +; AVX-NEXT: shrq $56, %r10 +; AVX-NEXT: addq %rdx, %r10 +; AVX-NEXT: movq %r10, %rax +; AVX-NEXT: mulq %r15 +; AVX-NEXT: shrq %rdx +; AVX-NEXT: leaq (,%rdx,8), %rax +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %r10, %rdx +; AVX-NEXT: subq %rdx, %r9 +; AVX-NEXT: sbbq $0, %r8 +; AVX-NEXT: imulq %r9, %r11 +; AVX-NEXT: movq %r9, %rax +; AVX-NEXT: mulq %rbx +; AVX-NEXT: addq %r11, %rdx +; AVX-NEXT: imulq %rbx, %r8 +; AVX-NEXT: addq %rdx, %r8 +; AVX-NEXT: movq %rax, 16(%rdi) +; AVX-NEXT: movq %rsi, (%rdi) +; AVX-NEXT: movq %r8, 24(%rdi) +; AVX-NEXT: movq %rcx, 8(%rdi) +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: retq +entry: + %div = udiv <2 x i128> %x, + ret <2 x i128> %div +} + +define <2 x i128> @v2i128_div_by_14(<2 x i128> %x) nounwind { +; SSE-LABEL: v2i128_div_by_14: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %r15 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: movq %rcx, %r9 +; SSE-NEXT: movq %rdx, %rcx +; SSE-NEXT: movq %rsi, %rax +; SSE-NEXT: shrdq $1, %rdx, %rax +; SSE-NEXT: movabsq $1152921504606846975, %r11 # imm = 0xFFFFFFFFFFFFFFF +; SSE-NEXT: shrq %rdx +; SSE-NEXT: shldq $4, %rax, %rdx +; SSE-NEXT: andq %r11, %rax +; SSE-NEXT: andq %r11, %rdx +; SSE-NEXT: movq %rcx, %r10 +; SSE-NEXT: shrq $57, %r10 +; SSE-NEXT: addq %rax, %r10 +; SSE-NEXT: addq %rdx, %r10 +; SSE-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925 +; SSE-NEXT: movq %r10, %rax +; SSE-NEXT: mulq %r15 +; SSE-NEXT: shrq %rdx +; SSE-NEXT: leaq (,%rdx,8), %rax +; SSE-NEXT: subq %rax, %rdx +; SSE-NEXT: addq %r10, 
%rdx +; SSE-NEXT: subq %rdx, %rsi +; SSE-NEXT: sbbq $0, %rcx +; SSE-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB +; SSE-NEXT: movq %rsi, %r10 +; SSE-NEXT: imulq %rbx, %r10 +; SSE-NEXT: movabsq $7905747460161236407, %r14 # imm = 0x6DB6DB6DB6DB6DB7 +; SSE-NEXT: movq %rsi, %rax +; SSE-NEXT: mulq %r14 +; SSE-NEXT: movq %rax, %rsi +; SSE-NEXT: addq %r10, %rdx +; SSE-NEXT: imulq %r14, %rcx +; SSE-NEXT: addq %rdx, %rcx +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: shrdq $1, %r8, %rax +; SSE-NEXT: movq %r8, %rdx +; SSE-NEXT: shrq %rdx +; SSE-NEXT: shldq $4, %rax, %rdx +; SSE-NEXT: andq %r11, %rax +; SSE-NEXT: andq %r11, %rdx +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: shrq $57, %r10 +; SSE-NEXT: addq %rax, %r10 +; SSE-NEXT: addq %rdx, %r10 +; SSE-NEXT: movq %r10, %rax +; SSE-NEXT: mulq %r15 +; SSE-NEXT: shrq %rdx +; SSE-NEXT: leaq (,%rdx,8), %rax +; SSE-NEXT: subq %rax, %rdx +; SSE-NEXT: addq %r10, %rdx +; SSE-NEXT: subq %rdx, %r9 +; SSE-NEXT: sbbq $0, %r8 +; SSE-NEXT: imulq %r9, %rbx +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: mulq %r14 +; SSE-NEXT: addq %rbx, %rdx +; SSE-NEXT: imulq %r14, %r8 +; SSE-NEXT: addq %rdx, %r8 +; SSE-NEXT: movq %rax, 16(%rdi) +; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movq %r8, 24(%rdi) +; SSE-NEXT: movq %rcx, 8(%rdi) +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: popq %rbx +; SSE-NEXT: popq %r14 +; SSE-NEXT: popq %r15 +; SSE-NEXT: retq +; +; AVX-LABEL: v2i128_div_by_14: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %r15 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: movq %rcx, %r9 +; AVX-NEXT: movq %rdx, %rcx +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: shrdq $1, %rdx, %rax +; AVX-NEXT: movabsq $1152921504606846975, %r11 # imm = 0xFFFFFFFFFFFFFFF +; AVX-NEXT: shrq %rdx +; AVX-NEXT: shldq $4, %rax, %rdx +; AVX-NEXT: andq %r11, %rax +; AVX-NEXT: andq %r11, %rdx +; AVX-NEXT: movq %rcx, %r10 +; AVX-NEXT: shrq $57, %r10 +; AVX-NEXT: addq %rax, %r10 +; AVX-NEXT: addq %rdx, %r10 +; AVX-NEXT: movabsq $5270498306774157605, %r15 # imm 
= 0x4924924924924925 +; AVX-NEXT: movq %r10, %rax +; AVX-NEXT: mulq %r15 +; AVX-NEXT: shrq %rdx +; AVX-NEXT: leaq (,%rdx,8), %rax +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %r10, %rdx +; AVX-NEXT: subq %rdx, %rsi +; AVX-NEXT: sbbq $0, %rcx +; AVX-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB +; AVX-NEXT: movq %rsi, %r10 +; AVX-NEXT: imulq %rbx, %r10 +; AVX-NEXT: movabsq $7905747460161236407, %r14 # imm = 0x6DB6DB6DB6DB6DB7 +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: mulq %r14 +; AVX-NEXT: movq %rax, %rsi +; AVX-NEXT: addq %r10, %rdx +; AVX-NEXT: imulq %r14, %rcx +; AVX-NEXT: addq %rdx, %rcx +; AVX-NEXT: movq %r9, %rax +; AVX-NEXT: shrdq $1, %r8, %rax +; AVX-NEXT: movq %r8, %rdx +; AVX-NEXT: shrq %rdx +; AVX-NEXT: shldq $4, %rax, %rdx +; AVX-NEXT: andq %r11, %rax +; AVX-NEXT: andq %r11, %rdx +; AVX-NEXT: movq %r8, %r10 +; AVX-NEXT: shrq $57, %r10 +; AVX-NEXT: addq %rax, %r10 +; AVX-NEXT: addq %rdx, %r10 +; AVX-NEXT: movq %r10, %rax +; AVX-NEXT: mulq %r15 +; AVX-NEXT: shrq %rdx +; AVX-NEXT: leaq (,%rdx,8), %rax +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %r10, %rdx +; AVX-NEXT: subq %rdx, %r9 +; AVX-NEXT: sbbq $0, %r8 +; AVX-NEXT: imulq %r9, %rbx +; AVX-NEXT: movq %r9, %rax +; AVX-NEXT: mulq %r14 +; AVX-NEXT: addq %rbx, %rdx +; AVX-NEXT: imulq %r14, %r8 +; AVX-NEXT: addq %rdx, %r8 +; AVX-NEXT: movq %rax, 16(%rdi) +; AVX-NEXT: movq %rsi, (%rdi) +; AVX-NEXT: movq %r8, 24(%rdi) +; AVX-NEXT: movq %rcx, 8(%rdi) +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: popq %rbx +; AVX-NEXT: popq %r14 +; AVX-NEXT: popq %r15 +; AVX-NEXT: retq +entry: + %div = udiv <2 x i128> %x, + ret <2 x i128> %div +} + +define <2 x i128> @v2i128_rem_by_7(<2 x i128> %x) nounwind { +; SSE-LABEL: v2i128_rem_by_7: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movq %rdx, %r9 +; SSE-NEXT: movabsq $1152921504606846975, %r10 # imm = 0xFFFFFFFFFFFFFFF +; SSE-NEXT: movq %rsi, %rax +; SSE-NEXT: andq %r10, %rax +; SSE-NEXT: shrdq $60, %rdx, %rsi +; SSE-NEXT: andq %r10, %rsi +; SSE-NEXT: addq 
%rax, %rsi +; SSE-NEXT: shrq $56, %r9 +; SSE-NEXT: addq %rsi, %r9 +; SSE-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925 +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: mulq %r11 +; SSE-NEXT: movq %rdx, %rsi +; SSE-NEXT: shrq %rsi +; SSE-NEXT: leaq (,%rsi,8), %rax +; SSE-NEXT: subq %rax, %rsi +; SSE-NEXT: addq %r9, %rsi +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: andq %r10, %rax +; SSE-NEXT: shrdq $60, %r8, %rcx +; SSE-NEXT: andq %r10, %rcx +; SSE-NEXT: addq %rax, %rcx +; SSE-NEXT: shrq $56, %r8 +; SSE-NEXT: addq %rcx, %r8 +; SSE-NEXT: movq %r8, %rax +; SSE-NEXT: mulq %r11 +; SSE-NEXT: shrq %rdx +; SSE-NEXT: leaq (,%rdx,8), %rax +; SSE-NEXT: subq %rax, %rdx +; SSE-NEXT: addq %r8, %rdx +; SSE-NEXT: movq %rdx, 16(%rdi) +; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movq $0, 24(%rdi) +; SSE-NEXT: movq $0, 8(%rdi) +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: retq +; +; AVX-LABEL: v2i128_rem_by_7: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movq %rdx, %r9 +; AVX-NEXT: movabsq $1152921504606846975, %r10 # imm = 0xFFFFFFFFFFFFFFF +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: andq %r10, %rax +; AVX-NEXT: shrdq $60, %rdx, %rsi +; AVX-NEXT: andq %r10, %rsi +; AVX-NEXT: addq %rax, %rsi +; AVX-NEXT: shrq $56, %r9 +; AVX-NEXT: addq %rsi, %r9 +; AVX-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925 +; AVX-NEXT: movq %r9, %rax +; AVX-NEXT: mulq %r11 +; AVX-NEXT: movq %rdx, %rsi +; AVX-NEXT: shrq %rsi +; AVX-NEXT: leaq (,%rsi,8), %rax +; AVX-NEXT: subq %rax, %rsi +; AVX-NEXT: addq %r9, %rsi +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: andq %r10, %rax +; AVX-NEXT: shrdq $60, %r8, %rcx +; AVX-NEXT: andq %r10, %rcx +; AVX-NEXT: addq %rax, %rcx +; AVX-NEXT: shrq $56, %r8 +; AVX-NEXT: addq %rcx, %r8 +; AVX-NEXT: movq %r8, %rax +; AVX-NEXT: mulq %r11 +; AVX-NEXT: shrq %rdx +; AVX-NEXT: leaq (,%rdx,8), %rax +; AVX-NEXT: subq %rax, %rdx +; AVX-NEXT: addq %r8, %rdx +; AVX-NEXT: movq %rdx, 16(%rdi) +; AVX-NEXT: movq %rsi, (%rdi) +; AVX-NEXT: movq $0, 24(%rdi) +; AVX-NEXT: movq 
$0, 8(%rdi) +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: retq +entry: + %rem = urem <2 x i128> %x, + ret <2 x i128> %rem +} + +define <2 x i128> @v2i128_rem_by_14(<2 x i128> %x) nounwind { +; SSE-LABEL: v2i128_rem_by_14: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movq %rdx, %r9 +; SSE-NEXT: movq %rsi, %rax +; SSE-NEXT: shrdq $1, %rdx, %rax +; SSE-NEXT: movabsq $1152921504606846975, %r10 # imm = 0xFFFFFFFFFFFFFFF +; SSE-NEXT: shrq %rdx +; SSE-NEXT: shldq $4, %rax, %rdx +; SSE-NEXT: andq %r10, %rax +; SSE-NEXT: andq %r10, %rdx +; SSE-NEXT: shrq $57, %r9 +; SSE-NEXT: addq %rax, %r9 +; SSE-NEXT: addq %rdx, %r9 +; SSE-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925 +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: mulq %r11 +; SSE-NEXT: shrq %rdx +; SSE-NEXT: leal (,%rdx,8), %eax +; SSE-NEXT: subl %eax, %edx +; SSE-NEXT: addl %edx, %r9d +; SSE-NEXT: andl $1, %esi +; SSE-NEXT: leaq (%rsi,%r9,2), %rsi +; SSE-NEXT: movq %rcx, %rax +; SSE-NEXT: shrdq $1, %r8, %rax +; SSE-NEXT: movq %r8, %rdx +; SSE-NEXT: shrq %rdx +; SSE-NEXT: shldq $4, %rax, %rdx +; SSE-NEXT: andq %r10, %rax +; SSE-NEXT: andq %r10, %rdx +; SSE-NEXT: shrq $57, %r8 +; SSE-NEXT: addq %rax, %r8 +; SSE-NEXT: addq %rdx, %r8 +; SSE-NEXT: movq %r8, %rax +; SSE-NEXT: mulq %r11 +; SSE-NEXT: shrq %rdx +; SSE-NEXT: leal (,%rdx,8), %eax +; SSE-NEXT: subl %eax, %edx +; SSE-NEXT: addl %edx, %r8d +; SSE-NEXT: andl $1, %ecx +; SSE-NEXT: leaq (%rcx,%r8,2), %rax +; SSE-NEXT: movq %rax, 16(%rdi) +; SSE-NEXT: movq %rsi, (%rdi) +; SSE-NEXT: movq $0, 24(%rdi) +; SSE-NEXT: movq $0, 8(%rdi) +; SSE-NEXT: movq %rdi, %rax +; SSE-NEXT: retq +; +; AVX-LABEL: v2i128_rem_by_14: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movq %rdx, %r9 +; AVX-NEXT: movq %rsi, %rax +; AVX-NEXT: shrdq $1, %rdx, %rax +; AVX-NEXT: movabsq $1152921504606846975, %r10 # imm = 0xFFFFFFFFFFFFFFF +; AVX-NEXT: shrq %rdx +; AVX-NEXT: shldq $4, %rax, %rdx +; AVX-NEXT: andq %r10, %rax +; AVX-NEXT: andq %r10, %rdx +; AVX-NEXT: shrq $57, %r9 +; AVX-NEXT: addq %rax, %r9 
+; AVX-NEXT: addq %rdx, %r9 +; AVX-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925 +; AVX-NEXT: movq %r9, %rax +; AVX-NEXT: mulq %r11 +; AVX-NEXT: shrq %rdx +; AVX-NEXT: leal (,%rdx,8), %eax +; AVX-NEXT: subl %eax, %edx +; AVX-NEXT: addl %edx, %r9d +; AVX-NEXT: andl $1, %esi +; AVX-NEXT: leaq (%rsi,%r9,2), %rsi +; AVX-NEXT: movq %rcx, %rax +; AVX-NEXT: shrdq $1, %r8, %rax +; AVX-NEXT: movq %r8, %rdx +; AVX-NEXT: shrq %rdx +; AVX-NEXT: shldq $4, %rax, %rdx +; AVX-NEXT: andq %r10, %rax +; AVX-NEXT: andq %r10, %rdx +; AVX-NEXT: shrq $57, %r8 +; AVX-NEXT: addq %rax, %r8 +; AVX-NEXT: addq %rdx, %r8 +; AVX-NEXT: movq %r8, %rax +; AVX-NEXT: mulq %r11 +; AVX-NEXT: shrq %rdx +; AVX-NEXT: leal (,%rdx,8), %eax +; AVX-NEXT: subl %eax, %edx +; AVX-NEXT: addl %edx, %r8d +; AVX-NEXT: andl $1, %ecx +; AVX-NEXT: leaq (%rcx,%r8,2), %rax +; AVX-NEXT: movq %rax, 16(%rdi) +; AVX-NEXT: movq %rsi, (%rdi) +; AVX-NEXT: movq $0, 24(%rdi) +; AVX-NEXT: movq $0, 8(%rdi) +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: retq +entry: + %rem = urem <2 x i128> %x, + ret <2 x i128> %rem +}