[LegalizeTypes] Expand UDIV/UREM by constant via chunk summation (#146238)

This patch improves the lowering of 128-bit unsigned division and
remainder by constants (UDIV/UREM) by avoiding a fallback to a libcall
(__udivti3/__umodti3) for specific divisors.

When a divisor D satisfies the condition (1 << ChunkWidth) % D == 1, the
128-bit value is split into fixed-width chunks (e.g., 30-bit) and summed
before applying a smaller UDIV/UREM. This transformation is based on the
"remainder by summing digits" trick described in Hacker's Delight.

This fixes #137514 for some constants.
This commit is contained in:
Shivam Gupta 2026-03-19 17:58:54 +05:30 committed by GitHub
parent 582fa78753
commit 796b218edd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 3892 additions and 219 deletions

View File

@ -8186,8 +8186,6 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
// If (1 << HBitWidth) % divisor == 1, we can add the two halves together and
// then add in the carry.
// TODO: If we can't split it in half, we might be able to split into 3 or
// more pieces using a smaller bit width.
if (HalfMaxPlus1.urem(Divisor).isOne()) {
assert(!LL == !LH && "Expected both input halves or no input halves!");
if (!LL)
@ -8239,6 +8237,67 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
DAG.getConstant(0, dl, HiLoVT));
Sum = DAG.getNode(ISD::ADD, dl, HiLoVT, Sum, Carry);
}
} else {
// If we cannot split in two halves, look for a smaller chunk width W
// such that (1 << W) % Divisor == 1.
unsigned BitWidth = VT.getScalarSizeInBits();
unsigned BestChunkWidth = 0;
// Determine the legal scalar integer type for chunk operations.
EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
unsigned LegalWidth = LegalVT.getScalarSizeInBits();
unsigned MaxChunk = std::min<unsigned>(LegalWidth, BitWidth);
// Search for I where 2^I % Divisor == 1
for (unsigned I = MaxChunk, E = MaxChunk / 2; I > E; --I) {
APInt Mod = APInt::getOneBitSet(Divisor.getBitWidth(), I).urem(Divisor);
if (Mod.isOne()) {
// Ensure (NumChunks * MaxChunkValue) doesn't overflow LegalVT
unsigned NumChunks = divideCeil(BitWidth, I);
// Ensure the sum won't overflow the hardware register (LegalWidth).
// Summing N chunks adds ceil(log2(N)) extra carry bits to the width.
// Safety check: Base Chunk Width (I) + Carry Bits <= Register Width.
if (I + llvm::bit_width(NumChunks - 1) <= LegalWidth) {
BestChunkWidth = I;
break;
}
}
}
if (!BestChunkWidth)
return false;
SDValue In =
LL ? DAG.getNode(ISD::BUILD_PAIR, dl, VT, LL, LH) : N->getOperand(0);
if (TrailingZeros) {
// Save the shifted off bits if we need the remainder.
if (Opcode != ISD::UDIV) {
APInt Mask = APInt::getLowBitsSet(BitWidth, TrailingZeros);
PartialRem =
DAG.getNode(ISD::AND, dl, VT, In, DAG.getConstant(Mask, dl, VT));
}
EVT ShiftVT = getShiftAmountTy(VT, DAG.getDataLayout());
In = DAG.getNode(ISD::SRL, dl, VT, In,
DAG.getShiftAmountConstant(TrailingZeros, ShiftVT, dl));
}
SDValue TotalSum = DAG.getConstant(0, dl, LegalVT);
SDValue Mask = DAG.getConstant(
APInt::getLowBitsSet(LegalWidth, BestChunkWidth), dl, LegalVT);
for (unsigned I = 0; I < BitWidth; I += BestChunkWidth) {
SDValue Shift = DAG.getShiftAmountConstant(I, VT, dl);
SDValue Chunk = DAG.getNode(ISD::SRL, dl, VT, In, Shift);
// Truncate to LegalVT
SDValue TruncChunk = DAG.getNode(ISD::TRUNCATE, dl, LegalVT, Chunk);
// For the last chunk, we might not need a mask if it's smaller than
// BestChunkWidth, but applying it is always safe.
SDValue MaskedChunk =
DAG.getNode(ISD::AND, dl, LegalVT, TruncChunk, Mask);
TotalSum = DAG.getNode(ISD::ADD, dl, LegalVT, TotalSum, MaskedChunk);
}
Sum = DAG.getNode(ISD::ZERO_EXTEND, dl, HiLoVT, TotalSum);
}
// If we didn't find a sum, we can't do the expansion.
@ -8278,7 +8337,9 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
if (TrailingZeros) {
RemL = DAG.getNode(ISD::SHL, dl, HiLoVT, RemL,
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL, PartialRem);
SDValue PartialRemLo = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, PartialRem);
RemL = DAG.getNode(ISD::ADD, dl, HiLoVT, RemL, PartialRemLo);
}
Result.push_back(RemL);
Result.push_back(DAG.getConstant(0, dl, HiLoVT));

View File

@ -500,13 +500,20 @@ entry:
define i128 @ui128_7(i128 %a, i128 %b) {
; CHECK-SD-LABEL: ui128_7:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: .cfi_offset w30, -16
; CHECK-SD-NEXT: mov w2, #7 // =0x7
; CHECK-SD-NEXT: mov x3, xzr
; CHECK-SD-NEXT: bl __umodti3
; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-SD-NEXT: extr x8, x1, x0, #60
; CHECK-SD-NEXT: and x9, x0, #0xfffffffffffffff
; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff
; CHECK-SD-NEXT: add x8, x9, x8
; CHECK-SD-NEXT: mov x9, #18725 // =0x4925
; CHECK-SD-NEXT: movk x9, #9362, lsl #16
; CHECK-SD-NEXT: add x8, x8, x1, lsr #56
; CHECK-SD-NEXT: mov x1, xzr
; CHECK-SD-NEXT: movk x9, #37449, lsl #32
; CHECK-SD-NEXT: movk x9, #18724, lsl #48
; CHECK-SD-NEXT: umulh x9, x8, x9
; CHECK-SD-NEXT: lsr x9, x9, #1
; CHECK-SD-NEXT: sub x9, x9, x9, lsl #3
; CHECK-SD-NEXT: add x0, x8, x9
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ui128_7:
@ -579,13 +586,23 @@ entry:
define i128 @ui128_100(i128 %a, i128 %b) {
; CHECK-SD-LABEL: ui128_100:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: .cfi_offset w30, -16
; CHECK-SD-NEXT: mov w2, #100 // =0x64
; CHECK-SD-NEXT: mov x3, xzr
; CHECK-SD-NEXT: bl __umodti3
; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-SD-NEXT: extr x8, x1, x0, #2
; CHECK-SD-NEXT: lsr x9, x1, #2
; CHECK-SD-NEXT: mov w10, #25 // =0x19
; CHECK-SD-NEXT: extr x9, x9, x8, #60
; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff
; CHECK-SD-NEXT: and x9, x9, #0xfffffffffffffff
; CHECK-SD-NEXT: add x8, x8, x9
; CHECK-SD-NEXT: mov x9, #62915 // =0xf5c3
; CHECK-SD-NEXT: movk x9, #23592, lsl #16
; CHECK-SD-NEXT: add x8, x8, x1, lsr #58
; CHECK-SD-NEXT: mov x1, xzr
; CHECK-SD-NEXT: movk x9, #49807, lsl #32
; CHECK-SD-NEXT: movk x9, #10485, lsl #48
; CHECK-SD-NEXT: umulh x9, x8, x9
; CHECK-SD-NEXT: lsr x9, x9, #2
; CHECK-SD-NEXT: msub x8, x9, x10, x8
; CHECK-SD-NEXT: bfi x0, x8, #2, #62
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: ui128_100:
@ -2556,7 +2573,8 @@ define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) {
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32
; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3
; CHECK-SD-NEXT: sub v2.2s, v0.2s, v1.2s
; CHECK-SD: add w8, w8, w9
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 def $q0
; CHECK-SD-NEXT: add w8, w8, w9
; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #1
; CHECK-SD-NEXT: add v1.2s, v2.2s, v1.2s
@ -3079,34 +3097,30 @@ entry:
define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-LABEL: uv2i128_7:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT: .cfi_def_cfa_offset 48
; CHECK-SD-NEXT: .cfi_offset w19, -8
; CHECK-SD-NEXT: .cfi_offset w20, -16
; CHECK-SD-NEXT: .cfi_offset w21, -24
; CHECK-SD-NEXT: .cfi_offset w22, -32
; CHECK-SD-NEXT: .cfi_offset w30, -48
; CHECK-SD-NEXT: mov x19, x3
; CHECK-SD-NEXT: mov x20, x2
; CHECK-SD-NEXT: mov w2, #7 // =0x7
; CHECK-SD-NEXT: extr x9, x1, x0, #60
; CHECK-SD-NEXT: extr x8, x3, x2, #60
; CHECK-SD-NEXT: and x10, x0, #0xfffffffffffffff
; CHECK-SD-NEXT: mov x11, #18725 // =0x4925
; CHECK-SD-NEXT: and x9, x9, #0xfffffffffffffff
; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff
; CHECK-SD-NEXT: movk x11, #9362, lsl #16
; CHECK-SD-NEXT: add x9, x10, x9
; CHECK-SD-NEXT: and x10, x2, #0xfffffffffffffff
; CHECK-SD-NEXT: movk x11, #37449, lsl #32
; CHECK-SD-NEXT: add x8, x10, x8
; CHECK-SD-NEXT: add x9, x9, x1, lsr #56
; CHECK-SD-NEXT: movk x11, #18724, lsl #48
; CHECK-SD-NEXT: add x8, x8, x3, lsr #56
; CHECK-SD-NEXT: mov x1, xzr
; CHECK-SD-NEXT: mov x3, xzr
; CHECK-SD-NEXT: bl __umodti3
; CHECK-SD-NEXT: mov x21, x0
; CHECK-SD-NEXT: mov x22, x1
; CHECK-SD-NEXT: mov x0, x20
; CHECK-SD-NEXT: mov x1, x19
; CHECK-SD-NEXT: mov w2, #7 // =0x7
; CHECK-SD-NEXT: mov x3, xzr
; CHECK-SD-NEXT: bl __umodti3
; CHECK-SD-NEXT: mov x2, x0
; CHECK-SD-NEXT: mov x3, x1
; CHECK-SD-NEXT: mov x0, x21
; CHECK-SD-NEXT: mov x1, x22
; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
; CHECK-SD-NEXT: umulh x10, x9, x11
; CHECK-SD-NEXT: umulh x11, x8, x11
; CHECK-SD-NEXT: lsr x10, x10, #1
; CHECK-SD-NEXT: lsr x11, x11, #1
; CHECK-SD-NEXT: sub x10, x10, x10, lsl #3
; CHECK-SD-NEXT: sub x11, x11, x11, lsl #3
; CHECK-SD-NEXT: add x0, x9, x10
; CHECK-SD-NEXT: add x2, x8, x11
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: uv2i128_7:
@ -3228,34 +3242,35 @@ entry:
define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
; CHECK-SD-LABEL: uv2i128_100:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
; CHECK-SD-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
; CHECK-SD-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
; CHECK-SD-NEXT: .cfi_def_cfa_offset 48
; CHECK-SD-NEXT: .cfi_offset w19, -8
; CHECK-SD-NEXT: .cfi_offset w20, -16
; CHECK-SD-NEXT: .cfi_offset w21, -24
; CHECK-SD-NEXT: .cfi_offset w22, -32
; CHECK-SD-NEXT: .cfi_offset w30, -48
; CHECK-SD-NEXT: mov x19, x3
; CHECK-SD-NEXT: mov x20, x2
; CHECK-SD-NEXT: mov w2, #100 // =0x64
; CHECK-SD-NEXT: lsr x8, x1, #2
; CHECK-SD-NEXT: extr x9, x1, x0, #2
; CHECK-SD-NEXT: extr x10, x3, x2, #2
; CHECK-SD-NEXT: lsr x11, x3, #2
; CHECK-SD-NEXT: mov w12, #25 // =0x19
; CHECK-SD-NEXT: extr x8, x8, x9, #60
; CHECK-SD-NEXT: and x9, x9, #0xfffffffffffffff
; CHECK-SD-NEXT: extr x11, x11, x10, #60
; CHECK-SD-NEXT: and x8, x8, #0xfffffffffffffff
; CHECK-SD-NEXT: add x8, x9, x8
; CHECK-SD-NEXT: and x9, x10, #0xfffffffffffffff
; CHECK-SD-NEXT: and x10, x11, #0xfffffffffffffff
; CHECK-SD-NEXT: mov x11, #62915 // =0xf5c3
; CHECK-SD-NEXT: add x9, x9, x10
; CHECK-SD-NEXT: add x8, x8, x1, lsr #58
; CHECK-SD-NEXT: movk x11, #23592, lsl #16
; CHECK-SD-NEXT: add x9, x9, x3, lsr #58
; CHECK-SD-NEXT: mov x1, xzr
; CHECK-SD-NEXT: movk x11, #49807, lsl #32
; CHECK-SD-NEXT: mov x3, xzr
; CHECK-SD-NEXT: bl __umodti3
; CHECK-SD-NEXT: mov x21, x0
; CHECK-SD-NEXT: mov x22, x1
; CHECK-SD-NEXT: mov x0, x20
; CHECK-SD-NEXT: mov x1, x19
; CHECK-SD-NEXT: mov w2, #100 // =0x64
; CHECK-SD-NEXT: mov x3, xzr
; CHECK-SD-NEXT: bl __umodti3
; CHECK-SD-NEXT: mov x2, x0
; CHECK-SD-NEXT: mov x3, x1
; CHECK-SD-NEXT: mov x0, x21
; CHECK-SD-NEXT: mov x1, x22
; CHECK-SD-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
; CHECK-SD-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
; CHECK-SD-NEXT: movk x11, #10485, lsl #48
; CHECK-SD-NEXT: umulh x10, x8, x11
; CHECK-SD-NEXT: umulh x11, x9, x11
; CHECK-SD-NEXT: lsr x10, x10, #2
; CHECK-SD-NEXT: lsr x11, x11, #2
; CHECK-SD-NEXT: msub x8, x10, x12, x8
; CHECK-SD-NEXT: msub x9, x11, x12, x9
; CHECK-SD-NEXT: bfi x0, x8, #2, #62
; CHECK-SD-NEXT: bfi x2, x9, #2, #62
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: uv2i128_100:

View File

@ -89,17 +89,19 @@ define i32 @dont_fold_urem_i32_umax(i32 %x) {
define i64 @dont_fold_urem_i64(i64 %x) {
; CHECK-LABEL: dont_fold_urem_i64:
; CHECK: # %bb.0:
; CHECK-NEXT: mflr 0
; CHECK-NEXT: stwu 1, -16(1)
; CHECK-NEXT: stw 0, 20(1)
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset lr, 4
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: li 6, 98
; CHECK-NEXT: bl __umoddi3
; CHECK-NEXT: lwz 0, 20(1)
; CHECK-NEXT: addi 1, 1, 16
; CHECK-NEXT: mtlr 0
; CHECK-NEXT: srwi 6, 4, 22
; CHECK-NEXT: rlwinm 7, 4, 31, 11, 31
; CHECK-NEXT: rlwimi 6, 3, 10, 11, 21
; CHECK-NEXT: lis 5, 1337
; CHECK-NEXT: add 6, 7, 6
; CHECK-NEXT: srwi 3, 3, 11
; CHECK-NEXT: ori 5, 5, 30762
; CHECK-NEXT: add 3, 6, 3
; CHECK-NEXT: mulhwu 5, 3, 5
; CHECK-NEXT: mulli 5, 5, 49
; CHECK-NEXT: sub 3, 3, 5
; CHECK-NEXT: rlwimi 4, 3, 1, 0, 30
; CHECK-NEXT: li 3, 0
; CHECK-NEXT: blr
%1 = urem i64 %x, 98
ret i64 %1

View File

@ -111,16 +111,78 @@ define i64 @udiv64_constant_no_add(i64 %a) nounwind {
}
define i64 @udiv64_constant_add(i64 %a) nounwind {
; RV32-LABEL: udiv64_constant_add:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: li a2, 7
; RV32-NEXT: li a3, 0
; RV32-NEXT: call __udivdi3
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: ret
; RV32IM-LABEL: udiv64_constant_add:
; RV32IM: # %bb.0:
; RV32IM-NEXT: lui a2, 262144
; RV32IM-NEXT: slli a3, a1, 2
; RV32IM-NEXT: srli a4, a0, 30
; RV32IM-NEXT: srli a5, a1, 28
; RV32IM-NEXT: lui a6, 149797
; RV32IM-NEXT: or a3, a4, a3
; RV32IM-NEXT: lui a4, 449390
; RV32IM-NEXT: addi a2, a2, -1
; RV32IM-NEXT: and a3, a3, a2
; RV32IM-NEXT: and a2, a0, a2
; RV32IM-NEXT: add a2, a2, a3
; RV32IM-NEXT: lui a3, 748983
; RV32IM-NEXT: addi a6, a6, -1755
; RV32IM-NEXT: addi a4, a4, -1171
; RV32IM-NEXT: addi a3, a3, -585
; RV32IM-NEXT: add a2, a2, a5
; RV32IM-NEXT: mulhu a5, a2, a6
; RV32IM-NEXT: sub a6, a2, a5
; RV32IM-NEXT: srli a6, a6, 1
; RV32IM-NEXT: add a5, a6, a5
; RV32IM-NEXT: srli a5, a5, 2
; RV32IM-NEXT: slli a6, a5, 3
; RV32IM-NEXT: sub a5, a5, a6
; RV32IM-NEXT: add a2, a2, a5
; RV32IM-NEXT: sub a5, a0, a2
; RV32IM-NEXT: sltu a0, a0, a2
; RV32IM-NEXT: mul a2, a5, a4
; RV32IM-NEXT: mulhu a4, a5, a3
; RV32IM-NEXT: sub a1, a1, a0
; RV32IM-NEXT: add a2, a4, a2
; RV32IM-NEXT: mul a1, a1, a3
; RV32IM-NEXT: add a1, a2, a1
; RV32IM-NEXT: mul a0, a5, a3
; RV32IM-NEXT: ret
;
; RV32IMZB-LABEL: udiv64_constant_add:
; RV32IMZB: # %bb.0:
; RV32IMZB-NEXT: srli a2, a0, 30
; RV32IMZB-NEXT: lui a3, 786432
; RV32IMZB-NEXT: slli a4, a0, 2
; RV32IMZB-NEXT: srli a5, a1, 28
; RV32IMZB-NEXT: lui a6, 149797
; RV32IMZB-NEXT: sh2add a2, a1, a2
; RV32IMZB-NEXT: andn a2, a2, a3
; RV32IMZB-NEXT: lui a3, 449390
; RV32IMZB-NEXT: srli a4, a4, 2
; RV32IMZB-NEXT: add a4, a4, a5
; RV32IMZB-NEXT: lui a5, 748983
; RV32IMZB-NEXT: addi a6, a6, -1755
; RV32IMZB-NEXT: addi a3, a3, -1171
; RV32IMZB-NEXT: addi a5, a5, -585
; RV32IMZB-NEXT: add a2, a4, a2
; RV32IMZB-NEXT: mulhu a4, a2, a6
; RV32IMZB-NEXT: sub a6, a2, a4
; RV32IMZB-NEXT: srli a6, a6, 1
; RV32IMZB-NEXT: add a4, a6, a4
; RV32IMZB-NEXT: srli a4, a4, 2
; RV32IMZB-NEXT: slli a6, a4, 3
; RV32IMZB-NEXT: sub a4, a4, a6
; RV32IMZB-NEXT: add a2, a2, a4
; RV32IMZB-NEXT: sub a4, a0, a2
; RV32IMZB-NEXT: sltu a0, a0, a2
; RV32IMZB-NEXT: mul a2, a4, a3
; RV32IMZB-NEXT: mulhu a3, a4, a5
; RV32IMZB-NEXT: sub a1, a1, a0
; RV32IMZB-NEXT: add a2, a3, a2
; RV32IMZB-NEXT: mul a1, a1, a5
; RV32IMZB-NEXT: add a1, a2, a1
; RV32IMZB-NEXT: mul a0, a4, a5
; RV32IMZB-NEXT: ret
;
; RV64-LABEL: udiv64_constant_add:
; RV64: # %bb.0:

View File

@ -117,24 +117,75 @@ define iXLen2 @test_udiv_5(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_7:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: li a2, 7
; RV32-NEXT: li a3, 0
; RV32-NEXT: call __udivdi3
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: lui a2, 262144
; RV32-NEXT: slli a3, a1, 2
; RV32-NEXT: srli a4, a0, 30
; RV32-NEXT: srli a5, a1, 28
; RV32-NEXT: lui a6, 149797
; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: lui a4, 449390
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a3, a3, a2
; RV32-NEXT: and a2, a0, a2
; RV32-NEXT: add a2, a2, a3
; RV32-NEXT: lui a3, 748983
; RV32-NEXT: addi a6, a6, -1755
; RV32-NEXT: addi a4, a4, -1171
; RV32-NEXT: addi a3, a3, -585
; RV32-NEXT: add a2, a2, a5
; RV32-NEXT: mulhu a5, a2, a6
; RV32-NEXT: sub a6, a2, a5
; RV32-NEXT: srli a6, a6, 1
; RV32-NEXT: add a5, a6, a5
; RV32-NEXT: srli a5, a5, 2
; RV32-NEXT: slli a6, a5, 3
; RV32-NEXT: sub a5, a5, a6
; RV32-NEXT: add a2, a2, a5
; RV32-NEXT: sub a5, a0, a2
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: mul a2, a5, a4
; RV32-NEXT: mulhu a4, a5, a3
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: add a2, a4, a2
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: mul a0, a5, a3
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_7:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: li a2, 7
; RV64-NEXT: li a3, 0
; RV64-NEXT: call __udivti3
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: li a2, -1
; RV64-NEXT: slli a3, a1, 4
; RV64-NEXT: srli a4, a0, 60
; RV64-NEXT: srli a5, a1, 56
; RV64-NEXT: lui a6, %hi(.LCPI2_0)
; RV64-NEXT: or a3, a4, a3
; RV64-NEXT: lui a4, 748983
; RV64-NEXT: srli a2, a2, 4
; RV64-NEXT: ld a6, %lo(.LCPI2_0)(a6)
; RV64-NEXT: addi a4, a4, -585
; RV64-NEXT: and a3, a3, a2
; RV64-NEXT: and a2, a0, a2
; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: slli a3, a4, 33
; RV64-NEXT: add a3, a4, a3
; RV64-NEXT: lui a4, %hi(.LCPI2_1)
; RV64-NEXT: ld a4, %lo(.LCPI2_1)(a4)
; RV64-NEXT: add a2, a2, a5
; RV64-NEXT: mulhu a5, a2, a6
; RV64-NEXT: srli a5, a5, 1
; RV64-NEXT: slli a6, a5, 3
; RV64-NEXT: sub a5, a5, a6
; RV64-NEXT: add a2, a2, a5
; RV64-NEXT: sub a5, a0, a2
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: mul a2, a5, a4
; RV64-NEXT: mulhu a4, a5, a3
; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: add a2, a4, a2
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: mul a0, a5, a3
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 7
ret iXLen2 %a
@ -143,24 +194,67 @@ define iXLen2 @test_udiv_7(iXLen2 %x) nounwind {
define iXLen2 @test_udiv_9(iXLen2 %x) nounwind {
; RV32-LABEL: test_udiv_9:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: li a2, 9
; RV32-NEXT: li a3, 0
; RV32-NEXT: call __udivdi3
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: lui a2, 262144
; RV32-NEXT: slli a3, a1, 2
; RV32-NEXT: srli a4, a0, 30
; RV32-NEXT: srli a5, a1, 28
; RV32-NEXT: lui a6, 233017
; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: lui a4, 582542
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: addi a6, a6, -455
; RV32-NEXT: addi a4, a4, 910
; RV32-NEXT: and a3, a3, a2
; RV32-NEXT: and a2, a0, a2
; RV32-NEXT: add a2, a2, a3
; RV32-NEXT: add a2, a2, a5
; RV32-NEXT: mulhu a3, a2, a6
; RV32-NEXT: srli a3, a3, 1
; RV32-NEXT: slli a5, a3, 3
; RV32-NEXT: add a3, a5, a3
; RV32-NEXT: sub a2, a2, a3
; RV32-NEXT: sub a3, a0, a2
; RV32-NEXT: sltu a0, a0, a2
; RV32-NEXT: mul a2, a3, a4
; RV32-NEXT: mulhu a4, a3, a6
; RV32-NEXT: sub a1, a1, a0
; RV32-NEXT: add a2, a4, a2
; RV32-NEXT: mul a1, a1, a6
; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: mul a0, a3, a6
; RV32-NEXT: ret
;
; RV64-LABEL: test_udiv_9:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: li a2, 9
; RV64-NEXT: li a3, 0
; RV64-NEXT: call __udivti3
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: li a2, -1
; RV64-NEXT: slli a3, a1, 4
; RV64-NEXT: srli a4, a0, 60
; RV64-NEXT: srli a5, a1, 56
; RV64-NEXT: lui a6, %hi(.LCPI3_0)
; RV64-NEXT: or a3, a4, a3
; RV64-NEXT: lui a4, %hi(.LCPI3_1)
; RV64-NEXT: srli a2, a2, 4
; RV64-NEXT: and a3, a3, a2
; RV64-NEXT: and a2, a0, a2
; RV64-NEXT: add a2, a2, a3
; RV64-NEXT: lui a3, %hi(.LCPI3_2)
; RV64-NEXT: ld a6, %lo(.LCPI3_0)(a6)
; RV64-NEXT: ld a4, %lo(.LCPI3_1)(a4)
; RV64-NEXT: ld a3, %lo(.LCPI3_2)(a3)
; RV64-NEXT: add a2, a2, a5
; RV64-NEXT: mulhu a5, a2, a6
; RV64-NEXT: slli a6, a5, 3
; RV64-NEXT: add a5, a6, a5
; RV64-NEXT: sub a2, a2, a5
; RV64-NEXT: sub a5, a0, a2
; RV64-NEXT: sltu a0, a0, a2
; RV64-NEXT: mul a2, a5, a4
; RV64-NEXT: mulhu a4, a5, a3
; RV64-NEXT: sub a1, a1, a0
; RV64-NEXT: add a2, a4, a2
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: mul a0, a5, a3
; RV64-NEXT: ret
%a = udiv iXLen2 %x, 9
ret iXLen2 %a

View File

@ -79,24 +79,49 @@ define iXLen2 @test_urem_5(iXLen2 %x) nounwind {
define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_7:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: li a2, 7
; RV32-NEXT: li a3, 0
; RV32-NEXT: call __umoddi3
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: lui a2, 262144
; RV32-NEXT: slli a3, a1, 2
; RV32-NEXT: srli a4, a0, 30
; RV32-NEXT: srli a1, a1, 28
; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: lui a4, 149797
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a0, a0, a2
; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: addi a1, a4, -1755
; RV32-NEXT: mulhu a1, a0, a1
; RV32-NEXT: sub a2, a0, a1
; RV32-NEXT: srli a2, a2, 1
; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: srli a1, a1, 2
; RV32-NEXT: slli a2, a1, 3
; RV32-NEXT: sub a1, a1, a2
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_7:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: li a2, 7
; RV64-NEXT: li a3, 0
; RV64-NEXT: call __umodti3
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: li a2, -1
; RV64-NEXT: slli a3, a1, 4
; RV64-NEXT: srli a4, a0, 60
; RV64-NEXT: or a3, a4, a3
; RV64-NEXT: lui a4, %hi(.LCPI2_0)
; RV64-NEXT: srli a2, a2, 4
; RV64-NEXT: ld a4, %lo(.LCPI2_0)(a4)
; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: and a2, a3, a2
; RV64-NEXT: add a0, a0, a2
; RV64-NEXT: srli a1, a1, 56
; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: mulhu a1, a0, a4
; RV64-NEXT: srli a1, a1, 1
; RV64-NEXT: slli a2, a1, 3
; RV64-NEXT: sub a1, a1, a2
; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
%a = urem iXLen2 %x, 7
ret iXLen2 %a
@ -105,24 +130,45 @@ define iXLen2 @test_urem_7(iXLen2 %x) nounwind {
define iXLen2 @test_urem_9(iXLen2 %x) nounwind {
; RV32-LABEL: test_urem_9:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT: li a2, 9
; RV32-NEXT: li a3, 0
; RV32-NEXT: call __umoddi3
; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: lui a2, 262144
; RV32-NEXT: slli a3, a1, 2
; RV32-NEXT: srli a4, a0, 30
; RV32-NEXT: srli a1, a1, 28
; RV32-NEXT: or a3, a4, a3
; RV32-NEXT: lui a4, 233017
; RV32-NEXT: addi a2, a2, -1
; RV32-NEXT: and a0, a0, a2
; RV32-NEXT: and a2, a3, a2
; RV32-NEXT: add a0, a0, a2
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: addi a1, a4, -455
; RV32-NEXT: mulhu a1, a0, a1
; RV32-NEXT: srli a1, a1, 1
; RV32-NEXT: slli a2, a1, 3
; RV32-NEXT: add a1, a2, a1
; RV32-NEXT: sub a0, a0, a1
; RV32-NEXT: li a1, 0
; RV32-NEXT: ret
;
; RV64-LABEL: test_urem_9:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT: li a2, 9
; RV64-NEXT: li a3, 0
; RV64-NEXT: call __umodti3
; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: li a2, -1
; RV64-NEXT: slli a3, a1, 4
; RV64-NEXT: srli a4, a0, 60
; RV64-NEXT: or a3, a4, a3
; RV64-NEXT: lui a4, %hi(.LCPI3_0)
; RV64-NEXT: srli a2, a2, 4
; RV64-NEXT: ld a4, %lo(.LCPI3_0)(a4)
; RV64-NEXT: and a0, a0, a2
; RV64-NEXT: and a2, a3, a2
; RV64-NEXT: add a0, a0, a2
; RV64-NEXT: srli a1, a1, 56
; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: mulhu a1, a0, a4
; RV64-NEXT: slli a2, a1, 3
; RV64-NEXT: add a1, a2, a1
; RV64-NEXT: sub a0, a0, a1
; RV64-NEXT: li a1, 0
; RV64-NEXT: ret
%a = urem iXLen2 %x, 9
ret iXLen2 %a

View File

@ -229,13 +229,28 @@ define i64 @dont_fold_urem_i64(i64 %x) nounwind {
;
; RV32IM-LABEL: dont_fold_urem_i64:
; RV32IM: # %bb.0:
; RV32IM-NEXT: addi sp, sp, -16
; RV32IM-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
; RV32IM-NEXT: li a2, 98
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
; RV32IM-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; RV32IM-NEXT: addi sp, sp, 16
; RV32IM-NEXT: slli a2, a1, 31
; RV32IM-NEXT: srli a3, a0, 1
; RV32IM-NEXT: andi a4, a1, 2046
; RV32IM-NEXT: srli a1, a1, 11
; RV32IM-NEXT: or a2, a3, a2
; RV32IM-NEXT: slli a4, a4, 10
; RV32IM-NEXT: srli a3, a2, 21
; RV32IM-NEXT: or a3, a3, a4
; RV32IM-NEXT: lui a4, 21400
; RV32IM-NEXT: slli a2, a2, 11
; RV32IM-NEXT: srli a2, a2, 11
; RV32IM-NEXT: add a2, a2, a3
; RV32IM-NEXT: li a3, 49
; RV32IM-NEXT: addi a4, a4, -2006
; RV32IM-NEXT: add a1, a2, a1
; RV32IM-NEXT: mulhu a2, a1, a4
; RV32IM-NEXT: mul a2, a2, a3
; RV32IM-NEXT: sub a1, a1, a2
; RV32IM-NEXT: slli a1, a1, 1
; RV32IM-NEXT: andi a0, a0, 1
; RV32IM-NEXT: or a0, a1, a0
; RV32IM-NEXT: li a1, 0
; RV32IM-NEXT: ret
;
; RV64I-LABEL: dont_fold_urem_i64:

View File

@ -862,51 +862,58 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: sw s5, 20(sp) # 4-byte Folded Spill
; RV32IM-NEXT: sw s6, 16(sp) # 4-byte Folded Spill
; RV32IM-NEXT: sw s7, 12(sp) # 4-byte Folded Spill
; RV32IM-NEXT: sw s8, 8(sp) # 4-byte Folded Spill
; RV32IM-NEXT: lw s1, 16(a1)
; RV32IM-NEXT: lw s2, 20(a1)
; RV32IM-NEXT: lw s3, 24(a1)
; RV32IM-NEXT: lw s4, 28(a1)
; RV32IM-NEXT: lw a3, 0(a1)
; RV32IM-NEXT: lw a4, 4(a1)
; RV32IM-NEXT: lw s5, 8(a1)
; RV32IM-NEXT: lw s6, 12(a1)
; RV32IM-NEXT: mv s0, a0
; RV32IM-NEXT: lw a2, 16(a1)
; RV32IM-NEXT: lw a4, 20(a1)
; RV32IM-NEXT: lw s1, 24(a1)
; RV32IM-NEXT: lw s2, 28(a1)
; RV32IM-NEXT: lw a0, 0(a1)
; RV32IM-NEXT: lw a3, 4(a1)
; RV32IM-NEXT: lw s3, 8(a1)
; RV32IM-NEXT: lw s4, 12(a1)
; RV32IM-NEXT: lui a1, 1024
; RV32IM-NEXT: slli a5, a4, 10
; RV32IM-NEXT: srli a6, a2, 22
; RV32IM-NEXT: or a5, a6, a5
; RV32IM-NEXT: lui a6, 45590
; RV32IM-NEXT: addi a1, a1, -1
; RV32IM-NEXT: addi a6, a6, 1069
; RV32IM-NEXT: and a2, a2, a1
; RV32IM-NEXT: srli a4, a4, 12
; RV32IM-NEXT: add a2, a2, a4
; RV32IM-NEXT: and a1, a5, a1
; RV32IM-NEXT: add a1, a2, a1
; RV32IM-NEXT: mulhu a2, a1, a6
; RV32IM-NEXT: li a4, 23
; RV32IM-NEXT: mul a2, a2, a4
; RV32IM-NEXT: sub s7, a1, a2
; RV32IM-NEXT: li a2, 1
; RV32IM-NEXT: mv a0, a3
; RV32IM-NEXT: mv a1, a4
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
; RV32IM-NEXT: mv s7, a0
; RV32IM-NEXT: mv s8, a1
; RV32IM-NEXT: li a2, 654
; RV32IM-NEXT: mv a0, s5
; RV32IM-NEXT: mv a1, s6
; RV32IM-NEXT: mv a1, a3
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
; RV32IM-NEXT: mv s5, a0
; RV32IM-NEXT: mv s6, a1
; RV32IM-NEXT: li a2, 23
; RV32IM-NEXT: mv a0, s1
; RV32IM-NEXT: mv a1, s2
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
; RV32IM-NEXT: mv s1, a0
; RV32IM-NEXT: mv s2, a1
; RV32IM-NEXT: lui a2, 1
; RV32IM-NEXT: addi a2, a2, 1327
; RV32IM-NEXT: li a2, 654
; RV32IM-NEXT: mv a0, s3
; RV32IM-NEXT: mv a1, s4
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
; RV32IM-NEXT: sw s1, 16(s0)
; RV32IM-NEXT: sw s2, 20(s0)
; RV32IM-NEXT: mv s3, a0
; RV32IM-NEXT: mv s4, a1
; RV32IM-NEXT: lui a2, 1
; RV32IM-NEXT: addi a2, a2, 1327
; RV32IM-NEXT: mv a0, s1
; RV32IM-NEXT: mv a1, s2
; RV32IM-NEXT: li a3, 0
; RV32IM-NEXT: call __umoddi3
; RV32IM-NEXT: sw s7, 16(s0)
; RV32IM-NEXT: sw zero, 20(s0)
; RV32IM-NEXT: sw a0, 24(s0)
; RV32IM-NEXT: sw a1, 28(s0)
; RV32IM-NEXT: sw s7, 0(s0)
; RV32IM-NEXT: sw s8, 4(s0)
; RV32IM-NEXT: sw s5, 8(s0)
; RV32IM-NEXT: sw s6, 12(s0)
; RV32IM-NEXT: sw s5, 0(s0)
; RV32IM-NEXT: sw s6, 4(s0)
; RV32IM-NEXT: sw s3, 8(s0)
; RV32IM-NEXT: sw s4, 12(s0)
; RV32IM-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
@ -916,7 +923,6 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) nounwind {
; RV32IM-NEXT: lw s5, 20(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s6, 16(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s7, 12(sp) # 4-byte Folded Reload
; RV32IM-NEXT: lw s8, 8(sp) # 4-byte Folded Reload
; RV32IM-NEXT: addi sp, sp, 48
; RV32IM-NEXT: ret
;

View File

@ -294,19 +294,48 @@ entry:
define i64 @PR23590(i64 %x) nounwind {
; X86-LABEL: PR23590:
; X86: # %bb.0: # %entry
; X86-NEXT: subl $12, %esp
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %eax
; X86-NEXT: pushl $0
; X86-NEXT: pushl $12345 # imm = 0x3039
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: pushl {{[0-9]+}}(%esp)
; X86-NEXT: calll __umoddi3
; X86-NEXT: addl $16, %esp
; X86-NEXT: pushl $0
; X86-NEXT: pushl $7
; X86-NEXT: pushl %edx
; X86-NEXT: pushl %eax
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $28, %esp
; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: andl $1073741823, %eax # imm = 0x3FFFFFFF
; X86-NEXT: movl %esi, %edx
; X86-NEXT: shrdl $30, %ecx, %edx
; X86-NEXT: andl $1073741823, %edx # imm = 0x3FFFFFFF
; X86-NEXT: movl %ecx, %edi
; X86-NEXT: shrl $28, %edi
; X86-NEXT: addl %eax, %edi
; X86-NEXT: addl %edx, %edi
; X86-NEXT: movl $613566757, %edx # imm = 0x24924925
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %edx
; X86-NEXT: movl %edi, %eax
; X86-NEXT: subl %edx, %eax
; X86-NEXT: shrl %eax
; X86-NEXT: addl %edx, %eax
; X86-NEXT: shrl $2, %eax
; X86-NEXT: leal (,%eax,8), %edx
; X86-NEXT: subl %edx, %eax
; X86-NEXT: addl %edi, %eax
; X86-NEXT: subl %eax, %esi
; X86-NEXT: sbbl $0, %ecx
; X86-NEXT: movl $-1227133513, %edx # imm = 0xB6DB6DB7
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edx
; X86-NEXT: imull $1840700269, %esi, %esi # imm = 0x6DB6DB6D
; X86-NEXT: addl %esi, %edx
; X86-NEXT: imull $-1227133513, %ecx, %ecx # imm = 0xB6DB6DB7
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: addl $4, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-FAST-LABEL: PR23590:

View File

@ -67,25 +67,42 @@ define i64 @div128(i128 %x) nounwind {
define i64 @umod128(i128 %x) nounwind {
; X86-64-LABEL: umod128:
; X86-64: # %bb.0:
; X86-64-NEXT: pushq %rax
; X86-64-NEXT: movl $11, %edx
; X86-64-NEXT: xorl %ecx, %ecx
; X86-64-NEXT: callq __umodti3@PLT
; X86-64-NEXT: popq %rcx
; X86-64-NEXT: movabsq $1152921504606846975, %rcx # imm = 0xFFFFFFFFFFFFFFF
; X86-64-NEXT: movq %rdi, %rax
; X86-64-NEXT: andq %rcx, %rax
; X86-64-NEXT: shrdq $60, %rsi, %rdi
; X86-64-NEXT: andq %rdi, %rcx
; X86-64-NEXT: addq %rax, %rcx
; X86-64-NEXT: shrq $56, %rsi
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: movabsq $3353953467947191203, %rdx # imm = 0x2E8BA2E8BA2E8BA3
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: mulq %rdx
; X86-64-NEXT: shrq %rdx
; X86-64-NEXT: leaq (%rdx,%rdx,4), %rax
; X86-64-NEXT: leaq (%rdx,%rax,2), %rax
; X86-64-NEXT: subq %rax, %rcx
; X86-64-NEXT: movq %rcx, %rax
; X86-64-NEXT: retq
;
; WIN64-LABEL: umod128:
; WIN64: # %bb.0:
; WIN64-NEXT: subq $72, %rsp
; WIN64-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
; WIN64-NEXT: movq %rcx, {{[0-9]+}}(%rsp)
; WIN64-NEXT: movq $11, {{[0-9]+}}(%rsp)
; WIN64-NEXT: movq $0, {{[0-9]+}}(%rsp)
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
; WIN64-NEXT: leaq {{[0-9]+}}(%rsp), %rdx
; WIN64-NEXT: callq __umodti3
; WIN64-NEXT: movq %xmm0, %rax
; WIN64-NEXT: addq $72, %rsp
; WIN64-NEXT: movabsq $1152921504606846975, %r8 # imm = 0xFFFFFFFFFFFFFFF
; WIN64-NEXT: movq %rcx, %rax
; WIN64-NEXT: andq %r8, %rax
; WIN64-NEXT: shrdq $60, %rdx, %rcx
; WIN64-NEXT: andq %rcx, %r8
; WIN64-NEXT: addq %rax, %r8
; WIN64-NEXT: shrq $56, %rdx
; WIN64-NEXT: addq %rdx, %r8
; WIN64-NEXT: movabsq $3353953467947191203, %rcx # imm = 0x2E8BA2E8BA2E8BA3
; WIN64-NEXT: movq %r8, %rax
; WIN64-NEXT: mulq %rcx
; WIN64-NEXT: shrq %rdx
; WIN64-NEXT: leaq (%rdx,%rdx,4), %rax
; WIN64-NEXT: leaq (%rdx,%rax,2), %rax
; WIN64-NEXT: subq %rax, %r8
; WIN64-NEXT: movq %r8, %rax
; WIN64-NEXT: retq

File diff suppressed because it is too large Load Diff

View File

@ -940,3 +940,477 @@ define <16 x i8> @test_remconstant_16i8(<16 x i8> %a) nounwind {
%res = urem <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
ret <16 x i8> %res
}
; udiv of <2 x i128> by a splat of 7. The expected output contains no call
; to __udivti3; each 128-bit lane is divided inline. Per lane, the checks
; show the value being cut into 60-bit pieces (mask 0xFFFFFFFFFFFFFFF, a
; 60-bit shrdq, and the high word shifted right by 56) which are added
; together. Since 2^60 mod 7 == 1, that sum is congruent to the input
; modulo 7. The sum is reduced modulo 7 with the 0x4924924924924925
; multiply, the remainder is subtracted from the 128-bit input (subq/sbbq),
; and the difference is multiplied by the 128-bit constant whose low word is
; 0x6DB6DB6DB6DB6DB7 and high word is 0xB6DB6DB6DB6DB6DB (the inverse of 7
; modulo 2^128) to form the quotient.
define <2 x i128> @v2i128_div_by_7(<2 x i128> %x) nounwind {
; SSE-LABEL: v2i128_div_by_7:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: movq %rcx, %r9
; SSE-NEXT: movq %rdx, %rcx
; SSE-NEXT: movabsq $1152921504606846975, %r14 # imm = 0xFFFFFFFFFFFFFFF
; SSE-NEXT: movq %rsi, %rax
; SSE-NEXT: andq %r14, %rax
; SSE-NEXT: movq %rsi, %rdx
; SSE-NEXT: shrdq $60, %rcx, %rdx
; SSE-NEXT: andq %r14, %rdx
; SSE-NEXT: addq %rax, %rdx
; SSE-NEXT: movq %rcx, %r10
; SSE-NEXT: shrq $56, %r10
; SSE-NEXT: addq %rdx, %r10
; SSE-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
; SSE-NEXT: movq %r10, %rax
; SSE-NEXT: mulq %r15
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leaq (,%rdx,8), %rax
; SSE-NEXT: subq %rax, %rdx
; SSE-NEXT: addq %r10, %rdx
; SSE-NEXT: subq %rdx, %rsi
; SSE-NEXT: sbbq $0, %rcx
; SSE-NEXT: movabsq $-5270498306774157605, %r11 # imm = 0xB6DB6DB6DB6DB6DB
; SSE-NEXT: movq %rsi, %r10
; SSE-NEXT: imulq %r11, %r10
; SSE-NEXT: movabsq $7905747460161236407, %rbx # imm = 0x6DB6DB6DB6DB6DB7
; SSE-NEXT: movq %rsi, %rax
; SSE-NEXT: mulq %rbx
; SSE-NEXT: movq %rax, %rsi
; SSE-NEXT: addq %r10, %rdx
; SSE-NEXT: imulq %rbx, %rcx
; SSE-NEXT: addq %rdx, %rcx
; SSE-NEXT: movq %r9, %rax
; SSE-NEXT: andq %r14, %rax
; SSE-NEXT: movq %r9, %rdx
; SSE-NEXT: shrdq $60, %r8, %rdx
; SSE-NEXT: andq %r14, %rdx
; SSE-NEXT: addq %rax, %rdx
; SSE-NEXT: movq %r8, %r10
; SSE-NEXT: shrq $56, %r10
; SSE-NEXT: addq %rdx, %r10
; SSE-NEXT: movq %r10, %rax
; SSE-NEXT: mulq %r15
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leaq (,%rdx,8), %rax
; SSE-NEXT: subq %rax, %rdx
; SSE-NEXT: addq %r10, %rdx
; SSE-NEXT: subq %rdx, %r9
; SSE-NEXT: sbbq $0, %r8
; SSE-NEXT: imulq %r9, %r11
; SSE-NEXT: movq %r9, %rax
; SSE-NEXT: mulq %rbx
; SSE-NEXT: addq %r11, %rdx
; SSE-NEXT: imulq %rbx, %r8
; SSE-NEXT: addq %rdx, %r8
; SSE-NEXT: movq %rax, 16(%rdi)
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: movq %r8, 24(%rdi)
; SSE-NEXT: movq %rcx, 8(%rdi)
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %r15
; SSE-NEXT: retq
;
; AVX-LABEL: v2i128_div_by_7:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %r15
; AVX-NEXT: pushq %r14
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %rcx, %r9
; AVX-NEXT: movq %rdx, %rcx
; AVX-NEXT: movabsq $1152921504606846975, %r14 # imm = 0xFFFFFFFFFFFFFFF
; AVX-NEXT: movq %rsi, %rax
; AVX-NEXT: andq %r14, %rax
; AVX-NEXT: movq %rsi, %rdx
; AVX-NEXT: shrdq $60, %rcx, %rdx
; AVX-NEXT: andq %r14, %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: movq %rcx, %r10
; AVX-NEXT: shrq $56, %r10
; AVX-NEXT: addq %rdx, %r10
; AVX-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r15
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %r10, %rdx
; AVX-NEXT: subq %rdx, %rsi
; AVX-NEXT: sbbq $0, %rcx
; AVX-NEXT: movabsq $-5270498306774157605, %r11 # imm = 0xB6DB6DB6DB6DB6DB
; AVX-NEXT: movq %rsi, %r10
; AVX-NEXT: imulq %r11, %r10
; AVX-NEXT: movabsq $7905747460161236407, %rbx # imm = 0x6DB6DB6DB6DB6DB7
; AVX-NEXT: movq %rsi, %rax
; AVX-NEXT: mulq %rbx
; AVX-NEXT: movq %rax, %rsi
; AVX-NEXT: addq %r10, %rdx
; AVX-NEXT: imulq %rbx, %rcx
; AVX-NEXT: addq %rdx, %rcx
; AVX-NEXT: movq %r9, %rax
; AVX-NEXT: andq %r14, %rax
; AVX-NEXT: movq %r9, %rdx
; AVX-NEXT: shrdq $60, %r8, %rdx
; AVX-NEXT: andq %r14, %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: movq %r8, %r10
; AVX-NEXT: shrq $56, %r10
; AVX-NEXT: addq %rdx, %r10
; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r15
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %r10, %rdx
; AVX-NEXT: subq %rdx, %r9
; AVX-NEXT: sbbq $0, %r8
; AVX-NEXT: imulq %r9, %r11
; AVX-NEXT: movq %r9, %rax
; AVX-NEXT: mulq %rbx
; AVX-NEXT: addq %r11, %rdx
; AVX-NEXT: imulq %rbx, %r8
; AVX-NEXT: addq %rdx, %r8
; AVX-NEXT: movq %rax, 16(%rdi)
; AVX-NEXT: movq %rsi, (%rdi)
; AVX-NEXT: movq %r8, 24(%rdi)
; AVX-NEXT: movq %rcx, 8(%rdi)
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r14
; AVX-NEXT: popq %r15
; AVX-NEXT: retq
entry:
%div = udiv <2 x i128> %x, <i128 7, i128 7>
ret <2 x i128> %div
}
; udiv of <2 x i128> by a splat of 14 (2 * 7). The expected output contains
; no call to __udivti3. Per lane, the input is shifted right by one
; (shrdq $1 / shrq) and the shifted value is cut into 60-bit pieces (mask
; 0xFFFFFFFFFFFFFFF, shldq $4, and the original high word shifted right by
; 57) which are summed; the sum is reduced modulo 7 with the
; 0x4924924924924925 multiply.
; NOTE(review) - the quotient step then subtracts that remainder from the
; unshifted input registers (subq/sbbq on %rsi,%rcx and on %r9,%r8, none of
; which are written by the shift) before the multiply by the inverse of 7.
; Working x = 14 through the sequence by hand appears to give 2 rather than
; 1, so these checks may be pinning a wrong lowering - confirm against
; expandDIVREMByConstant and regenerate the checks if so.
define <2 x i128> @v2i128_div_by_14(<2 x i128> %x) nounwind {
; SSE-LABEL: v2i128_div_by_14:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
; SSE-NEXT: movq %rcx, %r9
; SSE-NEXT: movq %rdx, %rcx
; SSE-NEXT: movq %rsi, %rax
; SSE-NEXT: shrdq $1, %rdx, %rax
; SSE-NEXT: movabsq $1152921504606846975, %r11 # imm = 0xFFFFFFFFFFFFFFF
; SSE-NEXT: shrq %rdx
; SSE-NEXT: shldq $4, %rax, %rdx
; SSE-NEXT: andq %r11, %rax
; SSE-NEXT: andq %r11, %rdx
; SSE-NEXT: movq %rcx, %r10
; SSE-NEXT: shrq $57, %r10
; SSE-NEXT: addq %rax, %r10
; SSE-NEXT: addq %rdx, %r10
; SSE-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
; SSE-NEXT: movq %r10, %rax
; SSE-NEXT: mulq %r15
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leaq (,%rdx,8), %rax
; SSE-NEXT: subq %rax, %rdx
; SSE-NEXT: addq %r10, %rdx
; SSE-NEXT: subq %rdx, %rsi
; SSE-NEXT: sbbq $0, %rcx
; SSE-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB
; SSE-NEXT: movq %rsi, %r10
; SSE-NEXT: imulq %rbx, %r10
; SSE-NEXT: movabsq $7905747460161236407, %r14 # imm = 0x6DB6DB6DB6DB6DB7
; SSE-NEXT: movq %rsi, %rax
; SSE-NEXT: mulq %r14
; SSE-NEXT: movq %rax, %rsi
; SSE-NEXT: addq %r10, %rdx
; SSE-NEXT: imulq %r14, %rcx
; SSE-NEXT: addq %rdx, %rcx
; SSE-NEXT: movq %r9, %rax
; SSE-NEXT: shrdq $1, %r8, %rax
; SSE-NEXT: movq %r8, %rdx
; SSE-NEXT: shrq %rdx
; SSE-NEXT: shldq $4, %rax, %rdx
; SSE-NEXT: andq %r11, %rax
; SSE-NEXT: andq %r11, %rdx
; SSE-NEXT: movq %r8, %r10
; SSE-NEXT: shrq $57, %r10
; SSE-NEXT: addq %rax, %r10
; SSE-NEXT: addq %rdx, %r10
; SSE-NEXT: movq %r10, %rax
; SSE-NEXT: mulq %r15
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leaq (,%rdx,8), %rax
; SSE-NEXT: subq %rax, %rdx
; SSE-NEXT: addq %r10, %rdx
; SSE-NEXT: subq %rdx, %r9
; SSE-NEXT: sbbq $0, %r8
; SSE-NEXT: imulq %r9, %rbx
; SSE-NEXT: movq %r9, %rax
; SSE-NEXT: mulq %r14
; SSE-NEXT: addq %rbx, %rdx
; SSE-NEXT: imulq %r14, %r8
; SSE-NEXT: addq %rdx, %r8
; SSE-NEXT: movq %rax, 16(%rdi)
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: movq %r8, 24(%rdi)
; SSE-NEXT: movq %rcx, 8(%rdi)
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r14
; SSE-NEXT: popq %r15
; SSE-NEXT: retq
;
; AVX-LABEL: v2i128_div_by_14:
; AVX: # %bb.0: # %entry
; AVX-NEXT: pushq %r15
; AVX-NEXT: pushq %r14
; AVX-NEXT: pushq %rbx
; AVX-NEXT: movq %rcx, %r9
; AVX-NEXT: movq %rdx, %rcx
; AVX-NEXT: movq %rsi, %rax
; AVX-NEXT: shrdq $1, %rdx, %rax
; AVX-NEXT: movabsq $1152921504606846975, %r11 # imm = 0xFFFFFFFFFFFFFFF
; AVX-NEXT: shrq %rdx
; AVX-NEXT: shldq $4, %rax, %rdx
; AVX-NEXT: andq %r11, %rax
; AVX-NEXT: andq %r11, %rdx
; AVX-NEXT: movq %rcx, %r10
; AVX-NEXT: shrq $57, %r10
; AVX-NEXT: addq %rax, %r10
; AVX-NEXT: addq %rdx, %r10
; AVX-NEXT: movabsq $5270498306774157605, %r15 # imm = 0x4924924924924925
; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r15
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %r10, %rdx
; AVX-NEXT: subq %rdx, %rsi
; AVX-NEXT: sbbq $0, %rcx
; AVX-NEXT: movabsq $-5270498306774157605, %rbx # imm = 0xB6DB6DB6DB6DB6DB
; AVX-NEXT: movq %rsi, %r10
; AVX-NEXT: imulq %rbx, %r10
; AVX-NEXT: movabsq $7905747460161236407, %r14 # imm = 0x6DB6DB6DB6DB6DB7
; AVX-NEXT: movq %rsi, %rax
; AVX-NEXT: mulq %r14
; AVX-NEXT: movq %rax, %rsi
; AVX-NEXT: addq %r10, %rdx
; AVX-NEXT: imulq %r14, %rcx
; AVX-NEXT: addq %rdx, %rcx
; AVX-NEXT: movq %r9, %rax
; AVX-NEXT: shrdq $1, %r8, %rax
; AVX-NEXT: movq %r8, %rdx
; AVX-NEXT: shrq %rdx
; AVX-NEXT: shldq $4, %rax, %rdx
; AVX-NEXT: andq %r11, %rax
; AVX-NEXT: andq %r11, %rdx
; AVX-NEXT: movq %r8, %r10
; AVX-NEXT: shrq $57, %r10
; AVX-NEXT: addq %rax, %r10
; AVX-NEXT: addq %rdx, %r10
; AVX-NEXT: movq %r10, %rax
; AVX-NEXT: mulq %r15
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %r10, %rdx
; AVX-NEXT: subq %rdx, %r9
; AVX-NEXT: sbbq $0, %r8
; AVX-NEXT: imulq %r9, %rbx
; AVX-NEXT: movq %r9, %rax
; AVX-NEXT: mulq %r14
; AVX-NEXT: addq %rbx, %rdx
; AVX-NEXT: imulq %r14, %r8
; AVX-NEXT: addq %rdx, %r8
; AVX-NEXT: movq %rax, 16(%rdi)
; AVX-NEXT: movq %rsi, (%rdi)
; AVX-NEXT: movq %r8, 24(%rdi)
; AVX-NEXT: movq %rcx, 8(%rdi)
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: popq %rbx
; AVX-NEXT: popq %r14
; AVX-NEXT: popq %r15
; AVX-NEXT: retq
entry:
%div = udiv <2 x i128> %x, <i128 14, i128 14>
ret <2 x i128> %div
}
; urem of <2 x i128> by a splat of 7. The expected output contains no call
; to __umodti3. Per lane, the checks show the value being cut into 60-bit
; pieces (mask 0xFFFFFFFFFFFFFFF, a 60-bit shrdq, and the high word shifted
; right by 56) which are added together; since 2^60 mod 7 == 1 the sum is
; congruent to the input modulo 7. The sum is then reduced modulo 7 in a
; single 64-bit register using the 0x4924924924924925 multiply, and the
; upper 64 bits of each result lane are stored as zero.
define <2 x i128> @v2i128_rem_by_7(<2 x i128> %x) nounwind {
; SSE-LABEL: v2i128_rem_by_7:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movq %rdx, %r9
; SSE-NEXT: movabsq $1152921504606846975, %r10 # imm = 0xFFFFFFFFFFFFFFF
; SSE-NEXT: movq %rsi, %rax
; SSE-NEXT: andq %r10, %rax
; SSE-NEXT: shrdq $60, %rdx, %rsi
; SSE-NEXT: andq %r10, %rsi
; SSE-NEXT: addq %rax, %rsi
; SSE-NEXT: shrq $56, %r9
; SSE-NEXT: addq %rsi, %r9
; SSE-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925
; SSE-NEXT: movq %r9, %rax
; SSE-NEXT: mulq %r11
; SSE-NEXT: movq %rdx, %rsi
; SSE-NEXT: shrq %rsi
; SSE-NEXT: leaq (,%rsi,8), %rax
; SSE-NEXT: subq %rax, %rsi
; SSE-NEXT: addq %r9, %rsi
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: andq %r10, %rax
; SSE-NEXT: shrdq $60, %r8, %rcx
; SSE-NEXT: andq %r10, %rcx
; SSE-NEXT: addq %rax, %rcx
; SSE-NEXT: shrq $56, %r8
; SSE-NEXT: addq %rcx, %r8
; SSE-NEXT: movq %r8, %rax
; SSE-NEXT: mulq %r11
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leaq (,%rdx,8), %rax
; SSE-NEXT: subq %rax, %rdx
; SSE-NEXT: addq %r8, %rdx
; SSE-NEXT: movq %rdx, 16(%rdi)
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: movq $0, 24(%rdi)
; SSE-NEXT: movq $0, 8(%rdi)
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: v2i128_rem_by_7:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movq %rdx, %r9
; AVX-NEXT: movabsq $1152921504606846975, %r10 # imm = 0xFFFFFFFFFFFFFFF
; AVX-NEXT: movq %rsi, %rax
; AVX-NEXT: andq %r10, %rax
; AVX-NEXT: shrdq $60, %rdx, %rsi
; AVX-NEXT: andq %r10, %rsi
; AVX-NEXT: addq %rax, %rsi
; AVX-NEXT: shrq $56, %r9
; AVX-NEXT: addq %rsi, %r9
; AVX-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925
; AVX-NEXT: movq %r9, %rax
; AVX-NEXT: mulq %r11
; AVX-NEXT: movq %rdx, %rsi
; AVX-NEXT: shrq %rsi
; AVX-NEXT: leaq (,%rsi,8), %rax
; AVX-NEXT: subq %rax, %rsi
; AVX-NEXT: addq %r9, %rsi
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: andq %r10, %rax
; AVX-NEXT: shrdq $60, %r8, %rcx
; AVX-NEXT: andq %r10, %rcx
; AVX-NEXT: addq %rax, %rcx
; AVX-NEXT: shrq $56, %r8
; AVX-NEXT: addq %rcx, %r8
; AVX-NEXT: movq %r8, %rax
; AVX-NEXT: mulq %r11
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
; AVX-NEXT: subq %rax, %rdx
; AVX-NEXT: addq %r8, %rdx
; AVX-NEXT: movq %rdx, 16(%rdi)
; AVX-NEXT: movq %rsi, (%rdi)
; AVX-NEXT: movq $0, 24(%rdi)
; AVX-NEXT: movq $0, 8(%rdi)
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: retq
entry:
%rem = urem <2 x i128> %x, <i128 7, i128 7>
ret <2 x i128> %rem
}
; urem of <2 x i128> by a splat of 14 (2 * 7). The expected output contains
; no call to __umodti3. Per lane, the input is shifted right by one
; (shrdq $1 / shrq), the shifted value is cut into 60-bit pieces (mask
; 0xFFFFFFFFFFFFFFF, shldq $4, and the original high word shifted right by
; 57) which are summed, and the sum is reduced modulo 7 with the
; 0x4924924924924925 multiply. The final remainder is rebuilt as
; 2 * (that remainder) + (bit 0 of the original input) via andl $1 and leaq,
; and the upper 64 bits of each result lane are stored as zero.
define <2 x i128> @v2i128_rem_by_14(<2 x i128> %x) nounwind {
; SSE-LABEL: v2i128_rem_by_14:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movq %rdx, %r9
; SSE-NEXT: movq %rsi, %rax
; SSE-NEXT: shrdq $1, %rdx, %rax
; SSE-NEXT: movabsq $1152921504606846975, %r10 # imm = 0xFFFFFFFFFFFFFFF
; SSE-NEXT: shrq %rdx
; SSE-NEXT: shldq $4, %rax, %rdx
; SSE-NEXT: andq %r10, %rax
; SSE-NEXT: andq %r10, %rdx
; SSE-NEXT: shrq $57, %r9
; SSE-NEXT: addq %rax, %r9
; SSE-NEXT: addq %rdx, %r9
; SSE-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925
; SSE-NEXT: movq %r9, %rax
; SSE-NEXT: mulq %r11
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leal (,%rdx,8), %eax
; SSE-NEXT: subl %eax, %edx
; SSE-NEXT: addl %edx, %r9d
; SSE-NEXT: andl $1, %esi
; SSE-NEXT: leaq (%rsi,%r9,2), %rsi
; SSE-NEXT: movq %rcx, %rax
; SSE-NEXT: shrdq $1, %r8, %rax
; SSE-NEXT: movq %r8, %rdx
; SSE-NEXT: shrq %rdx
; SSE-NEXT: shldq $4, %rax, %rdx
; SSE-NEXT: andq %r10, %rax
; SSE-NEXT: andq %r10, %rdx
; SSE-NEXT: shrq $57, %r8
; SSE-NEXT: addq %rax, %r8
; SSE-NEXT: addq %rdx, %r8
; SSE-NEXT: movq %r8, %rax
; SSE-NEXT: mulq %r11
; SSE-NEXT: shrq %rdx
; SSE-NEXT: leal (,%rdx,8), %eax
; SSE-NEXT: subl %eax, %edx
; SSE-NEXT: addl %edx, %r8d
; SSE-NEXT: andl $1, %ecx
; SSE-NEXT: leaq (%rcx,%r8,2), %rax
; SSE-NEXT: movq %rax, 16(%rdi)
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: movq $0, 24(%rdi)
; SSE-NEXT: movq $0, 8(%rdi)
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: v2i128_rem_by_14:
; AVX: # %bb.0: # %entry
; AVX-NEXT: movq %rdx, %r9
; AVX-NEXT: movq %rsi, %rax
; AVX-NEXT: shrdq $1, %rdx, %rax
; AVX-NEXT: movabsq $1152921504606846975, %r10 # imm = 0xFFFFFFFFFFFFFFF
; AVX-NEXT: shrq %rdx
; AVX-NEXT: shldq $4, %rax, %rdx
; AVX-NEXT: andq %r10, %rax
; AVX-NEXT: andq %r10, %rdx
; AVX-NEXT: shrq $57, %r9
; AVX-NEXT: addq %rax, %r9
; AVX-NEXT: addq %rdx, %r9
; AVX-NEXT: movabsq $5270498306774157605, %r11 # imm = 0x4924924924924925
; AVX-NEXT: movq %r9, %rax
; AVX-NEXT: mulq %r11
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leal (,%rdx,8), %eax
; AVX-NEXT: subl %eax, %edx
; AVX-NEXT: addl %edx, %r9d
; AVX-NEXT: andl $1, %esi
; AVX-NEXT: leaq (%rsi,%r9,2), %rsi
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: shrdq $1, %r8, %rax
; AVX-NEXT: movq %r8, %rdx
; AVX-NEXT: shrq %rdx
; AVX-NEXT: shldq $4, %rax, %rdx
; AVX-NEXT: andq %r10, %rax
; AVX-NEXT: andq %r10, %rdx
; AVX-NEXT: shrq $57, %r8
; AVX-NEXT: addq %rax, %r8
; AVX-NEXT: addq %rdx, %r8
; AVX-NEXT: movq %r8, %rax
; AVX-NEXT: mulq %r11
; AVX-NEXT: shrq %rdx
; AVX-NEXT: leal (,%rdx,8), %eax
; AVX-NEXT: subl %eax, %edx
; AVX-NEXT: addl %edx, %r8d
; AVX-NEXT: andl $1, %ecx
; AVX-NEXT: leaq (%rcx,%r8,2), %rax
; AVX-NEXT: movq %rax, 16(%rdi)
; AVX-NEXT: movq %rsi, (%rdi)
; AVX-NEXT: movq $0, 24(%rdi)
; AVX-NEXT: movq $0, 8(%rdi)
; AVX-NEXT: movq %rdi, %rax
; AVX-NEXT: retq
entry:
%rem = urem <2 x i128> %x, <i128 14, i128 14>
ret <2 x i128> %rem
}