[DAGCombine] Enable div by constant optimization for odd sized vectors before type legalization. (#188313)
If we are going to legalize to a vector with the same element type and mulh or mul_lohi is supported, allow the optimization before type legalization. RISC-V will widen vectors using vp.udiv/sdiv, which don't support the division-by-constant optimization. In addition, type legalization will create a build_vector with undef elements, making it hard to match after type legalization. Other targets may need to widen by a combination of vector and scalar divisions to avoid traps if we widen a vector with garbage. I had to enable the MULHU->SRL DAG combine before type legalization to prevent regressions. After type legalization, the multiply constant build_vector will have undef elements and the combine won't trigger.
This commit is contained in:
parent
8700d19a3d
commit
0ebef5e5e2
@ -5636,7 +5636,7 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
|
||||
// fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
|
||||
if (isConstantOrConstantVector(N1, /*NoOpaques=*/true,
|
||||
/*AllowTruncation=*/true) &&
|
||||
hasOperation(ISD::SRL, VT)) {
|
||||
(!LegalOperations || hasOperation(ISD::SRL, VT))) {
|
||||
if (SDValue LogBase2 = BuildLogBase2(N1, DL)) {
|
||||
unsigned NumEltBits = VT.getScalarSizeInBits();
|
||||
SDValue SRLAmt = DAG.getNode(
|
||||
|
||||
@ -6612,9 +6612,18 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
|
||||
|
||||
// Check to see if we can do this.
|
||||
// FIXME: We should be more aggressive here.
|
||||
if (!isTypeLegal(VT)) {
|
||||
EVT QueryVT = VT;
|
||||
if (VT.isVector()) {
|
||||
// If the vector type will be legalized to a vector type with the same
|
||||
// element type, allow the transform before type legalization if MULHS or
|
||||
// SMUL_LOHI are supported.
|
||||
QueryVT = getLegalTypeToTransformTo(*DAG.getContext(), VT);
|
||||
if (!QueryVT.isVector() ||
|
||||
QueryVT.getVectorElementType() != VT.getVectorElementType())
|
||||
return SDValue();
|
||||
} else if (!isTypeLegal(VT)) {
|
||||
// Limit this to simple scalars for now.
|
||||
if (VT.isVector() || !VT.isSimple())
|
||||
if (!VT.isSimple())
|
||||
return SDValue();
|
||||
|
||||
// If this type will be promoted to a large enough type with a legal
|
||||
@ -6628,11 +6637,12 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
bool HasMULHS = isOperationLegalOrCustom(ISD::MULHS, VT, IsAfterLegalization);
|
||||
bool HasMULHS =
|
||||
isOperationLegalOrCustom(ISD::MULHS, QueryVT, IsAfterLegalization);
|
||||
bool HasSMUL_LOHI =
|
||||
isOperationLegalOrCustom(ISD::SMUL_LOHI, VT, IsAfterLegalization);
|
||||
isOperationLegalOrCustom(ISD::SMUL_LOHI, QueryVT, IsAfterLegalization);
|
||||
|
||||
if (!HasMULHS && !HasSMUL_LOHI && MulVT == EVT()) {
|
||||
if (isTypeLegal(VT) && !HasMULHS && !HasSMUL_LOHI && MulVT == EVT()) {
|
||||
// If type twice as wide legal, widen and use a mul plus a shift.
|
||||
EVT WideVT = VT.widenIntegerElementType(*DAG.getContext());
|
||||
// Some targets like AMDGPU try to go from SDIV to SDIVREM which is then
|
||||
@ -6791,9 +6801,18 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
|
||||
|
||||
// Check to see if we can do this.
|
||||
// FIXME: We should be more aggressive here.
|
||||
if (!isTypeLegal(VT)) {
|
||||
EVT QueryVT = VT;
|
||||
if (VT.isVector()) {
|
||||
// If the vector type will be legalized to a vector type with the same
|
||||
// element type, allow the transform before type legalization if MULHU or
|
||||
// UMUL_LOHI are supported.
|
||||
QueryVT = getLegalTypeToTransformTo(*DAG.getContext(), VT);
|
||||
if (!QueryVT.isVector() ||
|
||||
QueryVT.getVectorElementType() != VT.getVectorElementType())
|
||||
return SDValue();
|
||||
} else if (!isTypeLegal(VT)) {
|
||||
// Limit this to simple scalars for now.
|
||||
if (VT.isVector() || !VT.isSimple())
|
||||
if (!VT.isSimple())
|
||||
return SDValue();
|
||||
|
||||
// If this type will be promoted to a large enough type with a legal
|
||||
@ -6807,14 +6826,15 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
bool HasMULHU = isOperationLegalOrCustom(ISD::MULHU, VT, IsAfterLegalization);
|
||||
bool HasMULHU =
|
||||
isOperationLegalOrCustom(ISD::MULHU, QueryVT, IsAfterLegalization);
|
||||
bool HasUMUL_LOHI =
|
||||
isOperationLegalOrCustom(ISD::UMUL_LOHI, VT, IsAfterLegalization);
|
||||
isOperationLegalOrCustom(ISD::UMUL_LOHI, QueryVT, IsAfterLegalization);
|
||||
|
||||
if (!HasMULHU && !HasUMUL_LOHI && MulVT == EVT()) {
|
||||
if (isTypeLegal(VT) && !HasMULHU && !HasUMUL_LOHI && MulVT == EVT()) {
|
||||
// If type twice as wide legal, widen and use a mul plus a shift.
|
||||
EVT WideVT = VT.widenIntegerElementType(*DAG.getContext());
|
||||
// Some targets like AMDGPU try to go from SDIV to SDIVREM which is then
|
||||
// Some targets like AMDGPU try to go from UDIV to UDIVREM which is then
|
||||
// custom lowered. This is very expensive so avoid it at all costs for
|
||||
// constant divisors.
|
||||
if ((!IsAfterLegalTypes && isOperationExpand(ISD::UDIV, VT) &&
|
||||
|
||||
@ -2334,23 +2334,16 @@ define <3 x i32> @sv3i32_7(<3 x i32> %d, <3 x i32> %e) {
|
||||
; CHECK-SD-LABEL: sv3i32_7:
|
||||
; CHECK-SD: // %bb.0: // %entry
|
||||
; CHECK-SD-NEXT: mov w8, #9363 // =0x2493
|
||||
; CHECK-SD-NEXT: mov w9, v0.s[2]
|
||||
; CHECK-SD-NEXT: movi v3.2s, #7
|
||||
; CHECK-SD-NEXT: movi v3.4s, #7
|
||||
; CHECK-SD-NEXT: movk w8, #37449, lsl #16
|
||||
; CHECK-SD-NEXT: dup v1.2s, w8
|
||||
; CHECK-SD-NEXT: smull x8, w9, w8
|
||||
; CHECK-SD-NEXT: dup v1.4s, w8
|
||||
; CHECK-SD-NEXT: smull2 v2.2d, v0.4s, v1.4s
|
||||
; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s
|
||||
; CHECK-SD-NEXT: add x8, x9, x8, lsr #32
|
||||
; CHECK-SD-NEXT: asr w10, w8, #2
|
||||
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32
|
||||
; CHECK-SD-NEXT: add w8, w10, w8, lsr #31
|
||||
; CHECK-SD-NEXT: add v1.2s, v1.2s, v0.2s
|
||||
; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3
|
||||
; CHECK-SD-NEXT: add w8, w9, w8
|
||||
; CHECK-SD-NEXT: sshr v2.2s, v1.2s, #2
|
||||
; CHECK-SD-NEXT: usra v2.2s, v1.2s, #31
|
||||
; CHECK-SD-NEXT: mls v0.2s, v2.2s, v3.2s
|
||||
; CHECK-SD-NEXT: mov v0.s[2], w8
|
||||
; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s
|
||||
; CHECK-SD-NEXT: add v1.4s, v1.4s, v0.4s
|
||||
; CHECK-SD-NEXT: sshr v2.4s, v1.4s, #2
|
||||
; CHECK-SD-NEXT: usra v2.4s, v1.4s, #31
|
||||
; CHECK-SD-NEXT: mls v0.4s, v2.4s, v3.4s
|
||||
; CHECK-SD-NEXT: ret
|
||||
;
|
||||
; CHECK-GI-LABEL: sv3i32_7:
|
||||
@ -2386,21 +2379,15 @@ define <3 x i32> @sv3i32_100(<3 x i32> %d, <3 x i32> %e) {
|
||||
; CHECK-SD-LABEL: sv3i32_100:
|
||||
; CHECK-SD: // %bb.0: // %entry
|
||||
; CHECK-SD-NEXT: mov w8, #34079 // =0x851f
|
||||
; CHECK-SD-NEXT: mov w9, v0.s[2]
|
||||
; CHECK-SD-NEXT: movi v2.2s, #100
|
||||
; CHECK-SD-NEXT: movi v3.4s, #100
|
||||
; CHECK-SD-NEXT: movk w8, #20971, lsl #16
|
||||
; CHECK-SD-NEXT: dup v1.2s, w8
|
||||
; CHECK-SD-NEXT: smull x8, w9, w8
|
||||
; CHECK-SD-NEXT: dup v1.4s, w8
|
||||
; CHECK-SD-NEXT: smull2 v2.2d, v0.4s, v1.4s
|
||||
; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s
|
||||
; CHECK-SD-NEXT: asr x10, x8, #37
|
||||
; CHECK-SD-NEXT: add x8, x10, x8, lsr #63
|
||||
; CHECK-SD-NEXT: mov w10, #100 // =0x64
|
||||
; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #37
|
||||
; CHECK-SD-NEXT: msub w8, w8, w10, w9
|
||||
; CHECK-SD-NEXT: xtn v1.2s, v1.2d
|
||||
; CHECK-SD-NEXT: usra v1.2s, v1.2s, #31
|
||||
; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s
|
||||
; CHECK-SD-NEXT: mov v0.s[2], w8
|
||||
; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s
|
||||
; CHECK-SD-NEXT: sshr v2.4s, v1.4s, #5
|
||||
; CHECK-SD-NEXT: usra v2.4s, v1.4s, #31
|
||||
; CHECK-SD-NEXT: mls v0.4s, v2.4s, v3.4s
|
||||
; CHECK-SD-NEXT: ret
|
||||
;
|
||||
; CHECK-GI-LABEL: sv3i32_100:
|
||||
@ -2560,26 +2547,16 @@ define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) {
|
||||
; CHECK-SD-LABEL: uv3i32_7:
|
||||
; CHECK-SD: // %bb.0: // %entry
|
||||
; CHECK-SD-NEXT: mov w8, #18725 // =0x4925
|
||||
; CHECK-SD-NEXT: mov x9, #2684354560 // =0xa0000000
|
||||
; CHECK-SD-NEXT: movk w8, #9362, lsl #16
|
||||
; CHECK-SD-NEXT: movk x9, #18724, lsl #32
|
||||
; CHECK-SD-NEXT: dup v1.2s, w8
|
||||
; CHECK-SD-NEXT: mov w8, v0.s[2]
|
||||
; CHECK-SD-NEXT: movk x9, #9362, lsl #48
|
||||
; CHECK-SD-NEXT: dup v1.4s, w8
|
||||
; CHECK-SD-NEXT: umull2 v2.2d, v0.4s, v1.4s
|
||||
; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s
|
||||
; CHECK-SD-NEXT: umulh x9, x8, x9
|
||||
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32
|
||||
; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3
|
||||
; CHECK-SD-NEXT: sub v2.2s, v0.2s, v1.2s
|
||||
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 def $q0
|
||||
; CHECK-SD-NEXT: add w8, w8, w9
|
||||
; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0
|
||||
; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #1
|
||||
; CHECK-SD-NEXT: add v1.2s, v2.2s, v1.2s
|
||||
; CHECK-SD-NEXT: movi v2.2s, #7
|
||||
; CHECK-SD-NEXT: ushr v1.2s, v1.2s, #2
|
||||
; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s
|
||||
; CHECK-SD-NEXT: mov v0.s[2], w8
|
||||
; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s
|
||||
; CHECK-SD-NEXT: sub v2.4s, v0.4s, v1.4s
|
||||
; CHECK-SD-NEXT: usra v1.4s, v2.4s, #1
|
||||
; CHECK-SD-NEXT: movi v2.4s, #7
|
||||
; CHECK-SD-NEXT: ushr v1.4s, v1.4s, #2
|
||||
; CHECK-SD-NEXT: mls v0.4s, v1.4s, v2.4s
|
||||
; CHECK-SD-NEXT: ret
|
||||
;
|
||||
; CHECK-GI-LABEL: uv3i32_7:
|
||||
@ -2630,19 +2607,14 @@ define <3 x i32> @uv3i32_100(<3 x i32> %d, <3 x i32> %e) {
|
||||
; CHECK-SD-LABEL: uv3i32_100:
|
||||
; CHECK-SD: // %bb.0: // %entry
|
||||
; CHECK-SD-NEXT: mov w8, #34079 // =0x851f
|
||||
; CHECK-SD-NEXT: mov w9, v0.s[2]
|
||||
; CHECK-SD-NEXT: movi v2.2s, #100
|
||||
; CHECK-SD-NEXT: movk w8, #20971, lsl #16
|
||||
; CHECK-SD-NEXT: mov w10, #100 // =0x64
|
||||
; CHECK-SD-NEXT: dup v1.2s, w8
|
||||
; CHECK-SD-NEXT: umull x8, w9, w8
|
||||
; CHECK-SD-NEXT: dup v1.4s, w8
|
||||
; CHECK-SD-NEXT: umull2 v2.2d, v0.4s, v1.4s
|
||||
; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s
|
||||
; CHECK-SD-NEXT: lsr x8, x8, #37
|
||||
; CHECK-SD-NEXT: msub w8, w8, w10, w9
|
||||
; CHECK-SD-NEXT: ushr v1.2d, v1.2d, #37
|
||||
; CHECK-SD-NEXT: xtn v1.2s, v1.2d
|
||||
; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s
|
||||
; CHECK-SD-NEXT: mov v0.s[2], w8
|
||||
; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s
|
||||
; CHECK-SD-NEXT: movi v2.4s, #100
|
||||
; CHECK-SD-NEXT: ushr v1.4s, v1.4s, #5
|
||||
; CHECK-SD-NEXT: mls v0.4s, v1.4s, v2.4s
|
||||
; CHECK-SD-NEXT: ret
|
||||
;
|
||||
; CHECK-GI-LABEL: uv3i32_100:
|
||||
|
||||
@ -51,9 +51,12 @@ entry:
|
||||
define <8 x i64> @test_i64(<8 x i64> %shuffle) {
|
||||
; LA32-LABEL: test_i64:
|
||||
; LA32: # %bb.0: # %entry
|
||||
; LA32-NEXT: xvrepli.d $xr2, 3
|
||||
; LA32-NEXT: xvdiv.du $xr0, $xr0, $xr2
|
||||
; LA32-NEXT: xvdiv.du $xr1, $xr1, $xr2
|
||||
; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
|
||||
; LA32-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI3_0)
|
||||
; LA32-NEXT: xvmuh.du $xr0, $xr0, $xr2
|
||||
; LA32-NEXT: xvsrli.d $xr0, $xr0, 1
|
||||
; LA32-NEXT: xvmuh.du $xr1, $xr1, $xr2
|
||||
; LA32-NEXT: xvsrli.d $xr1, $xr1, 1
|
||||
; LA32-NEXT: ret
|
||||
;
|
||||
; LA64-LABEL: test_i64:
|
||||
|
||||
@ -51,9 +51,12 @@ entry:
|
||||
define <4 x i64> @test_i64(<4 x i64> %shuffle) {
|
||||
; LA32-LABEL: test_i64:
|
||||
; LA32: # %bb.0: # %entry
|
||||
; LA32-NEXT: vrepli.d $vr2, 3
|
||||
; LA32-NEXT: vdiv.du $vr0, $vr0, $vr2
|
||||
; LA32-NEXT: vdiv.du $vr1, $vr1, $vr2
|
||||
; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
|
||||
; LA32-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI3_0)
|
||||
; LA32-NEXT: vmuh.du $vr0, $vr0, $vr2
|
||||
; LA32-NEXT: vsrli.d $vr0, $vr0, 1
|
||||
; LA32-NEXT: vmuh.du $vr1, $vr1, $vr2
|
||||
; LA32-NEXT: vsrli.d $vr1, $vr1, 1
|
||||
; LA32-NEXT: ret
|
||||
;
|
||||
; LA64-LABEL: test_i64:
|
||||
|
||||
@ -1141,10 +1141,25 @@ define void @mulhu_v6i16(ptr %x) {
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
|
||||
; CHECK-NEXT: vle16.v v8, (a0)
|
||||
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
|
||||
; CHECK-NEXT: vmv.v.i v9, 0
|
||||
; CHECK-NEXT: lui a1, %hi(.LCPI67_0)
|
||||
; CHECK-NEXT: addi a1, a1, %lo(.LCPI67_0)
|
||||
; CHECK-NEXT: vle16.v v9, (a1)
|
||||
; CHECK-NEXT: vdivu.vv v8, v8, v9
|
||||
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
|
||||
; CHECK-NEXT: vle16.v v10, (a1)
|
||||
; CHECK-NEXT: lui a1, 1048568
|
||||
; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma
|
||||
; CHECK-NEXT: vmv.s.x v9, a1
|
||||
; CHECK-NEXT: li a1, 33
|
||||
; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma
|
||||
; CHECK-NEXT: vmulhu.vv v10, v8, v10
|
||||
; CHECK-NEXT: vsub.vv v8, v8, v10
|
||||
; CHECK-NEXT: vmulhu.vv v8, v8, v9
|
||||
; CHECK-NEXT: vmv.s.x v0, a1
|
||||
; CHECK-NEXT: vadd.vv v8, v8, v10
|
||||
; CHECK-NEXT: vmv.v.i v9, 3
|
||||
; CHECK-NEXT: vmerge.vim v9, v9, 2, v0
|
||||
; CHECK-NEXT: vsrl.vv v8, v8, v9
|
||||
; CHECK-NEXT: vse16.v v8, (a0)
|
||||
; CHECK-NEXT: ret
|
||||
%a = load <6 x i16>, ptr %x
|
||||
@ -1287,9 +1302,16 @@ define void @mulhs_v6i16(ptr %x) {
|
||||
; CHECK-NEXT: vle16.v v8, (a0)
|
||||
; CHECK-NEXT: li a1, 22
|
||||
; CHECK-NEXT: vmv.s.x v0, a1
|
||||
; CHECK-NEXT: vmv.v.i v9, -7
|
||||
; CHECK-NEXT: vmerge.vim v9, v9, 7, v0
|
||||
; CHECK-NEXT: vdiv.vv v8, v8, v9
|
||||
; CHECK-NEXT: lui a1, 1048571
|
||||
; CHECK-NEXT: addi a1, a1, 1755
|
||||
; CHECK-NEXT: vmv.v.x v9, a1
|
||||
; CHECK-NEXT: lui a1, 5
|
||||
; CHECK-NEXT: addi a1, a1, -1755
|
||||
; CHECK-NEXT: vmerge.vxm v9, v9, a1, v0
|
||||
; CHECK-NEXT: vmulh.vv v8, v8, v9
|
||||
; CHECK-NEXT: vsra.vi v8, v8, 1
|
||||
; CHECK-NEXT: vsrl.vi v9, v8, 15
|
||||
; CHECK-NEXT: vadd.vv v8, v8, v9
|
||||
; CHECK-NEXT: vse16.v v8, (a0)
|
||||
; CHECK-NEXT: ret
|
||||
%a = load <6 x i16>, ptr %x
|
||||
|
||||
@ -2213,121 +2213,186 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
|
||||
; CHECK-SSE2-LABEL: pr51133:
|
||||
; CHECK-SSE2: # %bb.0:
|
||||
; CHECK-SSE2-NEXT: movq %rdi, %rax
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm5
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [9,u,0,u,41,u,183,u,1,u,1,u,161,u,221,u]
|
||||
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
|
||||
; CHECK-SSE2-NEXT: pand %xmm4, %xmm5
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm6
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [171,u,103,u,183,u,171,u,61,u,1,u,127,u,183,u]
|
||||
; CHECK-SSE2-NEXT: pand %xmm4, %xmm6
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6
|
||||
; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [128,1,128,1,128,32,1,1]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm5
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [1,1,1,128,64,2,1,32]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm6
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6
|
||||
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
|
||||
; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7
|
||||
; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7
|
||||
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
|
||||
; CHECK-SSE2-NEXT: pandn %xmm5, %xmm7
|
||||
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm4
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,u,1,u,0,u,1,u,1,u,1,u,0,u,1,u]
|
||||
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
|
||||
; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm7
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,u,0,u,1,u,0,u,0,u,255,u,0,u,1,u]
|
||||
; CHECK-SSE2-NEXT: pand %xmm5, %xmm7
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm7
|
||||
; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4
|
||||
; CHECK-SSE2-NEXT: pxor %xmm8, %xmm8
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15]
|
||||
; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 # [0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm8
|
||||
; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6
|
||||
; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
|
||||
; CHECK-SSE2-NEXT: pandn %xmm1, %xmm5
|
||||
; CHECK-SSE2-NEXT: por %xmm7, %xmm5
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,u,223,u,205,u,183,u,161,u,1,u,171,u,239,u]
|
||||
; CHECK-SSE2-NEXT: pand %xmm4, %xmm1
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,u,205,u,27,u,241,u,1,u,1,u,1,u,163,u]
|
||||
; CHECK-SSE2-NEXT: pand %xmm4, %xmm0
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
|
||||
; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,128,1,1,1,128,1,64]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm1
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,1,128,128,32,128,32]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm0
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
|
||||
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
|
||||
; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1
|
||||
; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
|
||||
; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3
|
||||
; CHECK-SSE2-NEXT: pandn %xmm5, %xmm3
|
||||
; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm2
|
||||
; CHECK-SSE2-NEXT: pandn %xmm1, %xmm2
|
||||
; CHECK-SSE2-NEXT: pmovmskb %xmm2, %ecx
|
||||
; CHECK-SSE2-NEXT: pmovmskb %xmm3, %edx
|
||||
; CHECK-SSE2-NEXT: shll $16, %edx
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
|
||||
; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm6
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm8, %xmm6
|
||||
; CHECK-SSE2-NEXT: paddb %xmm7, %xmm6
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
|
||||
; CHECK-SSE2-NEXT: psraw $8, %xmm7
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [64,256,32,64,256,64,8,4]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm7
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm8
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
|
||||
; CHECK-SSE2-NEXT: psraw $8, %xmm8
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 # [256,8,64,256,16,4,8,8]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm8
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm8
|
||||
; CHECK-SSE2-NEXT: psrlw $7, %xmm6
|
||||
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
|
||||
; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
|
||||
; CHECK-SSE2-NEXT: paddb %xmm8, %xmm6
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [114,u,1,u,50,u,7,u,2,u,8,u,97,u,117,u]
|
||||
; CHECK-SSE2-NEXT: pand %xmm5, %xmm7
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [3,u,87,u,7,u,6,u,84,u,128,u,127,u,56,u]
|
||||
; CHECK-SSE2-NEXT: pand %xmm5, %xmm6
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm6
|
||||
; CHECK-SSE2-NEXT: psubb %xmm6, %xmm1
|
||||
; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
|
||||
; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm6
|
||||
; CHECK-SSE2-NEXT: pxor %xmm7, %xmm7
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
|
||||
; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm7
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm6, %xmm7
|
||||
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,0,255,255,255,0,255,255,0,255,0,255,0,255]
|
||||
; CHECK-SSE2-NEXT: pand %xmm0, %xmm6
|
||||
; CHECK-SSE2-NEXT: paddb %xmm7, %xmm6
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
|
||||
; CHECK-SSE2-NEXT: psraw $8, %xmm7
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [8,8,128,64,8,256,256,8]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm7
|
||||
; CHECK-SSE2-NEXT: pxor %xmm8, %xmm8
|
||||
; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm8
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; CHECK-SSE2-NEXT: psraw $8, %xmm6
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [64,128,128,16,256,64,256,16]
|
||||
; CHECK-SSE2-NEXT: psrlw $8, %xmm6
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm6
|
||||
; CHECK-SSE2-NEXT: psubb %xmm8, %xmm6
|
||||
; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7
|
||||
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [62,u,62,u,5,u,7,u,97,u,2,u,3,u,60,u]
|
||||
; CHECK-SSE2-NEXT: pand %xmm5, %xmm7
|
||||
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [13,u,5,u,19,u,34,u,2,u,8,u,2,u,88,u]
|
||||
; CHECK-SSE2-NEXT: pand %xmm5, %xmm6
|
||||
; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm6
|
||||
; CHECK-SSE2-NEXT: psubb %xmm6, %xmm0
|
||||
; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm1
|
||||
; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm0
|
||||
; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm2
|
||||
; CHECK-SSE2-NEXT: por %xmm0, %xmm2
|
||||
; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm3
|
||||
; CHECK-SSE2-NEXT: por %xmm1, %xmm3
|
||||
; CHECK-SSE2-NEXT: pmovmskb %xmm3, %ecx
|
||||
; CHECK-SSE2-NEXT: notl %ecx
|
||||
; CHECK-SSE2-NEXT: shll $16, %ecx
|
||||
; CHECK-SSE2-NEXT: pmovmskb %xmm2, %edx
|
||||
; CHECK-SSE2-NEXT: xorl $65535, %edx # imm = 0xFFFF
|
||||
; CHECK-SSE2-NEXT: orl %ecx, %edx
|
||||
; CHECK-SSE2-NEXT: movl %edx, (%rdi)
|
||||
; CHECK-SSE2-NEXT: retq
|
||||
;
|
||||
; CHECK-SSE41-LABEL: pr51133:
|
||||
; CHECK-SSE41: # %bb.0:
|
||||
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4
|
||||
; CHECK-SSE41-NEXT: movq %rdi, %rax
|
||||
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221]
|
||||
; CHECK-SSE41-NEXT: pmullw %xmm1, %xmm0
|
||||
; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
|
||||
; CHECK-SSE41-NEXT: pand %xmm5, %xmm0
|
||||
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6
|
||||
; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221]
|
||||
; CHECK-SSE41-NEXT: psllw $8, %xmm6
|
||||
; CHECK-SSE41-NEXT: por %xmm0, %xmm6
|
||||
; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
|
||||
; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm0
|
||||
; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
|
||||
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,1,128,1,128,32,1,1]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm0
|
||||
; CHECK-SSE41-NEXT: pxor %xmm4, %xmm4
|
||||
; CHECK-SSE41-NEXT: pxor %xmm5, %xmm5
|
||||
; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
|
||||
; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm5
|
||||
; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7
|
||||
; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
|
||||
; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm7
|
||||
; CHECK-SSE41-NEXT: packuswb %xmm5, %xmm7
|
||||
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,0,255,255,255,0,255,255,0,255,0,255,0,255]
|
||||
; CHECK-SSE41-NEXT: pand %xmm0, %xmm6
|
||||
; CHECK-SSE41-NEXT: paddb %xmm7, %xmm6
|
||||
; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm5
|
||||
; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
|
||||
; CHECK-SSE41-NEXT: psraw $8, %xmm5
|
||||
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [8,8,128,64,8,256,256,8]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm5
|
||||
; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7
|
||||
; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm7
|
||||
; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [1,1,1,128,64,2,1,32]
|
||||
; CHECK-SSE41-NEXT: psraw $8, %xmm6
|
||||
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [64,128,128,16,256,64,256,16]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm6
|
||||
; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6
|
||||
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
|
||||
; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0
|
||||
; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0
|
||||
; CHECK-SSE41-NEXT: pcmpeqd %xmm7, %xmm7
|
||||
; CHECK-SSE41-NEXT: pxor %xmm0, %xmm7
|
||||
; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
|
||||
; CHECK-SSE41-NEXT: packuswb %xmm5, %xmm6
|
||||
; CHECK-SSE41-NEXT: psubb %xmm7, %xmm6
|
||||
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm7 = [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60]
|
||||
; CHECK-SSE41-NEXT: pmullw %xmm6, %xmm7
|
||||
; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
|
||||
; CHECK-SSE41-NEXT: pand %xmm5, %xmm7
|
||||
; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60]
|
||||
; CHECK-SSE41-NEXT: psllw $8, %xmm6
|
||||
; CHECK-SSE41-NEXT: por %xmm7, %xmm6
|
||||
; CHECK-SSE41-NEXT: psubb %xmm6, %xmm0
|
||||
; CHECK-SSE41-NEXT: pxor %xmm6, %xmm6
|
||||
; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1
|
||||
; CHECK-SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
|
||||
; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1
|
||||
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239]
|
||||
; CHECK-SSE41-NEXT: pmullw %xmm4, %xmm0
|
||||
; CHECK-SSE41-NEXT: pand %xmm5, %xmm0
|
||||
; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239]
|
||||
; CHECK-SSE41-NEXT: psllw $8, %xmm4
|
||||
; CHECK-SSE41-NEXT: por %xmm0, %xmm4
|
||||
; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
|
||||
; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm0
|
||||
; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
|
||||
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,128,1,1,1,128,1,64]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm0
|
||||
; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
|
||||
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,1,1,128,128,32,128,32]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm4
|
||||
; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm4
|
||||
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
|
||||
; CHECK-SSE41-NEXT: pmaxub %xmm4, %xmm0
|
||||
; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15]
|
||||
; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm6
|
||||
; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7
|
||||
; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
|
||||
; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm7
|
||||
; CHECK-SSE41-NEXT: packuswb %xmm6, %xmm7
|
||||
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm8 = [0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1]
|
||||
; CHECK-SSE41-NEXT: pmullw %xmm1, %xmm8
|
||||
; CHECK-SSE41-NEXT: pand %xmm5, %xmm8
|
||||
; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6
|
||||
; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1]
|
||||
; CHECK-SSE41-NEXT: psllw $8, %xmm6
|
||||
; CHECK-SSE41-NEXT: por %xmm8, %xmm6
|
||||
; CHECK-SSE41-NEXT: paddb %xmm7, %xmm6
|
||||
; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm7
|
||||
; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
|
||||
; CHECK-SSE41-NEXT: psraw $8, %xmm7
|
||||
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [64,256,32,64,256,64,8,4]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm7
|
||||
; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm8
|
||||
; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
|
||||
; CHECK-SSE41-NEXT: psraw $8, %xmm8
|
||||
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 # [256,8,64,256,16,4,8,8]
|
||||
; CHECK-SSE41-NEXT: psrlw $8, %xmm8
|
||||
; CHECK-SSE41-NEXT: packuswb %xmm7, %xmm8
|
||||
; CHECK-SSE41-NEXT: psrlw $7, %xmm6
|
||||
; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
|
||||
; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
|
||||
; CHECK-SSE41-NEXT: paddb %xmm8, %xmm6
|
||||
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm7 = [3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117]
|
||||
; CHECK-SSE41-NEXT: pmullw %xmm6, %xmm7
|
||||
; CHECK-SSE41-NEXT: pand %xmm5, %xmm7
|
||||
; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117]
|
||||
; CHECK-SSE41-NEXT: psllw $8, %xmm6
|
||||
; CHECK-SSE41-NEXT: por %xmm7, %xmm6
|
||||
; CHECK-SSE41-NEXT: psubb %xmm6, %xmm1
|
||||
; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm1
|
||||
; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm0
|
||||
; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm3
|
||||
; CHECK-SSE41-NEXT: pandn %xmm1, %xmm3
|
||||
; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm2
|
||||
; CHECK-SSE41-NEXT: pandn %xmm0, %xmm2
|
||||
; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm3
|
||||
; CHECK-SSE41-NEXT: por %xmm1, %xmm3
|
||||
; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm2
|
||||
; CHECK-SSE41-NEXT: por %xmm0, %xmm2
|
||||
; CHECK-SSE41-NEXT: pmovmskb %xmm2, %ecx
|
||||
; CHECK-SSE41-NEXT: xorl $65535, %ecx # imm = 0xFFFF
|
||||
; CHECK-SSE41-NEXT: pmovmskb %xmm3, %edx
|
||||
; CHECK-SSE41-NEXT: notl %edx
|
||||
; CHECK-SSE41-NEXT: shll $16, %edx
|
||||
; CHECK-SSE41-NEXT: orl %ecx, %edx
|
||||
; CHECK-SSE41-NEXT: movl %edx, (%rdi)
|
||||
|
||||
@ -6,155 +6,78 @@
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
|
||||
|
||||
define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
|
||||
; SSE-LABEL: fold_srem_vec_1:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: pextrw $3, %xmm0, %eax
|
||||
; SSE-NEXT: movswl %ax, %ecx
|
||||
; SSE-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
|
||||
; SSE-NEXT: shrl $16, %ecx
|
||||
; SSE-NEXT: subl %eax, %ecx
|
||||
; SSE-NEXT: movzwl %cx, %ecx
|
||||
; SSE-NEXT: movswl %cx, %edx
|
||||
; SSE-NEXT: shrl $15, %ecx
|
||||
; SSE-NEXT: sarl $9, %edx
|
||||
; SSE-NEXT: addl %ecx, %edx
|
||||
; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: movd %xmm0, %ecx
|
||||
; SSE-NEXT: movswl %cx, %edx
|
||||
; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77
|
||||
; SSE-NEXT: shrl $16, %edx
|
||||
; SSE-NEXT: addl %ecx, %edx
|
||||
; SSE-NEXT: movzwl %dx, %edx
|
||||
; SSE-NEXT: movswl %dx, %esi
|
||||
; SSE-NEXT: shrl $15, %edx
|
||||
; SSE-NEXT: sarl $6, %esi
|
||||
; SSE-NEXT: addl %edx, %esi
|
||||
; SSE-NEXT: imull $95, %esi, %edx
|
||||
; SSE-NEXT: subl %edx, %ecx
|
||||
; SSE-NEXT: movd %ecx, %xmm1
|
||||
; SSE-NEXT: pextrw $1, %xmm0, %ecx
|
||||
; SSE-NEXT: movswl %cx, %edx
|
||||
; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF
|
||||
; SSE-NEXT: movl %edx, %esi
|
||||
; SSE-NEXT: shrl $31, %esi
|
||||
; SSE-NEXT: sarl $21, %edx
|
||||
; SSE-NEXT: addl %esi, %edx
|
||||
; SSE-NEXT: imull $-124, %edx, %edx
|
||||
; SSE-NEXT: subl %edx, %ecx
|
||||
; SSE-NEXT: pinsrw $1, %ecx, %xmm1
|
||||
; SSE-NEXT: pextrw $2, %xmm0, %ecx
|
||||
; SSE-NEXT: movswl %cx, %edx
|
||||
; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73
|
||||
; SSE-NEXT: movl %edx, %esi
|
||||
; SSE-NEXT: shrl $31, %esi
|
||||
; SSE-NEXT: sarl $18, %edx
|
||||
; SSE-NEXT: addl %esi, %edx
|
||||
; SSE-NEXT: imull $98, %edx, %edx
|
||||
; SSE-NEXT: subl %edx, %ecx
|
||||
; SSE-NEXT: pinsrw $2, %ecx, %xmm1
|
||||
; SSE-NEXT: pinsrw $3, %eax, %xmm1
|
||||
; SSE-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
; SSE2-LABEL: fold_srem_vec_1:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm2 = [1,0,0,65535,0,0,0,0]
|
||||
; SSE2-NEXT: pmullw %xmm0, %xmm2
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm1 = [44151,48623,2675,32081,0,0,0,0]
|
||||
; SSE2-NEXT: pmulhw %xmm0, %xmm1
|
||||
; SSE2-NEXT: paddw %xmm2, %xmm1
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm3
|
||||
; SSE2-NEXT: pand %xmm2, %xmm3
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm4
|
||||
; SSE2-NEXT: psraw $8, %xmm4
|
||||
; SSE2-NEXT: pandn %xmm4, %xmm2
|
||||
; SSE2-NEXT: por %xmm3, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm3
|
||||
; SSE2-NEXT: psraw $4, %xmm3
|
||||
; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
|
||||
; SSE2-NEXT: movaps %xmm2, %xmm3
|
||||
; SSE2-NEXT: psraw $1, %xmm3
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: psraw $2, %xmm2
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,0,65535,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm6
|
||||
; SSE2-NEXT: pand %xmm5, %xmm6
|
||||
; SSE2-NEXT: pandn %xmm2, %xmm5
|
||||
; SSE2-NEXT: por %xmm6, %xmm5
|
||||
; SSE2-NEXT: pand %xmm4, %xmm5
|
||||
; SSE2-NEXT: pandn %xmm3, %xmm4
|
||||
; SSE2-NEXT: por %xmm5, %xmm4
|
||||
; SSE2-NEXT: psrlw $15, %xmm1
|
||||
; SSE2-NEXT: paddw %xmm4, %xmm1
|
||||
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,65412,98,64533,u,u,u,u]
|
||||
; SSE2-NEXT: psubw %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE4-LABEL: fold_srem_vec_1:
|
||||
; SSE4: # %bb.0:
|
||||
; SSE4-NEXT: movq {{.*#+}} xmm1 = [1,0,0,65535,0,0,0,0]
|
||||
; SSE4-NEXT: pmullw %xmm0, %xmm1
|
||||
; SSE4-NEXT: movq {{.*#+}} xmm2 = [44151,48623,2675,32081,0,0,0,0]
|
||||
; SSE4-NEXT: pmulhw %xmm0, %xmm2
|
||||
; SSE4-NEXT: paddw %xmm1, %xmm2
|
||||
; SSE4-NEXT: movdqa %xmm2, %xmm1
|
||||
; SSE4-NEXT: psrlw $15, %xmm1
|
||||
; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1024,2048,16384,128,u,u,u,u]
|
||||
; SSE4-NEXT: paddw %xmm1, %xmm2
|
||||
; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [95,65412,98,64533,u,u,u,u]
|
||||
; SSE4-NEXT: psubw %xmm2, %xmm0
|
||||
; SSE4-NEXT: retq
|
||||
;
|
||||
; AVX1OR2-LABEL: fold_srem_vec_1:
|
||||
; AVX1OR2: # %bb.0:
|
||||
; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX1OR2-NEXT: movswl %ax, %ecx
|
||||
; AVX1OR2-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
|
||||
; AVX1OR2-NEXT: shrl $16, %ecx
|
||||
; AVX1OR2-NEXT: subl %eax, %ecx
|
||||
; AVX1OR2-NEXT: movzwl %cx, %ecx
|
||||
; AVX1OR2-NEXT: movswl %cx, %edx
|
||||
; AVX1OR2-NEXT: shrl $15, %ecx
|
||||
; AVX1OR2-NEXT: sarl $9, %edx
|
||||
; AVX1OR2-NEXT: addl %ecx, %edx
|
||||
; AVX1OR2-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15
|
||||
; AVX1OR2-NEXT: subl %ecx, %eax
|
||||
; AVX1OR2-NEXT: vmovd %xmm0, %ecx
|
||||
; AVX1OR2-NEXT: movswl %cx, %edx
|
||||
; AVX1OR2-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77
|
||||
; AVX1OR2-NEXT: shrl $16, %edx
|
||||
; AVX1OR2-NEXT: addl %ecx, %edx
|
||||
; AVX1OR2-NEXT: movzwl %dx, %edx
|
||||
; AVX1OR2-NEXT: movswl %dx, %esi
|
||||
; AVX1OR2-NEXT: shrl $15, %edx
|
||||
; AVX1OR2-NEXT: sarl $6, %esi
|
||||
; AVX1OR2-NEXT: addl %edx, %esi
|
||||
; AVX1OR2-NEXT: imull $95, %esi, %edx
|
||||
; AVX1OR2-NEXT: subl %edx, %ecx
|
||||
; AVX1OR2-NEXT: vmovd %ecx, %xmm1
|
||||
; AVX1OR2-NEXT: vpextrw $1, %xmm0, %ecx
|
||||
; AVX1OR2-NEXT: movswl %cx, %edx
|
||||
; AVX1OR2-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF
|
||||
; AVX1OR2-NEXT: movl %edx, %esi
|
||||
; AVX1OR2-NEXT: shrl $31, %esi
|
||||
; AVX1OR2-NEXT: sarl $21, %edx
|
||||
; AVX1OR2-NEXT: addl %esi, %edx
|
||||
; AVX1OR2-NEXT: imull $-124, %edx, %edx
|
||||
; AVX1OR2-NEXT: subl %edx, %ecx
|
||||
; AVX1OR2-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
|
||||
; AVX1OR2-NEXT: vpextrw $2, %xmm0, %ecx
|
||||
; AVX1OR2-NEXT: movswl %cx, %edx
|
||||
; AVX1OR2-NEXT: imull $2675, %edx, %edx # imm = 0xA73
|
||||
; AVX1OR2-NEXT: movl %edx, %esi
|
||||
; AVX1OR2-NEXT: shrl $31, %esi
|
||||
; AVX1OR2-NEXT: sarl $18, %edx
|
||||
; AVX1OR2-NEXT: addl %esi, %edx
|
||||
; AVX1OR2-NEXT: imull $98, %edx, %edx
|
||||
; AVX1OR2-NEXT: subl %edx, %ecx
|
||||
; AVX1OR2-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0
|
||||
; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
|
||||
; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,0,0,65535,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [44151,48623,2675,32081,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1
|
||||
; AVX1OR2-NEXT: vpsrlw $15, %xmm1, %xmm2
|
||||
; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1024,2048,16384,128,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1
|
||||
; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,65412,98,64533,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX1OR2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: fold_srem_vec_1:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX512-NEXT: movswl %ax, %ecx
|
||||
; AVX512-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51
|
||||
; AVX512-NEXT: shrl $16, %ecx
|
||||
; AVX512-NEXT: subl %eax, %ecx
|
||||
; AVX512-NEXT: movzwl %cx, %edx
|
||||
; AVX512-NEXT: movswl %dx, %ecx
|
||||
; AVX512-NEXT: shrl $15, %edx
|
||||
; AVX512-NEXT: sarl $9, %ecx
|
||||
; AVX512-NEXT: addl %edx, %ecx
|
||||
; AVX512-NEXT: vmovd %xmm0, %edx
|
||||
; AVX512-NEXT: movswl %dx, %esi
|
||||
; AVX512-NEXT: imull $-21385, %esi, %esi # imm = 0xAC77
|
||||
; AVX512-NEXT: shrl $16, %esi
|
||||
; AVX512-NEXT: addl %edx, %esi
|
||||
; AVX512-NEXT: movzwl %si, %esi
|
||||
; AVX512-NEXT: movswl %si, %edi
|
||||
; AVX512-NEXT: shrl $15, %esi
|
||||
; AVX512-NEXT: sarl $6, %edi
|
||||
; AVX512-NEXT: addl %esi, %edi
|
||||
; AVX512-NEXT: imull $95, %edi, %esi
|
||||
; AVX512-NEXT: subl %esi, %edx
|
||||
; AVX512-NEXT: vmovd %edx, %xmm1
|
||||
; AVX512-NEXT: vpextrw $1, %xmm0, %edx
|
||||
; AVX512-NEXT: movswl %dx, %esi
|
||||
; AVX512-NEXT: imull $-16913, %esi, %esi # imm = 0xBDEF
|
||||
; AVX512-NEXT: movl %esi, %edi
|
||||
; AVX512-NEXT: shrl $31, %edi
|
||||
; AVX512-NEXT: sarl $21, %esi
|
||||
; AVX512-NEXT: addl %edi, %esi
|
||||
; AVX512-NEXT: imull $-1003, %ecx, %ecx # imm = 0xFC15
|
||||
; AVX512-NEXT: imull $-124, %esi, %esi
|
||||
; AVX512-NEXT: subl %esi, %edx
|
||||
; AVX512-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpextrw $2, %xmm0, %edx
|
||||
; AVX512-NEXT: subl %ecx, %eax
|
||||
; AVX512-NEXT: movswl %dx, %ecx
|
||||
; AVX512-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
|
||||
; AVX512-NEXT: movl %ecx, %esi
|
||||
; AVX512-NEXT: shrl $31, %esi
|
||||
; AVX512-NEXT: sarl $18, %ecx
|
||||
; AVX512-NEXT: addl %esi, %ecx
|
||||
; AVX512-NEXT: imull $98, %ecx, %ecx
|
||||
; AVX512-NEXT: subl %ecx, %edx
|
||||
; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,0,0,65535,u,u,u,u]
|
||||
; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [44151,48623,2675,32081,u,u,u,u]
|
||||
; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1
|
||||
; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm2
|
||||
; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,65412,98,64533,u,u,u,u]
|
||||
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
%1 = srem <4 x i16> %x, <i16 95, i16 -124, i16 98, i16 -1003>
|
||||
ret <4 x i16> %1
|
||||
@ -163,25 +86,25 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
|
||||
define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
|
||||
; SSE-LABEL: fold_srem_vec_2:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
|
||||
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
|
||||
; SSE-NEXT: pmulhw %xmm0, %xmm1
|
||||
; SSE-NEXT: paddw %xmm0, %xmm1
|
||||
; SSE-NEXT: movdqa %xmm1, %xmm2
|
||||
; SSE-NEXT: psrlw $15, %xmm2
|
||||
; SSE-NEXT: psraw $6, %xmm1
|
||||
; SSE-NEXT: paddw %xmm2, %xmm1
|
||||
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,95,95,95,95]
|
||||
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,u,u,u,u]
|
||||
; SSE-NEXT: psubw %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: fold_srem_vec_2:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
|
||||
; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u]
|
||||
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
|
||||
; AVX-NEXT: vpsraw $6, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,95,95,95,95]
|
||||
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,u,u,u,u]
|
||||
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%1 = srem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
|
||||
@ -193,14 +116,14 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
|
||||
define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
|
||||
; SSE2-LABEL: combine_srem_sdiv:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
|
||||
; SSE2-NEXT: pmulhw %xmm0, %xmm1
|
||||
; SSE2-NEXT: paddw %xmm0, %xmm1
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm2
|
||||
; SSE2-NEXT: psrlw $15, %xmm2
|
||||
; SSE2-NEXT: psraw $6, %xmm1
|
||||
; SSE2-NEXT: paddw %xmm2, %xmm1
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,u,u,u,u]
|
||||
; SSE2-NEXT: pmullw %xmm1, %xmm2
|
||||
; SSE2-NEXT: psubw %xmm2, %xmm0
|
||||
; SSE2-NEXT: paddw %xmm1, %xmm0
|
||||
@ -208,7 +131,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
|
||||
;
|
||||
; SSE4-LABEL: combine_srem_sdiv:
|
||||
; SSE4: # %bb.0:
|
||||
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
|
||||
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
|
||||
; SSE4-NEXT: pmulhw %xmm0, %xmm1
|
||||
; SSE4-NEXT: paddw %xmm0, %xmm1
|
||||
; SSE4-NEXT: movdqa %xmm1, %xmm2
|
||||
@ -223,12 +146,12 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
|
||||
;
|
||||
; AVX-LABEL: combine_srem_sdiv:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
|
||||
; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u]
|
||||
; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2
|
||||
; AVX-NEXT: vpsraw $6, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,95,95,95,95]
|
||||
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,u,u,u,u]
|
||||
; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
@ -240,248 +163,237 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
|
||||
|
||||
; Don't fold for divisors that are a power of two.
|
||||
define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
|
||||
; SSE-LABEL: dont_fold_srem_power_of_two:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: movdqa %xmm0, %xmm1
|
||||
; SSE-NEXT: pextrw $1, %xmm0, %eax
|
||||
; SSE-NEXT: leal 31(%rax), %ecx
|
||||
; SSE-NEXT: testw %ax, %ax
|
||||
; SSE-NEXT: cmovnsl %eax, %ecx
|
||||
; SSE-NEXT: andl $-32, %ecx
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: movd %xmm0, %ecx
|
||||
; SSE-NEXT: leal 63(%rcx), %edx
|
||||
; SSE-NEXT: testw %cx, %cx
|
||||
; SSE-NEXT: cmovnsl %ecx, %edx
|
||||
; SSE-NEXT: andl $-64, %edx
|
||||
; SSE-NEXT: subl %edx, %ecx
|
||||
; SSE-NEXT: movd %ecx, %xmm0
|
||||
; SSE-NEXT: pinsrw $1, %eax, %xmm0
|
||||
; SSE-NEXT: pextrw $2, %xmm1, %eax
|
||||
; SSE-NEXT: leal 7(%rax), %ecx
|
||||
; SSE-NEXT: testw %ax, %ax
|
||||
; SSE-NEXT: cmovnsl %eax, %ecx
|
||||
; SSE-NEXT: andl $-8, %ecx
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: pinsrw $2, %eax, %xmm0
|
||||
; SSE-NEXT: pextrw $3, %xmm1, %eax
|
||||
; SSE-NEXT: movswl %ax, %ecx
|
||||
; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77
|
||||
; SSE-NEXT: shrl $16, %ecx
|
||||
; SSE-NEXT: addl %eax, %ecx
|
||||
; SSE-NEXT: movzwl %cx, %ecx
|
||||
; SSE-NEXT: movswl %cx, %edx
|
||||
; SSE-NEXT: shrl $15, %ecx
|
||||
; SSE-NEXT: sarl $6, %edx
|
||||
; SSE-NEXT: addl %ecx, %edx
|
||||
; SSE-NEXT: imull $95, %edx, %ecx
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: pinsrw $3, %eax, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
; SSE2-LABEL: dont_fold_srem_power_of_two:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm1 = [32769,32769,32769,44151,0,0,0,0]
|
||||
; SSE2-NEXT: pmulhw %xmm0, %xmm1
|
||||
; SSE2-NEXT: paddw %xmm0, %xmm1
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,0,65535,0,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm3
|
||||
; SSE2-NEXT: pand %xmm2, %xmm3
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm4
|
||||
; SSE2-NEXT: psraw $4, %xmm4
|
||||
; SSE2-NEXT: pandn %xmm4, %xmm2
|
||||
; SSE2-NEXT: por %xmm3, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm3
|
||||
; SSE2-NEXT: psraw $2, %xmm3
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0]
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3]
|
||||
; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: andps %xmm2, %xmm3
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm4
|
||||
; SSE2-NEXT: psraw $5, %xmm4
|
||||
; SSE2-NEXT: andnps %xmm4, %xmm2
|
||||
; SSE2-NEXT: orps %xmm3, %xmm2
|
||||
; SSE2-NEXT: psrlw $15, %xmm1
|
||||
; SSE2-NEXT: paddw %xmm2, %xmm1
|
||||
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,8,95,u,u,u,u]
|
||||
; SSE2-NEXT: psubw %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: dont_fold_srem_power_of_two:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpextrw $1, %xmm0, %eax
|
||||
; AVX-NEXT: leal 31(%rax), %ecx
|
||||
; AVX-NEXT: testw %ax, %ax
|
||||
; AVX-NEXT: cmovnsl %eax, %ecx
|
||||
; AVX-NEXT: andl $-32, %ecx
|
||||
; AVX-NEXT: subl %ecx, %eax
|
||||
; AVX-NEXT: vmovd %xmm0, %ecx
|
||||
; AVX-NEXT: leal 63(%rcx), %edx
|
||||
; AVX-NEXT: testw %cx, %cx
|
||||
; AVX-NEXT: cmovnsl %ecx, %edx
|
||||
; AVX-NEXT: andl $-64, %edx
|
||||
; AVX-NEXT: subl %edx, %ecx
|
||||
; AVX-NEXT: vmovd %ecx, %xmm1
|
||||
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpextrw $2, %xmm0, %eax
|
||||
; AVX-NEXT: leal 7(%rax), %ecx
|
||||
; AVX-NEXT: testw %ax, %ax
|
||||
; AVX-NEXT: cmovnsl %eax, %ecx
|
||||
; AVX-NEXT: andl $-8, %ecx
|
||||
; AVX-NEXT: subl %ecx, %eax
|
||||
; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX-NEXT: movswl %ax, %ecx
|
||||
; AVX-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77
|
||||
; AVX-NEXT: shrl $16, %ecx
|
||||
; AVX-NEXT: addl %eax, %ecx
|
||||
; AVX-NEXT: movzwl %cx, %ecx
|
||||
; AVX-NEXT: movswl %cx, %edx
|
||||
; AVX-NEXT: shrl $15, %ecx
|
||||
; AVX-NEXT: sarl $6, %edx
|
||||
; AVX-NEXT: addl %ecx, %edx
|
||||
; AVX-NEXT: imull $95, %edx, %ecx
|
||||
; AVX-NEXT: subl %ecx, %eax
|
||||
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
; SSE4-LABEL: dont_fold_srem_power_of_two:
|
||||
; SSE4: # %bb.0:
|
||||
; SSE4-NEXT: movq {{.*#+}} xmm1 = [32769,32769,32769,44151,0,0,0,0]
|
||||
; SSE4-NEXT: pmulhw %xmm0, %xmm1
|
||||
; SSE4-NEXT: paddw %xmm0, %xmm1
|
||||
; SSE4-NEXT: movdqa %xmm1, %xmm2
|
||||
; SSE4-NEXT: psrlw $15, %xmm2
|
||||
; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2048,4096,16384,1024,u,u,u,u]
|
||||
; SSE4-NEXT: paddw %xmm2, %xmm1
|
||||
; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,8,95,u,u,u,u]
|
||||
; SSE4-NEXT: psubw %xmm1, %xmm0
|
||||
; SSE4-NEXT: retq
|
||||
;
|
||||
; AVX1OR2-LABEL: dont_fold_srem_power_of_two:
|
||||
; AVX1OR2: # %bb.0:
|
||||
; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32769,32769,32769,44151,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpaddw %xmm0, %xmm1, %xmm1
|
||||
; AVX1OR2-NEXT: vpsrlw $15, %xmm1, %xmm2
|
||||
; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2048,4096,16384,1024,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1
|
||||
; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX1OR2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: dont_fold_srem_power_of_two:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32769,32769,32769,44151,u,u,u,u]
|
||||
; AVX512-NEXT: vpaddw %xmm0, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm2
|
||||
; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u]
|
||||
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
%1 = srem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
|
||||
ret <4 x i16> %1
|
||||
}
|
||||
|
||||
; Don't fold if the divisor is one.
|
||||
define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
|
||||
; SSE-LABEL: dont_fold_srem_one:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: pextrw $2, %xmm0, %ecx
|
||||
; SSE-NEXT: movswl %cx, %eax
|
||||
; SSE-NEXT: imull $-19945, %eax, %eax # imm = 0xB217
|
||||
; SSE-NEXT: shrl $16, %eax
|
||||
; SSE-NEXT: addl %ecx, %eax
|
||||
; SSE-NEXT: movzwl %ax, %edx
|
||||
; SSE-NEXT: movswl %dx, %eax
|
||||
; SSE-NEXT: shrl $15, %edx
|
||||
; SSE-NEXT: sarl $4, %eax
|
||||
; SSE-NEXT: addl %edx, %eax
|
||||
; SSE-NEXT: leal (%rax,%rax,2), %edx
|
||||
; SSE-NEXT: shll $3, %edx
|
||||
; SSE-NEXT: subl %edx, %eax
|
||||
; SSE-NEXT: addl %ecx, %eax
|
||||
; SSE-NEXT: pextrw $1, %xmm0, %ecx
|
||||
; SSE-NEXT: movswl %cx, %edx
|
||||
; SSE-NEXT: imull $12827, %edx, %edx # imm = 0x321B
|
||||
; SSE-NEXT: movl %edx, %esi
|
||||
; SSE-NEXT: shrl $31, %esi
|
||||
; SSE-NEXT: sarl $23, %edx
|
||||
; SSE-NEXT: addl %esi, %edx
|
||||
; SSE-NEXT: imull $654, %edx, %edx # imm = 0x28E
|
||||
; SSE-NEXT: subl %edx, %ecx
|
||||
; SSE-NEXT: pxor %xmm1, %xmm1
|
||||
; SSE-NEXT: pinsrw $1, %ecx, %xmm1
|
||||
; SSE-NEXT: pinsrw $2, %eax, %xmm1
|
||||
; SSE-NEXT: pextrw $3, %xmm0, %eax
|
||||
; SSE-NEXT: movswl %ax, %ecx
|
||||
; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
|
||||
; SSE-NEXT: movl %ecx, %edx
|
||||
; SSE-NEXT: shrl $31, %edx
|
||||
; SSE-NEXT: sarl $26, %ecx
|
||||
; SSE-NEXT: addl %edx, %ecx
|
||||
; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: pinsrw $3, %eax, %xmm1
|
||||
; SSE-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
; SSE2-LABEL: dont_fold_srem_one:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
|
||||
; SSE2-NEXT: pand %xmm0, %xmm1
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm2 = [0,12827,45591,12375,0,0,0,0]
|
||||
; SSE2-NEXT: pmulhw %xmm0, %xmm2
|
||||
; SSE2-NEXT: paddw %xmm2, %xmm1
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,0,65535,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm4
|
||||
; SSE2-NEXT: pand %xmm3, %xmm4
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm5
|
||||
; SSE2-NEXT: psraw $4, %xmm5
|
||||
; SSE2-NEXT: pandn %xmm5, %xmm3
|
||||
; SSE2-NEXT: por %xmm4, %xmm3
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: pand %xmm4, %xmm3
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm5
|
||||
; SSE2-NEXT: psraw $10, %xmm5
|
||||
; SSE2-NEXT: pandn %xmm5, %xmm4
|
||||
; SSE2-NEXT: por %xmm3, %xmm4
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: pand %xmm3, %xmm4
|
||||
; SSE2-NEXT: psraw $7, %xmm2
|
||||
; SSE2-NEXT: pandn %xmm2, %xmm3
|
||||
; SSE2-NEXT: por %xmm4, %xmm3
|
||||
; SSE2-NEXT: psrlw $15, %xmm1
|
||||
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
|
||||
; SSE2-NEXT: paddw %xmm3, %xmm1
|
||||
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,654,23,5423,u,u,u,u]
|
||||
; SSE2-NEXT: psubw %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: dont_fold_srem_one:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpextrw $2, %xmm0, %eax
|
||||
; AVX-NEXT: movswl %ax, %ecx
|
||||
; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
|
||||
; AVX-NEXT: shrl $16, %ecx
|
||||
; AVX-NEXT: addl %eax, %ecx
|
||||
; AVX-NEXT: movzwl %cx, %ecx
|
||||
; AVX-NEXT: movswl %cx, %edx
|
||||
; AVX-NEXT: shrl $15, %ecx
|
||||
; AVX-NEXT: sarl $4, %edx
|
||||
; AVX-NEXT: addl %ecx, %edx
|
||||
; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
|
||||
; AVX-NEXT: shll $3, %ecx
|
||||
; AVX-NEXT: subl %ecx, %edx
|
||||
; AVX-NEXT: addl %eax, %edx
|
||||
; AVX-NEXT: vpextrw $1, %xmm0, %eax
|
||||
; AVX-NEXT: movswl %ax, %ecx
|
||||
; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B
|
||||
; AVX-NEXT: movl %ecx, %esi
|
||||
; AVX-NEXT: shrl $31, %esi
|
||||
; AVX-NEXT: sarl $23, %ecx
|
||||
; AVX-NEXT: addl %esi, %ecx
|
||||
; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
|
||||
; AVX-NEXT: subl %ecx, %eax
|
||||
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX-NEXT: movswl %ax, %ecx
|
||||
; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
|
||||
; AVX-NEXT: movl %ecx, %edx
|
||||
; AVX-NEXT: shrl $31, %edx
|
||||
; AVX-NEXT: sarl $26, %ecx
|
||||
; AVX-NEXT: addl %edx, %ecx
|
||||
; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
|
||||
; AVX-NEXT: subl %ecx, %eax
|
||||
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
; SSE4-LABEL: dont_fold_srem_one:
|
||||
; SSE4: # %bb.0:
|
||||
; SSE4-NEXT: pxor %xmm1, %xmm1
|
||||
; SSE4-NEXT: pxor %xmm2, %xmm2
|
||||
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4,5,6,7]
|
||||
; SSE4-NEXT: movq {{.*#+}} xmm3 = [0,12827,45591,12375,0,0,0,0]
|
||||
; SSE4-NEXT: pmulhw %xmm0, %xmm3
|
||||
; SSE4-NEXT: paddw %xmm2, %xmm3
|
||||
; SSE4-NEXT: movdqa %xmm3, %xmm2
|
||||
; SSE4-NEXT: psrlw $15, %xmm2
|
||||
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4,5,6,7]
|
||||
; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [u,512,4096,64,u,u,u,u]
|
||||
; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3,4,5,6,7]
|
||||
; SSE4-NEXT: paddw %xmm2, %xmm3
|
||||
; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,654,23,5423,u,u,u,u]
|
||||
; SSE4-NEXT: psubw %xmm3, %xmm0
|
||||
; SSE4-NEXT: retq
|
||||
;
|
||||
; AVX1OR2-LABEL: dont_fold_srem_one:
|
||||
; AVX1OR2: # %bb.0:
|
||||
; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
|
||||
; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 # [0,12827,45591,12375,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpaddw %xmm2, %xmm3, %xmm2
|
||||
; AVX1OR2-NEXT: vpsrlw $15, %xmm2, %xmm3
|
||||
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5,6,7]
|
||||
; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,512,4096,64,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
|
||||
; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1
|
||||
; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX1OR2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: dont_fold_srem_one:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
|
||||
; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 # [0,12827,45591,12375,u,u,u,u]
|
||||
; AVX512-NEXT: vpaddw %xmm2, %xmm3, %xmm2
|
||||
; AVX512-NEXT: vpsrlw $15, %xmm2, %xmm3
|
||||
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5,6,7]
|
||||
; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
|
||||
; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1
|
||||
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
|
||||
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
%1 = srem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
|
||||
ret <4 x i16> %1
|
||||
}
|
||||
|
||||
; Don't fold if the divisor is 2^15.
|
||||
define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
|
||||
; SSE-LABEL: dont_fold_urem_i16_smax:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: pextrw $2, %xmm0, %eax
|
||||
; SSE-NEXT: movswl %ax, %ecx
|
||||
; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
|
||||
; SSE-NEXT: shrl $16, %ecx
|
||||
; SSE-NEXT: addl %eax, %ecx
|
||||
; SSE-NEXT: movzwl %cx, %ecx
|
||||
; SSE-NEXT: movswl %cx, %edx
|
||||
; SSE-NEXT: shrl $15, %ecx
|
||||
; SSE-NEXT: sarl $4, %edx
|
||||
; SSE-NEXT: addl %ecx, %edx
|
||||
; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
|
||||
; SSE-NEXT: shll $3, %ecx
|
||||
; SSE-NEXT: subl %ecx, %edx
|
||||
; SSE-NEXT: addl %eax, %edx
|
||||
; SSE-NEXT: pextrw $1, %xmm0, %eax
|
||||
; SSE-NEXT: leal 32767(%rax), %ecx
|
||||
; SSE-NEXT: testw %ax, %ax
|
||||
; SSE-NEXT: cmovnsl %eax, %ecx
|
||||
; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000
|
||||
; SSE-NEXT: addl %eax, %ecx
|
||||
; SSE-NEXT: pxor %xmm1, %xmm1
|
||||
; SSE-NEXT: pinsrw $1, %ecx, %xmm1
|
||||
; SSE-NEXT: pinsrw $2, %edx, %xmm1
|
||||
; SSE-NEXT: pextrw $3, %xmm0, %eax
|
||||
; SSE-NEXT: movswl %ax, %ecx
|
||||
; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
|
||||
; SSE-NEXT: movl %ecx, %edx
|
||||
; SSE-NEXT: shrl $31, %edx
|
||||
; SSE-NEXT: sarl $26, %ecx
|
||||
; SSE-NEXT: addl %edx, %ecx
|
||||
; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: pinsrw $3, %eax, %xmm1
|
||||
; SSE-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
; SSE2-LABEL: dont_fold_urem_i16_smax:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm2 = [1,65535,1,0,0,0,0,0]
|
||||
; SSE2-NEXT: pmullw %xmm0, %xmm2
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm1 = [0,32767,45591,12375,0,0,0,0]
|
||||
; SSE2-NEXT: pmulhw %xmm0, %xmm1
|
||||
; SSE2-NEXT: paddw %xmm2, %xmm1
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm3
|
||||
; SSE2-NEXT: pand %xmm2, %xmm3
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm4
|
||||
; SSE2-NEXT: psraw $8, %xmm4
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm5
|
||||
; SSE2-NEXT: pandn %xmm4, %xmm5
|
||||
; SSE2-NEXT: por %xmm3, %xmm5
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,0,65535,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: movdqa %xmm5, %xmm4
|
||||
; SSE2-NEXT: pand %xmm3, %xmm4
|
||||
; SSE2-NEXT: psraw $4, %xmm5
|
||||
; SSE2-NEXT: pandn %xmm5, %xmm3
|
||||
; SSE2-NEXT: por %xmm4, %xmm3
|
||||
; SSE2-NEXT: movdqa %xmm3, %xmm4
|
||||
; SSE2-NEXT: pand %xmm2, %xmm4
|
||||
; SSE2-NEXT: psraw $2, %xmm3
|
||||
; SSE2-NEXT: pandn %xmm3, %xmm2
|
||||
; SSE2-NEXT: por %xmm4, %xmm2
|
||||
; SSE2-NEXT: psrlw $15, %xmm1
|
||||
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
|
||||
; SSE2-NEXT: paddw %xmm2, %xmm1
|
||||
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,32768,23,5423,u,u,u,u]
|
||||
; SSE2-NEXT: psubw %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: dont_fold_urem_i16_smax:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpextrw $2, %xmm0, %eax
|
||||
; AVX-NEXT: movswl %ax, %ecx
|
||||
; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217
|
||||
; AVX-NEXT: shrl $16, %ecx
|
||||
; AVX-NEXT: addl %eax, %ecx
|
||||
; AVX-NEXT: movzwl %cx, %ecx
|
||||
; AVX-NEXT: movswl %cx, %edx
|
||||
; AVX-NEXT: shrl $15, %ecx
|
||||
; AVX-NEXT: sarl $4, %edx
|
||||
; AVX-NEXT: addl %ecx, %edx
|
||||
; AVX-NEXT: leal (%rdx,%rdx,2), %ecx
|
||||
; AVX-NEXT: shll $3, %ecx
|
||||
; AVX-NEXT: subl %ecx, %edx
|
||||
; AVX-NEXT: addl %eax, %edx
|
||||
; AVX-NEXT: vpextrw $1, %xmm0, %eax
|
||||
; AVX-NEXT: leal 32767(%rax), %ecx
|
||||
; AVX-NEXT: testw %ax, %ax
|
||||
; AVX-NEXT: cmovnsl %eax, %ecx
|
||||
; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000
|
||||
; AVX-NEXT: addl %eax, %ecx
|
||||
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX-NEXT: movswl %ax, %ecx
|
||||
; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057
|
||||
; AVX-NEXT: movl %ecx, %edx
|
||||
; AVX-NEXT: shrl $31, %edx
|
||||
; AVX-NEXT: sarl $26, %ecx
|
||||
; AVX-NEXT: addl %edx, %ecx
|
||||
; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
|
||||
; AVX-NEXT: subl %ecx, %eax
|
||||
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
; SSE4-LABEL: dont_fold_urem_i16_smax:
|
||||
; SSE4: # %bb.0:
|
||||
; SSE4-NEXT: movq {{.*#+}} xmm1 = [1,65535,1,0,0,0,0,0]
|
||||
; SSE4-NEXT: pmullw %xmm0, %xmm1
|
||||
; SSE4-NEXT: movq {{.*#+}} xmm2 = [0,32767,45591,12375,0,0,0,0]
|
||||
; SSE4-NEXT: pmulhw %xmm0, %xmm2
|
||||
; SSE4-NEXT: paddw %xmm1, %xmm2
|
||||
; SSE4-NEXT: movdqa %xmm2, %xmm3
|
||||
; SSE4-NEXT: psrlw $15, %xmm3
|
||||
; SSE4-NEXT: pxor %xmm4, %xmm4
|
||||
; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1,2,3],xmm4[4,5,6,7]
|
||||
; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,4,4096,64,u,u,u,u]
|
||||
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
|
||||
; SSE4-NEXT: paddw %xmm4, %xmm2
|
||||
; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,32768,23,5423,u,u,u,u]
|
||||
; SSE4-NEXT: psubw %xmm2, %xmm0
|
||||
; SSE4-NEXT: retq
|
||||
;
|
||||
; AVX1OR2-LABEL: dont_fold_urem_i16_smax:
|
||||
; AVX1OR2: # %bb.0:
|
||||
; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,65535,1,0,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,32767,45591,12375,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm2
|
||||
; AVX1OR2-NEXT: vpsrlw $15, %xmm2, %xmm3
|
||||
; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4
|
||||
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3],xmm4[4,5,6,7]
|
||||
; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,4,4096,64,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
|
||||
; AVX1OR2-NEXT: vpaddw %xmm3, %xmm1, %xmm1
|
||||
; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,32768,23,5423,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX1OR2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: dont_fold_urem_i16_smax:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,65535,1,0,u,u,u,u]
|
||||
; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,32767,45591,12375,u,u,u,u]
|
||||
; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1
|
||||
; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm2
|
||||
; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3],xmm3[4,5,6,7]
|
||||
; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,32768,23,5423,u,u,u,u]
|
||||
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
%1 = srem <4 x i16> %x, <i16 1, i16 32768, i16 23, i16 5423>
|
||||
ret <4 x i16> %1
|
||||
}
|
||||
|
||||
@ -6,84 +6,62 @@
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512
|
||||
|
||||
define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
|
||||
; SSE-LABEL: fold_urem_vec_1:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: pextrw $1, %xmm0, %eax
|
||||
; SSE-NEXT: movl %eax, %ecx
|
||||
; SSE-NEXT: shrl $2, %ecx
|
||||
; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
|
||||
; SSE-NEXT: shrl $19, %ecx
|
||||
; SSE-NEXT: imull $124, %ecx, %ecx
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: movd %xmm0, %ecx
|
||||
; SSE-NEXT: movzwl %cx, %edx
|
||||
; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
|
||||
; SSE-NEXT: shrl $22, %edx
|
||||
; SSE-NEXT: imull $95, %edx, %edx
|
||||
; SSE-NEXT: subl %edx, %ecx
|
||||
; SSE-NEXT: movd %ecx, %xmm1
|
||||
; SSE-NEXT: pinsrw $1, %eax, %xmm1
|
||||
; SSE-NEXT: pextrw $2, %xmm0, %eax
|
||||
; SSE-NEXT: movl %eax, %ecx
|
||||
; SSE-NEXT: shrl %ecx
|
||||
; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
|
||||
; SSE-NEXT: shrl $17, %ecx
|
||||
; SSE-NEXT: imull $98, %ecx, %ecx
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: pinsrw $2, %eax, %xmm1
|
||||
; SSE-NEXT: pextrw $3, %xmm0, %eax
|
||||
; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
|
||||
; SSE-NEXT: shrl $16, %ecx
|
||||
; SSE-NEXT: movl %eax, %edx
|
||||
; SSE-NEXT: subl %ecx, %edx
|
||||
; SSE-NEXT: movzwl %dx, %edx
|
||||
; SSE-NEXT: shrl %edx
|
||||
; SSE-NEXT: addl %ecx, %edx
|
||||
; SSE-NEXT: shrl $9, %edx
|
||||
; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: pinsrw $3, %eax, %xmm1
|
||||
; SSE-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
; SSE2-LABEL: fold_urem_vec_1:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: pandn %xmm0, %xmm1
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm2 = [0,0,0,64,0,128,0,0,0,0,0,0,0,0,0,0]
|
||||
; SSE2-NEXT: pmulhuw %xmm0, %xmm2
|
||||
; SSE2-NEXT: por %xmm1, %xmm2
|
||||
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [44151,16913,2675,1373,u,u,u,u]
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm1
|
||||
; SSE2-NEXT: psubw %xmm2, %xmm1
|
||||
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,0,32768,u,u,u,u]
|
||||
; SSE2-NEXT: paddw %xmm2, %xmm1
|
||||
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1024,8192,32768,128,u,u,u,u]
|
||||
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,124,98,1003,u,u,u,u]
|
||||
; SSE2-NEXT: psubw %xmm1, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: fold_urem_vec_1:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpextrw $1, %xmm0, %eax
|
||||
; AVX-NEXT: movl %eax, %ecx
|
||||
; AVX-NEXT: shrl $2, %ecx
|
||||
; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211
|
||||
; AVX-NEXT: shrl $19, %ecx
|
||||
; AVX-NEXT: imull $124, %ecx, %ecx
|
||||
; AVX-NEXT: subl %ecx, %eax
|
||||
; AVX-NEXT: vmovd %xmm0, %ecx
|
||||
; AVX-NEXT: movzwl %cx, %edx
|
||||
; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77
|
||||
; AVX-NEXT: shrl $22, %edx
|
||||
; AVX-NEXT: imull $95, %edx, %edx
|
||||
; AVX-NEXT: subl %edx, %ecx
|
||||
; AVX-NEXT: vmovd %ecx, %xmm1
|
||||
; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpextrw $2, %xmm0, %eax
|
||||
; AVX-NEXT: movl %eax, %ecx
|
||||
; AVX-NEXT: shrl %ecx
|
||||
; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73
|
||||
; AVX-NEXT: shrl $17, %ecx
|
||||
; AVX-NEXT: imull $98, %ecx, %ecx
|
||||
; AVX-NEXT: subl %ecx, %eax
|
||||
; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D
|
||||
; AVX-NEXT: shrl $16, %ecx
|
||||
; AVX-NEXT: movl %eax, %edx
|
||||
; AVX-NEXT: subl %ecx, %edx
|
||||
; AVX-NEXT: movzwl %dx, %edx
|
||||
; AVX-NEXT: shrl %edx
|
||||
; AVX-NEXT: addl %ecx, %edx
|
||||
; AVX-NEXT: shrl $9, %edx
|
||||
; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB
|
||||
; AVX-NEXT: subl %ecx, %eax
|
||||
; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
; SSE4-LABEL: fold_urem_vec_1:
|
||||
; SSE4: # %bb.0:
|
||||
; SSE4-NEXT: movq {{.*#+}} xmm1 = [0,16384,32768,0,0,0,0,0]
|
||||
; SSE4-NEXT: pmulhuw %xmm0, %xmm1
|
||||
; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
|
||||
; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [44151,16913,2675,1373,u,u,u,u]
|
||||
; SSE4-NEXT: movdqa %xmm0, %xmm2
|
||||
; SSE4-NEXT: psubw %xmm1, %xmm2
|
||||
; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,32768,u,u,u,u]
|
||||
; SSE4-NEXT: paddw %xmm1, %xmm2
|
||||
; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1024,8192,32768,128,u,u,u,u]
|
||||
; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [95,124,98,1003,u,u,u,u]
|
||||
; SSE4-NEXT: psubw %xmm2, %xmm0
|
||||
; SSE4-NEXT: retq
|
||||
;
|
||||
; AVX1OR2-LABEL: fold_urem_vec_1:
|
||||
; AVX1OR2: # %bb.0:
|
||||
; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,16384,32768,u,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
|
||||
; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [44151,16913,2675,1373,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm2
|
||||
; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,32768,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1
|
||||
; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1024,8192,32768,128,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,124,98,1003,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX1OR2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: fold_urem_vec_1:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
|
||||
; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [44151,16913,2675,1373,u,u,u,u]
|
||||
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm2
|
||||
; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,32768,u,u,u,u]
|
||||
; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1
|
||||
; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,124,98,1003,u,u,u,u]
|
||||
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
%1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
|
||||
ret <4 x i16> %1
|
||||
}
|
||||
@ -91,18 +69,18 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
|
||||
define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
|
||||
; SSE-LABEL: fold_urem_vec_2:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
|
||||
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
|
||||
; SSE-NEXT: pmulhuw %xmm0, %xmm1
|
||||
; SSE-NEXT: psrlw $6, %xmm1
|
||||
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,95,95,95,95]
|
||||
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,u,u,u,u]
|
||||
; SSE-NEXT: psubw %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: fold_urem_vec_2:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
|
||||
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u]
|
||||
; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,95,95,95,95]
|
||||
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,u,u,u,u]
|
||||
; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
|
||||
@ -114,10 +92,10 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
|
||||
define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
|
||||
; SSE2-LABEL: combine_urem_udiv:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
|
||||
; SSE2-NEXT: pmulhuw %xmm0, %xmm1
|
||||
; SSE2-NEXT: psrlw $6, %xmm1
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,u,u,u,u]
|
||||
; SSE2-NEXT: pmullw %xmm1, %xmm2
|
||||
; SSE2-NEXT: psubw %xmm2, %xmm0
|
||||
; SSE2-NEXT: paddw %xmm1, %xmm0
|
||||
@ -125,7 +103,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
|
||||
;
|
||||
; SSE4-LABEL: combine_urem_udiv:
|
||||
; SSE4: # %bb.0:
|
||||
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151]
|
||||
; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u]
|
||||
; SSE4-NEXT: pmulhuw %xmm0, %xmm1
|
||||
; SSE4-NEXT: psrlw $6, %xmm1
|
||||
; SSE4-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95]
|
||||
@ -136,9 +114,9 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
|
||||
;
|
||||
; AVX-LABEL: combine_urem_udiv:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151]
|
||||
; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u]
|
||||
; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,95,95,95,95]
|
||||
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,u,u,u,u]
|
||||
; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
@ -152,92 +130,44 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
|
||||
define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
|
||||
; SSE2-LABEL: dont_fold_urem_power_of_two:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63]
|
||||
; SSE2-NEXT: pand %xmm0, %xmm1
|
||||
; SSE2-NEXT: pextrw $1, %xmm0, %eax
|
||||
; SSE2-NEXT: andl $31, %eax
|
||||
; SSE2-NEXT: pinsrw $1, %eax, %xmm1
|
||||
; SSE2-NEXT: pextrw $2, %xmm0, %eax
|
||||
; SSE2-NEXT: andl $7, %eax
|
||||
; SSE2-NEXT: pinsrw $2, %eax, %xmm1
|
||||
; SSE2-NEXT: pextrw $3, %xmm0, %eax
|
||||
; SSE2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
|
||||
; SSE2-NEXT: shrl $22, %ecx
|
||||
; SSE2-NEXT: imull $95, %ecx, %ecx
|
||||
; SSE2-NEXT: subl %ecx, %eax
|
||||
; SSE2-NEXT: pinsrw $3, %eax, %xmm1
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm1 = [1024,2048,8192,44151,0,0,0,0]
|
||||
; SSE2-NEXT: pmulhuw %xmm0, %xmm1
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm3
|
||||
; SSE2-NEXT: pand %xmm2, %xmm3
|
||||
; SSE2-NEXT: psrlw $6, %xmm1
|
||||
; SSE2-NEXT: pandn %xmm1, %xmm2
|
||||
; SSE2-NEXT: por %xmm3, %xmm2
|
||||
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,32,8,95,u,u,u,u]
|
||||
; SSE2-NEXT: psubw %xmm2, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE4-LABEL: dont_fold_urem_power_of_two:
|
||||
; SSE4: # %bb.0:
|
||||
; SSE4-NEXT: pmovsxbd {{.*#+}} xmm1 = [63,63,63,63]
|
||||
; SSE4-NEXT: pand %xmm0, %xmm1
|
||||
; SSE4-NEXT: pextrw $1, %xmm0, %eax
|
||||
; SSE4-NEXT: andl $31, %eax
|
||||
; SSE4-NEXT: pinsrw $1, %eax, %xmm1
|
||||
; SSE4-NEXT: pextrw $2, %xmm0, %eax
|
||||
; SSE4-NEXT: andl $7, %eax
|
||||
; SSE4-NEXT: pinsrw $2, %eax, %xmm1
|
||||
; SSE4-NEXT: pextrw $3, %xmm0, %eax
|
||||
; SSE4-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
|
||||
; SSE4-NEXT: shrl $22, %ecx
|
||||
; SSE4-NEXT: imull $95, %ecx, %ecx
|
||||
; SSE4-NEXT: subl %ecx, %eax
|
||||
; SSE4-NEXT: pinsrw $3, %eax, %xmm1
|
||||
; SSE4-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE4-NEXT: movq {{.*#+}} xmm1 = [1024,2048,8192,44151,0,0,0,0]
|
||||
; SSE4-NEXT: pmulhuw %xmm0, %xmm1
|
||||
; SSE4-NEXT: movdqa %xmm1, %xmm2
|
||||
; SSE4-NEXT: psrlw $6, %xmm2
|
||||
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7]
|
||||
; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,32,8,95,u,u,u,u]
|
||||
; SSE4-NEXT: psubw %xmm2, %xmm0
|
||||
; SSE4-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: dont_fold_urem_power_of_two:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
|
||||
; AVX1-NEXT: vpextrw $1, %xmm0, %eax
|
||||
; AVX1-NEXT: andl $31, %eax
|
||||
; AVX1-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpextrw $2, %xmm0, %eax
|
||||
; AVX1-NEXT: andl $7, %eax
|
||||
; AVX1-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX1-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
|
||||
; AVX1-NEXT: shrl $22, %ecx
|
||||
; AVX1-NEXT: imull $95, %ecx, %ecx
|
||||
; AVX1-NEXT: subl %ecx, %eax
|
||||
; AVX1-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: dont_fold_urem_power_of_two:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63]
|
||||
; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
|
||||
; AVX2-NEXT: vpextrw $1, %xmm0, %eax
|
||||
; AVX2-NEXT: andl $31, %eax
|
||||
; AVX2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpextrw $2, %xmm0, %eax
|
||||
; AVX2-NEXT: andl $7, %eax
|
||||
; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
|
||||
; AVX2-NEXT: shrl $22, %ecx
|
||||
; AVX2-NEXT: imull $95, %ecx, %ecx
|
||||
; AVX2-NEXT: subl %ecx, %eax
|
||||
; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
|
||||
; AVX2-NEXT: retq
|
||||
; AVX1OR2-LABEL: dont_fold_urem_power_of_two:
|
||||
; AVX1OR2: # %bb.0:
|
||||
; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1024,2048,8192,44151,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpsrlw $6, %xmm1, %xmm2
|
||||
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7]
|
||||
; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX1OR2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: dont_fold_urem_power_of_two:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1
|
||||
; AVX512-NEXT: vpextrw $1, %xmm0, %eax
|
||||
; AVX512-NEXT: andl $31, %eax
|
||||
; AVX512-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpextrw $2, %xmm0, %eax
|
||||
; AVX512-NEXT: andl $7, %eax
|
||||
; AVX512-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX512-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77
|
||||
; AVX512-NEXT: shrl $22, %ecx
|
||||
; AVX512-NEXT: imull $95, %ecx, %ecx
|
||||
; AVX512-NEXT: subl %ecx, %eax
|
||||
; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1024,2048,8192,44151,u,u,u,u]
|
||||
; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u]
|
||||
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
%1 = urem <4 x i16> %x, <i16 64, i16 32, i16 8, i16 95>
|
||||
ret <4 x i16> %1
|
||||
@ -245,98 +175,58 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
|
||||
|
||||
; Don't fold if the divisor is one.
|
||||
define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
|
||||
; SSE-LABEL: dont_fold_urem_one:
|
||||
; SSE: # %bb.0:
|
||||
; SSE-NEXT: pextrw $2, %xmm0, %eax
|
||||
; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
|
||||
; SSE-NEXT: shrl $16, %ecx
|
||||
; SSE-NEXT: movl %eax, %edx
|
||||
; SSE-NEXT: subl %ecx, %edx
|
||||
; SSE-NEXT: movzwl %dx, %edx
|
||||
; SSE-NEXT: shrl %edx
|
||||
; SSE-NEXT: addl %ecx, %edx
|
||||
; SSE-NEXT: shrl $4, %edx
|
||||
; SSE-NEXT: leal (%rdx,%rdx,2), %ecx
|
||||
; SSE-NEXT: shll $3, %ecx
|
||||
; SSE-NEXT: subl %ecx, %edx
|
||||
; SSE-NEXT: addl %eax, %edx
|
||||
; SSE-NEXT: pextrw $1, %xmm0, %eax
|
||||
; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
|
||||
; SSE-NEXT: shrl $25, %ecx
|
||||
; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: pxor %xmm1, %xmm1
|
||||
; SSE-NEXT: pinsrw $1, %eax, %xmm1
|
||||
; SSE-NEXT: pinsrw $2, %edx, %xmm1
|
||||
; SSE-NEXT: pextrw $3, %xmm0, %eax
|
||||
; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
|
||||
; SSE-NEXT: shrl $26, %ecx
|
||||
; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
|
||||
; SSE-NEXT: subl %ecx, %eax
|
||||
; SSE-NEXT: pinsrw $3, %eax, %xmm1
|
||||
; SSE-NEXT: movdqa %xmm1, %xmm0
|
||||
; SSE-NEXT: retq
|
||||
; SSE2-LABEL: dont_fold_urem_one:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
|
||||
; SSE2-NEXT: pandn %xmm0, %xmm1
|
||||
; SSE2-NEXT: movq {{.*#+}} xmm2 = [0,51307,25645,12375,0,0,0,0]
|
||||
; SSE2-NEXT: pmulhuw %xmm0, %xmm2
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm3
|
||||
; SSE2-NEXT: psubw %xmm2, %xmm3
|
||||
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [u,0,32768,0,u,u,u,u]
|
||||
; SSE2-NEXT: paddw %xmm2, %xmm3
|
||||
; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,128,0,0,16,64,0,u,u,u,u,u,u,u,u]
|
||||
; SSE2-NEXT: por %xmm1, %xmm3
|
||||
; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,654,23,5423,u,u,u,u]
|
||||
; SSE2-NEXT: psubw %xmm3, %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE4-LABEL: dont_fold_urem_one:
|
||||
; SSE4: # %bb.0:
|
||||
; SSE4-NEXT: movq {{.*#+}} xmm1 = [0,51307,25645,12375,0,0,0,0]
|
||||
; SSE4-NEXT: pmulhuw %xmm0, %xmm1
|
||||
; SSE4-NEXT: movdqa %xmm0, %xmm2
|
||||
; SSE4-NEXT: psubw %xmm1, %xmm2
|
||||
; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,0,32768,0,u,u,u,u]
|
||||
; SSE4-NEXT: paddw %xmm1, %xmm2
|
||||
; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,128,4096,64,u,u,u,u]
|
||||
; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7]
|
||||
; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,654,23,5423,u,u,u,u]
|
||||
; SSE4-NEXT: psubw %xmm2, %xmm0
|
||||
; SSE4-NEXT: retq
|
||||
;
|
||||
; AVX1OR2-LABEL: dont_fold_urem_one:
|
||||
; AVX1OR2: # %bb.0:
|
||||
; AVX1OR2-NEXT: vpextrw $2, %xmm0, %eax
|
||||
; AVX1OR2-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
|
||||
; AVX1OR2-NEXT: shrl $16, %ecx
|
||||
; AVX1OR2-NEXT: movl %eax, %edx
|
||||
; AVX1OR2-NEXT: subl %ecx, %edx
|
||||
; AVX1OR2-NEXT: movzwl %dx, %edx
|
||||
; AVX1OR2-NEXT: shrl %edx
|
||||
; AVX1OR2-NEXT: addl %ecx, %edx
|
||||
; AVX1OR2-NEXT: shrl $4, %edx
|
||||
; AVX1OR2-NEXT: leal (%rdx,%rdx,2), %ecx
|
||||
; AVX1OR2-NEXT: shll $3, %ecx
|
||||
; AVX1OR2-NEXT: subl %ecx, %edx
|
||||
; AVX1OR2-NEXT: addl %eax, %edx
|
||||
; AVX1OR2-NEXT: vpextrw $1, %xmm0, %eax
|
||||
; AVX1OR2-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B
|
||||
; AVX1OR2-NEXT: shrl $25, %ecx
|
||||
; AVX1OR2-NEXT: imull $654, %ecx, %ecx # imm = 0x28E
|
||||
; AVX1OR2-NEXT: subl %ecx, %eax
|
||||
; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX1OR2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
|
||||
; AVX1OR2-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
|
||||
; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX1OR2-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
|
||||
; AVX1OR2-NEXT: shrl $26, %ecx
|
||||
; AVX1OR2-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
|
||||
; AVX1OR2-NEXT: subl %ecx, %eax
|
||||
; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
|
||||
; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,51307,25645,12375,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm2
|
||||
; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,0,32768,0,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1
|
||||
; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,128,4096,64,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
|
||||
; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
|
||||
; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX1OR2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: dont_fold_urem_one:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vpextrw $2, %xmm0, %eax
|
||||
; AVX512-NEXT: imull $25645, %eax, %ecx # imm = 0x642D
|
||||
; AVX512-NEXT: shrl $16, %ecx
|
||||
; AVX512-NEXT: movl %eax, %edx
|
||||
; AVX512-NEXT: subl %ecx, %edx
|
||||
; AVX512-NEXT: movzwl %dx, %edx
|
||||
; AVX512-NEXT: shrl %edx
|
||||
; AVX512-NEXT: addl %ecx, %edx
|
||||
; AVX512-NEXT: shrl $4, %edx
|
||||
; AVX512-NEXT: leal (%rdx,%rdx,2), %ecx
|
||||
; AVX512-NEXT: shll $3, %ecx
|
||||
; AVX512-NEXT: subl %ecx, %edx
|
||||
; AVX512-NEXT: vpextrw $1, %xmm0, %ecx
|
||||
; AVX512-NEXT: addl %eax, %edx
|
||||
; AVX512-NEXT: imull $51307, %ecx, %eax # imm = 0xC86B
|
||||
; AVX512-NEXT: shrl $25, %eax
|
||||
; AVX512-NEXT: imull $654, %eax, %eax # imm = 0x28E
|
||||
; AVX512-NEXT: subl %eax, %ecx
|
||||
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpextrw $3, %xmm0, %eax
|
||||
; AVX512-NEXT: imull $12375, %eax, %ecx # imm = 0x3057
|
||||
; AVX512-NEXT: shrl $26, %ecx
|
||||
; AVX512-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F
|
||||
; AVX512-NEXT: subl %ecx, %eax
|
||||
; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0
|
||||
; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,51307,25645,12375,u,u,u,u]
|
||||
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm2
|
||||
; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,0,32768,0,u,u,u,u]
|
||||
; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1
|
||||
; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
|
||||
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
|
||||
; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u]
|
||||
; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: retq
|
||||
%1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
|
||||
ret <4 x i16> %1
|
||||
|
||||
@ -6,7 +6,7 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind {
|
||||
; X64-LABEL: test_udiv7_v2i32:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
||||
; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
|
||||
; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u]
|
||||
; X64-NEXT: movdqa %xmm0, %xmm2
|
||||
; X64-NEXT: pmuludq %xmm1, %xmm2
|
||||
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
|
||||
@ -26,7 +26,7 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind {
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
||||
; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
|
||||
; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u]
|
||||
; X86-NEXT: movdqa %xmm0, %xmm2
|
||||
; X86-NEXT: pmuludq %xmm1, %xmm2
|
||||
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
|
||||
@ -51,7 +51,7 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind {
|
||||
; X64-LABEL: test_urem7_v2i32:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
||||
; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
|
||||
; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u]
|
||||
; X64-NEXT: movdqa %xmm0, %xmm2
|
||||
; X64-NEXT: pmuludq %xmm1, %xmm2
|
||||
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
|
||||
@ -76,7 +76,7 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind {
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
||||
; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
|
||||
; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u]
|
||||
; X86-NEXT: movdqa %xmm0, %xmm2
|
||||
; X86-NEXT: pmuludq %xmm1, %xmm2
|
||||
; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
|
||||
@ -106,7 +106,7 @@ define void @test_sdiv7_v2i32(ptr %x, ptr %y) nounwind {
|
||||
; X64-LABEL: test_sdiv7_v2i32:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
||||
; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,u,u]
|
||||
; X64-NEXT: movdqa %xmm0, %xmm2
|
||||
; X64-NEXT: pmuludq %xmm1, %xmm2
|
||||
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
|
||||
@ -132,7 +132,7 @@ define void @test_sdiv7_v2i32(ptr %x, ptr %y) nounwind {
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
|
||||
; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,u,u]
|
||||
; X86-NEXT: movdqa %xmm1, %xmm0
|
||||
; X86-NEXT: pmuludq %xmm2, %xmm0
|
||||
; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
|
||||
@ -163,7 +163,7 @@ define void @test_srem7_v2i32(ptr %x, ptr %y) nounwind {
|
||||
; X64-LABEL: test_srem7_v2i32:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
||||
; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,u,u]
|
||||
; X64-NEXT: movdqa %xmm0, %xmm2
|
||||
; X64-NEXT: pmuludq %xmm1, %xmm2
|
||||
; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
|
||||
@ -193,7 +193,7 @@ define void @test_srem7_v2i32(ptr %x, ptr %y) nounwind {
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
|
||||
; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027]
|
||||
; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,u,u]
|
||||
; X86-NEXT: movdqa %xmm0, %xmm1
|
||||
; X86-NEXT: pmuludq %xmm2, %xmm1
|
||||
; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user