diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f6e70e52b2ca..c646d5220749 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -5636,7 +5636,7 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) { // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c) if (isConstantOrConstantVector(N1, /*NoOpaques=*/true, /*AllowTruncation=*/true) && - hasOperation(ISD::SRL, VT)) { + (!LegalOperations || hasOperation(ISD::SRL, VT))) { if (SDValue LogBase2 = BuildLogBase2(N1, DL)) { unsigned NumEltBits = VT.getScalarSizeInBits(); SDValue SRLAmt = DAG.getNode( diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 9efbc62caa1e..2e55c859f8f0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6612,9 +6612,18 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG, // Check to see if we can do this. // FIXME: We should be more aggressive here. - if (!isTypeLegal(VT)) { + EVT QueryVT = VT; + if (VT.isVector()) { + // If the vector type will be legalized to a vector type with the same + // element type, allow the transform before type legalization if MULHS or + // SMUL_LOHI are supported. + QueryVT = getLegalTypeToTransformTo(*DAG.getContext(), VT); + if (!QueryVT.isVector() || + QueryVT.getVectorElementType() != VT.getVectorElementType()) + return SDValue(); + } else if (!isTypeLegal(VT)) { // Limit this to simple scalars for now. - if (VT.isVector() || !VT.isSimple()) + if (!VT.isSimple()) return SDValue(); // If this type will be promoted to a large enough type with a legal @@ -6628,11 +6637,12 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG, return SDValue(); } - bool HasMULHS = isOperationLegalOrCustom(ISD::MULHS, VT, IsAfterLegalization); + bool HasMULHS = + isOperationLegalOrCustom(ISD::MULHS, QueryVT, IsAfterLegalization); bool HasSMUL_LOHI = - isOperationLegalOrCustom(ISD::SMUL_LOHI, VT, IsAfterLegalization); + isOperationLegalOrCustom(ISD::SMUL_LOHI, QueryVT, IsAfterLegalization); - if (!HasMULHS && !HasSMUL_LOHI && MulVT == EVT()) { + if (isTypeLegal(VT) && !HasMULHS && !HasSMUL_LOHI && MulVT == EVT()) { // If type twice as wide legal, widen and use a mul plus a shift. EVT WideVT = VT.widenIntegerElementType(*DAG.getContext()); // Some targets like AMDGPU try to go from SDIV to SDIVREM which is then @@ -6791,9 +6801,18 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, // Check to see if we can do this. // FIXME: We should be more aggressive here. - if (!isTypeLegal(VT)) { + EVT QueryVT = VT; + if (VT.isVector()) { + // If the vector type will be legalized to a vector type with the same + // element type, allow the transform before type legalization if MULHU or + // UMUL_LOHI are supported. + QueryVT = getLegalTypeToTransformTo(*DAG.getContext(), VT); + if (!QueryVT.isVector() || + QueryVT.getVectorElementType() != VT.getVectorElementType()) + return SDValue(); + } else if (!isTypeLegal(VT)) { // Limit this to simple scalars for now. - if (VT.isVector() || !VT.isSimple()) + if (!VT.isSimple()) return SDValue(); // If this type will be promoted to a large enough type with a legal @@ -6807,14 +6826,15 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG, return SDValue(); } - bool HasMULHU = isOperationLegalOrCustom(ISD::MULHU, VT, IsAfterLegalization); + bool HasMULHU = + isOperationLegalOrCustom(ISD::MULHU, QueryVT, IsAfterLegalization); bool HasUMUL_LOHI = - isOperationLegalOrCustom(ISD::UMUL_LOHI, VT, IsAfterLegalization); + isOperationLegalOrCustom(ISD::UMUL_LOHI, QueryVT, IsAfterLegalization); - if (!HasMULHU && !HasUMUL_LOHI && MulVT == EVT()) { + if (isTypeLegal(VT) && !HasMULHU && !HasUMUL_LOHI && MulVT == EVT()) { // If type twice as wide legal, widen and use a mul plus a shift. EVT WideVT = VT.widenIntegerElementType(*DAG.getContext()); - // Some targets like AMDGPU try to go from SDIV to SDIVREM which is then + // Some targets like AMDGPU try to go from UDIV to UDIVREM which is then // custom lowered. This is very expensive so avoid it at all costs for // constant divisors. if ((!IsAfterLegalTypes && isOperationExpand(ISD::UDIV, VT) && diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index b3667c6e17e6..1c6b241cb8f1 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -2334,23 +2334,16 @@ define <3 x i32> @sv3i32_7(<3 x i32> %d, <3 x i32> %e) { ; CHECK-SD-LABEL: sv3i32_7: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: mov w8, #9363 // =0x2493 -; CHECK-SD-NEXT: mov w9, v0.s[2] -; CHECK-SD-NEXT: movi v3.2s, #7 +; CHECK-SD-NEXT: movi v3.4s, #7 ; CHECK-SD-NEXT: movk w8, #37449, lsl #16 -; CHECK-SD-NEXT: dup v1.2s, w8 -; CHECK-SD-NEXT: smull x8, w9, w8 +; CHECK-SD-NEXT: dup v1.4s, w8 +; CHECK-SD-NEXT: smull2 v2.2d, v0.4s, v1.4s ; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-SD-NEXT: add x8, x9, x8, lsr #32 -; CHECK-SD-NEXT: asr w10, w8, #2 -; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 -; CHECK-SD-NEXT: add w8, w10, w8, lsr #31 -; CHECK-SD-NEXT: add v1.2s, v1.2s, v0.2s -; CHECK-SD-NEXT: sub w8, w8, w8, lsl #3 -; CHECK-SD-NEXT: add w8, w9, w8 -; CHECK-SD-NEXT: sshr v2.2s, v1.2s, #2 -; CHECK-SD-NEXT: usra v2.2s, v1.2s, #31 -; CHECK-SD-NEXT: mls v0.2s, v2.2s, v3.2s -; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: sshr v2.4s, v1.4s, #2 +; CHECK-SD-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-SD-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sv3i32_7: @@ -2386,21 +2379,15 @@ define <3 x i32> @sv3i32_100(<3 x i32> %d, <3 x i32> %e) { ; CHECK-SD-LABEL: sv3i32_100: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: mov w8, #34079 // =0x851f -; CHECK-SD-NEXT: mov w9, v0.s[2] -; CHECK-SD-NEXT: movi v2.2s, #100 +; CHECK-SD-NEXT: movi v3.4s, #100 ; CHECK-SD-NEXT: movk w8, #20971, lsl #16 -; CHECK-SD-NEXT: dup v1.2s, w8 -; CHECK-SD-NEXT: smull x8, w9, w8 +; CHECK-SD-NEXT: dup v1.4s, w8 +; CHECK-SD-NEXT: smull2 v2.2d, v0.4s, v1.4s ; CHECK-SD-NEXT: smull v1.2d, v0.2s, v1.2s -; CHECK-SD-NEXT: asr x10, x8, #37 -; CHECK-SD-NEXT: add x8, x10, x8, lsr #63 -; CHECK-SD-NEXT: mov w10, #100 // =0x64 -; CHECK-SD-NEXT: sshr v1.2d, v1.2d, #37 -; CHECK-SD-NEXT: msub w8, w8, w10, w9 -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: usra v1.2s, v1.2s, #31 -; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s -; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: sshr v2.4s, v1.4s, #5 +; CHECK-SD-NEXT: usra v2.4s, v1.4s, #31 +; CHECK-SD-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: sv3i32_100: @@ -2560,26 +2547,16 @@ define <3 x i32> @uv3i32_7(<3 x i32> %d, <3 x i32> %e) { ; CHECK-SD-LABEL: uv3i32_7: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: mov w8, #18725 // =0x4925 -; CHECK-SD-NEXT: mov x9, #2684354560 // =0xa0000000 ; CHECK-SD-NEXT: movk w8, #9362, lsl #16 -; CHECK-SD-NEXT: movk x9, #18724, lsl #32 -; CHECK-SD-NEXT: dup v1.2s, w8 -; CHECK-SD-NEXT: mov w8, v0.s[2] -; CHECK-SD-NEXT: movk x9, #9362, lsl #48 +; CHECK-SD-NEXT: dup v1.4s, w8 +; CHECK-SD-NEXT: umull2 v2.2d, v0.4s, v1.4s ; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-SD-NEXT: umulh x9, x8, x9 -; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #32 -; CHECK-SD-NEXT: sub w9, w9, w9, lsl #3 -; CHECK-SD-NEXT: sub v2.2s, v0.2s, v1.2s -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 def $q0 -; CHECK-SD-NEXT: add w8, w8, w9 -; CHECK-SD-NEXT: ushll v2.2d, v2.2s, #0 -; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #1 -; CHECK-SD-NEXT: add v1.2s, v2.2s, v1.2s -; CHECK-SD-NEXT: movi v2.2s, #7 -; CHECK-SD-NEXT: ushr v1.2s, v1.2s, #2 -; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s -; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: sub v2.4s, v0.4s, v1.4s +; CHECK-SD-NEXT: usra v1.4s, v2.4s, #1 +; CHECK-SD-NEXT: movi v2.4s, #7 +; CHECK-SD-NEXT: ushr v1.4s, v1.4s, #2 +; CHECK-SD-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: uv3i32_7: @@ -2630,19 +2607,14 @@ define <3 x i32> @uv3i32_100(<3 x i32> %d, <3 x i32> %e) { ; CHECK-SD-LABEL: uv3i32_100: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: mov w8, #34079 // =0x851f -; CHECK-SD-NEXT: mov w9, v0.s[2] -; CHECK-SD-NEXT: movi v2.2s, #100 ; CHECK-SD-NEXT: movk w8, #20971, lsl #16 -; CHECK-SD-NEXT: mov w10, #100 // =0x64 -; CHECK-SD-NEXT: dup v1.2s, w8 -; CHECK-SD-NEXT: umull x8, w9, w8 +; CHECK-SD-NEXT: dup v1.4s, w8 +; CHECK-SD-NEXT: umull2 v2.2d, v0.4s, v1.4s ; CHECK-SD-NEXT: umull v1.2d, v0.2s, v1.2s -; CHECK-SD-NEXT: lsr x8, x8, #37 -; CHECK-SD-NEXT: msub w8, w8, w10, w9 -; CHECK-SD-NEXT: ushr v1.2d, v1.2d, #37 -; CHECK-SD-NEXT: xtn v1.2s, v1.2d -; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.2s -; CHECK-SD-NEXT: mov v0.s[2], w8 +; CHECK-SD-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-SD-NEXT: movi v2.4s, #100 +; CHECK-SD-NEXT: ushr v1.4s, v1.4s, #5 +; CHECK-SD-NEXT: mls v0.4s, v1.4s, v2.4s ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: uv3i32_100: diff --git a/llvm/test/CodeGen/LoongArch/lasx/issue170976.ll b/llvm/test/CodeGen/LoongArch/lasx/issue170976.ll index a5aa5c8d6d0b..6cdc2794605b 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/issue170976.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/issue170976.ll @@ -51,9 +51,12 @@ entry: define <8 x i64> @test_i64(<8 x i64> %shuffle) { ; LA32-LABEL: test_i64: ; LA32: # %bb.0: # %entry -; LA32-NEXT: xvrepli.d $xr2, 3 -; LA32-NEXT: xvdiv.du $xr0, $xr0, $xr2 -; LA32-NEXT: xvdiv.du $xr1, $xr1, $xr2 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) +; LA32-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI3_0) +; LA32-NEXT: xvmuh.du $xr0, $xr0, $xr2 +; LA32-NEXT: xvsrli.d $xr0, $xr0, 1 +; LA32-NEXT: xvmuh.du $xr1, $xr1, $xr2 +; LA32-NEXT: xvsrli.d $xr1, $xr1, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: test_i64: diff --git a/llvm/test/CodeGen/LoongArch/lsx/issue170976.ll b/llvm/test/CodeGen/LoongArch/lsx/issue170976.ll index df4da0178f38..a8e4c2a19b58 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/issue170976.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/issue170976.ll @@ -51,9 +51,12 @@ entry: define <4 x i64> @test_i64(<4 x i64> %shuffle) { ; LA32-LABEL: test_i64: ; LA32: # %bb.0: # %entry -; LA32-NEXT: vrepli.d $vr2, 3 -; LA32-NEXT: vdiv.du $vr0, $vr0, $vr2 -; LA32-NEXT: vdiv.du $vr1, $vr1, $vr2 +; LA32-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0) +; LA32-NEXT: vld $vr2, $a0, %pc_lo12(.LCPI3_0) +; LA32-NEXT: vmuh.du $vr0, $vr0, $vr2 +; LA32-NEXT: vsrli.d $vr0, $vr0, 1 +; LA32-NEXT: vmuh.du $vr1, $vr1, $vr2 +; LA32-NEXT: vsrli.d $vr1, $vr1, 1 ; LA32-NEXT: ret ; ; LA64-LABEL: test_i64: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 7cb00d40e60c..9e2daa4a9a06 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1141,10 +1141,25 @@ define void @mulhu_v6i16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v9, 0 ; CHECK-NEXT: lui a1, %hi(.LCPI67_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI67_0) -; CHECK-NEXT: vle16.v v9, (a1) -; CHECK-NEXT: vdivu.vv v8, v8, v9 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: lui a1, 1048568 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, tu, ma +; CHECK-NEXT: vmv.s.x v9, a1 +; CHECK-NEXT: li a1, 33 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vmulhu.vv v10, v8, v10 +; CHECK-NEXT: vsub.vv v8, v8, v10 +; CHECK-NEXT: vmulhu.vv v8, v8, v9 +; CHECK-NEXT: vmv.s.x v0, a1 +; CHECK-NEXT: vadd.vv v8, v8, v10 +; CHECK-NEXT: vmv.v.i v9, 3 +; CHECK-NEXT: vmerge.vim v9, v9, 2, v0 +; CHECK-NEXT: vsrl.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x @@ -1287,9 +1302,16 @@ define void @mulhs_v6i16(ptr %x) { ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: li a1, 22 ; CHECK-NEXT: vmv.s.x v0, a1 -; CHECK-NEXT: vmv.v.i v9, -7 -; CHECK-NEXT: vmerge.vim v9, v9, 7, v0 -; CHECK-NEXT: vdiv.vv v8, v8, v9 +; CHECK-NEXT: lui a1, 1048571 +; CHECK-NEXT: addi a1, a1, 1755 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: lui a1, 5 +; CHECK-NEXT: addi a1, a1, -1755 +; CHECK-NEXT: vmerge.vxm v9, v9, a1, v0 +; CHECK-NEXT: vmulh.vv v8, v8, v9 +; CHECK-NEXT: vsra.vi v8, v8, 1 +; CHECK-NEXT: vsrl.vi v9, v8, 15 +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <6 x i16>, ptr %x diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 6f1f1fcb647f..1931a45155cf 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2213,121 +2213,186 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-SSE2-LABEL: pr51133: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: movq %rdi, %rax -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm5 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [9,u,0,u,41,u,183,u,1,u,1,u,161,u,221,u] -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm5 -; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm6 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [171,u,103,u,183,u,171,u,61,u,1,u,127,u,183,u] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm6 -; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 -; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [128,1,128,1,128,32,1,1] -; CHECK-SSE2-NEXT: psrlw $8, %xmm5 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [1,1,1,128,64,2,1,32] -; CHECK-SSE2-NEXT: psrlw $8, %xmm6 -; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2] -; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm7 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm4 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,u,1,u,0,u,1,u,1,u,1,u,0,u,1,u] +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm4 +; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm7 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,u,0,u,1,u,0,u,0,u,255,u,0,u,1,u] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm7 +; CHECK-SSE2-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE2-NEXT: pxor %xmm8, %xmm8 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm1[8],xmm8[9],xmm1[9],xmm8[10],xmm1[10],xmm8[11],xmm1[11],xmm8[12],xmm1[12],xmm8[13],xmm1[13],xmm8[14],xmm1[14],xmm8[15],xmm1[15] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 # [0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141] +; CHECK-SSE2-NEXT: psrlw $8, %xmm8 ; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6 -; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm1 -; CHECK-SSE2-NEXT: pandn %xmm1, %xmm5 -; CHECK-SSE2-NEXT: por %xmm7, %xmm5 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [223,u,223,u,205,u,183,u,161,u,1,u,171,u,239,u] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [197,u,205,u,27,u,241,u,1,u,1,u,1,u,163,u] -; CHECK-SSE2-NEXT: pand %xmm4, %xmm0 -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 -; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [128,128,1,1,1,128,1,64] -; CHECK-SSE2-NEXT: psrlw $8, %xmm1 -; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,1,1,128,128,32,128,32] -; CHECK-SSE2-NEXT: psrlw $8, %xmm0 -; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5] -; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3 -; CHECK-SSE2-NEXT: pandn %xmm5, %xmm3 -; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm2 -; CHECK-SSE2-NEXT: pandn %xmm1, %xmm2 -; CHECK-SSE2-NEXT: pmovmskb %xmm2, %ecx -; CHECK-SSE2-NEXT: pmovmskb %xmm3, %edx -; CHECK-SSE2-NEXT: shll $16, %edx +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147] +; CHECK-SSE2-NEXT: psrlw $8, %xmm6 +; CHECK-SSE2-NEXT: packuswb %xmm8, %xmm6 +; CHECK-SSE2-NEXT: paddb %xmm7, %xmm6 +; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; CHECK-SSE2-NEXT: psraw $8, %xmm7 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [64,256,32,64,256,64,8,4] +; CHECK-SSE2-NEXT: psrlw $8, %xmm7 +; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm8 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; CHECK-SSE2-NEXT: psraw $8, %xmm8 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 # [256,8,64,256,16,4,8,8] +; CHECK-SSE2-NEXT: psrlw $8, %xmm8 +; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm8 +; CHECK-SSE2-NEXT: psrlw $7, %xmm6 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; CHECK-SSE2-NEXT: paddb %xmm8, %xmm6 +; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [114,u,1,u,50,u,7,u,2,u,8,u,97,u,117,u] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [3,u,87,u,7,u,6,u,84,u,128,u,127,u,56,u] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm6 +; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm6 +; CHECK-SSE2-NEXT: psubb %xmm6, %xmm1 +; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137] +; CHECK-SSE2-NEXT: psrlw $8, %xmm6 +; CHECK-SSE2-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; CHECK-SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47] +; CHECK-SSE2-NEXT: psrlw $8, %xmm7 +; CHECK-SSE2-NEXT: packuswb %xmm6, %xmm7 +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,0,255,255,255,0,255,255,0,255,0,255,0,255] +; CHECK-SSE2-NEXT: pand %xmm0, %xmm6 +; CHECK-SSE2-NEXT: paddb %xmm7, %xmm6 +; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; CHECK-SSE2-NEXT: psraw $8, %xmm7 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [8,8,128,64,8,256,256,8] +; CHECK-SSE2-NEXT: psrlw $8, %xmm7 +; CHECK-SSE2-NEXT: pxor %xmm8, %xmm8 +; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm8 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: psraw $8, %xmm6 +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [64,128,128,16,256,64,256,16] +; CHECK-SSE2-NEXT: psrlw $8, %xmm6 +; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm6 +; CHECK-SSE2-NEXT: psubb %xmm8, %xmm6 +; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm7 +; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [62,u,62,u,5,u,7,u,97,u,2,u,3,u,60,u] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [13,u,5,u,19,u,34,u,2,u,8,u,2,u,88,u] +; CHECK-SSE2-NEXT: pand %xmm5, %xmm6 +; CHECK-SSE2-NEXT: packuswb %xmm7, %xmm6 +; CHECK-SSE2-NEXT: psubb %xmm6, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm1 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm0 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm2 +; CHECK-SSE2-NEXT: por %xmm0, %xmm2 +; CHECK-SSE2-NEXT: pcmpeqb %xmm4, %xmm3 +; CHECK-SSE2-NEXT: por %xmm1, %xmm3 +; CHECK-SSE2-NEXT: pmovmskb %xmm3, %ecx +; CHECK-SSE2-NEXT: notl %ecx +; CHECK-SSE2-NEXT: shll $16, %ecx +; CHECK-SSE2-NEXT: pmovmskb %xmm2, %edx +; CHECK-SSE2-NEXT: xorl $65535, %edx # imm = 0xFFFF ; CHECK-SSE2-NEXT: orl %ecx, %edx ; CHECK-SSE2-NEXT: movl %edx, (%rdi) ; CHECK-SSE2-NEXT: retq ; ; CHECK-SSE41-LABEL: pr51133: ; CHECK-SSE41: # %bb.0: -; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4 ; CHECK-SSE41-NEXT: movq %rdi, %rax -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [171,103,183,171,61,1,127,183,9,0,41,183,1,1,161,221] -; CHECK-SSE41-NEXT: pmullw %xmm1, %xmm0 -; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6 -; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221] -; CHECK-SSE41-NEXT: psllw $8, %xmm6 -; CHECK-SSE41-NEXT: por %xmm0, %xmm6 -; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 -; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm0 -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,1,128,1,128,32,1,1] -; CHECK-SSE41-NEXT: psrlw $8, %xmm0 +; CHECK-SSE41-NEXT: pxor %xmm4, %xmm4 +; CHECK-SSE41-NEXT: pxor %xmm5, %xmm5 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [0,133,0,133,0,103,0,147,0,85,0,129,0,86,0,137] +; CHECK-SSE41-NEXT: psrlw $8, %xmm5 +; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,79,0,103,0,27,0,121,0,129,0,129,0,129,0,47] +; CHECK-SSE41-NEXT: psrlw $8, %xmm7 +; CHECK-SSE41-NEXT: packuswb %xmm5, %xmm7 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,0,0,0,255,255,255,0,255,255,0,255,0,255,0,255] +; CHECK-SSE41-NEXT: pand %xmm0, %xmm6 +; CHECK-SSE41-NEXT: paddb %xmm7, %xmm6 +; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm5 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15] +; CHECK-SSE41-NEXT: psraw $8, %xmm5 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 # [8,8,128,64,8,256,256,8] +; CHECK-SSE41-NEXT: psrlw $8, %xmm5 +; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm7 ; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [1,1,1,128,64,2,1,32] +; CHECK-SSE41-NEXT: psraw $8, %xmm6 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [64,128,128,16,256,64,256,16] ; CHECK-SSE41-NEXT: psrlw $8, %xmm6 -; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2] -; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqd %xmm7, %xmm7 -; CHECK-SSE41-NEXT: pxor %xmm0, %xmm7 -; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; CHECK-SSE41-NEXT: packuswb %xmm5, %xmm6 +; CHECK-SSE41-NEXT: psubb %xmm7, %xmm6 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm7 = [13,5,19,34,2,8,2,88,62,62,5,7,97,2,3,60] +; CHECK-SSE41-NEXT: pmullw %xmm6, %xmm7 +; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] +; CHECK-SSE41-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,5,0,34,0,8,0,88,0,62,0,7,0,2,0,60] +; CHECK-SSE41-NEXT: psllw $8, %xmm6 +; CHECK-SSE41-NEXT: por %xmm7, %xmm6 +; CHECK-SSE41-NEXT: psubb %xmm6, %xmm0 ; CHECK-SSE41-NEXT: pxor %xmm6, %xmm6 -; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1 -; CHECK-SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255] -; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [197,205,27,241,1,1,1,163,223,223,205,183,161,1,171,239] -; CHECK-SSE41-NEXT: pmullw %xmm4, %xmm0 -; CHECK-SSE41-NEXT: pand %xmm5, %xmm0 -; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239] -; CHECK-SSE41-NEXT: psllw $8, %xmm4 -; CHECK-SSE41-NEXT: por %xmm0, %xmm4 -; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; CHECK-SSE41-NEXT: movdqa %xmm4, %xmm0 -; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,128,1,1,1,128,1,64] -; CHECK-SSE41-NEXT: psrlw $8, %xmm0 -; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 # [1,1,1,128,128,32,128,32] -; CHECK-SSE41-NEXT: psrlw $8, %xmm4 -; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm4 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5] -; CHECK-SSE41-NEXT: pmaxub %xmm4, %xmm0 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,9,0,0,0,41,0,147,0,129,0,129,0,85,0,141] +; CHECK-SSE41-NEXT: psrlw $8, %xmm6 +; CHECK-SSE41-NEXT: pxor %xmm7, %xmm7 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; CHECK-SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [0,86,0,95,0,147,0,43,0,49,0,127,0,65,0,147] +; CHECK-SSE41-NEXT: psrlw $8, %xmm7 +; CHECK-SSE41-NEXT: packuswb %xmm6, %xmm7 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm8 = [0,0,1,0,0,255,0,1,0,1,0,1,1,1,0,1] +; CHECK-SSE41-NEXT: pmullw %xmm1, %xmm8 +; CHECK-SSE41-NEXT: pand %xmm5, %xmm8 +; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm6 +; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,0,0,0,0,255,0,1,0,1,0,1,0,1,0,1] +; CHECK-SSE41-NEXT: psllw $8, %xmm6 +; CHECK-SSE41-NEXT: por %xmm8, %xmm6 +; CHECK-SSE41-NEXT: paddb %xmm7, %xmm6 +; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm7 +; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; CHECK-SSE41-NEXT: psraw $8, %xmm7 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 # [64,256,32,64,256,64,8,4] +; CHECK-SSE41-NEXT: psrlw $8, %xmm7 +; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm8 +; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; CHECK-SSE41-NEXT: psraw $8, %xmm8 +; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8 # [256,8,64,256,16,4,8,8] +; CHECK-SSE41-NEXT: psrlw $8, %xmm8 +; CHECK-SSE41-NEXT: packuswb %xmm7, %xmm8 +; CHECK-SSE41-NEXT: psrlw $7, %xmm6 +; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 +; CHECK-SSE41-NEXT: paddb %xmm8, %xmm6 +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm7 = [3,87,7,6,84,128,127,56,114,1,50,7,2,8,97,117] +; CHECK-SSE41-NEXT: pmullw %xmm6, %xmm7 +; CHECK-SSE41-NEXT: pand %xmm5, %xmm7 +; CHECK-SSE41-NEXT: pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 # [0,87,0,6,0,128,0,56,0,1,0,7,0,8,0,117] +; CHECK-SSE41-NEXT: psllw $8, %xmm6 +; CHECK-SSE41-NEXT: por %xmm7, %xmm6 +; CHECK-SSE41-NEXT: psubb %xmm6, %xmm1 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm1 ; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm0 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm3 -; CHECK-SSE41-NEXT: pandn %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm2 -; CHECK-SSE41-NEXT: pandn %xmm0, %xmm2 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm3 +; CHECK-SSE41-NEXT: por %xmm1, %xmm3 +; CHECK-SSE41-NEXT: pcmpeqb %xmm4, %xmm2 +; CHECK-SSE41-NEXT: por %xmm0, %xmm2 ; CHECK-SSE41-NEXT: pmovmskb %xmm2, %ecx +; CHECK-SSE41-NEXT: xorl $65535, %ecx # imm = 0xFFFF ; CHECK-SSE41-NEXT: pmovmskb %xmm3, %edx +; CHECK-SSE41-NEXT: notl %edx ; CHECK-SSE41-NEXT: shll $16, %edx ; CHECK-SSE41-NEXT: orl %ecx, %edx ; CHECK-SSE41-NEXT: movl %edx, (%rdi) diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll index 0fb6eb3c5889..f9de4e18857c 100644 --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -6,155 +6,78 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { -; SSE-LABEL: fold_srem_vec_1: -; SSE: # %bb.0: -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: subl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $9, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $16, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: movswl %dx, %esi -; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $6, %esi -; SSE-NEXT: addl %edx, %esi -; SSE-NEXT: imull $95, %esi, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $21, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $-124, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $18, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $98, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pinsrw $2, %ecx, %xmm1 -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: fold_srem_vec_1: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm2 = [1,0,0,65535,0,0,0,0] +; SSE2-NEXT: pmullw %xmm0, %xmm2 +; SSE2-NEXT: movq {{.*#+}} xmm1 = [44151,48623,2675,32081,0,0,0,0] +; SSE2-NEXT: pmulhw %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psraw $4, %xmm3 +; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3] +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: psraw $1, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: psraw $2, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,0,65535,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: pand %xmm5, %xmm6 +; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: por %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 +; SSE2-NEXT: psrlw $15, %xmm1 +; SSE2-NEXT: paddw %xmm4, %xmm1 +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,65412,98,64533,u,u,u,u] +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: fold_srem_vec_1: +; SSE4: # %bb.0: +; SSE4-NEXT: movq {{.*#+}} xmm1 = [1,0,0,65535,0,0,0,0] +; SSE4-NEXT: pmullw %xmm0, %xmm1 +; SSE4-NEXT: movq {{.*#+}} xmm2 = [44151,48623,2675,32081,0,0,0,0] +; SSE4-NEXT: pmulhw %xmm0, %xmm2 +; SSE4-NEXT: paddw %xmm1, %xmm2 +; SSE4-NEXT: movdqa %xmm2, %xmm1 +; SSE4-NEXT: psrlw $15, %xmm1 +; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1024,2048,16384,128,u,u,u,u] +; SSE4-NEXT: paddw %xmm1, %xmm2 +; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [95,65412,98,64533,u,u,u,u] +; SSE4-NEXT: psubw %xmm2, %xmm0 +; SSE4-NEXT: retq ; ; AVX1OR2-LABEL: fold_srem_vec_1: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax -; AVX1OR2-NEXT: movswl %ax, %ecx -; AVX1OR2-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; AVX1OR2-NEXT: shrl $16, %ecx -; AVX1OR2-NEXT: subl %eax, %ecx -; AVX1OR2-NEXT: movzwl %cx, %ecx -; AVX1OR2-NEXT: movswl %cx, %edx -; AVX1OR2-NEXT: shrl $15, %ecx -; AVX1OR2-NEXT: sarl $9, %edx -; AVX1OR2-NEXT: addl %ecx, %edx -; AVX1OR2-NEXT: imull $-1003, %edx, %ecx # imm = 0xFC15 -; AVX1OR2-NEXT: subl %ecx, %eax -; AVX1OR2-NEXT: vmovd %xmm0, %ecx -; AVX1OR2-NEXT: movswl %cx, %edx -; AVX1OR2-NEXT: imull $-21385, %edx, %edx # imm = 0xAC77 -; AVX1OR2-NEXT: shrl $16, %edx -; AVX1OR2-NEXT: addl %ecx, %edx -; AVX1OR2-NEXT: movzwl %dx, %edx -; AVX1OR2-NEXT: movswl %dx, %esi -; AVX1OR2-NEXT: shrl $15, %edx -; AVX1OR2-NEXT: sarl $6, %esi -; AVX1OR2-NEXT: addl %edx, %esi -; AVX1OR2-NEXT: imull $95, %esi, %edx -; AVX1OR2-NEXT: subl %edx, %ecx -; AVX1OR2-NEXT: vmovd %ecx, %xmm1 -; AVX1OR2-NEXT: vpextrw $1, %xmm0, %ecx -; AVX1OR2-NEXT: movswl %cx, %edx -; AVX1OR2-NEXT: imull $-16913, %edx, %edx # imm = 0xBDEF -; AVX1OR2-NEXT: movl %edx, %esi -; AVX1OR2-NEXT: shrl $31, %esi -; AVX1OR2-NEXT: sarl $21, %edx -; AVX1OR2-NEXT: addl %esi, %edx -; AVX1OR2-NEXT: imull $-124, %edx, %edx -; AVX1OR2-NEXT: subl %edx, %ecx -; AVX1OR2-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpextrw $2, %xmm0, %ecx -; AVX1OR2-NEXT: movswl %cx, %edx -; AVX1OR2-NEXT: imull $2675, %edx, %edx # imm = 0xA73 -; AVX1OR2-NEXT: movl %edx, %esi -; AVX1OR2-NEXT: shrl $31, %esi -; AVX1OR2-NEXT: sarl $18, %edx -; AVX1OR2-NEXT: addl %esi, %edx -; AVX1OR2-NEXT: imull $98, %edx, %edx -; AVX1OR2-NEXT: subl %edx, %ecx -; AVX1OR2-NEXT: vpinsrw $2, %ecx, %xmm1, %xmm0 -; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,0,0,65535,u,u,u,u] +; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [44151,48623,2675,32081,u,u,u,u] +; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1024,2048,16384,128,u,u,u,u] +; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,65412,98,64533,u,u,u,u] +; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: fold_srem_vec_1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpextrw $3, %xmm0, %eax -; AVX512-NEXT: movswl %ax, %ecx -; AVX512-NEXT: imull $32081, %ecx, %ecx # imm = 0x7D51 -; AVX512-NEXT: shrl $16, %ecx -; AVX512-NEXT: subl %eax, %ecx -; AVX512-NEXT: movzwl %cx, %edx -; AVX512-NEXT: movswl %dx, %ecx -; AVX512-NEXT: shrl $15, %edx -; AVX512-NEXT: sarl $9, %ecx -; AVX512-NEXT: addl %edx, %ecx -; AVX512-NEXT: vmovd %xmm0, %edx -; AVX512-NEXT: movswl %dx, %esi -; AVX512-NEXT: imull $-21385, %esi, %esi # imm = 0xAC77 -; AVX512-NEXT: shrl $16, %esi -; AVX512-NEXT: addl %edx, %esi -; AVX512-NEXT: movzwl %si, %esi -; AVX512-NEXT: movswl %si, %edi -; AVX512-NEXT: shrl $15, %esi -; AVX512-NEXT: sarl $6, %edi -; AVX512-NEXT: addl %esi, %edi -; AVX512-NEXT: imull $95, %edi, %esi -; AVX512-NEXT: subl %esi, %edx -; AVX512-NEXT: vmovd %edx, %xmm1 -; AVX512-NEXT: vpextrw $1, %xmm0, %edx -; AVX512-NEXT: movswl %dx, %esi -; AVX512-NEXT: imull $-16913, %esi, %esi # imm = 0xBDEF -; AVX512-NEXT: movl %esi, %edi -; AVX512-NEXT: shrl $31, %edi -; AVX512-NEXT: sarl $21, %esi -; AVX512-NEXT: addl %edi, %esi -; AVX512-NEXT: imull $-1003, %ecx, %ecx # imm = 0xFC15 -; AVX512-NEXT: imull $-124, %esi, %esi -; AVX512-NEXT: subl %esi, %edx -; AVX512-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 -; AVX512-NEXT: vpextrw $2, %xmm0, %edx -; AVX512-NEXT: subl %ecx, %eax -; AVX512-NEXT: movswl %dx, %ecx -; AVX512-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; AVX512-NEXT: movl %ecx, %esi -; AVX512-NEXT: shrl $31, %esi -; AVX512-NEXT: sarl $18, %ecx -; AVX512-NEXT: addl %esi, %ecx -; AVX512-NEXT: imull $98, %ecx, %ecx -; AVX512-NEXT: subl %ecx, %edx -; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm0 -; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0 +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,0,0,65535,u,u,u,u] +; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [44151,48623,2675,32081,u,u,u,u] +; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,65412,98,64533,u,u,u,u] +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 @@ -163,25 +86,25 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) { define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { ; SSE-LABEL: fold_srem_vec_2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u] ; SSE-NEXT: pmulhw %xmm0, %xmm1 ; SSE-NEXT: paddw %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrlw $15, %xmm2 ; SSE-NEXT: psraw $6, %xmm1 ; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,u,u,u,u] ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fold_srem_vec_2: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151] +; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u] ; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 ; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,95,95,95,95] +; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,u,u,u,u] ; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = srem <4 x i16> %x, @@ -193,14 +116,14 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) { define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; SSE2-LABEL: combine_srem_sdiv: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u] ; SSE2-NEXT: pmulhw %xmm0, %xmm1 ; SSE2-NEXT: paddw %xmm0, %xmm1 ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrlw $15, %xmm2 ; SSE2-NEXT: psraw $6, %xmm1 ; SSE2-NEXT: paddw %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,u,u,u,u] ; SSE2-NEXT: pmullw %xmm1, %xmm2 ; SSE2-NEXT: psubw %xmm2, %xmm0 ; SSE2-NEXT: paddw %xmm1, %xmm0 @@ -208,7 +131,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; ; SSE4-LABEL: combine_srem_sdiv: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u] ; SSE4-NEXT: pmulhw %xmm0, %xmm1 ; SSE4-NEXT: paddw %xmm0, %xmm1 ; SSE4-NEXT: movdqa %xmm1, %xmm2 @@ -223,12 +146,12 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; ; AVX-LABEL: combine_srem_sdiv: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151] +; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u] ; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw $15, %xmm1, %xmm2 ; AVX-NEXT: vpsraw $6, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,95,95,95,95] +; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,u,u,u,u] ; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -240,248 +163,237 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) { ; Don't fold for divisors that are a power of two. define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) { -; SSE-LABEL: dont_fold_srem_power_of_two: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: leal 31(%rax), %ecx -; SSE-NEXT: testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: leal 63(%rcx), %edx -; SSE-NEXT: testw %cx, %cx -; SSE-NEXT: cmovnsl %ecx, %edx -; SSE-NEXT: andl $-64, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm0 -; SSE-NEXT: pinsrw $1, %eax, %xmm0 -; SSE-NEXT: pextrw $2, %xmm1, %eax -; SSE-NEXT: leal 7(%rax), %ecx -; SSE-NEXT: testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-8, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm0 -; SSE-NEXT: pextrw $3, %xmm1, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $6, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: imull $95, %edx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: dont_fold_srem_power_of_two: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = [32769,32769,32769,44151,0,0,0,0] +; SSE2-NEXT: pmulhw %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,0,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psraw $4, %xmm4 +; SSE2-NEXT: pandn %xmm4, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psraw $2, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm2[2,3] +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: andps %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psraw $5, %xmm4 +; SSE2-NEXT: andnps %xmm4, %xmm2 +; SSE2-NEXT: orps %xmm3, %xmm2 +; SSE2-NEXT: psrlw $15, %xmm1 +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,8,95,u,u,u,u] +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: dont_fold_srem_power_of_two: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: leal 31(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: leal 63(%rcx), %edx -; AVX-NEXT: testw %cx, %cx -; AVX-NEXT: cmovnsl %ecx, %edx -; AVX-NEXT: andl $-64, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: leal 7(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-8, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-21385, %ecx, %ecx # imm = 0xAC77 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $6, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: imull $95, %edx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; SSE4-LABEL: dont_fold_srem_power_of_two: +; SSE4: # %bb.0: +; SSE4-NEXT: movq {{.*#+}} xmm1 = [32769,32769,32769,44151,0,0,0,0] +; SSE4-NEXT: pmulhw %xmm0, %xmm1 +; SSE4-NEXT: paddw %xmm0, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm2 +; SSE4-NEXT: psrlw $15, %xmm2 +; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2048,4096,16384,1024,u,u,u,u] +; SSE4-NEXT: paddw %xmm2, %xmm1 +; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [64,32,8,95,u,u,u,u] +; SSE4-NEXT: psubw %xmm1, %xmm0 +; SSE4-NEXT: retq +; +; AVX1OR2-LABEL: dont_fold_srem_power_of_two: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32769,32769,32769,44151,u,u,u,u] +; AVX1OR2-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2048,4096,16384,1024,u,u,u,u] +; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u] +; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: dont_fold_srem_power_of_two: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32769,32769,32769,44151,u,u,u,u] +; AVX512-NEXT: vpaddw %xmm0, %xmm1, %xmm1 +; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u] +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } ; Don't fold if the divisor is one. define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) { -; SSE-LABEL: dont_fold_srem_one: -; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %eax -; SSE-NEXT: imull $-19945, %eax, %eax # imm = 0xB217 -; SSE-NEXT: shrl $16, %eax -; SSE-NEXT: addl %ecx, %eax -; SSE-NEXT: movzwl %ax, %edx -; SSE-NEXT: movswl %dx, %eax -; SSE-NEXT: shrl $15, %edx -; SSE-NEXT: sarl $4, %eax -; SSE-NEXT: addl %edx, %eax -; SSE-NEXT: leal (%rax,%rax,2), %edx -; SSE-NEXT: shll $3, %edx -; SSE-NEXT: subl %edx, %eax -; SSE-NEXT: addl %ecx, %eax -; SSE-NEXT: pextrw $1, %xmm0, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: imull $12827, %edx, %edx # imm = 0x321B -; SSE-NEXT: movl %edx, %esi -; SSE-NEXT: shrl $31, %esi -; SSE-NEXT: sarl $23, %edx -; SSE-NEXT: addl %esi, %edx -; SSE-NEXT: imull $654, %edx, %edx # imm = 0x28E -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; SSE-NEXT: movl %ecx, %edx -; SSE-NEXT: shrl $31, %edx -; SSE-NEXT: sarl $26, %ecx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: dont_fold_srem_one: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm2 = [0,12827,45591,12375,0,0,0,0] +; SSE2-NEXT: pmulhw %xmm0, %xmm2 +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: psraw $4, %xmm5 +; SSE2-NEXT: pandn %xmm5, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: psraw $10, %xmm5 +; SSE2-NEXT: pandn %xmm5, %xmm4 +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: psraw $7, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrlw $15, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: paddw %xmm3, %xmm1 +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,654,23,5423,u,u,u,u] +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: dont_fold_srem_one: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12827, %ecx, %ecx # imm = 0x321B -; AVX-NEXT: movl %ecx, %esi -; AVX-NEXT: shrl $31, %esi -; AVX-NEXT: sarl $23, %ecx -; AVX-NEXT: addl %esi, %ecx -; AVX-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $26, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; SSE4-LABEL: dont_fold_srem_one: +; SSE4: # %bb.0: +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pxor %xmm2, %xmm2 +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4,5,6,7] +; SSE4-NEXT: movq {{.*#+}} xmm3 = [0,12827,45591,12375,0,0,0,0] +; SSE4-NEXT: pmulhw %xmm0, %xmm3 +; SSE4-NEXT: paddw %xmm2, %xmm3 +; SSE4-NEXT: movdqa %xmm3, %xmm2 +; SSE4-NEXT: psrlw $15, %xmm2 +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4,5,6,7] +; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [u,512,4096,64,u,u,u,u] +; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3,4,5,6,7] +; SSE4-NEXT: paddw %xmm2, %xmm3 +; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,654,23,5423,u,u,u,u] +; SSE4-NEXT: psubw %xmm3, %xmm0 +; SSE4-NEXT: retq +; +; AVX1OR2-LABEL: dont_fold_srem_one: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] +; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 # [0,12827,45591,12375,u,u,u,u] +; AVX1OR2-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX1OR2-NEXT: vpsrlw $15, %xmm2, %xmm3 +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5,6,7] +; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,512,4096,64,u,u,u,u] +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u] +; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: dont_fold_srem_one: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] +; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 # [0,12827,45591,12375,u,u,u,u] +; AVX512-NEXT: vpaddw %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $15, %xmm2, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5,6,7] +; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u] +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } ; Don't fold if the divisor is 2^15. define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { -; SSE-LABEL: dont_fold_urem_i16_smax: -; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: movzwl %cx, %ecx -; SSE-NEXT: movswl %cx, %edx -; SSE-NEXT: shrl $15, %ecx -; SSE-NEXT: sarl $4, %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: leal 32767(%rax), %ecx -; SSE-NEXT: testw %ax, %ax -; SSE-NEXT: cmovnsl %eax, %ecx -; SSE-NEXT: andl $-32768, %ecx # imm = 0x8000 -; SSE-NEXT: addl %eax, %ecx -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %ecx, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movswl %ax, %ecx -; SSE-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; SSE-NEXT: movl %ecx, %edx -; SSE-NEXT: shrl $31, %edx -; SSE-NEXT: sarl $26, %ecx -; SSE-NEXT: addl %edx, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: dont_fold_urem_i16_smax: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm2 = [1,65535,1,0,0,0,0,0] +; SSE2-NEXT: pmullw %xmm0, %xmm2 +; SSE2-NEXT: movq {{.*#+}} xmm1 = [0,32767,45591,12375,0,0,0,0] +; SSE2-NEXT: pmulhw %xmm0, %xmm1 +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: pandn %xmm4, %xmm5 +; SSE2-NEXT: por %xmm3, %xmm5 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,0,65535,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: pand %xmm3, %xmm4 +; SSE2-NEXT: psraw $4, %xmm5 +; SSE2-NEXT: pandn %xmm5, %xmm3 +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pand %xmm2, %xmm4 +; SSE2-NEXT: psraw $2, %xmm3 +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrlw $15, %xmm1 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,32768,23,5423,u,u,u,u] +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: dont_fold_urem_i16_smax: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $-19945, %ecx, %ecx # imm = 0xB217 -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: movzwl %cx, %ecx -; AVX-NEXT: movswl %cx, %edx -; AVX-NEXT: shrl $15, %ecx -; AVX-NEXT: sarl $4, %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX-NEXT: shll $3, %ecx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: addl %eax, %edx -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: leal 32767(%rax), %ecx -; AVX-NEXT: testw %ax, %ax -; AVX-NEXT: cmovnsl %eax, %ecx -; AVX-NEXT: andl $-32768, %ecx # imm = 0x8000 -; AVX-NEXT: addl %eax, %ecx -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: movswl %ax, %ecx -; AVX-NEXT: imull $12375, %ecx, %ecx # imm = 0x3057 -; AVX-NEXT: movl %ecx, %edx -; AVX-NEXT: shrl $31, %edx -; AVX-NEXT: sarl $26, %ecx -; AVX-NEXT: addl %edx, %ecx -; AVX-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; SSE4-LABEL: dont_fold_urem_i16_smax: +; SSE4: # %bb.0: +; SSE4-NEXT: movq {{.*#+}} xmm1 = [1,65535,1,0,0,0,0,0] +; SSE4-NEXT: pmullw %xmm0, %xmm1 +; SSE4-NEXT: movq {{.*#+}} xmm2 = [0,32767,45591,12375,0,0,0,0] +; SSE4-NEXT: pmulhw %xmm0, %xmm2 +; SSE4-NEXT: paddw %xmm1, %xmm2 +; SSE4-NEXT: movdqa %xmm2, %xmm3 +; SSE4-NEXT: psrlw $15, %xmm3 +; SSE4-NEXT: pxor %xmm4, %xmm4 +; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1,2,3],xmm4[4,5,6,7] +; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,4,4096,64,u,u,u,u] +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; SSE4-NEXT: paddw %xmm4, %xmm2 +; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,32768,23,5423,u,u,u,u] +; SSE4-NEXT: psubw %xmm2, %xmm0 +; SSE4-NEXT: retq +; +; AVX1OR2-LABEL: dont_fold_urem_i16_smax: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,65535,1,0,u,u,u,u] +; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,32767,45591,12375,u,u,u,u] +; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm2 +; AVX1OR2-NEXT: vpsrlw $15, %xmm2, %xmm3 +; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3],xmm4[4,5,6,7] +; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,4,4096,64,u,u,u,u] +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] +; AVX1OR2-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,32768,23,5423,u,u,u,u] +; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: dont_fold_urem_i16_smax: +; AVX512: # %bb.0: +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,65535,1,0,u,u,u,u] +; AVX512-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,32767,45591,12375,u,u,u,u] +; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX512-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3],xmm3[4,5,6,7] +; AVX512-NEXT: vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,32768,23,5423,u,u,u,u] +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = srem <4 x i16> %x, ret <4 x i16> %1 } diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll index 3d0d73be9a58..3faa2a0720d4 100644 --- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -6,84 +6,62 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX,AVX512 define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { -; SSE-LABEL: fold_urem_vec_1: -; SSE: # %bb.0: -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl $2, %ecx -; SSE-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; SSE-NEXT: shrl $19, %ecx -; SSE-NEXT: imull $124, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: movd %xmm0, %ecx -; SSE-NEXT: movzwl %cx, %edx -; SSE-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; SSE-NEXT: shrl $22, %edx -; SSE-NEXT: imull $95, %edx, %edx -; SSE-NEXT: subl %edx, %ecx -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movl %eax, %ecx -; SSE-NEXT: shrl %ecx -; SSE-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; SSE-NEXT: shrl $17, %ecx -; SSE-NEXT: imull $98, %ecx, %ecx -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $2, %eax, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $9, %edx -; SSE-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: fold_urem_vec_1: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm2 = [0,0,0,64,0,128,0,0,0,0,0,0,0,0,0,0] +; SSE2-NEXT: pmulhuw %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [44151,16913,2675,1373,u,u,u,u] +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psubw %xmm2, %xmm1 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,0,0,32768,u,u,u,u] +; SSE2-NEXT: paddw %xmm2, %xmm1 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1024,8192,32768,128,u,u,u,u] +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,124,98,1003,u,u,u,u] +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; AVX-LABEL: fold_urem_vec_1: -; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl $2, %ecx -; AVX-NEXT: imull $16913, %ecx, %ecx # imm = 0x4211 -; AVX-NEXT: shrl $19, %ecx -; AVX-NEXT: imull $124, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: movzwl %cx, %edx -; AVX-NEXT: imull $44151, %edx, %edx # imm = 0xAC77 -; AVX-NEXT: shrl $22, %edx -; AVX-NEXT: imull $95, %edx, %edx -; AVX-NEXT: subl %edx, %ecx -; AVX-NEXT: vmovd %ecx, %xmm1 -; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: shrl %ecx -; AVX-NEXT: imull $2675, %ecx, %ecx # imm = 0xA73 -; AVX-NEXT: shrl $17, %ecx -; AVX-NEXT: imull $98, %ecx, %ecx -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: imull $1373, %eax, %ecx # imm = 0x55D -; AVX-NEXT: shrl $16, %ecx -; AVX-NEXT: movl %eax, %edx -; AVX-NEXT: subl %ecx, %edx -; AVX-NEXT: movzwl %dx, %edx -; AVX-NEXT: shrl %edx -; AVX-NEXT: addl %ecx, %edx -; AVX-NEXT: shrl $9, %edx -; AVX-NEXT: imull $1003, %edx, %ecx # imm = 0x3EB -; AVX-NEXT: subl %ecx, %eax -; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX-NEXT: retq +; SSE4-LABEL: fold_urem_vec_1: +; SSE4: # %bb.0: +; SSE4-NEXT: movq {{.*#+}} xmm1 = [0,16384,32768,0,0,0,0,0] +; SSE4-NEXT: pmulhuw %xmm0, %xmm1 +; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [44151,16913,2675,1373,u,u,u,u] +; SSE4-NEXT: movdqa %xmm0, %xmm2 +; SSE4-NEXT: psubw %xmm1, %xmm2 +; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [0,0,0,32768,u,u,u,u] +; SSE4-NEXT: paddw %xmm1, %xmm2 +; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1024,8192,32768,128,u,u,u,u] +; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [95,124,98,1003,u,u,u,u] +; SSE4-NEXT: psubw %xmm2, %xmm0 +; SSE4-NEXT: retq +; +; AVX1OR2-LABEL: fold_urem_vec_1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,16384,32768,u,u,u,u,u] +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7] +; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [44151,16913,2675,1373,u,u,u,u] +; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,32768,u,u,u,u] +; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1024,8192,32768,128,u,u,u,u] +; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,124,98,1003,u,u,u,u] +; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512-LABEL: fold_urem_vec_1: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [44151,16913,2675,1373,u,u,u,u] +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [0,0,0,32768,u,u,u,u] +; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,124,98,1003,u,u,u,u] +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } @@ -91,18 +69,18 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { ; SSE-LABEL: fold_urem_vec_2: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u] ; SSE-NEXT: pmulhuw %xmm0, %xmm1 ; SSE-NEXT: psrlw $6, %xmm1 -; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,95,95,95,95] +; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [95,95,95,95,u,u,u,u] ; SSE-NEXT: psubw %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fold_urem_vec_2: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151] +; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u] ; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,95,95,95,95] +; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [95,95,95,95,u,u,u,u] ; AVX-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = urem <4 x i16> %x, @@ -114,10 +92,10 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) { define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; SSE2-LABEL: combine_urem_udiv: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u] ; SSE2-NEXT: pmulhuw %xmm0, %xmm1 ; SSE2-NEXT: psrlw $6, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [95,95,95,95,u,u,u,u] ; SSE2-NEXT: pmullw %xmm1, %xmm2 ; SSE2-NEXT: psubw %xmm2, %xmm0 ; SSE2-NEXT: paddw %xmm1, %xmm0 @@ -125,7 +103,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; ; SSE4-LABEL: combine_urem_udiv: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,44151,44151,44151,44151] +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [44151,44151,44151,44151,u,u,u,u] ; SSE4-NEXT: pmulhuw %xmm0, %xmm1 ; SSE4-NEXT: psrlw $6, %xmm1 ; SSE4-NEXT: pmovsxbw {{.*#+}} xmm2 = [95,95,95,95,95,95,95,95] @@ -136,9 +114,9 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { ; ; AVX-LABEL: combine_urem_udiv: ; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,44151,44151,44151,44151] +; AVX-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [44151,44151,44151,44151,u,u,u,u] ; AVX-NEXT: vpsrlw $6, %xmm1, %xmm1 -; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,95,95,95,95] +; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [95,95,95,95,u,u,u,u] ; AVX-NEXT: vpsubw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -152,92 +130,44 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) { define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; SSE2-LABEL: dont_fold_urem_power_of_two: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pextrw $1, %xmm0, %eax -; SSE2-NEXT: andl $31, %eax -; SSE2-NEXT: pinsrw $1, %eax, %xmm1 -; SSE2-NEXT: pextrw $2, %xmm0, %eax -; SSE2-NEXT: andl $7, %eax -; SSE2-NEXT: pinsrw $2, %eax, %xmm1 -; SSE2-NEXT: pextrw $3, %xmm0, %eax -; SSE2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; SSE2-NEXT: shrl $22, %ecx -; SSE2-NEXT: imull $95, %ecx, %ecx -; SSE2-NEXT: subl %ecx, %eax -; SSE2-NEXT: pinsrw $3, %eax, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm1 = [1024,2048,8192,44151,0,0,0,0] +; SSE2-NEXT: pmulhuw %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: psrlw $6, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,32,8,95,u,u,u,u] +; SSE2-NEXT: psubw %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE4-LABEL: dont_fold_urem_power_of_two: ; SSE4: # %bb.0: -; SSE4-NEXT: pmovsxbd {{.*#+}} xmm1 = [63,63,63,63] -; SSE4-NEXT: pand %xmm0, %xmm1 -; SSE4-NEXT: pextrw $1, %xmm0, %eax -; SSE4-NEXT: andl $31, %eax -; SSE4-NEXT: pinsrw $1, %eax, %xmm1 -; SSE4-NEXT: pextrw $2, %xmm0, %eax -; SSE4-NEXT: andl $7, %eax -; SSE4-NEXT: pinsrw $2, %eax, %xmm1 -; SSE4-NEXT: pextrw $3, %xmm0, %eax -; SSE4-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; SSE4-NEXT: shrl $22, %ecx -; SSE4-NEXT: imull $95, %ecx, %ecx -; SSE4-NEXT: subl %ecx, %eax -; SSE4-NEXT: pinsrw $3, %eax, %xmm1 -; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: movq {{.*#+}} xmm1 = [1024,2048,8192,44151,0,0,0,0] +; SSE4-NEXT: pmulhuw %xmm0, %xmm1 +; SSE4-NEXT: movdqa %xmm1, %xmm2 +; SSE4-NEXT: psrlw $6, %xmm2 +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,32,8,95,u,u,u,u] +; SSE4-NEXT: psubw %xmm2, %xmm0 ; SSE4-NEXT: retq ; -; AVX1-LABEL: dont_fold_urem_power_of_two: -; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; AVX1-NEXT: shrl $22, %ecx -; AVX1-NEXT: imull $95, %ecx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: dont_fold_urem_power_of_two: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; AVX2-NEXT: shrl $22, %ecx -; AVX2-NEXT: imull $95, %ecx, %ecx -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX1OR2-LABEL: dont_fold_urem_power_of_two: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1024,2048,8192,44151,u,u,u,u] +; AVX1OR2-NEXT: vpsrlw $6, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7] +; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u] +; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: dont_fold_urem_power_of_two: ; AVX512: # %bb.0: -; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 -; AVX512-NEXT: vpextrw $1, %xmm0, %eax -; AVX512-NEXT: andl $31, %eax -; AVX512-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vpextrw $2, %xmm0, %eax -; AVX512-NEXT: andl $7, %eax -; AVX512-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vpextrw $3, %xmm0, %eax -; AVX512-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; AVX512-NEXT: shrl $22, %ecx -; AVX512-NEXT: imull $95, %ecx, %ecx -; AVX512-NEXT: subl %ecx, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1024,2048,8192,44151,u,u,u,u] +; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [64,32,8,95,u,u,u,u] +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 @@ -245,98 +175,58 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) { ; Don't fold if the divisor is one. define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) { -; SSE-LABEL: dont_fold_urem_one: -; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; SSE-NEXT: shrl $16, %ecx -; SSE-NEXT: movl %eax, %edx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: movzwl %dx, %edx -; SSE-NEXT: shrl %edx -; SSE-NEXT: addl %ecx, %edx -; SSE-NEXT: shrl $4, %edx -; SSE-NEXT: leal (%rdx,%rdx,2), %ecx -; SSE-NEXT: shll $3, %ecx -; SSE-NEXT: subl %ecx, %edx -; SSE-NEXT: addl %eax, %edx -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; SSE-NEXT: shrl $25, %ecx -; SSE-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pinsrw $1, %eax, %xmm1 -; SSE-NEXT: pinsrw $2, %edx, %xmm1 -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; SSE-NEXT: shrl $26, %ecx -; SSE-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; SSE-NEXT: subl %ecx, %eax -; SSE-NEXT: pinsrw $3, %eax, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: dont_fold_urem_one: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pandn %xmm0, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm2 = [0,51307,25645,12375,0,0,0,0] +; SSE2-NEXT: pmulhuw %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubw %xmm2, %xmm3 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [u,0,32768,0,u,u,u,u] +; SSE2-NEXT: paddw %xmm2, %xmm3 +; SSE2-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [0,0,128,0,0,16,64,0,u,u,u,u,u,u,u,u] +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [1,654,23,5423,u,u,u,u] +; SSE2-NEXT: psubw %xmm3, %xmm0 +; SSE2-NEXT: retq +; +; SSE4-LABEL: dont_fold_urem_one: +; SSE4: # %bb.0: +; SSE4-NEXT: movq {{.*#+}} xmm1 = [0,51307,25645,12375,0,0,0,0] +; SSE4-NEXT: pmulhuw %xmm0, %xmm1 +; SSE4-NEXT: movdqa %xmm0, %xmm2 +; SSE4-NEXT: psubw %xmm1, %xmm2 +; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,0,32768,0,u,u,u,u] +; SSE4-NEXT: paddw %xmm1, %xmm2 +; SSE4-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,128,4096,64,u,u,u,u] +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,654,23,5423,u,u,u,u] +; SSE4-NEXT: psubw %xmm2, %xmm0 +; SSE4-NEXT: retq ; ; AVX1OR2-LABEL: dont_fold_urem_one: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpextrw $2, %xmm0, %eax -; AVX1OR2-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; AVX1OR2-NEXT: shrl $16, %ecx -; AVX1OR2-NEXT: movl %eax, %edx -; AVX1OR2-NEXT: subl %ecx, %edx -; AVX1OR2-NEXT: movzwl %dx, %edx -; AVX1OR2-NEXT: shrl %edx -; AVX1OR2-NEXT: addl %ecx, %edx -; AVX1OR2-NEXT: shrl $4, %edx -; AVX1OR2-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX1OR2-NEXT: shll $3, %ecx -; AVX1OR2-NEXT: subl %ecx, %edx -; AVX1OR2-NEXT: addl %eax, %edx -; AVX1OR2-NEXT: vpextrw $1, %xmm0, %eax -; AVX1OR2-NEXT: imull $51307, %eax, %ecx # imm = 0xC86B -; AVX1OR2-NEXT: shrl $25, %ecx -; AVX1OR2-NEXT: imull $654, %ecx, %ecx # imm = 0x28E -; AVX1OR2-NEXT: subl %ecx, %eax -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpextrw $3, %xmm0, %eax -; AVX1OR2-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; AVX1OR2-NEXT: shrl $26, %ecx -; AVX1OR2-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX1OR2-NEXT: subl %ecx, %eax -; AVX1OR2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,51307,25645,12375,u,u,u,u] +; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,0,32768,0,u,u,u,u] +; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,128,4096,64,u,u,u,u] +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u] +; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: dont_fold_urem_one: ; AVX512: # %bb.0: -; AVX512-NEXT: vpextrw $2, %xmm0, %eax -; AVX512-NEXT: imull $25645, %eax, %ecx # imm = 0x642D -; AVX512-NEXT: shrl $16, %ecx -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: subl %ecx, %edx -; AVX512-NEXT: movzwl %dx, %edx -; AVX512-NEXT: shrl %edx -; AVX512-NEXT: addl %ecx, %edx -; AVX512-NEXT: shrl $4, %edx -; AVX512-NEXT: leal (%rdx,%rdx,2), %ecx -; AVX512-NEXT: shll $3, %ecx -; AVX512-NEXT: subl %ecx, %edx -; AVX512-NEXT: vpextrw $1, %xmm0, %ecx -; AVX512-NEXT: addl %eax, %edx -; AVX512-NEXT: imull $51307, %ecx, %eax # imm = 0xC86B -; AVX512-NEXT: shrl $25, %eax -; AVX512-NEXT: imull $654, %eax, %eax # imm = 0x28E -; AVX512-NEXT: subl %eax, %ecx -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpinsrw $1, %ecx, %xmm1, %xmm1 -; AVX512-NEXT: vpinsrw $2, %edx, %xmm1, %xmm1 -; AVX512-NEXT: vpextrw $3, %xmm0, %eax -; AVX512-NEXT: imull $12375, %eax, %ecx # imm = 0x3057 -; AVX512-NEXT: shrl $26, %ecx -; AVX512-NEXT: imull $5423, %ecx, %ecx # imm = 0x152F -; AVX512-NEXT: subl %ecx, %eax -; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,51307,25645,12375,u,u,u,u] +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,0,32768,0,u,u,u,u] +; AVX512-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,654,23,5423,u,u,u,u] +; AVX512-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll index 10a840218c86..1d9977e6d628 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-v2i32.ll @@ -6,7 +6,7 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind { ; X64-LABEL: test_udiv7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u] ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: pmuludq %xmm1, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] @@ -26,7 +26,7 @@ define void @test_udiv7_v2i32(ptr %x, ptr %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u] ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: pmuludq %xmm1, %xmm2 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] @@ -51,7 +51,7 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind { ; X64-LABEL: test_urem7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X64-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u] ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: pmuludq %xmm1, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] @@ -76,7 +76,7 @@ define void @test_urem7_v2i32(ptr %x, ptr %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] +; X86-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,u,u] ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: pmuludq %xmm1, %xmm2 ; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] @@ -106,7 +106,7 @@ define void @test_sdiv7_v2i32(ptr %x, ptr %y) nounwind { ; X64-LABEL: test_sdiv7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,u,u] ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: pmuludq %xmm1, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] @@ -132,7 +132,7 @@ define void @test_sdiv7_v2i32(ptr %x, ptr %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,u,u] ; X86-NEXT: movdqa %xmm1, %xmm0 ; X86-NEXT: pmuludq %xmm2, %xmm0 ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] @@ -163,7 +163,7 @@ define void @test_srem7_v2i32(ptr %x, ptr %y) nounwind { ; X64-LABEL: test_srem7_v2i32: ; X64: # %bb.0: ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] +; X64-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,u,u] ; X64-NEXT: movdqa %xmm0, %xmm2 ; X64-NEXT: pmuludq %xmm1, %xmm2 ; X64-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] @@ -193,7 +193,7 @@ define void @test_srem7_v2i32(ptr %x, ptr %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; X86-NEXT: movdqa {{.*#+}} xmm2 = [2454267027,2454267027,u,u] ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]