
Similar to InstCombinerImpl::freezeOtherUses, attempt to ensure that multiple frozen/unfrozen uses of an SDValue are merged onto a single FREEZE node. This fixes a number of hasOneUse() problems when trying to push FREEZE nodes through the DAG.

Remove the SimplifyMultipleUseDemandedBits handling of FREEZE nodes, as we now want to keep the common node rather than bypass it for some users just because of DemandedElts.

Fixes #149799
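To illustrate the canonicalization, here is a hand-written IR sketch (not taken from the patch or its tests; the function names `@mixed_uses`/`@merged_uses` are made up). Once `freeze %x` exists, the remaining unfrozen uses of `%x` can be re-pointed at the frozen value; this is a sound refinement, and it leaves one common node for later combines to inspect:

```llvm
; Sketch only: %x has one frozen and one unfrozen use, so combines that
; guard on hasOneUse() of the FREEZE input give up.
define <4 x i32> @mixed_uses(<4 x i32> %x, <4 x i32> %y) {
  %f = freeze <4 x i32> %x
  %a = add <4 x i32> %f, %y
  %b = mul <4 x i32> %x, %y     ; unfrozen use of %x
  %r = xor <4 x i32> %a, %b
  ret <4 x i32> %r
}

; After merging, every user goes through the single frozen value,
; mirroring what InstCombinerImpl::freezeOtherUses does at the IR level.
define <4 x i32> @merged_uses(<4 x i32> %x, <4 x i32> %y) {
  %f = freeze <4 x i32> %x
  %a = add <4 x i32> %f, %y
  %b = mul <4 x i32> %f, %y     ; re-pointed at the frozen value
  %r = xor <4 x i32> %a, %b
  ret <4 x i32> %r
}
```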
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s

define <4 x i32> @hadd_select_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: hadd_select_v4i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vphaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; CHECK-NEXT:    vpmaxud %xmm1, %xmm0, %xmm1
; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
entry:
  %and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
  %and2 = and <4 x i32> %y, <i32 3, i32 3, i32 3, i32 3>
  %hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
  %cond = icmp ule <4 x i32> %hadd, <i32 8, i32 8, i32 8, i32 8>
  %ret = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %hadd
  ret <4 x i32> %ret
}

define <8 x i8> @hadd_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: hadd_trunc_v8i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vpand %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vphaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %and2 = and <8 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %and1, <8 x i16> %and2)
  %conv = trunc <8 x i16> %hadd to <8 x i8>
  ret <8 x i8> %conv
}

define <8 x i16> @hadd_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_trunc_v8i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
  %conv = trunc <8 x i32> %hadd to <8 x i16>
  ret <8 x i16> %conv
}

define <16 x i8> @hadd_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
; CHECK-LABEL: hadd_trunc_v16i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm2, %ymm1, %ymm1
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %and2 = and <16 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  %hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %and1, <16 x i16> %and2)
  %conv = trunc <16 x i16> %hadd to <16 x i8>
  ret <16 x i8> %conv
}

define <4 x i32> @hsub_select_shl_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: hsub_select_shl_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [65535,65535,65535,65535]
; CHECK-NEXT:    vpor %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vpor %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vphsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vpslld $16, %xmm0, %xmm1
; CHECK-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9]
; CHECK-NEXT:    vpmaxud %xmm2, %xmm1, %xmm2
; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    retq
  %or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
  %or2 = or <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
  %hsub = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %or1, <4 x i32> %or2)
  %shl = shl <4 x i32> %hsub, <i32 16, i32 16, i32 16, i32 16>
  %cond = icmp ule <4 x i32> %shl, <i32 8, i32 8, i32 8, i32 8>
  %ret = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %hsub
  ret <4 x i32> %ret
}

define <8 x i8> @hsub_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: hsub_trunc_v8i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %or2 = or <8 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %or1, <8 x i16> %or2)
  %conv = trunc <8 x i16> %hsub to <8 x i8>
  ret <8 x i8> %conv
}

define <8 x i16> @hsub_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hsub_trunc_v8i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %or2 = or <8 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %hsub = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %or1, <8 x i32> %or2)
  %conv = trunc <8 x i32> %hsub to <8 x i16>
  ret <8 x i16> %conv
}

define <16 x i8> @hsub_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
; CHECK-LABEL: hsub_trunc_v16i16:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
entry:
  %or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %or2 = or <16 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %or1, <16 x i16> %or2)
  %conv = trunc <16 x i16> %hsub to <16 x i8>
  ret <16 x i8> %conv
}

define <8 x i16> @hadd_extract_2st_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_2st_trunc_v8i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i32> %x, <i32 -1, i32 -1, i32 3, i32 3, i32 -1, i32 -1, i32 -1, i32 -1>
  %and2 = and <8 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
  %andr = and <8 x i32> %hadd, <i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %conv = trunc <8 x i32> %andr to <8 x i16>
  ret <8 x i16> %conv
}

define <8 x i16> @hadd_extract_8th_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_8th_trunc_v8i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %and2 = and <8 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 3, i32 3>
  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
  %andr = and <8 x i32> %hadd, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 -1>
  %conv = trunc <8 x i32> %andr to <8 x i16>
  ret <8 x i16> %conv
}

define <8 x i16> @hadd_extract_2st_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_2st_trunc_redundant_and_v4i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT:    vphaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 -1, i32 -1, i32 3, i32 3, i32 3, i32 3>
  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
  %andr = and <8 x i32> %hadd, <i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  %conv = trunc <8 x i32> %andr to <8 x i16>
  ret <8 x i16> %conv
}

define <8 x i16> @hadd_extract_4th_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_4th_trunc_redundant_and_v4i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT:    vpand %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  %and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 -1, i32 -1>
  %hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
  %andr = and <8 x i32> %hadd, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 -1>
  %conv = trunc <8 x i32> %andr to <8 x i16>
  ret <8 x i16> %conv
}