llvm-project/llvm/test/CodeGen/X86/knownbits-hadd-hsub.ll
Simon Pilgrim d561259a08
[DAG] visitFREEZE - replace multiple frozen/unfrozen uses of an SDValue with just the frozen node (#150017)
Similar to InstCombinerImpl::freezeOtherUses, attempt to ensure that we
merge multiple frozen/unfrozen uses of an SDValue. This fixes a number of
hasOneUse() problems when trying to push FREEZE nodes through the DAG.

Remove the SimplifyMultipleUseDemandedBits handling of FREEZE nodes, as we
now want to keep the common node rather than bypass it for some uses just
because of DemandedElts.

Fixes #149799
2025-08-05 09:24:09 +01:00
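
A minimal IR analogy of the combine (illustrative only; the actual change is
in SelectionDAG's visitFREEZE, not in IR). Given a value with both frozen and
unfrozen uses:

  %f = freeze <4 x i32> %x
  %a = add <4 x i32> %f, %x    ; %x reaches %a both frozen and unfrozen

the remaining uses of %x are redirected to the frozen node, so later combines
no longer fail their hasOneUse() checks:

  %a = add <4 x i32> %f, %f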

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s
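
; Each input lane is masked to [0,3], so every phaddd lane is in [0,6] and the
; unsigned ule-8 compare is always true.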
define <4 x i32> @hadd_select_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: hadd_select_v4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,3,3,3]
; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9]
; CHECK-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
entry:
%and1 = and <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
%and2 = and <4 x i32> %y, <i32 3, i32 3, i32 3, i32 3>
%hadd = tail call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %and1, <4 x i32> %and2)
%cond = icmp ule <4 x i32> %hadd, <i32 8, i32 8, i32 8, i32 8>
%ret = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %hadd
ret <4 x i32> %ret
}
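
; Lanes masked to [0,3]: each phaddw sum is at most 6, so the trunc to i8 is
; lossless and lowers to a single pack.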
define <8 x i8> @hadd_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: hadd_trunc_v8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
%and1 = and <8 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
%and2 = and <8 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
%hadd = tail call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %and1, <8 x i16> %and2)
%conv = trunc <8 x i16> %hadd to <8 x i8>
ret <8 x i8> %conv
}
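
; Lanes masked to [0,3]: each phaddd sum fits in 16 bits, so the trunc to i16
; lowers to a pack of the two 128-bit halves.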
define <8 x i16> @hadd_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_trunc_v8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vpand %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
%conv = trunc <8 x i32> %hadd to <8 x i16>
ret <8 x i16> %conv
}
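
; 256-bit variant of hadd_trunc_v8i16: sums of [0,3] lanes fit in 8 bits, so
; the trunc to i8 is a pack.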
define <16 x i8> @hadd_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
; CHECK-LABEL: hadd_trunc_v16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
; CHECK-NEXT: vpand %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vpand %ymm2, %ymm1, %ymm1
; CHECK-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
; CHECK-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%and1 = and <16 x i16> %x, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
%and2 = and <16 x i16> %y, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
%hadd = tail call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %and1, <16 x i16> %and2)
%conv = trunc <16 x i16> %hadd to <16 x i8>
ret <16 x i8> %conv
}
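
; The low 16 bits of every input lane are forced to ones, so each phsubd lane
; has zero low 16 bits and the shl-by-16 result is known zero.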
define <4 x i32> @hsub_select_shl_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: hsub_select_shl_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [65535,65535,65535,65535]
; CHECK-NEXT: vpor %xmm2, %xmm0, %xmm0
; CHECK-NEXT: vpor %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpslld $16, %xmm0, %xmm1
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm2 = [9,9,9,9]
; CHECK-NEXT: vpmaxud %xmm2, %xmm1, %xmm2
; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
; CHECK-NEXT: vpand %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
%or1 = or <4 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535>
%or2 = or <4 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535>
%hsub = tail call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %or1, <4 x i32> %or2)
%shl = shl <4 x i32> %hsub, <i32 16, i32 16, i32 16, i32 16>
%cond = icmp ule <4 x i32> %shl, <i32 8, i32 8, i32 8, i32 8>
%ret = select <4 x i1> %cond, <4 x i32> zeroinitializer, <4 x i32> %hsub
ret <4 x i32> %ret
}
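
; Both inputs have their low byte forced to ones, so each phsubw lane has a
; zero low byte and the trunc to i8 folds to zero.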
define <8 x i8> @hsub_trunc_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: hsub_trunc_v8i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
%or1 = or <8 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%or2 = or <8 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%hsub = tail call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %or1, <8 x i16> %or2)
%conv = trunc <8 x i16> %hsub to <8 x i8>
ret <8 x i8> %conv
}
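
; Low 16 bits of both inputs are forced to ones, so each phsubd lane has zero
; low 16 bits and the trunc to i16 folds to zero.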
define <8 x i16> @hsub_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hsub_trunc_v8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
%or1 = or <8 x i32> %x, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
%or2 = or <8 x i32> %y, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
%hsub = tail call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %or1, <8 x i32> %or2)
%conv = trunc <8 x i32> %hsub to <8 x i16>
ret <8 x i16> %conv
}
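
; 256-bit variant of hsub_trunc_v8i16: the trunc to i8 folds to zero.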
define <16 x i8> @hsub_trunc_v16i16(<16 x i16> %x, <16 x i16> %y) {
; CHECK-LABEL: hsub_trunc_v16i16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
%or1 = or <16 x i16> %x, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%or2 = or <16 x i16> %y, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
%hsub = tail call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %or1, <16 x i16> %or2)
%conv = trunc <16 x i16> %hsub to <16 x i8>
ret <16 x i8> %conv
}
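
; Only result lane 1 is demanded (x[2] + x[3], both masked to [0,3]), so the
; hadd narrows to a single 128-bit op on x alone.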
define <8 x i16> @hadd_extract_2st_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_2st_trunc_v8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%and1 = and <8 x i32> %x, <i32 -1, i32 -1, i32 3, i32 3, i32 -1, i32 -1, i32 -1, i32 -1>
%and2 = and <8 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
%andr = and <8 x i32> %hadd, <i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%conv = trunc <8 x i32> %andr to <8 x i16>
ret <8 x i16> %conv
}
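
; Only result lane 7 is demanded (y[6] + y[7], both masked to [0,3]), so only
; y needs masking; x passes through unmasked.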
define <8 x i16> @hadd_extract_8th_trunc_v8i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_8th_trunc_v8i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%and1 = and <8 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%and2 = and <8 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 3, i32 3>
%hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
%andr = and <8 x i32> %hadd, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 -1>
%conv = trunc <8 x i32> %andr to <8 x i16>
ret <8 x i16> %conv
}
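
; As hadd_extract_2st_trunc_v8i32, but the and is all-ones on the demanded
; elements (x[2], x[3]); the [0,3] masks only cover lanes that are never
; demanded.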
define <8 x i16> @hadd_extract_2st_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_2st_trunc_redundant_and_v4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; CHECK-NEXT: vphaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%and1 = and <8 x i32> %x, <i32 3, i32 3, i32 -1, i32 -1, i32 3, i32 3, i32 3, i32 3>
%and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
%andr = and <8 x i32> %hadd, <i32 0, i32 -1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
%conv = trunc <8 x i32> %andr to <8 x i16>
ret <8 x i16> %conv
}
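
; Only result lane 7 is demanded (y[6] + y[7], unmasked); the [0,3] masks on x
; and the low y elements are redundant for the extracted result.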
define <8 x i16> @hadd_extract_4th_trunc_redundant_and_v4i32(<8 x i32> %x, <8 x i32> %y) {
; CHECK-LABEL: hadd_extract_4th_trunc_redundant_and_v4i32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3]
; CHECK-NEXT: vpand %ymm2, %ymm0, %ymm0
; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; CHECK-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%and1 = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%and2 = and <8 x i32> %y, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 -1, i32 -1>
%hadd = tail call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %and1, <8 x i32> %and2)
%andr = and <8 x i32> %hadd, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 -1>
%conv = trunc <8 x i32> %andr to <8 x i16>
ret <8 x i16> %conv
}