
; From D151535: when AVX512 is available, the LHS operand of a select can be
; folded into a masked instruction while the RHS operand cannot. The lowering
; therefore commutes the select's LHS and RHS to create the folding
; opportunity. https://reviews.llvm.org/D151535
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
; Masked sub over <8 x i64>: the select keeps %src where bit 51 of the loaded
; value is set, else the sub result. The mask folds into vpsubq's {%k1}
; write-mask, so no separate blend instruction is needed.
define dso_local <8 x i64> @select_sub(<8 x i64> %src, <8 x i64> %a, <8 x i64> %b, ptr %ptr) {
; AVX512-LABEL: select_sub:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT:    vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT:    vpsubq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
  %0 = load <8 x i64>, ptr %arrayidx, align 64
  ; 2251799813685248 = 1 << 51 (single test bit per lane)
  %and1 = and <8 x i64> %0, <i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248>
  %not = icmp ne <8 x i64> %and1, zeroinitializer
  %sub = sub <8 x i64> %a, %b
  %1 = select <8 x i1> %not, <8 x i64> %src, <8 x i64> %sub
  ret <8 x i64> %1
}

; Masked add over <8 x i64>: same pattern as select_sub — the mask folds into
; vpaddq's {%k1} write-mask instead of requiring a separate blend.
define dso_local <8 x i64> @select_add(<8 x i64> %src, <8 x i64> %a, <8 x i64> %b, ptr %ptr) {
; AVX512-LABEL: select_add:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT:    vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
  %0 = load <8 x i64>, ptr %arrayidx, align 64
  ; 2251799813685248 = 1 << 51 (single test bit per lane)
  %and1 = and <8 x i64> %0, <i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248>
  %not = icmp ne <8 x i64> %and1, zeroinitializer
  %add = add <8 x i64> %a, %b
  %1 = select <8 x i1> %not, <8 x i64> %src, <8 x i64> %add
  ret <8 x i64> %1
}

; Masked and over <8 x i64>: the mask folds into vpandq's {%k1} write-mask.
define dso_local <8 x i64> @select_and(<8 x i64> %src, <8 x i64> %a, <8 x i64> %b, ptr %ptr) {
; AVX512-LABEL: select_and:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT:    vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
  %0 = load <8 x i64>, ptr %arrayidx, align 64
  ; 2251799813685248 = 1 << 51 (single test bit per lane)
  %and1 = and <8 x i64> %0, <i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248>
  %not = icmp ne <8 x i64> %and1, zeroinitializer
  %and = and <8 x i64> %a, %b
  %1 = select <8 x i1> %not, <8 x i64> %src, <8 x i64> %and
  ret <8 x i64> %1
}

; Masked xor over <8 x i64>: the mask folds into vpxorq's {%k1} write-mask.
define dso_local <8 x i64> @select_xor(<8 x i64> %src, <8 x i64> %a, <8 x i64> %b, ptr %ptr) {
; AVX512-LABEL: select_xor:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT:    vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT:    vpxorq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
  %0 = load <8 x i64>, ptr %arrayidx, align 64
  ; 2251799813685248 = 1 << 51 (single test bit per lane)
  %and1 = and <8 x i64> %0, <i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248>
  %not = icmp ne <8 x i64> %and1, zeroinitializer
  %xor = xor <8 x i64> %a, %b
  %1 = select <8 x i1> %not, <8 x i64> %src, <8 x i64> %xor
  ret <8 x i64> %1
}

; Masked variable shift-left over <8 x i64>: the mask folds into vpsllvq's
; {%k1} write-mask.
define dso_local <8 x i64> @select_shl(<8 x i64> %src, <8 x i64> %a, <8 x i64> %b, ptr %ptr) {
; AVX512-LABEL: select_shl:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT:    vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT:    vpsllvq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
  %0 = load <8 x i64>, ptr %arrayidx, align 64
  ; 2251799813685248 = 1 << 51 (single test bit per lane)
  %and1 = and <8 x i64> %0, <i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248>
  %not = icmp ne <8 x i64> %and1, zeroinitializer
  %shl = shl <8 x i64> %a, %b
  %1 = select <8 x i1> %not, <8 x i64> %src, <8 x i64> %shl
  ret <8 x i64> %1
}

; Masked variable logical shift-right over <8 x i64>: the mask folds into
; vpsrlvq's {%k1} write-mask.
define dso_local <8 x i64> @select_srl(<8 x i64> %src, <8 x i64> %a, <8 x i64> %b, ptr %ptr) {
; AVX512-LABEL: select_srl:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT:    vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT:    vpsrlvq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
  %0 = load <8 x i64>, ptr %arrayidx, align 64
  ; 2251799813685248 = 1 << 51 (single test bit per lane)
  %and1 = and <8 x i64> %0, <i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248>
  %not = icmp ne <8 x i64> %and1, zeroinitializer
  %srl = lshr <8 x i64> %a, %b
  %1 = select <8 x i1> %not, <8 x i64> %src, <8 x i64> %srl
  ret <8 x i64> %1
}

; Masked variable arithmetic shift-right over <8 x i64>: the mask folds into
; vpsravq's {%k1} write-mask.
define dso_local <8 x i64> @select_sra(<8 x i64> %src, <8 x i64> %a, <8 x i64> %b, ptr %ptr) {
; AVX512-LABEL: select_sra:
; AVX512:       # %bb.0: # %entry
; AVX512-NEXT:    vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT:    vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1
; AVX512-NEXT:    vpsravq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1
  %0 = load <8 x i64>, ptr %arrayidx, align 64
  ; 2251799813685248 = 1 << 51 (single test bit per lane)
  %and1 = and <8 x i64> %0, <i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248, i64 2251799813685248>
  %not = icmp ne <8 x i64> %and1, zeroinitializer
  %sra = ashr <8 x i64> %a, %b
  %1 = select <8 x i1> %not, <8 x i64> %src, <8 x i64> %sra
  ret <8 x i64> %1
}

; Masked multiply over <8 x i32> (ymm width): with AVX512VL the mask folds
; directly into vpmulld's {%k1} write-mask; plain AVX512F has no 256-bit
; masked ops, so it widens to zmm and uses a masked vmovdqa32 blend.
define dso_local <8 x i32> @select_mul(<8 x i32> %src, <8 x i32> %a, <8 x i32> %b, ptr %ptr) {
; AVX512F-LABEL: select_mul:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: select_mul:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512VL-NEXT:    vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT:    vpmulld %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1
  %0 = load <8 x i32>, ptr %arrayidx, align 64
  %and1 = and <8 x i32> %0, <i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517>
  %not = icmp ne <8 x i32> %and1, zeroinitializer
  %mul = mul <8 x i32> %a, %b
  %1 = select <8 x i1> %not, <8 x i32> %src, <8 x i32> %mul
  ret <8 x i32> %1
}

; Masked signed max over <8 x i32>: AVX512VL folds the mask into vpmaxsd's
; {%k1} write-mask; AVX512F widens to zmm and blends via masked vmovdqa32.
define dso_local <8 x i32> @select_smax(<8 x i32> %src, <8 x i32> %a, <8 x i32> %b, ptr %ptr) {
; AVX512F-LABEL: select_smax:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT:    vpmaxsd %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: select_smax:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512VL-NEXT:    vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT:    vpmaxsd %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1
  %0 = load <8 x i32>, ptr %arrayidx, align 64
  %and1 = and <8 x i32> %0, <i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517>
  %not = icmp ne <8 x i32> %and1, zeroinitializer
  ; NOTE(review): intrinsic name says .v4i32 but operands are <8 x i32>; the
  ; IR parser auto-remangles mismatched suffixes — confirm against upstream.
  %smax = call <8 x i32> @llvm.smax.v4i32(<8 x i32> %a, <8 x i32> %b)
  %1 = select <8 x i1> %not, <8 x i32> %src, <8 x i32> %smax
  ret <8 x i32> %1
}

declare <8 x i32> @llvm.smax.v4i32(<8 x i32> %a, <8 x i32> %b)

; Masked signed min over <8 x i32>: AVX512VL folds the mask into vpminsd's
; {%k1} write-mask; AVX512F widens to zmm and blends via masked vmovdqa32.
define dso_local <8 x i32> @select_smin(<8 x i32> %src, <8 x i32> %a, <8 x i32> %b, ptr %ptr) {
; AVX512F-LABEL: select_smin:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT:    vpminsd %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: select_smin:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512VL-NEXT:    vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT:    vpminsd %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1
  %0 = load <8 x i32>, ptr %arrayidx, align 64
  %and1 = and <8 x i32> %0, <i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517>
  %not = icmp ne <8 x i32> %and1, zeroinitializer
  %smin = call <8 x i32> @llvm.smin.v4i32(<8 x i32> %a, <8 x i32> %b)
  %1 = select <8 x i1> %not, <8 x i32> %src, <8 x i32> %smin
  ret <8 x i32> %1
}

declare <8 x i32> @llvm.smin.v4i32(<8 x i32> %a, <8 x i32> %b)

; Masked unsigned max over <8 x i32>: AVX512VL folds the mask into vpmaxud's
; {%k1} write-mask; AVX512F widens to zmm and blends via masked vmovdqa32.
define dso_local <8 x i32> @select_umax(<8 x i32> %src, <8 x i32> %a, <8 x i32> %b, ptr %ptr) {
; AVX512F-LABEL: select_umax:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT:    vpmaxud %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: select_umax:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512VL-NEXT:    vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT:    vpmaxud %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1
  %0 = load <8 x i32>, ptr %arrayidx, align 64
  %and1 = and <8 x i32> %0, <i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517>
  %not = icmp ne <8 x i32> %and1, zeroinitializer
  %umax = call <8 x i32> @llvm.umax.v4i32(<8 x i32> %a, <8 x i32> %b)
  %1 = select <8 x i1> %not, <8 x i32> %src, <8 x i32> %umax
  ret <8 x i32> %1
}

declare <8 x i32> @llvm.umax.v4i32(<8 x i32> %a, <8 x i32> %b)

; Masked unsigned min over <8 x i32>: AVX512VL folds the mask into vpminud's
; {%k1} write-mask; AVX512F widens to zmm and blends via masked vmovdqa32.
define dso_local <8 x i32> @select_umin(<8 x i32> %src, <8 x i32> %a, <8 x i32> %b, ptr %ptr) {
; AVX512F-LABEL: select_umin:
; AVX512F:       # %bb.0: # %entry
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512F-NEXT:    vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1
; AVX512F-NEXT:    vpminud %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: select_umin:
; AVX512VL:       # %bb.0: # %entry
; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm3
; AVX512VL-NEXT:    vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1
; AVX512VL-NEXT:    vpminud %ymm2, %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT:    retq
entry:
  %arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1
  %0 = load <8 x i32>, ptr %arrayidx, align 64
  %and1 = and <8 x i32> %0, <i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517, i32 22517>
  %not = icmp ne <8 x i32> %and1, zeroinitializer
  %umin = call <8 x i32> @llvm.umin.v4i32(<8 x i32> %a, <8 x i32> %b)
  %1 = select <8 x i1> %not, <8 x i32> %src, <8 x i32> %umin
  ret <8 x i32> %1
}

declare <8 x i32> @llvm.umin.v4i32(<8 x i32> %a, <8 x i32> %b)