[X86][AVX512] rematerialize smaller predicate masks (#166178)
Resolves #165752
This commit is contained in:
parent
938f521e40
commit
83ef17dbe8
@ -3161,6 +3161,12 @@ multiclass avx512_mask_setop_w<SDPatternOperator Val> {
|
||||
defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
|
||||
defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
|
||||
|
||||
// 8-bit mask set operations for AVX512DQ
|
||||
let Predicates = [HasDQI] in {
|
||||
defm KSET0B : avx512_mask_setop<VK8, v8i1, immAllZerosV>;
|
||||
defm KSET1B : avx512_mask_setop<VK8, v8i1, immAllOnesV>;
|
||||
}
|
||||
|
||||
// With AVX-512 only, an 8-bit mask is promoted to a 16-bit mask.
|
||||
let Predicates = [HasAVX512] in {
|
||||
def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
|
||||
@ -3173,6 +3179,34 @@ let Predicates = [HasAVX512] in {
|
||||
def : Pat<(v1i1 immAllOnesV), (COPY_TO_REGCLASS (KSET1W), VK1)>;
|
||||
}
|
||||
|
||||
// With AVX512DQ, use 8-bit operations for 8-bit masks to avoid setting upper
|
||||
// bits
|
||||
let Predicates = [HasDQI] in {
|
||||
def : Pat<(v8i1 immAllZerosV), (KSET0B)>;
|
||||
def : Pat<(v8i1 immAllOnesV), (KSET1B)>;
|
||||
}
|
||||
|
||||
// Optimize bitconvert of all-ones constants (and of the v16i1 constant with only the low 8 bits set) to use kxnor instructions
|
||||
let Predicates = [HasDQI] in {
|
||||
def : Pat<(v8i1(bitconvert(i8 255))), (KSET1B)>;
|
||||
def : Pat<(v16i1(bitconvert(i16 255))), (COPY_TO_REGCLASS(KSET1B), VK16)>;
|
||||
}
|
||||
let Predicates = [HasBWI] in {
|
||||
def : Pat<(v32i1(bitconvert(i32 -1))), (KSET1D)>;
|
||||
def : Pat<(v64i1(bitconvert(i64 -1))), (KSET1Q)>;
|
||||
}
|
||||
// Submask patterns: lower N bits set in larger mask registers
|
||||
let Predicates = [HasBWI, HasDQI] in {
|
||||
// v32i1 submasks
|
||||
def : Pat<(v32i1(bitconvert(i32 255))), (COPY_TO_REGCLASS(KSET1B), VK32)>;
|
||||
def : Pat<(v32i1(bitconvert(i32 65535))), (COPY_TO_REGCLASS(KSET1W), VK32)>;
|
||||
// v64i1 submasks
|
||||
def : Pat<(v64i1(bitconvert(i64 255))), (COPY_TO_REGCLASS(KSET1B), VK64)>;
|
||||
def : Pat<(v64i1(bitconvert(i64 65535))), (COPY_TO_REGCLASS(KSET1W), VK64)>;
|
||||
def : Pat<(v64i1(bitconvert(i64 4294967295))), (COPY_TO_REGCLASS(KSET1D),
|
||||
VK64)>;
|
||||
}
|
||||
|
||||
// Patterns for kmask insert_subvector/extract_subvector to/from index=0
|
||||
multiclass operation_subvector_mask_lowering<RegisterClass subRC, ValueType subVT,
|
||||
RegisterClass RC, ValueType VT> {
|
||||
|
||||
@ -788,9 +788,11 @@ bool X86InstrInfo::isReMaterializableImpl(
|
||||
case X86::FsFLD0SS:
|
||||
case X86::FsFLD0SH:
|
||||
case X86::FsFLD0F128:
|
||||
case X86::KSET0B:
|
||||
case X86::KSET0D:
|
||||
case X86::KSET0Q:
|
||||
case X86::KSET0W:
|
||||
case X86::KSET1B:
|
||||
case X86::KSET1D:
|
||||
case X86::KSET1Q:
|
||||
case X86::KSET1W:
|
||||
@ -6352,12 +6354,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
|
||||
// registers, since it is not usable as a write mask.
|
||||
// FIXME: A more advanced approach would be to choose the best input mask
|
||||
// register based on context.
|
||||
case X86::KSET0B:
|
||||
return Expand2AddrKreg(MIB, get(X86::KXORBkk), X86::K0);
|
||||
case X86::KSET0W:
|
||||
return Expand2AddrKreg(MIB, get(X86::KXORWkk), X86::K0);
|
||||
case X86::KSET0D:
|
||||
return Expand2AddrKreg(MIB, get(X86::KXORDkk), X86::K0);
|
||||
case X86::KSET0Q:
|
||||
return Expand2AddrKreg(MIB, get(X86::KXORQkk), X86::K0);
|
||||
case X86::KSET1B:
|
||||
return Expand2AddrKreg(MIB, get(X86::KXNORBkk), X86::K0);
|
||||
case X86::KSET1W:
|
||||
return Expand2AddrKreg(MIB, get(X86::KXNORWkk), X86::K0);
|
||||
case X86::KSET1D:
|
||||
|
||||
@ -255,8 +255,8 @@ define void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf)
|
||||
; CHECK-LABEL: gather_qps:
|
||||
; CHECK: ## %bb.0:
|
||||
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k2
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k2
|
||||
; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
|
||||
; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
|
||||
; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
|
||||
@ -520,7 +520,7 @@ define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, ptr %x1,
|
||||
; CHECK: ## %bb.0:
|
||||
; CHECK-NEXT: kmovd %esi, %k1
|
||||
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
|
||||
; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
|
||||
@ -772,7 +772,7 @@ define void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, <
|
||||
; CHECK: ## %bb.0:
|
||||
; CHECK-NEXT: kmovd %esi, %k1
|
||||
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retq
|
||||
@ -788,7 +788,7 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, <
|
||||
; CHECK: ## %bb.0:
|
||||
; CHECK-NEXT: kmovd %esi, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retq
|
||||
@ -800,9 +800,9 @@ define void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, <
|
||||
define void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) {
|
||||
; CHECK-LABEL: scatter_mask_test:
|
||||
; CHECK: ## %bb.0:
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
|
||||
; CHECK-NEXT: kxorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
|
||||
; CHECK-NEXT: movb $1, %al
|
||||
; CHECK-NEXT: kmovd %eax, %k1
|
||||
|
||||
@ -251,9 +251,9 @@ define dso_local void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8
|
||||
define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) {
|
||||
; CHECK-LABEL: gather_qps:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k2
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k2
|
||||
; CHECK-NEXT: vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
|
||||
; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
|
||||
; CHECK-NEXT: vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
|
||||
@ -523,7 +523,7 @@ define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, ptr
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: kmovd %esi, %k1
|
||||
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
|
||||
; CHECK-NEXT: vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
|
||||
; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0
|
||||
@ -774,7 +774,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: kmovd %esi, %k1
|
||||
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retq
|
||||
@ -789,7 +789,7 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: kmovd %esi, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
|
||||
; CHECK-NEXT: vzeroupper
|
||||
; CHECK-NEXT: retq
|
||||
@ -802,9 +802,9 @@ define dso_local void@test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i
|
||||
define dso_local void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) {
|
||||
; CHECK-LABEL: scatter_mask_test:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
|
||||
; CHECK-NEXT: kxorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
|
||||
; CHECK-NEXT: movb $1, %al
|
||||
; CHECK-NEXT: kmovd %eax, %k1
|
||||
@ -856,7 +856,7 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %b
|
||||
define <8 x float> @gather_global(<8 x i64>, ptr nocapture readnone) {
|
||||
; CHECK-LABEL: gather_global:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; CHECK-NEXT: vgatherqps x(,%zmm0,4), %ymm1 {%k1}
|
||||
; CHECK-NEXT: vmovaps %ymm1, %ymm0
|
||||
|
||||
229
llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
Normal file
229
llvm/test/CodeGen/X86/avx512-mask-set-opt.ll
Normal file
@ -0,0 +1,229 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
|
||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512DQBW
|
||||
|
||||
declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>)
|
||||
declare <16 x float> @llvm.masked.expandload.v16f32(ptr, <16 x i1>, <16 x float>)
|
||||
declare <8 x float> @llvm.masked.expandload.v8f32(ptr, <8 x i1>, <8 x float>)
|
||||
declare <16 x i32> @llvm.masked.expandload.v16i32(ptr, <16 x i1>, <16 x i32>)
|
||||
|
||||
; Test case 1: Direct v8i1 all-ones mask (should use kxnorb on AVX512DQ)
|
||||
define <8 x float> @mask_v8i1_allones(ptr %ptr) {
|
||||
; AVX512F-LABEL: mask_v8i1_allones:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: movw $255, %ax
|
||||
; AVX512F-NEXT: kmovw %eax, %k1
|
||||
; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
|
||||
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512DQ-LABEL: mask_v8i1_allones:
|
||||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
|
||||
; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
|
||||
; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
|
||||
; AVX512DQ-NEXT: retq
|
||||
;
|
||||
; AVX512BW-LABEL: mask_v8i1_allones:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: movw $255, %ax
|
||||
; AVX512BW-NEXT: kmovd %eax, %k1
|
||||
; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
|
||||
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512DQBW-LABEL: mask_v8i1_allones:
|
||||
; AVX512DQBW: # %bb.0:
|
||||
; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
|
||||
; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
|
||||
; AVX512DQBW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
|
||||
; AVX512DQBW-NEXT: retq
|
||||
%res = call <8 x float> @llvm.masked.expandload.v8f32(ptr %ptr, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> zeroinitializer)
|
||||
ret <8 x float> %res
|
||||
}
|
||||
|
||||
; Test case 2: v16i1 with lower 8 bits set via bitconvert (should use kxnorb on AVX512DQ)
|
||||
define <16 x float> @mask_v16i1_lower8(ptr %ptr) {
|
||||
; AVX512F-LABEL: mask_v16i1_lower8:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: movw $255, %ax
|
||||
; AVX512F-NEXT: kmovw %eax, %k1
|
||||
; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512DQ-LABEL: mask_v16i1_lower8:
|
||||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
|
||||
; AVX512DQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
|
||||
; AVX512DQ-NEXT: retq
|
||||
;
|
||||
; AVX512BW-LABEL: mask_v16i1_lower8:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: movw $255, %ax
|
||||
; AVX512BW-NEXT: kmovd %eax, %k1
|
||||
; AVX512BW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512DQBW-LABEL: mask_v16i1_lower8:
|
||||
; AVX512DQBW: # %bb.0:
|
||||
; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
|
||||
; AVX512DQBW-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
|
||||
; AVX512DQBW-NEXT: retq
|
||||
%res = call <16 x float> @llvm.masked.expandload.v16f32(ptr %ptr, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> zeroinitializer)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
; Test case 3: v16i1 with all bits set (should use kxnorw on all targets)
|
||||
define <16 x float> @gather_all(ptr %base, <16 x i32> %ind, i16 %mask) {
|
||||
; AVX512-LABEL: gather_all:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: kxnorw %k0, %k0, %k1
|
||||
; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX512-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
|
||||
; AVX512-NEXT: vmovaps %zmm1, %zmm0
|
||||
; AVX512-NEXT: retq
|
||||
%broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
|
||||
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
|
||||
%sext_ind = sext <16 x i32> %ind to <16 x i64>
|
||||
%gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
|
||||
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> poison)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
; Test case 4: v16i1 with lower 8 bits set in gather (should use kxnorb on AVX512DQ targets)
|
||||
define <16 x float> @gather_lower(ptr %base, <16 x i32> %ind, i16 %mask) {
|
||||
; AVX512F-LABEL: gather_lower:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX512F-NEXT: movw $255, %ax
|
||||
; AVX512F-NEXT: kmovw %eax, %k1
|
||||
; AVX512F-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
|
||||
; AVX512F-NEXT: vmovaps %zmm1, %zmm0
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512DQ-LABEL: gather_lower:
|
||||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX512DQ-NEXT: kxnorb %k0, %k0, %k1
|
||||
; AVX512DQ-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
|
||||
; AVX512DQ-NEXT: vmovaps %zmm1, %zmm0
|
||||
; AVX512DQ-NEXT: retq
|
||||
;
|
||||
; AVX512BW-LABEL: gather_lower:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX512BW-NEXT: movw $255, %ax
|
||||
; AVX512BW-NEXT: kmovd %eax, %k1
|
||||
; AVX512BW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
|
||||
; AVX512BW-NEXT: vmovaps %zmm1, %zmm0
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512DQBW-LABEL: gather_lower:
|
||||
; AVX512DQBW: # %bb.0:
|
||||
; AVX512DQBW-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX512DQBW-NEXT: kxnorb %k0, %k0, %k1
|
||||
; AVX512DQBW-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
|
||||
; AVX512DQBW-NEXT: vmovaps %zmm1, %zmm0
|
||||
; AVX512DQBW-NEXT: retq
|
||||
%broadcast.splatinsert = insertelement <16 x ptr> poison, ptr %base, i32 0
|
||||
%broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> poison, <16 x i32> zeroinitializer
|
||||
%sext_ind = sext <16 x i32> %ind to <16 x i64>
|
||||
%gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind
|
||||
%res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> poison)
|
||||
ret <16 x float> %res
|
||||
}
|
||||
|
||||
; Test case 5: v32i1 mask via bitconvert combined with dynamic condition.
|
||||
; Ensures lower 16 lanes force the KSET1W path without folding into a shuffle.
|
||||
define <32 x i16> @mask_v32i1_lower16(<32 x i16> %a, <32 x i16> %b, <32 x i16> %c, <32 x i16> %d) {
|
||||
; AVX512F-LABEL: mask_v32i1_lower16:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
|
||||
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
|
||||
; AVX512F-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
|
||||
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
|
||||
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
|
||||
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512DQ-LABEL: mask_v32i1_lower16:
|
||||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
|
||||
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
|
||||
; AVX512DQ-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
|
||||
; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1
|
||||
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
|
||||
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
|
||||
; AVX512DQ-NEXT: retq
|
||||
;
|
||||
; AVX512BW-LABEL: mask_v32i1_lower16:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: movl $65535, %eax # imm = 0xFFFF
|
||||
; AVX512BW-NEXT: kmovd %eax, %k0
|
||||
; AVX512BW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1
|
||||
; AVX512BW-NEXT: kord %k0, %k1, %k1
|
||||
; AVX512BW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512DQBW-LABEL: mask_v32i1_lower16:
|
||||
; AVX512DQBW: # %bb.0:
|
||||
; AVX512DQBW-NEXT: kxnorw %k0, %k0, %k0
|
||||
; AVX512DQBW-NEXT: vpcmpgtw %zmm3, %zmm2, %k1
|
||||
; AVX512DQBW-NEXT: kord %k0, %k1, %k1
|
||||
; AVX512DQBW-NEXT: vpblendmw %zmm0, %zmm1, %zmm0 {%k1}
|
||||
; AVX512DQBW-NEXT: retq
|
||||
%mask0 = bitcast i32 65535 to <32 x i1>
|
||||
%mask1 = icmp sgt <32 x i16> %c, %d
|
||||
%mask = or <32 x i1> %mask0, %mask1
|
||||
%res = select <32 x i1> %mask, <32 x i16> %a, <32 x i16> %b
|
||||
ret <32 x i16> %res
|
||||
}
|
||||
|
||||
; Test case 6: v64i1 mask via bitconvert combined with dynamic condition.
|
||||
; Verifies the KSET1D submask pattern survives past SelectionDAG combines.
|
||||
define <64 x i8> @mask_v64i1_lower32(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
|
||||
; AVX512F-LABEL: mask_v64i1_lower32:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
|
||||
; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm2
|
||||
; AVX512F-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
|
||||
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
|
||||
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
|
||||
; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512DQ-LABEL: mask_v64i1_lower32:
|
||||
; AVX512DQ: # %bb.0:
|
||||
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm3
|
||||
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm2
|
||||
; AVX512DQ-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
|
||||
; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm3 = -1
|
||||
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
|
||||
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (zmm2 & (zmm0 ^ zmm1))
|
||||
; AVX512DQ-NEXT: retq
|
||||
;
|
||||
; AVX512BW-LABEL: mask_v64i1_lower32:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
|
||||
; AVX512BW-NEXT: kmovq %rax, %k0
|
||||
; AVX512BW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1
|
||||
; AVX512BW-NEXT: korq %k0, %k1, %k1
|
||||
; AVX512BW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512DQBW-LABEL: mask_v64i1_lower32:
|
||||
; AVX512DQBW: # %bb.0:
|
||||
; AVX512DQBW-NEXT: kxnord %k0, %k0, %k0
|
||||
; AVX512DQBW-NEXT: vpcmpgtb %zmm3, %zmm2, %k1
|
||||
; AVX512DQBW-NEXT: korq %k0, %k1, %k1
|
||||
; AVX512DQBW-NEXT: vpblendmb %zmm0, %zmm1, %zmm0 {%k1}
|
||||
; AVX512DQBW-NEXT: retq
|
||||
%mask0 = bitcast i64 4294967295 to <64 x i1>
|
||||
%mask1 = icmp sgt <64 x i8> %c, %d
|
||||
%mask = or <64 x i1> %mask0, %mask1
|
||||
%res = select <64 x i1> %mask, <64 x i8> %a, <64 x i8> %b
|
||||
ret <64 x i8> %res
|
||||
}
|
||||
|
||||
@ -207,15 +207,15 @@ declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> , <16 x ptr> , i32 , <
|
||||
; SCALAR-NEXT: store i32 %Elt2, ptr %Ptr23, align 4
|
||||
|
||||
define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) {
|
||||
; X64-LABEL: test6:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; X64-NEXT: kxnorw %k0, %k0, %k2
|
||||
; X64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
|
||||
; X64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
|
||||
; X64-NEXT: vmovdqa %ymm2, %ymm0
|
||||
; X64-NEXT: retq
|
||||
; X64-KNL-LABEL: test6:
|
||||
; X64-KNL: # %bb.0:
|
||||
; X64-KNL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; X64-KNL-NEXT: kxnorw %k0, %k0, %k2
|
||||
; X64-KNL-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
|
||||
; X64-KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
|
||||
; X64-KNL-NEXT: vmovdqa %ymm2, %ymm0
|
||||
; X64-KNL-NEXT: retq
|
||||
;
|
||||
; X86-KNL-LABEL: test6:
|
||||
; X86-KNL: # %bb.0:
|
||||
@ -230,11 +230,21 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) {
|
||||
; X86-KNL-NEXT: vmovdqa %ymm2, %ymm0
|
||||
; X86-KNL-NEXT: retl
|
||||
;
|
||||
; X64-SKX-LABEL: test6:
|
||||
; X64-SKX: # %bb.0:
|
||||
; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; X64-SKX-NEXT: kxnorb %k0, %k0, %k2
|
||||
; X64-SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
|
||||
; X64-SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
|
||||
; X64-SKX-NEXT: vmovdqa %ymm2, %ymm0
|
||||
; X64-SKX-NEXT: retq
|
||||
;
|
||||
; X86-SKX-LABEL: test6:
|
||||
; X86-SKX: # %bb.0:
|
||||
; X86-SKX-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; X86-SKX-NEXT: kxnorw %k0, %k0, %k2
|
||||
; X86-SKX-NEXT: kxnorb %k0, %k0, %k2
|
||||
; X86-SKX-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
|
||||
; X86-SKX-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
|
||||
; X86-SKX-NEXT: vmovdqa %ymm2, %ymm0
|
||||
@ -397,7 +407,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
|
||||
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
|
||||
; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
|
||||
; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
|
||||
; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
|
||||
; X64-SKX-SMALL-NEXT: retq
|
||||
@ -412,7 +422,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
|
||||
; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
|
||||
; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
|
||||
; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1
|
||||
; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
|
||||
; X64-SKX-LARGE-NEXT: retq
|
||||
@ -424,7 +434,7 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
|
||||
; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
|
||||
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
|
||||
; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1
|
||||
; X86-SKX-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1}
|
||||
; X86-SKX-NEXT: retl
|
||||
@ -481,7 +491,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
|
||||
; X64-SKX-SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
|
||||
; X64-SKX-SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
|
||||
; X64-SKX-SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
|
||||
; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; X64-SKX-SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
|
||||
; X64-SKX-SMALL-NEXT: retq
|
||||
@ -496,7 +506,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
|
||||
; X64-SKX-LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
|
||||
; X64-SKX-LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
|
||||
; X64-SKX-LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1
|
||||
; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; X64-SKX-LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
|
||||
; X64-SKX-LARGE-NEXT: retq
|
||||
@ -508,7 +518,7 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
|
||||
; X86-SKX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
|
||||
; X86-SKX-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
|
||||
; X86-SKX-NEXT: vpaddd %ymm1, %ymm0, %ymm1
|
||||
; X86-SKX-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; X86-SKX-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1}
|
||||
; X86-SKX-NEXT: retl
|
||||
@ -2465,17 +2475,17 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
|
||||
|
||||
declare <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x ptr>)
|
||||
define <16 x ptr> @test31(<16 x ptr> %ptrs) {
|
||||
; X64-LABEL: test31:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; X64-NEXT: kxnorw %k0, %k0, %k2
|
||||
; X64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
|
||||
; X64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
|
||||
; X64-NEXT: vmovdqa64 %zmm3, %zmm0
|
||||
; X64-NEXT: vmovdqa64 %zmm2, %zmm1
|
||||
; X64-NEXT: retq
|
||||
; X64-KNL-LABEL: test31:
|
||||
; X64-KNL: # %bb.0:
|
||||
; X64-KNL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; X64-KNL-NEXT: kxnorw %k0, %k0, %k2
|
||||
; X64-KNL-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
|
||||
; X64-KNL-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
|
||||
; X64-KNL-NEXT: vmovdqa64 %zmm3, %zmm0
|
||||
; X64-KNL-NEXT: vmovdqa64 %zmm2, %zmm1
|
||||
; X64-KNL-NEXT: retq
|
||||
;
|
||||
; X86-LABEL: test31:
|
||||
; X86: # %bb.0:
|
||||
@ -2484,6 +2494,18 @@ define <16 x ptr> @test31(<16 x ptr> %ptrs) {
|
||||
; X86-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1}
|
||||
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
|
||||
; X86-NEXT: retl
|
||||
;
|
||||
; X64-SKX-LABEL: test31:
|
||||
; X64-SKX: # %bb.0:
|
||||
; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; X64-SKX-NEXT: kxnorb %k0, %k0, %k2
|
||||
; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; X64-SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
|
||||
; X64-SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
|
||||
; X64-SKX-NEXT: vmovdqa64 %zmm3, %zmm0
|
||||
; X64-SKX-NEXT: vmovdqa64 %zmm2, %zmm1
|
||||
; X64-SKX-NEXT: retq
|
||||
%res = call <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x ptr> undef)
|
||||
ret <16 x ptr>%res
|
||||
}
|
||||
@ -3253,17 +3275,17 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) {
|
||||
; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0
|
||||
; X64-KNL-NEXT: retq
|
||||
;
|
||||
; X86-LABEL: test_global_array:
|
||||
; X86: # %bb.0:
|
||||
; X86-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
|
||||
; X86-NEXT: vmovdqa %ymm1, %ymm0
|
||||
; X86-NEXT: retl
|
||||
; X86-KNL-LABEL: test_global_array:
|
||||
; X86-KNL: # %bb.0:
|
||||
; X86-KNL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
|
||||
; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0
|
||||
; X86-KNL-NEXT: retl
|
||||
;
|
||||
; X64-SKX-SMALL-LABEL: test_global_array:
|
||||
; X64-SKX-SMALL: # %bb.0:
|
||||
; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
|
||||
; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0
|
||||
@ -3272,11 +3294,19 @@ define <8 x i32> @test_global_array(<8 x i64> %indxs) {
|
||||
; X64-SKX-LARGE-LABEL: test_global_array:
|
||||
; X64-SKX-LARGE: # %bb.0:
|
||||
; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax
|
||||
; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
|
||||
; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0
|
||||
; X64-SKX-LARGE-NEXT: retq
|
||||
;
|
||||
; X86-SKX-LABEL: test_global_array:
|
||||
; X86-SKX: # %bb.0:
|
||||
; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
|
||||
; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0
|
||||
; X86-SKX-NEXT: retl
|
||||
%p = getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <8 x i64> %indxs
|
||||
%g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
|
||||
ret <8 x i32> %g
|
||||
@ -3291,17 +3321,17 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
|
||||
; X64-KNL-NEXT: vmovdqa %ymm1, %ymm0
|
||||
; X64-KNL-NEXT: retq
|
||||
;
|
||||
; X86-LABEL: test_global_array_zeroinitializer_index:
|
||||
; X86: # %bb.0:
|
||||
; X86-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X86-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
|
||||
; X86-NEXT: vmovdqa %ymm1, %ymm0
|
||||
; X86-NEXT: retl
|
||||
; X86-KNL-LABEL: test_global_array_zeroinitializer_index:
|
||||
; X86-KNL: # %bb.0:
|
||||
; X86-KNL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X86-KNL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
|
||||
; X86-KNL-NEXT: vmovdqa %ymm1, %ymm0
|
||||
; X86-KNL-NEXT: retl
|
||||
;
|
||||
; X64-SKX-SMALL-LABEL: test_global_array_zeroinitializer_index:
|
||||
; X64-SKX-SMALL: # %bb.0:
|
||||
; X64-SKX-SMALL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-SKX-SMALL-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X64-SKX-SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
|
||||
; X64-SKX-SMALL-NEXT: vmovdqa %ymm1, %ymm0
|
||||
@ -3310,11 +3340,19 @@ define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) {
|
||||
; X64-SKX-LARGE-LABEL: test_global_array_zeroinitializer_index:
|
||||
; X64-SKX-LARGE: # %bb.0:
|
||||
; X64-SKX-LARGE-NEXT: movabsq $glob_array, %rax
|
||||
; X64-SKX-LARGE-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-SKX-LARGE-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X64-SKX-LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1}
|
||||
; X64-SKX-LARGE-NEXT: vmovdqa %ymm1, %ymm0
|
||||
; X64-SKX-LARGE-NEXT: retq
|
||||
;
|
||||
; X86-SKX-LABEL: test_global_array_zeroinitializer_index:
|
||||
; X86-SKX: # %bb.0:
|
||||
; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X86-SKX-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1}
|
||||
; X86-SKX-NEXT: vmovdqa %ymm1, %ymm0
|
||||
; X86-SKX-NEXT: retl
|
||||
%p = getelementptr inbounds [16 x i32], ptr @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs
|
||||
%g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
|
||||
ret <8 x i32> %g
|
||||
@ -3545,7 +3583,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) {
|
||||
; X64-SKX-LABEL: sext_v8i8_index:
|
||||
; X64-SKX: # %bb.0:
|
||||
; X64-SKX-NEXT: vpmovsxbd %xmm0, %ymm1
|
||||
; X64-SKX-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
|
||||
; X64-SKX-NEXT: retq
|
||||
@ -3554,7 +3592,7 @@ define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) {
|
||||
; X86-SKX: # %bb.0:
|
||||
; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-SKX-NEXT: vpmovsxbd %xmm0, %ymm1
|
||||
; X86-SKX-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
|
||||
; X86-SKX-NEXT: retl
|
||||
@ -3617,7 +3655,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) {
|
||||
; X64-SKX-LABEL: zext_v8i8_index:
|
||||
; X64-SKX: # %bb.0:
|
||||
; X64-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
|
||||
; X64-SKX-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; X64-SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
|
||||
; X64-SKX-NEXT: retq
|
||||
@ -3626,7 +3664,7 @@ define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) {
|
||||
; X86-SKX: # %bb.0:
|
||||
; X86-SKX-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X86-SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
|
||||
; X86-SKX-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; X86-SKX-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
|
||||
; X86-SKX-NEXT: retl
|
||||
@ -4793,19 +4831,19 @@ define <16 x i32> @pr163023_sext(ptr %a0, <16 x i32> %a1) {
|
||||
}
|
||||
|
||||
define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) {
|
||||
; X64-LABEL: pr163023_zext:
|
||||
; X64: # %bb.0:
|
||||
; X64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
|
||||
; X64-NEXT: vextracti64x4 $1, %zmm0, %ymm0
|
||||
; X64-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
|
||||
; X64-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; X64-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; X64-NEXT: kxnorw %k0, %k0, %k2
|
||||
; X64-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2}
|
||||
; X64-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1}
|
||||
; X64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
|
||||
; X64-NEXT: retq
|
||||
; X64-KNL-LABEL: pr163023_zext:
|
||||
; X64-KNL: # %bb.0:
|
||||
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
|
||||
; X64-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
|
||||
; X64-KNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
|
||||
; X64-KNL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; X64-KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; X64-KNL-NEXT: kxnorw %k0, %k0, %k2
|
||||
; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2}
|
||||
; X64-KNL-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1}
|
||||
; X64-KNL-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
|
||||
; X64-KNL-NEXT: retq
|
||||
;
|
||||
; X86-LABEL: pr163023_zext:
|
||||
; X86: # %bb.0:
|
||||
@ -4815,6 +4853,20 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) {
|
||||
; X86-NEXT: vpgatherdd (%eax,%zmm0), %zmm1 {%k1}
|
||||
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
|
||||
; X86-NEXT: retl
|
||||
;
|
||||
; X64-SKX-LABEL: pr163023_zext:
|
||||
; X64-SKX: # %bb.0:
|
||||
; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
|
||||
; X64-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm0
|
||||
; X64-SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
|
||||
; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; X64-SKX-NEXT: kxnorb %k0, %k0, %k2
|
||||
; X64-SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
|
||||
; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm0), %ymm3 {%k2}
|
||||
; X64-SKX-NEXT: vpgatherqd (%rdi,%zmm1), %ymm2 {%k1}
|
||||
; X64-SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
|
||||
; X64-SKX-NEXT: retq
|
||||
%addr.p = ptrtoint ptr %a0 to i64
|
||||
%addr.v = insertelement <1 x i64> poison, i64 %addr.p, i64 0
|
||||
%addr.splat = shufflevector <1 x i64> %addr.v, <1 x i64> poison, <16 x i32> zeroinitializer
|
||||
@ -4834,21 +4886,37 @@ define <16 x i32> @pr163023_zext(ptr %a0, <16 x i32> %a1) {
|
||||
%struct.foo = type { ptr, i64, i16, i16, i32 }
|
||||
|
||||
define <8 x i64> @pr45906(<8 x ptr> %ptr) {
|
||||
; X64-LABEL: pr45906:
|
||||
; X64: # %bb.0: # %bb
|
||||
; X64-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X64-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1}
|
||||
; X64-NEXT: vmovdqa64 %zmm1, %zmm0
|
||||
; X64-NEXT: retq
|
||||
; X64-KNL-LABEL: pr45906:
|
||||
; X64-KNL: # %bb.0: # %bb
|
||||
; X64-KNL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X64-KNL-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1}
|
||||
; X64-KNL-NEXT: vmovdqa64 %zmm1, %zmm0
|
||||
; X64-KNL-NEXT: retq
|
||||
;
|
||||
; X86-LABEL: pr45906:
|
||||
; X86: # %bb.0: # %bb
|
||||
; X86-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X86-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1}
|
||||
; X86-NEXT: vmovdqa64 %zmm1, %zmm0
|
||||
; X86-NEXT: retl
|
||||
; X86-KNL-LABEL: pr45906:
|
||||
; X86-KNL: # %bb.0: # %bb
|
||||
; X86-KNL-NEXT: kxnorw %k0, %k0, %k1
|
||||
; X86-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X86-KNL-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1}
|
||||
; X86-KNL-NEXT: vmovdqa64 %zmm1, %zmm0
|
||||
; X86-KNL-NEXT: retl
|
||||
;
|
||||
; X64-SKX-LABEL: pr45906:
|
||||
; X64-SKX: # %bb.0: # %bb
|
||||
; X64-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X64-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X64-SKX-NEXT: vpgatherqq 8(,%zmm0), %zmm1 {%k1}
|
||||
; X64-SKX-NEXT: vmovdqa64 %zmm1, %zmm0
|
||||
; X64-SKX-NEXT: retq
|
||||
;
|
||||
; X86-SKX-LABEL: pr45906:
|
||||
; X86-SKX: # %bb.0: # %bb
|
||||
; X86-SKX-NEXT: kxnorb %k0, %k0, %k1
|
||||
; X86-SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; X86-SKX-NEXT: vpgatherdq 4(,%ymm0), %zmm1 {%k1}
|
||||
; X86-SKX-NEXT: vmovdqa64 %zmm1, %zmm0
|
||||
; X86-SKX-NEXT: retl
|
||||
bb:
|
||||
%tmp = getelementptr inbounds %struct.foo, <8 x ptr> %ptr, i64 0, i32 1
|
||||
%tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %tmp, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef)
|
||||
|
||||
@ -9,9 +9,9 @@ target triple = "x86_64-unknown-linux-gnu"
|
||||
define void @test(i64 %x272, <16 x ptr> %x335, <16 x i32> %x270) {
|
||||
; CHECK-LABEL: test:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1}
|
||||
; CHECK-NEXT: kxnorw %k0, %k0, %k1
|
||||
; CHECK-NEXT: kxnorb %k0, %k0, %k1
|
||||
; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm0
|
||||
; CHECK-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
|
||||
; CHECK-NEXT: vzeroupper
|
||||
|
||||
@ -5890,17 +5890,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
|
||||
; AVX512DQ-SLOW: # %bb.0:
|
||||
; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0
|
||||
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %zmm0
|
||||
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
|
||||
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
|
||||
; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX512DQ-SLOW-NEXT: movw $255, %ax
|
||||
; AVX512DQ-SLOW-NEXT: kmovw %eax, %k1
|
||||
; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
|
||||
; AVX512DQ-SLOW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
|
||||
; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm0
|
||||
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm0, %k2
|
||||
; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
|
||||
; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
|
||||
; AVX512DQ-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
|
||||
; AVX512DQ-SLOW-NEXT: vpmovd2m %zmm1, %k1
|
||||
; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
|
||||
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,1]
|
||||
; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX512DQ-SLOW-NEXT: kxnorb %k0, %k0, %k2
|
||||
; AVX512DQ-SLOW-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
|
||||
; AVX512DQ-SLOW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
|
||||
; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
|
||||
; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, (%rdx)
|
||||
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 64(%rdx)
|
||||
; AVX512DQ-SLOW-NEXT: vzeroupper
|
||||
@ -5910,17 +5909,16 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
|
||||
; AVX512DQ-FAST: # %bb.0:
|
||||
; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0
|
||||
; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %zmm0
|
||||
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
|
||||
; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1
|
||||
; AVX512DQ-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX512DQ-FAST-NEXT: movw $255, %ax
|
||||
; AVX512DQ-FAST-NEXT: kmovw %eax, %k1
|
||||
; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm1, %zmm2, %k1 {%k1}
|
||||
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2]
|
||||
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
|
||||
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm0, %k2
|
||||
; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
|
||||
; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
|
||||
; AVX512DQ-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
|
||||
; AVX512DQ-FAST-NEXT: vpmovd2m %zmm1, %k1
|
||||
; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,2,3,3,3,3,3,3]
|
||||
; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
|
||||
; AVX512DQ-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
||||
; AVX512DQ-FAST-NEXT: kxnorb %k0, %k0, %k2
|
||||
; AVX512DQ-FAST-NEXT: vpcmpgtd %zmm0, %zmm1, %k2 {%k2}
|
||||
; AVX512DQ-FAST-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
|
||||
; AVX512DQ-FAST-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
|
||||
; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm1, (%rdx)
|
||||
; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 64(%rdx)
|
||||
; AVX512DQ-FAST-NEXT: vzeroupper
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user