Simon Pilgrim 6e574a4fa3
[X86] lowerVECTOR_SHUFFLE - canonicalize zeros/ones/fp splat constants to ensure no undefs (#141214)
Make it easier for splat/element-equivalent detection by ensuring
constant splats contain no undefs.

Integer constants are limited to rematerializable zeros/ones values to
avoid unnecessary scalar_to_vector(int) -> load conversions - we can
relax this later if useful.
2025-05-23 11:02:46 +01:00

83 lines
5.0 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 -O0 | FileCheck %s --check-prefixes=CHECK-O0
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 -O3 | FileCheck %s --check-prefixes=CHECK-O3
; Codegen test for a <16 x i64> shuffle whose operands are built from
; constant-mask selects, one of which mixes in zeroinitializer lanes.
; The function takes five <16 x i64> arguments and returns one <16 x i64>;
; %arg4 is not referenced by the body. The expected assembly below is
; autogenerated (one set of assertions per RUN line) - regenerate it with
; utils/update_llc_test_checks.py rather than editing it by hand.
define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <16 x i64> %arg3, <16 x i64> %arg4) nounwind {
; CHECK-O0-LABEL: pluto:
; CHECK-O0: # %bb.0: # %bb
; CHECK-O0-NEXT: pushq %rbp
; CHECK-O0-NEXT: movq %rsp, %rbp
; CHECK-O0-NEXT: andq $-32, %rsp
; CHECK-O0-NEXT: subq $64, %rsp
; CHECK-O0-NEXT: vmovaps %ymm4, %ymm10
; CHECK-O0-NEXT: vmovaps %ymm3, %ymm9
; CHECK-O0-NEXT: vmovaps %ymm2, (%rsp) # 32-byte Spill
; CHECK-O0-NEXT: vmovaps %ymm1, %ymm8
; CHECK-O0-NEXT: vmovaps %ymm0, %ymm3
; CHECK-O0-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload
; CHECK-O0-NEXT: vmovaps 240(%rbp), %ymm4
; CHECK-O0-NEXT: vmovaps 208(%rbp), %ymm1
; CHECK-O0-NEXT: vmovaps 176(%rbp), %ymm2
; CHECK-O0-NEXT: vmovaps 144(%rbp), %ymm2
; CHECK-O0-NEXT: vmovaps 112(%rbp), %ymm11
; CHECK-O0-NEXT: vmovaps 80(%rbp), %ymm11
; CHECK-O0-NEXT: vmovaps 48(%rbp), %ymm11
; CHECK-O0-NEXT: vmovaps 16(%rbp), %ymm11
; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
; CHECK-O0-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; CHECK-O0-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; CHECK-O0-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,1]
; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7]
; CHECK-O0-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm6[0,1]
; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
; CHECK-O0-NEXT: vmovaps %xmm1, %xmm3
; CHECK-O0-NEXT: vmovaps %xmm7, %xmm1
; CHECK-O0-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
; CHECK-O0-NEXT: # implicit-def: $ymm1
; CHECK-O0-NEXT: vmovaps %xmm3, %xmm1
; CHECK-O0-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,3]
; CHECK-O0-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3,4,5],ymm1[6,7]
; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
; CHECK-O0-NEXT: movq %rbp, %rsp
; CHECK-O0-NEXT: popq %rbp
; CHECK-O0-NEXT: retq
;
; CHECK-O3-LABEL: pluto:
; CHECK-O3: # %bb.0: # %bb
; CHECK-O3-NEXT: pushq %rbp
; CHECK-O3-NEXT: movq %rsp, %rbp
; CHECK-O3-NEXT: andq $-32, %rsp
; CHECK-O3-NEXT: subq $32, %rsp
; CHECK-O3-NEXT: vmovdqa 208(%rbp), %ymm3
; CHECK-O3-NEXT: vmovdqa 144(%rbp), %ymm0
; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6,7]
; CHECK-O3-NEXT: vpxor %xmm2, %xmm2, %xmm2
; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
; CHECK-O3-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-O3-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,1,2,1]
; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; CHECK-O3-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
; CHECK-O3-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
; CHECK-O3-NEXT: vpbroadcastq 248(%rbp), %ymm4
; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7]
; CHECK-O3-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
; CHECK-O3-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
; CHECK-O3-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
; CHECK-O3-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5],ymm4[6,7]
; CHECK-O3-NEXT: movq %rbp, %rsp
; CHECK-O3-NEXT: popq %rbp
; CHECK-O3-NEXT: retq
bb:
; %tmp: lanes 10 and 11 come from %arg, every other lane from %arg1.
%tmp = select <16 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <16 x i64> %arg, <16 x i64> %arg1
; %tmp5: lanes 0, 3, 4, 7 and 9 come from %arg2, every other lane is zero.
%tmp5 = select <16 x i1> <i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x i64> %arg2, <16 x i64> zeroinitializer
; %tmp6: lanes 1-3, 8, 9 and 13-15 come from %arg3, the remaining lanes from
; %tmp5 (so lanes such as 6 and 12 stay zero).
%tmp6 = select <16 x i1> <i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true>, <16 x i64> %arg3, <16 x i64> %tmp5
; Shuffle indices 0-15 pick lanes of %tmp and 16-31 pick lanes of %tmp6.
; Indices 28 and 22 address %tmp6 lanes 12 and 6, which are zero, so the
; result mixes elements of %arg, %arg1 and %arg3 with constant-zero elements.
%tmp7 = shufflevector <16 x i64> %tmp, <16 x i64> %tmp6, <16 x i32> <i32 11, i32 18, i32 24, i32 9, i32 14, i32 29, i32 29, i32 6, i32 14, i32 28, i32 8, i32 9, i32 22, i32 12, i32 25, i32 6>
ret <16 x i64> %tmp7
}