Simon Pilgrim b5d35feacb
[X86] X86FixupVectorConstants - load+sign-extend vector constants that can be stored in a truncated form (#79815)
Reduce the size of the vector constant by storing it in the constant pool in a truncated form, and sign-extend it as part of the load.

I've extended the existing FixupConstant functionality to support these sext constant rebuilds; we still select the smallest stored constant entry and prefer vzload/broadcast/vextload for the same bitwidth to avoid domain flips.

I intend to add the matching load+zero-extend handling in a future PR, but that first requires some alterations to the existing MC shuffle comment handling.
2024-02-02 11:28:58 +00:00
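
For illustration (not part of the original commit), the sketch below shows the kind of constant this rebuild targets under AVX2: a non-splat <8 x i32> constant whose elements all fit in a signed 8-bit range, so the constant-pool entry can be stored as 8 bytes and materialized with a sign-extending load such as vpmovsxbd rather than a full 32-byte vmovdqa load. The function name is hypothetical.

; Illustrative sketch only; e.g. run through: llc -mtriple=x86_64-- -mattr=+avx2
; Every i32 element fits in a signed 8-bit value, so the constant is a
; candidate for the truncated-store + sign-extending-load rebuild.
define <8 x i32> @sext_rebuild_example() {
  ret <8 x i32> <i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8>
}

The vpmovsxbd of [0,1,1,3,3,5,5,7] in the AVX CHECK lines of the test below is this same pattern applied to the vpermd index vector.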


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
define void @main(<16 x i32> %0, i32 %1) {
; SSE-LABEL: main:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movd %edi, %xmm4
; SSE-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,0]
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: paddd %xmm1, %xmm1
; SSE-NEXT: paddd %xmm3, %xmm3
; SSE-NEXT: paddd %xmm2, %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,1,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[1,0]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm2[1,0]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[1,0]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[1,3]
; SSE-NEXT: xorps %xmm2, %xmm0
; SSE-NEXT: xorps %xmm4, %xmm1
; SSE-NEXT: xorps %xmm0, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; SSE-NEXT: pxor %xmm0, %xmm1
; SSE-NEXT: movd %xmm1, 0
; SSE-NEXT: retq
;
; AVX-LABEL: main:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3]
; AVX-NEXT: movl $1, %eax
; AVX-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2
; AVX-NEXT: vpinsrd $3, %edi, %xmm2, %xmm2
; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vpaddd %ymm0, %ymm0, %ymm0
; AVX-NEXT: vpaddd %ymm1, %ymm1, %ymm1
; AVX-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,1,3,3,5,5,7]
; AVX-NEXT: vpermd %ymm0, %ymm2, %ymm2
; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1]
; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7]
; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
; AVX-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX-NEXT: vpxor %ymm0, %ymm2, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, 0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
entry:
%2 = insertelement <16 x i32> %0, i32 1, i64 1
%3 = insertelement <16 x i32> %2, i32 %1, i64 3
%4 = insertelement <16 x i32> %3, i32 0, i64 0
%5 = shl <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = shufflevector <16 x i32> %5, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15>
%7 = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %6)
store i32 %7, ptr null, align 4
ret void
}
declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)