llvm-project/llvm/test/CodeGen/X86/avx512-build-vector.ll
Simon Pilgrim d1592a966b
[X86] X86FixupVectorConstantsPass - use scheduler model to avoid regressions (#140028)
When attempting to replace a full vector constant load with an instruction that uses a smaller constant, check the scheduler model to ensure the instruction isn't slower.

Throughput must not regress, but allow a small increase in latency based on how much constant data we're saving (I've used a simple estimate of 1 cycle per 128-bits of data saved).

NOTE: this currently ignores hoisted constant loads where the slower instruction might be acceptable.

Fixes #135998
2025-05-28 11:02:37 +01:00

29 lines
1.4 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
define <16 x i32> @test2(<16 x i32> %x) {
; CHECK-LABEL: test2:
; CHECK: ## %bb.0:
; CHECK-NEXT: vpternlogd {{.*#+}} zmm1 = -1
; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
ret <16 x i32>%res
}
define <16 x float> @test3(<4 x float> %a) {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
%b = extractelement <4 x float> %a, i32 2
%c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
%b1 = extractelement <4 x float> %a, i32 0
%c1 = insertelement <16 x float> %c, float %b1, i32 6
ret <16 x float>%c1
}