
lowerBuildVectorAsBroadcast will not broadcast splat constants in all cases, resulting in many situations where a full-width vector load that has failed to fold, but is loading splat constant values, could use a broadcast load instruction just as cheaply and save constant pool space.

NOTE: SSE3 targets can use MOVDDUP, but not all SSE-era CPUs can perform it as cheaply as a vector load; we will need to add scheduler model checks if we want to pursue this.

This is an updated commit of 98061013e01207444cfd3980cde17b5e75764fbe after being reverted at a279a09ab9524d1d74ef29b34618102d4b202e2f.
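As a rough, hypothetical sketch of the intent (the function name and exact assembly below are illustrative only, not part of the committed test): on an AVX target, a store of a splat constant vector such as

  define void @splat_store_example(ptr %p) {
    ; the build vector is a splat of the 32-bit constant 42
    store <4 x i32> <i32 42, i32 42, i32 42, i32 42>, ptr %p
    ret void
  }

can be lowered as a broadcast from a single 4-byte constant pool entry, e.g.

  vbroadcastss {{.*}}(%rip), %xmm0
  vmovups %xmm0, (%rdi)

rather than a full 16-byte vmovaps constant pool load. The splat test below checks for the same broadcast lowering at 256-bit width.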
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64

define void @big_nonzero_16_bytes(ptr nocapture %a) {
; X32-LABEL: big_nonzero_16_bytes:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,3,4]
; X32-NEXT: vmovups %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: big_nonzero_16_bytes:
; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,3,4]
; X64-NEXT: vmovups %xmm0, (%rdi)
; X64-NEXT: retq
%arrayidx1 = getelementptr inbounds i32, ptr %a, i64 1
%arrayidx2 = getelementptr inbounds i32, ptr %a, i64 2
%arrayidx3 = getelementptr inbounds i32, ptr %a, i64 3

store i32 1, ptr %a, align 4
store i32 2, ptr %arrayidx1, align 4
store i32 3, ptr %arrayidx2, align 4
store i32 4, ptr %arrayidx3, align 4
ret void
}

; TODO: We assumed that two 64-bit stores were better than 1 vector load and 1 vector store.
; But if the 64-bit constants can't be represented as sign-extended 32-bit constants, then
; it takes extra instructions to do this in scalar.

define void @big_nonzero_16_bytes_big64bit_constants(ptr nocapture %a) {
; X32-LABEL: big_nonzero_16_bytes_big64bit_constants:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,3]
; X32-NEXT: vmovups %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: big_nonzero_16_bytes_big64bit_constants:
; X64: # %bb.0:
; X64-NEXT: movabsq $4294967297, %rax # imm = 0x100000001
; X64-NEXT: movq %rax, (%rdi)
; X64-NEXT: movabsq $12884901889, %rax # imm = 0x300000001
; X64-NEXT: movq %rax, 8(%rdi)
; X64-NEXT: retq
%arrayidx1 = getelementptr inbounds i64, ptr %a, i64 1

store i64 4294967297, ptr %a
store i64 12884901889, ptr %arrayidx1
ret void
}

; Splats may be an opportunity to use a broadcast op.

define void @big_nonzero_32_bytes_splat(ptr nocapture %a) {
; X32-LABEL: big_nonzero_32_bytes_splat:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X32-NEXT: vmovups %ymm0, (%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: big_nonzero_32_bytes_splat:
; X64: # %bb.0:
; X64-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42]
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arrayidx1 = getelementptr inbounds i32, ptr %a, i64 1
%arrayidx2 = getelementptr inbounds i32, ptr %a, i64 2
%arrayidx3 = getelementptr inbounds i32, ptr %a, i64 3
%arrayidx4 = getelementptr inbounds i32, ptr %a, i64 4
%arrayidx5 = getelementptr inbounds i32, ptr %a, i64 5
%arrayidx6 = getelementptr inbounds i32, ptr %a, i64 6
%arrayidx7 = getelementptr inbounds i32, ptr %a, i64 7

store i32 42, ptr %a, align 4
store i32 42, ptr %arrayidx1, align 4
store i32 42, ptr %arrayidx2, align 4
store i32 42, ptr %arrayidx3, align 4
store i32 42, ptr %arrayidx4, align 4
store i32 42, ptr %arrayidx5, align 4
store i32 42, ptr %arrayidx6, align 4
store i32 42, ptr %arrayidx7, align 4
ret void
}

; Verify that we choose the best-sized store(s) for each chunk.

define void @big_nonzero_63_bytes(ptr nocapture %a) {
; X32-LABEL: big_nonzero_63_bytes:
; X32: # %bb.0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps {{.*#+}} ymm0 = [1,0,2,0,3,0,4,0]
; X32-NEXT: vmovups %ymm0, (%eax)
; X32-NEXT: vmovaps {{.*#+}} xmm0 = [5,0,6,0]
; X32-NEXT: vmovups %xmm0, 32(%eax)
; X32-NEXT: movl $0, 52(%eax)
; X32-NEXT: movl $7, 48(%eax)
; X32-NEXT: movl $8, 56(%eax)
; X32-NEXT: movw $9, 60(%eax)
; X32-NEXT: movb $10, 62(%eax)
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
; X64-LABEL: big_nonzero_63_bytes:
; X64: # %bb.0:
; X64-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,3,4]
; X64-NEXT: vmovups %ymm0, (%rdi)
; X64-NEXT: movq $5, 32(%rdi)
; X64-NEXT: movq $6, 40(%rdi)
; X64-NEXT: movq $7, 48(%rdi)
; X64-NEXT: movl $8, 56(%rdi)
; X64-NEXT: movw $9, 60(%rdi)
; X64-NEXT: movb $10, 62(%rdi)
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%arrayidx8 = getelementptr inbounds i64, ptr %a, i64 1
%arrayidx16 = getelementptr inbounds i64, ptr %a, i64 2
%arrayidx24 = getelementptr inbounds i64, ptr %a, i64 3
%arrayidx32 = getelementptr inbounds i64, ptr %a, i64 4
%arrayidx40 = getelementptr inbounds i64, ptr %a, i64 5
%arrayidx48 = getelementptr inbounds i64, ptr %a, i64 6
%arrayidx56 = getelementptr inbounds i32, ptr %a, i64 14
%arrayidx60 = getelementptr inbounds i16, ptr %a, i64 30
%arrayidx62 = getelementptr inbounds i8, ptr %a, i64 62

store i64 1, ptr %a
store i64 2, ptr %arrayidx8
store i64 3, ptr %arrayidx16
store i64 4, ptr %arrayidx24
store i64 5, ptr %arrayidx32
store i64 6, ptr %arrayidx40
store i64 7, ptr %arrayidx48
store i32 8, ptr %arrayidx56
store i16 9, ptr %arrayidx60
store i8 10, ptr %arrayidx62
ret void
}