llvm-project/llvm/test/CodeGen/X86/bfloat-calling-conv.ll
Mikołaj Piróg 6662fe393c
[X86] Add missing vNbf16 handling in X86CallingConv.td file (#127102)
Lack of these entries caused clang to crash on the following code:

```c
__m256bh fun(__m256bh arg) {
	return arg;
	
}
__m256bh run() {
    __m256bh arg= {0};
    fun(arg);
}
```
This caused FastISel to fail, since it handles call lowering based on the
X86CallingConv table.

Curiously, if FastISel fails somewhere down the line and the compiler
falls back to SelectionDAGISel, the crash does not occur. The following code
_does not_ crash:

```c
__m256bh fun(__m256bh arg) {
	return arg;
}

__m256bh run() {
    __m256bh arg= {0};
    return fun(arg);

}
```

This is puzzling to me. Obviously, if FastISel fails then the compiler
falls back to something else to lower these calls -- but since the
X86CallingConv table _doesn't_ have entries for vNbf16, how does this
other thing manage not to crash? It has to use some other mechanism, one
which doesn't use the table. This raises the following questions:
- How is this lowering accomplished without, presumably, using the
CallingConv entries?
- Why is the table not used? I mean, this points to some logic
duplication (the FastISel way vs. the other, bug-free way).
- How to properly test this? There is a test for vNbf16 values, but it
also must not be using the FastISel path? This duplication of logic
makes it hard to test this, since we don't have direct control over whether
the FastISel path or the other one is used.

Nonetheless, this PR fixes the crash, though I didn't create a test for
it, since I am not yet sure what it should look like. I would like to learn
how the working non-FastISel mechanism works; I tried looking for it,
but haven't managed to find anything yet.
2025-02-19 11:04:10 +08:00

1163 lines
54 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+sse2 < %s | FileCheck -check-prefixes=SSE2 %s
; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+sse2 < %s | FileCheck -check-prefixes=FAST_ISEL_SSE2 %s
; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+avx512bf16,avx512vl < %s | FileCheck -check-prefixes=AVX512BF16 %s
; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+avx512bf16,avx512vl < %s | FileCheck -check-prefixes=FAST_ISEL_AVX512BF16 %s
; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+avxneconvert < %s | FileCheck -check-prefixes=AVXNECONVERT %s
; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+avxneconvert < %s | FileCheck -check-prefixes=FAST_ISEL_AVXNECONVERT %s

; Test matrix: three feature sets (+sse2, +avx512bf16 with avx512vl, and
; +avxneconvert) are each compiled twice for x86_64-linux, once with
; -fast-isel=false and once with -fast-isel. Every invocation has its own
; check prefix; the prefixes that begin with FAST_ISEL_ belong to the
; -fast-isel invocations.
; Returns a scalar bfloat argument unchanged.
; With -fast-isel=false every feature set emits a bare retq. The -fast-isel
; invocations instead extract the low 16 bits, shift them left by 16 and
; convert back: the +sse2 invocation calls __truncsfbf2, while the two AVX
; invocations use vcvtneps2bf16.
define bfloat @return_arg_bf16(bfloat %x) #0 {
; SSE2-LABEL: return_arg_bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; FAST_ISEL_SSE2-LABEL: return_arg_bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: pushq %rax
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: popq %rax
; FAST_ISEL_SSE2-NEXT: retq
;
; AVX512BF16-LABEL: return_arg_bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: retq
;
; FAST_ISEL_AVX512BF16-LABEL: return_arg_bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax
; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax
; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0
; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FAST_ISEL_AVX512BF16-NEXT: retq
;
; AVXNECONVERT-LABEL: return_arg_bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: retq
;
; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $0, %xmm0, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: retq
ret bfloat %x
}
; Returns a <2 x bfloat> argument unchanged.
; All -fast-isel=false invocations and both AVX -fast-isel invocations emit a
; bare retq. Only the +sse2 -fast-isel invocation does real work: it widens
; each of the two elements, calls __truncsfbf2 once per element and re-packs
; the results with punpcklwd.
define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) #0 {
; SSE2-LABEL: return_arg_v2bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; FAST_ISEL_SSE2-LABEL: return_arg_v2bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: subq $40, %rsp
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0
; FAST_ISEL_SSE2-NEXT: addq $40, %rsp
; FAST_ISEL_SSE2-NEXT: retq
;
; AVX512BF16-LABEL: return_arg_v2bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: retq
;
; FAST_ISEL_AVX512BF16-LABEL: return_arg_v2bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: retq
;
; AVXNECONVERT-LABEL: return_arg_v2bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: retq
;
; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v2bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: retq
ret <2 x bfloat> %x
}
; Returns a <3 x bfloat> argument unchanged (odd element count).
; All -fast-isel=false invocations emit a bare retq. Unlike the 2- and
; 4-element cases, all three -fast-isel invocations rebuild the vector element
; by element: +sse2 through three __truncsfbf2 calls, the AVX invocations
; through three vcvtneps2bf16 conversions. The +avxneconvert sequence is the
; longest, ending with an extra vmovq / vpbroadcastw / vpblendw re-pack.
define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 {
; SSE2-LABEL: return_arg_v3bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; FAST_ISEL_SSE2-LABEL: return_arg_v3bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: subq $40, %rsp
; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; FAST_ISEL_SSE2-NEXT: movaps %xmm1, %xmm0
; FAST_ISEL_SSE2-NEXT: addq $40, %rsp
; FAST_ISEL_SSE2-NEXT: retq
;
; AVX512BF16-LABEL: return_arg_v3bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: retq
;
; FAST_ISEL_AVX512BF16-LABEL: return_arg_v3bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: vpextrw $2, %xmm0, %eax
; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax
; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm1
; FAST_ISEL_AVX512BF16-NEXT: vpextrw $1, %xmm0, %eax
; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax
; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm2
; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm0, %eax
; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax
; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0
; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm1, %eax
; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm1
; FAST_ISEL_AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; FAST_ISEL_AVX512BF16-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; FAST_ISEL_AVX512BF16-NEXT: retq
;
; AVXNECONVERT-LABEL: return_arg_v3bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: retq
;
; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v3bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $2, %xmm0, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $1, %xmm0, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm2
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm0, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm1, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm1, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: vmovq %xmm1, %rax
; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx
; FAST_ISEL_AVXNECONVERT-NEXT: shrl $16, %ecx
; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: vpbroadcastw %xmm1, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; FAST_ISEL_AVXNECONVERT-NEXT: retq
ret <3 x bfloat> %x
}
; Returns a <4 x bfloat> argument unchanged.
; All -fast-isel=false invocations and both AVX -fast-isel invocations emit a
; bare retq. The +sse2 -fast-isel invocation widens all four elements, calls
; __truncsfbf2 four times and re-packs with punpcklwd followed by punpckldq.
define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) #0 {
; SSE2-LABEL: return_arg_v4bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; FAST_ISEL_SSE2-LABEL: return_arg_v4bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: subq $56, %rsp
; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; FAST_ISEL_SSE2-NEXT: addq $56, %rsp
; FAST_ISEL_SSE2-NEXT: retq
;
; AVX512BF16-LABEL: return_arg_v4bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: retq
;
; FAST_ISEL_AVX512BF16-LABEL: return_arg_v4bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: retq
;
; AVXNECONVERT-LABEL: return_arg_v4bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: retq
;
; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v4bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: retq
ret <4 x bfloat> %x
}
; Returns an <8 x bfloat> argument unchanged.
; All -fast-isel=false invocations and both AVX -fast-isel invocations emit a
; bare retq. The +sse2 -fast-isel invocation saves %r14 and %rbx, widens all
; eight elements, calls __truncsfbf2 eight times, merges the 16-bit results in
; general-purpose registers (shll/orl for pairs, shlq/orq for 64-bit halves)
; and joins the two halves with punpcklqdq.
define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 {
; SSE2-LABEL: return_arg_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; FAST_ISEL_SSE2-LABEL: return_arg_v8bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: pushq %r14
; FAST_ISEL_SSE2-NEXT: pushq %rbx
; FAST_ISEL_SSE2-NEXT: subq $56, %rsp
; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm1
; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm1
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; FAST_ISEL_SSE2-NEXT: addq $56, %rsp
; FAST_ISEL_SSE2-NEXT: popq %rbx
; FAST_ISEL_SSE2-NEXT: popq %r14
; FAST_ISEL_SSE2-NEXT: retq
;
; AVX512BF16-LABEL: return_arg_v8bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: retq
;
; FAST_ISEL_AVX512BF16-LABEL: return_arg_v8bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: retq
;
; AVXNECONVERT-LABEL: return_arg_v8bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: retq
;
; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v8bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: retq
ret <8 x bfloat> %x
}
; Returns a <16 x bfloat> argument unchanged (the 256-bit case from the
; crash this commit fixes, per the commit message).
; All -fast-isel=false invocations and both AVX -fast-isel invocations emit a
; bare retq. The +sse2 -fast-isel invocation reads the elements from %xmm1
; and %xmm0, calls __truncsfbf2 sixteen times, merges the 16-bit results in
; general-purpose registers and rebuilds each 128-bit half with punpcklqdq.
define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 {
;
; SSE2-LABEL: return_arg_v16bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; FAST_ISEL_SSE2-LABEL: return_arg_v16bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: pushq %r14
; FAST_ISEL_SSE2-NEXT: pushq %rbx
; FAST_ISEL_SSE2-NEXT: subq $104, %rsp
; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm1
; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: addq $104, %rsp
; FAST_ISEL_SSE2-NEXT: popq %rbx
; FAST_ISEL_SSE2-NEXT: popq %r14
; FAST_ISEL_SSE2-NEXT: retq
;
; AVX512BF16-LABEL: return_arg_v16bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: retq
;
; FAST_ISEL_AVX512BF16-LABEL: return_arg_v16bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: retq
;
; AVXNECONVERT-LABEL: return_arg_v16bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: retq
;
; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v16bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: retq
ret <16 x bfloat> %x
}
; External callees, one per bfloat shape (scalar and 2/3/4/8/16-element
; vectors). Each takes and returns the same type. They are only declared here,
; so calls to them lower to real calls through the PLT in the tests below.
declare bfloat @returns_bf16(bfloat)
declare <2 x bfloat> @returns_v2bf16(<2 x bfloat>)
declare <3 x bfloat> @returns_v3bf16(<3 x bfloat>)
declare <4 x bfloat> @returns_v4bf16(<4 x bfloat>)
declare <8 x bfloat> @returns_v8bf16(<8 x bfloat>)
declare <16 x bfloat> @returns_v16bf16(<16 x bfloat>)
; Loads a scalar bfloat from %ptr and passes it to returns_bf16. The call
; result is unused and the call is followed by `unreachable`, so the expected
; output for every invocation ends at the callq.
; With -fast-isel=false the value is loaded straight into %xmm0 with
; pinsrw/vpinsrw. The -fast-isel invocations load it with movzwl, shift it
; left by 16 and convert it back (__truncsfbf2 for +sse2, vcvtneps2bf16 for
; the AVX feature sets) before the call.
define bfloat @call_ret_bf16(ptr %ptr) #0 {
;
; SSE2-LABEL: call_ret_bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: pinsrw $0, (%rdi), %xmm0
; SSE2-NEXT: callq returns_bf16@PLT
;
; FAST_ISEL_SSE2-LABEL: call_ret_bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: pushq %rax
; FAST_ISEL_SSE2-NEXT: movzwl (%rdi), %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: callq returns_bf16@PLT
;
; AVX512BF16-LABEL: call_ret_bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: pushq %rax
; AVX512BF16-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX512BF16-NEXT: callq returns_bf16@PLT
;
; FAST_ISEL_AVX512BF16-LABEL: call_ret_bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: pushq %rax
; FAST_ISEL_AVX512BF16-NEXT: movzwl (%rdi), %eax
; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax
; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0
; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FAST_ISEL_AVX512BF16-NEXT: callq returns_bf16@PLT
;
; AVXNECONVERT-LABEL: call_ret_bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: pushq %rax
; AVXNECONVERT-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVXNECONVERT-NEXT: callq returns_bf16@PLT
;
; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax
; FAST_ISEL_AVXNECONVERT-NEXT: movzwl (%rdi), %eax
; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_bf16@PLT
%val = load bfloat, ptr %ptr
call bfloat @returns_bf16(bfloat %val)
unreachable
}
; Loads a <2 x bfloat> from %ptr and passes it to returns_v2bf16. The call
; result is unused and the call is followed by `unreachable`, so the expected
; output for every invocation ends at the callq.
; All -fast-isel=false invocations and both AVX -fast-isel invocations use a
; single movss/vmovss load before the call. The +sse2 -fast-isel invocation
; copies the 32-bit value through the stack, converts each element with its
; own __truncsfbf2 call and re-packs them with punpcklwd first.
define <2 x bfloat> @call_ret_v2bf16(ptr %ptr) #0 {
;
; SSE2-LABEL: call_ret_v2bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: callq returns_v2bf16@PLT
;
; FAST_ISEL_SSE2-LABEL: call_ret_v2bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: subq $40, %rsp
; FAST_ISEL_SSE2-NEXT: movl (%rdi), %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, (%rsp)
; FAST_ISEL_SSE2-NEXT: movdqa (%rsp), %xmm0
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0
; FAST_ISEL_SSE2-NEXT: callq returns_v2bf16@PLT
;
; AVX512BF16-LABEL: call_ret_v2bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: pushq %rax
; AVX512BF16-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512BF16-NEXT: callq returns_v2bf16@PLT
;
; FAST_ISEL_AVX512BF16-LABEL: call_ret_v2bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: pushq %rax
; FAST_ISEL_AVX512BF16-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_AVX512BF16-NEXT: callq returns_v2bf16@PLT
;
; AVXNECONVERT-LABEL: call_ret_v2bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: pushq %rax
; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVXNECONVERT-NEXT: callq returns_v2bf16@PLT
;
; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v2bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v2bf16@PLT
%val = load <2 x bfloat>, ptr %ptr
call <2 x bfloat> @returns_v2bf16(<2 x bfloat> %val)
unreachable
}
; Loads a <3 x bfloat> from %ptr and passes it to @returns_v3bf16. The call
; result is discarded and the block ends in unreachable, so each set of
; expectations below stops at the callq. The non-fast-isel configurations
; assemble the argument in xmm0 with a few moves/inserts; the FAST_ISEL
; configurations rebuild it element by element (three __truncsfbf2 libcalls
; with SSE2 only, vcvtneps2bf16 otherwise).
define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 {
;
; SSE2-LABEL: call_ret_v3bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: movl 4(%rdi), %eax
; SSE2-NEXT: pinsrw $0, %eax, %xmm1
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: callq returns_v3bf16@PLT
;
; FAST_ISEL_SSE2-LABEL: call_ret_v3bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: subq $40, %rsp
; FAST_ISEL_SSE2-NEXT: movq (%rdi), %rax
; FAST_ISEL_SSE2-NEXT: movl %eax, %ecx
; FAST_ISEL_SSE2-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000
; FAST_ISEL_SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: movl %eax, %ecx
; FAST_ISEL_SSE2-NEXT: shll $16, %ecx
; FAST_ISEL_SSE2-NEXT: movd %ecx, %xmm0
; FAST_ISEL_SSE2-NEXT: shrq $32, %rax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; FAST_ISEL_SSE2-NEXT: movaps %xmm1, %xmm0
; FAST_ISEL_SSE2-NEXT: callq returns_v3bf16@PLT
;
; AVX512BF16-LABEL: call_ret_v3bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: pushq %rax
; AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512BF16-NEXT: callq returns_v3bf16@PLT
;
; FAST_ISEL_AVX512BF16-LABEL: call_ret_v3bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: pushq %rax
; FAST_ISEL_AVX512BF16-NEXT: movq (%rdi), %rax
; FAST_ISEL_AVX512BF16-NEXT: movl %eax, %ecx
; FAST_ISEL_AVX512BF16-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000
; FAST_ISEL_AVX512BF16-NEXT: vmovd %ecx, %xmm0
; FAST_ISEL_AVX512BF16-NEXT: movl %eax, %ecx
; FAST_ISEL_AVX512BF16-NEXT: shll $16, %ecx
; FAST_ISEL_AVX512BF16-NEXT: vmovd %ecx, %xmm1
; FAST_ISEL_AVX512BF16-NEXT: shrq $32, %rax
; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax
; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm2
; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2
; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm2, %eax
; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1
; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0
; FAST_ISEL_AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_AVX512BF16-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; FAST_ISEL_AVX512BF16-NEXT: callq returns_v3bf16@PLT
;
; AVXNECONVERT-LABEL: call_ret_v3bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: pushq %rax
; AVXNECONVERT-NEXT: movl 4(%rdi), %eax
; AVXNECONVERT-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0
; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVXNECONVERT-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVXNECONVERT-NEXT: callq returns_v3bf16@PLT
;
; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v3bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax
; FAST_ISEL_AVXNECONVERT-NEXT: movq (%rdi), %rax
; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx
; FAST_ISEL_AVXNECONVERT-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %ecx, %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx
; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %ecx
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %ecx, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax
; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm2
; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm2, %eax
; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm1, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: vmovq %xmm0, %rax
; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx
; FAST_ISEL_AVXNECONVERT-NEXT: shrl $16, %ecx
; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: vpbroadcastw %xmm1, %xmm1
; FAST_ISEL_AVXNECONVERT-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7]
; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v3bf16@PLT
%val = load <3 x bfloat>, ptr %ptr
call <3 x bfloat> @returns_v3bf16(<3 x bfloat> %val)
unreachable
}
; Loads a <4 x bfloat> from %ptr and passes it to @returns_v4bf16. The call
; result is discarded and the block ends in unreachable, so each set of
; expectations below stops at the callq. Every configuration except fast-isel
; with SSE2 only is expected to load the 64-bit vector straight into xmm0;
; that one configuration rebuilds it through four __truncsfbf2 libcalls.
define <4 x bfloat> @call_ret_v4bf16(ptr %ptr) #0 {
;
; SSE2-LABEL: call_ret_v4bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; SSE2-NEXT: callq returns_v4bf16@PLT
;
; FAST_ISEL_SSE2-LABEL: call_ret_v4bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: subq $56, %rsp
; FAST_ISEL_SSE2-NEXT: movq (%rdi), %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp)
; FAST_ISEL_SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; FAST_ISEL_SSE2-NEXT: callq returns_v4bf16@PLT
;
; AVX512BF16-LABEL: call_ret_v4bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: pushq %rax
; AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512BF16-NEXT: callq returns_v4bf16@PLT
;
; FAST_ISEL_AVX512BF16-LABEL: call_ret_v4bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: pushq %rax
; FAST_ISEL_AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FAST_ISEL_AVX512BF16-NEXT: callq returns_v4bf16@PLT
;
; AVXNECONVERT-LABEL: call_ret_v4bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: pushq %rax
; AVXNECONVERT-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVXNECONVERT-NEXT: callq returns_v4bf16@PLT
;
; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v4bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v4bf16@PLT
%val = load <4 x bfloat>, ptr %ptr
call <4 x bfloat> @returns_v4bf16(<4 x bfloat> %val)
unreachable
}
; Loads a <8 x bfloat> from %ptr and passes it to @returns_v8bf16. The call
; result is discarded and the block ends in unreachable, so each set of
; expectations below stops at the callq. Every configuration except fast-isel
; with SSE2 only is expected to load the 128-bit vector straight into xmm0;
; that one configuration extracts each element, runs it through __truncsfbf2
; (eight libcalls) and repacks the results via rbx/r14 before the call.
define <8 x bfloat> @call_ret_v8bf16(ptr %ptr) #0 {
;
; SSE2-LABEL: call_ret_v8bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: callq returns_v8bf16@PLT
;
; FAST_ISEL_SSE2-LABEL: call_ret_v8bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: pushq %r14
; FAST_ISEL_SSE2-NEXT: pushq %rbx
; FAST_ISEL_SSE2-NEXT: subq $56, %rsp
; FAST_ISEL_SSE2-NEXT: movdqa (%rdi), %xmm1
; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm1
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; FAST_ISEL_SSE2-NEXT: callq returns_v8bf16@PLT
;
; AVX512BF16-LABEL: call_ret_v8bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: pushq %rax
; AVX512BF16-NEXT: vmovaps (%rdi), %xmm0
; AVX512BF16-NEXT: callq returns_v8bf16@PLT
;
; FAST_ISEL_AVX512BF16-LABEL: call_ret_v8bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: pushq %rax
; FAST_ISEL_AVX512BF16-NEXT: vmovaps (%rdi), %xmm0
; FAST_ISEL_AVX512BF16-NEXT: callq returns_v8bf16@PLT
;
; AVXNECONVERT-LABEL: call_ret_v8bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: pushq %rax
; AVXNECONVERT-NEXT: vmovaps (%rdi), %xmm0
; AVXNECONVERT-NEXT: callq returns_v8bf16@PLT
;
; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v8bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovaps (%rdi), %xmm0
; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v8bf16@PLT
%val = load <8 x bfloat>, ptr %ptr
call <8 x bfloat> @returns_v8bf16(<8 x bfloat> %val)
unreachable
}
; Loads a <16 x bfloat> from %ptr and passes it to @returns_v16bf16. The call
; result is discarded and the block ends in unreachable, so each set of
; expectations below stops at the callq. With SSE2 only, the argument is
; expected in the xmm0/xmm1 pair; the AVX512BF16 and AVXNECONVERT
; configurations (with or without fast-isel) expect a single load into ymm0.
; Fast-isel with SSE2 only extracts each element, runs it through
; __truncsfbf2 (sixteen libcalls) and repacks the two halves before the call.
define <16 x bfloat> @call_ret_v16bf16(ptr %ptr) #0 {
;
; SSE2-LABEL: call_ret_v16bf16:
; SSE2: # %bb.0:
; SSE2-NEXT: pushq %rax
; SSE2-NEXT: movaps (%rdi), %xmm0
; SSE2-NEXT: movaps 16(%rdi), %xmm1
; SSE2-NEXT: callq returns_v16bf16@PLT
;
; FAST_ISEL_SSE2-LABEL: call_ret_v16bf16:
; FAST_ISEL_SSE2: # %bb.0:
; FAST_ISEL_SSE2-NEXT: pushq %r14
; FAST_ISEL_SSE2-NEXT: pushq %rbx
; FAST_ISEL_SSE2-NEXT: subq $104, %rsp
; FAST_ISEL_SSE2-NEXT: movdqa (%rdi), %xmm1
; FAST_ISEL_SSE2-NEXT: movdqa 16(%rdi), %xmm0
; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0
; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax
; FAST_ISEL_SSE2-NEXT: shll $16, %eax
; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d
; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx
; FAST_ISEL_SSE2-NEXT: shll $16, %ebx
; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero
; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT
; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax
; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax
; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax
; FAST_ISEL_SSE2-NEXT: shlq $32, %rax
; FAST_ISEL_SSE2-NEXT: orq %r14, %rax
; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0
; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; FAST_ISEL_SSE2-NEXT: callq returns_v16bf16@PLT
;
; AVX512BF16-LABEL: call_ret_v16bf16:
; AVX512BF16: # %bb.0:
; AVX512BF16-NEXT: pushq %rax
; AVX512BF16-NEXT: vmovaps (%rdi), %ymm0
; AVX512BF16-NEXT: callq returns_v16bf16@PLT
;
; FAST_ISEL_AVX512BF16-LABEL: call_ret_v16bf16:
; FAST_ISEL_AVX512BF16: # %bb.0:
; FAST_ISEL_AVX512BF16-NEXT: pushq %rax
; FAST_ISEL_AVX512BF16-NEXT: vmovaps (%rdi), %ymm0
; FAST_ISEL_AVX512BF16-NEXT: callq returns_v16bf16@PLT
;
; AVXNECONVERT-LABEL: call_ret_v16bf16:
; AVXNECONVERT: # %bb.0:
; AVXNECONVERT-NEXT: pushq %rax
; AVXNECONVERT-NEXT: vmovaps (%rdi), %ymm0
; AVXNECONVERT-NEXT: callq returns_v16bf16@PLT
;
; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v16bf16:
; FAST_ISEL_AVXNECONVERT: # %bb.0:
; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax
; FAST_ISEL_AVXNECONVERT-NEXT: vmovaps (%rdi), %ymm0
; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v16bf16@PLT
%val = load <16 x bfloat>, ptr %ptr
call <16 x bfloat> @returns_v16bf16(<16 x bfloat> %val)
unreachable
}
; Attribute group #0, referenced by the call_ret_* test functions: marks them
; nounwind.
attributes #0 = { nounwind }