llvm-project/llvm/test/CodeGen/ARM/fp16-insert-extract.ll
Alexey Bataev 2cca53c815 [DAG]Introduce llvm::processShuffleMasks and use it for shuffles in DAG Type Legalizer.
We can process the long shuffles (working across several actual
vector registers) in the best way if we take the actual register
representation into account. We can build a more correct representation of
register shuffles and improve the number of recognised buildvector sequences.
Also, the same function can be used to improve the cost model for the
shuffles in future patches.

Part of D100486

Differential Revision: https://reviews.llvm.org/D115653
2022-04-20 09:37:16 -07:00

369 lines
12 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=hard < %s | FileCheck %s --check-prefix=CHECKHARD
; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=soft < %s | FileCheck %s --check-prefix=CHECKSOFT
; Extracts lane 1 of a <4 x half> vector and extends it to float.
; Expected output: a single vcvtt.f32.f16 reading s0. With the soft-float ABI
; the vector is first moved from r0/r1 into d0 and the result is returned in r0.
define float @test_vget_lane_f16_1(<4 x half> %a) nounwind {
; CHECKHARD-LABEL: test_vget_lane_f16_1:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vcvtt.f32.f16 s0, s0
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: test_vget_lane_f16_1:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d0, r0, r1
; CHECKSOFT-NEXT: vcvtt.f32.f16 s0, s0
; CHECKSOFT-NEXT: vmov r0, s0
; CHECKSOFT-NEXT: bx lr
entry:
; Lane 1 of the 4-element half vector.
%elt = extractelement <4 x half> %a, i32 1
%conv = fpext half %elt to float
ret float %conv
}
; Extracts lane 2 of a <4 x half> vector and extends it to float.
; Expected output: a single vcvtb.f32.f16 reading s1. With the soft-float ABI
; the vector is first moved from r0/r1 into d0 and the result is returned in r0.
define float @test_vget_lane_f16_2(<4 x half> %a) nounwind {
; CHECKHARD-LABEL: test_vget_lane_f16_2:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vcvtb.f32.f16 s0, s1
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: test_vget_lane_f16_2:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d0, r0, r1
; CHECKSOFT-NEXT: vcvtb.f32.f16 s0, s1
; CHECKSOFT-NEXT: vmov r0, s0
; CHECKSOFT-NEXT: bx lr
entry:
; Lane 2 of the 4-element half vector.
%elt = extractelement <4 x half> %a, i32 2
%conv = fpext half %elt to float
ret float %conv
}
; Extracts lane 6 of an <8 x half> vector and extends it to float.
; Expected output: a single vcvtb.f32.f16 reading s3. With the soft-float ABI
; only the r2/r3 half of the argument is moved into d1 before the conversion.
define float @test_vget_laneq_f16_6(<8 x half> %a) nounwind {
; CHECKHARD-LABEL: test_vget_laneq_f16_6:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vcvtb.f32.f16 s0, s3
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: test_vget_laneq_f16_6:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d1, r2, r3
; CHECKSOFT-NEXT: vcvtb.f32.f16 s0, s3
; CHECKSOFT-NEXT: vmov r0, s0
; CHECKSOFT-NEXT: bx lr
entry:
; Lane 6 of the 8-element half vector.
%elt = extractelement <8 x half> %a, i32 6
%conv = fpext half %elt to float
ret float %conv
}
; Extracts lane 7 (the last lane) of an <8 x half> vector and extends it to
; float. Expected output: a single vcvtt.f32.f16 reading s3. With the
; soft-float ABI only the r2/r3 half of the argument is moved into d1 first.
define float @test_vget_laneq_f16_7(<8 x half> %a) nounwind {
; CHECKHARD-LABEL: test_vget_laneq_f16_7:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vcvtt.f32.f16 s0, s3
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: test_vget_laneq_f16_7:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d1, r2, r3
; CHECKSOFT-NEXT: vcvtt.f32.f16 s0, s3
; CHECKSOFT-NEXT: vmov r0, s0
; CHECKSOFT-NEXT: bx lr
entry:
; Lane 7 of the 8-element half vector.
%elt = extractelement <8 x half> %a, i32 7
%conv = fpext half %elt to float
ret float %conv
}
; Inserts a scalar half into lane 0 of an undef <4 x half> vector.
; With the hard-float ABI no instruction is expected before the return (only
; a register-kill annotation). With the soft-float ABI the value is moved from
; r0 into s0 and the vector is returned in r0/r1.
define <4 x half> @insert_v4f16(half %a) {
; CHECKHARD-LABEL: insert_v4f16:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: @ kill: def $s0 killed $s0 def $d0
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: insert_v4f16:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov.f16 s0, r0
; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: bx lr
entry:
; All lanes other than lane 0 are left undef.
%res = insertelement <4 x half> undef, half %a, i32 0
ret <4 x half> %res
}
; Inserts a scalar half into lane 0 of an undef <8 x half> vector.
; With the hard-float ABI no instruction is expected before the return (only
; a register-kill annotation). With the soft-float ABI the value is moved from
; r0 into s0 and the vector is returned in r0-r3.
define <8 x half> @insert_v8f16(half %a) {
; CHECKHARD-LABEL: insert_v8f16:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: @ kill: def $s0 killed $s0 def $q0
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: insert_v8f16:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov.f16 s0, r0
; CHECKSOFT-NEXT: vmov r2, r3, d1
; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: bx lr
entry:
; All lanes other than lane 0 are left undef.
%res = insertelement <8 x half> undef, half %a, i32 0
ret <8 x half> %res
}
; Truncates a float to half and inserts it into lane 3 of a <4 x half> vector.
; Expected output: a single vcvtt.f16.f32 writing s1 from s2. With the
; soft-float ABI the vector (r0/r1) and the float (r2) are first moved into
; d0 and s2, and the result is returned in r0/r1.
define <4 x half> @test_vset_lane_f16(<4 x half> %a, float %fb) nounwind {
; CHECKHARD-LABEL: test_vset_lane_f16:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vcvtt.f16.f32 s1, s2
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: test_vset_lane_f16:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d0, r0, r1
; CHECKSOFT-NEXT: vmov s2, r2
; CHECKSOFT-NEXT: vcvtt.f16.f32 s1, s2
; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: bx lr
entry:
%b = fptrunc float %fb to half
; Lane 3 is the last lane of the 4-element vector.
%x = insertelement <4 x half> %a, half %b, i32 3
ret <4 x half> %x
}
; Truncates a float to half and inserts it into lane 1 of an <8 x half> vector.
; Expected output: a single vcvtt.f16.f32 writing s0 from s4. With the
; soft-float ABI the vector arrives in r0-r3 and the float is loaded from the
; stack ([sp]) into s4; the result is returned in r0-r3.
define <8 x half> @test_vset_laneq_f16_1(<8 x half> %a, float %fb) nounwind {
; CHECKHARD-LABEL: test_vset_laneq_f16_1:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vcvtt.f16.f32 s0, s4
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: test_vset_laneq_f16_1:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d1, r2, r3
; CHECKSOFT-NEXT: vldr s4, [sp]
; CHECKSOFT-NEXT: vmov d0, r0, r1
; CHECKSOFT-NEXT: vcvtt.f16.f32 s0, s4
; CHECKSOFT-NEXT: vmov r2, r3, d1
; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: bx lr
entry:
%b = fptrunc float %fb to half
; Lane 1 of the 8-element half vector.
%x = insertelement <8 x half> %a, half %b, i32 1
ret <8 x half> %x
}
; Truncates a float to half and inserts it into lane 7 (the last lane) of an
; <8 x half> vector. Expected output: a single vcvtt.f16.f32 writing s3 from
; s4. With the soft-float ABI the vector arrives in r0-r3 and the float is
; loaded from the stack ([sp]) into s4; the result is returned in r0-r3.
define <8 x half> @test_vset_laneq_f16_7(<8 x half> %a, float %fb) nounwind {
; CHECKHARD-LABEL: test_vset_laneq_f16_7:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vcvtt.f16.f32 s3, s4
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: test_vset_laneq_f16_7:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d1, r2, r3
; CHECKSOFT-NEXT: vldr s4, [sp]
; CHECKSOFT-NEXT: vmov d0, r0, r1
; CHECKSOFT-NEXT: vcvtt.f16.f32 s3, s4
; CHECKSOFT-NEXT: vmov r0, r1, d0
; CHECKSOFT-NEXT: vmov r2, r3, d1
; CHECKSOFT-NEXT: bx lr
entry:
%b = fptrunc float %fb to half
; Lane 7 of the 8-element half vector.
%x = insertelement <8 x half> %a, half %b, i32 7
ret <8 x half> %x
}
; Builds three <8 x half> vectors from a <32 x half> source by taking every
; third element starting at offsets 0, 1 and 2 (indices 0..21, 1..22 and
; 2..23), then adds the three vectors together.
; The function is declared arm_aapcs_vfpcc, and the expected instruction
; sequence listed below is the same for both RUN configurations.
define arm_aapcs_vfpcc <8 x half> @shuffle3step_f16(<32 x half> %src) {
; CHECKHARD-LABEL: shuffle3step_f16:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vmov r1, s0
; CHECKHARD-NEXT: vmovx.f16 s12, s1
; CHECKHARD-NEXT: vmov r0, s12
; CHECKHARD-NEXT: vrev32.16 d16, d3
; CHECKHARD-NEXT: vext.16 d17, d4, d5, #2
; CHECKHARD-NEXT: vmovx.f16 s12, s4
; CHECKHARD-NEXT: vext.16 d16, d16, d3, #1
; CHECKHARD-NEXT: vext.16 d16, d17, d16, #2
; CHECKHARD-NEXT: vext.16 d16, d16, d17, #1
; CHECKHARD-NEXT: vext.16 d17, d16, d16, #1
; CHECKHARD-NEXT: vmov.16 d16[0], r1
; CHECKHARD-NEXT: vmov.16 d16[1], r0
; CHECKHARD-NEXT: vmov r0, s3
; CHECKHARD-NEXT: vmov.16 d16[2], r0
; CHECKHARD-NEXT: vmov r0, s12
; CHECKHARD-NEXT: vmovx.f16 s12, s0
; CHECKHARD-NEXT: vmov r1, s12
; CHECKHARD-NEXT: vmovx.f16 s12, s3
; CHECKHARD-NEXT: vmov.16 d16[3], r0
; CHECKHARD-NEXT: vmov r0, s2
; CHECKHARD-NEXT: vmov.16 d18[0], r1
; CHECKHARD-NEXT: vmov.16 d18[1], r0
; CHECKHARD-NEXT: vmov r0, s12
; CHECKHARD-NEXT: vdup.16 q3, d3[1]
; CHECKHARD-NEXT: vmov r1, s12
; CHECKHARD-NEXT: vmovx.f16 s12, s9
; CHECKHARD-NEXT: vmov.16 d18[2], r0
; CHECKHARD-NEXT: vmov r0, s5
; CHECKHARD-NEXT: vmov.16 d18[3], r0
; CHECKHARD-NEXT: vmov r0, s8
; CHECKHARD-NEXT: vmov.16 d19[0], r1
; CHECKHARD-NEXT: vmov.16 d19[1], r0
; CHECKHARD-NEXT: vmov r0, s12
; CHECKHARD-NEXT: vmov.16 d19[2], r0
; CHECKHARD-NEXT: vmov r0, s11
; CHECKHARD-NEXT: vmov.16 d19[3], r0
; CHECKHARD-NEXT: vadd.f16 q8, q8, q9
; CHECKHARD-NEXT: vext.16 d18, d0, d1, #2
; CHECKHARD-NEXT: vmovx.f16 s0, s8
; CHECKHARD-NEXT: vmov r0, s0
; CHECKHARD-NEXT: vdup.16 q0, d3[2]
; CHECKHARD-NEXT: vext.16 d19, d18, d2, #3
; CHECKHARD-NEXT: vmov r1, s0
; CHECKHARD-NEXT: vext.16 d18, d2, d18, #1
; CHECKHARD-NEXT: vmovx.f16 s0, s11
; CHECKHARD-NEXT: vext.16 d18, d18, d19, #2
; CHECKHARD-NEXT: vext.16 d18, d18, d18, #1
; CHECKHARD-NEXT: vmov.16 d19[0], r1
; CHECKHARD-NEXT: vmov.16 d19[1], r0
; CHECKHARD-NEXT: vmov r0, s10
; CHECKHARD-NEXT: vmov.16 d19[2], r0
; CHECKHARD-NEXT: vmov r0, s0
; CHECKHARD-NEXT: vmov.16 d19[3], r0
; CHECKHARD-NEXT: vadd.f16 q0, q8, q9
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: shuffle3step_f16:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov r1, s0
; CHECKSOFT-NEXT: vmovx.f16 s12, s1
; CHECKSOFT-NEXT: vmov r0, s12
; CHECKSOFT-NEXT: vrev32.16 d16, d3
; CHECKSOFT-NEXT: vext.16 d17, d4, d5, #2
; CHECKSOFT-NEXT: vmovx.f16 s12, s4
; CHECKSOFT-NEXT: vext.16 d16, d16, d3, #1
; CHECKSOFT-NEXT: vext.16 d16, d17, d16, #2
; CHECKSOFT-NEXT: vext.16 d16, d16, d17, #1
; CHECKSOFT-NEXT: vext.16 d17, d16, d16, #1
; CHECKSOFT-NEXT: vmov.16 d16[0], r1
; CHECKSOFT-NEXT: vmov.16 d16[1], r0
; CHECKSOFT-NEXT: vmov r0, s3
; CHECKSOFT-NEXT: vmov.16 d16[2], r0
; CHECKSOFT-NEXT: vmov r0, s12
; CHECKSOFT-NEXT: vmovx.f16 s12, s0
; CHECKSOFT-NEXT: vmov r1, s12
; CHECKSOFT-NEXT: vmovx.f16 s12, s3
; CHECKSOFT-NEXT: vmov.16 d16[3], r0
; CHECKSOFT-NEXT: vmov r0, s2
; CHECKSOFT-NEXT: vmov.16 d18[0], r1
; CHECKSOFT-NEXT: vmov.16 d18[1], r0
; CHECKSOFT-NEXT: vmov r0, s12
; CHECKSOFT-NEXT: vdup.16 q3, d3[1]
; CHECKSOFT-NEXT: vmov r1, s12
; CHECKSOFT-NEXT: vmovx.f16 s12, s9
; CHECKSOFT-NEXT: vmov.16 d18[2], r0
; CHECKSOFT-NEXT: vmov r0, s5
; CHECKSOFT-NEXT: vmov.16 d18[3], r0
; CHECKSOFT-NEXT: vmov r0, s8
; CHECKSOFT-NEXT: vmov.16 d19[0], r1
; CHECKSOFT-NEXT: vmov.16 d19[1], r0
; CHECKSOFT-NEXT: vmov r0, s12
; CHECKSOFT-NEXT: vmov.16 d19[2], r0
; CHECKSOFT-NEXT: vmov r0, s11
; CHECKSOFT-NEXT: vmov.16 d19[3], r0
; CHECKSOFT-NEXT: vadd.f16 q8, q8, q9
; CHECKSOFT-NEXT: vext.16 d18, d0, d1, #2
; CHECKSOFT-NEXT: vmovx.f16 s0, s8
; CHECKSOFT-NEXT: vmov r0, s0
; CHECKSOFT-NEXT: vdup.16 q0, d3[2]
; CHECKSOFT-NEXT: vext.16 d19, d18, d2, #3
; CHECKSOFT-NEXT: vmov r1, s0
; CHECKSOFT-NEXT: vext.16 d18, d2, d18, #1
; CHECKSOFT-NEXT: vmovx.f16 s0, s11
; CHECKSOFT-NEXT: vext.16 d18, d18, d19, #2
; CHECKSOFT-NEXT: vext.16 d18, d18, d18, #1
; CHECKSOFT-NEXT: vmov.16 d19[0], r1
; CHECKSOFT-NEXT: vmov.16 d19[1], r0
; CHECKSOFT-NEXT: vmov r0, s10
; CHECKSOFT-NEXT: vmov.16 d19[2], r0
; CHECKSOFT-NEXT: vmov r0, s0
; CHECKSOFT-NEXT: vmov.16 d19[3], r0
; CHECKSOFT-NEXT: vadd.f16 q0, q8, q9
; CHECKSOFT-NEXT: bx lr
entry:
; Stride-3 selections starting at source elements 0, 1 and 2 respectively.
%s1 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
%s2 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
%s3 = shufflevector <32 x half> %src, <32 x half> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
; Element-wise sum of the three selections.
%a = fadd <8 x half> %s1, %s2
%r = fadd <8 x half> %a, %s3
ret <8 x half> %r
}
; Extracts lane 0 of a <4 x half> vector and returns its bits as an i16.
; Expected output: a single vmov.u16 lane move into r0. With the soft-float
; ABI the vector is first moved from r0/r1 into d16.
define i16 @extract_v4i16(<4 x half> %a) {
; CHECKHARD-LABEL: extract_v4i16:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vmov.u16 r0, d0[0]
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: extract_v4i16:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d16, r0, r1
; CHECKSOFT-NEXT: vmov.u16 r0, d16[0]
; CHECKSOFT-NEXT: bx lr
entry:
%elt = extractelement <4 x half> %a, i32 0
; Reinterpret the half's bit pattern as an integer; no value conversion.
%t = bitcast half %elt to i16
ret i16 %t
}
; Extracts lane 0 of an <8 x half> vector and returns its bits as an i16.
; Expected output: a single vmov.u16 lane move into r0. With the soft-float
; ABI only the r0/r1 half of the argument is moved into d16 first.
define i16 @extract_v8i16(<8 x half> %a) {
; CHECKHARD-LABEL: extract_v8i16:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vmov.u16 r0, d0[0]
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: extract_v8i16:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d16, r0, r1
; CHECKSOFT-NEXT: vmov.u16 r0, d16[0]
; CHECKSOFT-NEXT: bx lr
entry:
%elt = extractelement <8 x half> %a, i32 0
; Reinterpret the half's bit pattern as an integer; no value conversion.
%t = bitcast half %elt to i16
ret i16 %t
}
; Extracts lane 0 of a <4 x half> vector, reinterprets its bits as an i16 and
; sign-extends the result to i32.
; Expected output: a vmov.u16 lane move followed by a separate sxth. With the
; soft-float ABI the vector is first moved from r0/r1 into d16.
define i32 @extract_v4s32(<4 x half> %a) {
; CHECKHARD-LABEL: extract_v4s32:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vmov.u16 r0, d0[0]
; CHECKHARD-NEXT: sxth r0, r0
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: extract_v4s32:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d16, r0, r1
; CHECKSOFT-NEXT: vmov.u16 r0, d16[0]
; CHECKSOFT-NEXT: sxth r0, r0
; CHECKSOFT-NEXT: bx lr
entry:
%elt = extractelement <4 x half> %a, i32 0
; Reinterpret the half's bit pattern as an integer, then widen it as signed.
%t = bitcast half %elt to i16
%s = sext i16 %t to i32
ret i32 %s
}
; Extracts lane 0 of an <8 x half> vector, reinterprets its bits as an i16 and
; sign-extends the result to i32.
; Expected output: a vmov.u16 lane move followed by a separate sxth. With the
; soft-float ABI only the r0/r1 half of the argument is moved into d16 first.
define i32 @extract_v8s32(<8 x half> %a) {
; CHECKHARD-LABEL: extract_v8s32:
; CHECKHARD: @ %bb.0: @ %entry
; CHECKHARD-NEXT: vmov.u16 r0, d0[0]
; CHECKHARD-NEXT: sxth r0, r0
; CHECKHARD-NEXT: bx lr
;
; CHECKSOFT-LABEL: extract_v8s32:
; CHECKSOFT: @ %bb.0: @ %entry
; CHECKSOFT-NEXT: vmov d16, r0, r1
; CHECKSOFT-NEXT: vmov.u16 r0, d16[0]
; CHECKSOFT-NEXT: sxth r0, r0
; CHECKSOFT-NEXT: bx lr
entry:
%elt = extractelement <8 x half> %a, i32 0
; Reinterpret the half's bit pattern as an integer, then widen it as signed.
%t = bitcast half %elt to i16
%s = sext i16 %t to i32
ret i32 %s
}