llvm-project/llvm/test/CodeGen/AArch64/selectopt-const.ll
Guy David 58d70dc62b
[AArch64] Keep floating-point conversion in SIMD (#147707)
Stores can be issued faster if the result is kept in the SIMD/FP
registers.
The `HasOneUse` guards against creating two floating point conversions,
if for example there's some arithmetic done on the converted value as
well. Another approach would be to inspect the user instructions during
lowering, but I don't see that type of check in the lowering too often.
2025-07-30 14:53:56 +03:00

73 lines
2.9 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -O3 < %s | FileCheck %s
; test_const: for each index i in [0, %n) (the loop is skipped when %n <= 0):
;   d      = (double)(in1[i] + 10) / (double)fabs(in2[i] * C + 1.0)
;   out[i] = (i32) tbl[d < 5.0 ? 4 : 0]
; where C is the float constant written below in hex form. Always returns 0.
;
; What the expected assembly below pins down:
;   * the select between the two constant indices stays a csel between a
;     byte offset of 16 and xzr (no extra branch inside the loop);
;   * the float-to-int conversion is done in an FP/SIMD register
;     (fcvtzs s3, s3) and stored straight from it with st1, without a
;     move to a general-purpose register.
define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-LABEL: test_const:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: cmp w3, #1
; CHECK-NEXT: b.lt .LBB0_3
; CHECK-NEXT: // %bb.1: // %for.body.preheader
; CHECK-NEXT: mov w9, #1267 // =0x4f3
; CHECK-NEXT: fmov s1, #1.00000000
; CHECK-NEXT: fmov d2, #5.00000000
; CHECK-NEXT: mov w8, w3
; CHECK-NEXT: movk w9, #16309, lsl #16
; CHECK-NEXT: fmov s0, w9
; CHECK-NEXT: mov w9, #16 // =0x10
; CHECK-NEXT: .p2align 5, , 16
; CHECK-NEXT: .LBB0_2: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr s4, [x1], #4
; CHECK-NEXT: ldr w10, [x0], #4
; CHECK-NEXT: add w10, w10, #10
; CHECK-NEXT: scvtf d3, w10
; CHECK-NEXT: fmadd s4, s4, s0, s1
; CHECK-NEXT: fabs s4, s4
; CHECK-NEXT: fcvt d4, s4
; CHECK-NEXT: fdiv d3, d3, d4
; CHECK-NEXT: fcmp d3, d2
; CHECK-NEXT: csel x10, x9, xzr, lt
; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: ldr s3, [x4, x10]
; CHECK-NEXT: fcvtzs s3, s3
; CHECK-NEXT: st1 { v3.s }[0], [x2], #4
; CHECK-NEXT: b.ne .LBB0_2
; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
; CHECK-NEXT: mov w0, wzr
; CHECK-NEXT: ret
entry:
; Guard: only enter the loop when %n is strictly positive.
%cmp15 = icmp sgt i32 %n, 0
br i1 %cmp15, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
; Widen the trip count to i64; nneg is valid because %n > 0 on this path.
%wide.trip.count = zext nneg i32 %n to i64
br label %for.body
for.cond.cleanup: ; preds = %for.body, %entry
; Common exit for both the skipped-loop and finished-loop paths.
ret i32 0
for.body: ; preds = %for.body.preheader, %for.body
; i64 induction variable counting up from 0 to %wide.trip.count.
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; Numerator: (double)(in1[i] + 10).
%arrayidx = getelementptr inbounds i32, ptr %in1, i64 %indvars.iv
%0 = load i32, ptr %arrayidx, align 4
%add = add nsw i32 %0, 10
%conv = sitofp i32 %add to double
; Denominator: (double)fabs(in2[i] * C + 1.0), computed in float first.
%arrayidx2 = getelementptr inbounds float, ptr %in2, i64 %indvars.iv
%1 = load float, ptr %arrayidx2, align 4
; The hex literal is a float constant, approximately 1.4142135; the expected
; assembly materializes it as the bit pattern 0x3FB504F3 (mov + movk).
%mul = fmul fast float %1, 0x3FF6A09E60000000
; With the fast flags, the multiply and add are expected to fuse into one fmadd.
%add3 = fadd fast float %mul, 1.000000e+00
%2 = tail call fast float @llvm.fabs.f32(float %add3)
%3 = fpext float %2 to double
%div = fdiv fast double %conv, %3
; Pick table element 4 when the quotient is below 5.0, otherwise element 0.
; Both arms are constants, so the float index scales to a byte offset of
; 16 or 0 in the expected assembly.
%cmp5 = fcmp fast olt double %div, 5.000000e+00
%idxprom6 = select i1 %cmp5, i64 4, i64 0
%arrayidx7 = getelementptr inbounds float, ptr %tbl, i64 %idxprom6
%4 = load float, ptr %arrayidx7, align 4
; The converted value has a single use (the store below), which is the case
; where the conversion is expected to stay in the FP/SIMD register file.
%conv8 = fptosi float %4 to i32
%arrayidx10 = getelementptr inbounds i32, ptr %out, i64 %indvars.iv
store i32 %conv8, ptr %arrayidx10, align 4
; Advance the induction variable and exit once it reaches the trip count.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}