llvm-project/llvm/test/CodeGen/X86/statepoint-stack-usage.ll
Matthias Braun 189900eb14 X86: Stop assigning register costs for longer encodings.
This stops reporting CostPerUse 1 for `R8`-`R15` and `XMM8`-`XMM31`.
This was previously done because instruction encodings require a REX
prefix when using them, resulting in longer instruction encodings. I
found that this regresses the quality of the register allocation as the
costs impose an ordering on eviction candidates. I also feel that there
is a bit of an impedance mismatch, as the actual costs occur when
encoding instructions using those registers, but the order of VReg
assignments is not primarily ordered by number of Defs+Uses.

I did extensive measurements with the llvm-test-suite with SPEC2006 +
SPEC2017 included; internal services showed similar patterns. Generally
there are a lot of improvements but also a lot of regressions. But on
average the allocation quality seems to improve at the cost of a small
code size regression.

Results for measuring static and dynamic instruction counts:

Dynamic Counts (scaled by execution frequency) / Optimization Remarks:
    Spills+FoldedSpills   -5.6%
    Reloads+FoldedReloads -4.2%
    Copies                -0.1%

Static / LLVM Statistics:
    regalloc.NumSpills    mean -1.6%, geomean -2.8%
    regalloc.NumReloads   mean -1.7%, geomean -3.1%
    size..text            mean +0.4%, geomean +0.4%

Static / LLVM Statistics:
    regalloc.NumSpills    mean -2.2%, geomean -3.1%
    regalloc.NumReloads   mean -2.6%, geomean -3.9%
    size..text            mean +0.6%, geomean +0.6%

Static / LLVM Statistics:
    regalloc.NumSpills   mean -3.0%
    regalloc.NumReloads  mean -3.3%
    size..text           mean +0.3%, geomean +0.3%

Differential Revision: https://reviews.llvm.org/D133902
2022-09-30 16:01:33 -07:00

135 lines
8.2 KiB
LLVM

; Test driver: llc reads this file on stdin and its assembly output is piped
; to FileCheck, which matches it against the directives embedded in the
; comments of this file.
; NOTE(review): -stack-symbol-ordering=0 presumably keeps llc from reordering
; stack objects so the slot offsets matched below stay stable - confirm.
; RUN: llc -verify-machineinstrs -stack-symbol-ordering=0 < %s | FileCheck %s
; Module-wide data layout and target: x86-64 Linux.
target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
; This test is checking to make sure that we reuse the same stack slots
; for GC values spilled over two different call sites. Since the order
; of GC arguments differs, naive lowering code would insert loads and
; stores to rearrange items on the stack. We need to make sure (for
; performance) that this doesn't happen.
define i32 @back_to_back_calls(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) #1 gc "statepoint-example" {
; CHECK-LABEL: back_to_back_calls
; The exact stores don't matter, but there need to be three stack slots created
; (one store each from the three incoming argument registers).
; CHECK-DAG: movq %rdi, {{[0-9]*}}(%rsp)
; CHECK-DAG: movq %rdx, {{[0-9]*}}(%rsp)
; CHECK-DAG: movq %rsi, {{[0-9]*}}(%rsp)
; There should be no more than three moves
; CHECK-NOT: movq
; First statepoint: the gc-live list is (%a, %b, %c).
%safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["gc-live" (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c), "deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
; Relocated copies of %a, %b, %c; the final i32 operand picks the entry in the
; gc-live list above (0, 1, 2).
%a1 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token, i32 0, i32 0)
%b1 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token, i32 0, i32 1)
%c1 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token, i32 0, i32 2)
; CHECK: callq
; This is the key check. There should NOT be any memory moves here
; CHECK-NOT: movq
; Second statepoint: same three values, but the gc-live list is reversed
; (%c1, %b1, %a1), which is what would tempt a naive lowering to shuffle slots.
%safepoint_token2 = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["gc-live" (ptr addrspace(1) %c1, ptr addrspace(1) %b1, ptr addrspace(1) %a1), "deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
; The final operands are reversed as well (2, 1, 0) to follow the reversed list.
%a2 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token2, i32 0, i32 2)
%b2 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token2, i32 0, i32 1)
%c2 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token2, i32 0, i32 0)
; CHECK: callq
ret i32 1
}
; This test simply checks that minor changes in vm state don't prevent slots
; being reused for gc values. It has the same shape as @back_to_back_calls,
; except that the second statepoint's "deopt" bundle also names %a1 and %c1.
define i32 @reserve_first(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) #1 gc "statepoint-example" {
; CHECK-LABEL: reserve_first
; The exact stores don't matter, but there need to be three stack slots created
; CHECK-DAG: movq %rdi, {{[0-9]*}}(%rsp)
; CHECK-DAG: movq %rdx, {{[0-9]*}}(%rsp)
; CHECK-DAG: movq %rsi, {{[0-9]*}}(%rsp)
; First statepoint: gc-live list is (%a, %b, %c); the deopt bundle holds only constants.
%safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["gc-live" (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c), "deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
; Relocated copies of %a, %b, %c (final operand = position in the gc-live list).
%a1 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token, i32 0, i32 0)
%b1 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token, i32 0, i32 1)
%c1 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token, i32 0, i32 2)
; CHECK: callq
; This is the key check. There should NOT be any memory moves here
; CHECK-NOT: movq
; Second statepoint: reversed gc-live list, and the deopt bundle now references
; two of the GC pointers (%a1 and %c1) in place of constants.
%safepoint_token2 = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["gc-live" (ptr addrspace(1) %c1, ptr addrspace(1) %b1, ptr addrspace(1) %a1), "deopt" (ptr addrspace(1) %a1, i32 0, ptr addrspace(1) %c1, i32 0, i32 0)]
; Final operands are reversed (2, 1, 0) to follow the reversed gc-live list.
%a2 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token2, i32 0, i32 2)
%b2 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token2, i32 0, i32 1)
%c2 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token2, i32 0, i32 0)
; CHECK: callq
ret i32 1
}
; Check that we reuse the same stack slot across multiple calls. The use of
; more than two calls here is critical. We've had a bug which allowed reuse
; exactly once which went undetected for a long time.
; Unlike the tests above, the values here are plain i32 arguments that appear
; only in the "deopt" bundle; there are no GC pointers and no relocates.
define i32 @back_to_back_deopt(i32 %a, i32 %b, i32 %c) #1
gc "statepoint-example" {
; CHECK-LABEL: back_to_back_deopt
; The exact stores don't matter, but there need to be three stack slots created
; Before the first call the values are stored straight from the argument
; registers to offsets 12, 8 and 4.
; CHECK-DAG: movl %edi, 12(%rsp)
; CHECK-DAG: movl %esi, 8(%rsp)
; CHECK-DAG: movl %edx, 4(%rsp)
; CHECK: callq
; Before each of the remaining three calls the values are expected to be
; re-stored from %r14d/%ebp/%ebx to the SAME three offsets, i.e. the slots are
; reused every time rather than only once.
; NOTE(review): the specific registers named here depend on register
; allocation decisions and may need updating when the allocator changes.
; CHECK-DAG: movl %r14d, 12(%rsp)
; CHECK-DAG: movl %ebp, 8(%rsp)
; CHECK-DAG: movl %ebx, 4(%rsp)
; CHECK: callq
; CHECK-DAG: movl %r14d, 12(%rsp)
; CHECK-DAG: movl %ebp, 8(%rsp)
; CHECK-DAG: movl %ebx, 4(%rsp)
; CHECK: callq
; CHECK-DAG: movl %r14d, 12(%rsp)
; CHECK-DAG: movl %ebp, 8(%rsp)
; CHECK-DAG: movl %ebx, 4(%rsp)
; CHECK: callq
; Four identical statepoints, each carrying (%a, %b, %c) as deopt state.
call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)]
call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)]
call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)]
call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["deopt" (i32 %a, i32 %b, i32 %c)]
ret i32 1
}
; Test that stack slots are reused for invokes
; Same pattern as @back_to_back_calls, but each statepoint is an invoke with
; its own landing pad instead of a plain call.
define i32 @back_to_back_invokes(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) #1 gc "statepoint-example" personality ptr @"personality_function" {
; CHECK-LABEL: back_to_back_invokes
entry:
; The exact stores don't matter, but there need to be three stack slots created
; CHECK-DAG: movq %rdi, {{[0-9]*}}(%rsp)
; CHECK-DAG: movq %rdx, {{[0-9]*}}(%rsp)
; CHECK-DAG: movq %rsi, {{[0-9]*}}(%rsp)
; CHECK: callq
; First statepoint (invoke): gc-live list is (%a, %b, %c).
%safepoint_token = invoke token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["gc-live" (ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c), "deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
to label %normal_return unwind label %exceptional_return
normal_return:
; Relocated copies of %a, %b, %c on the non-exceptional path.
%a1 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token, i32 0, i32 0)
%b1 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token, i32 0, i32 1)
%c1 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token, i32 0, i32 2)
; NOTE(review): an earlier comment here said this should work "even through
; bitcasts"; no bitcast appears in this function, so that remark looks stale.
; This is the key check. There should NOT be any memory moves here
; CHECK-NOT: movq
; CHECK: callq
; Second statepoint (invoke): same values with the gc-live list reversed.
%safepoint_token2 = invoke token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(void ()) undef, i32 0, i32 0, i32 0, i32 0) ["gc-live" (ptr addrspace(1) %c1, ptr addrspace(1) %b1, ptr addrspace(1) %a1), "deopt" (i32 0, i32 -1, i32 0, i32 0, i32 0)]
to label %normal_return2 unwind label %exceptional_return2
normal_return2:
; Final operands are reversed (2, 1, 0) to follow the reversed gc-live list.
%a2 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token2, i32 0, i32 2)
%b2 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token2, i32 0, i32 1)
%c2 = tail call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token %safepoint_token2, i32 0, i32 0)
ret i32 1
; Unwind destination of the first invoke: cleanup-only landing pad, returns 0.
exceptional_return:
%landing_pad = landingpad { ptr, i32 }
cleanup
ret i32 0
; Unwind destination of the second invoke: cleanup-only landing pad, returns 0.
exceptional_return2:
%landing_pad2 = landingpad { ptr, i32 }
cleanup
ret i32 0
}
; Declarations shared by every test function in this file.
; gc.relocate takes a statepoint token plus two i32 indices and yields the
; relocated addrspace(1) pointer. It carries no explicit attribute group:
; this file defines only #1.
declare ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token, i32, i32)
; gc.statepoint is variadic and produces the token consumed by gc.relocate.
declare token @llvm.experimental.gc.statepoint.p0(i64, i32, ptr, i32, i32, ...)
; Personality routine named by @back_to_back_invokes; only declared, never defined here.
declare i32 @"personality_function"()
; Attribute group applied to all four test functions.
attributes #1 = { uwtable }