llvm-project/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
Nicolai Hähnle e760d0619f
AMDGPU/PromoteAlloca: Refactor into analysis / commit phases (#170512)
This change is motivated by the overall goal of finding alternative ways
to promote allocas to VGPRs. The current solution is effectively limited
to allocas whose size matches a register class, and we can't keep adding
more register classes. We have some downstream work in this direction,
and I'm currently looking at cleaning that up to bring it upstream.

This refactor paves the way to adding a third way of promoting allocas,
on top of the existing alloca-to-vector and alloca-to-LDS. Much of the
analysis can be shared between the different promotion techniques.

Additionally, the idea behind splitting the pass into an analysis
phase and a commit phase is that it ought to allow us to more easily
make better "big picture" decisions about which allocas to promote, and
how, in the future.
2025-12-12 01:24:38 +00:00

74 lines
3.1 KiB
LLVM

; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
; REQUIRES: asserts
; Test case: two allocas with different numbers of users in straight-line code.
; The debug output is expected to report a final score of 1 for %simpleuser
; (a single store) and 4 for %manyusers (two loads and two stores made through
; GEPs), and to list %manyusers ahead of %simpleuser in the sorted worklist.
; CHECK-LABEL: Analyzing: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: [+1]: store i64 42, ptr addrspace(5) %simpleuser, align 8
; CHECK-NEXT: => Final Score:1
; CHECK-LABEL: Analyzing: %manyusers = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: [+1]: store i64 %v0.add, ptr addrspace(5) %manyusers.1, align 8
; CHECK-NEXT: [+1]: %v0 = load i64, ptr addrspace(5) %manyusers.1, align 8
; CHECK-NEXT: [+1]: store i64 %v1.add, ptr addrspace(5) %manyusers.2, align 8
; CHECK-NEXT: [+1]: %v1 = load i64, ptr addrspace(5) %manyusers.2, align 8
; CHECK-NEXT: => Final Score:4
; CHECK-NEXT: Sorted Worklist:
; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
define amdgpu_kernel void @simple_users_scores() #0 {
entry:
; should get a score of 1
%simpleuser = alloca [4 x i64], align 4, addrspace(5)
; should get a score of 4
%manyusers = alloca [4 x i64], align 4, addrspace(5)
; The only user of %simpleuser.
store i64 42, ptr addrspace(5) %simpleuser
; Read-modify-write of i64 element 2 of %manyusers (one load + one store).
%manyusers.1 = getelementptr i64, ptr addrspace(5) %manyusers, i64 2
%v0 = load i64, ptr addrspace(5) %manyusers.1
%v0.add = add i64 %v0, 1
store i64 %v0.add, ptr addrspace(5) %manyusers.1
; Read-modify-write of i64 element 1 of %manyusers (one load + one store).
; The increment uses %v1, the value just loaded from this element, so that
; this sequence mirrors the one above instead of leaving %v1 unused.
%manyusers.2 = getelementptr i64, ptr addrspace(5) %manyusers, i64 1
%v1 = load i64, ptr addrspace(5) %manyusers.2
%v1.add = add i64 %v1, 1
store i64 %v1.add, ptr addrspace(5) %manyusers.2
ret void
}
; Test case: one alloca whose users sit at different loop nesting levels.
; The expected debug output scores each user by the block it lives in:
; +1 for the stores in %entry and %exit, +5 for the store and load in
; %loop.outer, and +9 for the store and load in %loop.inner, for a final
; score of 30 (1 + 5 + 9 + 5 + 1 + 9).
; CHECK-LABEL: Analyzing: %stack = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: [+5]: store i64 32, ptr addrspace(5) %stack, align 8
; CHECK-NEXT: [+1]: store i64 42, ptr addrspace(5) %stack, align 8
; CHECK-NEXT: [+9]: store i64 32, ptr addrspace(5) %stack.1, align 8
; CHECK-NEXT: [+5]: %outer = load i64, ptr addrspace(5) %stack.1, align 8
; CHECK-NEXT: [+1]: store i64 64, ptr addrspace(5) %stack.2, align 8
; CHECK-NEXT: [+9]: %inner = load i64, ptr addrspace(5) %stack.2, align 8
; CHECK-NEXT: => Final Score:30
; NOTE(review): neither %x nor the unnamed i2 parameter is used in the body.
define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
entry:
; should get a score of 30 (see the expected final score above)
%stack = alloca [4 x i64], align 4, addrspace(5)
; Byte offsets 8 and 16 into %stack.
%stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 8
%stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 16
; User outside any loop: expected to contribute +1.
store i64 42, ptr addrspace(5) %stack
br label %loop.outer
loop.outer:
; Users in the outer loop only: each expected to contribute +5.
store i64 32, ptr addrspace(5) %stack
%outer = load i64, ptr addrspace(5) %stack.1
br label %loop.inner
loop.inner:
; Users in the inner loop: each expected to contribute +9.
store i64 32, ptr addrspace(5) %stack.1
%inner = load i64, ptr addrspace(5) %stack.2
%inner.cmp = icmp sge i64 %inner, 0
; Either repeat the inner loop or go back around the outer loop; neither
; successor leaves the loop nest.
br i1 %inner.cmp, label %loop.inner, label %loop.outer
exit:
; NOTE(review): no branch in this function targets %exit, so this block has
; no predecessors; its store is still expected in the output with +1.
; Presumably intentional for the test - confirm before changing the CFG.
store i64 64, ptr addrspace(5) %stack.2
ret void
}