This change is motivated by the overall goal of finding alternative ways to promote allocas to VGPRs. The current solution is effectively limited to allocas whose size matches a register class, and we can't keep adding more register classes. We have some downstream work in this direction, and I'm currently looking at cleaning that up to bring it upstream. This refactor paves the way to adding a third way of promoting allocas, on top of the existing alloca-to-vector and alloca-to-LDS. Much of the analysis can be shared between the different promotion techniques. Additionally, the idea behind splitting the pass into an analysis phase and a commit phase is that it ought to make it easier, in the future, to make better "big picture" decisions about which allocas to promote and how.
74 lines
3.1 KiB
LLVM
74 lines
3.1 KiB
LLVM
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
; REQUIRES: asserts
|
|
; Test case: two allocas outside of any loop. Every load or store that reaches
; an alloca (directly or through a GEP) is expected to contribute +1, and the
; worklist is expected to list the higher-scoring alloca first.

; CHECK-LABEL: Analyzing: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: [+1]: store i64 42, ptr addrspace(5) %simpleuser, align 8
; CHECK-NEXT: => Final Score:1
; CHECK-LABEL: Analyzing: %manyusers = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: [+1]: store i64 %v0.add, ptr addrspace(5) %manyusers.1, align 8
; CHECK-NEXT: [+1]: %v0 = load i64, ptr addrspace(5) %manyusers.1, align 8
; CHECK-NEXT: [+1]: store i64 %v1.add, ptr addrspace(5) %manyusers.2, align 8
; CHECK-NEXT: [+1]: %v1 = load i64, ptr addrspace(5) %manyusers.2, align 8
; CHECK-NEXT: => Final Score:4
; CHECK-NEXT: Sorted Worklist:
; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
define amdgpu_kernel void @simple_users_scores() #0 {
entry:
  ; should get a score of 1 (a single store)
  %simpleuser = alloca [4 x i64], align 4, addrspace(5)
  ; should get a score of 4 (two loads and two stores)
  %manyusers = alloca [4 x i64], align 4, addrspace(5)

  store i64 42, ptr addrspace(5) %simpleuser

  ; First read-modify-write, on element 2 of %manyusers.
  %manyusers.1 = getelementptr i64, ptr addrspace(5) %manyusers, i64 2
  %v0 = load i64, ptr addrspace(5) %manyusers.1
  %v0.add = add i64 %v0, 1
  store i64 %v0.add, ptr addrspace(5) %manyusers.1

  ; Second read-modify-write, on element 1 of %manyusers. The add consumes
  ; %v1 (the value just loaded from this slot) so the load is not left dead;
  ; the load and the store are the instructions the expectations above match.
  %manyusers.2 = getelementptr i64, ptr addrspace(5) %manyusers, i64 1
  %v1 = load i64, ptr addrspace(5) %manyusers.2
  %v1.add = add i64 %v1, 1
  store i64 %v1.add, ptr addrspace(5) %manyusers.2

  ret void
}
|
|
|
|
; Test case: one alloca accessed at different loop depths. Per the expectations
; below, an access in the entry or exit block scores +1, an access in
; loop.outer scores +5, and an access in loop.inner scores +9.

; CHECK-LABEL: Analyzing: %stack = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5)
; CHECK-NEXT: [+5]: store i64 32, ptr addrspace(5) %stack, align 8
; CHECK-NEXT: [+1]: store i64 42, ptr addrspace(5) %stack, align 8
; CHECK-NEXT: [+9]: store i64 32, ptr addrspace(5) %stack.1, align 8
; CHECK-NEXT: [+5]: %outer = load i64, ptr addrspace(5) %stack.1, align 8
; CHECK-NEXT: [+1]: store i64 64, ptr addrspace(5) %stack.2, align 8
; CHECK-NEXT: [+9]: %inner = load i64, ptr addrspace(5) %stack.2, align 8
; CHECK-NEXT: => Final Score:30
; NOTE(review): neither parameter is referenced in the body.
define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
entry:
  ; should get a score of 30 (5 + 1 + 9 + 5 + 1 + 9)
  %stack = alloca [4 x i64], align 4, addrspace(5)
  %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 8
  %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 16

  ; Outside any loop: +1.
  store i64 42, ptr addrspace(5) %stack
  br label %loop.outer

loop.outer:
  ; Accesses in the outer loop: +5 each.
  store i64 32, ptr addrspace(5) %stack
  %outer = load i64, ptr addrspace(5) %stack.1
  br label %loop.inner

loop.inner:
  ; Accesses in the inner loop: +9 each.
  store i64 32, ptr addrspace(5) %stack.1
  %inner = load i64, ptr addrspace(5) %stack.2
  %inner.cmp = icmp sge i64 %inner, 0
  ; Either repeat the inner loop or go back around the outer loop.
  br i1 %inner.cmp, label %loop.inner, label %loop.outer

exit:
  ; NOTE(review): no branch in this function targets %exit, so this block has
  ; no predecessors. Its store is still expected to be scored (+1); confirm
  ; whether leaving it unreachable is intentional before rewiring the CFG.
  store i64 64, ptr addrspace(5) %stack.2
  ret void
}
|