When computing the number of registers required by entry functions, the `AMDGPUAsmPrinter` needs to take into account both the register usage computed by the `AMDGPUResourceUsageAnalysis` pass and the number of registers initialized by the hardware. At the moment, the way it computes the latter is different for graphics vs. compute, due to differences in the implementation. For kernels, all the information needed is available in the `SIMachineFunctionInfo`, but for graphics shaders the `AMDGPUAsmPrinter` iterates over the `Function` arguments, which largely repeats logic from instruction selection.

This patch introduces two new members to `SIMachineFunctionInfo`, one for SGPRs and one for VGPRs. Both are computed during instruction selection and then used by the `AMDGPUAsmPrinter`, removing the need to refer to the `Function` when printing assembly.

This patch is NFC except for the fact that we now add the extra SGPRs (VCC, XNACK, etc.) to the number of SGPRs computed for graphics entry points. I'm not sure why these weren't included before. It would be nice if someone could confirm whether that was just an oversight or whether we have some docs somewhere that I haven't managed to find. Only one test is affected (its SGPR usage increases because we now take into account the XNACK registers).
323 lines
12 KiB
LLVM
323 lines
12 KiB
LLVM
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -stop-after=si-pre-allocate-wwm-regs -o %t.mir %s
|
|
; RUN: llc -run-pass=none -verify-machineinstrs %t.mir -o - | FileCheck %s
|
|
|
|
; Test that SIMachineFunctionInfo can be round trip serialized through
|
|
; MIR.
|
|
|
|
; 512 x float array (2048 bytes) in address space 3 with an undef initializer;
; @kernel stores into it.
@lds = addrspace(3) global [512 x float] undef, align 4
|
|
|
|
; CHECK-LABEL: {{^}}name: kernel
|
|
; CHECK: machineFunctionInfo:
|
|
; CHECK-NEXT: explicitKernArgSize: 128
|
|
; CHECK-NEXT: maxKernArgAlign: 64
|
|
; CHECK-NEXT: ldsSize: 2048
|
|
; CHECK-NEXT: gdsSize: 0
|
|
; CHECK-NEXT: dynLDSAlign: 1
|
|
; CHECK-NEXT: isEntryFunction: true
|
|
; CHECK-NEXT: isChainFunction: false
|
|
; CHECK-NEXT: noSignedZerosFPMath: false
|
|
; CHECK-NEXT: memoryBound: false
|
|
; CHECK-NEXT: waveLimiter: false
|
|
; CHECK-NEXT: hasSpilledSGPRs: false
|
|
; CHECK-NEXT: hasSpilledVGPRs: false
|
|
; CHECK-NEXT: numWaveDispatchSGPRs: 0
|
|
; CHECK-NEXT: numWaveDispatchVGPRs: 0
|
|
; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
|
|
; CHECK-NEXT: frameOffsetReg: '$fp_reg'
|
|
; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
|
|
; CHECK-NEXT: bytesInStackArgArea: 0
|
|
; CHECK-NEXT: returnsVoid: true
|
|
; CHECK-NEXT: argumentInfo:
|
|
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
|
; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
|
; CHECK-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
|
; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr8_sgpr9' }
|
|
; CHECK-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
|
; CHECK-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
|
; CHECK-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
|
; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
|
; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr15' }
|
|
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
|
|
; CHECK-NEXT: workItemIDY: { reg: '$vgpr1' }
|
|
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr2' }
|
|
; CHECK-NEXT: psInputAddr: 0
|
|
; CHECK-NEXT: psInputEnable: 0
|
|
; CHECK-NEXT: maxMemoryClusterDWords: 8
|
|
; CHECK-NEXT: mode:
|
|
; CHECK-NEXT: ieee: true
|
|
; CHECK-NEXT: dx10-clamp: true
|
|
; CHECK-NEXT: fp32-input-denormals: true
|
|
; CHECK-NEXT: fp32-output-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-input-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-output-denormals: true
|
|
; CHECK-NEXT: highBitsOf32BitAddress: 0
|
|
; CHECK-NEXT: occupancy: 8
|
|
; CHECK-NEXT: vgprForAGPRCopy: ''
|
|
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
|
; CHECK-NEXT: longBranchReservedReg: ''
|
|
; CHECK-NEXT: hasInitWholeWave: false
|
|
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
|
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
|
; CHECK-NEXT: isWholeWaveFunction: false
|
|
; CHECK-NEXT: body:
|
|
; amdgpu_kernel entry point: writes 0.0 to the element of @lds selected by
; %arg0. %arg1 and %arg2 are never read in the body; presumably they exist only
; to shape the kernel-argument size/alignment expectations listed above.
define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) {
entry:
  %lds.slot = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %arg0
  store float 0.000000e+00, ptr addrspace(3) %lds.slot, align 4
  ret void
}
|
|
|
|
; 128 x i32 array (512 bytes) in address space 2 with an undef initializer;
; the amdgpu_ps shaders below perform an atomic add on it.
@gds = addrspace(2) global [128 x i32] undef, align 4
|
|
|
|
; CHECK-LABEL: {{^}}name: ps_shader
|
|
; CHECK: machineFunctionInfo:
|
|
; CHECK-NEXT: explicitKernArgSize: 0
|
|
; CHECK-NEXT: maxKernArgAlign: 4
|
|
; CHECK-NEXT: ldsSize: 0
|
|
; CHECK-NEXT: gdsSize: 512
|
|
; CHECK-NEXT: dynLDSAlign: 1
|
|
; CHECK-NEXT: isEntryFunction: true
|
|
; CHECK-NEXT: isChainFunction: false
|
|
; CHECK-NEXT: noSignedZerosFPMath: false
|
|
; CHECK-NEXT: memoryBound: false
|
|
; CHECK-NEXT: waveLimiter: false
|
|
; CHECK-NEXT: hasSpilledSGPRs: false
|
|
; CHECK-NEXT: hasSpilledVGPRs: false
|
|
; CHECK-NEXT: numWaveDispatchSGPRs: 3
|
|
; CHECK-NEXT: numWaveDispatchVGPRs: 1
|
|
; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
|
|
; CHECK-NEXT: frameOffsetReg: '$fp_reg'
|
|
; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
|
|
; CHECK-NEXT: bytesInStackArgArea: 0
|
|
; CHECK-NEXT: returnsVoid: true
|
|
; CHECK-NEXT: argumentInfo:
|
|
; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr3' }
|
|
; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' }
|
|
; CHECK-NEXT: psInputAddr: 1
|
|
; CHECK-NEXT: psInputEnable: 1
|
|
; CHECK-NEXT: maxMemoryClusterDWords: 8
|
|
; CHECK-NEXT: mode:
|
|
; CHECK-NEXT: ieee: false
|
|
; CHECK-NEXT: dx10-clamp: true
|
|
; CHECK-NEXT: fp32-input-denormals: true
|
|
; CHECK-NEXT: fp32-output-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-input-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-output-denormals: true
|
|
; CHECK-NEXT: highBitsOf32BitAddress: 0
|
|
; CHECK-NEXT: occupancy: 10
|
|
; CHECK-NEXT: vgprForAGPRCopy: ''
|
|
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
|
; CHECK-NEXT: longBranchReservedReg: ''
|
|
; CHECK-NEXT: hasInitWholeWave: false
|
|
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
|
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
|
; CHECK-NEXT: isWholeWaveFunction: false
|
|
; CHECK-NEXT: body:
|
|
; amdgpu_ps entry point: atomically adds 8 (seq_cst) to the element of @gds
; selected by %arg0 and discards the previous value. %arg1 is passed inreg and
; is not used by the body.
define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) {
entry:
  %gds.slot = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
  %prev = atomicrmw add ptr addrspace(2) %gds.slot, i32 8 seq_cst
  ret void
}
|
|
|
|
; CHECK-LABEL: {{^}}name: ps_shader_ps_input_enable
|
|
; CHECK: machineFunctionInfo:
|
|
; CHECK: psInputAddr: 36983
|
|
; CHECK-NEXT: psInputEnable: 1{{$}}
|
|
; amdgpu_ps entry point carrying attribute group #7
; ("InitialPSInputAddr"="36983"). The body atomically adds 8 (seq_cst) to the
; element of @gds selected by %arg0; %arg1 (inreg) is unused.
define amdgpu_ps void @ps_shader_ps_input_enable(i32 %arg0, i32 inreg %arg1) #7 {
entry:
  %gds.slot = getelementptr inbounds [128 x i32], ptr addrspace(2) @gds, i32 0, i32 %arg0
  %prev = atomicrmw add ptr addrspace(2) %gds.slot, i32 8 seq_cst
  ret void
}
|
|
|
|
; CHECK-LABEL: {{^}}name: gds_size_shader
|
|
; CHECK: gdsSize: 4096
|
|
; Empty amdgpu_ps entry point carrying attribute group #5
; ("amdgpu-gds-size"="4096"); neither argument is used.
define amdgpu_ps void @gds_size_shader(i32 %arg0, i32 inreg %arg1) #5 {
entry:
  ret void
}
|
|
|
|
; CHECK-LABEL: {{^}}name: function
|
|
; CHECK: machineFunctionInfo:
|
|
; CHECK-NEXT: explicitKernArgSize: 0
|
|
; CHECK-NEXT: maxKernArgAlign: 1
|
|
; CHECK-NEXT: ldsSize: 0
|
|
; CHECK-NEXT: gdsSize: 0
|
|
; CHECK-NEXT: dynLDSAlign: 1
|
|
; CHECK-NEXT: isEntryFunction: false
|
|
; CHECK-NEXT: isChainFunction: false
|
|
; CHECK-NEXT: noSignedZerosFPMath: false
|
|
; CHECK-NEXT: memoryBound: false
|
|
; CHECK-NEXT: waveLimiter: false
|
|
; CHECK-NEXT: hasSpilledSGPRs: false
|
|
; CHECK-NEXT: hasSpilledVGPRs: false
|
|
; CHECK-NEXT: numWaveDispatchSGPRs: 16
|
|
; CHECK-NEXT: numWaveDispatchVGPRs: 0
|
|
; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
|
|
; CHECK-NEXT: frameOffsetReg: '$sgpr33'
|
|
; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
|
|
; CHECK-NEXT: bytesInStackArgArea: 0
|
|
; CHECK-NEXT: returnsVoid: true
|
|
; CHECK-NEXT: argumentInfo:
|
|
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
|
; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
|
; CHECK-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
|
; CHECK-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
|
; CHECK-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
|
; CHECK-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
|
; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
|
; CHECK-NEXT: LDSKernelId: { reg: '$sgpr15' }
|
|
; CHECK-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
|
; CHECK-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
|
|
; CHECK-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
|
|
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
|
; CHECK-NEXT: psInputAddr: 0
|
|
; CHECK-NEXT: psInputEnable: 0
|
|
; CHECK-NEXT: maxMemoryClusterDWords: 8
|
|
; CHECK-NEXT: mode:
|
|
; CHECK-NEXT: ieee: true
|
|
; CHECK-NEXT: dx10-clamp: true
|
|
; CHECK-NEXT: fp32-input-denormals: true
|
|
; CHECK-NEXT: fp32-output-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-input-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-output-denormals: true
|
|
; CHECK-NEXT: highBitsOf32BitAddress: 0
|
|
; CHECK-NEXT: occupancy: 10
|
|
; CHECK-NEXT: vgprForAGPRCopy: ''
|
|
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
|
; CHECK-NEXT: longBranchReservedReg: ''
|
|
; CHECK-NEXT: hasInitWholeWave: false
|
|
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
|
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
|
; CHECK-NEXT: isWholeWaveFunction: false
|
|
; CHECK-NEXT: body:
|
|
; Empty function with the default calling convention and no attribute group.
define void @function() {
entry:
  ret void
}
|
|
|
|
; CHECK-LABEL: {{^}}name: function_nsz
|
|
; CHECK: machineFunctionInfo:
|
|
; CHECK-NEXT: explicitKernArgSize: 0
|
|
; CHECK-NEXT: maxKernArgAlign: 1
|
|
; CHECK-NEXT: ldsSize: 0
|
|
; CHECK-NEXT: gdsSize: 0
|
|
; CHECK-NEXT: dynLDSAlign: 1
|
|
; CHECK-NEXT: isEntryFunction: false
|
|
; CHECK-NEXT: isChainFunction: false
|
|
; CHECK-NEXT: noSignedZerosFPMath: true
|
|
; CHECK-NEXT: memoryBound: false
|
|
; CHECK-NEXT: waveLimiter: false
|
|
; CHECK-NEXT: hasSpilledSGPRs: false
|
|
; CHECK-NEXT: hasSpilledVGPRs: false
|
|
; CHECK-NEXT: numWaveDispatchSGPRs: 16
|
|
; CHECK-NEXT: numWaveDispatchVGPRs: 0
|
|
; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
|
|
; CHECK-NEXT: frameOffsetReg: '$sgpr33'
|
|
; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32'
|
|
; CHECK-NEXT: bytesInStackArgArea: 0
|
|
; CHECK-NEXT: returnsVoid: true
|
|
; CHECK-NEXT: argumentInfo:
|
|
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
|
|
; CHECK-NEXT: dispatchPtr: { reg: '$sgpr4_sgpr5' }
|
|
; CHECK-NEXT: queuePtr: { reg: '$sgpr6_sgpr7' }
|
|
; CHECK-NEXT: dispatchID: { reg: '$sgpr10_sgpr11' }
|
|
; CHECK-NEXT: workGroupIDX: { reg: '$sgpr12' }
|
|
; CHECK-NEXT: workGroupIDY: { reg: '$sgpr13' }
|
|
; CHECK-NEXT: workGroupIDZ: { reg: '$sgpr14' }
|
|
; CHECK-NEXT: LDSKernelId: { reg: '$sgpr15' }
|
|
; CHECK-NEXT: implicitArgPtr: { reg: '$sgpr8_sgpr9' }
|
|
; CHECK-NEXT: workItemIDX: { reg: '$vgpr31', mask: 1023 }
|
|
; CHECK-NEXT: workItemIDY: { reg: '$vgpr31', mask: 1047552 }
|
|
; CHECK-NEXT: workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
|
|
; CHECK-NEXT: psInputAddr: 0
|
|
; CHECK-NEXT: psInputEnable: 0
|
|
; CHECK-NEXT: maxMemoryClusterDWords: 8
|
|
; CHECK-NEXT: mode:
|
|
; CHECK-NEXT: ieee: true
|
|
; CHECK-NEXT: dx10-clamp: true
|
|
; CHECK-NEXT: fp32-input-denormals: true
|
|
; CHECK-NEXT: fp32-output-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-input-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-output-denormals: true
|
|
; CHECK-NEXT: highBitsOf32BitAddress: 0
|
|
; CHECK-NEXT: occupancy: 10
|
|
; CHECK-NEXT: vgprForAGPRCopy: ''
|
|
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
|
|
; CHECK-NEXT: longBranchReservedReg: ''
|
|
; CHECK-NEXT: hasInitWholeWave: false
|
|
; CHECK-NEXT: dynamicVGPRBlockSize: 0
|
|
; CHECK-NEXT: scratchReservedForDynamicVGPRs: 0
|
|
; CHECK-NEXT: isWholeWaveFunction: false
|
|
; CHECK-NEXT: body:
|
|
; Empty function carrying attribute group #0
; ("no-signed-zeros-fp-math"="true").
define void @function_nsz() #0 {
entry:
  ret void
}
|
|
|
|
; CHECK-LABEL: {{^}}name: function_dx10_clamp_off
|
|
; CHECK: mode:
|
|
; CHECK-NEXT: ieee: true
|
|
; CHECK-NEXT: dx10-clamp: false
|
|
; CHECK-NEXT: fp32-input-denormals: true
|
|
; CHECK-NEXT: fp32-output-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-input-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-output-denormals: true
|
|
; Empty function carrying attribute group #1 ("amdgpu-dx10-clamp"="false").
define void @function_dx10_clamp_off() #1 {
entry:
  ret void
}
|
|
|
|
; CHECK-LABEL: {{^}}name: function_ieee_off
|
|
; CHECK: mode:
|
|
; CHECK-NEXT: ieee: false
|
|
; CHECK-NEXT: dx10-clamp: true
|
|
; CHECK-NEXT: fp32-input-denormals: true
|
|
; CHECK-NEXT: fp32-output-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-input-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-output-denormals: true
|
|
; Empty function carrying attribute group #2 ("amdgpu-ieee"="false").
define void @function_ieee_off() #2 {
entry:
  ret void
}
|
|
|
|
; CHECK-LABEL: {{^}}name: function_ieee_off_dx10_clamp_off
|
|
; CHECK: mode:
|
|
; CHECK-NEXT: ieee: false
|
|
; CHECK-NEXT: dx10-clamp: false
|
|
; CHECK-NEXT: fp32-input-denormals: true
|
|
; CHECK-NEXT: fp32-output-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-input-denormals: true
|
|
; CHECK-NEXT: fp64-fp16-output-denormals: true
|
|
; Empty function carrying attribute group #3, which sets both
; "amdgpu-dx10-clamp" and "amdgpu-ieee" to "false".
define void @function_ieee_off_dx10_clamp_off() #3 {
entry:
  ret void
}
|
|
|
|
; CHECK-LABEL: {{^}}name: high_address_bits
|
|
; CHECK: machineFunctionInfo:
|
|
; CHECK: highBitsOf32BitAddress: 4294934528
|
|
; Empty amdgpu_ps entry point carrying attribute group #4
; ("amdgpu-32bit-address-high-bits"="0xffff8000", i.e. 4294934528 in decimal).
define amdgpu_ps void @high_address_bits() #4 {
entry:
  ret void
}
|
|
|
|
; CHECK-LABEL: {{^}}name: wwm_reserved_regs
|
|
; CHECK: wwmReservedRegs:
|
|
; CHECK-NEXT: - '$vgpr2'
|
|
; CHECK-NEXT: - '$vgpr3'
|
|
; amdgpu_cs entry point: performs two volatile loads from %ptr, passes each
; loaded value through llvm.amdgcn.set.inactive (the second load first) and
; then llvm.amdgcn.strict.wwm, and writes both results back to %ptr with
; volatile stores. %tmp14 (inreg) is unused.
; NOTE(review): instruction order is deliberately identical to the original;
; the expected reserved WWM registers presumably depend on it.
define amdgpu_cs void @wwm_reserved_regs(ptr addrspace(1) %ptr, <4 x i32> inreg %tmp14) {
entry:
  %first = load volatile i32, ptr addrspace(1) %ptr
  %second = load volatile i32, ptr addrspace(1) %ptr
  %second.inactive = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %second, i32 0)
  %first.inactive = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %first, i32 0)
  %second.wwm = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %second.inactive)
  %first.wwm = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %first.inactive)
  store volatile i32 %second.wwm, ptr addrspace(1) %ptr
  store volatile i32 %first.wwm, ptr addrspace(1) %ptr
  ret void
}
|
|
|
|
; Declaration of the i32 overload of the llvm.amdgcn.set.inactive intrinsic
; (attribute group #6); called from @wwm_reserved_regs.
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #6
|
|
; Declaration of the i32 overload of the llvm.amdgcn.strict.wwm intrinsic
; (attribute group #6); called from @wwm_reserved_regs.
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #6
|
|
|
|
; Attribute groups referenced by the definitions and declarations in this file.

; Used by @function_nsz.
attributes #0 = { "no-signed-zeros-fp-math"="true" }
; Used by @function_dx10_clamp_off.
attributes #1 = { "amdgpu-dx10-clamp"="false" }
; Used by @function_ieee_off.
attributes #2 = { "amdgpu-ieee"="false" }
; Used by @function_ieee_off_dx10_clamp_off.
attributes #3 = { "amdgpu-dx10-clamp"="false" "amdgpu-ieee"="false" }
; Used by @high_address_bits.
attributes #4 = { "amdgpu-32bit-address-high-bits"="0xffff8000" }
; Used by @gds_size_shader.
attributes #5 = { "amdgpu-gds-size"="4096" }
; Used by the two intrinsic declarations.
attributes #6 = { convergent nounwind readnone willreturn }
; Used by @ps_shader_ps_input_enable.
attributes #7 = { "InitialPSInputAddr"="36983" }
|