[AMDGPU] Increase LDS to 320K on gfx1250 (#153645)
This commit is contained in:
parent
334a046a3c
commit
49f2093477
@ -5598,6 +5598,8 @@ The fields used by CP for code objects before V3 also match those specified in
|
||||
roundup(lds-size / (128 * 4))
|
||||
GFX950
|
||||
roundup(lds-size / (320 * 4))
|
||||
GFX125*
|
||||
roundup(lds-size / (256 * 4))
|
||||
|
||||
24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution
|
||||
_INVALID_OPERATION with specified exceptions
|
||||
|
||||
@ -1548,7 +1548,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
|
||||
|
||||
def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
|
||||
"gfx12",
|
||||
[FeatureFP64, FeatureAddressableLocalMemorySize65536, FeatureMIMG_R128,
|
||||
[FeatureFP64, FeatureMIMG_R128,
|
||||
FeatureFlatAddressSpace, Feature16BitInsts,
|
||||
FeatureInv2PiInlineImm, FeatureApertureRegs,
|
||||
FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
|
||||
@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet<
|
||||
|
||||
def FeatureISAVersion12 : FeatureSet<
|
||||
[FeatureGFX12,
|
||||
FeatureAddressableLocalMemorySize65536,
|
||||
FeatureLDSBankCount32,
|
||||
FeatureDLInsts,
|
||||
FeatureDot7Insts,
|
||||
@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet<
|
||||
[FeatureGFX12,
|
||||
FeatureGFX1250Insts,
|
||||
FeatureCUStores,
|
||||
FeatureAddressableLocalMemorySize327680,
|
||||
FeatureCuMode,
|
||||
Feature64BitLiterals,
|
||||
FeatureLDSBankCount32,
|
||||
|
||||
@ -1103,7 +1103,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
ProgInfo.DX10Clamp = Mode.DX10Clamp;
|
||||
|
||||
unsigned LDSAlignShift;
|
||||
if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) {
|
||||
if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
|
||||
// LDS is allocated in 256 dword blocks.
|
||||
LDSAlignShift = 10;
|
||||
} else if (STM.getFeatureBits().test(
|
||||
FeatureAddressableLocalMemorySize163840)) {
|
||||
// LDS is allocated in 320 dword blocks.
|
||||
LDSAlignShift = 11;
|
||||
} else if (STM.getFeatureBits().test(
|
||||
|
||||
@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize <int Value> : SubtargetFeature<
|
||||
def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>;
|
||||
def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>;
|
||||
def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>;
|
||||
def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>;
|
||||
|
||||
class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
|
||||
"wavefrontsize"#!shl(1, ValueLog2),
|
||||
|
||||
@ -1160,6 +1160,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
|
||||
return 65536;
|
||||
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
|
||||
return 163840;
|
||||
if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
|
||||
return 327680;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -3340,8 +3342,8 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) {
|
||||
}
|
||||
|
||||
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
|
||||
// Currently this is 128 for all subtargets
|
||||
return 128;
|
||||
return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
|
||||
: 128;
|
||||
}
|
||||
|
||||
bool isPackedFP32Inst(unsigned Opc) {
|
||||
|
||||
@ -6,6 +6,8 @@
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-MESA %s
|
||||
|
||||
; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS.
|
||||
|
||||
@ -29,6 +31,11 @@
|
||||
; GFX1200-MESA: .long 45100
|
||||
; GFX1200-MESA-NEXT: .long 1024
|
||||
|
||||
; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
|
||||
|
||||
; GFX1250-MESA: .long 45100
|
||||
; GFX1250-MESA-NEXT: .long 512
|
||||
|
||||
@lds = internal addrspace(3) global [4096 x i8] poison
|
||||
|
||||
define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
|
||||
|
||||
13
llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
Normal file
13
llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll
Normal file
@ -0,0 +1,13 @@
|
||||
; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefix=ERROR %s
|
||||
|
||||
; GFX1250 supports up to 320 KB of LDS memory.
|
||||
; This is a negative test: it checks that a diagnostic is emitted when the LDS size exceeds the maximum usable limit.
|
||||
|
||||
; ERROR: error: <unknown>:0:0: local memory (327684) exceeds limit (327680) in function 'test_lds_limit'
|
||||
@dst = addrspace(3) global [81921 x i32] undef
|
||||
|
||||
define amdgpu_kernel void @test_lds_limit(i32 %val) {
|
||||
%gep = getelementptr [81921 x i32], ptr addrspace(3) @dst, i32 0, i32 100
|
||||
store i32 %val, ptr addrspace(3) %gep
|
||||
ret void
|
||||
}
|
||||
72
llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
Normal file
72
llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
Normal file
@ -0,0 +1,72 @@
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=MESA %s
|
||||
|
||||
; GFX1250 supports up to 320 KB of configurable LDS memory.
|
||||
; This test checks the min and max size of LDS that can be allocated.
|
||||
|
||||
@lds.i8 = addrspace(3) global i8 undef
|
||||
@lds.array.i8 = addrspace(3) global [327679 x i8] undef
|
||||
@lds.i16 = addrspace(3) global i16 undef
|
||||
@lds.array.i16 = addrspace(3) global [163839 x i16] undef
|
||||
@lds.i32 = addrspace(3) global i32 undef
|
||||
@lds.array.i32 = addrspace(3) global [81919 x i32] undef
|
||||
|
||||
; GCN-LABEL: test_lds_i8:
|
||||
; GCN: .amdhsa_group_segment_fixed_size 1
|
||||
; GCN: ; LDSByteSize: 1 bytes/workgroup
|
||||
; MESA: granulated_lds_size = 1
|
||||
define amdgpu_kernel void @test_lds_i8(i8 %val) {
|
||||
store i8 %val, ptr addrspace(3) @lds.i8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_lds_i16:
|
||||
; GCN: .amdhsa_group_segment_fixed_size 2
|
||||
; GCN: ; LDSByteSize: 2 bytes/workgroup
|
||||
; MESA: granulated_lds_size = 1
|
||||
define amdgpu_kernel void @test_lds_i16(i16 %val) {
|
||||
store i16 %val, ptr addrspace(3) @lds.i16
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_lds_i32:
|
||||
; GCN: .amdhsa_group_segment_fixed_size 4
|
||||
; GCN: ; LDSByteSize: 4 bytes/workgroup
|
||||
; MESA: granulated_lds_size = 1
|
||||
define amdgpu_kernel void @test_lds_i32(i32 %val) {
|
||||
store i32 %val, ptr addrspace(3) @lds.i32
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_lds_array_i8:
|
||||
; GCN: .amdhsa_group_segment_fixed_size 327680
|
||||
; GCN: ; LDSByteSize: 327680 bytes/workgroup
|
||||
; MESA: granulated_lds_size = 320
|
||||
define amdgpu_kernel void @test_lds_array_i8() {
|
||||
%gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
|
||||
%val = load i8, ptr addrspace(3) %gep
|
||||
store i8 %val, ptr addrspace(3) @lds.i8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_lds_array_i16:
|
||||
; GCN: .amdhsa_group_segment_fixed_size 327680
|
||||
; GCN: ; LDSByteSize: 327680 bytes/workgroup
|
||||
; MESA: granulated_lds_size = 320
|
||||
define amdgpu_kernel void @test_lds_array_i16() {
|
||||
%gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
|
||||
%val = load i16, ptr addrspace(3) %gep
|
||||
store i16 %val, ptr addrspace(3) @lds.i16
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: test_lds_array_i32:
|
||||
; GCN: .amdhsa_group_segment_fixed_size 327680
|
||||
; GCN: ; LDSByteSize: 327680 bytes/workgroup
|
||||
; MESA: granulated_lds_size = 320
|
||||
define amdgpu_kernel void @test_lds_array_i32() {
|
||||
%gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
|
||||
%val = load i32, ptr addrspace(3) %gep
|
||||
store i32 %val, ptr addrspace(3) @lds.i32
|
||||
ret void
|
||||
}
|
||||
61
llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
Normal file
61
llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll
Normal file
@ -0,0 +1,61 @@
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=PAL %s
|
||||
|
||||
; GFX1250 supports up to 320 KB of configurable LDS memory.
|
||||
; This test checks the min and max size of LDS that can be allocated.
|
||||
|
||||
; PAL: .shader_functions:
|
||||
; PAL: test_lds_array_i16:
|
||||
; PAL: .lds_size: 0x50000
|
||||
; PAL: test_lds_array_i32:
|
||||
; PAL: .lds_size: 0x50000
|
||||
; PAL: test_lds_array_i8:
|
||||
; PAL: .lds_size: 0x50000
|
||||
; PAL: test_lds_i16:
|
||||
; PAL: .lds_size: 0x2
|
||||
; PAL: test_lds_i32:
|
||||
; PAL: .lds_size: 0x4
|
||||
; PAL: test_lds_i8:
|
||||
; PAL: .lds_size: 0x1
|
||||
|
||||
@lds.i8 = addrspace(3) global i8 undef
|
||||
@lds.array.i8 = addrspace(3) global [327679 x i8] undef
|
||||
@lds.i16 = addrspace(3) global i16 undef
|
||||
@lds.array.i16 = addrspace(3) global [163839 x i16] undef
|
||||
@lds.i32 = addrspace(3) global i32 undef
|
||||
@lds.array.i32 = addrspace(3) global [81919 x i32] undef
|
||||
|
||||
define amdgpu_gfx void @test_lds_i8(i8 %val) {
|
||||
store i8 %val, ptr addrspace(3) @lds.i8
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_lds_i16(i16 %val) {
|
||||
store i16 %val, ptr addrspace(3) @lds.i16
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_lds_i32(i32 %val) {
|
||||
store i32 %val, ptr addrspace(3) @lds.i32
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_lds_array_i8() {
|
||||
%gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
|
||||
%val = load i8, ptr addrspace(3) %gep
|
||||
store i8 %val, ptr addrspace(3) @lds.i8
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_lds_array_i16() {
|
||||
%gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
|
||||
%val = load i16, ptr addrspace(3) %gep
|
||||
store i16 %val, ptr addrspace(3) @lds.i16
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_gfx void @test_lds_array_i32() {
|
||||
%gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
|
||||
%val = load i32, ptr addrspace(3) %gep
|
||||
store i32 %val, ptr addrspace(3) @lds.i32
|
||||
ret void
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user