[mlir][GPU] Add constant address space to GPU dialect (#190211)
This PR adds a `constant` address space to the GPU dialect and lowerings to all GPU backends. Signed-off-by: Bangtian Liu <liubangtian@gmail.com>
This commit is contained in:
parent
f7b6cc1efa
commit
86b5f11ecc
@ -28,13 +28,16 @@ is being used.
|
||||
## GPU address spaces
|
||||
|
||||
The GPU dialect exposes the `gpu.address_space` attribute, which currently has
|
||||
three values: `global`, `workgroup`, and `private`.
|
||||
four values: `global`, `workgroup`, `private`, and `constant`.
|
||||
|
||||
These address spaces represent the types of buffer commonly seen in GPU compilation.
|
||||
`global` memory is memory that resides in the GPU's global memory. `workgroup`
|
||||
memory is a limited, per-workgroup resource: all threads in a workgroup/thread
|
||||
block access the same values in `workgroup` memory. Finally, `private` memory is
|
||||
block access the same values in `workgroup` memory. `private` memory is
|
||||
used to represent `alloca`-like buffers that are private to a single thread/workitem.
|
||||
`constant` memory is read-only memory residing in the global address space, guaranteed
|
||||
not to change during kernel execution, allowing backend-specific optimizations
|
||||
(e.g., scalar reads on AMD GPUs).
|
||||
|
||||
These address spaces may be used as the `memorySpace` attribute on `memref` values.
|
||||
The `gpu.module`/`gpu.func` compilation pipeline will lower such memory space
|
||||
|
||||
@ -53,6 +53,10 @@ def GPU_Dialect : Dialect {
|
||||
/// space.
|
||||
static AddressSpace getPrivateAddressSpace() { return AddressSpace::Private; }
|
||||
|
||||
/// Returns the numeric value used to identify the constant memory address
|
||||
/// space.
|
||||
static AddressSpace getConstantAddressSpace() { return AddressSpace::Constant; }
|
||||
|
||||
/// Return true if the given MemRefType has an address space that matches
|
||||
/// with the gpu::AddressSpaceAttr attribute with value 'workgroup'.
|
||||
static bool hasWorkgroupMemoryAddressSpace(MemRefType type);
|
||||
@ -60,6 +64,14 @@ def GPU_Dialect : Dialect {
|
||||
/// Return true if the given Attribute is a gpu::AddressSpaceAttr
|
||||
/// attribute with value 'workgroup'.
|
||||
static bool isWorkgroupMemoryAddressSpace(Attribute memorySpace);
|
||||
|
||||
/// Return true if the given MemRefType has an address space that matches
|
||||
/// with the gpu::AddressSpaceAttr attribute with value 'constant'.
|
||||
static bool hasConstantMemoryAddressSpace(MemRefType type);
|
||||
|
||||
/// Return true if the given Attribute is a gpu::AddressSpaceAttr
|
||||
/// attribute with value 'constant'.
|
||||
static bool isConstantMemoryAddressSpace(Attribute memorySpace);
|
||||
}];
|
||||
let discardableAttrs = (ins
|
||||
"::mlir::DenseI32ArrayAttr":$known_block_size,
|
||||
@ -89,11 +101,13 @@ class GPU_I32EnumAttr<string mnemonic, GPU_I32Enum enumInfo> :
|
||||
def GPU_AddressSpaceGlobal : I32EnumAttrCase<"Global", 1, "global">;
|
||||
def GPU_AddressSpaceWorkgroup : I32EnumAttrCase<"Workgroup", 2, "workgroup">;
|
||||
def GPU_AddressSpacePrivate : I32EnumAttrCase<"Private", 3, "private">;
|
||||
def GPU_AddressSpaceConstant : I32EnumAttrCase<"Constant", 4, "constant">;
|
||||
def GPU_AddressSpaceEnum : GPU_I32Enum<
|
||||
"AddressSpace", "GPU address space", [
|
||||
GPU_AddressSpaceGlobal,
|
||||
GPU_AddressSpaceWorkgroup,
|
||||
GPU_AddressSpacePrivate
|
||||
GPU_AddressSpacePrivate,
|
||||
GPU_AddressSpaceConstant
|
||||
]>;
|
||||
|
||||
def GPU_AddressSpaceAttr :
|
||||
|
||||
@ -131,6 +131,8 @@ def ROCDL_Dialect : Dialect {
|
||||
static constexpr unsigned kGlobalMemoryAddressSpace = 1;
|
||||
/// The address space value that represents shared memory.
|
||||
static constexpr unsigned kSharedMemoryAddressSpace = 3;
|
||||
/// The address space value that represents constant memory.
|
||||
static constexpr unsigned kConstantMemoryAddressSpace = 4;
|
||||
/// The address space value that represents private memory.
|
||||
static constexpr unsigned kPrivateMemoryAddressSpace = 5;
|
||||
}];
|
||||
|
||||
@ -4105,6 +4105,8 @@ void mlir::amdgpu::populateCommonGPUTypeAndAttributeConversions(
|
||||
return ROCDL::ROCDLDialect::kSharedMemoryAddressSpace;
|
||||
case gpu::AddressSpace::Private:
|
||||
return ROCDL::ROCDLDialect::kPrivateMemoryAddressSpace;
|
||||
case gpu::AddressSpace::Constant:
|
||||
return ROCDL::ROCDLDialect::kConstantMemoryAddressSpace;
|
||||
}
|
||||
llvm_unreachable("unknown address space enum value");
|
||||
});
|
||||
|
||||
@ -17,6 +17,8 @@ spirv::StorageClass addressSpaceToStorageClass(gpu::AddressSpace addressSpace) {
|
||||
return spirv::StorageClass::Workgroup;
|
||||
case gpu::AddressSpace::Private:
|
||||
return spirv::StorageClass::Private;
|
||||
case gpu::AddressSpace::Constant:
|
||||
return spirv::StorageClass::UniformConstant;
|
||||
}
|
||||
llvm_unreachable("Unhandled storage class");
|
||||
}
|
||||
|
||||
@ -138,6 +138,8 @@ struct GPUBarrierConversion final : ConvertOpToLLVMPattern<gpu::BarrierOp> {
|
||||
memFenceFlag = memFenceFlag | localMemFenceFlag;
|
||||
break;
|
||||
case gpu::AddressSpace::Private:
|
||||
case gpu::AddressSpace::Constant:
|
||||
// Private is thread-local, constant is read-only; no fencing needed.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -545,6 +545,8 @@ struct GPUBarrierOpLowering final : ConvertOpToLLVMPattern<gpu::BarrierOp> {
|
||||
fenceLDS = true;
|
||||
break;
|
||||
case gpu::AddressSpace::Private:
|
||||
case gpu::AddressSpace::Constant:
|
||||
// Private is thread-local, constant is read-only; no fencing needed.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1723,6 +1723,8 @@ void mlir::nvgpu::populateCommonGPUTypeAndAttributeConversions(
|
||||
return static_cast<unsigned>(NVVM::NVVMMemorySpace::Shared);
|
||||
case gpu::AddressSpace::Private:
|
||||
return 0;
|
||||
case gpu::AddressSpace::Constant:
|
||||
return static_cast<unsigned>(NVVM::NVVMMemorySpace::Constant);
|
||||
}
|
||||
llvm_unreachable("unknown address space enum value");
|
||||
});
|
||||
|
||||
@ -248,6 +248,19 @@ bool GPUDialect::hasWorkgroupMemoryAddressSpace(MemRefType type) {
|
||||
return isWorkgroupMemoryAddressSpace(memorySpace);
|
||||
}
|
||||
|
||||
/// Returns true only when `memorySpace` is a non-null
/// gpu::AddressSpaceAttr whose value equals getConstantAddressSpace().
/// A null attribute, or an attribute of any other kind, yields false.
bool GPUDialect::isConstantMemoryAddressSpace(Attribute memorySpace) {
  if (memorySpace) {
    // Only the GPU dialect's own address-space attribute can name the
    // constant space; the cast result is null for every other attribute.
    auto addressSpaceAttr = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace);
    if (addressSpaceAttr)
      return addressSpaceAttr.getValue() == getConstantAddressSpace();
  }
  return false;
}
|
||||
|
||||
/// Returns true when the memory space attribute carried by `type` satisfies
/// isConstantMemoryAddressSpace().
bool GPUDialect::hasConstantMemoryAddressSpace(MemRefType type) {
  // Forward the memref's memory space directly to the attribute-level check.
  return isConstantMemoryAddressSpace(type.getMemorySpace());
}
|
||||
|
||||
bool GPUDialect::isKernel(Operation *op) {
|
||||
UnitAttr isKernelAttr = op->getAttrOfType<UnitAttr>(getKernelFuncAttrName());
|
||||
return static_cast<bool>(isKernelAttr);
|
||||
|
||||
@ -0,0 +1,26 @@
|
||||
// RUN: mlir-opt %s -convert-gpu-to-llvm-spv | FileCheck %s
|
||||
|
||||
gpu.module @kernels {
|
||||
// CHECK-LABEL: llvm.func spir_kernelcc @constant_load
|
||||
// Constant address space maps to SPIRV/OpenCL address space 2 (UniformConstant)
|
||||
// CHECK-SAME: !llvm.ptr<2>
|
||||
gpu.func @constant_load(%arg0: memref<16xf32, #gpu.address_space<constant>>) kernel {
|
||||
%c0 = arith.constant 0 : index
|
||||
%v = memref.load %arg0[%c0] : memref<16xf32, #gpu.address_space<constant>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: llvm.func spir_funccc @all_address_spaces
|
||||
// Global -> 1, Workgroup -> 3, Private -> 0 (default), Constant -> 2
|
||||
// CHECK-SAME: !llvm.ptr<1>
|
||||
// CHECK-SAME: !llvm.ptr<3>
|
||||
// CHECK-SAME: !llvm.ptr,
|
||||
// CHECK-SAME: !llvm.ptr<2>
|
||||
gpu.func @all_address_spaces(
|
||||
%arg0: memref<f32, #gpu.address_space<global>>,
|
||||
%arg1: memref<f32, #gpu.address_space<workgroup>>,
|
||||
%arg2: memref<f32, #gpu.address_space<private>>,
|
||||
%arg3: memref<f32, #gpu.address_space<constant>>) {
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
@ -238,6 +238,10 @@ gpu.module @barriers {
|
||||
// CHECK: [[NONE_FLAG2:%.*]] = llvm.mlir.constant(0 : i32) : i32
|
||||
// CHECK: llvm.call spir_funccc @_Z7barrierj([[NONE_FLAG2]])
|
||||
gpu.barrier memfence [#gpu.address_space<private>]
|
||||
// Constant memory is read-only, no fencing needed (same as private)
|
||||
// CHECK: [[NONE_FLAG3:%.*]] = llvm.mlir.constant(0 : i32) : i32
|
||||
// CHECK: llvm.call spir_funccc @_Z7barrierj([[NONE_FLAG3]])
|
||||
gpu.barrier memfence [#gpu.address_space<constant>]
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
22
mlir/test/Conversion/GPUToNVVM/constant-address-space.mlir
Normal file
22
mlir/test/Conversion/GPUToNVVM/constant-address-space.mlir
Normal file
@ -0,0 +1,22 @@
|
||||
// RUN: mlir-opt -convert-gpu-to-nvvm %s | FileCheck %s
|
||||
|
||||
module attributes {gpu.container_module} {
|
||||
gpu.module @kernel_module {
|
||||
// CHECK-LABEL: llvm.func @constant_load
|
||||
// CHECK-SAME: %{{.*}}: !llvm.ptr<4>
|
||||
gpu.func @constant_load(%arg0: memref<16xf32, #gpu.address_space<constant>>) kernel {
|
||||
%c0 = arith.constant 0 : index
|
||||
%v = memref.load %arg0[%c0] : memref<16xf32, #gpu.address_space<constant>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: llvm.func @constant_multidim
|
||||
// CHECK-SAME: %{{.*}}: !llvm.ptr<4>
|
||||
gpu.func @constant_multidim(%arg0: memref<4x8xf32, #gpu.address_space<constant>>) kernel {
|
||||
%c0 = arith.constant 0 : index
|
||||
%c1 = arith.constant 1 : index
|
||||
%v = memref.load %arg0[%c0, %c1] : memref<4x8xf32, #gpu.address_space<constant>>
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
}
|
||||
22
mlir/test/Conversion/GPUToROCDL/constant-address-space.mlir
Normal file
22
mlir/test/Conversion/GPUToROCDL/constant-address-space.mlir
Normal file
@ -0,0 +1,22 @@
|
||||
// RUN: mlir-opt -convert-gpu-to-rocdl %s | FileCheck %s
|
||||
|
||||
module attributes {gpu.container_module} {
|
||||
gpu.module @kernel_module {
|
||||
// CHECK-LABEL: llvm.func @constant_load
|
||||
// CHECK-SAME: %{{.*}}: !llvm.ptr<4>
|
||||
gpu.func @constant_load(%arg0: memref<16xf32, #gpu.address_space<constant>>) kernel {
|
||||
%c0 = arith.constant 0 : index
|
||||
%v = memref.load %arg0[%c0] : memref<16xf32, #gpu.address_space<constant>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: llvm.func @constant_multidim
|
||||
// CHECK-SAME: %{{.*}}: !llvm.ptr<4>
|
||||
gpu.func @constant_multidim(%arg0: memref<4x8xf32, #gpu.address_space<constant>>) kernel {
|
||||
%c0 = arith.constant 0 : index
|
||||
%c1 = arith.constant 1 : index
|
||||
%v = memref.load %arg0[%c0, %c1] : memref<4x8xf32, #gpu.address_space<constant>>
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -65,4 +65,16 @@ func.func @barrier_private_only() {
|
||||
gpu.barrier memfence [#gpu.address_space<private>]
|
||||
func.return
|
||||
}
|
||||
|
||||
// GFX9-LABEL: func @barrier_constant_only
|
||||
// GFX12-LABEL: func @barrier_constant_only
|
||||
func.func @barrier_constant_only() {
|
||||
// GFX9-NEXT: rocdl.s.barrier
|
||||
// GFX12-NEXT: rocdl.s.barrier.signal id = -1
|
||||
// GFX12-NEXT: rocdl.s.barrier.wait id = -1
|
||||
// CHECK-NOT: llvm.fence
|
||||
// Constant memory is read-only, no fencing needed
|
||||
gpu.barrier memfence [#gpu.address_space<constant>]
|
||||
func.return
|
||||
}
|
||||
}
|
||||
|
||||
23
mlir/test/Dialect/GPU/constant-address-space.mlir
Normal file
23
mlir/test/Dialect/GPU/constant-address-space.mlir
Normal file
@ -0,0 +1,23 @@
|
||||
// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s
|
||||
|
||||
gpu.module @test {
|
||||
// CHECK-LABEL: @constant_memref_basic
|
||||
// CHECK-SAME: (%{{.*}}: memref<16xf32, #gpu.address_space<constant>>)
|
||||
gpu.func @constant_memref_basic(%arg0: memref<16xf32, #gpu.address_space<constant>>) kernel {
|
||||
%c0 = arith.constant 0 : index
|
||||
%0 = memref.load %arg0[%c0] : memref<16xf32, #gpu.address_space<constant>>
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @constant_memref_multidim
|
||||
// CHECK: memref<4x8xf32, #gpu.address_space<constant>>
|
||||
gpu.func @constant_memref_multidim(%arg0: memref<4x8xf32, #gpu.address_space<constant>>) kernel {
|
||||
gpu.return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @constant_memref_dynamic
|
||||
// CHECK: memref<?x?xf32, #gpu.address_space<constant>>
|
||||
gpu.func @constant_memref_dynamic(%arg0: memref<?x?xf32, #gpu.address_space<constant>>) kernel {
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
@ -186,6 +186,7 @@ module attributes {gpu.container_module} {
|
||||
gpu.barrier memfence [#gpu.address_space<global>]
|
||||
gpu.barrier memfence [#gpu.address_space<global>, #gpu.address_space<workgroup>]
|
||||
gpu.barrier memfence [#gpu.address_space<private>]
|
||||
gpu.barrier memfence [#gpu.address_space<constant>]
|
||||
gpu.barrier memfence []
|
||||
|
||||
"some_op"(%bIdX, %tIdX) : (index, index) -> ()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user