Add builders on the Python side that match the builders on the C++ side, add tests for launching GPU kernels and regions, and correct some small documentation mistakes. This reflects the API decisions already made in the func dialect's Python bindings and makes the GPU dialect's Python bindings behave more like the C++ interface.
254 lines
8.8 KiB
Python
254 lines
8.8 KiB
Python
# RUN: %PYTHON %s | FileCheck %s
|
|
|
|
from mlir.ir import *
|
|
import mlir.ir as ir
|
|
from mlir.dialects import gpu, func, arith, math
|
|
from mlir.extras import types as T
|
|
import mlir.dialects.gpu.passes
|
|
from mlir.passmanager import *
|
|
|
|
|
|
def run(f):
    """Run test function ``f`` immediately and hand it back unchanged.

    A ``TEST: <name>`` banner is printed first, then ``f`` is called with a
    fresh ``Context`` and an unknown ``Location`` active. Intended for use as
    a decorator, so each test executes when the module is loaded.
    """
    print(f"\nTEST: {f.__name__}")
    with Context():
        # The location is created only after the context has been entered.
        with Location.unknown():
            f()
    return f
|
|
|
|
|
|
# CHECK-LABEL: testGPUPass
|
|
# CHECK: SUCCESS
|
|
@run
def testGPUPass():
    """Parse a pass pipeline containing gpu-kernel-outlining, then report."""
    pipeline = "any(gpu-kernel-outlining)"
    PassManager.parse(pipeline)
    print("SUCCESS")
|
|
|
|
|
|
# CHECK-LABEL: testMMAElementWiseAttr
|
|
@run
def testMMAElementWiseAttr():
    """Create a ``gpu.block_dim`` op from a ``gpu.Dimension`` enum and print it."""
    module = Module.create()
    with InsertionPoint(module.body):
        gpu.BlockDimOp(gpu.Dimension.y)
    # CHECK: %block_dim_y = gpu.block_dim y
    print(module)
|
|
|
|
|
|
# CHECK-LABEL: testObjectAttr
|
|
@run
def testObjectAttr():
    """Build ``#gpu.object`` attributes and verify printing and accessors.

    Four variants are exercised: with a properties dictionary, without one,
    with a textual payload, and with a kernel table.
    """
    target = Attribute.parse("#nvvm.target")
    # Local names deliberately avoid shadowing the ``format`` and ``object``
    # builtins.
    target_format = gpu.CompilationTarget.Fatbin
    binary_payload = b"BC\xc0\xde5\x14\x00\x00\x05\x00\x00\x00b\x0c0$MY\xbef"
    properties = DictAttr.get({"O": IntegerAttr.get(IntegerType.get_signless(32), 2)})
    o = gpu.ObjectAttr.get(target, target_format, binary_payload, properties)
    # CHECK: #gpu.object<#nvvm.target, properties = {O = 2 : i32}, "BC\C0\DE5\14\00\00\05\00\00\00b\0C0$MY\BEf">
    print(o)
    assert o.object == binary_payload

    o = gpu.ObjectAttr.get(target, target_format, binary_payload)
    # CHECK: #gpu.object<#nvvm.target, "BC\C0\DE5\14\00\00\05\00\00\00b\0C0$MY\BEf">
    print(o)
    # Same round-trip check as the other payload cases.
    assert o.object == binary_payload

    text_payload = (
        b"//\n// Generated by LLVM NVPTX Back-End\n//\n\n.version 6.0\n.target sm_50"
    )
    o = gpu.ObjectAttr.get(target, target_format, text_payload)
    # CHECK: #gpu.object<#nvvm.target, "//\0A// Generated by LLVM NVPTX Back-End\0A//\0A\0A.version 6.0\0A.target sm_50">
    print(o)
    assert o.object == text_payload

    kernel_table = Attribute.parse(
        '#gpu.kernel_table<[#gpu.kernel_metadata<"kernel", () -> ()>]>'
    )
    o = gpu.ObjectAttr.get(
        target, target_format, binary_payload, kernels=kernel_table
    )
    # CHECK: #gpu.object<#nvvm.target, kernels = <[#gpu.kernel_metadata<"kernel", () -> ()>]>, "BC\C0\DE5\14\00\00\05\00\00\00b\0C0$MY\BEf">
    print(o)
    assert o.kernels == kernel_table
|
|
|
|
|
|
# CHECK-LABEL: testGPUFuncOp
|
|
@run
def testGPUFuncOp():
    """Exercise ``gpu.GPUFuncOp`` construction, entry blocks, and accessors.

    Three functions are created inside one ``gpu.module``:

    * ``kernel0`` is built manually (attributes set by hand, entry block added
      explicitly), which also covers the ``RuntimeError`` raised when the
      entry block is missing or added twice.
    * ``kernel1`` is built through the keyword arguments, including
      ``body_builder`` and the known block/grid sizes.
    * ``non_kernel_func`` is built without ``kernel=True``.
    """
    assert gpu.GPUFuncOp.__doc__ is not None
    module = Module.create()
    with InsertionPoint(module.body):
        gpu_module = gpu.GPUModuleOp(StringAttr.get("gpu_module"))
        module_block = gpu_module.bodyRegion.blocks.append()

        def builder(func: gpu.GPUFuncOp) -> None:
            # Emits the function body at the current insertion point.
            gpu.GlobalIdOp(gpu.Dimension.x)
            gpu.ReturnOp([])

        with InsertionPoint(module_block):
            name = StringAttr.get("kernel0")
            func_type = ir.FunctionType.get(inputs=[], results=[])
            # Distinct local names keep the imported ``func`` dialect module
            # from being shadowed inside this test.
            kernel0 = gpu.GPUFuncOp(TypeAttr.get(func_type), name)
            kernel0.attributes["sym_name"] = name
            kernel0.attributes["gpu.kernel"] = UnitAttr.get()

            try:
                kernel0.entry_block
            except RuntimeError as e:
                assert (
                    str(e)
                    == "Entry block does not exist for kernel0. Do you need to call the add_entry_block() method on this GPUFuncOp?"
                )
            else:
                assert False, "Expected RuntimeError"

            entry_block = kernel0.add_entry_block()
            with InsertionPoint(entry_block):
                builder(kernel0)

            try:
                kernel0.add_entry_block()
            except RuntimeError as e:
                assert str(e) == "Entry block already exists for kernel0"
            else:
                assert False, "Expected RuntimeError"

            kernel1 = gpu.GPUFuncOp(
                func_type,
                sym_name="kernel1",
                kernel=True,
                body_builder=builder,
                known_block_size=[1, 2, 3],
                known_grid_size=DenseI32ArrayAttr.get([4, 5, 6]),
            )

            assert kernel1.name.value == "kernel1"
            assert kernel1.function_type.value == func_type
            # Identity comparison, matching the ``is None`` checks used for
            # the non-kernel function below.
            assert kernel1.arg_attrs is None
            assert kernel1.res_attrs is None
            assert kernel1.arguments == []
            assert kernel1.entry_block == kernel1.body.blocks[0]
            assert kernel1.is_kernel
            assert kernel1.known_block_size == DenseI32ArrayAttr.get(
                [1, 2, 3]
            ), kernel1.known_block_size
            assert kernel1.known_grid_size == DenseI32ArrayAttr.get(
                [4, 5, 6]
            ), kernel1.known_grid_size

            non_kernel = gpu.GPUFuncOp(
                func_type,
                sym_name="non_kernel_func",
                body_builder=builder,
            )
            assert not non_kernel.is_kernel
            assert non_kernel.known_block_size is None
            assert non_kernel.known_grid_size is None

    print(module)

    # CHECK: gpu.module @gpu_module
    # CHECK: gpu.func @kernel0() kernel {
    # CHECK: %[[VAL_0:.*]] = gpu.global_id x
    # CHECK: gpu.return
    # CHECK: }
    # CHECK: gpu.func @kernel1() kernel attributes
    # CHECK-SAME: known_block_size = array<i32: 1, 2, 3>
    # CHECK-SAME: known_grid_size = array<i32: 4, 5, 6>
    # CHECK: %[[VAL_0:.*]] = gpu.global_id x
    # CHECK: gpu.return
    # CHECK: }
    # CHECK: gpu.func @non_kernel_func() {
    # CHECK: %[[VAL_0:.*]] = gpu.global_id x
    # CHECK: gpu.return
    # CHECK: }
|
|
|
|
|
|
# CHECK-LABEL: testGPULaunchFuncOp
|
|
@run
def testGPULaunchFuncOp():
    """Launch a kernel from a host function via ``gpu.launch_func``.

    A ``gpu.module`` holding one kernel is created, followed by a host
    ``func.func`` that waits, launches the kernel asynchronously with
    integer grid/block sizes, and waits on the launch token.
    """
    module = Module.create()

    module.operation.attributes["gpu.container_module"] = UnitAttr.get()
    with InsertionPoint(module.body):
        device_module = gpu.GPUModuleOp("gpu_module")
        device_block = device_module.bodyRegion.blocks.append()

    with InsertionPoint(device_block):
        kernel_op = gpu.GPUFuncOp(
            FunctionType.get([], []),
            "kernel",
            body_builder=lambda func: gpu.return_([]),
            kernel=True,
        )

    with InsertionPoint(module.body):
        host_func = func.FuncOp(type=FunctionType.get([], []), name="host")

    with InsertionPoint(host_func.add_entry_block()):
        # The result is unused, but the emitted constant is matched by the
        # first host-side directive below, so the call must stay.
        arith.constant(T.index(), 1)
        wait_token = gpu.wait()
        launch_token = gpu.launch_func(
            async_dependencies=[wait_token],
            kernel=[device_module.sym_name.value, kernel_op.name.value],
            grid_size=(1, 1, 1),
            block_size=(1, 1, 1),
            kernel_operands=[],
        )
        gpu.wait(async_dependencies=[launch_token])
        func.ReturnOp([])

    print(module)

    # CHECK-LABEL: gpu.module @gpu_module {
    # CHECK: gpu.func @kernel() kernel {
    # CHECK: gpu.return
    # CHECK: }
    # CHECK: }

    # CHECK-LABEL: func.func @host() {
    # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
    # CHECK: %[[WAIT_0:.*]] = gpu.wait async
    # CHECK: %[[CONSTANT_1:.*]] = arith.constant 1 : index
    # CHECK: %[[CONSTANT_2:.*]] = arith.constant 1 : index
    # CHECK: %[[CONSTANT_3:.*]] = arith.constant 1 : index
    # CHECK: %[[CONSTANT_4:.*]] = arith.constant 1 : index
    # CHECK: %[[CONSTANT_5:.*]] = arith.constant 1 : index
    # CHECK: %[[CONSTANT_6:.*]] = arith.constant 1 : index
    # CHECK: %[[LAUNCH_FUNC_0:.*]] = gpu.launch_func async {{\[}}%[[WAIT_0]]] @gpu_module::@kernel blocks in (%[[CONSTANT_1]], %[[CONSTANT_2]], %[[CONSTANT_3]]) threads in (%[[CONSTANT_4]], %[[CONSTANT_5]], %[[CONSTANT_6]])
    # CHECK: %[[WAIT_1:.*]] = gpu.wait async {{\[}}%[[LAUNCH_FUNC_0]]]
    # CHECK: return
    # CHECK: }
|
|
|
|
|
|
# CHECK-LABEL: testGPULaunchOp
|
|
@run
def testGPULaunchOp():
    """Build a ``gpu.launch`` region inside a host function and print it.

    The region body is supplied as a callable that emits a ``gpu.printf`` of
    the first region argument.
    """
    module = Module.create()

    with InsertionPoint(module.body):
        host_func = func.FuncOp(
            type=FunctionType.get([T.f32()], []), name="gpu_printf"
        )

    entry_block = host_func.add_entry_block()
    with InsertionPoint(entry_block):
        one = arith.constant(T.index(), 1)
        # The same index constant is used for every grid and block dimension.
        grid_sizes = (one, one, one)
        block_sizes = (one, one, one)

        build_region = gpu.launch(grid_sizes, block_sizes)

        build_region(lambda *args: gpu.printf("%f", args[0]))

    # A second insertion point on the entry block is opened for the return,
    # after the launch has been built.
    with InsertionPoint(entry_block):
        func.ReturnOp([])

    print(module)

    # CHECK-LABEL: func.func @gpu_printf(
    # CHECK-SAME: %[[ARG0:.*]]: f32) {
    # CHECK: %[[CONSTANT_0:.*]] = arith.constant 1 : index
    # CHECK: gpu.launch blocks(%[[VAL_0:.*]], %[[VAL_1:.*]], %[[VAL_2:.*]]) in (%[[VAL_3:.*]] = %[[CONSTANT_0]], %[[VAL_4:.*]] = %[[CONSTANT_0]], %[[VAL_5:.*]] = %[[CONSTANT_0]]) threads(%[[VAL_6:.*]], %[[VAL_7:.*]], %[[VAL_8:.*]]) in (%[[VAL_9:.*]] = %[[CONSTANT_0]], %[[VAL_10:.*]] = %[[CONSTANT_0]], %[[VAL_11:.*]] = %[[CONSTANT_0]]) {
    # CHECK: gpu.printf "%[[VAL_12:.*]]", %[[VAL_0]] : index
    # CHECK: gpu.terminator
    # CHECK: }
    # CHECK: return
    # CHECK: }
|