[MLIR] Enable GPU Dialect to SYCL runtime integration (#71430)
GPU dialect lowering to the SYCL runtime is driven by the spirv.target_env attribute attached to gpu.module. As a result, spirv.target_env remains present in the input to the LLVM IR translation. A SPIRVToLLVMIRTranslation that performs no actual translation is added so that mlir-cpu-runner does not fail with an unregistered-dialect error. SelectObjectAttr.cpp is updated to (1) pass the binary size argument to getModuleLoadFn and (2) pass the parameter count to getKernelLaunchFn. This change does not impact CUDA and ROCm usage, since both mlir_cuda_runtime and mlir_rocm_runtime have already been updated to accept and ignore the extra arguments.
This commit is contained in:
parent
2284771fa0
commit
7fc792cba7
@ -26,6 +26,7 @@
|
||||
#include "mlir/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.h"
|
||||
#include "mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h"
|
||||
#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
|
||||
#include "mlir/Target/LLVMIR/Dialect/SPIRV/SPIRVToLLVMIRTranslation.h"
|
||||
#include "mlir/Target/LLVMIR/Dialect/X86Vector/X86VectorToLLVMIRTranslation.h"
|
||||
|
||||
namespace mlir {
|
||||
@ -45,6 +46,7 @@ static inline void registerAllToLLVMIRTranslations(DialectRegistry &registry) {
|
||||
registerOpenACCDialectTranslation(registry);
|
||||
registerOpenMPDialectTranslation(registry);
|
||||
registerROCDLDialectTranslation(registry);
|
||||
registerSPIRVDialectTranslation(registry);
|
||||
registerX86VectorDialectTranslation(registry);
|
||||
|
||||
// Extension required for translating GPU offloading Ops.
|
||||
@ -61,6 +63,7 @@ registerAllGPUToLLVMIRTranslations(DialectRegistry &registry) {
|
||||
registerLLVMDialectTranslation(registry);
|
||||
registerNVVMDialectTranslation(registry);
|
||||
registerROCDLDialectTranslation(registry);
|
||||
registerSPIRVDialectTranslation(registry);
|
||||
|
||||
// Extension required for translating GPU offloading Ops.
|
||||
gpu::registerOffloadingLLVMTranslationInterfaceExternalModels(registry);
|
||||
|
@ -0,0 +1,31 @@
|
||||
//===- SPIRVToLLVMIRTranslation.h - SPIR-V to LLVM IR -----------*- C++ -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This provides registration calls for SPIR-V dialect to LLVM IR translation.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef MLIR_TARGET_LLVMIR_DIALECT_SPIRV_SPIRVTOLLVMIRTRANSLATION_H
|
||||
#define MLIR_TARGET_LLVMIR_DIALECT_SPIRV_SPIRVTOLLVMIRTRANSLATION_H
|
||||
|
||||
namespace mlir {

class DialectRegistry;
class MLIRContext;

/// Register the SPIR-V dialect in the given registry so that IR still carrying
/// SPIR-V entities (such as a `spirv.target_env` attribute on a `gpu.module`)
/// is accepted as input to the LLVM IR translation. No SPIR-V to LLVM IR
/// translation interface is attached; only the dialect itself is registered.
void registerSPIRVDialectTranslation(DialectRegistry &registry);

/// Register the SPIR-V dialect in the registry associated with the given
/// context. As with the registry overload, no translation interface is
/// attached.
void registerSPIRVDialectTranslation(MLIRContext &context);

} // namespace mlir
|
||||
|
||||
#endif // MLIR_TARGET_LLVMIR_DIALECT_SPIRV_SPIRVTOLLVMIRTRANSLATION_H
|
@ -58,6 +58,7 @@ add_mlir_translation_library(MLIRToLLVMIRTranslationRegistration
|
||||
MLIROpenACCToLLVMIRTranslation
|
||||
MLIROpenMPToLLVMIRTranslation
|
||||
MLIRROCDLToLLVMIRTranslation
|
||||
MLIRSPIRVToLLVMIRTranslation
|
||||
)
|
||||
|
||||
add_mlir_translation_library(MLIRTargetLLVMIRImport
|
||||
|
@ -9,4 +9,5 @@ add_subdirectory(NVVM)
|
||||
add_subdirectory(OpenACC)
|
||||
add_subdirectory(OpenMP)
|
||||
add_subdirectory(ROCDL)
|
||||
add_subdirectory(SPIRV)
|
||||
add_subdirectory(X86Vector)
|
||||
|
@ -175,6 +175,7 @@ private:
|
||||
IRBuilderBase &builder;
|
||||
mlir::LLVM::ModuleTranslation &moduleTranslation;
|
||||
Type *i32Ty{};
|
||||
Type *i64Ty{};
|
||||
Type *voidTy{};
|
||||
Type *intPtrTy{};
|
||||
PointerType *ptrTy{};
|
||||
@ -216,6 +217,7 @@ llvm::LaunchKernel::LaunchKernel(
|
||||
mlir::LLVM::ModuleTranslation &moduleTranslation)
|
||||
: module(module), builder(builder), moduleTranslation(moduleTranslation) {
|
||||
i32Ty = builder.getInt32Ty();
|
||||
i64Ty = builder.getInt64Ty();
|
||||
ptrTy = builder.getPtrTy(0);
|
||||
voidTy = builder.getVoidTy();
|
||||
intPtrTy = builder.getIntPtrTy(module.getDataLayout());
|
||||
@ -224,10 +226,10 @@ llvm::LaunchKernel::LaunchKernel(
|
||||
/// Returns the `mgpuLaunchKernel` runtime function, declaring it in the module
/// on first use.
///
/// Declared type: `void(ptr, intptr x 6, i32, ptr, ptr, ptr, i64)`. At the
/// call site in `createKernelLaunch` the operands are, in order: the kernel
/// function handle, gx, gy, gz, bx, by, bz, the dynamic memory size, the
/// stream, the argument array, a null pointer, and the kernel parameter count.
/// The trailing i64 parameter count is the argument added for the SYCL
/// runtime.
llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchFn() {
  return module.getOrInsertFunction(
      "mgpuLaunchKernel",
      FunctionType::get(voidTy,
                        ArrayRef<Type *>({ptrTy, intPtrTy, intPtrTy, intPtrTy,
                                          intPtrTy, intPtrTy, intPtrTy, i32Ty,
                                          ptrTy, ptrTy, ptrTy, i64Ty}),
                        false));
}
|
||||
|
||||
@ -251,7 +253,7 @@ llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() {
|
||||
/// Returns the `mgpuModuleLoad` runtime function, declaring it in the module
/// on first use.
///
/// Declared type: `ptr(ptr, i64)`. The call site in `createKernelLaunch`
/// passes the binary global and its size in bytes; the result is used there as
/// the loaded module handle.
llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadFn() {
  return module.getOrInsertFunction(
      "mgpuModuleLoad",
      FunctionType::get(ptrTy, ArrayRef<Type *>({ptrTy, i64Ty}), false));
}
|
||||
|
||||
llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadJITFn() {
|
||||
@ -391,10 +393,24 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
|
||||
if (!binary)
|
||||
return op.emitError() << "Couldn't find the binary: " << binaryIdentifier;
|
||||
|
||||
auto binaryVar = dyn_cast<llvm::GlobalVariable>(binary);
|
||||
if (!binaryVar)
|
||||
return op.emitError() << "Binary is not a global variable: "
|
||||
<< binaryIdentifier;
|
||||
llvm::Constant *binaryInit = binaryVar->getInitializer();
|
||||
auto binaryDataSeq =
|
||||
dyn_cast_if_present<llvm::ConstantDataSequential>(binaryInit);
|
||||
if (!binaryDataSeq)
|
||||
return op.emitError() << "Couldn't find binary data array: "
|
||||
<< binaryIdentifier;
|
||||
llvm::Constant *binarySize =
|
||||
llvm::ConstantInt::get(i64Ty, binaryDataSeq->getNumElements() *
|
||||
binaryDataSeq->getElementByteSize());
|
||||
|
||||
Value *moduleObject =
|
||||
object.getFormat() == gpu::CompilationTarget::Assembly
|
||||
? builder.CreateCall(getModuleLoadJITFn(), {binary, optV})
|
||||
: builder.CreateCall(getModuleLoadFn(), {binary});
|
||||
: builder.CreateCall(getModuleLoadFn(), {binary, binarySize});
|
||||
|
||||
// Load the kernel function.
|
||||
Value *moduleFunction = builder.CreateCall(
|
||||
@ -413,6 +429,9 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
|
||||
stream = builder.CreateCall(getStreamCreateFn(), {});
|
||||
}
|
||||
|
||||
llvm::Constant *paramsCount =
|
||||
llvm::ConstantInt::get(i64Ty, op.getNumKernelOperands());
|
||||
|
||||
// Create the launch call.
|
||||
Value *nullPtr = ConstantPointerNull::get(ptrTy);
|
||||
|
||||
@ -426,10 +445,10 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op,
|
||||
ArrayRef<Value *>({moduleFunction, cx, cy, cz, gx, gy, gz, bx, by, bz,
|
||||
dynamicMemorySize, stream, argArray, nullPtr}));
|
||||
} else {
|
||||
builder.CreateCall(
|
||||
getKernelLaunchFn(),
|
||||
ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by, bz,
|
||||
dynamicMemorySize, stream, argArray, nullPtr}));
|
||||
builder.CreateCall(getKernelLaunchFn(),
|
||||
ArrayRef<Value *>({moduleFunction, gx, gy, gz, bx, by,
|
||||
bz, dynamicMemorySize, stream,
|
||||
argArray, nullPtr, paramsCount}));
|
||||
}
|
||||
|
||||
// Sync & destroy the stream, for synchronous launches.
|
||||
|
13
mlir/lib/Target/LLVMIR/Dialect/SPIRV/CMakeLists.txt
Normal file
13
mlir/lib/Target/LLVMIR/Dialect/SPIRV/CMakeLists.txt
Normal file
@ -0,0 +1,13 @@
|
||||
add_mlir_translation_library(MLIRSPIRVToLLVMIRTranslation
|
||||
SPIRVToLLVMIRTranslation.cpp
|
||||
|
||||
LINK_COMPONENTS
|
||||
Core
|
||||
|
||||
LINK_LIBS PUBLIC
|
||||
MLIRIR
|
||||
MLIRLLVMDialect
|
||||
MLIRSPIRVDialect
|
||||
MLIRSupport
|
||||
MLIRTargetLLVMIRExport
|
||||
)
|
@ -0,0 +1,31 @@
|
||||
//===- SPIRVToLLVMIRTranslation.cpp - Translate SPIR-V to LLVM IR ---------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file registers the MLIR SPIR-V dialect for the translation to LLVM IR.
|
||||
// No SPIR-V operations or attributes are actually translated.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Target/LLVMIR/Dialect/SPIRV/SPIRVToLLVMIRTranslation.h"
|
||||
#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
|
||||
#include "mlir/IR/BuiltinAttributes.h"
|
||||
#include "mlir/IR/Operation.h"
|
||||
#include "mlir/Target/LLVMIR/ModuleTranslation.h"
|
||||
|
||||
using namespace mlir;
|
||||
using namespace mlir::LLVM;
|
||||
|
||||
void mlir::registerSPIRVDialectTranslation(DialectRegistry ®istry) {
|
||||
registry.insert<spirv::SPIRVDialect>();
|
||||
}
|
||||
|
||||
/// Context flavour of the registration: fills a scratch registry through the
/// `DialectRegistry` overload and appends it to the registry of `context`.
void mlir::registerSPIRVDialectTranslation(MLIRContext &context) {
  DialectRegistry spirvRegistry;
  registerSPIRVDialectTranslation(spirvRegistry);
  context.appendDialectRegistry(spirvRegistry);
}
|
@ -142,6 +142,10 @@ if(MLIR_ENABLE_ROCM_RUNNER)
|
||||
list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)
|
||||
endif()
|
||||
|
||||
if(MLIR_ENABLE_SYCL_RUNNER)
|
||||
list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime)
|
||||
endif()
|
||||
|
||||
if (MLIR_RUN_ARM_SME_TESTS AND NOT ARM_SME_ABI_ROUTINES_SHLIB)
|
||||
list(APPEND MLIR_TEST_DEPENDS mlir_arm_sme_abi_stubs)
|
||||
endif()
|
||||
|
56
mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
Normal file
56
mlir/test/Integration/GPU/SYCL/gpu-addf32-to-spirv.mlir
Normal file
@ -0,0 +1,56 @@
|
||||
// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
|
||||
// RUN: | mlir-cpu-runner \
|
||||
// RUN: --shared-libs=%mlir_sycl_runtime \
|
||||
// RUN: --shared-libs=%mlir_runner_utils \
|
||||
// RUN: --entry-point-result=void \
|
||||
// RUN: | FileCheck %s
|
||||
|
||||
module @add attributes {gpu.container_module} {
|
||||
memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
|
||||
memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
|
||||
func.func @main() {
|
||||
%0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
|
||||
%1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
|
||||
%2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
|
||||
%cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
|
||||
call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
|
||||
return
|
||||
}
|
||||
func.func private @printMemrefF32(memref<*xf32>)
|
||||
func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
|
||||
%c2 = arith.constant 2 : index
|
||||
%c1 = arith.constant 1 : index
|
||||
%mem = gpu.alloc host_shared () : memref<2x2x2xf32>
|
||||
memref.copy %arg1, %mem : memref<2x2x2xf32> to memref<2x2x2xf32>
|
||||
%memref_0 = gpu.alloc host_shared () : memref<2x2x2xf32>
|
||||
memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
|
||||
%memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
|
||||
%2 = gpu.wait async
|
||||
%3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
|
||||
gpu.wait [%3]
|
||||
%alloc = memref.alloc() : memref<2x2x2xf32>
|
||||
memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
|
||||
%4 = gpu.wait async
|
||||
%5 = gpu.dealloc async [%4] %memref_2 : memref<2x2x2xf32>
|
||||
%6 = gpu.dealloc async [%5] %memref_0 : memref<2x2x2xf32>
|
||||
%7 = gpu.dealloc async [%6] %mem : memref<2x2x2xf32>
|
||||
gpu.wait [%7]
|
||||
return %alloc : memref<2x2x2xf32>
|
||||
}
|
||||
gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
|
||||
gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
|
||||
%0 = gpu.block_id x
|
||||
%1 = gpu.block_id y
|
||||
%2 = gpu.block_id z
|
||||
%3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
|
||||
%4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
|
||||
%5 = arith.addf %3, %4 : f32
|
||||
memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
// CHECK: [2.3, 4.5]
|
||||
// CHECK: [7.8, 10.2]
|
||||
// CHECK: [12.7, 14.9]
|
||||
// CHECK: [18.2, 20.6]
|
||||
}
|
54
mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
Normal file
54
mlir/test/Integration/GPU/SYCL/gpu-addi64-to-spirv.mlir
Normal file
@ -0,0 +1,54 @@
|
||||
// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
|
||||
// RUN: | mlir-cpu-runner \
|
||||
// RUN: --shared-libs=%mlir_sycl_runtime \
|
||||
// RUN: --shared-libs=%mlir_runner_utils \
|
||||
// RUN: --entry-point-result=void \
|
||||
// RUN: | FileCheck %s
|
||||
|
||||
module @add attributes {gpu.container_module} {
|
||||
memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
|
||||
memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
|
||||
func.func @main() {
|
||||
%0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64>
|
||||
%1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64>
|
||||
%2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64>
|
||||
%cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64>
|
||||
call @printMemrefI64(%cast) : (memref<*xi64>) -> ()
|
||||
return
|
||||
}
|
||||
func.func private @printMemrefI64(memref<*xi64>)
|
||||
func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> {
|
||||
%c3 = arith.constant 3 : index
|
||||
%c1 = arith.constant 1 : index
|
||||
%mem = gpu.alloc host_shared () : memref<3x3xi64>
|
||||
memref.copy %arg1, %mem : memref<3x3xi64> to memref<3x3xi64>
|
||||
%memref_0 = gpu.alloc host_shared () : memref<3x3xi64>
|
||||
memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
|
||||
%memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
|
||||
%2 = gpu.wait async
|
||||
%3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
|
||||
gpu.wait [%3]
|
||||
%alloc = memref.alloc() : memref<3x3xi64>
|
||||
memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
|
||||
%4 = gpu.wait async
|
||||
%5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
|
||||
%6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
|
||||
%7 = gpu.dealloc async [%6] %mem : memref<3x3xi64>
|
||||
gpu.wait [%7]
|
||||
return %alloc : memref<3x3xi64>
|
||||
}
|
||||
gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
|
||||
gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
|
||||
%0 = gpu.block_id x
|
||||
%1 = gpu.block_id y
|
||||
%2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
|
||||
%3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
|
||||
%4 = arith.addi %2, %3 : i64
|
||||
memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
// CHECK: [2, 4100, 6],
|
||||
// CHECK: [16777224, 10, 4294971404],
|
||||
// CHECK: [16777230, 1103806595088, 1099511627794]
|
||||
}
|
79
mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
Normal file
79
mlir/test/Integration/GPU/SYCL/gpu-reluf32-to-spirv.mlir
Normal file
@ -0,0 +1,79 @@
|
||||
// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
|
||||
// RUN: | mlir-cpu-runner \
|
||||
// RUN: --shared-libs=%mlir_sycl_runtime \
|
||||
// RUN: --shared-libs=%mlir_runner_utils \
|
||||
// RUN: --entry-point-result=void \
|
||||
// RUN: | FileCheck %s
|
||||
|
||||
module @relu attributes {gpu.container_module} {
|
||||
memref.global "private" constant @__constant_4x5xf32 : memref<4x5xf32> = dense<[
|
||||
[-1.000000e-01, -2.000000e-01, -3.000000e-01, 4.000000e-01, 5.000000e-01],
|
||||
[1.000000e-01, -2.000000e-01, 3.000000e-01, -4.000000e-01, 5.000000e-01],
|
||||
[1.000000e-01, 2.000000e-01, 3.000000e-01, -4.000000e-01, -5.000000e-01],
|
||||
[1.000000e-01, 2.000000e-01, 3.000000e-01, 4.000000e-01, 5.000000e-01]
|
||||
]>
|
||||
|
||||
func.func @main() {
|
||||
%c1 = arith.constant 1 : index
|
||||
%c100 = arith.constant 100 : index
|
||||
%c0 = arith.constant 0 : index
|
||||
%0 = memref.get_global @__constant_4x5xf32 : memref<4x5xf32>
|
||||
|
||||
scf.for %arg0 = %c0 to %c100 step %c1 {
|
||||
%1 = func.call @test(%0) : (memref<4x5xf32>) -> memref<4x5xf32>
|
||||
%cast = memref.cast %1 : memref<4x5xf32> to memref<*xf32>
|
||||
func.call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
|
||||
// CHECK: [0, 0, 0, 0.4, 0.5],
|
||||
// CHECK: [0.1, 0, 0.3, 0, 0.5],
|
||||
// CHECK: [0.1, 0.2, 0.3, 0, 0],
|
||||
// CHECK: [0.1, 0.2, 0.3, 0.4, 0.5]
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func.func private @printMemrefF32(memref<*xf32>)
|
||||
// Applies ReLU to %arg0 using two kernel launches over a 4x5x1 grid:
//   1. @test_kernel::@test_kernel stores into an i1 buffer whether each input
//      element is less than %cst (0.0).
//   2. @test_kernel_0::@test_kernel selects %cst where that flag is set and
//      the input element otherwise.
// The result is copied into a host allocation, which is returned.
func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> {
  %c5 = arith.constant 5 : index
  %c4 = arith.constant 4 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c1 = arith.constant 1 : index
  %memref = gpu.alloc host_shared () : memref<4x5xf32>
  memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
  %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
  %2 = gpu.wait async
  %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
  gpu.wait [%3]
  %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
  %4 = gpu.wait async
  %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32, %memref_1 : memref<4x5xf32>)
  gpu.wait [%5]
  %alloc = memref.alloc() : memref<4x5xf32>
  memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>
  %6 = gpu.wait async
  %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32>
  %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1>
  %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
  // Join the async dealloc chain opened by `%6 = gpu.wait async` before
  // returning, instead of leaving the final token unconsumed.
  gpu.wait [%9]
  return %alloc : memref<4x5xf32>
}
|
||||
gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
|
||||
gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
|
||||
%0 = gpu.block_id x
|
||||
%1 = gpu.block_id y
|
||||
%2 = memref.load %arg0[%0, %1] : memref<4x5xf32>
|
||||
%3 = arith.cmpf olt, %2, %arg1 : f32
|
||||
memref.store %3, %arg2[%0, %1] : memref<4x5xi1>
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
gpu.module @test_kernel_0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
|
||||
gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
|
||||
%0 = gpu.block_id x
|
||||
%1 = gpu.block_id y
|
||||
%2 = memref.load %arg0[%0, %1] : memref<4x5xi1>
|
||||
%3 = memref.load %arg1[%0, %1] : memref<4x5xf32>
|
||||
%4 = arith.select %2, %arg2, %3 : f32
|
||||
memref.store %4, %arg3[%0, %1] : memref<4x5xf32>
|
||||
gpu.return
|
||||
}
|
||||
}
|
||||
}
|
2
mlir/test/Integration/GPU/SYCL/lit.local.cfg
Normal file
2
mlir/test/Integration/GPU/SYCL/lit.local.cfg
Normal file
@ -0,0 +1,2 @@
|
||||
if not config.enable_sycl_runner:
|
||||
config.unsupported = True
|
@ -17,10 +17,10 @@ module attributes {gpu.container_module} {
|
||||
// CHECK: store i32 32, ptr [[ARG1]], align 4
|
||||
// CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 1
|
||||
// CHECK: store ptr [[ARG1]], ptr %{{.*}}, align 8
|
||||
// CHECK: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst)
|
||||
// CHECK: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
|
||||
// CHECK: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name)
|
||||
// CHECK: [[STREAM:%.*]] = call ptr @mgpuStreamCreate()
|
||||
// CHECK: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null)
|
||||
// CHECK: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null, i64 2)
|
||||
// CHECK: call void @mgpuStreamSynchronize(ptr [[STREAM]])
|
||||
// CHECK: call void @mgpuStreamDestroy(ptr [[STREAM]])
|
||||
// CHECK: call void @mgpuModuleUnload(ptr [[MODULE]])
|
||||
@ -50,6 +50,13 @@ module {
|
||||
|
||||
// -----
|
||||
|
||||
// Checking the correct selection of the second object using a target as a selector.
|
||||
module {
|
||||
// CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
|
||||
gpu.binary @kernel_module <#gpu.select_object<#spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>>> [#gpu.object<#nvvm.target, "NVPTX">, #gpu.object<#spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>, "BLOB">]
|
||||
}
|
||||
|
||||
// -----
|
||||
// Checking the translation of `gpu.launch_func` with an async dependency.
|
||||
module attributes {gpu.container_module} {
|
||||
// CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8
|
||||
@ -59,9 +66,9 @@ module attributes {gpu.container_module} {
|
||||
// CHECK: = call ptr @mgpuStreamCreate()
|
||||
// CHECK-NEXT: = alloca {{.*}}, align 8
|
||||
// CHECK-NEXT: [[ARGS:%.*]] = alloca ptr, i64 0, align 8
|
||||
// CHECK-NEXT: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst)
|
||||
// CHECK-NEXT: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
|
||||
// CHECK-NEXT: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name)
|
||||
// CHECK-NEXT: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 0, ptr {{.*}}, ptr [[ARGS]], ptr null)
|
||||
// CHECK-NEXT: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 0, ptr {{.*}}, ptr [[ARGS]], ptr null, i64 0)
|
||||
// CHECK-NEXT: call void @mgpuModuleUnload(ptr [[MODULE]])
|
||||
// CHECK-NEXT: call void @mgpuStreamSynchronize(ptr %{{.*}})
|
||||
// CHECK-NEXT: call void @mgpuStreamDestroy(ptr %{{.*}})
|
||||
@ -84,7 +91,7 @@ module attributes {gpu.container_module} {
|
||||
gpu.binary @kernel_module [#gpu.object<#nvvm.target, "BLOB">]
|
||||
llvm.func @foo() {
|
||||
// CHECK: [[S2:%.*]] = alloca ptr, i64 0, align 8
|
||||
// CHECK: [[S3:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst)
|
||||
// CHECK: [[S3:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst, i64 4)
|
||||
// CHECK: [[S4:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[S3]], ptr @kernel_module_kernel_kernel_name)
|
||||
// CHECK: [[S5:%.*]] = call ptr @mgpuStreamCreate()
|
||||
// CHECK: call void @mgpuLaunchClusterKernel(ptr [[S4]], i64 2, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i32 0, ptr [[S5]], ptr [[S2]], ptr null)
|
||||
|
@ -132,6 +132,9 @@ if config.enable_rocm_runner:
|
||||
if config.enable_cuda_runner:
|
||||
tools.extend([add_runtime("mlir_cuda_runtime")])
|
||||
|
||||
if config.enable_sycl_runner:
|
||||
tools.extend([add_runtime("mlir_sycl_runtime")])
|
||||
|
||||
if config.mlir_run_arm_sme_tests:
|
||||
config.substitutions.append(
|
||||
(
|
||||
|
@ -31,6 +31,7 @@ config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
|
||||
config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
|
||||
config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
|
||||
config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
|
||||
config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
|
||||
config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
|
||||
config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
|
||||
config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@
|
||||
|
Loading…
x
Reference in New Issue
Block a user