[mlir][ExecutionEngine] Add LevelZeroRuntimeWrapper. (#151038)
Adds LevelZeroRuntime wrapper and tests.

Co-authored-by: Artem Kroviakov <artem.kroviakov@intel.com>
Co-authored-by: Nishant Patel <nishant.b.patel@intel.com>
Parent: 66392a8d8d
Commit: 281e6d2cc4
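Building the new wrapper requires enabling the runner at configure time and making the Level Zero loader discoverable, for example through the LEVEL_ZERO_DIR cache or environment variable handled by the find module below. An illustrative configure invocation (the install path is a placeholder):

  cmake -G Ninja ../llvm -DLLVM_ENABLE_PROJECTS=mlir \
    -DMLIR_ENABLE_LEVELZERO_RUNNER=1 \
    -DLEVEL_ZERO_DIR=/opt/level-zero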
mlir/CMakeLists.txt
@@ -140,6 +140,7 @@ endif()
 set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the MLIR CUDA runner")
 set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the MLIR ROCm runner")
 set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the MLIR SYCL runner")
+set(MLIR_ENABLE_LEVELZERO_RUNNER 0 CACHE BOOL "Enable building the MLIR LevelZero runner")
 set(MLIR_ENABLE_SPIRV_CPU_RUNNER 0 CACHE BOOL "Enable building the MLIR SPIR-V cpu runner")
 set(MLIR_ENABLE_VULKAN_RUNNER 0 CACHE BOOL "Enable building the MLIR Vulkan runner")
 set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL
mlir/cmake/modules/FindLevelZeroRuntime.cmake (renamed from FindLevelZero.cmake)
@@ -20,7 +20,6 @@ include(FindPackageHandleStandardArgs)
 # Search path priority
 # 1. CMake Variable LEVEL_ZERO_DIR
 # 2. Environment Variable LEVEL_ZERO_DIR
-
 if(NOT LEVEL_ZERO_DIR)
   if(DEFINED ENV{LEVEL_ZERO_DIR})
     set(LEVEL_ZERO_DIR "$ENV{LEVEL_ZERO_DIR}")
@@ -28,32 +27,32 @@ if(NOT LEVEL_ZERO_DIR)
   endif()
 endif()

 if(LEVEL_ZERO_DIR)
-  find_path(LevelZero_INCLUDE_DIR
+  find_path(LevelZeroRuntime_INCLUDE_DIR
     NAMES level_zero/ze_api.h
     PATHS ${LEVEL_ZERO_DIR}/include
     NO_DEFAULT_PATH
   )

   if(LINUX)
-    find_library(LevelZero_LIBRARY
+    find_library(LevelZeroRuntime_LIBRARY
       NAMES ze_loader
       PATHS ${LEVEL_ZERO_DIR}/lib
             ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
       NO_DEFAULT_PATH
     )
   else()
-    find_library(LevelZero_LIBRARY
+    find_library(LevelZeroRuntime_LIBRARY
       NAMES ze_loader
       PATHS ${LEVEL_ZERO_DIR}/lib
       NO_DEFAULT_PATH
     )
   endif()
 else()
-  find_path(LevelZero_INCLUDE_DIR
+  find_path(LevelZeroRuntime_INCLUDE_DIR
     NAMES level_zero/ze_api.h
   )

-  find_library(LevelZero_LIBRARY
+  find_library(LevelZeroRuntime_LIBRARY
     NAMES ze_loader
   )
 endif()
@@ -64,12 +63,14 @@ endif()
 # lists of equal lengths, with the shorter string getting zero-padded.
 function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
   # Convert the strings to list
   string(REPLACE "." ";" VL1 ${VERSION_STR1})
   string(REPLACE "." ";" VL2 ${VERSION_STR2})

   # get lengths of both lists
   list(LENGTH VL1 VL1_LEN)
   list(LENGTH VL2 VL2_LEN)
   set(LEN ${VL1_LEN})

   # If they differ in size pad the shorter list with 0s
   if(VL1_LEN GREATER VL2_LEN)
     math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
@@ -98,12 +99,10 @@ function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
       set(${OUTPUT} TRUE PARENT_SCOPE)
     endif()
   endforeach()
-
 endfunction(compare_versions)

 # Creates a small function to run and extract the LevelZero loader version.
 function(get_l0_loader_version)
-
   set(L0_VERSIONEER_SRC
     [====[
       #include <iostream>
@@ -142,19 +141,20 @@ function(get_l0_loader_version)

   # We need both the directories in the include path as ze_loader.h
   # includes "ze_api.h" and not "level_zero/ze_api.h".
-  list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
-  list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}/level_zero)
+  list(APPEND INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR})
+  list(APPEND INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR}/level_zero)
   list(JOIN INCLUDE_DIRS ";" INCLUDE_DIRS_STR)
   try_run(L0_VERSIONEER_RUN L0_VERSIONEER_COMPILE
-    "${CMAKE_BINARY_DIR}"
-    "${L0_VERSIONEER_FILE}"
-    LINK_LIBRARIES ${LevelZero_LIBRARY}
-    CMAKE_FLAGS
-      "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
-    RUN_OUTPUT_VARIABLE L0_VERSION
+    "${CMAKE_BINARY_DIR}"
+    "${L0_VERSIONEER_FILE}"
+    LINK_LIBRARIES ${LevelZeroRuntime_LIBRARY}
+    CMAKE_FLAGS
+      "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
+    RUN_OUTPUT_VARIABLE L0_VERSION
   )
-  if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN))
-    set(LevelZero_VERSION ${L0_VERSION} PARENT_SCOPE)
+  if(${L0_VERSIONEER_COMPILE} AND(DEFINED L0_VERSIONEER_RUN))
+    set(LevelZeroRuntime_VERSION ${L0_VERSION} PARENT_SCOPE)
     message(STATUS "Found Level Zero of version: ${L0_VERSION}")
   else()
     message(FATAL_ERROR
@@ -163,59 +163,61 @@ function(get_l0_loader_version)
   endif()
 endfunction(get_l0_loader_version)

-if(LevelZero_INCLUDE_DIR AND LevelZero_LIBRARY)
-  list(APPEND LevelZero_LIBRARIES "${LevelZero_LIBRARY}")
-  list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+if(LevelZeroRuntime_INCLUDE_DIR AND LevelZeroRuntime_LIBRARY)
+  list(APPEND LevelZeroRuntime_LIBRARIES "${LevelZeroRuntime_LIBRARY}")
+  list(APPEND LevelZeroRuntime_INCLUDE_DIRS ${LevelZeroRuntime_INCLUDE_DIR})

   if(OpenCL_FOUND)
-    list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+    list(APPEND LevelZeroRuntime_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
   endif()

-  cmake_path(GET LevelZero_LIBRARY PARENT_PATH LevelZero_LIBRARIES_PATH)
-  set(LevelZero_LIBRARIES_DIR ${LevelZero_LIBRARIES_PATH})
+  cmake_path(GET LevelZeroRuntime_LIBRARY PARENT_PATH LevelZeroRuntime_LIBRARIES_PATH)
+  set(LevelZeroRuntime_LIBRARIES_DIR ${LevelZeroRuntime_LIBRARIES_PATH})

-  if(NOT TARGET LevelZero::LevelZero)
-    add_library(LevelZero::LevelZero INTERFACE IMPORTED)
-    set_target_properties(LevelZero::LevelZero
-      PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
-    )
-    set_target_properties(LevelZero::LevelZero
-      PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
-    )
+  if(NOT TARGET LevelZeroRuntime::LevelZeroRuntime)
+    add_library(LevelZeroRuntime::LevelZeroRuntime INTERFACE IMPORTED)
+    set_target_properties(LevelZeroRuntime::LevelZeroRuntime
+      PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZeroRuntime_LIBRARIES}"
+    )
+    set_target_properties(LevelZeroRuntime::LevelZeroRuntime
+      PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZeroRuntime_INCLUDE_DIRS}"
+    )
   endif()
 endif()

 # Check if a specific version of Level Zero is required
-if(LevelZero_FIND_VERSION)
+if(LevelZeroRuntime_FIND_VERSION)
   get_l0_loader_version()
   set(VERSION_GT_FIND_VERSION FALSE)
   compare_versions(
-    ${LevelZero_VERSION}
-    ${LevelZero_FIND_VERSION}
+    ${LevelZeroRuntime_VERSION}
+    ${LevelZeroRuntime_FIND_VERSION}
     VERSION_GT_FIND_VERSION
   )

   if(${VERSION_GT_FIND_VERSION})
-    set(LevelZero_FOUND TRUE)
+    set(LevelZeroRuntime_FOUND TRUE)
   else()
-    set(LevelZero_FOUND FALSE)
+    set(LevelZeroRuntime_FOUND FALSE)
   endif()
 else()
-  set(LevelZero_FOUND TRUE)
+  set(LevelZeroRuntime_FOUND TRUE)
 endif()

-find_package_handle_standard_args(LevelZero
+find_package_handle_standard_args(LevelZeroRuntime
   REQUIRED_VARS
-    LevelZero_FOUND
-    LevelZero_INCLUDE_DIRS
-    LevelZero_LIBRARY
-    LevelZero_LIBRARIES_DIR
+    LevelZeroRuntime_FOUND
+    LevelZeroRuntime_INCLUDE_DIRS
+    LevelZeroRuntime_LIBRARY
+    LevelZeroRuntime_LIBRARIES_DIR
   HANDLE_COMPONENTS
 )
-mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIRS)
+mark_as_advanced(LevelZeroRuntime_LIBRARY LevelZeroRuntime_INCLUDE_DIRS)

-if(LevelZero_FOUND)
-  find_package_message(LevelZero "Found LevelZero: ${LevelZero_LIBRARY}"
-    "(found version ${LevelZero_VERSION})"
+if(LevelZeroRuntime_FOUND)
+  find_package_message(LevelZeroRuntime "Found LevelZero: ${LevelZeroRuntime_LIBRARY}"
+    "(found version ${LevelZeroRuntime_VERSION})"
   )
 else()
-  find_package_message(LevelZero "Could not find LevelZero" "")
+  find_package_message(LevelZeroRuntime "Could not find LevelZero" "")
 endif()
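Downstream CMake consumes the renamed module in the standard find-module way; the ExecutionEngine changes below do exactly that. A minimal sketch (the target my_l0_tool is hypothetical; the package and imported target names come from the module above):

  find_package(LevelZeroRuntime REQUIRED)
  add_executable(my_l0_tool main.cpp)
  target_link_libraries(my_l0_tool PRIVATE LevelZeroRuntime::LevelZeroRuntime)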
mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -14,6 +14,7 @@ set(LLVM_OPTIONAL_SOURCES
   RunnerUtils.cpp
   OptUtils.cpp
   JitRunner.cpp
+  LevelZeroRuntimeWrappers.cpp
   SpirvCpuRuntimeWrappers.cpp
   SyclRuntimeWrappers.cpp
   VulkanRuntimeWrappers.cpp
@@ -374,6 +375,15 @@ if(LLVM_ENABLE_PIC)
     )
   endif()

+  if(MLIR_ENABLE_SYCL_RUNNER OR MLIR_ENABLE_LEVELZERO_RUNNER)
+    # Both runtimes require LevelZero, so we can find it once.
+    find_package(LevelZeroRuntime)
+
+    if(NOT LevelZeroRuntime_FOUND)
+      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
+    endif()
+  endif()
+
   if(MLIR_ENABLE_SYCL_RUNNER)
     find_package(SyclRuntime)

@@ -381,12 +391,6 @@ if(LLVM_ENABLE_PIC)
       message(FATAL_ERROR "syclRuntime not found. Please check the oneAPI installation and run setvars.sh.")
     endif()

-    find_package(LevelZero)
-
-    if(NOT LevelZero_FOUND)
-      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
-    endif()
-
     add_mlir_library(mlir_sycl_runtime
       SHARED
       SyclRuntimeWrappers.cpp
@@ -404,9 +408,28 @@ if(LLVM_ENABLE_PIC)
       ${MLIR_INCLUDE_DIRS}
     )

-    target_link_libraries(mlir_sycl_runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
+    target_link_libraries(mlir_sycl_runtime PRIVATE LevelZeroRuntime::LevelZeroRuntime SyclRuntime::SyclRuntime)

-    set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+    set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZeroRuntime_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
   endif()

+  if(MLIR_ENABLE_LEVELZERO_RUNNER)
+    add_mlir_library(mlir_levelzero_runtime
+      SHARED
+      LevelZeroRuntimeWrappers.cpp
+
+      EXCLUDE_FROM_LIBMLIR
+    )
+
+    target_compile_options(mlir_levelzero_runtime PUBLIC -fexceptions -frtti)
+
+    target_include_directories(mlir_levelzero_runtime PRIVATE
+      ${MLIR_INCLUDE_DIRS}
+    )
+
+    target_link_libraries(mlir_levelzero_runtime PRIVATE LevelZeroRuntime::LevelZeroRuntime)
+
+    set_property(TARGET mlir_levelzero_runtime APPEND PROPERTY BUILD_RPATH "${LevelZeroRuntime_LIBRARIES_DIR}")
+  endif()
+
   if(MLIR_ENABLE_SPIRV_CPU_RUNNER)
mlir/lib/ExecutionEngine/LevelZeroRuntimeWrappers.cpp (new file, 573 lines)
@@ -0,0 +1,573 @@
//===- LevelZeroRuntimeWrappers.cpp - MLIR Level Zero (L0) wrapper library-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implements wrappers around the Level Zero (L0) runtime library with C linkage
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/Twine.h"

#include "level_zero/ze_api.h"
#include <cassert>
#include <deque>
#include <exception>
#include <functional>
#include <iostream>
#include <limits>
#include <memory>
#include <unordered_map>
#include <vector>

namespace {
template <typename F>
auto catchAll(F &&func) {
  try {
    return func();
  } catch (const std::exception &e) {
    std::cerr << "An exception was thrown: " << e.what() << std::endl;
    std::abort();
  } catch (...) {
    std::cerr << "An unknown exception was thrown." << std::endl;
    std::abort();
  }
}

#define L0_SAFE_CALL(call)                                                     \
  {                                                                            \
    ze_result_t status = (call);                                               \
    if (status != ZE_RESULT_SUCCESS) {                                         \
      const char *errorString;                                                 \
      zeDriverGetLastErrorDescription(NULL, &errorString);                     \
      std::cerr << "L0 error " << status << ": " << errorString << std::endl;  \
      std::abort();                                                            \
    }                                                                          \
  }
} // namespace

//===----------------------------------------------------------------------===//
// L0 RT context & device setters
//===----------------------------------------------------------------------===//

// Returns the L0 driver handle for the given index. Default index is 0
// (i.e., returns the first driver handle of the available drivers).

static ze_driver_handle_t getDriver(uint32_t idx = 0) {
  ze_init_driver_type_desc_t driver_type = {};
  driver_type.stype = ZE_STRUCTURE_TYPE_INIT_DRIVER_TYPE_DESC;
  driver_type.flags = ZE_INIT_DRIVER_TYPE_FLAG_GPU;
  driver_type.pNext = nullptr;
  uint32_t driverCount{0};
  thread_local static std::vector<ze_driver_handle_t> drivers;
  thread_local static bool isDriverInitialised{false};
  if (isDriverInitialised && idx < drivers.size())
    return drivers[idx];
  L0_SAFE_CALL(zeInitDrivers(&driverCount, nullptr, &driver_type));
  if (!driverCount)
    throw std::runtime_error("No L0 drivers found.");
  drivers.resize(driverCount);
  L0_SAFE_CALL(zeInitDrivers(&driverCount, drivers.data(), &driver_type));
  if (idx >= driverCount)
    throw std::runtime_error((llvm::Twine("Requested driver idx out-of-bounds, "
                                          "number of available drivers: ") +
                              std::to_string(driverCount))
                                 .str());
  isDriverInitialised = true;
  return drivers[idx];
}

static ze_device_handle_t getDevice(const uint32_t driverIdx = 0,
                                    const int32_t devIdx = 0) {
  thread_local static ze_device_handle_t l0Device;
  thread_local int32_t currDevIdx{-1};
  thread_local uint32_t currDriverIdx{0};
  if (currDriverIdx == driverIdx && currDevIdx == devIdx)
    return l0Device;
  auto driver = getDriver(driverIdx);
  uint32_t deviceCount{0};
  L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, nullptr));
  if (!deviceCount)
    throw std::runtime_error("getDevice failed: did not find L0 device.");
  if (static_cast<int>(deviceCount) < devIdx + 1)
    throw std::runtime_error("getDevice failed: devIdx out-of-bounds.");
  std::vector<ze_device_handle_t> devices(deviceCount);
  L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, devices.data()));
  l0Device = devices[devIdx];
  currDriverIdx = driverIdx;
  currDevIdx = devIdx;
  return l0Device;
}

// Returns the default L0 context of the default driver.
static ze_context_handle_t getContext(ze_driver_handle_t driver) {
  thread_local static ze_context_handle_t context;
  thread_local static bool isContextInitialised{false};
  if (isContextInitialised)
    return context;
  ze_context_desc_t ctxtDesc = {ZE_STRUCTURE_TYPE_CONTEXT_DESC, nullptr, 0};
  L0_SAFE_CALL(zeContextCreate(driver, &ctxtDesc, &context));
  isContextInitialised = true;
  return context;
}

//===----------------------------------------------------------------------===//
// L0 RT helper structs
//===----------------------------------------------------------------------===//

struct ZeContextDeleter {
  void operator()(ze_context_handle_t ctx) const {
    if (ctx)
      L0_SAFE_CALL(zeContextDestroy(ctx));
  }
};

struct ZeCommandListDeleter {
  void operator()(ze_command_list_handle_t cmdList) const {
    if (cmdList)
      L0_SAFE_CALL(zeCommandListDestroy(cmdList));
  }
};
using UniqueZeContext =
    std::unique_ptr<std::remove_pointer<ze_context_handle_t>::type,
                    ZeContextDeleter>;
using UniqueZeCommandList =
    std::unique_ptr<std::remove_pointer<ze_command_list_handle_t>::type,
                    ZeCommandListDeleter>;
struct L0RTContextWrapper {
  ze_driver_handle_t driver{nullptr};
  ze_device_handle_t device{nullptr};
  UniqueZeContext context;
  // Usually, one immediate command list with ordinal 0 suffices for
  // both copy and compute ops, but leaves HW underutilized.
  UniqueZeCommandList immCmdListCompute;
  // Copy engines can be used for both memcpy and memset, but
  // they have limitations for memset pattern size (e.g., 1 byte).
  UniqueZeCommandList immCmdListCopy;
  uint32_t copyEngineMaxMemoryFillPatternSize{-1u};

  L0RTContextWrapper() = default;
  L0RTContextWrapper(const uint32_t driverIdx = 0, const int32_t devIdx = 0)
      : driver(getDriver(driverIdx)), device(getDevice(devIdx)) {
    // Create context
    ze_context_handle_t ctx = getContext(driver);
    context.reset(ctx);

    // Determine ordinals
    uint32_t computeEngineOrdinal = -1u, copyEngineOrdinal = -1u;
    ze_device_properties_t deviceProperties{};
    L0_SAFE_CALL(zeDeviceGetProperties(device, &deviceProperties));
    uint32_t queueGroupCount = 0;
    L0_SAFE_CALL(zeDeviceGetCommandQueueGroupProperties(
        device, &queueGroupCount, nullptr));
    std::vector<ze_command_queue_group_properties_t> queueGroupProperties(
        queueGroupCount);
    L0_SAFE_CALL(zeDeviceGetCommandQueueGroupProperties(
        device, &queueGroupCount, queueGroupProperties.data()));

    for (uint32_t queueGroupIdx = 0; queueGroupIdx < queueGroupCount;
         ++queueGroupIdx) {
      const auto &group = queueGroupProperties[queueGroupIdx];
      if (group.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)
        computeEngineOrdinal = queueGroupIdx;
      else if (group.flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) {
        copyEngineOrdinal = queueGroupIdx;
        copyEngineMaxMemoryFillPatternSize = group.maxMemoryFillPatternSize;
      }
      if (copyEngineOrdinal != -1u && computeEngineOrdinal != -1u)
        break;
    }

    // Fallback to the default queue if no dedicated copy queue is available.
    if (copyEngineOrdinal == -1u)
      copyEngineOrdinal = computeEngineOrdinal;

    assert(copyEngineOrdinal != -1u && computeEngineOrdinal != -1u &&
           "Expected two engines to be available.");

    // Create copy command list
    ze_command_queue_desc_t cmdQueueDesc{
        ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
        nullptr,
        copyEngineOrdinal, // ordinal
        0,                 // index (assume one physical engine in the group)
        0,                 // flags
        ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
        ZE_COMMAND_QUEUE_PRIORITY_NORMAL};

    ze_command_list_handle_t rawCmdListCopy = nullptr;
    L0_SAFE_CALL(zeCommandListCreateImmediate(context.get(), device,
                                              &cmdQueueDesc, &rawCmdListCopy));
    immCmdListCopy.reset(rawCmdListCopy);

    // Create compute command list
    cmdQueueDesc.ordinal = computeEngineOrdinal;
    ze_command_list_handle_t rawCmdListCompute = nullptr;
    L0_SAFE_CALL(zeCommandListCreateImmediate(
        context.get(), device, &cmdQueueDesc, &rawCmdListCompute));
    immCmdListCompute.reset(rawCmdListCompute);
  }
  L0RTContextWrapper(const L0RTContextWrapper &) = delete;
  L0RTContextWrapper &operator=(const L0RTContextWrapper &) = delete;
  // Allow move
  L0RTContextWrapper(L0RTContextWrapper &&) noexcept = default;
  L0RTContextWrapper &operator=(L0RTContextWrapper &&) noexcept = default;
  ~L0RTContextWrapper() = default;
};

struct ZeEventDeleter {
  void operator()(ze_event_handle_t event) const {
    if (event)
      L0_SAFE_CALL(zeEventDestroy(event));
  }
};

struct ZeEventPoolDeleter {
  void operator()(ze_event_pool_handle_t pool) const {
    if (pool)
      L0_SAFE_CALL(zeEventPoolDestroy(pool));
  }
};

using UniqueZeEvent =
    std::unique_ptr<std::remove_pointer<ze_event_handle_t>::type,
                    ZeEventDeleter>;
using UniqueZeEventPool =
    std::unique_ptr<std::remove_pointer<ze_event_pool_handle_t>::type,
                    ZeEventPoolDeleter>;

// L0 only supports pre-determined sizes of event pools,
// implement a runtime data structure to avoid running out of events.

struct DynamicEventPool {
  constexpr static size_t numEventsPerPool{128};

  std::vector<UniqueZeEventPool> eventPools;
  std::vector<UniqueZeEvent> availableEvents;
  std::unordered_map<ze_event_handle_t, UniqueZeEvent> takenEvents;

  // Limit the number of events to avoid running out of memory.
  // The limit is set to 32K events, which should be sufficient for most use
  // cases.
  size_t maxEventsCount{32768}; // 32K events
  size_t currentEventsLimit{0};
  size_t currentEventsCnt{0};
  L0RTContextWrapper *rtCtx;

  DynamicEventPool(L0RTContextWrapper *rtCtx) : rtCtx(rtCtx) {
    createNewPool(numEventsPerPool);
  }

  DynamicEventPool(const DynamicEventPool &) = delete;
  DynamicEventPool &operator=(const DynamicEventPool &) = delete;

  // Allow move
  DynamicEventPool(DynamicEventPool &&) noexcept = default;
  DynamicEventPool &operator=(DynamicEventPool &&) noexcept = default;

  ~DynamicEventPool() {
    assert(takenEvents.empty() && "Some events were not released");
  }

  void createNewPool(size_t numEvents) {
    ze_event_pool_desc_t eventPoolDesc = {};
    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
    eventPoolDesc.count = numEvents;

    ze_event_pool_handle_t rawPool = nullptr;
    L0_SAFE_CALL(zeEventPoolCreate(rtCtx->context.get(), &eventPoolDesc, 1,
                                   &rtCtx->device, &rawPool));

    eventPools.emplace_back(UniqueZeEventPool(rawPool));
    currentEventsLimit += numEvents;
  }

  ze_event_handle_t takeEvent() {
    ze_event_handle_t rawEvent = nullptr;

    if (!availableEvents.empty()) {
      // Reuse one
      auto uniqueEvent = std::move(availableEvents.back());
      availableEvents.pop_back();
      rawEvent = uniqueEvent.get();
      takenEvents[rawEvent] = std::move(uniqueEvent);
    } else {
      if (currentEventsCnt >= maxEventsCount) {
        throw std::runtime_error("DynamicEventPool: reached max events limit");
      }
      if (currentEventsCnt == currentEventsLimit)
        createNewPool(numEventsPerPool);

      ze_event_desc_t eventDesc = {
          ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr,
          static_cast<uint32_t>(currentEventsCnt % numEventsPerPool),
          ZE_EVENT_SCOPE_FLAG_DEVICE, ZE_EVENT_SCOPE_FLAG_HOST};

      ze_event_handle_t newEvent = nullptr;
      L0_SAFE_CALL(
          zeEventCreate(eventPools.back().get(), &eventDesc, &newEvent));

      takenEvents[newEvent] = UniqueZeEvent(newEvent);
      rawEvent = newEvent;
      currentEventsCnt++;
    }

    return rawEvent;
  }

  void releaseEvent(ze_event_handle_t event) {
    auto it = takenEvents.find(event);
    assert(it != takenEvents.end() &&
           "Attempting to release unknown or already released event");

    L0_SAFE_CALL(zeEventHostReset(event));
    availableEvents.emplace_back(std::move(it->second));
    takenEvents.erase(it);
  }
};

L0RTContextWrapper &getRtContext() {
  thread_local static L0RTContextWrapper rtContext(0);
  return rtContext;
}

DynamicEventPool &getDynamicEventPool() {
  thread_local static DynamicEventPool dynEventPool{&getRtContext()};
  return dynEventPool;
}

struct StreamWrapper {
  // avoid event pointer invalidations
  std::deque<ze_event_handle_t> implicitEventStack;
  DynamicEventPool &dynEventPool;

  StreamWrapper(DynamicEventPool &dynEventPool) : dynEventPool(dynEventPool) {}
  ~StreamWrapper() { sync(); }

  ze_event_handle_t *getLastImplicitEventPtr() {
    // Assume current implicit events will not be used after `sync`.
    return implicitEventStack.size() ? &implicitEventStack.back() : nullptr;
  }

  void sync(ze_event_handle_t explicitEvent = nullptr) {
    ze_event_handle_t syncEvent{nullptr};
    if (!explicitEvent) {
      ze_event_handle_t *lastImplicitEventPtr = getLastImplicitEventPtr();
      syncEvent = lastImplicitEventPtr ? *lastImplicitEventPtr : nullptr;
    } else {
      syncEvent = explicitEvent;
    }
    if (syncEvent)
      L0_SAFE_CALL(zeEventHostSynchronize(
          syncEvent, std::numeric_limits<uint64_t>::max()));
    // All of the "implicit" events were signaled and are of no use, release
    // them. "explicit" event must be "released" via mgpuEventDestroy
    for (auto event : implicitEventStack)
      dynEventPool.releaseEvent(event);
    implicitEventStack.clear();
  }

  template <typename Func>
  void enqueueOp(Func &&op) {
    ze_event_handle_t newImplicitEvent = dynEventPool.takeEvent();
    ze_event_handle_t *lastImplicitEventPtr = getLastImplicitEventPtr();
    const uint32_t numWaitEvents = lastImplicitEventPtr ? 1 : 0;
    std::forward<Func>(op)(newImplicitEvent, numWaitEvents,
                           lastImplicitEventPtr);
    implicitEventStack.push_back(newImplicitEvent);
  }
};

static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
  assert(data);
  ze_module_handle_t zeModule;
  ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
                           nullptr,
                           ZE_MODULE_FORMAT_IL_SPIRV,
                           dataSize,
                           (const uint8_t *)data,
                           nullptr,
                           nullptr};
  ze_module_build_log_handle_t buildLogHandle;
  ze_result_t result =
      zeModuleCreate(getRtContext().context.get(), getRtContext().device, &desc,
                     &zeModule, &buildLogHandle);
  if (result != ZE_RESULT_SUCCESS) {
    std::cerr << "Error creating module, error code: " << result << std::endl;
    size_t logSize = 0;
    L0_SAFE_CALL(zeModuleBuildLogGetString(buildLogHandle, &logSize, nullptr));
    std::string buildLog(logSize, ' ');
    L0_SAFE_CALL(
        zeModuleBuildLogGetString(buildLogHandle, &logSize, buildLog.data()));
    std::cerr << "Build log:\n" << buildLog << std::endl;
    std::abort();
  }
  return zeModule;
}

//===----------------------------------------------------------------------===//
// L0 Wrappers definition
//===----------------------------------------------------------------------===//

extern "C" StreamWrapper *mgpuStreamCreate() {
  return new StreamWrapper(getDynamicEventPool());
}

extern "C" void mgpuStreamSynchronize(StreamWrapper *stream) {
  if (stream)
    stream->sync();
}

extern "C" void mgpuStreamDestroy(StreamWrapper *stream) { delete stream; }

extern "C" void mgpuStreamWaitEvent(StreamWrapper *stream,
                                    ze_event_handle_t event) {
  assert(stream && "Invalid stream");
  assert(event && "Invalid event");
  stream->sync(event);
}

extern "C" ze_event_handle_t mgpuEventCreate() {
  return getDynamicEventPool().takeEvent();
}

extern "C" void mgpuEventDestroy(ze_event_handle_t event) {
  return getDynamicEventPool().releaseEvent(event);
}

extern "C" void mgpuEventSynchronize(ze_event_handle_t event) {
  L0_SAFE_CALL(
      zeEventHostSynchronize(event, std::numeric_limits<uint64_t>::max()));
  L0_SAFE_CALL(zeEventHostReset(event));
}

extern "C" void mgpuEventRecord(ze_event_handle_t event,
                                StreamWrapper *stream) {
  L0_SAFE_CALL(zeCommandListAppendSignalEvent(
      getRtContext().immCmdListCopy.get(), event));
  L0_SAFE_CALL(zeCommandListAppendSignalEvent(
      getRtContext().immCmdListCompute.get(), event));
}

extern "C" void *mgpuMemAlloc(uint64_t size, StreamWrapper *stream,
                              bool isShared) {
  return catchAll([&]() {
    void *memPtr = nullptr;
    constexpr size_t alignment{64};
    ze_device_mem_alloc_desc_t deviceDesc = {};
    deviceDesc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
    if (isShared) {
      ze_host_mem_alloc_desc_t hostDesc = {};
      hostDesc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
      L0_SAFE_CALL(zeMemAllocShared(getRtContext().context.get(), &deviceDesc,
                                    &hostDesc, size, alignment,
                                    getRtContext().device, &memPtr));
    } else {
      L0_SAFE_CALL(zeMemAllocDevice(getRtContext().context.get(), &deviceDesc,
                                    size, alignment, getRtContext().device,
                                    &memPtr));
    }
    if (!memPtr)
      throw std::runtime_error("mem allocation failed!");
    return memPtr;
  });
}

extern "C" void mgpuMemFree(void *ptr, StreamWrapper *stream) {
  stream->sync();
  if (ptr)
    L0_SAFE_CALL(zeMemFree(getRtContext().context.get(), ptr));
}

extern "C" void mgpuMemcpy(void *dst, void *src, size_t sizeBytes,
                           StreamWrapper *stream) {
  stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
                        ze_event_handle_t *waitEvents) {
    L0_SAFE_CALL(zeCommandListAppendMemoryCopy(
        getRtContext().immCmdListCopy.get(), dst, src, sizeBytes, newEvent,
        numWaitEvents, waitEvents));
  });
}

template <typename PATTERN_TYPE>
void mgpuMemset(void *dst, PATTERN_TYPE value, size_t count,
                StreamWrapper *stream) {
  L0RTContextWrapper &rtContext = getRtContext();
  auto listType =
      rtContext.copyEngineMaxMemoryFillPatternSize >= sizeof(PATTERN_TYPE)
          ? rtContext.immCmdListCopy.get()
          : rtContext.immCmdListCompute.get();
  stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
                        ze_event_handle_t *waitEvents) {
    L0_SAFE_CALL(zeCommandListAppendMemoryFill(
        listType, dst, &value, sizeof(PATTERN_TYPE),
        count * sizeof(PATTERN_TYPE), newEvent, numWaitEvents, waitEvents));
  });
}
extern "C" void mgpuMemset32(void *dst, unsigned int value, size_t count,
                             StreamWrapper *stream) {
  mgpuMemset<unsigned int>(dst, value, count, stream);
}

extern "C" void mgpuMemset16(void *dst, unsigned short value, size_t count,
                             StreamWrapper *stream) {
  mgpuMemset<unsigned short>(dst, value, count, stream);
}

extern "C" ze_module_handle_t mgpuModuleLoad(const void *data,
                                             size_t gpuBlobSize) {
  return catchAll([&]() { return loadModule(data, gpuBlobSize); });
}

extern "C" ze_kernel_handle_t mgpuModuleGetFunction(ze_module_handle_t module,
                                                    const char *name) {
  assert(module && name);
  ze_kernel_handle_t zeKernel;
  ze_kernel_desc_t desc = {};
  desc.pKernelName = name;
  L0_SAFE_CALL(zeKernelCreate(module, &desc, &zeKernel));
  return zeKernel;
}

extern "C" void mgpuLaunchKernel(ze_kernel_handle_t kernel, size_t gridX,
                                 size_t gridY, size_t gridZ, size_t blockX,
                                 size_t blockY, size_t blockZ,
                                 size_t sharedMemBytes, StreamWrapper *stream,
                                 void **params, void ** /*extra*/,
                                 size_t paramsCount) {

  if (sharedMemBytes > 0) {
    paramsCount = paramsCount - 1; // Last param is shared memory size
    L0_SAFE_CALL(
        zeKernelSetArgumentValue(kernel, paramsCount, sharedMemBytes, nullptr));
  }
  for (size_t i = 0; i < paramsCount; ++i)
    L0_SAFE_CALL(zeKernelSetArgumentValue(kernel, static_cast<uint32_t>(i),
                                          sizeof(void *), params[i]));
  L0_SAFE_CALL(zeKernelSetGroupSize(kernel, blockX, blockY, blockZ));
  ze_group_count_t dispatch;
  dispatch.groupCountX = static_cast<uint32_t>(gridX);
  dispatch.groupCountY = static_cast<uint32_t>(gridY);
  dispatch.groupCountZ = static_cast<uint32_t>(gridZ);
  stream->enqueueOp([&](ze_event_handle_t newEvent, uint32_t numWaitEvents,
                        ze_event_handle_t *waitEvents) {
    L0_SAFE_CALL(zeCommandListAppendLaunchKernel(
        getRtContext().immCmdListCompute.get(), kernel, &dispatch, newEvent,
        numWaitEvents, waitEvents));
  });
}

extern "C" void mgpuModuleUnload(ze_module_handle_t module) {
  L0_SAFE_CALL(zeModuleDestroy(module));
}

extern "C" void mgpuSetDefaultDevice(int32_t devIdx) {
  catchAll([&]() {
    // For now, a user must ensure that streams and events complete
    // and are destroyed before switching a device.
    getRtContext() = L0RTContextWrapper(devIdx);
    getDynamicEventPool() = DynamicEventPool(&getRtContext());
  });
}
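These entry points are normally called from host code that mlir-runner generates, not written by hand. A minimal hand-written sketch of the same sequence, assuming it lives in this translation unit; spirvBlob/blobSize and the kernel name are hypothetical stand-ins for what gpu-module-to-binary produces:

// Illustrative only: mirrors the call sequence the GPU-to-LLVM lowering emits.
static void exampleRoundTrip(const void *spirvBlob, size_t blobSize) {
  StreamWrapper *stream = mgpuStreamCreate();
  // Host-visible (shared) allocation so the host can read results directly.
  void *buf = mgpuMemAlloc(4 * sizeof(float), stream, /*isShared=*/true);
  ze_module_handle_t module = mgpuModuleLoad(spirvBlob, blobSize);
  ze_kernel_handle_t kernel = mgpuModuleGetFunction(module, "test_kernel");
  void *params[] = {&buf}; // bare-pointer kernel ABI: one pointer argument
  mgpuLaunchKernel(kernel, /*grid=*/1, 1, 1, /*block=*/1, 1, 1,
                   /*sharedMemBytes=*/0, stream, params, /*extra=*/nullptr,
                   /*paramsCount=*/1);
  mgpuStreamSynchronize(stream); // drains and releases the implicit events
  mgpuMemFree(buf, stream);
  mgpuModuleUnload(module);
  mgpuStreamDestroy(stream);
}

Note the design choice visible here: each enqueued op signals a fresh event from the DynamicEventPool and waits on the previous one, so ops on a stream execute in order without a dedicated in-order queue.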
mlir/test/CMakeLists.txt
@@ -167,6 +167,10 @@ if(MLIR_ENABLE_SYCL_RUNNER)
   list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime)
 endif()

+if(MLIR_ENABLE_LEVELZERO_RUNNER)
+  list(APPEND MLIR_TEST_DEPENDS mlir_levelzero_runtime)
+endif()
+
 if (MLIR_RUN_ARM_SME_TESTS AND NOT ARM_SME_ABI_ROUTINES_SHLIB)
   list(APPEND MLIR_TEST_DEPENDS mlir_arm_sme_abi_stubs)
 endif()
mlir/test/Integration/GPU/LevelZero/gpu-addf32-to-spirv.mlir (new file, 59 lines)
@@ -0,0 +1,59 @@
// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
// RUN: | mlir-runner \
// RUN:   --shared-libs=%mlir_levelzero_runtime \
// RUN:   --shared-libs=%mlir_runner_utils \
// RUN:   --entry-point-result=void \
// RUN: | FileCheck %s

module @add attributes {gpu.container_module} {
  memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
  memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
  func.func @main() {
    %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
    %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
    %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
    %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
    return
  }
  func.func private @printMemrefF32(memref<*xf32>)
  func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
    %c2 = arith.constant 2 : index
    %c1 = arith.constant 1 : index
    %mem = gpu.alloc host_shared () : memref<2x2x2xf32>
    memref.copy %arg1, %mem : memref<2x2x2xf32> to memref<2x2x2xf32>
    %memref_0 = gpu.alloc host_shared () : memref<2x2x2xf32>
    memref.copy %arg0, %memref_0 : memref<2x2x2xf32> to memref<2x2x2xf32>
    %memref_2 = gpu.alloc host_shared () : memref<2x2x2xf32>
    %2 = gpu.wait async
    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1)
        args(%memref_0 : memref<2x2x2xf32>, %mem : memref<2x2x2xf32>, %memref_2 : memref<2x2x2xf32>)
    gpu.wait [%3]
    %alloc = memref.alloc() : memref<2x2x2xf32>
    memref.copy %memref_2, %alloc : memref<2x2x2xf32> to memref<2x2x2xf32>
    %4 = gpu.wait async
    %5 = gpu.dealloc async [%4] %memref_2 : memref<2x2x2xf32>
    %6 = gpu.dealloc async [%5] %memref_0 : memref<2x2x2xf32>
    %7 = gpu.dealloc async [%6] %mem : memref<2x2x2xf32>
    gpu.wait [%7]
    return %alloc : memref<2x2x2xf32>
  }
  gpu.module @test_kernel
      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel
        attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
      %0 = gpu.block_id x
      %1 = gpu.block_id y
      %2 = gpu.block_id z
      %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
      %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
      %5 = arith.addf %3, %4 : f32
      memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
      gpu.return
    }
  }
  // CHECK: [2.3, 4.5]
  // CHECK: [7.8, 10.2]
  // CHECK: [12.7, 14.9]
  // CHECK: [18.2, 20.6]
}
mlir/test/Integration/GPU/LevelZero/gpu-addi64-to-spirv.mlir (new file, 57 lines)
@@ -0,0 +1,57 @@
// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
// RUN: | mlir-runner \
// RUN:   --shared-libs=%mlir_levelzero_runtime \
// RUN:   --shared-libs=%mlir_runner_utils \
// RUN:   --entry-point-result=void \
// RUN: | FileCheck %s

module @add attributes {gpu.container_module} {
  memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
  memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
  func.func @main() {
    %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64>
    %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64>
    %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64>
    %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64>
    call @printMemrefI64(%cast) : (memref<*xi64>) -> ()
    return
  }
  func.func private @printMemrefI64(memref<*xi64>)
  func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> {
    %c3 = arith.constant 3 : index
    %c1 = arith.constant 1 : index
    %mem = gpu.alloc host_shared () : memref<3x3xi64>
    memref.copy %arg1, %mem : memref<3x3xi64> to memref<3x3xi64>
    %memref_0 = gpu.alloc host_shared () : memref<3x3xi64>
    memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
    %memref_2 = gpu.alloc host_shared () : memref<3x3xi64>
    %2 = gpu.wait async
    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1)
        args(%memref_0 : memref<3x3xi64>, %mem : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
    gpu.wait [%3]
    %alloc = memref.alloc() : memref<3x3xi64>
    memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
    %4 = gpu.wait async
    %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
    %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
    %7 = gpu.dealloc async [%6] %mem : memref<3x3xi64>
    gpu.wait [%7]
    return %alloc : memref<3x3xi64>
  }
  gpu.module @test_kernel
      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel
        attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
      %0 = gpu.block_id x
      %1 = gpu.block_id y
      %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
      %3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
      %4 = arith.addi %2, %3 : i64
      memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
      gpu.return
    }
  }
  // CHECK: [2, 4100, 6],
  // CHECK: [16777224, 10, 4294971404],
  // CHECK: [16777230, 1103806595088, 1099511627794]
}
mlir/test/Integration/GPU/LevelZero/… (new file, 56 lines)
@@ -0,0 +1,56 @@
// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(gpu-async-region),spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
// RUN: | mlir-runner \
// RUN:   --shared-libs=%mlir_levelzero_runtime \
// RUN:   --shared-libs=%mlir_runner_utils \
// RUN:   --entry-point-result=void \
// RUN: | FileCheck %s

module @add attributes {gpu.container_module} {
  memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
  memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
  func.func @main() {
    %0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
    %1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
    %2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
    %cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
    call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
    memref.dealloc %2 : memref<2x2x2xf32>
    return
  }
  func.func private @printMemrefF32(memref<*xf32>)
  func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
    %c2 = arith.constant 2 : index
    %c1 = arith.constant 1 : index
    %memref = gpu.alloc () : memref<2x2x2xf32>
    gpu.memcpy %memref, %arg0 : memref<2x2x2xf32>, memref<2x2x2xf32>
    %memref_0 = gpu.alloc () : memref<2x2x2xf32>
    gpu.memcpy %memref_0, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32>
    %memref_1 = gpu.alloc () : memref<2x2x2xf32>
    gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1)
        args(%memref : memref<2x2x2xf32>, %memref_0 : memref<2x2x2xf32>, %memref_1 : memref<2x2x2xf32>)
    %alloc = memref.alloc() : memref<2x2x2xf32>
    gpu.memcpy %alloc, %memref_1 : memref<2x2x2xf32>, memref<2x2x2xf32>
    gpu.dealloc %memref_1 : memref<2x2x2xf32>
    gpu.dealloc %memref_0 : memref<2x2x2xf32>
    gpu.dealloc %memref : memref<2x2x2xf32>
    return %alloc : memref<2x2x2xf32>
  }
  gpu.module @test_kernel
      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel
        attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
      %0 = gpu.block_id x
      %1 = gpu.block_id y
      %2 = gpu.block_id z
      %3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
      %4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
      %5 = arith.addf %3, %4 : f32
      memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
      gpu.return
    }
  }
  // CHECK: [2.3, 4.5]
  // CHECK: [7.8, 10.2]
  // CHECK: [12.7, 14.9]
  // CHECK: [18.2, 20.6]
}
mlir/test/Integration/GPU/LevelZero/… (new file, 86 lines)
@@ -0,0 +1,86 @@
// RUN: mlir-opt %s -pass-pipeline='builtin.module(spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,reconcile-unrealized-casts)' \
// RUN: | mlir-runner \
// RUN:   --shared-libs=%mlir_levelzero_runtime \
// RUN:   --shared-libs=%mlir_runner_utils \
// RUN:   --entry-point-result=void \
// RUN: | FileCheck %s

module @relu attributes {gpu.container_module} {
  memref.global "private" constant @__constant_4x5xf32 : memref<4x5xf32> = dense<[
    [-1.000000e-01, -2.000000e-01, -3.000000e-01, 4.000000e-01, 5.000000e-01],
    [1.000000e-01, -2.000000e-01, 3.000000e-01, -4.000000e-01, 5.000000e-01],
    [1.000000e-01, 2.000000e-01, 3.000000e-01, -4.000000e-01, -5.000000e-01],
    [1.000000e-01, 2.000000e-01, 3.000000e-01, 4.000000e-01, 5.000000e-01]
  ]>

  func.func @main() {
    %c1 = arith.constant 1 : index
    %c100 = arith.constant 100 : index
    %c0 = arith.constant 0 : index
    %0 = memref.get_global @__constant_4x5xf32 : memref<4x5xf32>

    scf.for %arg0 = %c0 to %c100 step %c1 {
      %1 = func.call @test(%0) : (memref<4x5xf32>) -> memref<4x5xf32>
      %cast = memref.cast %1 : memref<4x5xf32> to memref<*xf32>
      func.call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
      // CHECK: [0, 0, 0, 0.4, 0.5],
      // CHECK: [0.1, 0, 0.3, 0, 0.5],
      // CHECK: [0.1, 0.2, 0.3, 0, 0],
      // CHECK: [0.1, 0.2, 0.3, 0.4, 0.5]
    }
    return
  }

  func.func private @printMemrefF32(memref<*xf32>)
  func.func @test(%arg0: memref<4x5xf32>) -> memref<4x5xf32> {
    %c5 = arith.constant 5 : index
    %c4 = arith.constant 4 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c1 = arith.constant 1 : index
    %memref = gpu.alloc host_shared () : memref<4x5xf32>
    memref.copy %arg0, %memref : memref<4x5xf32> to memref<4x5xf32>
    %memref_0 = gpu.alloc host_shared () : memref<4x5xi1>
    %2 = gpu.wait async
    %3 = gpu.launch_func async [%2] @test_kernel::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1)
        args(%memref : memref<4x5xf32>, %cst : f32, %memref_0 : memref<4x5xi1>)
    gpu.wait [%3]
    %memref_1 = gpu.alloc host_shared () : memref<4x5xf32>
    %4 = gpu.wait async
    %5 = gpu.launch_func async [%4] @test_kernel_0::@test_kernel blocks in (%c4, %c5, %c1) threads in (%c1, %c1, %c1)
        args(%memref_0 : memref<4x5xi1>, %memref : memref<4x5xf32>, %cst : f32,
             %memref_1 : memref<4x5xf32>)
    gpu.wait [%5]
    %alloc = memref.alloc() : memref<4x5xf32>
    memref.copy %memref_1, %alloc : memref<4x5xf32> to memref<4x5xf32>
    %6 = gpu.wait async
    %7 = gpu.dealloc async [%6] %memref_1 : memref<4x5xf32>
    %8 = gpu.dealloc async [%7] %memref_0 : memref<4x5xi1>
    %9 = gpu.dealloc async [%8] %memref : memref<4x5xf32>
    return %alloc : memref<4x5xf32>
  }
  gpu.module @test_kernel
      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @test_kernel(%arg0: memref<4x5xf32>, %arg1: f32, %arg2: memref<4x5xi1>) kernel
        attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
      %0 = gpu.block_id x
      %1 = gpu.block_id y
      %2 = memref.load %arg0[%0, %1] : memref<4x5xf32>
      %3 = arith.cmpf olt, %2, %arg1 : f32
      memref.store %3, %arg2[%0, %1] : memref<4x5xi1>
      gpu.return
    }
  }
  gpu.module @test_kernel_0
      attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Int8, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @test_kernel(%arg0: memref<4x5xi1>, %arg1: memref<4x5xf32>, %arg2: f32, %arg3: memref<4x5xf32>) kernel
        attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 4, 5, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
      %0 = gpu.block_id x
      %1 = gpu.block_id y
      %2 = memref.load %arg0[%0, %1] : memref<4x5xi1>
      %3 = memref.load %arg1[%0, %1] : memref<4x5xf32>
      %4 = arith.select %2, %arg2, %3 : f32
      memref.store %4, %arg3[%0, %1] : memref<4x5xf32>
      gpu.return
    }
  }
}
mlir/test/Integration/GPU/LevelZero/lit.local.cfg (new file, 2 lines)
@@ -0,0 +1,2 @@
if not config.enable_levelzero_runner:
    config.unsupported = True
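With the runner enabled, the new tests run through the usual lit entry points; the lit.local.cfg above marks them unsupported otherwise. Illustrative invocations from an LLVM build directory (paths are placeholders):

  ninja check-mlir
  bin/llvm-lit -sv tools/mlir/test/Integration/GPU/LevelZero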
mlir/test/lit.cfg.py
@@ -224,6 +224,9 @@ if config.enable_cuda_runner:
 if config.enable_sycl_runner:
     tools.extend([add_runtime("mlir_sycl_runtime")])

+if config.enable_levelzero_runner:
+    tools.extend([add_runtime("mlir_levelzero_runtime")])
+
 if config.enable_spirv_cpu_runner:
     tools.extend([add_runtime("mlir_spirv_cpu_runtime")])
mlir/test/lit.site.cfg.py.in
@@ -34,6 +34,7 @@ config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
 config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
 config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
+config.enable_levelzero_runner = @MLIR_ENABLE_LEVELZERO_RUNNER@
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
 config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@