[MLIR][GPU][XeVM] Add XeVM target and XeVM dialect integration tests. (#148286)
As part of XeVM dialect upstreaming, this covers the remaining pieces required for XeVM dialect integration and testing. It has two high-level components:
- XeVM target and serialization support
- XeVM dialect integration tests using the Level Zero runtime

Co-authored-by: Artem Kroviakov <artem.kroviakov@intel.com>
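End to end, the new integration tests drive a gpu.module through XeVM conversion, serialization, and execution on the Level Zero runtime. A representative invocation, assembled from the RUN lines of the tests added below (input.mlir is a placeholder; %mlir_levelzero_runtime and friends are lit substitutions for the runtime wrapper libraries):

    mlir-opt input.mlir \
      -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
    | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
    | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
    | mlir-runner --shared-libs=%mlir_levelzero_runtime --shared-libs=%mlir_runner_utils \
        --shared-libs=%mlir_c_runner_utils --entry-point-result=void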
This commit is contained in:
parent 06d2d1e156
commit baae949f19
@ -137,6 +137,14 @@ else()
  set(MLIR_ENABLE_ROCM_CONVERSIONS 0)
endif()

# Build the XeVM conversions and run the corresponding tests if the SPIRV
# backend is available.
if ("SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
  set(MLIR_ENABLE_XEVM_CONVERSIONS 1)
else()
  set(MLIR_ENABLE_XEVM_CONVERSIONS 0)
endif()

set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the MLIR CUDA runner")
set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the MLIR ROCm runner")
set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the MLIR SYCL runner")
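For instance, a developer build that enables these conversions and the Level Zero-based tests could be configured along these lines (a sketch; any LLVM_TARGETS_TO_BUILD list containing SPIRV works, and the X86 host target is illustrative):

    cmake -G Ninja ../llvm \
      -DLLVM_ENABLE_PROJECTS=mlir \
      -DLLVM_TARGETS_TO_BUILD="X86;SPIRV" \
      -DMLIR_ENABLE_LEVELZERO_RUNNER=1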
30
mlir/include/mlir/Target/LLVM/XeVM/Target.h
Normal file
@ -0,0 +1,30 @@
//===-- Target.h - MLIR XeVM target registration ----------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides registration calls for attaching the XeVM target interface.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_TARGET_LLVM_XEVM_TARGET_H
#define MLIR_TARGET_LLVM_XEVM_TARGET_H

namespace mlir {
class DialectRegistry;
class MLIRContext;
namespace xevm {
/// Registers the `TargetAttrInterface` for the `#xevm.target` attribute in
/// the given registry.
void registerXeVMTargetInterfaceExternalModels(mlir::DialectRegistry &registry);

/// Registers the `TargetAttrInterface` for the `#xevm.target` attribute in
/// the registry associated with the given context.
void registerXeVMTargetInterfaceExternalModels(mlir::MLIRContext &context);
} // namespace xevm
} // namespace mlir

#endif // MLIR_TARGET_LLVM_XEVM_TARGET_H
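A minimal usage sketch for these two entry points (client code, not part of the patch; the context overload below is implemented in terms of the registry one, as Target.cpp shows):

    #include "mlir/IR/DialectRegistry.h"
    #include "mlir/IR/MLIRContext.h"
    #include "mlir/Target/LLVM/XeVM/Target.h"

    void setUpContext(mlir::MLIRContext &context) {
      mlir::DialectRegistry registry;
      mlir::xevm::registerXeVMTargetInterfaceExternalModels(registry);
      context.appendDialectRegistry(registry);
      // Equivalently, register directly on the context:
      // mlir::xevm::registerXeVMTargetInterfaceExternalModels(context);
    }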
62
mlir/include/mlir/Target/LLVM/XeVM/Utils.h
Normal file
@ -0,0 +1,62 @@
//===-- Utils.h - MLIR XeVM target utils ------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares XeVM target related utility classes and functions.
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_TARGET_LLVM_XEVM_UTILS_H
#define MLIR_TARGET_LLVM_XEVM_UTILS_H

#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/IR/Attributes.h"
#include "mlir/Target/LLVM/ModuleToObject.h"

namespace mlir {
namespace xevm {

/// Base class for all XeVM serializations from GPU modules into binary
/// strings. By default this class serializes into LLVM bitcode.
class SerializeGPUModuleBase : public LLVM::ModuleToObject {
public:
  SerializeGPUModuleBase(Operation &module, XeVMTargetAttr target,
                         const gpu::TargetOptions &targetOptions = {});

  /// Returns the target attribute.
  XeVMTargetAttr getTarget() const;

  /// Loads the bitcode files in `librariesToLink`.
  std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
  loadBitcodeFiles(llvm::Module &module) override;

  /// Returns the gpu module being serialized.
  gpu::GPUModuleOp getGPUModuleOp();

  /// Compiles to native code using `ocloc`.
  std::optional<SmallVector<char, 0>> compileToBinary(const std::string &asmStr,
                                                      StringRef inputFormat);

protected:
  /// XeVM target attribute.
  XeVMTargetAttr xeTarget;

  /// List of LLVM bitcode to link in after translation to LLVM IR.
  /// The attributes can be a StringAttr pointing to a file path, or
  /// a resource blob pointing to the LLVM bitcode in memory.
  SmallVector<Attribute> librariesToLink;

  /// Returns the path to the tool used for serialization.
  std::optional<std::string> findTool(StringRef tool);

  /// GPU compilation target options.
  gpu::TargetOptions targetOptions;
};
} // namespace xevm
} // namespace mlir

#endif // MLIR_TARGET_LLVM_XEVM_UTILS_H
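Concrete serializers derive from this base and override moduleToObject; the commit's SPIRVSerializer in Target.cpp follows exactly this pattern. A bare-bones sketch (the class name is hypothetical; only the override pattern comes from the source):

    class MySerializer : public mlir::xevm::SerializeGPUModuleBase {
    public:
      using SerializeGPUModuleBase::SerializeGPUModuleBase;

      // Falling back to the base class serializes the module to LLVM bitcode.
      std::optional<llvm::SmallVector<char, 0>>
      moduleToObject(llvm::Module &llvmModule) override {
        return SerializeGPUModuleBase::moduleToObject(llvmModule);
      }
    };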
@ -17,6 +17,7 @@
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Target/LLVM/XeVM/Target.h"
#include "llvm/Support/Regex.h"

namespace mlir {
@ -102,6 +102,7 @@
#include "mlir/Interfaces/CastInterfaces.h"
#include "mlir/Target/LLVM/NVVM/Target.h"
#include "mlir/Target/LLVM/ROCDL/Target.h"
#include "mlir/Target/LLVM/XeVM/Target.h"
#include "mlir/Target/SPIRV/Target.h"

/// Add all the MLIR dialects to the provided registry.
@ -199,6 +200,7 @@ void mlir::registerAllDialects(DialectRegistry &registry) {
  NVVM::registerNVVMTargetInterfaceExternalModels(registry);
  ROCDL::registerROCDLTargetInterfaceExternalModels(registry);
  spirv::registerSPIRVTargetInterfaceExternalModels(registry);
  xevm::registerXeVMTargetInterfaceExternalModels(registry);
}

/// Append all the MLIR dialects to the registry contained in the given context.
@ -58,6 +58,7 @@
#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"

/// This function may be called to register all MLIR dialect extensions with the
/// provided registry.
@ -210,3 +210,26 @@ if(MLIR_ENABLE_ROCM_CONVERSIONS)
  )
endif()

if ("SPIRV" IN_LIST LLVM_TARGETS_TO_BUILD)
  set(SPIRV_LIBS
    SPIRVCodeGen
  )
endif()

add_mlir_dialect_library(MLIRXeVMTarget
  XeVM/Target.cpp

  OBJECT

  LINK_COMPONENTS
  ${SPIRV_LIBS}

  LINK_LIBS PUBLIC
  MLIRIR
  MLIRExecutionEngineUtils
  MLIRSupport
  MLIRGPUDialect
  MLIRTargetLLVM
  MLIRXeVMToLLVMIRTranslation
)
418
mlir/lib/Target/LLVM/XeVM/Target.cpp
Normal file
@ -0,0 +1,418 @@
//===- Target.cpp - MLIR LLVM XeVM target compilation -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines XeVM target related functions including registration
// calls for the `#xevm.target` compilation attribute.
//
//===----------------------------------------------------------------------===//

#include "mlir/Target/LLVM/XeVM/Target.h"

#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/XeVMDialect.h"
#include "mlir/IR/BuiltinAttributeInterfaces.h"
#include "mlir/IR/BuiltinDialect.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/DialectResourceBlobManager.h"
#include "mlir/Target/LLVM/XeVM/Utils.h"
#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Dialect/XeVM/XeVMToLLVMIRTranslation.h"
#include "mlir/Target/LLVMIR/Export.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Target/TargetMachine.h"

#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/Config/Targets.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Support/raw_ostream.h"

#include <cstdint>
#include <cstdlib>

using namespace mlir;
using namespace mlir::xevm;

namespace {
// XeVM implementation of the gpu::TargetAttrInterface.
class XeVMTargetAttrImpl
    : public gpu::TargetAttrInterface::FallbackModel<XeVMTargetAttrImpl> {
public:
  std::optional<SmallVector<char, 0>>
  serializeToObject(Attribute attribute, Operation *module,
                    const gpu::TargetOptions &options) const;

  Attribute createObject(Attribute attribute, Operation *module,
                         const SmallVector<char, 0> &object,
                         const gpu::TargetOptions &options) const;
};
} // namespace

void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
    DialectRegistry &registry) {
  registry.addExtension(+[](MLIRContext *ctx, XeVMDialect *dialect) {
    XeVMTargetAttr::attachInterface<XeVMTargetAttrImpl>(*ctx);
  });
}

void mlir::xevm::registerXeVMTargetInterfaceExternalModels(
    MLIRContext &context) {
  DialectRegistry registry;
  registerXeVMTargetInterfaceExternalModels(registry);
  context.appendDialectRegistry(registry);
}

SerializeGPUModuleBase::SerializeGPUModuleBase(
    Operation &module, XeVMTargetAttr xeTarget,
    const gpu::TargetOptions &targetOptions)
    : ModuleToObject(module, xeTarget.getTriple(), "", {}, xeTarget.getO()),
      xeTarget(xeTarget), librariesToLink(targetOptions.getLibrariesToLink()),
      targetOptions(targetOptions) {
  if (xeTarget.getLinkFiles())
    librariesToLink.append(xeTarget.getLinkFiles().begin(),
                           xeTarget.getLinkFiles().end());
}

XeVMTargetAttr SerializeGPUModuleBase::getTarget() const { return xeTarget; }

std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
  if (librariesToLink.empty())
    return SmallVector<std::unique_ptr<llvm::Module>>();
  SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
  if (failed(loadBitcodeFilesFromList(module.getContext(), librariesToLink,
                                      bcFiles)))
    return std::nullopt;
  return std::move(bcFiles);
}

gpu::GPUModuleOp SerializeGPUModuleBase::getGPUModuleOp() {
  return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
}

// There is one way to finalize IL to native code: IGC.
// There are two ways to access IGC: AOT (`ocloc`) and JIT (the Level Zero
// runtime).
// - The Level Zero runtime consumes IL and is external to the MLIR codebase
//   (runtime wrappers).
// - The `ocloc` tool can be invoked from within MLIR.
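// For reference, the command assembled below has this shape (file names and
// device are illustrative; the real values come from the temp files and the
// target's `chip` attribute):
//
//   ocloc compile -file <basename>.asm <inputFormat> -device <chip> \
//     -output <basename> -output_no_suffix -options "<tokenized cmd opts>"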
std::optional<SmallVector<char, 0>>
SerializeGPUModuleBase::compileToBinary(const std::string &asmStr,
                                        StringRef inputFormat) {
  using TmpFile = std::pair<llvm::SmallString<128>, llvm::FileRemover>;
  // Find the `ocloc` tool.
  std::optional<std::string> oclocCompiler = findTool("ocloc");
  if (!oclocCompiler)
    return std::nullopt;
  Location loc = getGPUModuleOp().getLoc();
  std::string basename = llvm::formatv(
      "mlir-{0}-{1}-{2}", getGPUModuleOp().getNameAttr().getValue(),
      getTarget().getTriple(), getTarget().getChip());

  auto createTemp = [&](StringRef name,
                        StringRef suffix) -> std::optional<TmpFile> {
    llvm::SmallString<128> filePath;
    if (auto ec = llvm::sys::fs::createTemporaryFile(name, suffix, filePath)) {
      getGPUModuleOp().emitError()
          << "Couldn't create the temp file: `" << filePath
          << "`, error message: " << ec.message();
      return std::nullopt;
    }
    return TmpFile(filePath, llvm::FileRemover(filePath.c_str()));
  };
  // Create the temp files.
  std::optional<TmpFile> asmFile = createTemp(basename, "asm");
  std::optional<TmpFile> binFile = createTemp(basename, "");
  std::optional<TmpFile> logFile = createTemp(basename, "log");
  if (!logFile || !asmFile || !binFile)
    return std::nullopt;
  // Dump the assembly to a temp file.
  std::error_code ec;
  {
    llvm::raw_fd_ostream asmStream(asmFile->first, ec);
    if (ec) {
      emitError(loc) << "Couldn't open the file: `" << asmFile->first
                     << "`, error message: " << ec.message();
      return std::nullopt;
    }
    asmStream << asmStr;
    if (asmStream.has_error()) {
      emitError(loc) << "An error occurred while writing the assembly to: `"
                     << asmFile->first << "`.";
      return std::nullopt;
    }
    asmStream.flush();
  }
  // Set the command-line options.
  std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> cmdOpts =
      targetOptions.tokenizeCmdOptions();
  // Example: --gpu-module-to-binary="opts='opt1 opt2'"
  const std::string cmdOptsStr = "\"" + llvm::join(cmdOpts.second, " ") + "\"";
  SmallVector<StringRef, 12> oclocArgs(
      {"ocloc", "compile", "-file", asmFile->first, inputFormat, "-device",
       getTarget().getChip(), "-output", binFile->first, "-output_no_suffix",
       "-options", cmdOptsStr});

  // Dump the tool invocation commands.
#define DEBUG_TYPE "serialize-to-binary"
  LLVM_DEBUG({
    llvm::dbgs() << "Tool invocation for module: "
                 << getGPUModuleOp().getNameAttr() << "\n";
    llvm::interleave(oclocArgs, llvm::dbgs(), " ");
    llvm::dbgs() << "\n";
  });
#undef DEBUG_TYPE
  // Helper function for printing tool error logs.
  std::string message;
  auto emitLogError =
      [&](StringRef toolName) -> std::optional<SmallVector<char, 0>> {
    if (message.empty()) {
      llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> toolStderr =
          llvm::MemoryBuffer::getFile(logFile->first);
      if (toolStderr)
        emitError(loc) << toolName << " invocation failed. Log:\n"
                       << toolStderr->get()->getBuffer();
      else
        emitError(loc) << toolName << " invocation failed.";
      return std::nullopt;
    }
    emitError(loc) << toolName
                   << " invocation failed, error message: " << message;
    return std::nullopt;
  };
  std::optional<StringRef> redirects[] = {
      std::nullopt,
      logFile->first,
      logFile->first,
  };
  // Invoke ocloc.
  if (llvm::sys::ExecuteAndWait(oclocCompiler.value(), oclocArgs, std::nullopt,
                                redirects, 0, 0, &message))
    return emitLogError("`ocloc`");
  binFile->first.append(".bin");
  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> binaryBuffer =
      llvm::MemoryBuffer::getFile(binFile->first);
  if (!binaryBuffer) {
    emitError(loc) << "Couldn't open the file: `" << binFile->first
                   << "`, error message: " << binaryBuffer.getError().message();
    return std::nullopt;
  }
  StringRef bin = (*binaryBuffer)->getBuffer();
  return SmallVector<char, 0>(bin.begin(), bin.end());
}

std::optional<std::string> SerializeGPUModuleBase::findTool(StringRef tool) {
  // 1. Check the toolkit path given on the command line.
  StringRef pathRef = targetOptions.getToolkitPath();
  SmallVector<char, 256> path;
  if (!pathRef.empty()) {
    path.insert(path.begin(), pathRef.begin(), pathRef.end());
    llvm::sys::path::append(path, "bin", tool);
    if (llvm::sys::fs::can_execute(path))
      return StringRef(path.data(), path.size()).str();
  }
  // 2. Check PATH.
  if (std::optional<std::string> toolPath =
          llvm::sys::Process::FindInEnvPath("PATH", tool))
    return *toolPath;

  getGPUModuleOp().emitError()
      << "Couldn't find the `" << tool
      << "` binary. Please specify the toolkit "
         "path via GpuModuleToBinaryPass or add the compiler to $PATH.";
  return std::nullopt;
}
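// Note on findTool's lookup order: users can point the serializer at an ocloc
// installation via the toolkit path, which GpuModuleToBinaryPass exposes as a
// pass option; schematically (installation path hypothetical):
//
//   mlir-opt ... --gpu-module-to-binary="toolkit=/opt/intel/ocloc"
//
// The tool is then expected at `<toolkit>/bin/ocloc`; otherwise $PATH is
// searched.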
namespace {
class SPIRVSerializer : public SerializeGPUModuleBase {
public:
  SPIRVSerializer(Operation &module, XeVMTargetAttr xeTarget,
                  const gpu::TargetOptions &targetOptions)
      : SerializeGPUModuleBase(module, xeTarget, targetOptions) {}

  static void init();

  /// Serializes the LLVM module to an object format, depending on the
  /// compilation target selected in target options.
  std::optional<SmallVector<char, 0>>
  moduleToObject(llvm::Module &llvmModule) override;

private:
  /// Translates the LLVM module to SPIR-V binary using LLVM's
  /// SPIR-V target.
  std::optional<std::string>
  translateToSPIRVBinary(llvm::Module &llvmModule,
                         llvm::TargetMachine &targetMachine);
};
} // namespace

void SPIRVSerializer::init() {
  static llvm::once_flag initializeBackendOnce;
  llvm::call_once(initializeBackendOnce, []() {
#if LLVM_HAS_SPIRV_TARGET
    LLVMInitializeSPIRVTarget();
    LLVMInitializeSPIRVTargetInfo();
    LLVMInitializeSPIRVTargetMC();
    LLVMInitializeSPIRVAsmPrinter();
#endif
  });
}

std::optional<SmallVector<char, 0>>
SPIRVSerializer::moduleToObject(llvm::Module &llvmModule) {
#define DEBUG_TYPE "serialize-to-llvm"
  LLVM_DEBUG({
    llvm::dbgs() << "LLVM IR for module: " << getGPUModuleOp().getNameAttr()
                 << "\n";
    llvm::dbgs() << llvmModule << "\n";
    llvm::dbgs().flush();
  });
#undef DEBUG_TYPE

  // Return LLVM IR if the compilation target is `offload`.
  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
    return SerializeGPUModuleBase::moduleToObject(llvmModule);

#if !LLVM_HAS_SPIRV_TARGET
  getGPUModuleOp()->emitError("The `SPIRV` target was not built. Please enable "
                              "it when building LLVM.");
  return std::nullopt;
#endif // LLVM_HAS_SPIRV_TARGET

  std::optional<llvm::TargetMachine *> targetMachine =
      getOrCreateTargetMachine();
  if (!targetMachine) {
    getGPUModuleOp().emitError() << "Target Machine unavailable for triple "
                                 << triple << ", can't optimize with LLVM\n";
    return std::nullopt;
  }

  // Return SPIR-V assembly if the compilation target is `assembly`.
  if (targetOptions.getCompilationTarget() ==
      gpu::CompilationTarget::Assembly) {
    std::optional<std::string> serializedISA =
        translateToISA(llvmModule, **targetMachine);
    if (!serializedISA) {
      getGPUModuleOp().emitError() << "Failed translating the module to ISA."
                                   << triple << ", can't compile with LLVM\n";
      return std::nullopt;
    }

#define DEBUG_TYPE "serialize-to-isa"
    LLVM_DEBUG({
      llvm::dbgs() << "SPIR-V for module: " << getGPUModuleOp().getNameAttr()
                   << "\n";
      llvm::dbgs() << *serializedISA << "\n";
      llvm::dbgs().flush();
    });
#undef DEBUG_TYPE

    // Make sure to include the null terminator.
    StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
    return SmallVector<char, 0>(bin.begin(), bin.end());
  }

  // The Level Zero runtime is set up to accept SPIR-V binary;
  // translateToSPIRVBinary translates the LLVM module to SPIR-V binary
  // using LLVM's SPIR-V target.
  // compileToBinary can be used in the future if the Level Zero runtime
  // implementation switches to the native XeVM binary format.
  std::optional<std::string> serializedSPIRVBinary =
      translateToSPIRVBinary(llvmModule, **targetMachine);
  if (!serializedSPIRVBinary) {
    getGPUModuleOp().emitError() << "Failed translating the module to Binary.";
    return std::nullopt;
  }
  if (serializedSPIRVBinary->size() % 4) {
    getGPUModuleOp().emitError() << "SPIRV code size must be a multiple of 4.";
    return std::nullopt;
  }
  StringRef bin(serializedSPIRVBinary->c_str(), serializedSPIRVBinary->size());
  return SmallVector<char, 0>(bin.begin(), bin.end());
}

std::optional<std::string>
SPIRVSerializer::translateToSPIRVBinary(llvm::Module &llvmModule,
                                        llvm::TargetMachine &targetMachine) {
  std::string targetISA;
  llvm::raw_string_ostream stream(targetISA);

  { // Drop pstream after this to prevent the ISA from being stuck buffering.
    llvm::buffer_ostream pstream(stream);
    llvm::legacy::PassManager codegenPasses;
    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
                                          llvm::CodeGenFileType::ObjectFile))
      return std::nullopt;

    codegenPasses.run(llvmModule);
  }
  return targetISA;
}

std::optional<SmallVector<char, 0>>
XeVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
                                      const gpu::TargetOptions &options) const {
  if (!module)
    return std::nullopt;
  auto gpuMod = dyn_cast<gpu::GPUModuleOp>(module);
  if (!gpuMod) {
    module->emitError("expected to be a gpu.module op");
    return std::nullopt;
  }
  auto xeTarget = cast<XeVMTargetAttr>(attribute);
  if (xeTarget.getTriple().starts_with("spirv")) {
    gpuMod.walk([&](LLVM::LLVMFuncOp funcOp) {
      if (funcOp->hasAttr(gpu::GPUDialect::getKernelFuncAttrName())) {
        funcOp.setIntelReqdSubGroupSize(16);
        return WalkResult::interrupt();
      }
      return WalkResult::advance();
    });

    SPIRVSerializer serializer(*module, cast<XeVMTargetAttr>(attribute),
                               options);
    serializer.init();

#if !LLVM_HAS_SPIRV_TARGET
    module->emitError("Cannot run `TargetRegistry::lookupTarget()` for SPIRV "
                      "without having the target built.");
#endif

    return serializer.run();
  }
  module->emitError("Unsupported XeVM target triple: ") << xeTarget.getTriple();
  return std::nullopt;
}

Attribute
XeVMTargetAttrImpl::createObject(Attribute attribute, Operation *module,
                                 const SmallVector<char, 0> &object,
                                 const gpu::TargetOptions &options) const {
  Builder builder(attribute.getContext());
  gpu::CompilationTarget format = options.getCompilationTarget();
  auto xeTarget = cast<XeVMTargetAttr>(attribute);
  SmallVector<NamedAttribute, 2> properties;
  if (format == gpu::CompilationTarget::Assembly)
    properties.push_back(
        builder.getNamedAttr("O", builder.getI32IntegerAttr(xeTarget.getO())));

  DictionaryAttr objectProps;
  if (!properties.empty())
    objectProps = builder.getDictionaryAttr(properties);

  return builder.getAttr<gpu::ObjectAttr>(
      attribute, format,
      builder.getStringAttr(StringRef(object.data(), object.size())),
      objectProps, /*kernels=*/nullptr);
}
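After gpu-module-to-binary runs, createObject wraps the serialized bytes in a gpu::ObjectAttr attached to a gpu.binary op. Schematically (a sketch only; the target parameters and the object payload are placeholders):

    gpu.binary @kernel [#gpu.object<#xevm.target<...>, "<SPIR-V blob>">]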
4
mlir/test/Integration/Dialect/XeVM/GPU/lit.local.cfg
Normal file
@ -0,0 +1,4 @@
if not config.run_xevm_tests:
    config.unsupported = True
if not config.enable_levelzero_runner:
    config.unsupported = True
146
mlir/test/Integration/Dialect/XeVM/GPU/xevm_block_dpas.mlir
Normal file
@ -0,0 +1,146 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
// RUN: | mlir-runner \
// RUN:   --shared-libs=%mlir_levelzero_runtime \
// RUN:   --shared-libs=%mlir_runner_utils \
// RUN:   --shared-libs=%mlir_c_runner_utils \
// RUN:   --entry-point-result=void \
// RUN: | FileCheck %s

module @gemm attributes {gpu.container_module} {
  gpu.module @kernel {
    // - The available `matrix_mad` intrinsics can differ based on the device's *minimum*
    //   supported sub-group size; that minimum size should be used when calling them.
    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_matrix_multiply_accumulate.html

    gpu.func @block_dpas(%a: !llvm.ptr<1>, %b: !llvm.ptr<1>, %c: !llvm.ptr<1>) kernel {
      %base_width_a = arith.constant 32 : i32
      %base_height_a = arith.constant 8 : i32
      %base_pitch_a = arith.constant 32 : i32
      %x = arith.constant 0 : i32
      %y = arith.constant 0 : i32
      %loaded_a = xevm.blockload2d %a, %base_width_a, %base_height_a, %base_pitch_a, %x, %y
        <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32,
          transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi16>

      %base_width_b = arith.constant 32 : i32
      %base_height_b = arith.constant 16 : i32
      %base_pitch_b = arith.constant 32 : i32
      %loaded_b1 = xevm.blockload2d %b, %base_width_b, %base_height_b, %base_pitch_b, %x, %y
        <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32,
          transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
      %loaded_b_casted = vector.bitcast %loaded_b1 : vector<16xi16> to vector<8xi32>

      %base_width_c = arith.constant 64 : i32
      %base_height_c = arith.constant 8 : i32
      %base_pitch_c = arith.constant 64 : i32
      %loaded_c = xevm.blockload2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y
        <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32,
          transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>

      %loaded_c_casted = vector.bitcast %loaded_c : vector<8xi32> to vector<8xf32>
      %c_result = xevm.mma %loaded_a, %loaded_b_casted, %loaded_c_casted
        {shape=<m=8, n=16, k=16>, types=<d=f32, a=f16, b=f16, c=f32>}
        : (vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32>
      %c_result_casted = vector.bitcast %c_result : vector<8xf32> to vector<8xi32>

      xevm.blockstore2d %c, %base_width_c, %base_height_c, %base_pitch_c, %x, %y, %c_result_casted
        <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}>
        : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
      gpu.return
    }
  }

  func.func @test(%a : memref<8x16xf16>, %b : memref<16x16xf16>, %c : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index

    %memref_a = gpu.alloc() : memref<8x16xf16>
    gpu.memcpy %memref_a, %a : memref<8x16xf16>, memref<8x16xf16>
    %a_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_a : memref<8x16xf16> -> index
    %a_ptr_as_i64 = arith.index_cast %a_ptr_as_idx : index to i64
    %a_ptr = llvm.inttoptr %a_ptr_as_i64 : i64 to !llvm.ptr
    %a_ptr_casted = llvm.addrspacecast %a_ptr : !llvm.ptr to !llvm.ptr<1>

    %memref_b = gpu.alloc() : memref<16x16xf16>
    gpu.memcpy %memref_b, %b : memref<16x16xf16>, memref<16x16xf16>
    %b_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_b : memref<16x16xf16> -> index
    %b_ptr_as_i64 = arith.index_cast %b_ptr_as_idx : index to i64
    %b_ptr = llvm.inttoptr %b_ptr_as_i64 : i64 to !llvm.ptr
    %b_ptr_casted = llvm.addrspacecast %b_ptr : !llvm.ptr to !llvm.ptr<1>

    %memref_c = gpu.alloc() : memref<8x16xf32>
    gpu.memcpy %memref_c, %c : memref<8x16xf32>, memref<8x16xf32>
    %c_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_c : memref<8x16xf32> -> index
    %c_ptr_as_i64 = arith.index_cast %c_ptr_as_idx : index to i64
    %c_ptr = llvm.inttoptr %c_ptr_as_i64 : i64 to !llvm.ptr
    %c_ptr_casted = llvm.addrspacecast %c_ptr : !llvm.ptr to !llvm.ptr<1>

    gpu.launch_func @kernel::@block_dpas blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1)
        args(%a_ptr_casted : !llvm.ptr<1>, %b_ptr_casted : !llvm.ptr<1>, %c_ptr_casted : !llvm.ptr<1>)
    gpu.dealloc %memref_a : memref<8x16xf16>
    gpu.dealloc %memref_b : memref<16x16xf16>
    %res = memref.alloc() : memref<8x16xf32>
    gpu.memcpy %res, %memref_c : memref<8x16xf32>, memref<8x16xf32>
    gpu.dealloc %memref_c : memref<8x16xf32>
    return %res : memref<8x16xf32>
  }

  func.func @main() attributes {llvm.emit_c_interface} {
    %A = memref.alloc() : memref<8x16xf16>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index

    scf.for %i = %c0 to %c8 step %c1 {
      scf.for %j = %c0 to %c16 step %c1 {
        %row_idx = arith.index_cast %i : index to i32
        %row = arith.sitofp %row_idx : i32 to f16
        memref.store %row, %A[%i, %j] : memref<8x16xf16>
      }
    }
    %B = memref.alloc() : memref<16x16xf16>
    scf.for %i = %c0 to %c16 step %c1 {
      scf.for %j = %c0 to %c16 step %c1 {
        %col_idx = arith.index_cast %j : index to i32
        %col = arith.sitofp %col_idx : i32 to f16
        memref.store %col, %B[%i, %j] : memref<16x16xf16>
      }
    }

    %C = memref.alloc() : memref<8x16xf32>
    %c0_f32 = arith.constant 0.0 : f32
    scf.for %i = %c0 to %c8 step %c1 {
      scf.for %j = %c0 to %c16 step %c1 {
        memref.store %c0_f32, %C[%i, %j] : memref<8x16xf32>
      }
    }

    %C_res = call @test(%A, %B, %C) : (memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>) -> memref<8x16xf32>
    %C_cast = memref.cast %C_res : memref<8x16xf32> to memref<*xf32>
    %A_cast = memref.cast %A : memref<8x16xf16> to memref<*xf16>
    call @printMemrefF32(%C_cast) : (memref<*xf32>) -> ()

    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
    // CHECK-NEXT: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    // CHECK-NEXT: [0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240]
    // CHECK-NEXT: [0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480]
    // CHECK-NEXT: [0, 48, 96, 144, 192, 240, 288, 336, 384, 432, 480, 528, 576, 624, 672, 720]
    // CHECK-NEXT: [0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960]
    // CHECK-NEXT: [0, 80, 160, 240, 320, 400, 480, 560, 640, 720, 800, 880, 960, 1040, 1120, 1200]
    // CHECK-NEXT: [0, 96, 192, 288, 384, 480, 576, 672, 768, 864, 960, 1056, 1152, 1248, 1344, 1440]
    // CHECK-NEXT: [0, 112, 224, 336, 448, 560, 672, 784, 896, 1008, 1120, 1232, 1344, 1456, 1568, 1680]

    memref.dealloc %A : memref<8x16xf16>
    memref.dealloc %B : memref<16x16xf16>
    memref.dealloc %C : memref<8x16xf32>
    memref.dealloc %C_res : memref<8x16xf32>
    return
  }
  func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface }
  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface }

}
@ -0,0 +1,109 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
// RUN: | mlir-runner \
// RUN:   --shared-libs=%mlir_levelzero_runtime \
// RUN:   --shared-libs=%mlir_runner_utils \
// RUN:   --shared-libs=%mlir_c_runner_utils \
// RUN:   --entry-point-result=void \
// RUN: | FileCheck %s

module @gemm attributes {gpu.container_module} {

  gpu.module @kernel {
    // - `cl_intel_subgroups` block load/store intrinsics operate at the *maximum* sub-group size,
    //   regardless of the active sub-group size. Make sure `clGetKernelSubGroupInfo` meets your expectations.
    // - The attribute `intel_reqd_sub_group_size` establishes the maximum sub-group size for a kernel.
    //
    // Note: launching 16 threads without an explicit `intel_reqd_sub_group_size = 16` may still use
    // the default sub-group size of 32.
    //
    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_required_subgroup_size.html
    // https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroups.html

    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel {
      %base_width = arith.constant 64 : i32 // bytewidth of the block
      %base_height = arith.constant 8 : i32 // number of rows
      %base_pitch = arith.constant 64 : i32 // bytewidth of the base row
      %x = arith.constant 0 : i32
      %y = arith.constant 0 : i32
      // If `intel_reqd_sub_group_size = 16` is not set, the default (32) is used and this `blockload2d`
      // would only load 4 elements into vector<8xi32>.
      %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y
        <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32, v_blocks=1 : i32,
          transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
      %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32>
      %c0 = arith.constant 0 : index
      %thread_x = gpu.thread_id x
      %thread_x_i64 = arith.index_cast %thread_x : index to i64
      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
      %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32
      %loaded_f32_modified = vector.insert %thread_x_f32, %loaded_f32[%c0] : f32 into vector<8xf32>
      %loaded_modified = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32>
      xevm.blockstore2d %dst, %base_width, %base_height, %base_pitch, %x, %y, %loaded_modified
        <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}>
        : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
      gpu.return
    }
  }

  func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
    %memref_src = gpu.alloc() : memref<8x16xf32>
    gpu.memcpy %memref_src, %src : memref<8x16xf32>, memref<8x16xf32>
    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<8x16xf32> -> index
    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>

    %memref_dst = gpu.alloc() : memref<8x16xf32>
    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index
    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>

    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1)
        args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
    gpu.dealloc %memref_src : memref<8x16xf32>
    %dst = memref.alloc() : memref<8x16xf32>
    gpu.memcpy %dst, %memref_dst : memref<8x16xf32>, memref<8x16xf32>
    gpu.dealloc %memref_dst : memref<8x16xf32>
    return %dst : memref<8x16xf32>
  }

  func.func @main() attributes {llvm.emit_c_interface} {
    %A = memref.alloc() : memref<8x16xf32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index
    %c11_f32 = arith.constant 11.11 : f32
    scf.for %i = %c0 to %c8 step %c1 {
      scf.for %j = %c0 to %c16 step %c1 {
        memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32>
      }
    }
    %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32>
    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
    %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32>
    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> ()
    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> ()

    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
    // CHECK-NEXT: [11.11{{.*}}]
    // CHECK-COUNT-96: 11.11
    // CHECK-NEXT: [11.11{{.*}}]

    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
    // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    // CHECK-COUNT-96: 11.11
    // CHECK-NEXT: [11.11{{.*}}]

    memref.dealloc %A : memref<8x16xf32>
    memref.dealloc %B : memref<8x16xf32>
    return
  }
  func.func private @printMemrefF32(%ptr : memref<*xf32>)
}
@ -0,0 +1,131 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
// RUN: | mlir-runner \
// RUN:   --shared-libs=%mlir_levelzero_runtime \
// RUN:   --shared-libs=%mlir_runner_utils \
// RUN:   --shared-libs=%mlir_c_runner_utils \
// RUN:   --entry-point-result=void \
// RUN: | FileCheck %s

module @gemm attributes {gpu.container_module} {
  gpu.module @kernel {
    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel {
      %base_width = arith.constant 32 : i32 // bytewidth of the block
      %base_height_load = arith.constant 16 : i32 // number of rows
      %base_pitch = arith.constant 32 : i32 // bytewidth of the base row
      %x = arith.constant 0 : i32
      %y = arith.constant 0 : i32

      // Consider the following two loads:
      // Normal load:
      %loaded = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y
        <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32,
          transpose=false, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
      %loaded_f16_flat = vector.bitcast %loaded : vector<16xi16> to vector<16xf16>
      %loaded_f16 = vector.shape_cast %loaded_f16_flat : vector<16xf16> to vector<8x1x2xf16>

      // Register packed load:
      %loaded_packed = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y
        <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=16 : i32, v_blocks=1 : i32,
          transpose=false, pack_register=true}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
      %loaded_packed_f16_flat = vector.bitcast %loaded_packed : vector<8xi32> to vector<16xf16>
      %loaded_packed_f16 = vector.shape_cast %loaded_packed_f16_flat : vector<16xf16> to vector<8x1x2xf16>
      // Both loads can be represented the same way in code as vector<16xf16>.
      // A normal load pads each value to a dword (e.g., 32 bits) when loading it into a register.
      // A packed load "packs" multiple sub-dword values along the column (↓), allowing a single
      // register to hold multiple values.
      // In SIMT, a work-item reads values along the column (↓), hence the sequence of values loaded
      // with register packing is logically equivalent to the sequence loaded by a normal load.
      // The results of both methods have the same logical representation but are expected to differ
      // in physical layout and register efficiency.

      %thread_x = gpu.thread_id x
      %thread_x_i64 = arith.index_cast %thread_x : index to i64
      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
      %thread_x_f16 = arith.sitofp %thread_x_i32 : i32 to f16
      // Either %loaded_packed_f16 or %loaded_f16 can be used here.
      %loaded_f16_modified = vector.insert %thread_x_f16, %loaded_packed_f16 [0, 0, 1] : f16 into vector<8x1x2xf16>
      // We can only store [1,2,4,8]x[16] shapes for f16, so we have to do 2 stores.
      %loaded_f16_modified_slice_0 = vector.extract_strided_slice %loaded_f16_modified
        {offsets = [0, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16>
      %loaded_f16_modified_slice_0_flat = vector.shape_cast %loaded_f16_modified_slice_0 : vector<4x1x2xf16> to vector<8xf16>
      %base_height_store = arith.constant 8 : i32 // number of rows
      %base_width_store = arith.constant 32 : i32 // bytewidth of the block
      %base_pitch_store = arith.constant 32 : i32 // bytewidth of the base row
      xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_0_flat
        <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>)

      %loaded_f16_modified_slice_1 = vector.extract_strided_slice %loaded_f16_modified
        {offsets = [4, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16>
      %loaded_f16_modified_slice_1_flat = vector.shape_cast %loaded_f16_modified_slice_1 : vector<4x1x2xf16> to vector<8xf16>

      %second_half_offset = arith.muli %base_pitch_store, %base_height_store : i32
      %second_half_ptr = llvm.getelementptr %dst[%second_half_offset] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8
      xevm.blockstore2d %second_half_ptr, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_1_flat
        <{elem_size_in_bits=16 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>)
      gpu.return
    }
  }

  func.func @test(%src : memref<16x16xf16>) -> memref<16x16xf16> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
    %memref_src = gpu.alloc() : memref<16x16xf16>
    gpu.memcpy %memref_src, %src : memref<16x16xf16>, memref<16x16xf16>
    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x16xf16> -> index
    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>

    %memref_dst = gpu.alloc() : memref<16x16xf16>
    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<16x16xf16> -> index
    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>

    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1)
        args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
    gpu.dealloc %memref_src : memref<16x16xf16>
    %dst = memref.alloc() : memref<16x16xf16>
    gpu.memcpy %dst, %memref_dst : memref<16x16xf16>, memref<16x16xf16>
    gpu.dealloc %memref_dst : memref<16x16xf16>
    return %dst : memref<16x16xf16>
  }

  func.func @main() attributes {llvm.emit_c_interface} {
    %A = memref.alloc() : memref<16x16xf16>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    %c11_f16 = arith.constant 11.1 : f16
    scf.for %i = %c0 to %c16 step %c1 {
      scf.for %j = %c0 to %c16 step %c1 {
        memref.store %c11_f16, %A[%i, %j] : memref<16x16xf16>
      }
    }
    %B = call @test(%A) : (memref<16x16xf16>) -> memref<16x16xf16>
    %B_cast = memref.cast %B : memref<16x16xf16> to memref<*xf16>
    %A_cast = memref.cast %A : memref<16x16xf16> to memref<*xf16>
    call @printMemrefF16(%A_cast) : (memref<*xf16>) -> ()
    call @printMemrefF16(%B_cast) : (memref<*xf16>) -> ()

    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
    // CHECK-NEXT: [11.1{{.*}}]
    // CHECK-COUNT-224: 11.1
    // CHECK-NEXT: [11.1{{.*}}]

    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
    // CHECK-NEXT: [11.1{{.*}}]
    // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    // CHECK-COUNT-208: 11.1
    // CHECK-NEXT: [11.1{{.*}}]

    memref.dealloc %A : memref<16x16xf16>
    memref.dealloc %B : memref<16x16xf16>
    return
  }
  func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface }
}
@ -0,0 +1,133 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
// RUN: | mlir-runner \
// RUN:   --shared-libs=%mlir_levelzero_runtime \
// RUN:   --shared-libs=%mlir_runner_utils \
// RUN:   --shared-libs=%mlir_c_runner_utils \
// RUN:   --entry-point-result=void \
// RUN: | FileCheck %s

module @gemm attributes {gpu.container_module} {
  gpu.module @kernel {
    gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel {
      %base_width = arith.constant 32 : i32 // bytewidth of the block
      %base_height = arith.constant 16 : i32 // number of rows
      %base_pitch = arith.constant 32 : i32 // bytewidth of the base row
      %x = arith.constant 0 : i32
      %y = arith.constant 0 : i32
      // Normally a work-item loads a vertical slice (↓); with *transpose*, a work-item
      // loads a horizontal slice (→).
      // The tile dimension we want to slice must be a multiple of the sub-group size:
      // e.g., to slice rows (→), we need SG_SIZE % tile_height == 0.
      %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y
        <{elem_size_in_bits=32 : i32, tile_width=8 : i32, tile_height=16 : i32, v_blocks=1 : i32,
          transpose=true, pack_register=false}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32>
      %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32>

      %c0 = arith.constant 0 : i32
      %thread_x = gpu.thread_id x
      %thread_x_i64 = arith.index_cast %thread_x : index to i64
      %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32
      %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32
      // Use this to see where thread ids end up stored.
      %loaded_f32_modified = vector.insert %thread_x_f32, %loaded_f32[7] : f32 into vector<8xf32>
      %loaded_f32_modified_1 = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32>

      %base_height_store = arith.constant 8 : i32 // number of rows
      %base_width_store = arith.constant 64 : i32 // bytewidth of the block
      %base_pitch_store = arith.constant 64 : i32 // bytewidth of the base row
      // "Transposed" stores are not available, so a work-item stores its vector as a vertical slice (↓).
      xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded
        <{elem_size_in_bits=32 : i32, tile_width=16 : i32, tile_height=8 : i32}> : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>)
      gpu.return
    }
  }

  func.func @test(%src : memref<16x8xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`)
    %memref_src = gpu.alloc() : memref<16x8xf32>
    gpu.memcpy %memref_src, %src : memref<16x8xf32>, memref<16x8xf32>
    %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x8xf32> -> index
    %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64
    %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr
    %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1>

    %memref_dst = gpu.alloc() : memref<8x16xf32>
    %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index
    %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64
    %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr
    %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1>

    gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1)
        args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>)
    gpu.dealloc %memref_src : memref<16x8xf32>
    %dst = memref.alloc() : memref<8x16xf32>
    gpu.memcpy %dst, %memref_dst : memref<8x16xf32>, memref<8x16xf32>
    gpu.dealloc %memref_dst : memref<8x16xf32>
    return %dst : memref<8x16xf32>
  }

  func.func @main() attributes {llvm.emit_c_interface} {
    %A = memref.alloc() : memref<16x8xf32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index
    scf.for %i = %c0 to %c16 step %c1 {
      scf.for %j = %c0 to %c8 step %c1 {
        %c_10_f = arith.constant 10.0 : f32
        %j_i64 = arith.index_cast %j : index to i64
        %j_i32 = llvm.trunc %j_i64 : i64 to i32
        %j_f32 = arith.sitofp %j_i32 : i32 to f32
        %jj = arith.divf %j_f32, %c_10_f : f32

        %i_i64 = arith.index_cast %i : index to i64
        %i_i32 = llvm.trunc %i_i64 : i64 to i32
        %i_f32 = arith.sitofp %i_i32 : i32 to f32
        %ii = arith.addf %i_f32, %jj : f32
        memref.store %ii, %A[%i, %j] : memref<16x8xf32>
      }
    }
    %B = call @test(%A) : (memref<16x8xf32>) -> memref<8x16xf32>
    %A_cast = memref.cast %A : memref<16x8xf32> to memref<*xf32>
    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> ()
    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
    // CHECK-NEXT: [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
    // CHECK-NEXT: [1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7],
    // CHECK-NEXT: [2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7],
    // CHECK-NEXT: [3, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7],
    // CHECK-NEXT: [4, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7],
    // CHECK-NEXT: [5, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7],
    // CHECK-NEXT: [6, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7],
    // CHECK-NEXT: [7, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7],
    // CHECK-NEXT: [8, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7],
    // CHECK-NEXT: [9, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7],
    // CHECK-NEXT: [10, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7],
    // CHECK-NEXT: [11, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7],
    // CHECK-NEXT: [12, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7],
    // CHECK-NEXT: [13, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7],
    // CHECK-NEXT: [14, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7],
    // CHECK-NEXT: [15, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7]

    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> ()
    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
    // CHECK-NEXT: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    // CHECK-NEXT: [0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1, 11.1, 12.1, 13.1, 14.1, 15.1],
    // CHECK-NEXT: [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 9.2, 10.2, 11.2, 12.2, 13.2, 14.2, 15.2],
    // CHECK-NEXT: [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 9.3, 10.3, 11.3, 12.3, 13.3, 14.3, 15.3],
    // CHECK-NEXT: [0.4, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4, 9.4, 10.4, 11.4, 12.4, 13.4, 14.4, 15.4],
    // CHECK-NEXT: [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5],
    // CHECK-NEXT: [0.6, 1.6, 2.6, 3.6, 4.6, 5.6, 6.6, 7.6, 8.6, 9.6, 10.6, 11.6, 12.6, 13.6, 14.6, 15.6],
    // CHECK-NEXT: [0.7, 1.7, 2.7, 3.7, 4.7, 5.7, 6.7, 7.7, 8.7, 9.7, 10.7, 11.7, 12.7, 13.7, 14.7, 15.7]

    memref.dealloc %A : memref<16x8xf32>
    memref.dealloc %B : memref<8x16xf32>
    return
  }
  func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface }
}
75
mlir/test/Integration/Dialect/XeVM/GPU/xevm_store_cst.mlir
Normal file
@ -0,0 +1,75 @@
// RUN: mlir-opt %s \
// RUN: | mlir-opt -pass-pipeline='builtin.module(cse,func.func(gpu-async-region),xevm-attach-target,gpu.module(convert-gpu-to-llvm-spv{use-64bit-index=true},convert-xevm-to-llvm,cse))' \
// RUN: | mlir-opt -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \
// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts -cse -gpu-module-to-binary \
// RUN: | mlir-runner \
// RUN:   --shared-libs=%mlir_levelzero_runtime \
// RUN:   --shared-libs=%mlir_runner_utils \
// RUN:   --shared-libs=%mlir_c_runner_utils \
// RUN:   --entry-point-result=void \
// RUN: | FileCheck %s

module @gemm attributes {gpu.container_module} {

  gpu.module @kernel {
    gpu.func @store_constant(%ptr: !llvm.ptr<1>) kernel {
      %const_val = arith.constant 42.0 : f32
      %thread_x = gpu.lane_id
      %thread_x_i64 = arith.index_cast %thread_x : index to i64
      %ptr_next_1 = llvm.getelementptr %ptr[%thread_x_i64] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, i32
      llvm.store %const_val, %ptr_next_1 : f32, !llvm.ptr<1>
      gpu.return
    }
  }
  func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} {
    %c1 = arith.constant 1 : index
    %c16 = arith.constant 16 : index
    %memref_0 = gpu.alloc() : memref<8x16xf32>
    gpu.memcpy %memref_0, %src : memref<8x16xf32>, memref<8x16xf32>
    %0 = memref.extract_aligned_pointer_as_index %memref_0 : memref<8x16xf32> -> index
    %1 = arith.index_cast %0 : index to i64
    %2 = llvm.inttoptr %1 : i64 to !llvm.ptr
    %src_casted = llvm.addrspacecast %2 : !llvm.ptr to !llvm.ptr<1>
    gpu.launch_func @kernel::@store_constant blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1)
        args(%src_casted : !llvm.ptr<1>)
    %dst = memref.alloc() : memref<8x16xf32>
    gpu.memcpy %dst, %memref_0 : memref<8x16xf32>, memref<8x16xf32>
    gpu.dealloc %memref_0 : memref<8x16xf32>

    return %dst : memref<8x16xf32>
  }

  func.func @main() attributes {llvm.emit_c_interface} {
    %A = memref.alloc() : memref<8x16xf32>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %c16 = arith.constant 16 : index
    %c11_f32 = arith.constant 11.11 : f32
    scf.for %i = %c0 to %c8 step %c1 {
      scf.for %j = %c0 to %c16 step %c1 {
        memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32>
      }
    }
    %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32>
    %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32>
    %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32>
    call @printMemrefF32(%A_cast) : (memref<*xf32>) -> ()
    call @printMemrefF32(%B_cast) : (memref<*xf32>) -> ()

    // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}}
    // CHECK-NEXT: [11.11{{.*}}]
    // CHECK-COUNT-96: 11.11
    // CHECK-NEXT: [11.11{{.*}}]

    // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}}
    // CHECK-NEXT: [42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42]
    // CHECK-COUNT-96: 11.11
    // CHECK-NEXT: [11.11{{.*}}]

    memref.dealloc %A : memref<8x16xf32>
    memref.dealloc %B : memref<8x16xf32>
    return
  }
  func.func private @printMemrefF32(%ptr : memref<*xf32>)
}
@ -30,6 +30,7 @@ set(LIBS
  MLIRVectorDialect
  MLIRVectorToLLVMPass
  MLIRXeVMDialect
  MLIRXeVMToLLVMIRTranslation
)

add_mlir_library(MLIRGPUTestPasses
@ -33,6 +33,7 @@ config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
config.run_xevm_tests = @MLIR_ENABLE_XEVM_CONVERSIONS@
config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
config.enable_levelzero_runner = @MLIR_ENABLE_LEVELZERO_RUNNER@
config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@