llvm-project/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp
Joseph Huber 049cfda67c
[LLVM] Port 'llvm-gpu-loader' to use LLVMOffload (#162739)
Summary:
This patch rewrites the `llvm-gpu-loader` utility to use the LLVMOffload
interface. This heavily simplifies it while re-using the already
existing support. Another benefit is that I can now easily do this
dynamically so we can always build this utility without needing to find
non-standard packages.

One known issue is tracked in
https://github.com/llvm/llvm-project/issues/159636: this will now take
extra time when both CUDA and ROCm are installed on the same machine.
This is only slightly annoying, since most people don't have both CUDA
and ROCm at the same time, so I don't consider it a blocker. I will work
on addressing it later.

The environment variable usage is slightly unfortunate; I will also expose
that better in the future.

Fixes: https://github.com/llvm/llvm-project/issues/132890
2026-02-24 08:44:29 -06:00

299 lines
10 KiB
C++

//===-- Main entry into the loader interface ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This utility is used to launch standard programs onto the GPU in conjunction
// with the LLVM 'libc' project. It is designed to mimic a standard emulator
// workflow, allowing for unit tests to be run on the GPU directly.
//
//===----------------------------------------------------------------------===//
#include "llvm-gpu-loader.h"
#include "llvm/BinaryFormat/Magic.h"
#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/WithColor.h"
#include "llvm/TargetParser/Triple.h"
#include <cerrno>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <sys/file.h>
using namespace llvm;
// Command-line interface of the loader. Every option below is registered in
// LoaderCategory; main() calls cl::HideUnrelatedOptions(LoaderCategory) before
// parsing.
static cl::OptionCategory LoaderCategory("loader options");
// '-h': hidden flag; when set, main() prints the help message and returns
// EXIT_SUCCESS.
static cl::opt<bool> Help("h", cl::desc("Alias for -help"), cl::Hidden,
cl::cat(LoaderCategory));
// Per-dimension thread counts, each defaulting to 1. main() forwards them in
// the launch configuration of the '_start' kernel.
static cl::opt<unsigned>
ThreadsX("threads-x", cl::desc("Number of threads in the 'x' dimension"),
cl::init(1), cl::cat(LoaderCategory));
static cl::opt<unsigned>
ThreadsY("threads-y", cl::desc("Number of threads in the 'y' dimension"),
cl::init(1), cl::cat(LoaderCategory));
static cl::opt<unsigned>
ThreadsZ("threads-z", cl::desc("Number of threads in the 'z' dimension"),
cl::init(1), cl::cat(LoaderCategory));
// '--threads' is an alias for '--threads-x'.
static cl::alias threads("threads", cl::aliasopt(ThreadsX),
cl::desc("Alias for --threads-x"),
cl::cat(LoaderCategory));
// Per-dimension block counts, each defaulting to 1. main() forwards them in
// the launch configuration of the '_start' kernel and derives the number of
// launch dimensions from them.
static cl::opt<unsigned>
BlocksX("blocks-x", cl::desc("Number of blocks in the 'x' dimension"),
cl::init(1), cl::cat(LoaderCategory));
static cl::opt<unsigned>
BlocksY("blocks-y", cl::desc("Number of blocks in the 'y' dimension"),
cl::init(1), cl::cat(LoaderCategory));
static cl::opt<unsigned>
BlocksZ("blocks-z", cl::desc("Number of blocks in the 'z' dimension"),
cl::init(1), cl::cat(LoaderCategory));
// '--blocks' is an alias for '--blocks-x'.
static cl::alias Blocks("blocks", cl::aliasopt(BlocksX),
cl::desc("Alias for --blocks-x"),
cl::cat(LoaderCategory));
// Required positional argument naming the GPU executable. main() reads it with
// MemoryBuffer::getFileOrSTDIN and also uses it as the first entry of the
// argument vector handed to the program.
static cl::opt<std::string> File(cl::Positional, cl::Required,
cl::desc("<gpu executable>"),
cl::cat(LoaderCategory));
// Everything after the executable name; main() appends these to the argument
// vector handed to the program.
static cl::list<std::string> Args(cl::ConsumeAfter,
cl::desc("<program arguments>..."),
cl::cat(LoaderCategory));
// The arguments to the '_begin' kernel. main() fills this structure and hands
// it to launchKernel(), which passes its address and sizeof() to
// olLaunchKernel.
struct BeginArgs {
// Number of entries in Argv: the executable name plus the program arguments.
int Argc;
// Null-terminated argument vector returned by copyArgumentVector().
void *Argv;
// Null-terminated environment vector returned by copyEnvironment().
void *Envp;
};
// The arguments to the '_start' kernel. main() fills this structure and hands
// it to launchKernel(), which passes its address and sizeof() to
// olLaunchKernel.
struct StartArgs {
// Number of entries in Argv: the executable name plus the program arguments.
int Argc;
// Null-terminated argument vector returned by copyArgumentVector().
void *Argv;
// Null-terminated environment vector returned by copyEnvironment().
void *Envp;
// Buffer of sizeof(int) bytes allocated in main() with OL_ALLOC_TYPE_DEVICE.
// After the kernels have run, main() copies an int out of it and returns that
// value as the process exit code.
void *Ret;
};
// The arguments to the '_end' kernel. The structure is intentionally empty;
// launchKernel() passes an argument size of 0 for empty argument types.
struct EndArgs {};
/// Reports \p E on the error stream under the "loader" banner and terminates
/// the process with EXIT_FAILURE. Standard output is flushed before anything
/// is written to the error stream.
[[noreturn]] static void handleError(Error E) {
  outs().flush();
  raw_ostream &Diagnostics = WithColor::error(errs(), "loader");
  logAllUnhandledErrors(std::move(E), Diagnostics);
  std::exit(EXIT_FAILURE);
}
/// Reports a failed Offload API call on stderr and terminates the process with
/// EXIT_FAILURE.
///
/// \param Err  Non-null result returned by the failing call; its `Details`
///             string is printed when present.
/// \param Line Source line of the failing call (supplied by OFFLOAD_ERR).
[[noreturn]] static void handleError(ol_result_t Err, unsigned Line) {
  // Flush buffered standard output first, matching the Error overload, so the
  // program's output is not emitted after the diagnostic.
  outs().flush();
  // Passing a null pointer to "%s" is undefined behavior, so fall back to a
  // placeholder when the result carries no details.
  const char *Details = Err->Details ? Err->Details : "unknown error";
  // `Line` is unsigned and therefore needs "%u" rather than "%d".
  fprintf(stderr, "%s:%u %s\n", __FILE__, Line, Details);
  exit(EXIT_FAILURE);
}
// Evaluates the Offload API call X and, if it yields a non-null result, hands
// that result and the current source line to handleError().
//
// The body is wrapped in do/while(false) so that `OFFLOAD_ERR(...);` expands to
// exactly one statement and can be used safely inside unbraced if/else chains.
// X is parenthesized so that any expression can be passed.
#define OFFLOAD_ERR(X)                                                         \
  do {                                                                         \
    if (ol_result_t Err = (X))                                                 \
      handleError(Err, __LINE__);                                              \
  } while (false)
/// Copies an argument vector into a single buffer obtained from olMemAlloc
/// with OL_ALLOC_TYPE_HOST.
///
/// The buffer starts with a null-terminated array of `Argc + 1` pointers and is
/// followed by the string data; every pointer refers to its string inside the
/// same buffer.
///
/// \param Argc   Number of strings in \p Argv.
/// \param Argv   Strings to copy; entries [0, Argc) must be valid C strings.
/// \param Device Device handle forwarded to olMemAlloc.
/// \returns the start of the buffer. main() releases it with olMemFree.
static void *copyArgumentVector(int Argc, const char **Argv,
                                ol_device_handle_t Device) {
  const size_t ArgSize = sizeof(char *) * (static_cast<size_t>(Argc) + 1);
  size_t StringLen = 0;
  for (int I = 0; I < Argc; ++I)
    StringLen += strlen(Argv[I]) + 1;

  // We allocate enough space for a null terminated array and all the strings.
  // The pointer starts out null so the check below never reads an
  // indeterminate value.
  void *DevArgv = nullptr;
  OFFLOAD_ERR(
      olMemAlloc(Device, OL_ALLOC_TYPE_HOST, ArgSize + StringLen, &DevArgv));
  if (!DevArgv)
    handleError(
        createStringError("Failed to allocate memory for environment."));

  // Store the strings linearly in the same memory buffer, right after the
  // pointer array.
  char **Pointers = static_cast<char **>(DevArgv);
  char *Cursor = static_cast<char *>(DevArgv) + ArgSize;
  for (int I = 0; I < Argc; ++I) {
    const size_t Size = strlen(Argv[I]) + 1;
    std::memcpy(Cursor, Argv[I], Size);
    Pointers[I] = Cursor;
    Cursor += Size;
  }

  // Ensure the vector is null terminated.
  Pointers[Argc] = nullptr;
  return DevArgv;
}
/// Copies the null-terminated environment vector \p Envp by counting its
/// entries and delegating to copyArgumentVector().
///
/// \param Envp   Environment vector; must end with a null entry.
/// \param Device Device handle forwarded to copyArgumentVector().
/// \returns the buffer produced by copyArgumentVector().
void *copyEnvironment(const char **Envp, ol_device_handle_t Device) {
  // Walk to the terminating null entry to learn how many variables exist.
  const char **End = Envp;
  while (*End != nullptr)
    ++End;
  const int Count = static_cast<int>(End - Envp);
  return copyArgumentVector(Count, Envp, Device);
}
/// Returns the first device reported by olIterateDevices for which
/// olIsValidBinary accepts \p Binary.
///
/// Exits through handleError() if an Offload call fails or if no device
/// accepts the binary, so the returned handle is never left uninitialized.
ol_device_handle_t findDevice(MemoryBufferRef Binary) {
  // State shared with the iteration callback through its user-data pointer.
  struct SearchState {
    MemoryBufferRef Image;
    ol_device_handle_t Match;
  } State{Binary, nullptr};

  OFFLOAD_ERR(olIterateDevices(
      [](ol_device_handle_t Device, void *UserData) {
        auto *Search = static_cast<SearchState *>(UserData);
        bool IsValid = false;
        OFFLOAD_ERR(olIsValidBinary(Device, Search->Image.getBufferStart(),
                                    Search->Image.getBufferSize(), &IsValid));
        // Returning true asks for the next device; false stops the iteration.
        if (!IsValid)
          return true;
        Search->Match = Device;
        return false;
      },
      &State));

  if (!State.Match)
    handleError(createStringError(
        "failed to find a device compatible with the provided image"));
  return State.Match;
}
/// Returns the first device reported by olIterateDevices whose platform
/// backend is OL_PLATFORM_BACKEND_HOST.
///
/// Exits through handleError() if an Offload query fails or if no such device
/// exists, so the returned handle is never left uninitialized.
ol_device_handle_t getHostDevice() {
  ol_device_handle_t Host = nullptr;
  OFFLOAD_ERR(olIterateDevices(
      [](ol_device_handle_t Device, void *UserData) {
        // Check both queries: if either failed silently, `Backend` would be
        // compared while still uninitialized.
        ol_platform_handle_t Platform;
        OFFLOAD_ERR(olGetDeviceInfo(Device, OL_DEVICE_INFO_PLATFORM,
                                    sizeof(Platform), &Platform));
        ol_platform_backend_t Backend;
        OFFLOAD_ERR(olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND,
                                      sizeof(Backend), &Backend));
        // Returning true asks for the next device; false stops the iteration.
        if (Backend != OL_PLATFORM_BACKEND_HOST)
          return true;
        *static_cast<ol_device_handle_t *>(UserData) = Device;
        return false;
      },
      &Host));

  if (!Host)
    handleError(createStringError("failed to find the host device"));
  return Host;
}
/// Looks up the kernel symbol \p Name in \p Program and enqueues it on
/// \p Queue with the launch configuration \p LaunchArgs.
///
/// \p KernelArgs is passed by address together with its size; an empty
/// argument type is passed with a size of 0. Any Offload failure exits through
/// OFFLOAD_ERR.
template <typename ArgsTy>
void launchKernel(ol_queue_handle_t Queue, ol_device_handle_t Device,
                  ol_program_handle_t Program, const char *Name,
                  ol_kernel_launch_size_args_t LaunchArgs,
                  ArgsTy &KernelArgs) {
  // An empty struct still has sizeof() == 1, so report it as zero bytes.
  constexpr size_t ArgsSize = std::is_empty_v<ArgsTy> ? 0 : sizeof(ArgsTy);

  ol_symbol_handle_t Kernel;
  OFFLOAD_ERR(olGetSymbol(Program, Name, OL_SYMBOL_KIND_KERNEL, &Kernel));
  OFFLOAD_ERR(
      olLaunchKernel(Queue, Device, Kernel, &KernelArgs, ArgsSize, &LaunchArgs));
}
int main(int argc, const char **argv, const char **envp) {
sys::PrintStackTraceOnErrorSignal(argv[0]);
cl::HideUnrelatedOptions(LoaderCategory);
cl::ParseCommandLineOptions(
argc, argv,
"A utility used to launch unit tests built for a GPU target. This is\n"
"intended to provide an intrface simular to cross-compiling emulators\n");
if (Help) {
cl::PrintHelpMessage();
return EXIT_SUCCESS;
}
if (Error Err = loadLLVMOffload())
handleError(std::move(Err));
ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrErr =
MemoryBuffer::getFileOrSTDIN(File);
if (std::error_code EC = ImageOrErr.getError())
handleError(errorCodeToError(EC));
MemoryBufferRef Image = **ImageOrErr;
ol_platform_backend_t Backend;
ol_init_args_t InitArgs = OL_INIT_ARGS_INIT;
file_magic Magic = identify_magic(Image.getBuffer());
if (Magic >= file_magic::elf && Magic <= file_magic::elf_core) {
Expected<object::ELFFile<object::ELF64LE>> ElfOrErr =
object::ELFFile<object::ELF64LE>::create(Image.getBuffer());
if (!ElfOrErr)
handleError(ElfOrErr.takeError());
switch (ElfOrErr->getHeader().e_machine) {
case ELF::EM_AMDGPU:
Backend = OL_PLATFORM_BACKEND_AMDGPU;
break;
case ELF::EM_CUDA:
Backend = OL_PLATFORM_BACKEND_CUDA;
break;
default:
handleError(createStringError(
"unhandled ELF architecture: %s",
ELF::convertEMachineToArchName(ElfOrErr->getHeader().e_machine)
.data()));
}
InitArgs.NumPlatforms = 1;
InitArgs.Platforms = &Backend;
}
SmallVector<const char *> NewArgv = {File.c_str()};
llvm::transform(Args, std::back_inserter(NewArgv),
[](const std::string &Arg) { return Arg.c_str(); });
OFFLOAD_ERR(olInit(&InitArgs));
ol_device_handle_t Device = findDevice(Image);
ol_device_handle_t Host = getHostDevice();
ol_program_handle_t Program;
OFFLOAD_ERR(olCreateProgram(Device, Image.getBufferStart(),
Image.getBufferSize(), &Program));
ol_queue_handle_t Queue;
OFFLOAD_ERR(olCreateQueue(Device, &Queue));
int DevArgc = static_cast<int>(NewArgv.size());
void *DevArgv = copyArgumentVector(NewArgv.size(), NewArgv.begin(), Device);
void *DevEnvp = copyEnvironment(envp, Device);
void *DevRet;
OFFLOAD_ERR(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, sizeof(int), &DevRet));
ol_kernel_launch_size_args_t BeginLaunch{1, {1, 1, 1}, {1, 1, 1}, 0};
BeginArgs BeginArgs = {DevArgc, DevArgv, DevEnvp};
launchKernel(Queue, Device, Program, "_begin", BeginLaunch, BeginArgs);
OFFLOAD_ERR(olSyncQueue(Queue));
uint32_t Dims = (BlocksZ > 1) ? 3 : (BlocksY > 1) ? 2 : 1;
ol_kernel_launch_size_args_t StartLaunch{Dims,
{BlocksX, BlocksY, BlocksZ},
{ThreadsX, ThreadsY, ThreadsZ},
/*SharedMemBytes=*/0};
StartArgs StartArgs = {DevArgc, DevArgv, DevEnvp, DevRet};
launchKernel(Queue, Device, Program, "_start", StartLaunch, StartArgs);
ol_kernel_launch_size_args_t EndLaunch{1, {1, 1, 1}, {1, 1, 1}, 0};
EndArgs EndArgs = {};
launchKernel(Queue, Device, Program, "_end", EndLaunch, EndArgs);
int Ret;
OFFLOAD_ERR(olMemcpy(Queue, &Ret, Host, DevRet, Device, sizeof(int)));
OFFLOAD_ERR(olSyncQueue(Queue));
OFFLOAD_ERR(olMemFree(DevArgv));
OFFLOAD_ERR(olMemFree(DevEnvp));
OFFLOAD_ERR(olDestroyQueue(Queue));
OFFLOAD_ERR(olDestroyProgram(Program));
OFFLOAD_ERR(olShutDown());
return Ret;
}