Ross Brunton 910d7e90bf
[Offload] Make olLaunchKernel test thread safe (#149497)
This sprinkles a few mutexes around the plugin interface so that the
olLaunchKernel CTS test now passes when run on multiple threads.

Part of this also involved changing the interface for device synchronise
so that it can optionally not free the underlying queue (which
introduced a race condition in liboffload).
2025-08-08 10:57:04 +01:00

287 lines
8.6 KiB
C++

//===------- Offload API tests - olLaunchKernel --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "../common/Fixtures.hpp"
#include <OffloadAPI.h>
#include <gtest/gtest.h>
struct LaunchKernelTestBase : OffloadQueueTest {
void SetUpProgram(const char *program) {
RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
ASSERT_TRUE(TestEnvironment::loadDeviceBinary(program, Device, DeviceBin));
ASSERT_GE(DeviceBin->getBufferSize(), 0lu);
ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
DeviceBin->getBufferSize(), &Program));
LaunchArgs.Dimensions = 1;
LaunchArgs.GroupSize = {64, 1, 1};
LaunchArgs.NumGroups = {1, 1, 1};
LaunchArgs.DynSharedMemory = 0;
}
void TearDown() override {
if (Program) {
olDestroyProgram(Program);
}
RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown());
}
std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
ol_program_handle_t Program = nullptr;
ol_kernel_launch_size_args_t LaunchArgs{};
};
// Fixture for tests exercising exactly one kernel; by convention the program
// and its kernel entry point share the same name.
struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
  // Loads the named program, then looks up the kernel symbol of that name.
  void SetUpKernel(const char *kernel) {
    RETURN_ON_FATAL_FAILURE(SetUpProgram(kernel));
    ASSERT_SUCCESS(
        olGetSymbol(Program, kernel, OL_SYMBOL_KIND_KERNEL, &Kernel));
  }

  ol_symbol_handle_t Kernel = nullptr;
};
// Declares a device-parameterized fixture olLaunchKernel<NAME>Test whose
// SetUp loads the device program named KERNEL and resolves the kernel entry
// point of the same name, then instantiates the fixture for every device.
#define KERNEL_TEST(NAME, KERNEL) \
struct olLaunchKernel##NAME##Test : LaunchSingleKernelTestBase { \
void SetUp() override { SetUpKernel(#KERNEL); } \
}; \
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernel##NAME##Test);

// One fixture per single-kernel device program exercised below.
KERNEL_TEST(Foo, foo)
KERNEL_TEST(NoArgs, noargs)
KERNEL_TEST(LocalMem, localmem)
KERNEL_TEST(LocalMemReduction, localmem_reduction)
KERNEL_TEST(LocalMemStatic, localmem_static)
KERNEL_TEST(GlobalCtor, global_ctor)
KERNEL_TEST(GlobalDtor, global_dtor)
// Fixture for tests that need several kernels out of one device program.
struct LaunchMultipleKernelTestBase : LaunchKernelTestBase {
  // Loads `program` and resolves each named kernel, in order, into `Kernels`.
  void SetUpKernels(const char *program, std::vector<const char *> kernels) {
    RETURN_ON_FATAL_FAILURE(SetUpProgram(program));
    Kernels.resize(kernels.size());
    for (size_t Idx = 0; Idx < kernels.size(); ++Idx)
      ASSERT_SUCCESS(olGetSymbol(Program, kernels[Idx], OL_SYMBOL_KIND_KERNEL,
                                 &Kernels[Idx]));
  }

  std::vector<ol_symbol_handle_t> Kernels;
};
// Declares a device-parameterized fixture olLaunchKernel<NAME>Test that loads
// PROGRAM and resolves each listed kernel name (in order) into `Kernels`.
#define KERNEL_MULTI_TEST(NAME, PROGRAM, ...) \
struct olLaunchKernel##NAME##Test : LaunchMultipleKernelTestBase { \
void SetUp() override { SetUpKernels(#PROGRAM, {__VA_ARGS__}); } \
}; \
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernel##NAME##Test);

// The "global" program provides a "write" kernel and a "read" kernel.
KERNEL_MULTI_TEST(Global, global, "write", "read")
// Launches "foo" and checks that every element of the output buffer holds its
// own index (the value the test asserts per work item).
TEST_P(olLaunchKernelFooTest, Success) {
  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
  struct {
    void *Mem;
  } Args{Mem};

  ASSERT_SUCCESS(
      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
  ASSERT_SUCCESS(olSyncQueue(Queue));

  // Verify one element per launched work item; the previous hard-coded 64
  // would silently diverge from the allocation if GroupSize ever changes.
  uint32_t *Data = (uint32_t *)Mem;
  for (uint32_t I = 0; I < LaunchArgs.GroupSize.x; I++) {
    ASSERT_EQ(Data[I], I);
  }

  ASSERT_SUCCESS(olMemFree(Mem));
}
// Same as Success, but run concurrently from multiple threads sharing the one
// queue; guards against races in the liboffload plugin interface.
TEST_P(olLaunchKernelFooTest, SuccessThreaded) {
  threadify([&](size_t) {
    void *Mem;
    ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                              LaunchArgs.GroupSize.x * sizeof(uint32_t),
                              &Mem));
    struct {
      void *Mem;
    } Args{Mem};

    ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
                                  &LaunchArgs));
    ASSERT_SUCCESS(olSyncQueue(Queue));

    // Verify one element per launched work item rather than a hard-coded 64.
    uint32_t *Data = (uint32_t *)Mem;
    for (uint32_t I = 0; I < LaunchArgs.GroupSize.x; I++) {
      ASSERT_EQ(Data[I], I);
    }

    ASSERT_SUCCESS(olMemFree(Mem));
  });
}
// A kernel taking no arguments must accept a null argument buffer of size 0.
TEST_P(olLaunchKernelNoArgsTest, Success) {
  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, nullptr, 0,
                                &LaunchArgs));
  ASSERT_SUCCESS(olSyncQueue(Queue));
}
// Passing a null queue requests a synchronous launch, so the results are read
// back without an explicit olSyncQueue.
TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
  struct {
    void *Mem;
  } Args{Mem};

  ASSERT_SUCCESS(olLaunchKernel(nullptr, Device, Kernel, &Args, sizeof(Args),
                                &LaunchArgs));

  // Verify one element per launched work item; keeps the check in sync with
  // the allocation size instead of repeating the magic constant 64.
  uint32_t *Data = (uint32_t *)Mem;
  for (uint32_t I = 0; I < LaunchArgs.GroupSize.x; I++) {
    ASSERT_EQ(Data[I], I);
  }

  ASSERT_SUCCESS(olMemFree(Mem));
}
// Launches "localmem" over four groups with dynamic shared memory; each
// output element is expected to be (local id * 2).
TEST_P(olLaunchKernelLocalMemTest, Success) {
  LaunchArgs.NumGroups.x = 4;
  LaunchArgs.DynSharedMemory = 64 * sizeof(uint32_t);

  const uint32_t NumElem = LaunchArgs.GroupSize.x * LaunchArgs.NumGroups.x;
  void *Buffer;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            NumElem * sizeof(uint32_t), &Buffer));
  struct {
    void *Mem;
  } Args{Buffer};

  ASSERT_SUCCESS(
      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
  ASSERT_SUCCESS(olSyncQueue(Queue));

  uint32_t *Out = (uint32_t *)Buffer;
  for (uint32_t I = 0; I < NumElem; I++)
    ASSERT_EQ(Out[I], (I % 64) * 2);

  ASSERT_SUCCESS(olMemFree(Buffer));
}
// Launches "localmem_reduction" over four groups with dynamic shared memory;
// each group is expected to produce a single value of 2 * group size.
TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
  LaunchArgs.NumGroups.x = 4;
  LaunchArgs.DynSharedMemory = 64 * sizeof(uint32_t);

  void *Buffer;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.NumGroups.x * sizeof(uint32_t),
                            &Buffer));
  struct {
    void *Mem;
  } Args{Buffer};

  ASSERT_SUCCESS(
      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
  ASSERT_SUCCESS(olSyncQueue(Queue));

  // One result element per group.
  uint32_t *Out = (uint32_t *)Buffer;
  for (uint32_t I = 0; I < LaunchArgs.NumGroups.x; I++)
    ASSERT_EQ(Out[I], 2 * LaunchArgs.GroupSize.x);

  ASSERT_SUCCESS(olMemFree(Buffer));
}
// Like the reduction test, but "localmem_static" requests no dynamic shared
// memory (presumably using statically allocated local memory — the expected
// per-group result is the same 2 * group size).
TEST_P(olLaunchKernelLocalMemStaticTest, Success) {
  LaunchArgs.NumGroups.x = 4;
  LaunchArgs.DynSharedMemory = 0;

  void *Buffer;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.NumGroups.x * sizeof(uint32_t),
                            &Buffer));
  struct {
    void *Mem;
  } Args{Buffer};

  ASSERT_SUCCESS(
      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
  ASSERT_SUCCESS(olSyncQueue(Queue));

  // One result element per group.
  uint32_t *Out = (uint32_t *)Buffer;
  for (uint32_t I = 0; I < LaunchArgs.NumGroups.x; I++)
    ASSERT_EQ(Out[I], 2 * LaunchArgs.GroupSize.x);

  ASSERT_SUCCESS(olMemFree(Buffer));
}
// Runs the "write" kernel (populates a device global), then the "read" kernel
// (copies it into Mem); the asserted per-element result is i * 2.
TEST_P(olLaunchKernelGlobalTest, Success) {
  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
  struct {
    void *Mem;
  } Args{Mem};

  // Sync between the launches so the second kernel observes the first one's
  // writes to the device global.
  ASSERT_SUCCESS(
      olLaunchKernel(Queue, Device, Kernels[0], nullptr, 0, &LaunchArgs));
  ASSERT_SUCCESS(olSyncQueue(Queue));
  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernels[1], &Args, sizeof(Args),
                                &LaunchArgs));
  ASSERT_SUCCESS(olSyncQueue(Queue));

  // Verify one element per launched work item (previously a hard-coded 64
  // that duplicated LaunchArgs.GroupSize.x).
  uint32_t *Data = (uint32_t *)Mem;
  for (uint32_t I = 0; I < LaunchArgs.GroupSize.x; I++) {
    ASSERT_EQ(Data[I], I * 2);
  }

  ASSERT_SUCCESS(olMemFree(Mem));
}
// Launching a symbol that is a global variable rather than a kernel must be
// rejected with OL_ERRC_SYMBOL_KIND.
TEST_P(olLaunchKernelGlobalTest, InvalidNotAKernel) {
  ol_symbol_handle_t NotAKernel = nullptr;
  ASSERT_SUCCESS(olGetSymbol(Program, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE,
                             &NotAKernel));
  ASSERT_ERROR(OL_ERRC_SYMBOL_KIND, olLaunchKernel(Queue, Device, NotAKernel,
                                                   nullptr, 0, &LaunchArgs));
}
// The "global_ctor" program's global constructor contributes to the output;
// the asserted per-element result is i + 100.
TEST_P(olLaunchKernelGlobalCtorTest, Success) {
  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
  struct {
    void *Mem;
  } Args{Mem};

  ASSERT_SUCCESS(
      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
  ASSERT_SUCCESS(olSyncQueue(Queue));

  // Verify one element per launched work item; the previous hard-coded 64
  // duplicated LaunchArgs.GroupSize.x.
  uint32_t *Data = (uint32_t *)Mem;
  for (uint32_t I = 0; I < LaunchArgs.GroupSize.x; I++) {
    ASSERT_EQ(Data[I], I + 100);
  }

  ASSERT_SUCCESS(olMemFree(Mem));
}
// TODO: We can't inspect the result of a destructor yet; once we
// find/implement a way, update this test. For now we only check that running
// a program with a global destructor doesn't crash.
TEST_P(olLaunchKernelGlobalDtorTest, Success) {
  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, nullptr, 0,
                                &LaunchArgs));
  ASSERT_SUCCESS(olSyncQueue(Queue));
}