[OFFLOAD][L0] More error handling (#188496)

This PR improves cleanup/handling of errors in some memory operations,
allocating event pools, ...
This commit is contained in:
Alex Duran 2026-03-26 05:50:26 +01:00 committed by GitHub
parent 6937866f5c
commit 64e7c77e04
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 58 additions and 14 deletions

View File

@ -60,6 +60,12 @@ using namespace llvm::offload::debug;
Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s", \
#Fn, rc, getZeErrorName(rc)), Fn, __VA_ARGS__)
#define CALL_ZE_SILENT(Fn, ...) \
do { \
ze_result_t rc; \
CALL_ZE(rc, Fn, __VA_ARGS__); \
(void)rc; /* Silence unused variable warning. */ \
} while (0)
#define CALL_ZE_HANDLE_ERROR(HandleErrFn, Fn, ...) \
do { \

View File

@ -18,6 +18,7 @@
#include "L0Trace.h"
#include "GlobalHandler.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Object/ELF.h"
namespace llvm::omp::target::plugin {
@ -700,30 +701,36 @@ Expected<OmpInteropTy> L0DeviceTy::createInterop(int32_t InteropContext,
Ret->rtl_property = new L0Interop::Property();
if (InteropContext == kmp_interop_type_targetsync) {
Ret->async_info = new __tgt_async_info();
// Ensure cleanup on error
llvm::scope_exit CleanupOnError([&]() {
if (Ret->async_info)
delete Ret->async_info;
if (Ret->rtl_property)
delete static_cast<L0Interop::Property *>(Ret->rtl_property);
delete Ret;
});
auto L0 = static_cast<L0Interop::Property *>(Ret->rtl_property);
bool InOrder = InteropSpec.attrs.inorder;
Ret->attrs.inorder = InOrder;
if (useImmForInterop()) {
auto CmdListOrErr = createImmCmdList(InOrder);
if (!CmdListOrErr) {
delete Ret->async_info;
delete Ret;
if (!CmdListOrErr)
return CmdListOrErr.takeError();
}
Ret->async_info->Queue = *CmdListOrErr;
L0->ImmCmdList = *CmdListOrErr;
} else {
auto QueueOrErr = createCommandQueue(InOrder);
if (!QueueOrErr) {
delete Ret->async_info;
delete Ret;
if (!QueueOrErr)
return QueueOrErr.takeError();
}
Ret->async_info->Queue = *QueueOrErr;
L0->CommandQueue =
static_cast<ze_command_queue_handle_t>(Ret->async_info->Queue);
}
CleanupOnError.release();
}
return Ret;
@ -791,9 +798,12 @@ Error L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
nullptr, 0, nullptr);
CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
llvm::scope_exit ResetOnExit(
[&]() { CALL_ZE_SILENT(zeCommandListReset, CmdList); });
CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
CmdQueue, 1, &CmdList, nullptr);
CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout);
ResetOnExit.release();
CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
}
return Plugin::success();
@ -1141,8 +1151,12 @@ Error L0DeviceTy::dataFence(__tgt_async_info *Async) {
CmdQueue = *CmdQueueOrerr;
CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
llvm::scope_exit ResetOnExit(
[&]() { CALL_ZE_SILENT(zeCommandListReset, CmdList); });
CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
nullptr);
CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout);
ResetOnExit.release();
CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
}

View File

@ -15,6 +15,8 @@
#include "L0Plugin.h"
#include "L0Program.h"
#include "llvm/ADT/ScopeExit.h"
namespace llvm::omp::target::plugin {
bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn,
@ -337,12 +339,16 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
&KEnv.GroupCounts, Event, 0, nullptr);
KEnv.Lock.unlock();
CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
// Ensure command list is reset even on errors after this point.
llvm::scope_exit ResetOnExit(
[&]() { CALL_ZE_SILENT(zeCommandListReset, CmdList); });
CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
CmdQueue, 1, &CmdList, nullptr);
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout);
CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
if (Event) {
if (auto Err = l0Device.releaseEvent(Event))
return Err;

View File

@ -301,8 +301,12 @@ Expected<void *> MemAllocatorTy::MemPoolTy::alloc(size_t Size,
void *Base = *BaseOrErr;
if (ZeroInit) {
auto Err = Allocator->enqueueMemSet(Base, 0, BlockSize);
if (Err)
return Err;
if (Err) {
// deallocate Base on error.
if (auto DeallocErr = Allocator->deallocFromL0(Base))
return joinErrors(std::move(Err), std::move(DeallocErr));
return std::move(Err);
}
}
BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize);
@ -708,12 +712,24 @@ Expected<ze_event_handle_t> EventPoolTy::getEvent() {
ze_event_desc_t EventDesc{ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0};
EventDesc.wait = 0;
EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
uint32_t CreatedEvents = 0;
for (uint32_t I = 0; I < PoolSize; I++) {
EventDesc.index = I;
ze_event_handle_t Event;
CALL_ZE_RET_ERROR(zeEventCreate, Pool, &EventDesc, &Event);
ze_result_t RC;
CALL_ZE(RC, zeEventCreate, Pool, &EventDesc, &Event);
if (RC != ZE_RESULT_SUCCESS) {
// Log the error and skip this event.
ODBG(OLDT_Init) << "Warning: zeEventCreate failed at index " << I
<< " with code " << RC << ". Skipping this event.";
continue;
}
Events.push_back(Event);
CreatedEvents++;
}
PoolSize = CreatedEvents;
ODBG(OLDT_Init) << "Created a new event pool " << Pool << " with "
<< PoolSize << " events";
}
auto Ret = Events.back();

View File

@ -102,8 +102,10 @@ Error L0ProgramBuilderTy::addModule(size_t Size, const uint8_t *Image,
if (!RequiresModuleLink && !IsLibModule) {
ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
nullptr, 0};
CALL_ZE_RET_ERROR(zeModuleGetProperties, Module, &Properties);
RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
ze_result_t RC;
CALL_ZE(RC, zeModuleGetProperties, Module, &Properties);
if (RC == ZE_RESULT_SUCCESS)
RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
}
// For now, assume the first module contains libraries, globals.
if (Modules.empty())