From 64e7c77e04406f017c71a313b304b3369f79e04a Mon Sep 17 00:00:00 2001 From: Alex Duran Date: Thu, 26 Mar 2026 05:50:26 +0100 Subject: [PATCH] [OFFLOAD][L0] More error handling (#188496) This PR improves cleanup/handling of errors in some memory operations, allocating event pools, ... --- .../level_zero/include/L0Trace.h | 6 ++++ .../level_zero/src/L0Device.cpp | 30 ++++++++++++++----- .../level_zero/src/L0Kernel.cpp | 8 ++++- .../level_zero/src/L0Memory.cpp | 22 ++++++++++++-- .../level_zero/src/L0Program.cpp | 6 ++-- 5 files changed, 58 insertions(+), 14 deletions(-) diff --git a/offload/plugins-nextgen/level_zero/include/L0Trace.h b/offload/plugins-nextgen/level_zero/include/L0Trace.h index 357a81a1c4a7..0dd55e01c71e 100644 --- a/offload/plugins-nextgen/level_zero/include/L0Trace.h +++ b/offload/plugins-nextgen/level_zero/include/L0Trace.h @@ -60,6 +60,12 @@ using namespace llvm::offload::debug; Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s", \ #Fn, rc, getZeErrorName(rc)), Fn, __VA_ARGS__) +#define CALL_ZE_SILENT(Fn, ...) \ + do { \ + ze_result_t rc; \ + CALL_ZE(rc, Fn, __VA_ARGS__); \ + (void)rc; /* Silence unused variable warning. */ \ + } while (0) #define CALL_ZE_HANDLE_ERROR(HandleErrFn, Fn, ...) \ do { \ diff --git a/offload/plugins-nextgen/level_zero/src/L0Device.cpp b/offload/plugins-nextgen/level_zero/src/L0Device.cpp index 96a9846863af..461f12407680 100644 --- a/offload/plugins-nextgen/level_zero/src/L0Device.cpp +++ b/offload/plugins-nextgen/level_zero/src/L0Device.cpp @@ -18,6 +18,7 @@ #include "L0Trace.h" #include "GlobalHandler.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/Object/ELF.h" namespace llvm::omp::target::plugin { @@ -700,30 +701,36 @@ Expected L0DeviceTy::createInterop(int32_t InteropContext, Ret->rtl_property = new L0Interop::Property(); if (InteropContext == kmp_interop_type_targetsync) { Ret->async_info = new __tgt_async_info(); + + // Ensure cleanup on error + llvm::scope_exit CleanupOnError([&]() { + if (Ret->async_info) + delete Ret->async_info; + if (Ret->rtl_property) + delete static_cast(Ret->rtl_property); + delete Ret; + }); + auto L0 = static_cast(Ret->rtl_property); bool InOrder = InteropSpec.attrs.inorder; Ret->attrs.inorder = InOrder; if (useImmForInterop()) { auto CmdListOrErr = createImmCmdList(InOrder); - if (!CmdListOrErr) { - delete Ret->async_info; - delete Ret; + if (!CmdListOrErr) return CmdListOrErr.takeError(); - } Ret->async_info->Queue = *CmdListOrErr; L0->ImmCmdList = *CmdListOrErr; } else { auto QueueOrErr = createCommandQueue(InOrder); - if (!QueueOrErr) { - delete Ret->async_info; - delete Ret; + if (!QueueOrErr) return QueueOrErr.takeError(); - } Ret->async_info->Queue = *QueueOrErr; L0->CommandQueue = static_cast(Ret->async_info->Queue); } + + CleanupOnError.release(); } return Ret; @@ -791,9 +798,12 @@ Error L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size, CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size, nullptr, 0, nullptr); CALL_ZE_RET_ERROR(zeCommandListClose, CmdList); + llvm::scope_exit ResetOnExit( + [&]() { CALL_ZE_SILENT(zeCommandListReset, CmdList); }); CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, getMutex(), CmdQueue, 1, &CmdList, nullptr); CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout); + ResetOnExit.release(); CALL_ZE_RET_ERROR(zeCommandListReset, CmdList); } return Plugin::success(); @@ -1141,8 +1151,12 @@ Error L0DeviceTy::dataFence(__tgt_async_info *Async) { CmdQueue = *CmdQueueOrerr; CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr); CALL_ZE_RET_ERROR(zeCommandListClose, CmdList); + llvm::scope_exit ResetOnExit( + [&]() { CALL_ZE_SILENT(zeCommandListReset, CmdList); }); CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList, nullptr); + CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout); + ResetOnExit.release(); CALL_ZE_RET_ERROR(zeCommandListReset, CmdList); } diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp index fb7bfa05310d..8cccb5e11639 100644 --- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp +++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp @@ -15,6 +15,8 @@ #include "L0Plugin.h" #include "L0Program.h" +#include "llvm/ADT/ScopeExit.h" + namespace llvm::omp::target::plugin { bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn, @@ -337,12 +339,16 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device, &KEnv.GroupCounts, Event, 0, nullptr); KEnv.Lock.unlock(); CALL_ZE_RET_ERROR(zeCommandListClose, CmdList); + + // Ensure command list is reset even on errors after this point. + llvm::scope_exit ResetOnExit( + [&]() { CALL_ZE_SILENT(zeCommandListReset, CmdList); }); + CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(), CmdQueue, 1, &CmdList, nullptr); INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, "Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr); CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout); - CALL_ZE_RET_ERROR(zeCommandListReset, CmdList); if (Event) { if (auto Err = l0Device.releaseEvent(Event)) return Err; diff --git a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp index 477fb0e27c6f..9039e7c4d148 100644 --- a/offload/plugins-nextgen/level_zero/src/L0Memory.cpp +++ b/offload/plugins-nextgen/level_zero/src/L0Memory.cpp @@ -301,8 +301,12 @@ Expected MemAllocatorTy::MemPoolTy::alloc(size_t Size, void *Base = *BaseOrErr; if (ZeroInit) { auto Err = Allocator->enqueueMemSet(Base, 0, BlockSize); - if (Err) - return Err; + if (Err) { + // deallocate Base on error. + if (auto DeallocErr = Allocator->deallocFromL0(Base)) + return joinErrors(std::move(Err), std::move(DeallocErr)); + return std::move(Err); + } } BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize); @@ -708,12 +712,24 @@ Expected EventPoolTy::getEvent() { ze_event_desc_t EventDesc{ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0}; EventDesc.wait = 0; EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; + uint32_t CreatedEvents = 0; for (uint32_t I = 0; I < PoolSize; I++) { EventDesc.index = I; ze_event_handle_t Event; - CALL_ZE_RET_ERROR(zeEventCreate, Pool, &EventDesc, &Event); + ze_result_t RC; + CALL_ZE(RC, zeEventCreate, Pool, &EventDesc, &Event); + if (RC != ZE_RESULT_SUCCESS) { + // Log the error and skip this event. + ODBG(OLDT_Init) << "Warning: zeEventCreate failed at index " << I + << " with code " << RC << ". Skipping this event."; + continue; + } Events.push_back(Event); + CreatedEvents++; } + PoolSize = CreatedEvents; + ODBG(OLDT_Init) << "Created a new event pool " << Pool << " with " + << PoolSize << " events"; } auto Ret = Events.back(); diff --git a/offload/plugins-nextgen/level_zero/src/L0Program.cpp b/offload/plugins-nextgen/level_zero/src/L0Program.cpp index 7adbcbe56fa8..6d4fac3be873 100644 --- a/offload/plugins-nextgen/level_zero/src/L0Program.cpp +++ b/offload/plugins-nextgen/level_zero/src/L0Program.cpp @@ -102,8 +102,10 @@ Error L0ProgramBuilderTy::addModule(size_t Size, const uint8_t *Image, if (!RequiresModuleLink && !IsLibModule) { ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES, nullptr, 0}; - CALL_ZE_RET_ERROR(zeModuleGetProperties, Module, &Properties); - RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS; + ze_result_t RC; + CALL_ZE(RC, zeModuleGetProperties, Module, &Properties); + if (RC == ZE_RESULT_SUCCESS) + RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS; } // For now, assume the first module contains libraries, globals. if (Modules.empty())