[OFFLOAD][L0] More error handling (#188496)
This PR improves cleanup/handling of errors in some memory operations, allocating event pools, ...
This commit is contained in:
parent
6937866f5c
commit
64e7c77e04
@ -60,6 +60,12 @@ using namespace llvm::offload::debug;
|
||||
Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d, %s", \
|
||||
#Fn, rc, getZeErrorName(rc)), Fn, __VA_ARGS__)
|
||||
|
||||
#define CALL_ZE_SILENT(Fn, ...) \
|
||||
do { \
|
||||
ze_result_t rc; \
|
||||
CALL_ZE(rc, Fn, __VA_ARGS__); \
|
||||
(void)rc; /* Silence unused variable warning. */ \
|
||||
} while (0)
|
||||
|
||||
#define CALL_ZE_HANDLE_ERROR(HandleErrFn, Fn, ...) \
|
||||
do { \
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
#include "L0Trace.h"
|
||||
|
||||
#include "GlobalHandler.h"
|
||||
#include "llvm/ADT/ScopeExit.h"
|
||||
#include "llvm/Object/ELF.h"
|
||||
|
||||
namespace llvm::omp::target::plugin {
|
||||
@ -700,30 +701,36 @@ Expected<OmpInteropTy> L0DeviceTy::createInterop(int32_t InteropContext,
|
||||
Ret->rtl_property = new L0Interop::Property();
|
||||
if (InteropContext == kmp_interop_type_targetsync) {
|
||||
Ret->async_info = new __tgt_async_info();
|
||||
|
||||
// Ensure cleanup on error
|
||||
llvm::scope_exit CleanupOnError([&]() {
|
||||
if (Ret->async_info)
|
||||
delete Ret->async_info;
|
||||
if (Ret->rtl_property)
|
||||
delete static_cast<L0Interop::Property *>(Ret->rtl_property);
|
||||
delete Ret;
|
||||
});
|
||||
|
||||
auto L0 = static_cast<L0Interop::Property *>(Ret->rtl_property);
|
||||
|
||||
bool InOrder = InteropSpec.attrs.inorder;
|
||||
Ret->attrs.inorder = InOrder;
|
||||
if (useImmForInterop()) {
|
||||
auto CmdListOrErr = createImmCmdList(InOrder);
|
||||
if (!CmdListOrErr) {
|
||||
delete Ret->async_info;
|
||||
delete Ret;
|
||||
if (!CmdListOrErr)
|
||||
return CmdListOrErr.takeError();
|
||||
}
|
||||
Ret->async_info->Queue = *CmdListOrErr;
|
||||
L0->ImmCmdList = *CmdListOrErr;
|
||||
} else {
|
||||
auto QueueOrErr = createCommandQueue(InOrder);
|
||||
if (!QueueOrErr) {
|
||||
delete Ret->async_info;
|
||||
delete Ret;
|
||||
if (!QueueOrErr)
|
||||
return QueueOrErr.takeError();
|
||||
}
|
||||
Ret->async_info->Queue = *QueueOrErr;
|
||||
L0->CommandQueue =
|
||||
static_cast<ze_command_queue_handle_t>(Ret->async_info->Queue);
|
||||
}
|
||||
|
||||
CleanupOnError.release();
|
||||
}
|
||||
|
||||
return Ret;
|
||||
@ -791,9 +798,12 @@ Error L0DeviceTy::enqueueMemCopy(void *Dst, const void *Src, size_t Size,
|
||||
CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
|
||||
nullptr, 0, nullptr);
|
||||
CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
|
||||
llvm::scope_exit ResetOnExit(
|
||||
[&]() { CALL_ZE_SILENT(zeCommandListReset, CmdList); });
|
||||
CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, getMutex(),
|
||||
CmdQueue, 1, &CmdList, nullptr);
|
||||
CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout);
|
||||
ResetOnExit.release();
|
||||
CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
|
||||
}
|
||||
return Plugin::success();
|
||||
@ -1141,8 +1151,12 @@ Error L0DeviceTy::dataFence(__tgt_async_info *Async) {
|
||||
CmdQueue = *CmdQueueOrerr;
|
||||
CALL_ZE_RET_ERROR(zeCommandListAppendBarrier, CmdList, nullptr, 0, nullptr);
|
||||
CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
|
||||
llvm::scope_exit ResetOnExit(
|
||||
[&]() { CALL_ZE_SILENT(zeCommandListReset, CmdList); });
|
||||
CALL_ZE_RET_ERROR(zeCommandQueueExecuteCommandLists, CmdQueue, 1, &CmdList,
|
||||
nullptr);
|
||||
CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout);
|
||||
ResetOnExit.release();
|
||||
CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
|
||||
}
|
||||
|
||||
|
||||
@ -15,6 +15,8 @@
|
||||
#include "L0Plugin.h"
|
||||
#include "L0Program.h"
|
||||
|
||||
#include "llvm/ADT/ScopeExit.h"
|
||||
|
||||
namespace llvm::omp::target::plugin {
|
||||
|
||||
bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn,
|
||||
@ -337,12 +339,16 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
|
||||
&KEnv.GroupCounts, Event, 0, nullptr);
|
||||
KEnv.Lock.unlock();
|
||||
CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
|
||||
|
||||
// Ensure command list is reset even on errors after this point.
|
||||
llvm::scope_exit ResetOnExit(
|
||||
[&]() { CALL_ZE_SILENT(zeCommandListReset, CmdList); });
|
||||
|
||||
CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
|
||||
CmdQueue, 1, &CmdList, nullptr);
|
||||
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
|
||||
"Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
|
||||
CALL_ZE_RET_ERROR(zeCommandQueueSynchronize, CmdQueue, L0DefaultTimeout);
|
||||
CALL_ZE_RET_ERROR(zeCommandListReset, CmdList);
|
||||
if (Event) {
|
||||
if (auto Err = l0Device.releaseEvent(Event))
|
||||
return Err;
|
||||
|
||||
@ -301,8 +301,12 @@ Expected<void *> MemAllocatorTy::MemPoolTy::alloc(size_t Size,
|
||||
void *Base = *BaseOrErr;
|
||||
if (ZeroInit) {
|
||||
auto Err = Allocator->enqueueMemSet(Base, 0, BlockSize);
|
||||
if (Err)
|
||||
return Err;
|
||||
if (Err) {
|
||||
// deallocate Base on error.
|
||||
if (auto DeallocErr = Allocator->deallocFromL0(Base))
|
||||
return joinErrors(std::move(Err), std::move(DeallocErr));
|
||||
return std::move(Err);
|
||||
}
|
||||
}
|
||||
|
||||
BlockTy *Block = new BlockTy(Base, BlockSize, ChunkSize);
|
||||
@ -708,12 +712,24 @@ Expected<ze_event_handle_t> EventPoolTy::getEvent() {
|
||||
ze_event_desc_t EventDesc{ZE_STRUCTURE_TYPE_EVENT_DESC, nullptr, 0, 0, 0};
|
||||
EventDesc.wait = 0;
|
||||
EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
|
||||
uint32_t CreatedEvents = 0;
|
||||
for (uint32_t I = 0; I < PoolSize; I++) {
|
||||
EventDesc.index = I;
|
||||
ze_event_handle_t Event;
|
||||
CALL_ZE_RET_ERROR(zeEventCreate, Pool, &EventDesc, &Event);
|
||||
ze_result_t RC;
|
||||
CALL_ZE(RC, zeEventCreate, Pool, &EventDesc, &Event);
|
||||
if (RC != ZE_RESULT_SUCCESS) {
|
||||
// Log the error and skip this event.
|
||||
ODBG(OLDT_Init) << "Warning: zeEventCreate failed at index " << I
|
||||
<< " with code " << RC << ". Skipping this event.";
|
||||
continue;
|
||||
}
|
||||
Events.push_back(Event);
|
||||
CreatedEvents++;
|
||||
}
|
||||
PoolSize = CreatedEvents;
|
||||
ODBG(OLDT_Init) << "Created a new event pool " << Pool << " with "
|
||||
<< PoolSize << " events";
|
||||
}
|
||||
|
||||
auto Ret = Events.back();
|
||||
|
||||
@ -102,8 +102,10 @@ Error L0ProgramBuilderTy::addModule(size_t Size, const uint8_t *Image,
|
||||
if (!RequiresModuleLink && !IsLibModule) {
|
||||
ze_module_properties_t Properties = {ZE_STRUCTURE_TYPE_MODULE_PROPERTIES,
|
||||
nullptr, 0};
|
||||
CALL_ZE_RET_ERROR(zeModuleGetProperties, Module, &Properties);
|
||||
RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
|
||||
ze_result_t RC;
|
||||
CALL_ZE(RC, zeModuleGetProperties, Module, &Properties);
|
||||
if (RC == ZE_RESULT_SUCCESS)
|
||||
RequiresModuleLink = Properties.flags & ZE_MODULE_PROPERTY_FLAG_IMPORTS;
|
||||
}
|
||||
// For now, assume the first module contains libraries, globals.
|
||||
if (Modules.empty())
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user