[OFFLOAD] Improve resource management of the plugin (#187597)

This PR improves event management of the plugin by fixing potential
resource leaks and preventing a potential deadlock
This commit is contained in:
fineg74 2026-03-25 01:50:38 -07:00 committed by GitHub
parent ebe2454dc5
commit 1dbf7c7e1b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 66 additions and 30 deletions

View File

@ -86,10 +86,12 @@ struct L0LaunchEnvTy {
KernelPropertiesTy &KernelPR;
bool HalfNumThreads = false;
bool IsTeamsNDRange = false;
std::unique_lock<std::mutex> Lock;
L0LaunchEnvTy(bool IsAsync, AsyncQueueTy *AsyncQueue,
KernelPropertiesTy &KernelPR)
: IsAsync(IsAsync), AsyncQueue(AsyncQueue), KernelPR(KernelPR) {}
: IsAsync(IsAsync), AsyncQueue(AsyncQueue), KernelPR(KernelPR),
Lock(KernelPR.Mtx, std::defer_lock) {}
};
class L0KernelTy : public GenericKernelTy {

View File

@ -71,6 +71,17 @@ using namespace llvm::offload::debug;
} \
} while (0)
#define CALL_ZE_ACCUM_ERROR(Err, Fn, ...) \
do { \
ze_result_t rc; \
CALL_ZE(rc, Fn, __VA_ARGS__); \
if (rc != ZE_RESULT_SUCCESS) { \
Err = joinErrors(std::move(Err), \
Plugin::error(ErrorCode::UNKNOWN, "%s failed with error %d," \
" %s", #Fn, rc, getZeErrorName(rc))); \
} \
} while (0)
#define CALL_ZE_EXT_SILENT_RET(Device, Ret, Name, ...) \
do { \
ze_result_t rc; \

View File

@ -290,24 +290,21 @@ Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
AsyncQueueTy *AsyncQueue = reinterpret_cast<AsyncQueueTy *>(AsyncInfo.Queue);
Error SyncErrors = Error::success();
auto addError = [&](Error Err) {
SyncErrors = joinErrors(std::move(SyncErrors), std::move(Err));
};
if (!AsyncQueue->WaitEvents.empty()) {
const auto &WaitEvents = AsyncQueue->WaitEvents;
if (Plugin.getOptions().CommandMode == CommandModeTy::AsyncOrdered) {
// Only need to wait for the last event.
CALL_ZE_HANDLE_ERROR(addError, zeEventHostSynchronize, WaitEvents.back(),
L0DefaultTimeout);
CALL_ZE_ACCUM_ERROR(SyncErrors, zeEventHostSynchronize, WaitEvents.back(),
L0DefaultTimeout);
// Synchronize on kernel event to support printf().
auto KE = AsyncQueue->KernelEvent;
if (KE && KE != WaitEvents.back() && !SyncErrors) {
CALL_ZE_HANDLE_ERROR(addError, zeEventHostSynchronize, KE,
L0DefaultTimeout);
CALL_ZE_ACCUM_ERROR(SyncErrors, zeEventHostSynchronize, KE,
L0DefaultTimeout);
}
for (auto &Event : WaitEvents) {
if (auto Err = releaseEvent(Event))
addError(std::move(Err));
SyncErrors = joinErrors(std::move(SyncErrors), std::move(Err));
}
} else {
// Async case.
@ -319,13 +316,13 @@ Error L0DeviceTy::synchronizeImpl(__tgt_async_info &AsyncInfo,
bool WaitDone = false;
for (auto Itr = WaitEvents.rbegin(); Itr != WaitEvents.rend(); Itr++) {
if (!WaitDone) {
CALL_ZE_HANDLE_ERROR(addError, zeEventHostSynchronize, *Itr,
L0DefaultTimeout);
CALL_ZE_ACCUM_ERROR(SyncErrors, zeEventHostSynchronize, *Itr,
L0DefaultTimeout);
if (*Itr == AsyncQueue->KernelEvent)
WaitDone = true;
}
if (auto Err = releaseEvent(*Itr))
addError(std::move(Err));
SyncErrors = joinErrors(std::move(SyncErrors), std::move(Err));
}
}
// In either case, all the events are now reset and released
@ -809,6 +806,10 @@ Error L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
bool CopyTo) {
const bool Ordered =
(getPlugin().getOptions().CommandMode == CommandModeTy::AsyncOrdered);
auto CmdListOrError = getImmCopyCmdList();
if (!CmdListOrError)
return CmdListOrError.takeError();
const auto CmdList = *CmdListOrError;
auto EventOrErr = getEvent();
if (!EventOrErr)
return EventOrErr.takeError();
@ -826,14 +827,19 @@ Error L0DeviceTy::enqueueMemCopyAsync(void *Dst, const void *Src, size_t Size,
else
NumWaitEvents = 0;
}
auto CmdListOrError = getImmCopyCmdList();
if (!CmdListOrError)
return CmdListOrError.takeError();
const auto CmdList = *CmdListOrError;
CALL_ZE_RET_ERROR(zeCommandListAppendMemoryCopy, CmdList, Dst, Src, Size,
SignalEvent, NumWaitEvents, WaitEvents);
AsyncQueue->WaitEvents.push_back(SignalEvent);
return Plugin::success();
Error AllErrors = Error::success();
CALL_ZE_ACCUM_ERROR(AllErrors, zeCommandListAppendMemoryCopy, CmdList, Dst,
Src, Size, SignalEvent, NumWaitEvents, WaitEvents);
if (!AllErrors)
AsyncQueue->WaitEvents.push_back(SignalEvent);
else {
if (auto Err = releaseEvent(SignalEvent))
AllErrors = joinErrors(std::move(AllErrors), std::move(Err));
}
return AllErrors;
}
/// Enqueue memory fill.
@ -847,10 +853,16 @@ Error L0DeviceTy::enqueueMemFill(void *Ptr, const void *Pattern,
auto EventOrErr = getEvent();
if (!EventOrErr)
return EventOrErr.takeError();
Error AllErrors = Error::success();
ze_event_handle_t Event = *EventOrErr;
CALL_ZE_RET_ERROR(zeCommandListAppendMemoryFill, CmdList, Ptr, Pattern,
PatternSize, Size, Event, 0, nullptr);
CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout);
CALL_ZE_ACCUM_ERROR(AllErrors, zeCommandListAppendMemoryFill, CmdList, Ptr,
Pattern, PatternSize, Size, Event, 0, nullptr);
if (!AllErrors)
CALL_ZE_ACCUM_ERROR(AllErrors, zeEventHostSynchronize, Event,
L0DefaultTimeout);
if (auto Err = releaseEvent(Event))
AllErrors = joinErrors(std::move(AllErrors), std::move(Err));
return AllErrors;
} else {
auto CmdListOrErr = getCopyCmdList();
if (!CmdListOrErr)

View File

@ -282,9 +282,17 @@ static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
}
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Kernel depends on %zu data copying events.\n", NumWaitEvents);
CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
&KEnv.GroupCounts, Event, NumWaitEvents, WaitEvents);
KEnv.KernelPR.Mtx.unlock();
Error AllErrors = Error::success();
CALL_ZE_ACCUM_ERROR(AllErrors, zeCommandListAppendLaunchKernel, CmdList,
zeKernel, &KEnv.GroupCounts, Event, NumWaitEvents,
WaitEvents);
KEnv.Lock.unlock();
if (AllErrors) {
if (auto Err = l0Device.releaseEvent(Event))
AllErrors = joinErrors(std::move(AllErrors), std::move(Err));
return AllErrors;
}
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Submitted kernel " DPxMOD " to device %s\n", DPxPTR(zeKernel), IdStr);
@ -292,9 +300,12 @@ static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device,
AsyncQueue->WaitEvents.push_back(Event);
AsyncQueue->KernelEvent = Event;
} else {
CALL_ZE_RET_ERROR(zeEventHostSynchronize, Event, L0DefaultTimeout);
CALL_ZE_ACCUM_ERROR(AllErrors, zeEventHostSynchronize, Event,
L0DefaultTimeout);
if (auto Err = l0Device.releaseEvent(Event))
return Err;
AllErrors = joinErrors(std::move(AllErrors), std::move(Err));
if (AllErrors)
return AllErrors;
}
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId,
"Executed kernel entry " DPxMOD " on device %s\n", DPxPTR(zeKernel),
@ -324,7 +335,7 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device,
ze_event_handle_t Event = nullptr;
CALL_ZE_RET_ERROR(zeCommandListAppendLaunchKernel, CmdList, zeKernel,
&KEnv.GroupCounts, Event, 0, nullptr);
KEnv.KernelPR.Mtx.unlock();
KEnv.Lock.unlock();
CALL_ZE_RET_ERROR(zeCommandListClose, CmdList);
CALL_ZE_RET_ERROR_MTX(zeCommandQueueExecuteCommandLists, l0Device.getMutex(),
CmdQueue, 1, &CmdList, nullptr);
@ -445,7 +456,7 @@ Error L0KernelTy::launchImpl(GenericDeviceTy &GenericDevice,
L0LaunchEnvTy KEnv(IsAsync, AsyncQueue, KernelPR);
// Protect from kernel preparation to submission as kernels are shared.
KernelPR.Mtx.lock();
KEnv.Lock.lock();
if (auto Err = setKernelGroups(l0Device, KEnv, NumThreads, NumBlocks))
return Err;