The current only way to obtain pinned memory with libomptarget is to use a custom allocator llvm_omp_target_alloc_host. This reflects well the CUDA implementation of libomptarget, but it does not correctly expose the AMDGPU runtime API, where any system allocated page can be locked/unlocked through a call to hsa_amd_memory_lock/unlock. This patch enables users to allocate memory through malloc (mmap, sbreak) and then pin the related memory pages with a libomptarget special call. It is a base support in the amdgpu libomptarget plugin to enable users to prelock their host memory pages so that the runtime doesn't need to lock them itself for asynchronous memory transfers. Reviewed By: jdoerfert, ye-luo Differential Revision: https://reviews.llvm.org/D139208
183 lines
6.4 KiB
C++
183 lines
6.4 KiB
C++
//===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
#include "rt.h"
|
|
#include <memory>
|
|
|
|
/*
|
|
* Data
|
|
*/
|
|
|
|
hsa_status_t is_locked(void *ptr, void **agentBaseAddress) {
|
|
hsa_status_t err = HSA_STATUS_SUCCESS;
|
|
hsa_amd_pointer_info_t info;
|
|
info.size = sizeof(hsa_amd_pointer_info_t);
|
|
err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr,
|
|
/*num_agents_accessible=*/nullptr,
|
|
/*accessible=*/nullptr);
|
|
if (err != HSA_STATUS_SUCCESS) {
|
|
DP("Error when getting pointer info\n");
|
|
return err;
|
|
}
|
|
|
|
if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) {
|
|
// When user passes in a basePtr+offset we need to fix the
|
|
// locked pointer to include the offset: ROCr always returns
|
|
// the base locked address, not the shifted one.
|
|
if ((char *)info.hostBaseAddress <= (char *)ptr &&
|
|
(char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes)
|
|
*agentBaseAddress =
|
|
(void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr -
|
|
(uint64_t)info.hostBaseAddress);
|
|
else // address is already device-agent accessible, no need to compute
|
|
// offset
|
|
*agentBaseAddress = ptr;
|
|
} else
|
|
*agentBaseAddress = nullptr;
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
// host pointer (either src or dest) must be locked via hsa_amd_memory_lock
|
|
static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest,
|
|
hsa_agent_t agent, const void *src,
|
|
size_t size) {
|
|
const hsa_signal_value_t init = 1;
|
|
const hsa_signal_value_t success = 0;
|
|
hsa_signal_store_screlease(signal, init);
|
|
|
|
hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0,
|
|
nullptr, signal);
|
|
if (err != HSA_STATUS_SUCCESS)
|
|
return err;
|
|
|
|
// async_copy reports success by decrementing and failure by setting to < 0
|
|
hsa_signal_value_t got = init;
|
|
while (got == init)
|
|
got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init,
|
|
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
|
|
|
|
if (got != success)
|
|
return HSA_STATUS_ERROR;
|
|
|
|
return err;
|
|
}
|
|
|
|
struct implFreePtrDeletor {
|
|
void operator()(void *p) {
|
|
core::Runtime::Memfree(p); // ignore failure to free
|
|
}
|
|
};
|
|
|
|
enum CopyDirection { H2D, D2H };
|
|
|
|
static hsa_status_t locking_async_memcpy(enum CopyDirection direction,
|
|
hsa_signal_t signal, void *dest,
|
|
hsa_agent_t agent, void *src,
|
|
void *lockingPtr, size_t size) {
|
|
void *lockedPtr = nullptr;
|
|
hsa_status_t err = is_locked(lockingPtr, &lockedPtr);
|
|
bool HostPtrIsLocked = true;
|
|
if (err != HSA_STATUS_SUCCESS)
|
|
return err;
|
|
if (!lockedPtr) { // not locked
|
|
HostPtrIsLocked = false;
|
|
hsa_agent_t agents[1] = {agent};
|
|
err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1,
|
|
(void **)&lockedPtr);
|
|
if (err != HSA_STATUS_SUCCESS)
|
|
return err;
|
|
DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n",
|
|
lockingPtr, lockedPtr, size);
|
|
}
|
|
|
|
switch (direction) {
|
|
case H2D:
|
|
err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size);
|
|
break;
|
|
case D2H:
|
|
err = invoke_hsa_copy(signal, lockedPtr, agent, src, size);
|
|
break;
|
|
}
|
|
|
|
if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) {
|
|
// do not leak locked host pointers, but discard potential error message
|
|
// because the initial error was in the copy function
|
|
hsa_amd_memory_unlock(lockingPtr);
|
|
return err;
|
|
}
|
|
|
|
// unlock only if not user locked
|
|
if (!HostPtrIsLocked)
|
|
err = hsa_amd_memory_unlock(lockingPtr);
|
|
if (err != HSA_STATUS_SUCCESS)
|
|
return err;
|
|
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|
|
|
|
hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest,
|
|
void *hostSrc, size_t size,
|
|
hsa_agent_t device_agent,
|
|
hsa_amd_memory_pool_t MemoryPool) {
|
|
hsa_status_t err;
|
|
|
|
err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
|
|
device_agent, hostSrc, hostSrc, size);
|
|
|
|
if (err == HSA_STATUS_SUCCESS)
|
|
return err;
|
|
|
|
// async memcpy sometimes fails in situations where
|
|
// allocate + copy succeeds. Looks like it might be related to
|
|
// locking part of a read only segment. Fall back for now.
|
|
void *tempHostPtr;
|
|
hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
|
|
if (ret != HSA_STATUS_SUCCESS) {
|
|
DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
|
|
return ret;
|
|
}
|
|
std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
|
|
memcpy(tempHostPtr, hostSrc, size);
|
|
|
|
return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest,
|
|
device_agent, tempHostPtr, tempHostPtr, size);
|
|
}
|
|
|
|
hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest,
|
|
void *deviceSrc, size_t size,
|
|
hsa_agent_t deviceAgent,
|
|
hsa_amd_memory_pool_t MemoryPool) {
|
|
hsa_status_t err;
|
|
|
|
// device has always visibility over both pointers, so use that
|
|
err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent,
|
|
deviceSrc, hostDest, size);
|
|
|
|
if (err == HSA_STATUS_SUCCESS)
|
|
return err;
|
|
|
|
// hsa_memory_copy sometimes fails in situations where
|
|
// allocate + copy succeeds. Looks like it might be related to
|
|
// locking part of a read only segment. Fall back for now.
|
|
void *tempHostPtr;
|
|
hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool);
|
|
if (ret != HSA_STATUS_SUCCESS) {
|
|
DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size);
|
|
return ret;
|
|
}
|
|
std::unique_ptr<void, implFreePtrDeletor> del(tempHostPtr);
|
|
|
|
err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr,
|
|
deviceAgent, deviceSrc, tempHostPtr, size);
|
|
if (err != HSA_STATUS_SUCCESS)
|
|
return HSA_STATUS_ERROR;
|
|
|
|
memcpy(hostDest, tempHostPtr, size);
|
|
return HSA_STATUS_SUCCESS;
|
|
}
|