//===--- amdgpu/impl/impl.cpp ------------------------------------- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "rt.h" #include /* * Data */ hsa_status_t is_locked(void *ptr, void **agentBaseAddress) { hsa_status_t err = HSA_STATUS_SUCCESS; hsa_amd_pointer_info_t info; info.size = sizeof(hsa_amd_pointer_info_t); err = hsa_amd_pointer_info(ptr, &info, /*alloc=*/nullptr, /*num_agents_accessible=*/nullptr, /*accessible=*/nullptr); if (err != HSA_STATUS_SUCCESS) { DP("Error when getting pointer info\n"); return err; } if (info.type == HSA_EXT_POINTER_TYPE_LOCKED) { // When user passes in a basePtr+offset we need to fix the // locked pointer to include the offset: ROCr always returns // the base locked address, not the shifted one. if ((char *)info.hostBaseAddress <= (char *)ptr && (char *)ptr < (char *)info.hostBaseAddress + info.sizeInBytes) *agentBaseAddress = (void *)((uint64_t)info.agentBaseAddress + (uint64_t)ptr - (uint64_t)info.hostBaseAddress); else // address is already device-agent accessible, no need to compute // offset *agentBaseAddress = ptr; } else *agentBaseAddress = nullptr; return HSA_STATUS_SUCCESS; } // host pointer (either src or dest) must be locked via hsa_amd_memory_lock static hsa_status_t invoke_hsa_copy(hsa_signal_t signal, void *dest, hsa_agent_t agent, const void *src, size_t size) { const hsa_signal_value_t init = 1; const hsa_signal_value_t success = 0; hsa_signal_store_screlease(signal, init); hsa_status_t err = hsa_amd_memory_async_copy(dest, agent, src, agent, size, 0, nullptr, signal); if (err != HSA_STATUS_SUCCESS) return err; // async_copy reports success by decrementing and failure by setting to < 0 hsa_signal_value_t got = init; while (got == init) got = hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_NE, init, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); if (got != success) return HSA_STATUS_ERROR; return err; } struct implFreePtrDeletor { void operator()(void *p) { core::Runtime::Memfree(p); // ignore failure to free } }; enum CopyDirection { H2D, D2H }; static hsa_status_t locking_async_memcpy(enum CopyDirection direction, hsa_signal_t signal, void *dest, hsa_agent_t agent, void *src, void *lockingPtr, size_t size) { void *lockedPtr = nullptr; hsa_status_t err = is_locked(lockingPtr, &lockedPtr); bool HostPtrIsLocked = true; if (err != HSA_STATUS_SUCCESS) return err; if (!lockedPtr) { // not locked HostPtrIsLocked = false; hsa_agent_t agents[1] = {agent}; err = hsa_amd_memory_lock(lockingPtr, size, agents, /*num_agent=*/1, (void **)&lockedPtr); if (err != HSA_STATUS_SUCCESS) return err; DP("locking_async_memcpy: lockingPtr=%p lockedPtr=%p Size = %lu\n", lockingPtr, lockedPtr, size); } switch (direction) { case H2D: err = invoke_hsa_copy(signal, dest, agent, lockedPtr, size); break; case D2H: err = invoke_hsa_copy(signal, lockedPtr, agent, src, size); break; } if (err != HSA_STATUS_SUCCESS && !HostPtrIsLocked) { // do not leak locked host pointers, but discard potential error message // because the initial error was in the copy function hsa_amd_memory_unlock(lockingPtr); return err; } // unlock only if not user locked if (!HostPtrIsLocked) err = hsa_amd_memory_unlock(lockingPtr); if (err != HSA_STATUS_SUCCESS) return err; return HSA_STATUS_SUCCESS; } hsa_status_t impl_memcpy_h2d(hsa_signal_t signal, void *deviceDest, void *hostSrc, size_t size, hsa_agent_t device_agent, hsa_amd_memory_pool_t MemoryPool) { hsa_status_t err; err = locking_async_memcpy(CopyDirection::H2D, signal, deviceDest, device_agent, hostSrc, hostSrc, size); if (err == HSA_STATUS_SUCCESS) return err; // async memcpy sometimes fails in situations where // allocate + copy succeeds. Looks like it might be related to // locking part of a read only segment. Fall back for now. void *tempHostPtr; hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); if (ret != HSA_STATUS_SUCCESS) { DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size); return ret; } std::unique_ptr del(tempHostPtr); memcpy(tempHostPtr, hostSrc, size); return locking_async_memcpy(CopyDirection::H2D, signal, deviceDest, device_agent, tempHostPtr, tempHostPtr, size); } hsa_status_t impl_memcpy_d2h(hsa_signal_t signal, void *hostDest, void *deviceSrc, size_t size, hsa_agent_t deviceAgent, hsa_amd_memory_pool_t MemoryPool) { hsa_status_t err; // device has always visibility over both pointers, so use that err = locking_async_memcpy(CopyDirection::D2H, signal, hostDest, deviceAgent, deviceSrc, hostDest, size); if (err == HSA_STATUS_SUCCESS) return err; // hsa_memory_copy sometimes fails in situations where // allocate + copy succeeds. Looks like it might be related to // locking part of a read only segment. Fall back for now. void *tempHostPtr; hsa_status_t ret = core::Runtime::HostMalloc(&tempHostPtr, size, MemoryPool); if (ret != HSA_STATUS_SUCCESS) { DP("HostMalloc: Unable to alloc %zu bytes for temp scratch\n", size); return ret; } std::unique_ptr del(tempHostPtr); err = locking_async_memcpy(CopyDirection::D2H, signal, tempHostPtr, deviceAgent, deviceSrc, tempHostPtr, size); if (err != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR; memcpy(hostDest, tempHostPtr, size); return HSA_STATUS_SUCCESS; }