
Comparing addresses without explicit nullptr checks seems to produce false positives. Use explicit nullptr comparisons so that cleared entries sort to the end of the array.
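For illustration, a minimal standalone sketch of the intended ordering (the Alloc struct, cmp comparator, and main program below are hypothetical stand-ins mirroring compareDeviceAlloc, not part of the runtime file that follows): an entry whose ptr has been cleared to nullptr compares greater than every live entry, so after re-sorting it lands at the end of the array and can be dropped by shrinking the count instead of shadowing a live allocation in later lookups.

#include <cstdio>
#include <cstdlib>

struct Alloc {
  void *ptr;
};

// Mirrors compareDeviceAlloc: explicit nullptr checks push cleared slots last.
static int cmp(const void *a, const void *b) {
  const Alloc *x = static_cast<const Alloc *>(a);
  const Alloc *y = static_cast<const Alloc *>(b);
  if (x->ptr == nullptr && y->ptr == nullptr)
    return 0;
  if (x->ptr == nullptr)
    return 1;
  if (y->ptr == nullptr)
    return -1;
  return x->ptr < y->ptr ? -1 : (x->ptr > y->ptr ? 1 : 0);
}

int main() {
  int a, b;
  // The middle slot has been "erased" by clearing its pointer.
  Alloc allocs[]{{&b}, {nullptr}, {&a}};
  std::qsort(allocs, 3, sizeof(Alloc), cmp);
  // The cleared slot now sits past the live entries and can be dropped by
  // shrinking the element count, as eraseAllocation does.
  std::printf("cleared slot is last: %s\n",
              allocs[2].ptr == nullptr ? "yes" : "no");
  return 0;
}
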
//===-- lib/cuda/allocator.cpp ----------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Runtime/CUDA/allocator.h"
#include "flang-rt/runtime/allocator-registry.h"
#include "flang-rt/runtime/derived.h"
#include "flang-rt/runtime/descriptor.h"
#include "flang-rt/runtime/environment.h"
#include "flang-rt/runtime/lock.h"
#include "flang-rt/runtime/stat.h"
#include "flang-rt/runtime/terminator.h"
#include "flang-rt/runtime/type-info.h"
#include "flang/Common/ISO_Fortran_binding_wrapper.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Support/Fortran.h"

#include "cuda_runtime.h"

namespace Fortran::runtime::cuda {

// Entry for an asynchronous allocation, tracked so it can later be freed on
// the stream it was allocated on.
struct DeviceAllocation {
  void *ptr;
  std::size_t size;
  cudaStream_t stream;
};

// Compare address values. nullptr entries are sorted to the end of the array.
int compareDeviceAlloc(const void *a, const void *b) {
  const DeviceAllocation *deva = (const DeviceAllocation *)a;
  const DeviceAllocation *devb = (const DeviceAllocation *)b;
  if (deva->ptr == nullptr && devb->ptr == nullptr)
    return 0;
  if (deva->ptr == nullptr)
    return 1;
  if (devb->ptr == nullptr)
    return -1;
  return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
}

// Dynamic array for tracking asynchronous allocations.
static DeviceAllocation *deviceAllocations = nullptr;
Lock lock;
static int maxDeviceAllocations{512}; // Initial size
static int numDeviceAllocations{0};
static constexpr int allocNotFound{-1};

static void initAllocations() {
  if (!deviceAllocations) {
    deviceAllocations = static_cast<DeviceAllocation *>(
        malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
    if (!deviceAllocations) {
      Terminator terminator{__FILE__, __LINE__};
      terminator.Crash("Failed to allocate tracking array");
    }
  }
}

static void doubleAllocationArray() {
  unsigned newSize = maxDeviceAllocations * 2;
  DeviceAllocation *newArray = static_cast<DeviceAllocation *>(
      realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
  if (!newArray) {
    Terminator terminator{__FILE__, __LINE__};
    terminator.Crash("Failed to reallocate tracking array");
  }
  deviceAllocations = newArray;
  maxDeviceAllocations = newSize;
}

static int findAllocation(void *ptr) {
  if (numDeviceAllocations == 0) {
    return allocNotFound;
  }

  int left{0};
  int right{numDeviceAllocations - 1};

  while (left <= right) {
    int mid = left + (right - left) / 2;
    if (deviceAllocations[mid].ptr == ptr) {
      return mid;
    }
    if (deviceAllocations[mid].ptr < ptr) {
      left = mid + 1;
    } else {
      right = mid - 1;
    }
  }
  return allocNotFound;
}

static void insertAllocation(void *ptr, std::size_t size, cudaStream_t stream) {
  CriticalSection critical{lock};
  initAllocations();
  if (numDeviceAllocations >= maxDeviceAllocations) {
    doubleAllocationArray();
  }
  deviceAllocations[numDeviceAllocations].ptr = ptr;
  deviceAllocations[numDeviceAllocations].size = size;
  deviceAllocations[numDeviceAllocations].stream = stream;
  ++numDeviceAllocations;
  // Keep the array sorted by address so findAllocation can binary search.
  qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
      compareDeviceAlloc);
}

static void eraseAllocation(int pos) {
  // Clear the slot and re-sort; the nullptr entry moves to the end of the
  // array, so decrementing the count drops it.
  deviceAllocations[pos].ptr = nullptr;
  deviceAllocations[pos].size = 0;
  deviceAllocations[pos].stream = (cudaStream_t)0;
  qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
      compareDeviceAlloc);
  --numDeviceAllocations;
}

extern "C" {
|
|
|
|
void RTDEF(CUFRegisterAllocator)() {
|
|
allocatorRegistry.Register(
|
|
kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned});
|
|
allocatorRegistry.Register(
|
|
kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice});
|
|
allocatorRegistry.Register(
|
|
kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged});
|
|
allocatorRegistry.Register(
|
|
kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified});
|
|
}
|
|
}
|
|
|
|
void *CUFAllocPinned(
    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
  void *p;
  CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
  return p;
}

void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }

void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) {
  void *p;
  if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
    CUDA_REPORT_IF_ERROR(
        cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
  } else {
    if (asyncObject == nullptr) {
      CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
    } else {
      // Stream-ordered allocation; remember the stream so the matching
      // deallocation in CUFFreeDevice can be issued on it.
      CUDA_REPORT_IF_ERROR(
          cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject));
      insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject);
    }
  }
  return p;
}

void CUFFreeDevice(void *p) {
  CriticalSection critical{lock};
  // If the pointer was allocated asynchronously, free it on its stream.
  int pos = findAllocation(p);
  if (pos >= 0) {
    cudaStream_t stream = deviceAllocations[pos].stream;
    eraseAllocation(pos);
    CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
  } else {
    CUDA_REPORT_IF_ERROR(cudaFree(p));
  }
}

void *CUFAllocManaged(
    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
  void *p;
  CUDA_REPORT_IF_ERROR(
      cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
  return p;
}

void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }

void *CUFAllocUnified(
    std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
  // Use the managed allocator for the time being.
  return CUFAllocManaged(sizeInBytes, asyncObject);
}

void CUFFreeUnified(void *p) {
  // Use the managed deallocator for the time being.
  CUFFreeManaged(p);
}

} // namespace Fortran::runtime::cuda
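
For context, a minimal standalone sketch (a hypothetical test program, not part of this file; it assumes a CUDA 11.2+ toolkit and a device with stream-ordered memory pool support) of the cudaMallocAsync / cudaFreeAsync pattern that CUFAllocDevice and CUFFreeDevice wrap. Recording the stream per allocation, as the tracking array above does, lets the free be issued on the same stream the allocation was ordered on.

#include <cstdio>
#include "cuda_runtime.h"

int main() {
  cudaStream_t stream;
  if (cudaStreamCreate(&stream) != cudaSuccess) {
    std::fprintf(stderr, "no usable CUDA device\n");
    return 1;
  }
  void *p = nullptr;
  // Allocation and deallocation are both ordered on `stream`; cudaFreeAsync
  // takes a stream argument, and the free here names the allocation's own
  // stream, matching what CUFFreeDevice does with the recorded stream.
  if (cudaMallocAsync(&p, 1024, stream) == cudaSuccess) {
    cudaFreeAsync(p, stream);
  }
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  return 0;
}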