//===-- lib/cuda/allocator.cpp ----------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "flang/Runtime/CUDA/allocator.h"
#include "flang-rt/runtime/allocator-registry.h"
#include "flang-rt/runtime/derived.h"
#include "flang-rt/runtime/descriptor.h"
#include "flang-rt/runtime/environment.h"
#include "flang-rt/runtime/lock.h"
#include "flang-rt/runtime/stat.h"
#include "flang-rt/runtime/terminator.h"
#include "flang-rt/runtime/type-info.h"
#include "flang/Common/ISO_Fortran_binding_wrapper.h"
#include "flang/Runtime/CUDA/common.h"
#include "flang/Support/Fortran.h"
#include "cuda_runtime.h"
namespace Fortran::runtime::cuda {
struct DeviceAllocation {
void *ptr;
std::size_t size;
cudaStream_t stream;
};
// Compare addresses; entries whose ptr is nullptr sort to the end of the array.
int compareDeviceAlloc(const void *a, const void *b) {
const DeviceAllocation *deva = (const DeviceAllocation *)a;
const DeviceAllocation *devb = (const DeviceAllocation *)b;
if (deva->ptr == nullptr && devb->ptr == nullptr)
return 0;
if (deva->ptr == nullptr)
return 1;
if (devb->ptr == nullptr)
return -1;
return deva->ptr < devb->ptr ? -1 : (deva->ptr > devb->ptr ? 1 : 0);
}
// Dynamic array for tracking asynchronous allocations.
static DeviceAllocation *deviceAllocations = nullptr;
Lock lock;
static int maxDeviceAllocations{512}; // Initial size
static int numDeviceAllocations{0};
static constexpr int allocNotFound{-1};
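
// Lazily create the tracking array the first time an asynchronous
// allocation is recorded.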
static void initAllocations() {
if (!deviceAllocations) {
deviceAllocations = static_cast<DeviceAllocation *>(
malloc(maxDeviceAllocations * sizeof(DeviceAllocation)));
if (!deviceAllocations) {
Terminator terminator{__FILE__, __LINE__};
terminator.Crash("Failed to allocate tracking array");
}
}
}
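
// Double the capacity of the tracking array when it is full.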
static void doubleAllocationArray() {
unsigned newSize = maxDeviceAllocations * 2;
DeviceAllocation *newArray = static_cast<DeviceAllocation *>(
realloc(deviceAllocations, newSize * sizeof(DeviceAllocation)));
if (!newArray) {
Terminator terminator{__FILE__, __LINE__};
terminator.Crash("Failed to reallocate tracking array");
}
deviceAllocations = newArray;
maxDeviceAllocations = newSize;
}
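
// Binary search the address-sorted tracking array; returns the index of
// ptr, or allocNotFound when ptr is not tracked.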
static int findAllocation(void *ptr) {
if (numDeviceAllocations == 0) {
return allocNotFound;
}
int left{0};
int right{numDeviceAllocations - 1};
  if (left == right) { // Only one entry; verify it actually matches.
    return deviceAllocations[left].ptr == ptr ? left : allocNotFound;
  }
while (left <= right) {
int mid = left + (right - left) / 2;
if (deviceAllocations[mid].ptr == ptr) {
return mid;
}
if (deviceAllocations[mid].ptr < ptr) {
left = mid + 1;
} else {
right = mid - 1;
}
}
return allocNotFound;
}
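
// Record a stream-ordered allocation and keep the array sorted by address.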
static void insertAllocation(void *ptr, std::size_t size, cudaStream_t stream) {
CriticalSection critical{lock};
initAllocations();
if (numDeviceAllocations >= maxDeviceAllocations) {
doubleAllocationArray();
}
deviceAllocations[numDeviceAllocations].ptr = ptr;
deviceAllocations[numDeviceAllocations].size = size;
deviceAllocations[numDeviceAllocations].stream = stream;
++numDeviceAllocations;
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
compareDeviceAlloc);
}
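
// Clear the entry at pos; re-sorting moves the cleared slot to the end so
// the live entries stay contiguous and sorted.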
static void eraseAllocation(int pos) {
deviceAllocations[pos].ptr = nullptr;
deviceAllocations[pos].size = 0;
deviceAllocations[pos].stream = (cudaStream_t)0;
qsort(deviceAllocations, numDeviceAllocations, sizeof(DeviceAllocation),
compareDeviceAlloc);
--numDeviceAllocations;
}
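
// Register the CUDA allocators with the runtime's allocator registry.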
extern "C" {
void RTDEF(CUFRegisterAllocator)() {
allocatorRegistry.Register(
kPinnedAllocatorPos, {&CUFAllocPinned, CUFFreePinned});
allocatorRegistry.Register(
kDeviceAllocatorPos, {&CUFAllocDevice, CUFFreeDevice});
allocatorRegistry.Register(
kManagedAllocatorPos, {&CUFAllocManaged, CUFFreeManaged});
allocatorRegistry.Register(
kUnifiedAllocatorPos, {&CUFAllocUnified, CUFFreeUnified});
}
}
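
// Pinned (page-locked) host memory.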
void *CUFAllocPinned(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
void *p;
CUDA_REPORT_IF_ERROR(cudaMallocHost((void **)&p, sizeInBytes));
return p;
}
void CUFFreePinned(void *p) { CUDA_REPORT_IF_ERROR(cudaFreeHost(p)); }
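
// Device allocation: use managed memory when the environment requests it;
// otherwise use cudaMalloc, or cudaMallocAsync (tracked) when a stream is
// provided via asyncObject.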
void *CUFAllocDevice(std::size_t sizeInBytes, std::int64_t *asyncObject) {
void *p;
if (Fortran::runtime::executionEnvironment.cudaDeviceIsManaged) {
CUDA_REPORT_IF_ERROR(
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
} else {
if (asyncObject == nullptr) {
CUDA_REPORT_IF_ERROR(cudaMalloc(&p, sizeInBytes));
} else {
CUDA_REPORT_IF_ERROR(
cudaMallocAsync(&p, sizeInBytes, (cudaStream_t)*asyncObject));
insertAllocation(p, sizeInBytes, (cudaStream_t)*asyncObject);
}
}
return p;
}
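
// Free device memory; allocations recorded as stream-ordered are released
// with cudaFreeAsync on their original stream.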
void CUFFreeDevice(void *p) {
CriticalSection critical{lock};
int pos = findAllocation(p);
if (pos >= 0) {
cudaStream_t stream = deviceAllocations[pos].stream;
eraseAllocation(pos);
CUDA_REPORT_IF_ERROR(cudaFreeAsync(p, stream));
} else {
CUDA_REPORT_IF_ERROR(cudaFree(p));
}
}
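
// Managed memory, accessible from both host and device.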
void *CUFAllocManaged(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
void *p;
CUDA_REPORT_IF_ERROR(
cudaMallocManaged((void **)&p, sizeInBytes, cudaMemAttachGlobal));
  return p;
}
void CUFFreeManaged(void *p) { CUDA_REPORT_IF_ERROR(cudaFree(p)); }
void *CUFAllocUnified(
std::size_t sizeInBytes, [[maybe_unused]] std::int64_t *asyncObject) {
// Call alloc managed for the time being.
return CUFAllocManaged(sizeInBytes, asyncObject);
}
void CUFFreeUnified(void *p) {
// Call free managed for the time being.
CUFFreeManaged(p);
}
} // namespace Fortran::runtime::cuda