[libc] Efficiently implement 'realloc' for AMDGPU devices (#145960)

Summary: Now that we have `malloc`, we can implement `realloc` efficiently. This uses the known chunk sizes to avoid unnecessary allocations. We just return nullptr for NVPTX. I'd remove the entrypoint from the list, but then the libc++ code would stop working. When someone writes the NVPTX support, this will be trivial.
2025-06-30 08:39:40 -05:00 · 2025-06-30 08:39:40 -05:00 · 10445acfa6
commit 10445acfa6
parent d7e23bef6a
5 changed files with 91 additions and 11 deletions
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@ -22,6 +22,7 @@
 #include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 #include "src/__support/threads/sleep.h"
+#include "src/string/memory_utils/inline_memcpy.h"

 namespace LIBC_NAMESPACE_DECL {

@ -550,5 +551,26 @@ void deallocate(void *ptr) {
  release_slab(slab);
 }

+// Resizes the allocation at 'ptr' so that it can hold at least 'size' bytes.
+//
+// A null 'ptr' behaves like 'gpu::allocate(size)'. Pointers that do not come
+// from a slab are treated as foreign and rejected with nullptr. If the chunk
+// that already backs 'ptr' is large enough, 'ptr' is returned unchanged.
+// Otherwise a new chunk is allocated, the old chunk's contents are copied into
+// it, and the old chunk is released. If that allocation fails, nullptr is
+// returned and 'ptr' is left allocated and untouched.
+void *reallocate(void *ptr, uint64_t size) {
+  if (ptr == nullptr)
+    return gpu::allocate(size);
+
+  // Non-slab allocations are considered foreign pointers so we fail.
+  if ((reinterpret_cast<uintptr_t>(ptr) & SLAB_ALIGNMENT) == 0)
+    return nullptr;
+
+  // The original slab pointer is the 2MiB boundary using the given pointer.
+  Slab *slab = cpp::launder(reinterpret_cast<Slab *>(
+      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT)));
+  if (slab->get_chunk_size() >= size)
+    return ptr;
+
+  // If we need a new chunk we reallocate and copy it over.
+  void *new_ptr = gpu::allocate(size);
+
+  // A failed allocation must not write through a null destination or release
+  // the caller's data; the original allocation stays owned by the caller.
+  if (new_ptr == nullptr)
+    return nullptr;
+
+  // Only the old chunk's bytes are meaningful, and the new chunk is larger.
+  inline_memcpy(new_ptr, ptr, slab->get_chunk_size());
+  gpu::deallocate(ptr);
+  return new_ptr;
+}
+
 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
--- a/libc/src/__support/GPU/allocator.h
+++ b/libc/src/__support/GPU/allocator.h
@ -17,6 +17,7 @@ namespace gpu {

 void *allocate(uint64_t size);
 void deallocate(void *ptr);
+// Resizes the allocation at 'ptr' to hold at least 'size' bytes and returns
+// the resulting pointer, which may differ from 'ptr'. A null 'ptr' allocates
+// fresh memory; nullptr is returned for pointers the allocator does not own.
+void *reallocate(void *ptr, uint64_t size);

 } // namespace gpu
 } // namespace LIBC_NAMESPACE_DECL
--- a/libc/src/stdlib/gpu/realloc.cpp
+++ b/libc/src/stdlib/gpu/realloc.cpp
@ -16,17 +16,15 @@
 namespace LIBC_NAMESPACE_DECL {

 LLVM_LIBC_FUNCTION(void *, realloc, (void *ptr, size_t size)) {
-  if (ptr == nullptr)
-    return gpu::allocate(size);
-
-  void *newmem = gpu::allocate(size);
-  if (newmem == nullptr)
-    return nullptr;
-
-  // This will copy garbage if it goes beyond the old allocation size.
-  inline_memcpy(newmem, ptr, size);
-  gpu::deallocate(ptr);
-  return newmem;
+  // FIXME: NVIDIA targets currently use the built-in 'malloc' which we cannot
+  // reason with. But we still need to provide this function for compatibility.
+#ifndef LIBC_TARGET_ARCH_IS_NVPTX
+  return gpu::reallocate(ptr, size);
+#else
+  (void)ptr;
+  (void)size;
+  return nullptr;
+#endif
 }

 } // namespace LIBC_NAMESPACE_DECL
--- a/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
+++ b/libc/test/integration/src/stdlib/gpu/CMakeLists.txt
@ -17,6 +17,21 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
      --blocks 1024
  )

+  # Integration test for the GPU 'realloc' entrypoint. It depends on the
+  # 'malloc', 'free', and 'realloc' entrypoints and passes '--threads 256' and
+  # '--blocks 1024' to the loader.
+  add_integration_test(
+    realloc
+    SUITE
+      stdlib-gpu-integration-tests
+    SRCS
+      realloc.cpp
+    DEPENDS
+      libc.src.stdlib.malloc
+      libc.src.stdlib.free
+      libc.src.stdlib.realloc
+    LOADER_ARGS
+      --threads 256
+      --blocks 1024
+  )
+
  add_integration_test(
    malloc_stress
    SUITE
--- a/libc/test/integration/src/stdlib/gpu/realloc.cpp
+++ b/libc/test/integration/src/stdlib/gpu/realloc.cpp
@ -0,0 +1,44 @@
+#include "test/IntegrationTest/test.h"
+
+#include "src/__support/GPU/utils.h"
+#include "src/stdlib/free.h"
+#include "src/stdlib/malloc.h"
+#include "src/stdlib/realloc.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Exercises 'realloc': the null-pointer case, same-size and shrinking
+// requests, a growing request, and requests whose size depends on the thread
+// id. Every allocation made here is released before returning.
+TEST_MAIN(int, char **, char **) {
+  // realloc(nullptr, size) is equivalent to malloc.
+  int *alloc = reinterpret_cast<int *>(LIBC_NAMESPACE::realloc(nullptr, 32));
+  EXPECT_NE(alloc, nullptr);
+  *alloc = 42;
+  EXPECT_EQ(*alloc, 42);
+
+  // realloc to same size returns the same pointer.
+  void *same = LIBC_NAMESPACE::realloc(alloc, 32);
+  EXPECT_EQ(same, alloc);
+  EXPECT_EQ(reinterpret_cast<int *>(same)[0], 42);
+
+  // realloc to smaller size returns same pointer.
+  void *smaller = LIBC_NAMESPACE::realloc(same, 16);
+  EXPECT_EQ(smaller, alloc);
+  EXPECT_EQ(reinterpret_cast<int *>(smaller)[0], 42);
+
+  // realloc to larger size returns new pointer and preserves contents.
+  int *larger = reinterpret_cast<int *>(LIBC_NAMESPACE::realloc(smaller, 128));
+  EXPECT_NE(larger, nullptr);
+  EXPECT_EQ(larger[0], 42);
+  // 'larger' is the only live handle to this allocation, so release it here
+  // rather than leaking one block per thread.
+  LIBC_NAMESPACE::free(larger);
+
+  // realloc works when called with a divergent size.
+  int *div = reinterpret_cast<int *>(
+      LIBC_NAMESPACE::malloc((gpu::get_thread_id() + 1) * 16));
+  EXPECT_NE(div, nullptr);
+  div[0] = static_cast<int>(gpu::get_thread_id());
+  int *div_realloc = reinterpret_cast<int *>(
+      LIBC_NAMESPACE::realloc(div, ((gpu::get_thread_id() + 1) * 32)));
+  EXPECT_NE(div_realloc, nullptr);
+  EXPECT_EQ(div_realloc[0], static_cast<int>(gpu::get_thread_id()));
+  LIBC_NAMESPACE::free(div_realloc);
+
+  return 0;
+}