[libc] Improve SIMT control flow in the GPU allocator

Summary: Independent thread scheduling, introduced with NVIDIA's Volta architecture, is very difficult to work with. This is a first attempt to make the allocator's logic sound when lanes execute independently. It isn't all that's required, but it also ends up improving control flow on AMDGPU.
2026-01-11 07:48:43 -06:00 · 2026-01-11 07:48:43 -06:00 · 185f078a6f
commit 185f078a6f
parent 263802c56b
1 changed files with 46 additions and 39 deletions
--- a/libc/src/__support/GPU/allocator.cpp
+++ b/libc/src/__support/GPU/allocator.cpp
@@ -38,7 +38,7 @@ constexpr static uint32_t MIN_SIZE = 16;
 constexpr static uint32_t MIN_ALIGNMENT = MIN_SIZE - 1;

 // The number of times to attempt claiming an in-progress slab allocation.
-constexpr static uint32_t MAX_TRIES = 1024;
+constexpr static uint32_t MAX_TRIES = 128;

 // The number of previously allocated slabs we will keep in memory.
 constexpr static uint32_t CACHED_SLABS = 8;
@@ -136,11 +136,11 @@ static inline constexpr T round_up(const T x) {
 }

 // Perform a lane parallel memset on a uint32_t pointer.
-void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
-  uint64_t mask = gpu::get_lane_mask();
+void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t lane_mask,
+                    uint64_t uniform) {
  uint32_t workers = cpp::popcount(uniform);
-  for (uint32_t i = impl::lane_count(mask & uniform, gpu::get_lane_id()); i < n;
-       i += workers)
+  for (uint32_t i = impl::lane_count(lane_mask & uniform, gpu::get_lane_id());
+       i < n; i += workers)
    s[i] = c;
 }

@@ -176,6 +176,9 @@ template <typename T> bool is_sentinel(const T &x) {
    return x == cpp::numeric_limits<T>::max();
 }

+// Returns the current lane's position in the lane mask.
+uint64_t id_in_mask() { return 1ull << gpu::get_lane_id(); }
+
 } // namespace impl

 /// A slab allocator used to hand out identically sized slabs of memory.
@@ -220,14 +223,14 @@ struct Slab {
  // Set the necessary bitfield bytes to zero in parallel using many lanes. This
  // must be called before the bitfield can be accessed safely, memory is not
  // guaranteed to be zero initialized in the current implementation.
-  void initialize(uint64_t uniform) {
+  void initialize(uint64_t lane_mask, uint64_t uniform) {
    // If this is a re-used slab the memory is already set to zero.
    if (get_cached_chunk_size() <= get_chunk_size())
      return;

    uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) /
                    sizeof(uint32_t);
-    impl::uniform_memset(get_bitfield(), 0, size, uniform);
+    impl::uniform_memset(get_bitfield(), 0, size, lane_mask, uniform);
  }

  // Get the number of chunks that can theoretically fit inside this slab.
@@ -495,20 +498,18 @@ public:
    result = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), result);
    count = gpu::shuffle(lane_mask, cpp::countr_zero(uniform), count);

-    if (!result)
-      return nullptr;
-
    // We defer storing the newly allocated slab until now so that we can use
    // multiple lanes to initialize it and release it for use.
-    if (impl::is_sentinel(count)) {
-      result->initialize(uniform);
+    uint64_t slab_mask =
+        gpu::ballot(lane_mask, result && impl::is_sentinel(count));
+    if (slab_mask & impl::id_in_mask()) {
+      result->initialize(slab_mask, uniform);
      if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform)))
        finalize(result, cpp::popcount(uniform), count);
-      count =
-          gpu::shuffle(gpu::get_lane_mask(), cpp::countr_zero(uniform), count);
+      count = gpu::shuffle(slab_mask, cpp::countr_zero(uniform), count);
    }

-    if (!impl::is_sentinel(count))
+    if (result)
      count = count - cpp::popcount(uniform) +
              impl::lane_count(uniform, gpu::get_lane_id());

@@ -553,52 +554,56 @@ static cpp::Atomic<uint32_t> indices[] = {
 #undef S

 // Tries to find a slab in the table that can support the given chunk size.
-static Slab *find_slab(uint32_t chunk_size, uint64_t &uniform,
-                       uint32_t &reserved) {
+static Slab *find_slab(uint32_t chunk_size, uint64_t lane_mask,
+                       uint64_t &uniform, uint32_t &reserved) {
  // We start at the index of the last successful allocation for this kind.
  uint32_t chunk_id = impl::get_chunk_id(chunk_size);
  uint32_t start = indices[chunk_id].load(cpp::MemoryOrder::RELAXED);

-  for (uint32_t offset = 0; offset <= ARRAY_SIZE; ++offset) {
+  Slab *result = nullptr;
+  for (uint32_t offset = 0;
+       gpu::ballot(lane_mask, !result) && offset <= ARRAY_SIZE; ++offset) {
    uint32_t index =
        !offset ? start
                : (impl::get_start_index(chunk_size) + offset - 1) % ARRAY_SIZE;

-    if (!offset ||
-        slots[index].use_count() < Slab::available_chunks(chunk_size)) {
-      uint64_t lane_mask = gpu::get_lane_mask();
-
-      Slab *slab = slots[index].try_lock(lane_mask, uniform & lane_mask,
+    bool available = !offset || slots[index].use_count() <
+                                    Slab::available_chunks(chunk_size);
+    uint64_t slab_mask = gpu::ballot(uniform, !result && available);
+    if (slab_mask & impl::id_in_mask()) {
+      Slab *slab = slots[index].try_lock(slab_mask, uniform & slab_mask,
                                         reserved, chunk_size, index);

      // If we find a slab with a matching chunk size then we store the result.
      // Otherwise, we need to free the claimed lock and continue. In the case
      // of out-of-memory we receive a sentinel value and return a failure.
-      if (slab && reserved < Slab::available_chunks(chunk_size) &&
-          slab->get_chunk_size() == chunk_size) {
+      uint64_t locked_mask = gpu::ballot(
+          slab_mask, slab && reserved < Slab::available_chunks(chunk_size) &&
+                         slab->get_chunk_size() == chunk_size);
+      uint64_t failed_mask = gpu::ballot(
+          slab_mask, slab && (reserved >= Slab::available_chunks(chunk_size) ||
+                              slab->get_chunk_size() != chunk_size));
+      if (locked_mask & impl::id_in_mask()) {
        if (index != start)
          indices[chunk_id].store(index, cpp::MemoryOrder::RELAXED);
-        uniform = uniform & gpu::get_lane_mask();
-        return slab;
-      } else if (slab && (reserved >= Slab::available_chunks(chunk_size) ||
-                          slab->get_chunk_size() != chunk_size)) {
-        slots[index].unlock(gpu::get_lane_mask(),
-                            gpu::get_lane_mask() & uniform);
+        uniform = uniform & locked_mask;
+        result = slab;
+      } else if (failed_mask & impl::id_in_mask()) {
+        slots[index].unlock(failed_mask, failed_mask & uniform);
      } else if (!slab && impl::is_sentinel(reserved)) {
-        uniform = uniform & gpu::get_lane_mask();
-        return nullptr;
+        result =
+            reinterpret_cast<Slab *>(cpp::numeric_limits<uintptr_t>::max());
      } else {
        sleep_briefly();
      }
    }
  }
-  return nullptr;
+  return !impl::is_sentinel(result) ? result : nullptr;
 }

 // Release the lock associated with a given slab.
-static void release_slab(Slab *slab) {
+static void release_slab(uint64_t lane_mask, Slab *slab) {
  uint32_t index = slab->get_global_index();
-  uint64_t lane_mask = gpu::get_lane_mask();
  uint64_t uniform = gpu::match_any(lane_mask, index);
  slots[index].unlock(lane_mask, uniform);
 }
@@ -615,9 +620,10 @@ void *allocate(uint64_t size) {

  // Try to find a slab for the rounded up chunk size and allocate from it.
  uint32_t chunk_size = impl::get_chunk_size(static_cast<uint32_t>(size));
-  uint64_t uniform = gpu::match_any(gpu::get_lane_mask(), chunk_size);
+  uint64_t lane_mask = gpu::get_lane_mask();
+  uint64_t uniform = gpu::match_any(lane_mask, chunk_size);
  uint32_t reserved = 0;
-  Slab *slab = find_slab(chunk_size, uniform, reserved);
+  Slab *slab = find_slab(chunk_size, lane_mask, uniform, reserved);
  if (!slab)
    return nullptr;

@@ -634,10 +640,11 @@ void deallocate(void *ptr) {
    return impl::rpc_free(ptr);

  // The original slab pointer is the 2MiB boundary using the given pointer.
+  uint64_t lane_mask = gpu::get_lane_mask();
  Slab *slab = cpp::launder(reinterpret_cast<Slab *>(
      (reinterpret_cast<uintptr_t>(ptr) & ~SLAB_ALIGNMENT)));
  slab->deallocate(ptr);
-  release_slab(slab);
+  release_slab(lane_mask, slab);
 }

 void *reallocate(void *ptr, uint64_t size) {