[libc] Fix RPC server with independent thread scheduling (#182211)

Summary:
NVIDIA's independent thread scheduling (ITS) allows lanes to diverge
inside of a warp. We previously had contingencies for this, but issues
could still show up under highly stressed usage.

The rules state that as long as the PC is the same, threads can
reconverge. This means that we can see a 'convergent' warp even when
its lanes took completely divergent paths to get there. As a result, the
RPC port lookup loop could believe it was in a convergent group while
every lane held a different 'index' value. Fix this with a broadcast to
force the expected behavior.

Additionally, we did not ensure that the threads were actually done with
their 'work_fn'. If the work included something that caused divergence,
the other threads could continue and toggle the mailbox, resulting in
the server seeing unfinished work. Fix this with an explicit lane sync
before the toggle, and have only the first lane perform the outbox store.

Add a test to make sure this actually works.
This commit is contained in:
Joseph Huber 2026-02-19 17:38:49 -06:00 committed by GitHub
parent e9350dcd23
commit 6cdee8f3d8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 85 additions and 7 deletions

View File

@ -130,11 +130,14 @@ template <bool Invert> struct Process {
/// Equivalent to loading outbox followed by store of the inverted value
/// The outbox is write only by this warp and tracking the value locally is
/// cheaper than calling load_outbox to get the value to store.
RPC_ATTRS uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) {
RPC_ATTRS uint32_t invert_outbox(uint64_t lane_mask, uint32_t index,
uint32_t current_outbox) {
uint32_t inverted_outbox = !current_outbox;
rpc::sync_lane(lane_mask);
__scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
__scoped_atomic_store_n(&outbox[index], inverted_outbox, __ATOMIC_RELAXED,
__MEMORY_SCOPE_SYSTEM);
if (rpc::is_first_lane(lane_mask))
__scoped_atomic_store_n(&outbox[index], inverted_outbox, __ATOMIC_RELAXED,
__MEMORY_SCOPE_SYSTEM);
return inverted_outbox;
}
@ -340,7 +343,7 @@ private:
// The server is passive, if it owns the buffer when it closes we need to
// give ownership back to the client.
if (owns_buffer && T)
out = process.invert_outbox(index, out);
out = process.invert_outbox(lane_mask, index, out);
process.unlock(lane_mask, index);
}
@ -403,7 +406,7 @@ template <bool T> template <typename F> RPC_ATTRS void Port<T>::send(F fill) {
// Apply the \p fill function to initialize the buffer and release the memory.
invoke_rpc(fill, lane_size, get_lane_mask(),
process.get_packet(index, lane_size));
out = process.invert_outbox(index, out);
out = process.invert_outbox(lane_mask, index, out);
owns_buffer = false;
receive = false;
}
@ -413,7 +416,7 @@ template <bool T> template <typename U> RPC_ATTRS void Port<T>::recv(U use) {
// We only exchange ownership of the buffer during a receive if we are waiting
// for a previous receive to finish.
if (receive) {
out = process.invert_outbox(index, out);
out = process.invert_outbox(lane_mask, index, out);
owns_buffer = false;
}
@ -556,8 +559,10 @@ template <uint32_t opcode> RPC_ATTRS Client::Port Client::open() {
if (index >= process.port_count)
index = 0;
// Attempt to acquire the lock on this index.
// Attempt to acquire the lock on this index. Under NVIDIA's ITS the lanes
// may reconverge with differing index values, ensure they are convergent.
uint64_t lane_mask = rpc::get_lane_mask();
index = rpc::broadcast_value(lane_mask, index);
if (!process.try_lock(lane_mask, index))
continue;

View File

@ -54,6 +54,16 @@ add_integration_test(
--blocks 8
)
# Registers rpc_divergence_test.cpp as the startup_rpc_divergence_test
# integration test in the libc-startup-tests suite. The loader is passed
# --threads 256 and --blocks 16 for this test.
add_integration_test(
startup_rpc_divergence_test
SUITE libc-startup-tests
SRCS
rpc_divergence_test.cpp
LOADER_ARGS
--threads 256
--blocks 16
)
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
add_integration_test(
startup_rpc_lane_test_w32

View File

@ -0,0 +1,63 @@
//===-- Loader test to check the RPC interface with the loader ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
#include "test/IntegrationTest/test.h"
using namespace LIBC_NAMESPACE;
/// Builds a 32-bit seed by mixing the value of gpu::processor_clock() with the
/// thread and block x-indices, each scaled by its own odd constant.
/// \returns the xor of the three terms, scrambled by a final multiply.
static inline uint32_t entropy() {
  const uint32_t clock_bits = static_cast<uint32_t>(gpu::processor_clock());
  const uint32_t thread_bits =
      static_cast<uint32_t>(gpu::get_thread_id_x() * 0x632be59b);
  const uint32_t block_bits =
      static_cast<uint32_t>(gpu::get_block_id_x() * 0x85157af5);
  // Only the low 32 bits are returned, so truncating each term early is
  // equivalent to truncating the combined result.
  return (clock_bits ^ thread_bits ^ block_bits) * 0x9e3779bb;
}
/// Advances \p state by one xorshift step (shifts of 13, 17 and 5) and stores
/// the new value back into \p state.
/// \returns the new state multiplied by 0x9e3779bb (wrapping at 32 bits).
static inline uint32_t xorshift32(uint32_t &state) {
  uint32_t next = state;
  next = next ^ (next << 13);
  next = next ^ (next >> 17);
  next = next ^ (next << 5);
  state = next;
  // The multiply only scrambles the returned value; the stored state is the
  // raw xorshift output.
  return next * 0x9e3779bb;
}
void increment(uint64_t cnt) {
LIBC_NAMESPACE::rpc::Client::Port port =
LIBC_NAMESPACE::rpc::client.open<LIBC_TEST_INCREMENT>();
port.send_and_recv(
[=](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
reinterpret_cast<uint64_t *>(buffer->data)[0] = cnt;
},
[&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
ASSERT_TRUE(reinterpret_cast<uint64_t *>(buffer->data)[0] == cnt + 1);
});
}
/// Test entry point. Each thread seeds its own generator, then runs between
/// 128 and 255 iterations. An iteration may first spin in a busy loop and then
/// either skips or issues one or more increment() calls, so threads follow
/// different paths through the RPC interface.
TEST_MAIN(int, char **, char **) {
  uint32_t rng = entropy();

  // Force a highly divergent warp state while hammering the RPC interface.
  const uint32_t total_iters = 128 + xorshift32(rng) % 128;
  uint32_t iteration = 0;
  while (iteration < total_iters) {
    // When the draw is a multiple of 127, stall for up to 4095 volatile
    // increments before continuing.
    if (xorshift32(rng) % 127 == 0) {
      volatile int sink = 0;
      const uint32_t spins = xorshift32(rng) % 4096;
      for (uint32_t spin = 0; spin != spins; ++spin)
        sink++;
    }

    const uint32_t roll = xorshift32(rng);
    if (roll % 2 == 0) {
      // A roll divisible by 64 issues a burst of 2 to 8 calls; any other even
      // roll issues exactly one.
      uint32_t burst = 1;
      if (roll % 64 == 0)
        burst = 2 + xorshift32(rng) % 7;
      for (; burst != 0; --burst)
        increment(xorshift32(rng));
    }

    ++iteration;
  }
  return 0;
}