From 478e45fb94e541dfd3a53a23bbc8ed98337b8a77 Mon Sep 17 00:00:00 2001
From: Roger Sanders
Date: Thu, 13 Nov 2025 19:13:41 +1100
Subject: [PATCH] [libc++] Improve performance of std::atomic_flag on Windows (#163524)

On Windows 8 and above, the WaitOnAddress, WakeByAddressSingle and
WakeByAddressAll functions allow efficient implementation of the C++20
wait and notify features of std::atomic_flag. These Windows functions
have never been made use of in libc++, leading to very poor performance
of these features on Windows platforms, as they are implemented using a
spin loop with backoff, rather than using any OS thread signalling
whatsoever. This change implements the use of these OS functions where
available, falling back to the original implementation on Windows
versions prior to 8.

Relevant API docs from Microsoft:
https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-waitonaddress
https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-wakebyaddresssingle
https://learn.microsoft.com/en-us/windows/win32/api/synchapi/nf-synchapi-wakebyaddressall

Fixes #127221
---
 libcxx/src/atomic.cpp | 69 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/libcxx/src/atomic.cpp b/libcxx/src/atomic.cpp
index b214ba1fd11c..b9e4aa30bbdc 100644
--- a/libcxx/src/atomic.cpp
+++ b/libcxx/src/atomic.cpp
@@ -41,6 +41,11 @@
 // OpenBSD has no indirect syscalls
 #  define _LIBCPP_FUTEX(...) futex(__VA_ARGS__)
 
+#elif defined(_WIN32)
+
+#  include <memory>
+#  include <windows.h>
+
 #else // <- Add other operating systems here
 
 // Baseline needs no new headers
@@ -101,6 +106,70 @@ static void __libcpp_platform_wake_by_address(__cxx_atomic_contention_t const vo
   _umtx_op(const_cast<__cxx_atomic_contention_t*>(__ptr), UMTX_OP_WAKE, __notify_one ? 1 : INT_MAX, nullptr, nullptr);
 }
 
+#elif defined(_WIN32)
+
+static void* win32_get_synch_api_function(const char* function_name) {
+  // Attempt to load the API set. Note that as per the Microsoft STL implementation, we assume this API is already
+  // loaded and accessible. While this isn't explicitly guaranteed by publicly available Win32 API documentation, it is
+  // true in practice, and may be guaranteed by internal documentation not released publicly. In any case the fact that
+  // the Microsoft STL made this assumption is reasonable basis to say that we can too. The alternative to this would be
+  // to use LoadLibrary, but then leak the module handle. We can't call FreeLibrary, as this would have to be triggered
+  // by a global static destructor, which would hang off DllMain, and calling FreeLibrary from DllMain is explicitly
+  // mentioned as not being allowed:
+  // https://learn.microsoft.com/en-us/windows/win32/dlls/dllmain
+  // Given the range of bad options here, we have chosen to mirror what Microsoft did, as it seems fair to assume that
+  // Microsoft will guarantee compatibility for us, as we are exposed to the same conditions as all existing Windows
+  // apps using the Microsoft STL VS2015/2017/2019/2022 runtimes, where Windows 7 support has not been excluded at
+  // compile time.
+  static auto module_handle = GetModuleHandleW(L"api-ms-win-core-synch-l1-2-0.dll");
+  if (module_handle == nullptr) {
+    return nullptr;
+  }
+
+  // Attempt to locate the function in the API and return the result to the caller. Note that the NULL return from this
+  // method is documented as being interchangeable with nullptr.
+  // https://devblogs.microsoft.com/oldnewthing/20180307-00/?p=98175
+  return reinterpret_cast<void*>(GetProcAddress(module_handle, function_name));
+}
+
+static void
+__libcpp_platform_wait_on_address(__cxx_atomic_contention_t const volatile* __ptr, __cxx_contention_t __val) {
+  // WaitOnAddress was added in Windows 8 (build 9200)
+  static auto wait_on_address = reinterpret_cast<BOOL(WINAPI*)(volatile void*, PVOID, SIZE_T, DWORD)>(
+      win32_get_synch_api_function("WaitOnAddress"));
+  if (wait_on_address != nullptr) {
+    wait_on_address(const_cast<__cxx_atomic_contention_t*>(__ptr), &__val, sizeof(__val), INFINITE);
+  } else {
+    __libcpp_thread_poll_with_backoff(
+        [=]() -> bool { return !__cxx_nonatomic_compare_equal(__cxx_atomic_load(__ptr, memory_order_relaxed), __val); },
+        __libcpp_timed_backoff_policy());
+  }
+}
+
+static void __libcpp_platform_wake_by_address(__cxx_atomic_contention_t const volatile* __ptr, bool __notify_one) {
+  if (__notify_one) {
+    // WakeByAddressSingle was added in Windows 8 (build 9200)
+    static auto wake_by_address_single =
+        reinterpret_cast<void(WINAPI*)(PVOID)>(win32_get_synch_api_function("WakeByAddressSingle"));
+    if (wake_by_address_single != nullptr) {
+      wake_by_address_single(const_cast<__cxx_atomic_contention_t*>(__ptr));
+    } else {
+      // The fallback implementation of waking does nothing, as the fallback wait implementation just does polling, so
+      // there's nothing to do here.
+    }
+  } else {
+    // WakeByAddressAll was added in Windows 8 (build 9200)
+    static auto wake_by_address_all =
+        reinterpret_cast<void(WINAPI*)(PVOID)>(win32_get_synch_api_function("WakeByAddressAll"));
+    if (wake_by_address_all != nullptr) {
+      wake_by_address_all(const_cast<__cxx_atomic_contention_t*>(__ptr));
+    } else {
+      // The fallback implementation of waking does nothing, as the fallback wait implementation just does polling, so
+      // there's nothing to do here.
+    }
+  }
+}
+
 #else // <- Add other operating systems here
 
 // Baseline is just a timed backoff