From ecee70e210fa0a43b12d8ce7fe01f33bb8f84167 Mon Sep 17 00:00:00 2001 From: Sterling-Augustine Date: Fri, 13 Feb 2026 12:38:19 -0800 Subject: [PATCH] Implement vector version of memchr, and dispatch to same (#177711) As in the description. This implementation shares quite a bit of code with the wide-read versions of string_length. --- .../memory_utils/generic/inline_strlen.h | 51 +++++++- .../memory_utils/x86_64/inline_strlen.h | 111 +++++++++++++++--- .../src/strings/wide_read_memory_test.cpp | 21 ++++ 3 files changed, 161 insertions(+), 22 deletions(-) diff --git a/libc/src/string/memory_utils/generic/inline_strlen.h b/libc/src/string/memory_utils/generic/inline_strlen.h index 7a565b36617e..e9f1542f4142 100644 --- a/libc/src/string/memory_utils/generic/inline_strlen.h +++ b/libc/src/string/memory_utils/generic/inline_strlen.h @@ -17,11 +17,12 @@ namespace LIBC_NAMESPACE_DECL { namespace clang_vector { // Exploit the underlying integer representation to do a variable shift. -LIBC_INLINE constexpr cpp::simd_mask shift_mask(cpp::simd_mask m, - size_t shift) { +template +LIBC_INLINE constexpr cpp::simd_mask shift_mask(cpp::simd_mask m, + size_t shift) { using bitmask_ty = cpp::internal::get_as_integer_type_t>; bitmask_ty r = cpp::bit_cast(m) >> shift; - return cpp::bit_cast>(r); + return cpp::bit_cast>(r); } LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE size_t string_length(const char *src) { @@ -34,8 +35,8 @@ LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE size_t string_length(const char *src) { cpp::simd chars = cpp::load>(aligned, /*aligned=*/true); cpp::simd_mask mask = chars == null_byte; size_t offset = src - reinterpret_cast(aligned); - if (cpp::any_of(shift_mask(mask, offset))) - return cpp::find_first_set(shift_mask(mask, offset)); + if (cpp::any_of(shift_mask(mask, offset))) + return cpp::find_first_set(shift_mask(mask, offset)); for (;;) { cpp::simd chars = cpp::load>(++aligned, @@ -46,6 +47,46 @@ LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE size_t string_length(const char *src) { cpp::find_first_set(mask); } } + +LIBC_INLINE static void *calculate_find_first_character_return( + const char *src, cpp::simd_mask c_mask, size_t n_left) { + size_t c_offset = cpp::find_first_set(c_mask); + if (n_left < c_offset) + return nullptr; + return const_cast(src) + c_offset; +} + +LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static void * +find_first_character(const unsigned char *s, unsigned char c, size_t n) { + using Vector = cpp::simd; + using Mask = cpp::simd_mask; + Vector c_byte = c; + + size_t alignment = alignof(Vector); + const Vector *aligned = + reinterpret_cast(__builtin_align_down(s, alignment)); + + Vector chars = cpp::load(aligned, /*aligned=*/true); + Mask cmp_v = chars == c_byte; + size_t offset = s - reinterpret_cast(aligned); + + cmp_v = shift_mask(cmp_v, offset); + if (cpp::any_of(cmp_v)) + return calculate_find_first_character_return( + reinterpret_cast(s), cmp_v, n); + + for (size_t bytes_checked = sizeof(Vector) - offset; bytes_checked < n; + bytes_checked += sizeof(Vector)) { + aligned++; + Vector chars = cpp::load(aligned, /*aligned=*/true); + Mask cmp_v = chars == c_byte; + if (cpp::any_of(cmp_v)) + return calculate_find_first_character_return( + reinterpret_cast(aligned), cmp_v, n - bytes_checked); + } + return nullptr; +} + } // namespace clang_vector } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/string/memory_utils/x86_64/inline_strlen.h b/libc/src/string/memory_utils/x86_64/inline_strlen.h index 07b4a470f0d7..74b6820a50ff 100644 --- a/libc/src/string/memory_utils/x86_64/inline_strlen.h +++ b/libc/src/string/memory_utils/x86_64/inline_strlen.h @@ -17,10 +17,11 @@ namespace LIBC_NAMESPACE_DECL { namespace internal::arch_vector { -// Return a bit-mask with the nth bit set if the nth-byte in block_ptr is zero. +// Return a bit-mask with the nth bit set if the nth-byte in block_ptr matches +// character c. template LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static Mask -compare_and_mask(const Vector *block_ptr); +compare_and_mask(const Vector *block_ptr, char c); template )> @@ -30,13 +31,13 @@ string_length_vector(const char *src) { const Vector *block_ptr = reinterpret_cast(src - misalign_bytes); - auto cmp = compare_and_mask(block_ptr) >> misalign_bytes; + auto cmp = compare_and_mask(block_ptr, 0) >> misalign_bytes; if (cmp) return cpp::countr_zero(cmp); while (true) { block_ptr++; - cmp = compare_and_mask(block_ptr); + cmp = compare_and_mask(block_ptr, 0); if (cmp) return static_cast(reinterpret_cast(block_ptr) - reinterpret_cast(src) + @@ -44,13 +45,50 @@ string_length_vector(const char *src) { } } +template +LIBC_INLINE static void * +calculate_find_first_character_return(const unsigned char *src, Mask c_mask, + size_t n_left) { + size_t c_offset = cpp::countr_zero(c_mask); + if (n_left < c_offset) + return nullptr; + return const_cast(src) + c_offset; +} + +template )> +LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static void * +find_first_character_vector(const unsigned char *s, unsigned char c, size_t n) { + uintptr_t misalign_bytes = reinterpret_cast(s) % sizeof(Vector); + + const Vector *block_ptr = + reinterpret_cast(s - misalign_bytes); + auto cmp_bytes = + compare_and_mask(block_ptr, c) >> misalign_bytes; + if (cmp_bytes) + return calculate_find_first_character_return( + reinterpret_cast(block_ptr) + misalign_bytes, + cmp_bytes, n); + + for (size_t bytes_checked = sizeof(Vector) - misalign_bytes; + bytes_checked < n; bytes_checked += sizeof(Vector)) { + block_ptr++; + cmp_bytes = compare_and_mask(block_ptr, c); + if (cmp_bytes) + return calculate_find_first_character_return( + reinterpret_cast(block_ptr), cmp_bytes, + n - bytes_checked); + } + return nullptr; +} + template <> LIBC_INLINE uint32_t -compare_and_mask<__m128i, uint32_t>(const __m128i *block_ptr) { - __m128i v = _mm_load_si128(block_ptr); - __m128i z = _mm_setzero_si128(); - __m128i c = _mm_cmpeq_epi8(z, v); - return _mm_movemask_epi8(c); +compare_and_mask<__m128i, uint32_t>(const __m128i *block_ptr, char c) { + __m128i b = _mm_load_si128(block_ptr); + __m128i set = _mm_set1_epi8(c); + __m128i cmp = _mm_cmpeq_epi8(b, set); + return _mm_movemask_epi8(cmp); } namespace sse2 { @@ -58,16 +96,24 @@ namespace sse2 { return string_length_vector<__m128i, uint32_t, compare_and_mask<__m128i, uint32_t>>(src); } + +[[maybe_unused]] LIBC_INLINE void * +find_first_character(const unsigned char *s, unsigned char c, size_t n) { + return find_first_character_vector<__m128i, uint32_t, + compare_and_mask<__m128i, uint32_t>>(s, c, + n); +} + } // namespace sse2 #if defined(__AVX2__) template <> LIBC_INLINE uint32_t -compare_and_mask<__m256i, uint32_t>(const __m256i *block_ptr) { - __m256i v = _mm256_load_si256(block_ptr); - __m256i z = _mm256_setzero_si256(); - __m256i c = _mm256_cmpeq_epi8(z, v); - return _mm256_movemask_epi8(c); +compare_and_mask<__m256i, uint32_t>(const __m256i *block_ptr, char c) { + __m256i b = _mm256_load_si256(block_ptr); + __m256i set = _mm256_set1_epi16(c); + __m256i cmp = _mm256_cmpeq_epi8(b, set); + return _mm256_movemask_epi8(cmp); } namespace avx2 { @@ -75,25 +121,45 @@ namespace avx2 { return string_length_vector<__m256i, uint32_t, compare_and_mask<__m256i, uint32_t>>(src); } + +[[maybe_unused]] LIBC_INLINE void * +find_first_character(const unsigned char *s, unsigned char c, size_t n) { + return find_first_character_vector<__m256i, uint32_t, + compare_and_mask<__m256i, uint32_t>>(s, c, + n); +} } // namespace avx2 #endif #if defined(__AVX512F__) template <> LIBC_INLINE __mmask64 -compare_and_mask<__m512i, __mmask64>(const __m512i *block_ptr) { +compare_and_mask<__m512i, __mmask64>(const __m512i *block_ptr, char c) { __m512i v = _mm512_load_si512(block_ptr); - __m512i z = _mm512_setzero_si512(); - return _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ); + __m512i set = _mm512_set1_epi8(c); + return _mm512_cmp_epu8_mask(set, v, _MM_CMPINT_EQ); } + namespace avx512 { [[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) { return string_length_vector<__m512i, __mmask64, compare_and_mask<__m512i, __mmask64>>(src); } + +[[maybe_unused]] LIBC_INLINE void * +find_first_character(const unsigned char *s, unsigned char c, size_t n) { + return find_first_character_vector<__m512i, __mmask64, + compare_and_mask<__m512i, __mmask64>>(s, c, + n); +} + } // namespace avx512 #endif +// We could directly use the various _vector templates here, but this +// indirection allows comparing the various implementations elsewhere by name, +// without having to instantiate the templates by hand at those locations. + [[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) { #if defined(__AVX512F__) return avx512::string_length(src); @@ -104,6 +170,17 @@ namespace avx512 { #endif } +[[maybe_unused]] LIBC_INLINE void * +find_first_character(const unsigned char *s, unsigned char c, size_t n) { +#if defined(__AVX512F__) + return avx512::find_first_character(s, c, n); +#elif defined(__AVX2__) + return avx2::find_first_character(s, c, n); +#else + return sse2::find_first_character(s, c, n); +#endif +} + } // namespace internal::arch_vector } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/strings/wide_read_memory_test.cpp b/libc/test/src/strings/wide_read_memory_test.cpp index cc4a2dcbd9dd..83a6294833b0 100644 --- a/libc/test/src/strings/wide_read_memory_test.cpp +++ b/libc/test/src/strings/wide_read_memory_test.cpp @@ -98,4 +98,25 @@ TEST_F(LlvmLibcWideAccessMemoryTest, StringLength) { }); } +TEST_F(LlvmLibcWideAccessMemoryTest, FindFirstChar) { + // 1.5 k long vector of a's. + TwoKilobyteBuffer buf; + inline_memset(buf.data(), 'a', buf.size()); + buf[buf.size() - 1] = 'b'; + this->TestMemoryAccess(buf, [this, buf](const char *test_data) { + // Found case + ASSERT_EQ( + reinterpret_cast(internal::find_first_character_impl( + reinterpret_cast(test_data), 'b', + size_t(buf.size()))), + reinterpret_cast(test_data + size_t(buf.size()) - 1)); + // Not found case + ASSERT_EQ( + reinterpret_cast(internal::find_first_character_impl( + reinterpret_cast(test_data), 'c', + size_t(buf.size()))), + nullptr); + }); +} + } // namespace LIBC_NAMESPACE_DECL