This PR creates an SVE-based implementation for strlen by translating from the AOR code in tree. Microbenchmark shows improvements against NEON when N>=64. Although both implementations fall behind glibc by a large margin, this may be a good start point to explore SVE implementations. Together with the PR: 1. Added two more tests of strlen with special nul symbols. 2. Added strlen's fuzzer and fix a typo in previous heap fuzzer. ``` === strlen(16 bytes) === libc: 1.56115 ns/call, 9.54499 GiB/s neon: 1.59393 ns/call, 9.34867 GiB/s sve: 1.66097 ns/call, 8.97134 GiB/s === strlen(64 bytes) === libc: 2.06967 ns/call, 28.7991 GiB/s neon: 2.59914 ns/call, 22.9325 GiB/s sve: 2.58628 ns/call, 23.0465 GiB/s === strlen(256 bytes) === libc: 3.74165 ns/call, 63.7202 GiB/s neon: 8.98243 ns/call, 26.5428 GiB/s sve: 7.36426 ns/call, 32.3751 GiB/s === strlen(1024 bytes) === libc: 10.5327 ns/call, 90.5438 GiB/s neon: 34.363 ns/call, 27.7529 GiB/s sve: 26.9329 ns/call, 35.4092 GiB/s === strlen(4096 bytes) === libc: 37.7304 ns/call, 101.104 GiB/s neon: 145.911 ns/call, 26.144 GiB/s sve: 103.208 ns/call, 36.9612 GiB/s === strlen(1048576 bytes) === libc: 9623.4 ns/call, 101.478 GiB/s neon: 36138.2 ns/call, 27.023 GiB/s sve: 26605.6 ns/call, 36.7051 GiB/s ```
107 lines
3.7 KiB
C++
107 lines
3.7 KiB
C++
//===-- Strlen implementation for aarch64 ---------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
|
|
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
|
|
|
|
#include "src/__support/macros/properties/cpu_features.h"
|
|
|
|
#if defined(__ARM_NEON)
|
|
#include "src/__support/CPP/bit.h" // countr_zero
|
|
#include <arm_neon.h>
|
|
#include <stddef.h> // size_t
|
|
namespace LIBC_NAMESPACE_DECL {
|
|
namespace neon {
|
|
[[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
|
|
string_length(const char *src) {
|
|
using Vector __attribute__((may_alias)) = uint8x8_t;
|
|
|
|
uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
|
|
const Vector *block_ptr =
|
|
reinterpret_cast<const Vector *>(src - misalign_bytes);
|
|
Vector v = *block_ptr;
|
|
Vector vcmp = vceqz_u8(v);
|
|
uint64x1_t cmp_mask = vreinterpret_u64_u8(vcmp);
|
|
uint64_t cmp = vget_lane_u64(cmp_mask, 0);
|
|
cmp = cmp >> (misalign_bytes << 3);
|
|
if (cmp)
|
|
return cpp::countr_zero(cmp) >> 3;
|
|
|
|
while (true) {
|
|
++block_ptr;
|
|
v = *block_ptr;
|
|
vcmp = vceqz_u8(v);
|
|
cmp_mask = vreinterpret_u64_u8(vcmp);
|
|
cmp = vget_lane_u64(cmp_mask, 0);
|
|
if (cmp)
|
|
return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
|
|
reinterpret_cast<uintptr_t>(src) +
|
|
(cpp::countr_zero(cmp) >> 3));
|
|
}
|
|
}
|
|
} // namespace neon
|
|
} // namespace LIBC_NAMESPACE_DECL
|
|
#endif // __ARM_NEON
|
|
|
|
#ifdef LIBC_TARGET_CPU_HAS_SVE
|
|
#include "src/__support/macros/optimization.h"
|
|
#include <arm_sve.h>
|
|
namespace LIBC_NAMESPACE_DECL {
|
|
namespace sve {
|
|
[[maybe_unused]] LIBC_INLINE static size_t string_length(const char *src) {
|
|
const uint8_t *ptr = reinterpret_cast<const uint8_t *>(src);
|
|
// Initialize the first-fault register to all true
|
|
svsetffr();
|
|
const svbool_t all_true = svptrue_b8(); // all true predicate
|
|
svbool_t cmp_zero;
|
|
size_t len = 0;
|
|
|
|
for (;;) {
|
|
// Read a vector's worth of bytes, stopping on first fault.
|
|
svuint8_t data = svldff1_u8(all_true, &ptr[len]);
|
|
svbool_t fault_mask = svrdffr_z(all_true);
|
|
bool has_no_fault = svptest_last(all_true, fault_mask);
|
|
if (LIBC_LIKELY(has_no_fault)) {
|
|
// First fault did not fail: the whole vector is valid.
|
|
// Avoid depending on the contents of FFR beyond the branch.
|
|
len += svcntb(); // speculative increment
|
|
cmp_zero = svcmpeq_n_u8(all_true, data, 0);
|
|
bool has_no_zero = !svptest_any(all_true, cmp_zero);
|
|
if (LIBC_LIKELY(has_no_zero))
|
|
continue;
|
|
len -= svcntb(); // undo speculative increment
|
|
break;
|
|
} else {
|
|
// First fault failed: only some of the vector is valid.
|
|
// Perform the comparison only on the valid bytes.
|
|
cmp_zero = svcmpeq_n_u8(fault_mask, data, 0);
|
|
bool has_zero = svptest_any(fault_mask, cmp_zero);
|
|
if (LIBC_LIKELY(has_zero))
|
|
break;
|
|
svsetffr();
|
|
len += svcntp_b8(all_true, fault_mask);
|
|
continue;
|
|
}
|
|
}
|
|
// Select the bytes before the first and count them.
|
|
svbool_t before_zero = svbrkb_z(all_true, cmp_zero);
|
|
len += svcntp_b8(all_true, before_zero);
|
|
return len;
|
|
}
|
|
} // namespace sve
|
|
} // namespace LIBC_NAMESPACE_DECL
|
|
#endif // LIBC_TARGET_CPU_HAS_SVE
|
|
|
|
namespace LIBC_NAMESPACE_DECL {
|
|
#ifdef LIBC_TARGET_CPU_HAS_SVE
|
|
namespace string_length_impl = sve;
|
|
#elif defined(__ARM_NEON)
|
|
namespace string_length_impl = neon;
|
|
#endif
|
|
} // namespace LIBC_NAMESPACE_DECL
|
|
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
|