Peng Liu d0cee6939a
[libc++] Optimize std::{,ranges}::{fill,fill_n} for segmented iterators (#132665)
This patch optimizes `std::fill`, `std::fill_n`, `std::ranges::fill`,
and `std::ranges::fill_n` for segmented iterators, achieving substantial
performance improvements. Specifically, for `deque<int>` iterators, the
performance improvements are above 10x for all these algorithms. The
optimization also enables filling segmented memory of `deque<int>` to
approach the performance of filling contiguous memory of `vector<int>`.


Benchmark results comparing the before and after implementations are
provided below. For additional context, we’ve included `vector<int>`
results, which remain unchanged, as this patch specifically targets
segmented iterators and leaves non-segmented iterator behavior
untouched.



Fixes two subtasks outlined in #102817.

#### `fill_n`

```
-----------------------------------------------------------------------------
Benchmark                                Before            After      Speedup
-----------------------------------------------------------------------------
std::fill_n(deque<int>)/32              11.4 ns          2.28 ns        5.0x
std::fill_n(deque<int>)/50              19.7 ns          3.40 ns        5.8x
std::fill_n(deque<int>)/1024             391 ns          37.3 ns       10.5x
std::fill_n(deque<int>)/8192            3174 ns           301 ns       10.5x
std::fill_n(deque<int>)/65536          26504 ns          2951 ns        9.0x
std::fill_n(deque<int>)/1048576       407960 ns         80658 ns        5.1x
rng::fill_n(deque<int>)/32              14.3 ns          2.15 ns        6.6x
rng::fill_n(deque<int>)/50              20.2 ns          3.22 ns        6.3x
rng::fill_n(deque<int>)/1024             381 ns          37.8 ns       10.1x
rng::fill_n(deque<int>)/8192            3101 ns           294 ns       10.5x
rng::fill_n(deque<int>)/65536          25098 ns          2926 ns        8.6x
rng::fill_n(deque<int>)/1048576       394342 ns         78874 ns        5.0x
std::fill_n(vector<int>)/32             1.76 ns          1.72 ns        1.0x
std::fill_n(vector<int>)/50             3.00 ns          2.73 ns        1.1x
std::fill_n(vector<int>)/1024           38.4 ns          37.9 ns        1.0x
std::fill_n(vector<int>)/8192            258 ns           252 ns        1.0x
std::fill_n(vector<int>)/65536          2993 ns          2889 ns        1.0x
std::fill_n(vector<int>)/1048576       80328 ns         80468 ns        1.0x
rng::fill_n(vector<int>)/32             1.99 ns          1.35 ns        1.5x
rng::fill_n(vector<int>)/50             2.66 ns          2.12 ns        1.3x
rng::fill_n(vector<int>)/1024           37.7 ns          35.8 ns        1.1x
rng::fill_n(vector<int>)/8192            253 ns           250 ns        1.0x
rng::fill_n(vector<int>)/65536          2922 ns          2930 ns        1.0x
rng::fill_n(vector<int>)/1048576       79739 ns         79742 ns        1.0x
```

#### `fill`

```
--------------------------------------------------------------------------
Benchmark                              Before            After     Speedup
--------------------------------------------------------------------------
std::fill(deque<int>)/32              13.7 ns          2.45 ns        5.6x
std::fill(deque<int>)/50              21.7 ns          4.57 ns        4.7x
std::fill(deque<int>)/1024             367 ns          38.5 ns        9.5x
std::fill(deque<int>)/8192            2896 ns           247 ns       11.7x
std::fill(deque<int>)/65536          23723 ns          2907 ns        8.2x
std::fill(deque<int>)/1048576       379043 ns         79885 ns        4.7x
rng::fill(deque<int>)/32              13.6 ns          2.70 ns        5.0x
rng::fill(deque<int>)/50              23.4 ns          3.94 ns        5.9x
rng::fill(deque<int>)/1024             377 ns          37.9 ns        9.9x
rng::fill(deque<int>)/8192            2914 ns           286 ns       10.2x
rng::fill(deque<int>)/65536          23612 ns          2939 ns        8.0x
rng::fill(deque<int>)/1048576       379841 ns         80079 ns        4.7x
std::fill(vector<int>)/32             1.99 ns          1.79 ns        1.1x
std::fill(vector<int>)/50             3.05 ns          3.06 ns        1.0x
std::fill(vector<int>)/1024           37.6 ns          38.0 ns        1.0x
std::fill(vector<int>)/8192            255 ns           257 ns        1.0x
std::fill(vector<int>)/65536          2966 ns          2981 ns        1.0x
std::fill(vector<int>)/1048576       78300 ns         80348 ns        1.0x
rng::fill(vector<int>)/32             1.77 ns          1.75 ns        1.0x
rng::fill(vector<int>)/50             4.85 ns          2.31 ns        2.1x
rng::fill(vector<int>)/1024           39.6 ns          36.1 ns        1.1x
rng::fill(vector<int>)/8192            238 ns           251 ns        0.9x
rng::fill(vector<int>)/65536          2941 ns          2918 ns        1.0x
rng::fill(vector<int>)/1048576       80497 ns         80442 ns        1.0x
```

---------

Co-authored-by: Louis Dionne <ldionne.2@gmail.com>
Co-authored-by: A. Jiang <de34@live.cn>
2025-10-17 07:41:24 +08:00

261 lines
8.9 KiB
C++

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// <algorithm>
// UNSUPPORTED: c++03, c++11, c++14, c++17
// template<class T, output_iterator<const T&> O, sentinel_for<O> S>
// constexpr O ranges::fill(O first, S last, const T& value);
// template<class T, output_range<const T&> R>
// constexpr borrowed_iterator_t<R> ranges::fill(R&& r, const T& value);
#include <algorithm>
#include <array>
#include <cassert>
#include <deque>
#include <ranges>
#include <string>
#include <vector>
#include "sized_allocator.h"
#include "almost_satisfies_types.h"
#include "test_iterators.h"
#include "test_macros.h"
template <class Iter, class Sent = sentinel_wrapper<Iter>>
concept HasFillIt = requires(Iter iter, Sent sent) { std::ranges::fill(iter, sent, int{}); };
static_assert(HasFillIt<int*>);
static_assert(!HasFillIt<OutputIteratorNotIndirectlyWritable>);
static_assert(!HasFillIt<OutputIteratorNotInputOrOutputIterator>);
static_assert(!HasFillIt<int*, SentinelForNotSemiregular>);
static_assert(!HasFillIt<int*, SentinelForNotWeaklyEqualityComparableWith>);
template <class Range>
concept HasFillR = requires(Range range) { std::ranges::fill(range, int{}); };
static_assert(HasFillR<UncheckedRange<int*>>);
static_assert(!HasFillR<OutputRangeNotIndirectlyWritable>);
static_assert(!HasFillR<OutputRangeNotInputOrOutputIterator>);
static_assert(!HasFillR<OutputRangeNotSentinelSemiregular>);
static_assert(!HasFillR<OutputRangeNotSentinelEqualityComparableWith>);
template <class It, class Sent = It>
constexpr void test_iterators() {
{ // simple test
{
int a[3];
std::same_as<It> auto ret = std::ranges::fill(It(a), Sent(It(a + 3)), 1);
assert(std::all_of(a, a + 3, [](int i) { return i == 1; }));
assert(base(ret) == a + 3);
}
{
int a[3];
auto range = std::ranges::subrange(It(a), Sent(It(a + 3)));
std::same_as<It> auto ret = std::ranges::fill(range, 1);
assert(std::all_of(a, a + 3, [](int i) { return i == 1; }));
assert(base(ret) == a + 3);
}
}
{ // check that an empty range works
{
std::array<int, 0> a;
auto ret = std::ranges::fill(It(a.data()), Sent(It(a.data())), 1);
assert(base(ret) == a.data());
}
{
std::array<int, 0> a;
auto range = std::ranges::subrange(It(a.data()), Sent(It(a.data())));
auto ret = std::ranges::fill(range, 1);
assert(base(ret) == a.data());
}
}
}
// The `ranges::{fill, fill_n}` algorithms require `vector<bool, Alloc>::iterator` to satisfy
// the `std::indirectly_writable` concept when used with `vector<bool, Alloc>`, which is only
// satisfied since C++23.
#if TEST_STD_VER >= 23
constexpr bool test_vector_bool(std::size_t N) {
{ // Test cases validating leading/trailing bits unfilled remain unchanged
{ // Leading bits are not filled
std::vector<bool> in(N, false);
std::vector<bool> expected(N, true);
expected[0] = expected[1] = false;
std::ranges::fill(std::ranges::subrange(in.begin() + 2, in.end()), true);
assert(in == expected);
}
{ // Trailing bits are not filled
std::vector<bool> in(N, false);
std::vector<bool> expected(N, true);
expected[N - 1] = expected[N - 2] = false;
std::ranges::fill(std::ranges::subrange(in.begin(), in.end() - 2), true);
assert(in == expected);
}
{ // Leading and trailing bits are not filled
std::vector<bool> in(N, false);
std::vector<bool> expected(N, true);
expected[0] = expected[1] = expected[N - 1] = expected[N - 2] = false;
std::ranges::fill(std::ranges::subrange(in.begin() + 2, in.end() - 2), true);
assert(in == expected);
}
}
{ // Test cases with full or partial bytes filled
{ // Full bytes filled
std::vector<bool> in(N, false);
std::vector<bool> expected(N, true);
std::ranges::fill(in, true);
assert(in == expected);
}
{ // Partial bytes with offset filled
std::vector<bool> in(N, false);
std::vector<bool> expected(N, true);
std::ranges::fill(std::ranges::subrange(std::ranges::begin(in) + 4, std::ranges::end(in) - 4), true);
std::ranges::fill(std::ranges::subrange(std::ranges::begin(expected), std::ranges::begin(expected) + 4), false);
std::ranges::fill(std::ranges::subrange(std::ranges::end(expected) - 4, std::ranges::end(expected)), false);
assert(in == expected);
}
}
return true;
}
#endif
/*TEST_CONSTEXPR_CXX26*/ void test_deque() { // TODO: Mark as TEST_CONSTEXPR_CXX26 once std::deque is constexpr
std::deque<int> in(20);
std::deque<int> expected(in.size(), 42);
std::ranges::fill(in, 42);
assert(in == expected);
}
constexpr bool test() {
test_iterators<cpp17_output_iterator<int*>, sentinel_wrapper<cpp17_output_iterator<int*>>>();
test_iterators<cpp20_output_iterator<int*>, sentinel_wrapper<cpp20_output_iterator<int*>>>();
test_iterators<forward_iterator<int*>>();
test_iterators<bidirectional_iterator<int*>>();
test_iterators<random_access_iterator<int*>>();
test_iterators<contiguous_iterator<int*>>();
test_iterators<int*>();
{ // check that every element is copied once
struct S {
bool copied = false;
constexpr S& operator=(const S&) {
copied = true;
return *this;
}
};
{
S a[5];
std::ranges::fill(a, a + 5, S{true});
assert(std::all_of(a, a + 5, [](S& s) { return s.copied; }));
}
{
S a[5];
std::ranges::fill(a, S{true});
assert(std::all_of(a, a + 5, [](S& s) { return s.copied; }));
}
}
{ // check that std::ranges::dangling is returned
[[maybe_unused]] std::same_as<std::ranges::dangling> decltype(auto) ret =
std::ranges::fill(std::array<int, 10>{}, 1);
}
{ // check that std::ranges::dangling isn't returned with a borrowing range
std::array<int, 10> a{};
[[maybe_unused]] std::same_as<std::array<int, 10>::iterator> decltype(auto) ret =
std::ranges::fill(std::views::all(a), 1);
assert(std::all_of(a.begin(), a.end(), [](int i) { return i == 1; }));
}
{ // check that non-trivially copyable items are copied properly
{
std::array<std::string, 10> a;
auto ret = std::ranges::fill(a.begin(), a.end(), "long long string so no SSO");
assert(ret == a.end());
assert(std::all_of(a.begin(), a.end(), [](auto& s) { return s == "long long string so no SSO"; }));
}
{
std::array<std::string, 10> a;
auto ret = std::ranges::fill(a, "long long string so no SSO");
assert(ret == a.end());
assert(std::all_of(a.begin(), a.end(), [](auto& s) { return s == "long long string so no SSO"; }));
}
}
#if TEST_STD_VER >= 23
{ // Test vector<bool>::iterator optimization
assert(test_vector_bool(8));
assert(test_vector_bool(19));
assert(test_vector_bool(32));
assert(test_vector_bool(49));
assert(test_vector_bool(64));
assert(test_vector_bool(199));
assert(test_vector_bool(256));
// Make sure std::ranges::fill behaves properly with std::vector<bool> iterators with custom
// size types. See https://github.com/llvm/llvm-project/pull/122410.
{
using Alloc = sized_allocator<bool, std::uint8_t, std::int8_t>;
std::vector<bool, Alloc> in(100, false, Alloc(1));
std::vector<bool, Alloc> expected(100, true, Alloc(1));
std::ranges::fill(in, true);
assert(in == expected);
}
{
using Alloc = sized_allocator<bool, std::uint16_t, std::int16_t>;
std::vector<bool, Alloc> in(200, false, Alloc(1));
std::vector<bool, Alloc> expected(200, true, Alloc(1));
std::ranges::fill(in, true);
assert(in == expected);
}
{
using Alloc = sized_allocator<bool, std::uint32_t, std::int32_t>;
std::vector<bool, Alloc> in(200, false, Alloc(1));
std::vector<bool, Alloc> expected(200, true, Alloc(1));
std::ranges::fill(in, true);
assert(in == expected);
}
{
using Alloc = sized_allocator<bool, std::uint64_t, std::int64_t>;
std::vector<bool, Alloc> in(200, false, Alloc(1));
std::vector<bool, Alloc> expected(200, true, Alloc(1));
std::ranges::fill(in, true);
assert(in == expected);
}
}
#endif
if (!TEST_IS_CONSTANT_EVALUATED) // TODO: Use TEST_STD_AT_LEAST_26_OR_RUNTIME_EVALUATED when std::deque is made constexpr
test_deque();
#if TEST_STD_VER >= 20
{
std::vector<std::vector<int>> v{{1, 2}, {1, 2, 3}, {}, {3, 4, 5}, {6}, {7, 8, 9, 6}, {0, 1, 2, 3, 0, 1, 2}};
auto jv = std::ranges::join_view(v);
std::ranges::fill(jv, 42);
for (const auto& vec : v)
for (auto n : vec)
assert(n == 42);
}
#endif
return true;
}
int main(int, char**) {
test();
static_assert(test());
return 0;
}