Peng Liu 5b65896ad6
[libc++] Optimize ranges::copy{, _n} for vector<bool>::iterator (#121013)
This PR optimizes the performance of `std::ranges::copy` and
`std::ranges::copy_n` specifically for `vector<bool>::iterator`,
addressing a subtask outlined in issue #64038. The optimizations yield
performance improvements of up to **2000x** for aligned copies and
**60x** for unaligned copies. Additionally, new tests have been added to
validate these enhancements.


- Aligned source-destination bits

ranges::copy
```
--------------------------------------------------------------------------
Benchmark                                Before        After   Improvement
--------------------------------------------------------------------------
bm_ranges_copy_vb_aligned/8              10.8 ns      1.42 ns           8x
bm_ranges_copy_vb_aligned/64             88.5 ns      2.28 ns          39x
bm_ranges_copy_vb_aligned/512             709 ns      1.95 ns         364x
bm_ranges_copy_vb_aligned/4096           5568 ns      5.01 ns        1111x
bm_ranges_copy_vb_aligned/32768         44754 ns      38.7 ns        1156x
bm_ranges_copy_vb_aligned/65536         91092 ns      73.2 ns        1244x
bm_ranges_copy_vb_aligned/102400       139473 ns       127 ns        1098x
bm_ranges_copy_vb_aligned/106496       189004 ns      81.5 ns        2319x
bm_ranges_copy_vb_aligned/110592       153647 ns      71.1 ns        2161x
bm_ranges_copy_vb_aligned/114688       159261 ns      70.2 ns        2269x
bm_ranges_copy_vb_aligned/118784       181910 ns      73.5 ns        2475x
bm_ranges_copy_vb_aligned/122880       174117 ns      76.5 ns        2276x
bm_ranges_copy_vb_aligned/126976       176020 ns      82.0 ns        2147x
bm_ranges_copy_vb_aligned/131072       180757 ns       137 ns        1319x
bm_ranges_copy_vb_aligned/135168       190342 ns       158 ns        1205x
bm_ranges_copy_vb_aligned/139264       192831 ns       103 ns        1872x
bm_ranges_copy_vb_aligned/143360       199627 ns      89.4 ns        2233x
bm_ranges_copy_vb_aligned/147456       203881 ns      88.6 ns        2301x
bm_ranges_copy_vb_aligned/151552       213345 ns      88.4 ns        2413x
bm_ranges_copy_vb_aligned/155648       216892 ns      92.9 ns        2335x
bm_ranges_copy_vb_aligned/159744       222751 ns      96.4 ns        2311x
bm_ranges_copy_vb_aligned/163840       225995 ns       173 ns        1306x
bm_ranges_copy_vb_aligned/167936       235230 ns       202 ns        1165x
bm_ranges_copy_vb_aligned/172032       244093 ns       131 ns        1863x
bm_ranges_copy_vb_aligned/176128       244434 ns       111 ns        2202x
bm_ranges_copy_vb_aligned/180224       249570 ns       108 ns        2311x
bm_ranges_copy_vb_aligned/184320       254538 ns       108 ns        2357x
bm_ranges_copy_vb_aligned/188416       261817 ns       113 ns        2317x
bm_ranges_copy_vb_aligned/192512       269923 ns       125 ns        2159x
bm_ranges_copy_vb_aligned/196608       273494 ns       210 ns        1302x
bm_ranges_copy_vb_aligned/200704       280035 ns       269 ns        1041x
bm_ranges_copy_vb_aligned/204800       293102 ns       231 ns        1269x
```

ranges::copy_n
```
--------------------------------------------------------------------------
Benchmark                                Before        After   Improvement
--------------------------------------------------------------------------
bm_ranges_copy_n_vb_aligned/8            11.8 ns       0.89 ns         13x
bm_ranges_copy_n_vb_aligned/64           91.6 ns       2.06 ns         44x
bm_ranges_copy_n_vb_aligned/512           718 ns       2.45 ns        293x
bm_ranges_copy_n_vb_aligned/4096         5750 ns       5.02 ns       1145x
bm_ranges_copy_n_vb_aligned/32768       45824 ns       40.9 ns       1120x
bm_ranges_copy_n_vb_aligned/65536       92267 ns       73.8 ns       1250x
bm_ranges_copy_n_vb_aligned/102400     143267 ns       125 ns        1146x
bm_ranges_copy_n_vb_aligned/106496     148625 ns      82.4 ns        1804x
bm_ranges_copy_n_vb_aligned/110592     154817 ns      72.0 ns        2150x
bm_ranges_copy_n_vb_aligned/114688     157953 ns      70.4 ns        2244x
bm_ranges_copy_n_vb_aligned/118784     162374 ns      71.5 ns        2270x
bm_ranges_copy_n_vb_aligned/122880     168638 ns      72.9 ns        2313x
bm_ranges_copy_n_vb_aligned/126976     175596 ns      76.6 ns        2292x
bm_ranges_copy_n_vb_aligned/131072     181164 ns       135 ns        1342x
bm_ranges_copy_n_vb_aligned/135168     184697 ns       157 ns        1176x
bm_ranges_copy_n_vb_aligned/139264     191395 ns       104 ns        1840x
bm_ranges_copy_n_vb_aligned/143360     194954 ns      88.3 ns        2208x
bm_ranges_copy_n_vb_aligned/147456     208917 ns      86.1 ns        2426x
bm_ranges_copy_n_vb_aligned/151552     211101 ns      87.2 ns        2421x
bm_ranges_copy_n_vb_aligned/155648     213175 ns      89.0 ns        2395x
bm_ranges_copy_n_vb_aligned/159744     218988 ns      86.7 ns        2526x
bm_ranges_copy_n_vb_aligned/163840     225263 ns       156 ns        1444x
bm_ranges_copy_n_vb_aligned/167936     230725 ns       184 ns        1254x
bm_ranges_copy_n_vb_aligned/172032     235795 ns       119 ns        1981x
bm_ranges_copy_n_vb_aligned/176128     241145 ns       101 ns        2388x
bm_ranges_copy_n_vb_aligned/180224     250680 ns      99.5 ns        2519x
bm_ranges_copy_n_vb_aligned/184320     262954 ns      99.7 ns        2637x
bm_ranges_copy_n_vb_aligned/188416     258584 ns       103 ns        2510x
bm_ranges_copy_n_vb_aligned/192512     267190 ns       125 ns        2138x
bm_ranges_copy_n_vb_aligned/196608     270821 ns       213 ns        1271x
bm_ranges_copy_n_vb_aligned/200704     279532 ns       262 ns        1067x
bm_ranges_copy_n_vb_aligned/204800     283412 ns       222 ns        1277x
```

- Unaligned source-destination bits
```
--------------------------------------------------------------------------------
Benchmark                                    Before           After  Improvement
--------------------------------------------------------------------------------
bm_ranges_copy_vb_unaligned/8               12.8 ns         8.59 ns         1.5x
bm_ranges_copy_vb_unaligned/64              98.2 ns         8.24 ns          12x
bm_ranges_copy_vb_unaligned/512              755 ns         18.1 ns          42x
bm_ranges_copy_vb_unaligned/4096            6027 ns          102 ns          59x
bm_ranges_copy_vb_unaligned/32768          47663 ns          774 ns          62x
bm_ranges_copy_vb_unaligned/262144        378981 ns         6455 ns          59x
bm_ranges_copy_vb_unaligned/1048576      1520486 ns        25942 ns          59x
bm_ranges_copy_n_vb_unaligned/8             11.3 ns         8.22 ns         1.4x
bm_ranges_copy_n_vb_unaligned/64            97.3 ns         7.89 ns          12x
bm_ranges_copy_n_vb_unaligned/512            747 ns         18.1 ns          41x
bm_ranges_copy_n_vb_unaligned/4096          5932 ns         99.0 ns          60x
bm_ranges_copy_n_vb_unaligned/32768        47776 ns          749 ns          64x
bm_ranges_copy_n_vb_unaligned/262144      378802 ns         6576 ns          58x
bm_ranges_copy_n_vb_unaligned/1048576    1547234 ns        26229 ns          59x
```
2025-01-30 17:26:26 +01:00

251 lines
8.6 KiB
C++

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// <algorithm>
// UNSUPPORTED: c++03, c++11, c++14, c++17
// template<input_iterator I, sentinel_for<I> S, weakly_incrementable O>
// requires indirectly_copyable<I, O>
// constexpr ranges::copy_result<I, O> ranges::copy(I first, S last, O result);
// template<input_range R, weakly_incrementable O>
// requires indirectly_copyable<iterator_t<R>, O>
// constexpr ranges::copy_result<borrowed_iterator_t<R>, O> ranges::copy(R&& r, O result);
#include <algorithm>
#include <array>
#include <cassert>
#include <deque>
#include <ranges>
#include <vector>
#include "almost_satisfies_types.h"
#include "test_iterators.h"
#include "test_macros.h"
#include "type_algorithms.h"
// Empty element type: nothing can be assigned to it from an `int`, so a pointer to it is not a
// usable destination when copying from `int*`.
struct NotIndirectlyCopyable {};

// Satisfied when the iterator/sentinel overload `std::ranges::copy(first, last, result)` is a
// well-formed call for source iterator `In`, sentinel `Sent` and destination iterator `Out`.
template <class In, class Out = In, class Sent = sentinel_wrapper<In>>
concept HasCopyIt = requires(In first, Sent last, Out result) {
  std::ranges::copy(first, last, result);
};

// Baseline: plain pointers are accepted.
static_assert(HasCopyIt<int*>);

// Unsuitable source iterators are rejected (helper types declared outside this file; judging by
// their names each one breaks a single input-iterator requirement).
static_assert(!HasCopyIt<InputIteratorNotDerivedFrom>);
static_assert(!HasCopyIt<InputIteratorNotIndirectlyReadable>);
static_assert(!HasCopyIt<InputIteratorNotInputOrOutputIterator>);

// Unsuitable destinations are rejected.
static_assert(!HasCopyIt<int*, WeaklyIncrementableNotMovable>);
static_assert(!HasCopyIt<int*, NotIndirectlyCopyable*>);

// Unsuitable sentinels are rejected.
static_assert(!HasCopyIt<int*, int*, SentinelForNotSemiregular>);
static_assert(!HasCopyIt<int*, int*, SentinelForNotWeaklyEqualityComparableWith>);
template <class Range, class Out>
concept HasCopyR = requires(Range range, Out out) { std::ranges::copy(range, out); };
static_assert(HasCopyR<std::array<int, 10>, int*>);
static_assert(!HasCopyR<InputRangeNotDerivedFrom, int*>);
static_assert(!HasCopyR<InputRangeNotIndirectlyReadable, int*>);
static_assert(!HasCopyR<InputRangeNotInputOrOutputIterator, int*>);
static_assert(!HasCopyR<WeaklyIncrementableNotMovable, int*>);
static_assert(!HasCopyR<UncheckedRange<NotIndirectlyCopyable*>, int*>);
static_assert(!HasCopyR<InputRangeNotSentinelSemiregular, int*>);
static_assert(!HasCopyR<InputRangeNotSentinelEqualityComparableWith, int*>);
static_assert(std::is_same_v<std::ranges::copy_result<int, long>, std::ranges::in_out_result<int, long>>);
// clang-format off
template <class In, class Out, class Sent = In>
constexpr void test_iterators() {
{ // simple test
{
std::array in{1, 2, 3, 4};
std::array<int, 4> out;
std::same_as<std::ranges::in_out_result<In, Out>> auto ret =
std::ranges::copy(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data()));
assert(in == out);
assert(base(ret.in) == in.data() + in.size());
assert(base(ret.out) == out.data() + out.size());
}
{
std::array in{1, 2, 3, 4};
std::array<int, 4> out;
auto range = std::ranges::subrange(In(in.data()), Sent(In(in.data() + in.size())));
std::same_as<std::ranges::in_out_result<In, Out>> auto ret = std::ranges::copy(range, Out(out.data()));
assert(in == out);
assert(base(ret.in) == in.data() + in.size());
assert(base(ret.out) == out.data() + out.size());
}
}
{ // check that an empty range works
{
std::array<int, 0> in;
std::array<int, 0> out;
auto ret = std::ranges::copy(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data()));
assert(base(ret.in) == in.data());
assert(base(ret.out) == out.data());
}
{
std::array<int, 0> in;
std::array<int, 0> out;
auto range = std::ranges::subrange(In(in.data()), Sent(In(in.data() + in.size())));
auto ret = std::ranges::copy(range, Out(out.data()));
assert(base(ret.in) == in.data());
assert(base(ret.out) == out.data());
}
}
}
// clang-format on
#if TEST_STD_VER >= 23
// Checks ranges::copy between two vector<bool> objects holding N alternating bits (true at even
// indices), once with the destination starting at bit 0 like the source ("aligned") and once with
// the destination starting 4 bits in ("unaligned").
//
// Besides the copied payload, each case verifies the returned iterators, and the unaligned case
// verifies that the destination bits on either side of [result, result + N) keep their previous
// values: a copy that handles several bits at a time could plausibly overwrite neighbouring bits,
// and comparing only the payload would not notice.
//
// Always returns true (failures trip an assert) so calls can be wrapped in assert/static_assert.
constexpr bool test_vector_bool(std::size_t N) {
  std::vector<bool> in(N, false);
  for (std::size_t i = 0; i < N; i += 2)
    in[i] = true;

  { // Test copy with aligned bits
    std::vector<bool> out(N);
    auto ret = std::ranges::copy(in, out.begin());
    assert(ret.in == in.end());
    assert(ret.out == out.end());
    assert(in == out);
  }
  { // Test copy with unaligned bits
    std::vector<bool> out(N + 8);
    // Fill the 4 guard bits before and after the destination range with an alternating pattern,
    // so that both a stray set and a stray clear of a neighbouring bit are detected.
    for (std::size_t i = 0; i < 4; ++i) {
      out[i]         = (i % 2 == 0);
      out[N + 4 + i] = (i % 2 == 0);
    }

    auto ret = std::ranges::copy(in, out.begin() + 4);
    assert(ret.in == in.end());
    assert(ret.out == out.end() - 4);

    for (std::size_t i = 0; i < N; ++i)
      assert(out[i + 4] == in[i]);

    // The guard bits must be exactly as they were before the copy.
    for (std::size_t i = 0; i < 4; ++i) {
      assert(out[i] == (i % 2 == 0));
      assert(out[N + 4 + i] == (i % 2 == 0));
    }
  }
  return true;
}
#endif
constexpr bool test() {
types::for_each(types::forward_iterator_list<int*>{}, []<class Out>() {
test_iterators<cpp20_input_iterator<int*>, Out, sentinel_wrapper<cpp20_input_iterator<int*>>>();
test_iterators<ProxyIterator<cpp20_input_iterator<int*>>,
ProxyIterator<Out>,
sentinel_wrapper<ProxyIterator<cpp20_input_iterator<int*>>>>();
types::for_each(types::forward_iterator_list<int*>{}, []<class In>() {
test_iterators<In, Out>();
test_iterators<In, Out, sized_sentinel<In>>();
test_iterators<In, Out, sentinel_wrapper<In>>();
test_iterators<ProxyIterator<In>, ProxyIterator<Out>>();
test_iterators<ProxyIterator<In>, ProxyIterator<Out>, sized_sentinel<ProxyIterator<In>>>();
test_iterators<ProxyIterator<In>, ProxyIterator<Out>, sentinel_wrapper<ProxyIterator<In>>>();
});
});
{ // check that ranges::dangling is returned
std::array<int, 4> out;
std::same_as<std::ranges::in_out_result<std::ranges::dangling, int*>> auto ret =
std::ranges::copy(std::array{1, 2, 3, 4}, out.data());
assert(ret.out == out.data() + 4);
assert((out == std::array{1, 2, 3, 4}));
}
{ // check that an iterator is returned with a borrowing range
std::array in{1, 2, 3, 4};
std::array<int, 4> out;
std::same_as<std::ranges::in_out_result<std::array<int, 4>::iterator, int*>> auto ret =
std::ranges::copy(std::views::all(in), out.data());
assert(ret.in == in.end());
assert(ret.out == out.data() + 4);
assert(in == out);
}
{ // check that every element is copied exactly once
struct CopyOnce {
bool copied = false;
constexpr CopyOnce() = default;
constexpr CopyOnce(const CopyOnce& other) = delete;
constexpr CopyOnce& operator=(const CopyOnce& other) {
assert(!other.copied);
copied = true;
return *this;
}
};
{
std::array<CopyOnce, 4> in{};
std::array<CopyOnce, 4> out{};
auto ret = std::ranges::copy(in.begin(), in.end(), out.begin());
assert(ret.in == in.end());
assert(ret.out == out.end());
assert(std::all_of(out.begin(), out.end(), [](const auto& e) { return e.copied; }));
}
{
std::array<CopyOnce, 4> in{};
std::array<CopyOnce, 4> out{};
auto ret = std::ranges::copy(in, out.begin());
assert(ret.in == in.end());
assert(ret.out == out.end());
assert(std::all_of(out.begin(), out.end(), [](const auto& e) { return e.copied; }));
}
}
{ // check that the range is copied forwards
struct OnlyForwardsCopyable {
OnlyForwardsCopyable* next = nullptr;
bool canCopy = false;
OnlyForwardsCopyable() = default;
constexpr OnlyForwardsCopyable& operator=(const OnlyForwardsCopyable&) {
assert(canCopy);
if (next != nullptr)
next->canCopy = true;
return *this;
}
};
{
std::array<OnlyForwardsCopyable, 3> in{};
std::array<OnlyForwardsCopyable, 3> out{};
out[0].next = &out[1];
out[1].next = &out[2];
out[0].canCopy = true;
auto ret = std::ranges::copy(in.begin(), in.end(), out.begin());
assert(ret.in == in.end());
assert(ret.out == out.end());
assert(out[0].canCopy);
assert(out[1].canCopy);
assert(out[2].canCopy);
}
{
std::array<OnlyForwardsCopyable, 3> in{};
std::array<OnlyForwardsCopyable, 3> out{};
out[0].next = &out[1];
out[1].next = &out[2];
out[0].canCopy = true;
auto ret = std::ranges::copy(in, out.begin());
assert(ret.in == in.end());
assert(ret.out == out.end());
assert(out[0].canCopy);
assert(out[1].canCopy);
assert(out[2].canCopy);
}
}
#if TEST_STD_VER >= 23
{ // Test vector<bool>::iterator optimization
assert(test_vector_bool(8));
assert(test_vector_bool(19));
assert(test_vector_bool(32));
assert(test_vector_bool(49));
assert(test_vector_bool(64));
assert(test_vector_bool(199));
assert(test_vector_bool(256));
}
#endif
return true;
}
// Evaluates the suite both as a constant expression and as ordinary run-time code.
int main(int, char**) {
  static_assert(test()); // compile-time evaluation
  test();                // run-time evaluation
  return 0;
}