Peng Liu 5b65896ad6
[libc++] Optimize ranges::copy{, _n} for vector<bool>::iterator (#121013)
This PR optimizes the performance of `std::ranges::copy` and
`std::ranges::copy_n` specifically for `vector<bool>::iterator`,
addressing a subtask outlined in issue #64038. The optimizations yield
performance improvements of up to **2000x** for aligned copies and
**60x** for unaligned copies. Additionally, new tests have been added to
validate these enhancements.


- Aligned source-destination bits

ranges::copy
```
--------------------------------------------------------------------------
Benchmark                                Before        After   Improvement
--------------------------------------------------------------------------
bm_ranges_copy_vb_aligned/8              10.8 ns      1.42 ns           8x
bm_ranges_copy_vb_aligned/64             88.5 ns      2.28 ns          39x
bm_ranges_copy_vb_aligned/512             709 ns      1.95 ns         364x
bm_ranges_copy_vb_aligned/4096           5568 ns      5.01 ns        1111x
bm_ranges_copy_vb_aligned/32768         44754 ns      38.7 ns        1156x
bm_ranges_copy_vb_aligned/65536         91092 ns      73.2 ns        1244x
bm_ranges_copy_vb_aligned/102400       139473 ns       127 ns        1098x
bm_ranges_copy_vb_aligned/106496       189004 ns      81.5 ns        2319x
bm_ranges_copy_vb_aligned/110592       153647 ns      71.1 ns        2161x
bm_ranges_copy_vb_aligned/114688       159261 ns      70.2 ns        2269x
bm_ranges_copy_vb_aligned/118784       181910 ns      73.5 ns        2475x
bm_ranges_copy_vb_aligned/122880       174117 ns      76.5 ns        2276x
bm_ranges_copy_vb_aligned/126976       176020 ns      82.0 ns        2147x
bm_ranges_copy_vb_aligned/131072       180757 ns       137 ns        1319x
bm_ranges_copy_vb_aligned/135168       190342 ns       158 ns        1205x
bm_ranges_copy_vb_aligned/139264       192831 ns       103 ns        1872x
bm_ranges_copy_vb_aligned/143360       199627 ns      89.4 ns        2233x
bm_ranges_copy_vb_aligned/147456       203881 ns      88.6 ns        2301x
bm_ranges_copy_vb_aligned/151552       213345 ns      88.4 ns        2413x
bm_ranges_copy_vb_aligned/155648       216892 ns      92.9 ns        2335x
bm_ranges_copy_vb_aligned/159744       222751 ns      96.4 ns        2311x
bm_ranges_copy_vb_aligned/163840       225995 ns       173 ns        1306x
bm_ranges_copy_vb_aligned/167936       235230 ns       202 ns        1165x
bm_ranges_copy_vb_aligned/172032       244093 ns       131 ns        1863x
bm_ranges_copy_vb_aligned/176128       244434 ns       111 ns        2202x
bm_ranges_copy_vb_aligned/180224       249570 ns       108 ns        2311x
bm_ranges_copy_vb_aligned/184320       254538 ns       108 ns        2357x
bm_ranges_copy_vb_aligned/188416       261817 ns       113 ns        2317x
bm_ranges_copy_vb_aligned/192512       269923 ns       125 ns        2159x
bm_ranges_copy_vb_aligned/196608       273494 ns       210 ns        1302x
bm_ranges_copy_vb_aligned/200704       280035 ns       269 ns        1041x
bm_ranges_copy_vb_aligned/204800       293102 ns       231 ns        1269x
```

ranges::copy_n
```
--------------------------------------------------------------------------
Benchmark                                Before        After   Improvement
--------------------------------------------------------------------------
bm_ranges_copy_n_vb_aligned/8            11.8 ns       0.89 ns         13x
bm_ranges_copy_n_vb_aligned/64           91.6 ns       2.06 ns         44x
bm_ranges_copy_n_vb_aligned/512           718 ns       2.45 ns        293x
bm_ranges_copy_n_vb_aligned/4096         5750 ns       5.02 ns       1145x
bm_ranges_copy_n_vb_aligned/32768       45824 ns       40.9 ns       1120x
bm_ranges_copy_n_vb_aligned/65536       92267 ns       73.8 ns       1250x
bm_ranges_copy_n_vb_aligned/102400     143267 ns       125 ns        1146x
bm_ranges_copy_n_vb_aligned/106496     148625 ns      82.4 ns        1804x
bm_ranges_copy_n_vb_aligned/110592     154817 ns      72.0 ns        2150x
bm_ranges_copy_n_vb_aligned/114688     157953 ns      70.4 ns        2244x
bm_ranges_copy_n_vb_aligned/118784     162374 ns      71.5 ns        2270x
bm_ranges_copy_n_vb_aligned/122880     168638 ns      72.9 ns        2313x
bm_ranges_copy_n_vb_aligned/126976     175596 ns      76.6 ns        2292x
bm_ranges_copy_n_vb_aligned/131072     181164 ns       135 ns        1342x
bm_ranges_copy_n_vb_aligned/135168     184697 ns       157 ns        1176x
bm_ranges_copy_n_vb_aligned/139264     191395 ns       104 ns        1840x
bm_ranges_copy_n_vb_aligned/143360     194954 ns      88.3 ns        2208x
bm_ranges_copy_n_vb_aligned/147456     208917 ns      86.1 ns        2426x
bm_ranges_copy_n_vb_aligned/151552     211101 ns      87.2 ns        2421x
bm_ranges_copy_n_vb_aligned/155648     213175 ns      89.0 ns        2395x
bm_ranges_copy_n_vb_aligned/159744     218988 ns      86.7 ns        2526x
bm_ranges_copy_n_vb_aligned/163840     225263 ns       156 ns        1444x
bm_ranges_copy_n_vb_aligned/167936     230725 ns       184 ns        1254x
bm_ranges_copy_n_vb_aligned/172032     235795 ns       119 ns        1981x
bm_ranges_copy_n_vb_aligned/176128     241145 ns       101 ns        2388x
bm_ranges_copy_n_vb_aligned/180224     250680 ns      99.5 ns        2519x
bm_ranges_copy_n_vb_aligned/184320     262954 ns      99.7 ns        2637x
bm_ranges_copy_n_vb_aligned/188416     258584 ns       103 ns        2510x
bm_ranges_copy_n_vb_aligned/192512     267190 ns       125 ns        2138x
bm_ranges_copy_n_vb_aligned/196608     270821 ns       213 ns        1271x
bm_ranges_copy_n_vb_aligned/200704     279532 ns       262 ns        1067x
bm_ranges_copy_n_vb_aligned/204800     283412 ns       222 ns        1277x
```

- Unaligned source-destination bits
```
--------------------------------------------------------------------------------
Benchmark                                    Before           After  Improvement
--------------------------------------------------------------------------------
bm_ranges_copy_vb_unaligned/8               12.8 ns         8.59 ns         1.5x
bm_ranges_copy_vb_unaligned/64              98.2 ns         8.24 ns          12x
bm_ranges_copy_vb_unaligned/512              755 ns         18.1 ns          42x
bm_ranges_copy_vb_unaligned/4096            6027 ns          102 ns          59x
bm_ranges_copy_vb_unaligned/32768          47663 ns          774 ns          62x
bm_ranges_copy_vb_unaligned/262144        378981 ns         6455 ns          59x
bm_ranges_copy_vb_unaligned/1048576      1520486 ns        25942 ns          59x
bm_ranges_copy_n_vb_unaligned/8             11.3 ns         8.22 ns         1.4x
bm_ranges_copy_n_vb_unaligned/64            97.3 ns         7.89 ns          12x
bm_ranges_copy_n_vb_unaligned/512            747 ns         18.1 ns          41x
bm_ranges_copy_n_vb_unaligned/4096          5932 ns         99.0 ns          60x
bm_ranges_copy_n_vb_unaligned/32768        47776 ns         749 ns           64x
bm_ranges_copy_n_vb_unaligned/262144      378802 ns        6576 ns           58x
bm_ranges_copy_n_vb_unaligned/1048576    1547234 ns       26229 ns           59x
```
2025-01-30 17:26:26 +01:00

150 lines
5.0 KiB
C++

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// <algorithm>
// template<InputIterator InIter, OutputIterator<auto, InIter::reference> OutIter>
// constexpr OutIter // constexpr after C++17
// copy_n(InIter first, InIter::difference_type n, OutIter result);
#include <algorithm>
#include <cassert>
#include <vector>
#include "test_macros.h"
#include "test_iterators.h"
#include "user_defined_integral.h"
typedef UserDefinedIntegral<unsigned> UDI;
class PaddedBase {
public:
TEST_CONSTEXPR PaddedBase(std::int16_t a, std::int8_t b) : a_(a), b_(b) {}
std::int16_t a_;
std::int8_t b_;
};
class Derived : public PaddedBase {
public:
TEST_CONSTEXPR Derived(std::int16_t a, std::int8_t b, std::int8_t c) : PaddedBase(a, b), c_(c) {}
std::int8_t c_;
};
template <class InIter, class OutIter>
TEST_CONSTEXPR_CXX20 void test_copy_n() {
{
const unsigned N = 1000;
int ia[N] = {};
for (unsigned i = 0; i < N; ++i)
ia[i] = i;
int ib[N] = {0};
OutIter r = std::copy_n(InIter(ia), UDI(N / 2), OutIter(ib));
assert(base(r) == ib + N / 2);
for (unsigned i = 0; i < N / 2; ++i)
assert(ia[i] == ib[i]);
}
{ // Make sure that padding bits aren't copied
Derived src(1, 2, 3);
Derived dst(4, 5, 6);
std::copy_n(static_cast<PaddedBase*>(&src), 1, static_cast<PaddedBase*>(&dst));
assert(dst.a_ == 1);
assert(dst.b_ == 2);
assert(dst.c_ == 6);
}
{ // Make sure that overlapping ranges can be copied
int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
std::copy_n(a + 3, 7, a);
int expected[] = {4, 5, 6, 7, 8, 9, 10, 8, 9, 10};
assert(std::equal(a, a + 10, expected));
}
}
TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) {
std::vector<bool> in(N, false);
for (std::size_t i = 0; i < N; i += 2)
in[i] = true;
{ // Test copy with aligned bytes
std::vector<bool> out(N);
std::copy_n(in.begin(), N, out.begin());
assert(in == out);
}
{ // Test copy with unaligned bytes
std::vector<bool> out(N + 8);
std::copy_n(in.begin(), N, out.begin() + 4);
for (std::size_t i = 0; i < N; ++i)
assert(out[i + 4] == in[i]);
}
return true;
}
TEST_CONSTEXPR_CXX20 bool test() {
test_copy_n<cpp17_input_iterator<const int*>, cpp17_output_iterator<int*> >();
test_copy_n<cpp17_input_iterator<const int*>, cpp17_input_iterator<int*> >();
test_copy_n<cpp17_input_iterator<const int*>, forward_iterator<int*> >();
test_copy_n<cpp17_input_iterator<const int*>, bidirectional_iterator<int*> >();
test_copy_n<cpp17_input_iterator<const int*>, random_access_iterator<int*> >();
test_copy_n<cpp17_input_iterator<const int*>, int*>();
test_copy_n<forward_iterator<const int*>, cpp17_output_iterator<int*> >();
test_copy_n<forward_iterator<const int*>, cpp17_input_iterator<int*> >();
test_copy_n<forward_iterator<const int*>, forward_iterator<int*> >();
test_copy_n<forward_iterator<const int*>, bidirectional_iterator<int*> >();
test_copy_n<forward_iterator<const int*>, random_access_iterator<int*> >();
test_copy_n<forward_iterator<const int*>, int*>();
test_copy_n<bidirectional_iterator<const int*>, cpp17_output_iterator<int*> >();
test_copy_n<bidirectional_iterator<const int*>, cpp17_input_iterator<int*> >();
test_copy_n<bidirectional_iterator<const int*>, forward_iterator<int*> >();
test_copy_n<bidirectional_iterator<const int*>, bidirectional_iterator<int*> >();
test_copy_n<bidirectional_iterator<const int*>, random_access_iterator<int*> >();
test_copy_n<bidirectional_iterator<const int*>, int*>();
test_copy_n<random_access_iterator<const int*>, cpp17_output_iterator<int*> >();
test_copy_n<random_access_iterator<const int*>, cpp17_input_iterator<int*> >();
test_copy_n<random_access_iterator<const int*>, forward_iterator<int*> >();
test_copy_n<random_access_iterator<const int*>, bidirectional_iterator<int*> >();
test_copy_n<random_access_iterator<const int*>, random_access_iterator<int*> >();
test_copy_n<random_access_iterator<const int*>, int*>();
test_copy_n<const int*, cpp17_output_iterator<int*> >();
test_copy_n<const int*, cpp17_input_iterator<int*> >();
test_copy_n<const int*, forward_iterator<int*> >();
test_copy_n<const int*, bidirectional_iterator<int*> >();
test_copy_n<const int*, random_access_iterator<int*> >();
test_copy_n<const int*, int*>();
{ // Test vector<bool>::iterator optimization
assert(test_vector_bool(8));
assert(test_vector_bool(19));
assert(test_vector_bool(32));
assert(test_vector_bool(49));
assert(test_vector_bool(64));
assert(test_vector_bool(199));
assert(test_vector_bool(256));
}
return true;
}
int main(int, char**) {
test();
#if TEST_STD_VER > 17
static_assert(test());
#endif
return 0;
}