Peng Liu 5b65896ad6
[libc++] Optimize ranges::copy{, _n} for vector<bool>::iterator (#121013)
This PR optimizes the performance of `std::ranges::copy` and
`std::ranges::copy_n` specifically for `vector<bool>::iterator`,
addressing a subtask outlined in issue #64038. The optimizations yield
performance improvements of up to **2000x** for aligned copies and
**60x** for unaligned copies. Additionally, new tests have been added to
validate these enhancements.


- Aligned source-destination bits

ranges::copy
```
--------------------------------------------------------------------------
Benchmark                                Before        After   Improvement
--------------------------------------------------------------------------
bm_ranges_copy_vb_aligned/8              10.8 ns      1.42 ns           8x
bm_ranges_copy_vb_aligned/64             88.5 ns      2.28 ns          39x
bm_ranges_copy_vb_aligned/512             709 ns      1.95 ns         364x
bm_ranges_copy_vb_aligned/4096           5568 ns      5.01 ns        1111x
bm_ranges_copy_vb_aligned/32768         44754 ns      38.7 ns        1156x
bm_ranges_copy_vb_aligned/65536         91092 ns      73.2 ns        1244x
bm_ranges_copy_vb_aligned/102400       139473 ns       127 ns        1098x
bm_ranges_copy_vb_aligned/106496       189004 ns      81.5 ns        2319x
bm_ranges_copy_vb_aligned/110592       153647 ns      71.1 ns        2161x
bm_ranges_copy_vb_aligned/114688       159261 ns      70.2 ns        2269x
bm_ranges_copy_vb_aligned/118784       181910 ns      73.5 ns        2475x
bm_ranges_copy_vb_aligned/122880       174117 ns      76.5 ns        2276x
bm_ranges_copy_vb_aligned/126976       176020 ns      82.0 ns        2147x
bm_ranges_copy_vb_aligned/131072       180757 ns       137 ns        1319x
bm_ranges_copy_vb_aligned/135168       190342 ns       158 ns        1205x
bm_ranges_copy_vb_aligned/139264       192831 ns       103 ns        1872x
bm_ranges_copy_vb_aligned/143360       199627 ns      89.4 ns        2233x
bm_ranges_copy_vb_aligned/147456       203881 ns      88.6 ns        2301x
bm_ranges_copy_vb_aligned/151552       213345 ns      88.4 ns        2413x
bm_ranges_copy_vb_aligned/155648       216892 ns      92.9 ns        2335x
bm_ranges_copy_vb_aligned/159744       222751 ns      96.4 ns        2311x
bm_ranges_copy_vb_aligned/163840       225995 ns       173 ns        1306x
bm_ranges_copy_vb_aligned/167936       235230 ns       202 ns        1165x
bm_ranges_copy_vb_aligned/172032       244093 ns       131 ns        1863x
bm_ranges_copy_vb_aligned/176128       244434 ns       111 ns        2202x
bm_ranges_copy_vb_aligned/180224       249570 ns       108 ns        2311x
bm_ranges_copy_vb_aligned/184320       254538 ns       108 ns        2357x
bm_ranges_copy_vb_aligned/188416       261817 ns       113 ns        2317x
bm_ranges_copy_vb_aligned/192512       269923 ns       125 ns        2159x
bm_ranges_copy_vb_aligned/196608       273494 ns       210 ns        1302x
bm_ranges_copy_vb_aligned/200704       280035 ns       269 ns        1041x
bm_ranges_copy_vb_aligned/204800       293102 ns       231 ns        1269x
```

ranges::copy_n
```
--------------------------------------------------------------------------
Benchmark                                Before        After   Improvement
--------------------------------------------------------------------------
bm_ranges_copy_n_vb_aligned/8            11.8 ns       0.89 ns         13x
bm_ranges_copy_n_vb_aligned/64           91.6 ns       2.06 ns         44x
bm_ranges_copy_n_vb_aligned/512           718 ns       2.45 ns        293x
bm_ranges_copy_n_vb_aligned/4096         5750 ns       5.02 ns       1145x
bm_ranges_copy_n_vb_aligned/32768       45824 ns       40.9 ns       1120x
bm_ranges_copy_n_vb_aligned/65536       92267 ns       73.8 ns       1250x
bm_ranges_copy_n_vb_aligned/102400     143267 ns       125 ns        1146x
bm_ranges_copy_n_vb_aligned/106496     148625 ns      82.4 ns        1804x
bm_ranges_copy_n_vb_aligned/110592     154817 ns      72.0 ns        2150x
bm_ranges_copy_n_vb_aligned/114688     157953 ns      70.4 ns        2244x
bm_ranges_copy_n_vb_aligned/118784     162374 ns      71.5 ns        2270x
bm_ranges_copy_n_vb_aligned/122880     168638 ns      72.9 ns        2313x
bm_ranges_copy_n_vb_aligned/126976     175596 ns      76.6 ns        2292x
bm_ranges_copy_n_vb_aligned/131072     181164 ns       135 ns        1342x
bm_ranges_copy_n_vb_aligned/135168     184697 ns       157 ns        1176x
bm_ranges_copy_n_vb_aligned/139264     191395 ns       104 ns        1840x
bm_ranges_copy_n_vb_aligned/143360     194954 ns      88.3 ns        2208x
bm_ranges_copy_n_vb_aligned/147456     208917 ns      86.1 ns        2426x
bm_ranges_copy_n_vb_aligned/151552     211101 ns      87.2 ns        2421x
bm_ranges_copy_n_vb_aligned/155648     213175 ns      89.0 ns        2395x
bm_ranges_copy_n_vb_aligned/159744     218988 ns      86.7 ns        2526x
bm_ranges_copy_n_vb_aligned/163840     225263 ns       156 ns        1444x
bm_ranges_copy_n_vb_aligned/167936     230725 ns       184 ns        1254x
bm_ranges_copy_n_vb_aligned/172032     235795 ns       119 ns        1981x
bm_ranges_copy_n_vb_aligned/176128     241145 ns       101 ns        2388x
bm_ranges_copy_n_vb_aligned/180224     250680 ns      99.5 ns        2519x
bm_ranges_copy_n_vb_aligned/184320     262954 ns      99.7 ns        2637x
bm_ranges_copy_n_vb_aligned/188416     258584 ns       103 ns        2510x
bm_ranges_copy_n_vb_aligned/192512     267190 ns       125 ns        2138x
bm_ranges_copy_n_vb_aligned/196608     270821 ns       213 ns        1271x
bm_ranges_copy_n_vb_aligned/200704     279532 ns       262 ns        1067x
bm_ranges_copy_n_vb_aligned/204800     283412 ns       222 ns        1277x
```

- Unaligned source-destination bits
```
--------------------------------------------------------------------------------
Benchmark                                    Before           After  Improvement
--------------------------------------------------------------------------------
bm_ranges_copy_vb_unaligned/8               12.8 ns         8.59 ns         1.5x
bm_ranges_copy_vb_unaligned/64              98.2 ns         8.24 ns          12x
bm_ranges_copy_vb_unaligned/512              755 ns         18.1 ns          42x
bm_ranges_copy_vb_unaligned/4096            6027 ns          102 ns          59x
bm_ranges_copy_vb_unaligned/32768          47663 ns          774 ns          62x
bm_ranges_copy_vb_unaligned/262144        378981 ns         6455 ns          59x
bm_ranges_copy_vb_unaligned/1048576      1520486 ns        25942 ns          59x
bm_ranges_copy_n_vb_unaligned/8             11.3 ns         8.22 ns         1.4x
bm_ranges_copy_n_vb_unaligned/64            97.3 ns         7.89 ns          12x
bm_ranges_copy_n_vb_unaligned/512            747 ns         18.1 ns          41x
bm_ranges_copy_n_vb_unaligned/4096          5932 ns         99.0 ns          60x
bm_ranges_copy_n_vb_unaligned/32768        47776 ns         749 ns           64x
bm_ranges_copy_n_vb_unaligned/262144      378802 ns        6576 ns           58x
bm_ranges_copy_n_vb_unaligned/1048576    1547234 ns       26229 ns           59x
```
2025-01-30 17:26:26 +01:00

255 lines
11 KiB
C++

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _LIBCPP___ALGORITHM_COPY_H
#define _LIBCPP___ALGORITHM_COPY_H
#include <__algorithm/copy_move_common.h>
#include <__algorithm/for_each_segment.h>
#include <__algorithm/min.h>
#include <__config>
#include <__fwd/bit_reference.h>
#include <__iterator/iterator_traits.h>
#include <__iterator/segmented_iterator.h>
#include <__memory/pointer_traits.h>
#include <__type_traits/common_type.h>
#include <__type_traits/enable_if.h>
#include <__utility/move.h>
#include <__utility/pair.h>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
# pragma GCC system_header
#endif
_LIBCPP_PUSH_MACROS
#include <__undef_macros>
_LIBCPP_BEGIN_NAMESPACE_STD
template <class _InputIterator, class _OutputIterator>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result);
template <class _InIter, class _Sent, class _OutIter>
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter);
template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;
const int __bits_per_word = _In::__bits_per_word;
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
if (__first.__ctz_ != 0) {
unsigned __clz = __bits_per_word - __first.__ctz_;
difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
__n -= __dn;
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
__storage_type __b = *__first.__seg_ & __m;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
++__first.__seg_;
// __first.__ctz_ = 0;
}
// __first.__ctz_ == 0;
// do middle words
__storage_type __nw = __n / __bits_per_word;
std::copy(std::__to_address(__first.__seg_),
std::__to_address(__first.__seg_ + __nw),
std::__to_address(__result.__seg_));
__n -= __nw * __bits_per_word;
__result.__seg_ += __nw;
// do last word
if (__n > 0) {
__first.__seg_ += __nw;
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
__storage_type __b = *__first.__seg_ & __m;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b;
__result.__ctz_ = static_cast<unsigned>(__n);
}
}
return __result;
}
template <class _Cp, bool _IsConst>
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
using _In = __bit_iterator<_Cp, _IsConst>;
using difference_type = typename _In::difference_type;
using __storage_type = typename _In::__storage_type;
const int __bits_per_word = _In::__bits_per_word;
difference_type __n = __last - __first;
if (__n > 0) {
// do first word
if (__first.__ctz_ != 0) {
unsigned __clz_f = __bits_per_word - __first.__ctz_;
difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
__n -= __dn;
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
__storage_type __b = *__first.__seg_ & __m;
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
*__result.__seg_ &= ~__m;
if (__result.__ctz_ > __first.__ctz_)
*__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
else
*__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_);
__result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
__result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
__dn -= __ddn;
if (__dn > 0) {
__m = ~__storage_type(0) >> (__bits_per_word - __dn);
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
__result.__ctz_ = static_cast<unsigned>(__dn);
}
++__first.__seg_;
// __first.__ctz_ = 0;
}
// __first.__ctz_ == 0;
// do middle words
unsigned __clz_r = __bits_per_word - __result.__ctz_;
__storage_type __m = ~__storage_type(0) << __result.__ctz_;
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
__storage_type __b = *__first.__seg_;
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b << __result.__ctz_;
++__result.__seg_;
*__result.__seg_ &= __m;
*__result.__seg_ |= __b >> __clz_r;
}
// do last word
if (__n > 0) {
__m = ~__storage_type(0) >> (__bits_per_word - __n);
__storage_type __b = *__first.__seg_ & __m;
__storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b << __result.__ctz_;
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
__n -= __dn;
if (__n > 0) {
__m = ~__storage_type(0) >> (__bits_per_word - __n);
*__result.__seg_ &= ~__m;
*__result.__seg_ |= __b >> __dn;
__result.__ctz_ = static_cast<unsigned>(__n);
}
}
}
return __result;
}
struct __copy_impl {
template <class _InIter, class _Sent, class _OutIter>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
operator()(_InIter __first, _Sent __last, _OutIter __result) const {
while (__first != __last) {
*__result = *__first;
++__first;
++__result;
}
return std::make_pair(std::move(__first), std::move(__result));
}
template <class _InIter, class _OutIter>
struct _CopySegment {
using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_InIter>;
_OutIter& __result_;
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit _CopySegment(_OutIter& __result)
: __result_(__result) {}
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) {
__result_ = std::__copy(__lfirst, __llast, std::move(__result_)).second;
}
};
template <class _InIter, class _OutIter, __enable_if_t<__is_segmented_iterator<_InIter>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
operator()(_InIter __first, _InIter __last, _OutIter __result) const {
std::__for_each_segment(__first, __last, _CopySegment<_InIter, _OutIter>(__result));
return std::make_pair(__last, std::move(__result));
}
template <class _InIter,
class _OutIter,
__enable_if_t<__has_random_access_iterator_category<_InIter>::value &&
!__is_segmented_iterator<_InIter>::value && __is_segmented_iterator<_OutIter>::value,
int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
operator()(_InIter __first, _InIter __last, _OutIter __result) const {
using _Traits = __segmented_iterator_traits<_OutIter>;
using _DiffT = typename common_type<__iter_diff_t<_InIter>, __iter_diff_t<_OutIter> >::type;
if (__first == __last)
return std::make_pair(std::move(__first), std::move(__result));
auto __local_first = _Traits::__local(__result);
auto __segment_iterator = _Traits::__segment(__result);
while (true) {
auto __local_last = _Traits::__end(__segment_iterator);
auto __size = std::min<_DiffT>(__local_last - __local_first, __last - __first);
auto __iters = std::__copy(__first, __first + __size, __local_first);
__first = std::move(__iters.first);
if (__first == __last)
return std::make_pair(std::move(__first), _Traits::__compose(__segment_iterator, std::move(__iters.second)));
__local_first = _Traits::__begin(++__segment_iterator);
}
}
template <class _Cp, bool _IsConst>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
operator()(__bit_iterator<_Cp, _IsConst> __first,
__bit_iterator<_Cp, _IsConst> __last,
__bit_iterator<_Cp, false> __result) const {
if (__first.__ctz_ == __result.__ctz_)
return std::make_pair(__last, std::__copy_aligned(__first, __last, __result));
return std::make_pair(__last, std::__copy_unaligned(__first, __last, __result));
}
// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
operator()(_In* __first, _In* __last, _Out* __result) const {
return std::__copy_trivial_impl(__first, __last, __result);
}
};
template <class _InIter, class _Sent, class _OutIter>
pair<_InIter, _OutIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14
__copy(_InIter __first, _Sent __last, _OutIter __result) {
return std::__copy_move_unwrap_iters<__copy_impl>(std::move(__first), std::move(__last), std::move(__result));
}
template <class _InputIterator, class _OutputIterator>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result) {
return std::__copy(__first, __last, __result).second;
}
_LIBCPP_END_NAMESPACE_STD
_LIBCPP_POP_MACROS
#endif // _LIBCPP___ALGORITHM_COPY_H