
This PR optimizes the performance of `std::ranges::copy` and `std::ranges::copy_n` specifically for `vector<bool>::iterator`, addressing a subtask outlined in issue #64038. The optimizations yield performance improvements of up to **2000x** for aligned copies and **60x** for unaligned copies. Additionally, new tests have been added to validate these enhancements.

- Aligned source-destination bits

ranges::copy

```
--------------------------------------------------------------------------
Benchmark                             Before        After      Improvement
--------------------------------------------------------------------------
bm_ranges_copy_vb_aligned/8          10.8 ns      1.42 ns          8x
bm_ranges_copy_vb_aligned/64         88.5 ns      2.28 ns         39x
bm_ranges_copy_vb_aligned/512         709 ns      1.95 ns        364x
bm_ranges_copy_vb_aligned/4096       5568 ns      5.01 ns       1111x
bm_ranges_copy_vb_aligned/32768     44754 ns      38.7 ns       1156x
bm_ranges_copy_vb_aligned/65536     91092 ns      73.2 ns       1244x
bm_ranges_copy_vb_aligned/102400   139473 ns       127 ns       1098x
bm_ranges_copy_vb_aligned/106496   189004 ns      81.5 ns       2319x
bm_ranges_copy_vb_aligned/110592   153647 ns      71.1 ns       2161x
bm_ranges_copy_vb_aligned/114688   159261 ns      70.2 ns       2269x
bm_ranges_copy_vb_aligned/118784   181910 ns      73.5 ns       2475x
bm_ranges_copy_vb_aligned/122880   174117 ns      76.5 ns       2276x
bm_ranges_copy_vb_aligned/126976   176020 ns      82.0 ns       2147x
bm_ranges_copy_vb_aligned/131072   180757 ns       137 ns       1319x
bm_ranges_copy_vb_aligned/135168   190342 ns       158 ns       1205x
bm_ranges_copy_vb_aligned/139264   192831 ns       103 ns       1872x
bm_ranges_copy_vb_aligned/143360   199627 ns      89.4 ns       2233x
bm_ranges_copy_vb_aligned/147456   203881 ns      88.6 ns       2301x
bm_ranges_copy_vb_aligned/151552   213345 ns      88.4 ns       2413x
bm_ranges_copy_vb_aligned/155648   216892 ns      92.9 ns       2335x
bm_ranges_copy_vb_aligned/159744   222751 ns      96.4 ns       2311x
bm_ranges_copy_vb_aligned/163840   225995 ns       173 ns       1306x
bm_ranges_copy_vb_aligned/167936   235230 ns       202 ns       1165x
bm_ranges_copy_vb_aligned/172032   244093 ns       131 ns       1863x
bm_ranges_copy_vb_aligned/176128   244434 ns       111 ns       2202x
bm_ranges_copy_vb_aligned/180224   249570 ns       108 ns       2311x
bm_ranges_copy_vb_aligned/184320   254538 ns       108 ns       2357x
bm_ranges_copy_vb_aligned/188416   261817 ns       113 ns       2317x
bm_ranges_copy_vb_aligned/192512   269923 ns       125 ns       2159x
bm_ranges_copy_vb_aligned/196608   273494 ns       210 ns       1302x
bm_ranges_copy_vb_aligned/200704   280035 ns       269 ns       1041x
bm_ranges_copy_vb_aligned/204800   293102 ns       231 ns       1269x
```

ranges::copy_n

```
--------------------------------------------------------------------------
Benchmark                             Before        After      Improvement
--------------------------------------------------------------------------
bm_ranges_copy_n_vb_aligned/8        11.8 ns      0.89 ns         13x
bm_ranges_copy_n_vb_aligned/64       91.6 ns      2.06 ns         44x
bm_ranges_copy_n_vb_aligned/512       718 ns      2.45 ns        293x
bm_ranges_copy_n_vb_aligned/4096     5750 ns      5.02 ns       1145x
bm_ranges_copy_n_vb_aligned/32768   45824 ns      40.9 ns       1120x
bm_ranges_copy_n_vb_aligned/65536   92267 ns      73.8 ns       1250x
bm_ranges_copy_n_vb_aligned/102400 143267 ns       125 ns       1146x
bm_ranges_copy_n_vb_aligned/106496 148625 ns      82.4 ns       1804x
bm_ranges_copy_n_vb_aligned/110592 154817 ns      72.0 ns       2150x
bm_ranges_copy_n_vb_aligned/114688 157953 ns      70.4 ns       2244x
bm_ranges_copy_n_vb_aligned/118784 162374 ns      71.5 ns       2270x
bm_ranges_copy_n_vb_aligned/122880 168638 ns      72.9 ns       2313x
bm_ranges_copy_n_vb_aligned/126976 175596 ns      76.6 ns       2292x
bm_ranges_copy_n_vb_aligned/131072 181164 ns       135 ns       1342x
bm_ranges_copy_n_vb_aligned/135168 184697 ns       157 ns       1176x
bm_ranges_copy_n_vb_aligned/139264 191395 ns       104 ns       1840x
bm_ranges_copy_n_vb_aligned/143360 194954 ns      88.3 ns       2208x
bm_ranges_copy_n_vb_aligned/147456 208917 ns      86.1 ns       2426x
bm_ranges_copy_n_vb_aligned/151552 211101 ns      87.2 ns       2421x
bm_ranges_copy_n_vb_aligned/155648 213175 ns      89.0 ns       2395x
bm_ranges_copy_n_vb_aligned/159744 218988 ns      86.7 ns       2526x
bm_ranges_copy_n_vb_aligned/163840 225263 ns       156 ns       1444x
bm_ranges_copy_n_vb_aligned/167936 230725 ns       184 ns       1254x
bm_ranges_copy_n_vb_aligned/172032 235795 ns       119 ns       1981x
bm_ranges_copy_n_vb_aligned/176128 241145 ns       101 ns       2388x
bm_ranges_copy_n_vb_aligned/180224 250680 ns      99.5 ns       2519x
bm_ranges_copy_n_vb_aligned/184320 262954 ns      99.7 ns       2637x
bm_ranges_copy_n_vb_aligned/188416 258584 ns       103 ns       2510x
bm_ranges_copy_n_vb_aligned/192512 267190 ns       125 ns       2138x
bm_ranges_copy_n_vb_aligned/196608 270821 ns       213 ns       1271x
bm_ranges_copy_n_vb_aligned/200704 279532 ns       262 ns       1067x
bm_ranges_copy_n_vb_aligned/204800 283412 ns       222 ns       1277x
```

- Unaligned source-destination bits

```
--------------------------------------------------------------------------------
Benchmark                                  Before         After      Improvement
--------------------------------------------------------------------------------
bm_ranges_copy_vb_unaligned/8             12.8 ns       8.59 ns         1.5x
bm_ranges_copy_vb_unaligned/64            98.2 ns       8.24 ns          12x
bm_ranges_copy_vb_unaligned/512            755 ns       18.1 ns          42x
bm_ranges_copy_vb_unaligned/4096          6027 ns        102 ns          59x
bm_ranges_copy_vb_unaligned/32768        47663 ns        774 ns          62x
bm_ranges_copy_vb_unaligned/262144      378981 ns       6455 ns          59x
bm_ranges_copy_vb_unaligned/1048576    1520486 ns      25942 ns          59x
bm_ranges_copy_n_vb_unaligned/8           11.3 ns       8.22 ns         1.4x
bm_ranges_copy_n_vb_unaligned/64          97.3 ns       7.89 ns          12x
bm_ranges_copy_n_vb_unaligned/512          747 ns       18.1 ns          41x
bm_ranges_copy_n_vb_unaligned/4096        5932 ns       99.0 ns          60x
bm_ranges_copy_n_vb_unaligned/32768      47776 ns        749 ns          64x
bm_ranges_copy_n_vb_unaligned/262144    378802 ns       6576 ns          58x
bm_ranges_copy_n_vb_unaligned/1048576  1547234 ns      26229 ns          59x
```
255 lines
11 KiB
C++
255 lines
11 KiB
C++
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef _LIBCPP___ALGORITHM_COPY_H
|
|
#define _LIBCPP___ALGORITHM_COPY_H
|
|
|
|
#include <__algorithm/copy_move_common.h>
|
|
#include <__algorithm/for_each_segment.h>
|
|
#include <__algorithm/min.h>
|
|
#include <__config>
|
|
#include <__fwd/bit_reference.h>
|
|
#include <__iterator/iterator_traits.h>
|
|
#include <__iterator/segmented_iterator.h>
|
|
#include <__memory/pointer_traits.h>
|
|
#include <__type_traits/common_type.h>
|
|
#include <__type_traits/enable_if.h>
|
|
#include <__utility/move.h>
|
|
#include <__utility/pair.h>
|
|
|
|
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
|
|
# pragma GCC system_header
|
|
#endif
|
|
|
|
_LIBCPP_PUSH_MACROS
|
|
#include <__undef_macros>
|
|
|
|
_LIBCPP_BEGIN_NAMESPACE_STD
|
|
|
|
template <class _InputIterator, class _OutputIterator>
|
|
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
|
|
copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result);
|
|
|
|
template <class _InIter, class _Sent, class _OutIter>
|
|
inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter> __copy(_InIter, _Sent, _OutIter);
|
|
|
|
template <class _Cp, bool _IsConst>
|
|
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_aligned(
|
|
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
|
|
using _In = __bit_iterator<_Cp, _IsConst>;
|
|
using difference_type = typename _In::difference_type;
|
|
using __storage_type = typename _In::__storage_type;
|
|
|
|
const int __bits_per_word = _In::__bits_per_word;
|
|
difference_type __n = __last - __first;
|
|
if (__n > 0) {
|
|
// do first word
|
|
if (__first.__ctz_ != 0) {
|
|
unsigned __clz = __bits_per_word - __first.__ctz_;
|
|
difference_type __dn = std::min(static_cast<difference_type>(__clz), __n);
|
|
__n -= __dn;
|
|
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz - __dn));
|
|
__storage_type __b = *__first.__seg_ & __m;
|
|
*__result.__seg_ &= ~__m;
|
|
*__result.__seg_ |= __b;
|
|
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
|
|
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
|
|
++__first.__seg_;
|
|
// __first.__ctz_ = 0;
|
|
}
|
|
// __first.__ctz_ == 0;
|
|
// do middle words
|
|
__storage_type __nw = __n / __bits_per_word;
|
|
std::copy(std::__to_address(__first.__seg_),
|
|
std::__to_address(__first.__seg_ + __nw),
|
|
std::__to_address(__result.__seg_));
|
|
__n -= __nw * __bits_per_word;
|
|
__result.__seg_ += __nw;
|
|
// do last word
|
|
if (__n > 0) {
|
|
__first.__seg_ += __nw;
|
|
__storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
|
|
__storage_type __b = *__first.__seg_ & __m;
|
|
*__result.__seg_ &= ~__m;
|
|
*__result.__seg_ |= __b;
|
|
__result.__ctz_ = static_cast<unsigned>(__n);
|
|
}
|
|
}
|
|
return __result;
|
|
}
|
|
|
|
template <class _Cp, bool _IsConst>
|
|
_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> __copy_unaligned(
|
|
__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) {
|
|
using _In = __bit_iterator<_Cp, _IsConst>;
|
|
using difference_type = typename _In::difference_type;
|
|
using __storage_type = typename _In::__storage_type;
|
|
|
|
const int __bits_per_word = _In::__bits_per_word;
|
|
difference_type __n = __last - __first;
|
|
if (__n > 0) {
|
|
// do first word
|
|
if (__first.__ctz_ != 0) {
|
|
unsigned __clz_f = __bits_per_word - __first.__ctz_;
|
|
difference_type __dn = std::min(static_cast<difference_type>(__clz_f), __n);
|
|
__n -= __dn;
|
|
__storage_type __m = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
|
|
__storage_type __b = *__first.__seg_ & __m;
|
|
unsigned __clz_r = __bits_per_word - __result.__ctz_;
|
|
__storage_type __ddn = std::min<__storage_type>(__dn, __clz_r);
|
|
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __ddn));
|
|
*__result.__seg_ &= ~__m;
|
|
if (__result.__ctz_ > __first.__ctz_)
|
|
*__result.__seg_ |= __b << (__result.__ctz_ - __first.__ctz_);
|
|
else
|
|
*__result.__seg_ |= __b >> (__first.__ctz_ - __result.__ctz_);
|
|
__result.__seg_ += (__ddn + __result.__ctz_) / __bits_per_word;
|
|
__result.__ctz_ = static_cast<unsigned>((__ddn + __result.__ctz_) % __bits_per_word);
|
|
__dn -= __ddn;
|
|
if (__dn > 0) {
|
|
__m = ~__storage_type(0) >> (__bits_per_word - __dn);
|
|
*__result.__seg_ &= ~__m;
|
|
*__result.__seg_ |= __b >> (__first.__ctz_ + __ddn);
|
|
__result.__ctz_ = static_cast<unsigned>(__dn);
|
|
}
|
|
++__first.__seg_;
|
|
// __first.__ctz_ = 0;
|
|
}
|
|
// __first.__ctz_ == 0;
|
|
// do middle words
|
|
unsigned __clz_r = __bits_per_word - __result.__ctz_;
|
|
__storage_type __m = ~__storage_type(0) << __result.__ctz_;
|
|
for (; __n >= __bits_per_word; __n -= __bits_per_word, ++__first.__seg_) {
|
|
__storage_type __b = *__first.__seg_;
|
|
*__result.__seg_ &= ~__m;
|
|
*__result.__seg_ |= __b << __result.__ctz_;
|
|
++__result.__seg_;
|
|
*__result.__seg_ &= __m;
|
|
*__result.__seg_ |= __b >> __clz_r;
|
|
}
|
|
// do last word
|
|
if (__n > 0) {
|
|
__m = ~__storage_type(0) >> (__bits_per_word - __n);
|
|
__storage_type __b = *__first.__seg_ & __m;
|
|
__storage_type __dn = std::min(__n, static_cast<difference_type>(__clz_r));
|
|
__m = (~__storage_type(0) << __result.__ctz_) & (~__storage_type(0) >> (__clz_r - __dn));
|
|
*__result.__seg_ &= ~__m;
|
|
*__result.__seg_ |= __b << __result.__ctz_;
|
|
__result.__seg_ += (__dn + __result.__ctz_) / __bits_per_word;
|
|
__result.__ctz_ = static_cast<unsigned>((__dn + __result.__ctz_) % __bits_per_word);
|
|
__n -= __dn;
|
|
if (__n > 0) {
|
|
__m = ~__storage_type(0) >> (__bits_per_word - __n);
|
|
*__result.__seg_ &= ~__m;
|
|
*__result.__seg_ |= __b >> __dn;
|
|
__result.__ctz_ = static_cast<unsigned>(__n);
|
|
}
|
|
}
|
|
}
|
|
return __result;
|
|
}
|
|
|
|
struct __copy_impl {
|
|
template <class _InIter, class _Sent, class _OutIter>
|
|
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
|
|
operator()(_InIter __first, _Sent __last, _OutIter __result) const {
|
|
while (__first != __last) {
|
|
*__result = *__first;
|
|
++__first;
|
|
++__result;
|
|
}
|
|
|
|
return std::make_pair(std::move(__first), std::move(__result));
|
|
}
|
|
|
|
template <class _InIter, class _OutIter>
|
|
struct _CopySegment {
|
|
using _Traits _LIBCPP_NODEBUG = __segmented_iterator_traits<_InIter>;
|
|
|
|
_OutIter& __result_;
|
|
|
|
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 explicit _CopySegment(_OutIter& __result)
|
|
: __result_(__result) {}
|
|
|
|
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
|
|
operator()(typename _Traits::__local_iterator __lfirst, typename _Traits::__local_iterator __llast) {
|
|
__result_ = std::__copy(__lfirst, __llast, std::move(__result_)).second;
|
|
}
|
|
};
|
|
|
|
template <class _InIter, class _OutIter, __enable_if_t<__is_segmented_iterator<_InIter>::value, int> = 0>
|
|
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
|
|
operator()(_InIter __first, _InIter __last, _OutIter __result) const {
|
|
std::__for_each_segment(__first, __last, _CopySegment<_InIter, _OutIter>(__result));
|
|
return std::make_pair(__last, std::move(__result));
|
|
}
|
|
|
|
template <class _InIter,
|
|
class _OutIter,
|
|
__enable_if_t<__has_random_access_iterator_category<_InIter>::value &&
|
|
!__is_segmented_iterator<_InIter>::value && __is_segmented_iterator<_OutIter>::value,
|
|
int> = 0>
|
|
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_InIter, _OutIter>
|
|
operator()(_InIter __first, _InIter __last, _OutIter __result) const {
|
|
using _Traits = __segmented_iterator_traits<_OutIter>;
|
|
using _DiffT = typename common_type<__iter_diff_t<_InIter>, __iter_diff_t<_OutIter> >::type;
|
|
|
|
if (__first == __last)
|
|
return std::make_pair(std::move(__first), std::move(__result));
|
|
|
|
auto __local_first = _Traits::__local(__result);
|
|
auto __segment_iterator = _Traits::__segment(__result);
|
|
while (true) {
|
|
auto __local_last = _Traits::__end(__segment_iterator);
|
|
auto __size = std::min<_DiffT>(__local_last - __local_first, __last - __first);
|
|
auto __iters = std::__copy(__first, __first + __size, __local_first);
|
|
__first = std::move(__iters.first);
|
|
|
|
if (__first == __last)
|
|
return std::make_pair(std::move(__first), _Traits::__compose(__segment_iterator, std::move(__iters.second)));
|
|
|
|
__local_first = _Traits::__begin(++__segment_iterator);
|
|
}
|
|
}
|
|
|
|
template <class _Cp, bool _IsConst>
|
|
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> >
|
|
operator()(__bit_iterator<_Cp, _IsConst> __first,
|
|
__bit_iterator<_Cp, _IsConst> __last,
|
|
__bit_iterator<_Cp, false> __result) const {
|
|
if (__first.__ctz_ == __result.__ctz_)
|
|
return std::make_pair(__last, std::__copy_aligned(__first, __last, __result));
|
|
return std::make_pair(__last, std::__copy_unaligned(__first, __last, __result));
|
|
}
|
|
|
|
// At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer.
|
|
template <class _In, class _Out, __enable_if_t<__can_lower_copy_assignment_to_memmove<_In, _Out>::value, int> = 0>
|
|
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*>
|
|
operator()(_In* __first, _In* __last, _Out* __result) const {
|
|
return std::__copy_trivial_impl(__first, __last, __result);
|
|
}
|
|
};
|
|
|
|
template <class _InIter, class _Sent, class _OutIter>
|
|
pair<_InIter, _OutIter> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14
|
|
__copy(_InIter __first, _Sent __last, _OutIter __result) {
|
|
return std::__copy_move_unwrap_iters<__copy_impl>(std::move(__first), std::move(__last), std::move(__result));
|
|
}
|
|
|
|
template <class _InputIterator, class _OutputIterator>
|
|
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _OutputIterator
|
|
copy(_InputIterator __first, _InputIterator __last, _OutputIterator __result) {
|
|
return std::__copy(__first, __last, __result).second;
|
|
}
|
|
|
|
_LIBCPP_END_NAMESPACE_STD
|
|
|
|
_LIBCPP_POP_MACROS
|
|
|
|
#endif // _LIBCPP___ALGORITHM_COPY_H
|