llvm-project/libcxx/benchmarks/unordered_set_operations.bench.cpp
Dmitry Ilvokhin 78addb2c32 Use hash value checks optimizations consistently
There are a couple of optimizations in `__hash_table::find` which are also
applicable to other places, such as `__hash_table::__node_insert_unique_prepare`
and `__hash_table::__emplace_unique_key_args`.

```
for (__nd = __nd->__next_; __nd != nullptr &&
    (__nd->__hash() == __hash
    // ^^^^^^^^^^^^^^^^^^^^^^
    //         (1)
      || std::__constrain_hash(__nd->__hash(), __bc) == __chash);
                                               __nd = __nd->__next_)
{
    if ((__nd->__hash() == __hash)
    // ^^^^^^^^^^^^^^^^^^^^^^^^^^
    //           (2)
        && key_eq()(__nd->__upcast()->__value_, __k))
        return iterator(__nd, this);
}
```

(1): We can avoid expensive modulo operations from `std::__constrain_hash` if
hashes matched. This one is from commit 6a411472e3c4.
(2): We can avoid `key_eq` calls if hashes didn't match. Commit:
318d35a7bca6c4e5.

Both of them are applicable for insert and emplace methods.

Results of unordered_set_operations benchmark:

```
Comparing /tmp/main to /tmp/hashtable-hash-value-optimization
Benchmark                                                                 Time             CPU      Time Old      Time New       CPU Old       CPU New
------------------------------------------------------------------------------------------------------------------------------------------------------
BM_Hash/uint32_random_std_hash/1024                                    -0.0127         -0.0127          1511          1492          1508          1489
BM_Hash/uint32_random_custom_hash/1024                                 +0.0012         +0.0013          1370          1371          1367          1369
BM_Hash/uint32_top_std_hash/1024                                       -0.0027         -0.0028          1502          1497          1498          1494
BM_Hash/uint32_top_custom_hash/1024                                    +0.0033         +0.0032          1368          1373          1365          1370
BM_InsertValue/unordered_set_uint32/1024                               +0.0267         +0.0266         36421         37392         36350         37318
BM_InsertValue/unordered_set_uint32_sorted/1024                        +0.0230         +0.0229         28247         28897         28193         28837
BM_InsertValue/unordered_set_top_bits_uint32/1024                      +0.0492         +0.0491         31012         32539         30952         32472
BM_InsertValueRehash/unordered_set_top_bits_uint32/1024                +0.0523         +0.0520         62905         66197         62780         66043
BM_InsertValue/unordered_set_string/1024                               -0.0252         -0.0253        300762        293189        299805        292221
BM_InsertValueRehash/unordered_set_string/1024                         -0.0932         -0.0920        332924        301882        331276        300810
BM_InsertValue/unordered_set_prefixed_string/1024                      -0.0578         -0.0577        314239        296072        313222        295137
BM_InsertValueRehash/unordered_set_prefixed_string/1024                -0.0986         -0.0985        336091        302950        334982        301995
BM_Find/unordered_set_random_uint64/1024                               -0.1416         -0.1417         16075         13798         16041         13769
BM_FindRehash/unordered_set_random_uint64/1024                         -0.0105         -0.0105          5900          5838          5889          5827
BM_Find/unordered_set_sorted_uint64/1024                               +0.0014         +0.0014          2813          2817          2807          2811
BM_FindRehash/unordered_set_sorted_uint64/1024                         -0.0247         -0.0249          5863          5718          5851          5706
BM_Find/unordered_set_sorted_uint128/1024                              +0.0113         +0.0112         15570         15746         15539         15713
BM_FindRehash/unordered_set_sorted_uint128/1024                        +0.0438         +0.0441          6917          7220          6902          7206
BM_Find/unordered_set_sorted_uint32/1024                               -0.0020         -0.0020          3098          3091          3092          3085
BM_FindRehash/unordered_set_sorted_uint32/1024                         +0.0570         +0.0569          5377          5684          5368          5673
BM_Find/unordered_set_sorted_large_uint64/1024                         +0.0081         +0.0081          3594          3623          3587          3616
BM_FindRehash/unordered_set_sorted_large_uint64/1024                   -0.0542         -0.0540          6154          5820          6140          5808
BM_Find/unordered_set_top_bits_uint64/1024                             -0.0061         -0.0061         10440         10377         10417         10353
BM_FindRehash/unordered_set_top_bits_uint64/1024                       +0.0131         +0.0128          5852          5928          5840          5914
BM_Find/unordered_set_string/1024                                      -0.0352         -0.0349        189037        182384        188389        181809
BM_FindRehash/unordered_set_string/1024                                -0.0309         -0.0311        180718        175142        180141        174532
BM_Find/unordered_set_prefixed_string/1024                             -0.0559         -0.0557        190853        180177        190251        179659
BM_FindRehash/unordered_set_prefixed_string/1024                       -0.0563         -0.0561        182396        172136        181797        171602
BM_Rehash/unordered_set_string_arg/1024                                -0.0244         -0.0241         27052         26393         26989         26339
BM_Rehash/unordered_set_int_arg/1024                                   -0.0410         -0.0410         19582         18779         19539         18738
BM_InsertDuplicate/unordered_set_int/1024                              +0.0023         +0.0025         12168         12196         12142         12173
BM_InsertDuplicate/unordered_set_string/1024                           -0.0505         -0.0504        189238        179683        188648        179133
BM_InsertDuplicate/unordered_set_prefixed_string/1024                  -0.0989         -0.0987        198893        179222        198263        178702
BM_EmplaceDuplicate/unordered_set_int/1024                             -0.0175         -0.0173         12674         12452         12646         12427
BM_EmplaceDuplicate/unordered_set_string/1024                          -0.0559         -0.0557        190104        179481        189492        178934
BM_EmplaceDuplicate/unordered_set_prefixed_string/1024                 -0.1111         -0.1110        201233        178870        200608        178341
BM_InsertDuplicate/unordered_set_int_insert_arg/1024                   -0.0747         -0.0745         12993         12022         12964         11997
BM_InsertDuplicate/unordered_set_string_insert_arg/1024                -0.0584         -0.0583        191489        180311        190864        179731
BM_EmplaceDuplicate/unordered_set_int_insert_arg/1024                  -0.0807         -0.0804         35946         33047         35866         32982
BM_EmplaceDuplicate/unordered_set_string_arg/1024                      -0.0312         -0.0310        321623        311601        320559        310637
OVERALL_GEOMEAN                                                        -0.0276         -0.0275             0             0             0             0
```

The time differences look more like noise to me. But if we want to have these
optimizations in `find`, we probably want them in `insert` and `emplace` as
well.

Reviewed By: #libc, Mordante

Differential Revision: https://reviews.llvm.org/D140779
2023-07-04 21:01:08 +02:00

322 lines
11 KiB
C++

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <unordered_set>
#include <vector>
#include "benchmark/benchmark.h"
#include "ContainerBenchmarks.h"
#include "GenerateInput.h"
#include "test_macros.h"
// Makes the names from ContainerBenchmarks usable unqualified below
// (presumably the BM_* templates other than BM_Hash, which is defined in this
// file -- confirm against ContainerBenchmarks.h).
using namespace ContainerBenchmarks;
// Argument handed to every registration via ->Arg(...); BM_Hash reads it back
// through st.range(0) and forwards it to the input generator.
constexpr std::size_t TestNumInputs = 1024;
// Reads one `_Size` value out of the bytes starting at `__p`.
// The bytes are copied with std::memcpy into a local of type `_Size`, so
// exactly sizeof(_Size) bytes are read from `__p`.
template <class _Size>
inline TEST_ALWAYS_INLINE _Size loadword(const void* __p) {
  _Size __word;
  std::memcpy(&__word, __p, sizeof(_Size));
  return __word;
}
// Rotates `__val` to the right by `__shift` bits.
// The wrapped-around bits are produced by shifting left by `64 - __shift`, so
// the caller must pass a shift of at least 1 (a shift of 0 would shift left by
// 64). NOTE(review): the literal 64 assumes std::size_t is 64 bits wide.
inline TEST_ALWAYS_INLINE std::size_t rotate_by_at_least_1(std::size_t __val, int __shift) {
  const std::size_t __kept_bits    = __val >> __shift;
  const std::size_t __wrapped_bits = __val << (64 - __shift);
  return __kept_bits | __wrapped_bits;
}
// Mixes two words into a single hash value.
// Each round XORs the inputs, multiplies by a fixed odd constant and folds the
// high bits back down with a 47-bit right shift; a final multiply finishes the
// mix.
inline TEST_ALWAYS_INLINE std::size_t hash_len_16(std::size_t __u, std::size_t __v) {
  const std::size_t __factor = 0x9ddfea08eb382d69ULL;

  // First round: combine both inputs.
  const std::size_t __round1 = (__u ^ __v) * __factor;
  const std::size_t __mixed1 = __round1 ^ (__round1 >> 47);

  // Second round: fold `__v` in again together with the first-round result.
  const std::size_t __round2 = (__v ^ __mixed1) * __factor;
  const std::size_t __mixed2 = __round2 ^ (__round2 >> 47);

  return __mixed2 * __factor;
}
// Hashes a key of exactly `_Len` bytes (only 4 or 8 are accepted) located at
// `__s`.
// Two 32-bit words are loaded: one from the start of the key and one ending at
// its last byte. For _Len == 4 both loads cover the same four bytes; for
// _Len == 8 they cover the first and the last four bytes. The words are then
// combined through hash_len_16.
template <std::size_t _Len>
inline TEST_ALWAYS_INLINE std::size_t hash_len_0_to_8(const char* __s) {
  static_assert(_Len == 4 || _Len == 8, "");
  const char* const __tail_pos = __s + _Len - 4;
  const uint64_t __head        = loadword<uint32_t>(__s);
  const uint64_t __tail        = loadword<uint32_t>(__tail_pos);
  return hash_len_16(_Len + (__head << 3), __tail);
}
// Hash functor for uint32_t keys: feeds the four bytes of the key to
// hash_len_0_to_8<4>.
struct UInt32Hash {
  UInt32Hash() = default;

  inline TEST_ALWAYS_INLINE std::size_t operator()(uint32_t data) const {
    // View the by-value copy of the key as raw bytes for the byte-oriented hash.
    const char* const __key_bytes = reinterpret_cast<const char*>(&data);
    return hash_len_0_to_8<4>(__key_bytes);
  }
};
// Hash functor for uint64_t keys: feeds the eight bytes of the key to
// hash_len_0_to_8<8>.
struct UInt64Hash {
  UInt64Hash() = default;

  inline TEST_ALWAYS_INLINE std::size_t operator()(uint64_t data) const {
    // View the by-value copy of the key as raw bytes for the byte-oriented hash.
    const char* const __key_bytes = reinterpret_cast<const char*>(&data);
    return hash_len_0_to_8<8>(__key_bytes);
  }
};
// Hash functor for __uint128_t keys.
// The key is split into a low part (bits selected by an all-ones std::size_t
// mask) and a high part (the same mask moved up by 64 bits, then shifted back
// down). The low part and a rotated copy of `high + 16` go through
// hash_len_16, and the result is XORed with the high part.
struct UInt128Hash {
  UInt128Hash() = default;

  inline TEST_ALWAYS_INLINE std::size_t operator()(__uint128_t data) const {
    const __uint128_t __low_mask  = static_cast<std::size_t>(-1);
    const __uint128_t __high_mask = __low_mask << 64;

    const std::size_t __low  = static_cast<std::size_t>(data & __low_mask);
    const std::size_t __high = static_cast<std::size_t>((data & __high_mask) >> 64);

    const std::size_t __rotated_high = rotate_by_at_least_1(__high + 16, 16);
    return hash_len_16(__low, __rotated_high) ^ __high;
  }
};
// Alternative hash functor for uint32_t keys that works entirely in 32-bit
// arithmetic: the key is scrambled with multiply / xor-shift steps, folded
// into a state that starts at 4, and the state is then finalized with two
// more xor-shifts around a multiply.
// NOTE(review): the constants 0x5bd1e995 / 24 / 13 / 15 look like the
// MurmurHash2 steps for a single 4-byte block -- confirm before relying on it.
struct UInt32Hash2 {
  UInt32Hash2() = default;

  inline TEST_ALWAYS_INLINE std::size_t operator()(uint32_t data) const {
    const uint32_t __multiplier = 0x5bd1e995;
    const uint32_t __key_shift  = 24;

    // Scramble the key.
    uint32_t __key = data;
    __key *= __multiplier;
    __key ^= __key >> __key_shift;
    __key *= __multiplier;

    // Fold the scrambled key into the running state (initial value 4).
    uint32_t __state = 4;
    __state *= __multiplier;
    __state ^= __key;

    // Final avalanche.
    __state ^= __state >> 13;
    __state *= __multiplier;
    __state ^= __state >> 15;
    return __state;
  }
};
// Second hash functor for uint64_t keys. Its body performs the same call as
// UInt64Hash: the eight key bytes are passed to hash_len_0_to_8<8>.
struct UInt64Hash2 {
  UInt64Hash2() = default;

  inline TEST_ALWAYS_INLINE std::size_t operator()(uint64_t data) const {
    const char* const __key_bytes = reinterpret_cast<const char*>(&data);
    return hash_len_0_to_8<8>(__key_bytes);
  }
};
// The sole purpose of this comparator is to be used in BM_Rehash, where
// we need something slow enough to be easily noticeable in benchmark results.
// The default implementation of operator== for strings seems to be a little
// too fast for that specific benchmark to reliably show a noticeable
// improvement, but unoptimized bytewise comparison fits just right.
// Early return is there just for convenience, since we only compare strings
// of equal length in BM_Rehash.
struct SlowStringEq {
  SlowStringEq() = default;

  // Returns true iff `lhs` and `rhs` have the same length and the same
  // characters. For equal-length strings every character pair is compared:
  // the loop deliberately does NOT stop at the first mismatch, which is what
  // keeps this comparator slow.
  inline TEST_ALWAYS_INLINE bool operator()(const std::string& lhs, const std::string& rhs) const {
    if (lhs.size() != rhs.size())
      return false;
    bool eq = true;
    // std::size_t (qualified) to match the rest of this file.
    for (std::size_t i = 0; i < lhs.size(); ++i) {
      eq &= lhs[i] == rhs[i];
    }
    return eq;
  }
};
//----------------------------------------------------------------------------//
// BM_Hash
// ---------------------------------------------------------------------------//
// Benchmarks a hash functor on its own, without any container.
//   st  - benchmark state; st.range(0) is forwarded to `gen`.
//   fn  - hash functor under test, called once per input element.
//   gen - input generator; its result must provide data() and size().
// Every timed iteration hashes all inputs and adds each result to a running
// sum. The DoNotOptimize / ClobberMemory calls are there so the compiler
// cannot discard the hash computations.
template <class HashFn, class GenInputs>
void BM_Hash(benchmark::State& st, HashFn fn, GenInputs gen) {
  auto in           = gen(st.range(0));
  const auto first  = in.data();
  const auto last   = first + in.size();
  std::size_t last_hash = 0;
  benchmark::DoNotOptimize(&last_hash);
  while (st.KeepRunning()) {
    auto cur = first;
    while (cur != last) {
      benchmark::DoNotOptimize(last_hash += fn(*cur));
      ++cur;
    }
    benchmark::ClobberMemory();
  }
}
// Registers BM_Hash for uint32_t keys with two hash functors (std::hash and
// the custom UInt32Hash defined above) and two input generators each.
// Random inputs, std::hash.
BENCHMARK_CAPTURE(BM_Hash, uint32_random_std_hash, std::hash<uint32_t>{}, getRandomIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
// Random inputs, custom hash.
BENCHMARK_CAPTURE(BM_Hash, uint32_random_custom_hash, UInt32Hash{}, getRandomIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
// Inputs from getSortedTopBitsIntegerInputs, std::hash.
BENCHMARK_CAPTURE(BM_Hash, uint32_top_std_hash, std::hash<uint32_t>{}, getSortedTopBitsIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
// Inputs from getSortedTopBitsIntegerInputs, custom hash.
BENCHMARK_CAPTURE(BM_Hash, uint32_top_custom_hash, UInt32Hash{}, getSortedTopBitsIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
//----------------------------------------------------------------------------//
// BM_InsertValue
// ---------------------------------------------------------------------------//
// Each registration passes an empty container plus an input generator and
// runs with TestNumInputs as its argument. BM_InsertValue / BM_InsertValueRehash
// themselves are not defined in this file (presumably ContainerBenchmarks.h).
// Random and Sorted //
BENCHMARK_CAPTURE(
BM_InsertValue, unordered_set_uint32, std::unordered_set<uint32_t>{}, getRandomIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(
BM_InsertValue, unordered_set_uint32_sorted, std::unordered_set<uint32_t>{}, getSortedIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
// Top Bits //
// Note: the plain variant uses the default hash, the Rehash variant uses the
// custom UInt32Hash, although both share the same test-case name.
BENCHMARK_CAPTURE(BM_InsertValue,
unordered_set_top_bits_uint32,
std::unordered_set<uint32_t>{},
getSortedTopBitsIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_InsertValueRehash,
unordered_set_top_bits_uint32,
std::unordered_set<uint32_t, UInt32Hash>{},
getSortedTopBitsIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
// String //
BENCHMARK_CAPTURE(BM_InsertValue, unordered_set_string, std::unordered_set<std::string>{}, getRandomStringInputs)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_InsertValueRehash, unordered_set_string, std::unordered_set<std::string>{}, getRandomStringInputs)
->Arg(TestNumInputs);
// Prefixed String //
BENCHMARK_CAPTURE(
BM_InsertValue, unordered_set_prefixed_string, std::unordered_set<std::string>{}, getPrefixedRandomStringInputs)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_InsertValueRehash,
unordered_set_prefixed_string,
std::unordered_set<std::string>{},
getPrefixedRandomStringInputs)
->Arg(TestNumInputs);
//----------------------------------------------------------------------------//
// BM_Find
// ---------------------------------------------------------------------------//
// Registrations come in pairs: BM_Find with a given container type, then
// BM_FindRehash on the same inputs. For the integer keys the BM_Find variant
// uses the default hash and the BM_FindRehash variant a custom functor from
// this file, except for the uint128 pair, which uses UInt128Hash in both.
// The string pairs use the default hash in both.
// BM_Find / BM_FindRehash are not defined in this file (presumably
// ContainerBenchmarks.h).
// Random (uint64) //
BENCHMARK_CAPTURE(
BM_Find, unordered_set_random_uint64, std::unordered_set<uint64_t>{}, getRandomIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_random_uint64,
std::unordered_set<uint64_t, UInt64Hash>{},
getRandomIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
// Sorted (uint64) //
BENCHMARK_CAPTURE(
BM_Find, unordered_set_sorted_uint64, std::unordered_set<uint64_t>{}, getSortedIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_sorted_uint64,
std::unordered_set<uint64_t, UInt64Hash>{},
getSortedIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
// Sorted Top Bits (uint128) //
// Both variants use UInt128Hash. The test-case name says "sorted", but the
// inputs come from getSortedTopBitsIntegerInputs.
// NOTE(review): the guard below is always true; it looks like a placeholder
// for a "128-bit integers are available" check -- confirm.
#if 1
BENCHMARK_CAPTURE(BM_Find,
unordered_set_sorted_uint128,
std::unordered_set<__uint128_t, UInt128Hash>{},
getSortedTopBitsIntegerInputs<__uint128_t>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_sorted_uint128,
std::unordered_set<__uint128_t, UInt128Hash>{},
getSortedTopBitsIntegerInputs<__uint128_t>)
->Arg(TestNumInputs);
#endif
// Sorted (uint32) //
// The Rehash variant uses UInt32Hash2 (the 32-bit-arithmetic functor).
BENCHMARK_CAPTURE(
BM_Find, unordered_set_sorted_uint32, std::unordered_set<uint32_t>{}, getSortedIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_sorted_uint32,
std::unordered_set<uint32_t, UInt32Hash2>{},
getSortedIntegerInputs<uint32_t>)
->Arg(TestNumInputs);
// Sorted Large (uint64) //
BENCHMARK_CAPTURE(
BM_Find, unordered_set_sorted_large_uint64, std::unordered_set<uint64_t>{}, getSortedLargeIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_sorted_large_uint64,
std::unordered_set<uint64_t, UInt64Hash>{},
getSortedLargeIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
// Top Bits (uint64) //
BENCHMARK_CAPTURE(
BM_Find, unordered_set_top_bits_uint64, std::unordered_set<uint64_t>{}, getSortedTopBitsIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_FindRehash,
unordered_set_top_bits_uint64,
std::unordered_set<uint64_t, UInt64Hash>{},
getSortedTopBitsIntegerInputs<uint64_t>)
->Arg(TestNumInputs);
// String //
BENCHMARK_CAPTURE(BM_Find, unordered_set_string, std::unordered_set<std::string>{}, getRandomStringInputs)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_FindRehash, unordered_set_string, std::unordered_set<std::string>{}, getRandomStringInputs)
->Arg(TestNumInputs);
// Prefixed String //
BENCHMARK_CAPTURE(
BM_Find, unordered_set_prefixed_string, std::unordered_set<std::string>{}, getPrefixedRandomStringInputs)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(
BM_FindRehash, unordered_set_prefixed_string, std::unordered_set<std::string>{}, getPrefixedRandomStringInputs)
->Arg(TestNumInputs);
//----------------------------------------------------------------------------//
// BM_Rehash
// ---------------------------------------------------------------------------//
// String keys: default std::hash<std::string>, but equality goes through
// SlowStringEq so that key comparisons are expensive enough to show up in the
// results.
BENCHMARK_CAPTURE(BM_Rehash,
unordered_set_string_arg,
std::unordered_set<std::string, std::hash<std::string>, SlowStringEq>{},
getRandomStringInputs)
->Arg(TestNumInputs);
// int keys: default hash and equality.
BENCHMARK_CAPTURE(BM_Rehash, unordered_set_int_arg, std::unordered_set<int>{}, getRandomIntegerInputs<int>)
->Arg(TestNumInputs);
//----------------------------------------------------------------------------//
// BM_InsertDuplicate / BM_EmplaceDuplicate
// ---------------------------------------------------------------------------//
// Neither benchmark template is defined in this file (presumably
// ContainerBenchmarks.h).
BENCHMARK_CAPTURE(BM_InsertDuplicate, unordered_set_int, std::unordered_set<int>{}, getRandomIntegerInputs<int>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_InsertDuplicate, unordered_set_string, std::unordered_set<std::string>{}, getRandomStringInputs)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_EmplaceDuplicate, unordered_set_int, std::unordered_set<int>{}, getRandomIntegerInputs<int>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(BM_EmplaceDuplicate, unordered_set_string, std::unordered_set<std::string>{}, getRandomStringInputs)
->Arg(TestNumInputs);
// NOTE(review): the two "*_insert_arg" BM_InsertDuplicate registrations below
// pass exactly the same container and generator as the unordered_set_int /
// unordered_set_string registrations above; only the test-case name differs.
BENCHMARK_CAPTURE(
BM_InsertDuplicate, unordered_set_int_insert_arg, std::unordered_set<int>{}, getRandomIntegerInputs<int>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(
BM_InsertDuplicate, unordered_set_string_insert_arg, std::unordered_set<std::string>{}, getRandomStringInputs)
->Arg(TestNumInputs);
// Here the generated element type differs from the set's key type: unsigned
// inputs go into a set of int, and inputs from getRandomCStringInputs go into
// a set of std::string.
// NOTE(review): presumably this is meant to exercise emplace with an argument
// that is not already the key type -- confirm.
BENCHMARK_CAPTURE(
BM_EmplaceDuplicate, unordered_set_int_insert_arg, std::unordered_set<int>{}, getRandomIntegerInputs<unsigned>)
->Arg(TestNumInputs);
BENCHMARK_CAPTURE(
BM_EmplaceDuplicate, unordered_set_string_arg, std::unordered_set<std::string>{}, getRandomCStringInputs)
->Arg(TestNumInputs);
// Expands to the program's main(), which runs the benchmarks registered above.
BENCHMARK_MAIN();