
If lots of threads do lots of malloc/free and they overflow the per-pthread DenseSlabAlloc cache, it causes heavy contention on the allocator mutex:

  31.97%  race.old  race.old  [.] __sanitizer::StaticSpinMutex::LockSlow
  17.61%  race.old  race.old  [.] __tsan_read4
  10.77%  race.old  race.old  [.] __tsan::SlotLock

Optimize DenseSlabAlloc to use a lock-free stack of batches of nodes. This way we take no locks at all in steady state and do only one push/pop per Refill/Drain.

Effect on the added benchmark (columns: wall-clock seconds, user CPU seconds, system CPU seconds, max RSS in KB):

$ TIME="%e %U %S %M" time ./test.old 36 5 2000000
34.51 978.22 175.67 5833592
32.53 891.73 167.03 5790036
36.17 1005.54 201.24 5802828
36.94 1004.76 226.58 5803188

$ TIME="%e %U %S %M" time ./test.new 36 5 2000000
26.44 720.99 13.45 5750704
25.92 721.98 13.58 5767764
26.33 725.15 13.41 5777936
25.93 713.49 13.41 5791796

Reviewed By: melver

Differential Revision: https://reviews.llvm.org/D130002
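For illustration only, here is a minimal sketch of the "lock-free stack of batches" idea, assuming a simple Treiber-style stack. The names Batch, BatchStack, Push, Pop and the batch size are hypothetical and this is not the actual DenseSlabAlloc code (which, among other things, has to deal with ABA hazards); it only shows why Refill/Drain reduce to a single CAS push or pop in steady state.

#include <atomic>
#include <cstddef>

// Hypothetical batch of free nodes; the real allocator tracks node indices,
// not raw pointers.
struct Batch {
  Batch *next = nullptr;
  void *nodes[64]; // illustrative batch size
  size_t count = 0;
};

// Treiber-style lock-free stack of whole batches: Drain pushes one batch,
// Refill pops one batch, so no mutex is taken in steady state.
class BatchStack {
  std::atomic<Batch *> head_{nullptr};

 public:
  void Push(Batch *b) {
    Batch *old = head_.load(std::memory_order_relaxed);
    do {
      b->next = old;
    } while (!head_.compare_exchange_weak(old, b, std::memory_order_release,
                                          std::memory_order_relaxed));
  }

  // Returns nullptr when empty; ABA hazards are ignored in this sketch.
  Batch *Pop() {
    Batch *old = head_.load(std::memory_order_acquire);
    while (old && !head_.compare_exchange_weak(old, old->next,
                                               std::memory_order_acquire,
                                               std::memory_order_relaxed)) {
    }
    return old;
  }
};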
// RUN: %clangxx_tsan %s -o %t
// RUN: %run %t 2>&1 | FileCheck %s

// bench.h needs pthread barriers which are not available on OS X
// UNSUPPORTED: darwin

#include "bench.h"

// Each thread repeatedly allocates and frees bench_mode 8-byte blocks,
// stressing the TSan internal allocator caches.
void thread(int tid) {
  void **blocks = new void *[bench_mode];
  for (int i = 0; i < bench_niter; i++) {
    for (int j = 0; j < bench_mode; j++)
      blocks[j] = malloc(8);
    for (int j = 0; j < bench_mode; j++)
      free(blocks[j]);
  }
  delete[] blocks;
}

void bench() { start_thread_group(bench_nthread, thread); }

// CHECK: DONE