
This patch provides cleanups and improvements for the GPU benchmarking infrastructure. The key changes are:
- Fix benchmark convergence bug: round up the scaled iteration count (ceil) to ensure it grows properly. The previous truncation logic caused the iteration count to get stuck.
- Resolve the remaining compiler warning.
- Remove the unused `BenchmarkLogger` files: this was dead code that added maintenance and cognitive overhead without providing functionality.
- Improve build hygiene: clean up headers and CMake dependencies to strictly follow the 'include what you use' (IWYU) principle.
248 lines
8.2 KiB
C++
248 lines
8.2 KiB
C++
#include "LibcGpuBenchmark.h"
|
|
|
|
#include "hdr/stdint_proxy.h"
|
|
#include "src/__support/CPP/algorithm.h"
|
|
#include "src/__support/CPP/atomic.h"
|
|
#include "src/__support/CPP/string.h"
|
|
#include "src/__support/FPUtil/FPBits.h"
|
|
#include "src/__support/FPUtil/NearestIntegerOperations.h"
|
|
#include "src/__support/FPUtil/sqrt.h"
|
|
#include "src/__support/GPU/utils.h"
|
|
#include "src/__support/fixedvector.h"
|
|
#include "src/__support/macros/config.h"
|
|
#include "src/__support/time/gpu/time_utils.h"
|
|
#include "src/stdio/printf.h"
|
|
#include "src/time/clock.h"
|
|
|
|
namespace LIBC_NAMESPACE_DECL {
|
|
namespace benchmarks {
|
|
|
|
FixedVector<Benchmark *, 64> benchmarks;
|
|
|
|
void Benchmark::add_benchmark(Benchmark *benchmark) {
|
|
benchmarks.push_back(benchmark);
|
|
}
|
|
|
|
// Atomically adds `value` to a double whose bit pattern lives in the 64-bit
// atomic `atomic_bits`.
//
// The stored bits are reinterpreted as a double through FPBits, the addition
// is done on the double, and the resulting bits are published with a
// compare-and-swap. The loop relies on compare_exchange_strong writing the
// currently stored bits back into `observed` when the exchange fails, so each
// retry starts from the latest value.
static void atomic_add_double(cpp::Atomic<uint64_t> &atomic_bits,
                              double value) {
  using DoubleBits = LIBC_NAMESPACE::fputil::FPBits<double>;

  uint64_t observed = atomic_bits.load(cpp::MemoryOrder::RELAXED);
  bool swapped = false;
  do {
    const double previous = DoubleBits(observed).get_val();
    const double updated = previous + value;
    const uint64_t replacement = DoubleBits(updated).uintval();

    swapped = atomic_bits.compare_exchange_strong(
        observed, replacement, cpp::MemoryOrder::ACQUIRE,
        cpp::MemoryOrder::RELAXED);
  } while (!swapped);
}
|
|
|
|
struct AtomicBenchmarkSums {
|
|
cpp::Atomic<uint32_t> active_threads = 0;
|
|
cpp::Atomic<uint64_t> iterations_sum = 0;
|
|
cpp::Atomic<uint64_t> weighted_cycles_sum_bits = 0;
|
|
cpp::Atomic<uint64_t> weighted_squared_cycles_sum_bits = 0;
|
|
cpp::Atomic<uint64_t> min = UINT64_MAX;
|
|
cpp::Atomic<uint64_t> max = 0;
|
|
|
|
void reset() {
|
|
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
|
|
active_threads.store(0, cpp::MemoryOrder::RELAXED);
|
|
iterations_sum.store(0, cpp::MemoryOrder::RELAXED);
|
|
weighted_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
|
|
weighted_squared_cycles_sum_bits.store(0, cpp::MemoryOrder::RELAXED);
|
|
min.store(UINT64_MAX, cpp::MemoryOrder::RELAXED);
|
|
max.store(0, cpp::MemoryOrder::RELAXED);
|
|
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
|
|
}
|
|
|
|
void update(const BenchmarkResult &result) {
|
|
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
|
|
active_threads.fetch_add(1, cpp::MemoryOrder::RELAXED);
|
|
iterations_sum.fetch_add(result.total_iterations,
|
|
cpp::MemoryOrder::RELAXED);
|
|
|
|
const double n_i = static_cast<double>(result.total_iterations);
|
|
const double mean_i = result.cycles;
|
|
const double stddev_i = result.standard_deviation;
|
|
const double variance_i = stddev_i * stddev_i;
|
|
atomic_add_double(weighted_cycles_sum_bits, n_i * mean_i);
|
|
atomic_add_double(weighted_squared_cycles_sum_bits,
|
|
n_i * (variance_i + mean_i * mean_i));
|
|
|
|
// Perform a CAS loop to atomically update the min
|
|
uint64_t orig_min = min.load(cpp::MemoryOrder::RELAXED);
|
|
while (!min.compare_exchange_strong(
|
|
orig_min, cpp::min(orig_min, result.min), cpp::MemoryOrder::ACQUIRE,
|
|
cpp::MemoryOrder::RELAXED))
|
|
;
|
|
|
|
// Perform a CAS loop to atomically update the max
|
|
uint64_t orig_max = max.load(cpp::MemoryOrder::RELAXED);
|
|
while (!max.compare_exchange_strong(
|
|
orig_max, cpp::max(orig_max, result.max), cpp::MemoryOrder::ACQUIRE,
|
|
cpp::MemoryOrder::RELAXED))
|
|
;
|
|
|
|
cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);
|
|
}
|
|
};
|
|
|
|
// Accumulator that collects the results of the benchmark currently running.
AtomicBenchmarkSums all_results;

// Terminal escape sequences used when printing the suite name.
constexpr const char *GREEN = "\033[32m";
constexpr const char *RESET = "\033[0m";
|
|
|
|
// Combines the sums accumulated in `all_results` into a single BenchmarkResult
// and prints it as one table row for benchmark `b`.
//
// With S1 and S2 the weighted sums of cycles and squared cycles and N the
// total iteration count, the mean is S1 / N and the variance is
// S2 / N - mean^2. A negative variance is replaced by zero before the square
// root is taken. When no iterations were recorded, mean and standard deviation
// are reported as zero.
void print_results(Benchmark *b) {
  using DoubleBits = LIBC_NAMESPACE::fputil::FPBits<double>;
  constexpr auto RELAXED = cpp::MemoryOrder::RELAXED;

  BenchmarkResult final_result;
  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

  const uint32_t num_threads = all_results.active_threads.load(RELAXED);
  final_result.total_iterations = all_results.iterations_sum.load(RELAXED);

  double mean = 0.0;
  double stddev = 0.0;
  if (final_result.total_iterations > 0) {
    const uint64_t sum_bits =
        all_results.weighted_cycles_sum_bits.load(RELAXED);
    const uint64_t sum_sq_bits =
        all_results.weighted_squared_cycles_sum_bits.load(RELAXED);

    const double sum = DoubleBits(sum_bits).get_val();
    const double sum_sq = DoubleBits(sum_sq_bits).get_val();
    const double count = static_cast<double>(final_result.total_iterations);

    const double global_mean = sum / count;
    const double mean_of_squares = sum_sq / count;
    double variance = mean_of_squares - (global_mean * global_mean);
    if (variance < 0.0)
      variance = 0.0;

    mean = global_mean;
    stddev = fputil::sqrt<double>(variance);
  }
  final_result.cycles = mean;
  final_result.standard_deviation = stddev;

  final_result.min = all_results.min.load(RELAXED);
  final_result.max = all_results.max.load(RELAXED);
  cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE);

  LIBC_NAMESPACE::printf(
      "%-24s |%15.0f |%9.0f |%8llu |%8llu |%15llu |%9u |\n",
      b->get_test_name().data(), final_result.cycles,
      final_result.standard_deviation,
      static_cast<unsigned long long>(final_result.min),
      static_cast<unsigned long long>(final_result.max),
      static_cast<unsigned long long>(final_result.total_iterations),
      static_cast<unsigned>(num_threads));
}
|
|
|
|
// Prints the suite banner, the column titles and a dashed separator line.
//
// The suite name is taken from the first registered benchmark and printed in
// green. The title columns are padded to the field widths of the row format
// used by print_results() ("%-24s |%15.0f |%9.0f |%8llu |%8llu |%15llu |%9u |")
// so that headers and rows line up.
void print_header() {
  // Only name the suite when at least one benchmark is registered; indexing an
  // empty registry would read an element that was never stored.
  if (benchmarks.begin() != benchmarks.end()) {
    LIBC_NAMESPACE::printf("%s", GREEN);
    LIBC_NAMESPACE::printf("Running Suite: %-10s\n",
                           benchmarks[0]->get_suite_name().data());
    LIBC_NAMESPACE::printf("%s", RESET);
  }

  cpp::string titles = "Benchmark                |"
                       "  Cycles (Mean) |"
                       "   Stddev |"
                       "     Min |"
                       "     Max |"
                       "     Iterations |"
                       "  Threads |\n";
  // Print through "%s" so the text is never interpreted as a format string.
  LIBC_NAMESPACE::printf("%s", titles.data());

  // Same width as the title line, with the final character turned into the
  // line break.
  cpp::string separator(titles.size(), '-');
  separator[titles.size() - 1] = '\n';
  LIBC_NAMESPACE::printf("%s", separator.data());
}
|
|
|
|
void Benchmark::run_benchmarks() {
|
|
uint64_t id = gpu::get_thread_id();
|
|
|
|
if (id == 0)
|
|
print_header();
|
|
|
|
gpu::sync_threads();
|
|
|
|
for (Benchmark *b : benchmarks) {
|
|
if (id == 0)
|
|
all_results.reset();
|
|
|
|
gpu::sync_threads();
|
|
if (b->num_threads == static_cast<uint32_t>(-1) || id < b->num_threads) {
|
|
auto current_result = b->run();
|
|
all_results.update(current_result);
|
|
}
|
|
gpu::sync_threads();
|
|
|
|
if (id == 0)
|
|
print_results(b);
|
|
}
|
|
gpu::sync_threads();
|
|
}
|
|
|
|
// Samples `target` repeatedly and returns the aggregated measurements.
//
// Each sample invokes `target` (passing an increasing call index) until the
// per-sample estimator has recorded the current batch size, then folds the
// sample into the running estimate via compute_improvement(). Sampling ends
// when
//   * the time budget taken from options.max_duration has gone negative,
//   * options.max_samples or options.max_iterations has been reached, or
//   * the minimum duration, sample count and iteration count are all met and
//     the value returned by compute_improvement() is below options.epsilon.
// Otherwise the batch size is multiplied by options.scaling_factor and rounded
// up before the next sample.
BenchmarkResult benchmark(const BenchmarkOptions &options,
                          const BenchmarkTarget &target) {
  BenchmarkResult result;
  RuntimeEstimationProgression rep;

  // Always run at least one iteration per sample.
  uint32_t batch_size = options.initial_iterations;
  if (batch_size == 0)
    batch_size = 1;

  uint32_t sample_count = 0;
  uint64_t elapsed_ns = 0;
  uint64_t fastest = UINT64_MAX;
  uint64_t slowest = 0;
  uint32_t next_call = 0;

  int64_t time_budget = options.max_duration;
  while (time_budget >= 0) {
    RefinableRuntimeEstimator sample_estimator;

    const clock_t begin = clock();
    while (sample_estimator.get_iterations() < batch_size) {
      auto measured = target(next_call++);
      slowest = cpp::max(slowest, measured);
      fastest = cpp::min(fastest, measured);
      sample_estimator.update(measured);
    }
    const clock_t finish = clock();

    // Convert clock ticks to nanoseconds.
    const clock_t duration_ns =
        ((finish - begin) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC;
    elapsed_ns += duration_ns;
    time_budget -= duration_ns;
    ++sample_count;

    const double change_ratio = rep.compute_improvement(sample_estimator);

    const bool hit_hard_limit = sample_count >= options.max_samples ||
                                batch_size >= options.max_iterations;
    if (hit_hard_limit)
      break;

    const auto total_iterations = rep.get_estimator().get_iterations();

    const bool converged = elapsed_ns >= options.min_duration &&
                           sample_count >= options.min_samples &&
                           total_iterations >= options.min_iterations &&
                           change_ratio < options.epsilon;
    if (converged)
      break;

    // Rounding up makes the batch grow for any factor above 1; truncating
    // would leave a batch of 1 unchanged for factors below 2.
    batch_size = static_cast<uint32_t>(
        fputil::ceil(batch_size * options.scaling_factor));
  }

  const auto &estimator = rep.get_estimator();
  result.total_iterations = estimator.get_iterations();
  result.cycles = estimator.get_mean();
  result.standard_deviation = estimator.get_stddev();
  result.min = fastest;
  result.max = slowest;

  return result;
}
|
|
|
|
} // namespace benchmarks
|
|
} // namespace LIBC_NAMESPACE_DECL
|