[libc] revamp memory function benchmark

The benchmarking infrastructure can now run in two modes:
 - Sweep Mode, which generates a ramp of size values (same as before);
 - Distribution Mode, which lets the user select a distribution for the size parameter that is representative of production workloads.

The analysis tool has also been updated to handle both modes.
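
For example, the new `libc-benchmark-main` tool can be driven roughly as follows (flag names are taken from LibcMemoryBenchmarkMain.cpp below; the study names, distribution name, and output paths are placeholders):

```shell
# Sweep Mode: measure every size from 0 up to --sweep-max-size.
./libc-benchmark-main --study-name="memcpy sweep" --function=memcpy \
    --sweep-mode --sweep-max-size=256 --num-trials=5 --output=sweep.json

# Distribution Mode (the default): sizes are drawn from a named distribution.
./libc-benchmark-main --study-name="memcpy distribution" --function=memcpy \
    --size-distribution-name="memcpy Google A" --num-trials=5 --output=distribution.json
```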

Differential Revision: https://reviews.llvm.org/D93210
Guillaume Chatelet 2020-12-17 13:16:14 +00:00
parent e7a3c4c11e
commit deae7e982a
20 changed files with 775 additions and 1001 deletions


@ -20,6 +20,7 @@ ExternalProject_Add(google-benchmark
SOURCE_DIR ${LIBC_SOURCE_DIR}/../llvm/utils/benchmark SOURCE_DIR ${LIBC_SOURCE_DIR}/../llvm/utils/benchmark
INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark
CMAKE_CACHE_ARGS CMAKE_CACHE_ARGS
-DBUILD_SHARED_LIBS:BOOL=OFF
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER}
@ -114,7 +115,10 @@ add_library(libc-memory-benchmark
MemorySizeDistributions.cpp MemorySizeDistributions.cpp
MemorySizeDistributions.h MemorySizeDistributions.h
) )
target_link_libraries(libc-memory-benchmark PUBLIC libc-benchmark) target_link_libraries(libc-memory-benchmark
PUBLIC
libc-benchmark
)
fix_rtti(libc-memory-benchmark) fix_rtti(libc-memory-benchmark)
add_libc_benchmark_unittest(libc-memory-benchmark-test add_libc_benchmark_unittest(libc-memory-benchmark-test
@ -138,54 +142,14 @@ add_libc_benchmark_unittest(json-test
) )
#============================================================================== #==============================================================================
# Benchmark tests configuration # Benchmarking tool
#============================================================================== #==============================================================================
function(add_libc_benchmark_analysis conf_target run_target) add_executable(libc-benchmark-main
set(png_file "/tmp/last-${conf_target}.png") EXCLUDE_FROM_ALL
set(render_target render-${conf_target}) LibcMemoryBenchmarkMain.cpp
add_custom_target(${render_target} )
COMMAND python3 render.py3 ${json_file} --headless --output=${png_file} foreach(entrypoint_target libc.src.string.memcpy libc.src.string.memset)
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "render ${libc_target} to ${png_file}"
)
add_dependencies(${render_target} ${run_target})
set(display_target display-${conf_target})
add_custom_target(${display_target}
COMMAND python3 render.py3 ${json_file}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "display ${libc_target}"
)
add_dependencies(${display_target} ${run_target})
endfunction()
function(add_libc_benchmark_configuration target configuration)
set(conf_target ${target}-${configuration})
set(json_file "/tmp/last-${conf_target}.json")
set(run_target run-${conf_target})
add_custom_target(${run_target}
COMMAND ${libc_target} --conf=configuration_${configuration}.json -o ${json_file}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
add_libc_benchmark_analysis(${conf_target} ${run_target})
endfunction()
function(add_libc_benchmark name file entrypoint_target)
set(libc_target libc-${name}-benchmark)
add_executable(${libc_target}
EXCLUDE_FROM_ALL
${file}
LibcMemoryBenchmarkMain.h
LibcMemoryBenchmarkMain.cpp
)
get_target_property(entrypoint_object_file ${entrypoint_target} "OBJECT_FILE_RAW") get_target_property(entrypoint_object_file ${entrypoint_target} "OBJECT_FILE_RAW")
target_link_libraries(${libc_target} PUBLIC json ${entrypoint_object_file}) target_link_libraries(libc-benchmark-main PUBLIC json ${entrypoint_object_file})
foreach(configuration "small" "big") endforeach()
add_libc_benchmark_configuration(${libc_target} ${configuration})
endforeach()
endfunction()
add_libc_benchmark(memcpy Memcpy.cpp libc.src.string.memcpy)
add_libc_benchmark(memset Memset.cpp libc.src.string.memset)


@ -40,6 +40,14 @@ static Error intFromJsonTemplate(const json::Value &V, T &Out) {
return createStringError(errc::io_error, "Can't parse Integer"); return createStringError(errc::io_error, "Can't parse Integer");
} }
static Error fromJson(const json::Value &V, bool &Out) {
if (auto B = V.getAsBoolean()) {
Out = *B;
return Error::success();
}
return createStringError(errc::io_error, "Can't parse Boolean");
}
static Error fromJson(const json::Value &V, double &Out) { static Error fromJson(const json::Value &V, double &Out) {
if (auto S = V.getAsNumber()) { if (auto S = V.getAsNumber()) {
Out = *S; Out = *S;
@ -60,10 +68,6 @@ static Error fromJson(const json::Value &V, uint32_t &Out) {
return intFromJsonTemplate(V, Out); return intFromJsonTemplate(V, Out);
} }
static Error fromJson(const json::Value &V, uint8_t &Out) {
return intFromJsonTemplate(V, Out);
}
static Error fromJson(const json::Value &V, int &Out) { static Error fromJson(const json::Value &V, int &Out) {
return intFromJsonTemplate(V, Out); return intFromJsonTemplate(V, Out);
} }
@ -186,22 +190,15 @@ static Error fromJson(const json::Value &V,
return O.takeError(); return O.takeError();
} }
static Error fromJson(const json::Value &V, libc_benchmarks::SizeRange &Out) {
JsonObjectMapper O(V);
O.map("From", Out.From);
O.map("To", Out.To);
O.map("Step", Out.Step);
return O.takeError();
}
static Error fromJson(const json::Value &V, static Error fromJson(const json::Value &V,
libc_benchmarks::StudyConfiguration &Out) { libc_benchmarks::StudyConfiguration &Out) {
JsonObjectMapper O(V); JsonObjectMapper O(V);
O.map("Runs", Out.Runs); O.map("Function", Out.Function);
O.map("BufferSize", Out.BufferSize); O.map("NumTrials", Out.NumTrials);
O.map("Size", Out.Size); O.map("IsSweepMode", Out.IsSweepMode);
O.map("AddressAlignment", Out.AddressAlignment); O.map("SweepModeMaxSize", Out.SweepModeMaxSize);
O.map("MemsetValue", Out.MemsetValue); O.map("SizeDistributionName", Out.SizeDistributionName);
O.map("AccessAlignment", Out.AccessAlignment);
O.map("MemcmpMismatchAt", Out.MemcmpMismatchAt); O.map("MemcmpMismatchAt", Out.MemcmpMismatchAt);
return O.takeError(); return O.takeError();
} }
@ -223,39 +220,29 @@ static Error fromJson(const json::Value &V, libc_benchmarks::HostState &Out) {
return O.takeError(); return O.takeError();
} }
static Error fromJson(const json::Value &V, static Error fromJson(const json::Value &V, libc_benchmarks::Runtime &Out) {
libc_benchmarks::FunctionMeasurements &Out) {
JsonObjectMapper O(V); JsonObjectMapper O(V);
O.map("Name", Out.Name); O.map("Host", Out.Host);
std::vector<uint32_t> Sizes; O.map("BufferSize", Out.BufferSize);
O.map("Sizes", Sizes); O.map("BatchParameterCount", Out.BatchParameterCount);
std::vector<libc_benchmarks::Duration> Runtimes; O.map("BenchmarkOptions", Out.BenchmarkOptions);
O.map("Runtimes", Runtimes);
if (Sizes.size() != Runtimes.size())
return createStringError(errc::io_error,
"Measurement Size and Runtime mistmatch");
Out.Measurements.resize(Sizes.size());
for (size_t I = 0; I < Sizes.size(); ++I) {
Out.Measurements[I].Size = Sizes[I];
Out.Measurements[I].Runtime = Runtimes[I];
}
return O.takeError(); return O.takeError();
} }
static Error fromJson(const json::Value &V, libc_benchmarks::Study &Out) { static Error fromJson(const json::Value &V, libc_benchmarks::Study &Out) {
JsonObjectMapper O(V); JsonObjectMapper O(V);
O.map("Host", Out.Host); O.map("StudyName", Out.StudyName);
O.map("Options", Out.Options); O.map("Runtime", Out.Runtime);
O.map("Configuration", Out.Configuration); O.map("Configuration", Out.Configuration);
O.map("Functions", Out.Functions); O.map("Measurements", Out.Measurements);
return O.takeError(); return O.takeError();
} }
static double Seconds(const Duration &D) { static double seconds(const Duration &D) {
return std::chrono::duration<double>(D).count(); return std::chrono::duration<double>(D).count();
} }
Expected<Study> ParseJsonStudy(StringRef Content) { Expected<Study> parseJsonStudy(StringRef Content) {
Expected<json::Value> EV = json::parse(Content); Expected<json::Value> EV = json::parse(Content);
if (!EV) if (!EV)
return EV.takeError(); return EV.takeError();
@ -265,7 +252,7 @@ Expected<Study> ParseJsonStudy(StringRef Content) {
return S; return S;
} }
static StringRef Serialize(const BenchmarkLog &L) { static StringRef serialize(const BenchmarkLog &L) {
switch (L) { switch (L) {
case BenchmarkLog::None: case BenchmarkLog::None:
return "None"; return "None";
@ -277,89 +264,63 @@ static StringRef Serialize(const BenchmarkLog &L) {
llvm_unreachable("Unhandled BenchmarkLog value"); llvm_unreachable("Unhandled BenchmarkLog value");
} }
static void Serialize(const BenchmarkOptions &BO, json::OStream &JOS) { static void serialize(const BenchmarkOptions &BO, json::OStream &JOS) {
JOS.object([&]() { JOS.attribute("MinDuration", seconds(BO.MinDuration));
JOS.attribute("MinDuration", Seconds(BO.MinDuration)); JOS.attribute("MaxDuration", seconds(BO.MaxDuration));
JOS.attribute("MaxDuration", Seconds(BO.MaxDuration)); JOS.attribute("InitialIterations", BO.InitialIterations);
JOS.attribute("InitialIterations", BO.InitialIterations); JOS.attribute("MaxIterations", BO.MaxIterations);
JOS.attribute("MaxIterations", BO.MaxIterations); JOS.attribute("MinSamples", BO.MinSamples);
JOS.attribute("MinSamples", BO.MinSamples); JOS.attribute("MaxSamples", BO.MaxSamples);
JOS.attribute("MaxSamples", BO.MaxSamples); JOS.attribute("Epsilon", BO.Epsilon);
JOS.attribute("Epsilon", BO.Epsilon); JOS.attribute("ScalingFactor", BO.ScalingFactor);
JOS.attribute("ScalingFactor", BO.ScalingFactor); JOS.attribute("Log", serialize(BO.Log));
JOS.attribute("Log", Serialize(BO.Log)); }
static void serialize(const CacheInfo &CI, json::OStream &JOS) {
JOS.attribute("Type", CI.Type);
JOS.attribute("Level", CI.Level);
JOS.attribute("Size", CI.Size);
JOS.attribute("NumSharing", CI.NumSharing);
}
static void serialize(const StudyConfiguration &SC, json::OStream &JOS) {
JOS.attribute("Function", SC.Function);
JOS.attribute("NumTrials", SC.NumTrials);
JOS.attribute("IsSweepMode", SC.IsSweepMode);
JOS.attribute("SweepModeMaxSize", SC.SweepModeMaxSize);
JOS.attribute("SizeDistributionName", SC.SizeDistributionName);
JOS.attribute("AccessAlignment",
static_cast<int64_t>(SC.AccessAlignment->value()));
JOS.attribute("MemcmpMismatchAt", SC.MemcmpMismatchAt);
}
static void serialize(const HostState &HS, json::OStream &JOS) {
JOS.attribute("CpuName", HS.CpuName);
JOS.attribute("CpuFrequency", HS.CpuFrequency);
JOS.attributeArray("Caches", [&]() {
for (const auto &CI : HS.Caches)
JOS.object([&]() { serialize(CI, JOS); });
}); });
} }
static void Serialize(const CacheInfo &CI, json::OStream &JOS) { static void serialize(const Runtime &RI, json::OStream &JOS) {
JOS.object([&]() { JOS.attributeObject("Host", [&]() { serialize(RI.Host, JOS); });
JOS.attribute("Type", CI.Type); JOS.attribute("BufferSize", RI.BufferSize);
JOS.attribute("Level", CI.Level); JOS.attribute("BatchParameterCount", RI.BatchParameterCount);
JOS.attribute("Size", CI.Size); JOS.attributeObject("BenchmarkOptions",
JOS.attribute("NumSharing", CI.NumSharing); [&]() { serialize(RI.BenchmarkOptions, JOS); });
});
} }
static void Serialize(const HostState &HS, json::OStream &JOS) { void serializeToJson(const Study &S, json::OStream &JOS) {
JOS.object([&]() { JOS.object([&]() {
JOS.attribute("CpuName", HS.CpuName); JOS.attribute("StudyName", S.StudyName);
JOS.attribute("CpuFrequency", HS.CpuFrequency); JOS.attributeObject("Runtime", [&]() { serialize(S.Runtime, JOS); });
JOS.attributeArray("Caches", [&]() { JOS.attributeObject("Configuration",
for (const auto &CI : HS.Caches) [&]() { serialize(S.Configuration, JOS); });
Serialize(CI, JOS); if (!S.Measurements.empty()) {
}); JOS.attributeArray("Measurements", [&]() {
}); for (const auto &M : S.Measurements)
} JOS.value(seconds(M));
static void Serialize(const StudyConfiguration &SC, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("Runs", SC.Runs);
JOS.attribute("BufferSize", SC.BufferSize);
JOS.attributeObject("Size", [&]() {
JOS.attribute("From", SC.Size.From);
JOS.attribute("To", SC.Size.To);
JOS.attribute("Step", SC.Size.Step);
});
if (SC.AddressAlignment)
JOS.attribute("AddressAlignment",
static_cast<int64_t>(SC.AddressAlignment->value()));
JOS.attribute("MemsetValue", SC.MemsetValue);
JOS.attribute("MemcmpMismatchAt", SC.MemcmpMismatchAt);
});
}
static void Serialize(const FunctionMeasurements &FM, json::OStream &JOS) {
JOS.object([&]() {
JOS.attribute("Name", FM.Name);
JOS.attributeArray("Sizes", [&]() {
for (const auto &M : FM.Measurements)
JOS.value(M.Size);
});
JOS.attributeArray("Runtimes", [&]() {
for (const auto &M : FM.Measurements)
JOS.value(Seconds(M.Runtime));
});
});
}
void SerializeToJson(const Study &S, json::OStream &JOS) {
JOS.object([&]() {
JOS.attributeBegin("Host");
Serialize(S.Host, JOS);
JOS.attributeEnd();
JOS.attributeBegin("Options");
Serialize(S.Options, JOS);
JOS.attributeEnd();
JOS.attributeBegin("Configuration");
Serialize(S.Configuration, JOS);
JOS.attributeEnd();
if (!S.Functions.empty()) {
JOS.attributeArray("Functions", [&]() {
for (const auto &FM : S.Functions)
Serialize(FM, JOS);
}); });
} }
}); });
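
For reference, a serialized Study now has roughly the following shape (field names follow serializeToJson above; the listing is abridged and all values are placeholders):

```json
{
  "StudyName": "new memcpy",
  "Runtime": {
    "Host": { "CpuName": "...", "CpuFrequency": 2000000000, "Caches": [] },
    "BufferSize": 13824,
    "BatchParameterCount": 3456,
    "BenchmarkOptions": { "MinDuration": 0.001, "MaxDuration": 1, "Log": "None" }
  },
  "Configuration": {
    "Function": "memcpy",
    "NumTrials": 5,
    "IsSweepMode": false,
    "SizeDistributionName": "memcpy Google A",
    "AccessAlignment": 1,
    "MemcmpMismatchAt": 0
  },
  "Measurements": [ 3.1e-09, 3.2e-09 ]
}
```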


@ -17,10 +17,10 @@ namespace llvm {
namespace libc_benchmarks { namespace libc_benchmarks {
// Parses a Study from a json string. // Parses a Study from a json string.
Expected<Study> ParseJsonStudy(StringRef Content); Expected<Study> parseJsonStudy(StringRef Content);
// Serialize a Study as json. // Serialize a Study as json.
void SerializeToJson(const Study &S, llvm::json::OStream &JOS); void serializeToJson(const Study &S, llvm::json::OStream &JOS);
} // namespace libc_benchmarks } // namespace libc_benchmarks
} // namespace llvm } // namespace llvm


@ -25,22 +25,23 @@ namespace {
Study getStudy() { Study getStudy() {
return Study{ return Study{
HostState{ "StudyName",
"CpuName", 123, {CacheInfo{"A", 1, 2, 3}, CacheInfo{"B", 4, 5, 6}}}, Runtime{HostState{"CpuName",
BenchmarkOptions{std::chrono::seconds(1), std::chrono::seconds(2), 10, 123,
100, 6, 100, 0.1, 2, BenchmarkLog::Full}, {CacheInfo{"A", 1, 2, 3}, CacheInfo{"B", 4, 5, 6}}},
StudyConfiguration{2, 3, SizeRange{4, 5, 6}, Align(8), 9, 10}, 456, 789,
{FunctionMeasurements{"A", BenchmarkOptions{std::chrono::seconds(1), std::chrono::seconds(2),
{Measurement{3, std::chrono::seconds(3)}, 10, 100, 6, 100, 0.1, 2, BenchmarkLog::Full}},
Measurement{3, std::chrono::seconds(4)}}}, StudyConfiguration{std::string("Function"), 30U, false, 32U,
FunctionMeasurements{"B", {}}}}; std::string("Distribution"), Align(16), 3U},
{std::chrono::seconds(3), std::chrono::seconds(4)}};
} }
static std::string SerializeToString(const Study &S) { static std::string serializeToString(const Study &S) {
std::string Buffer; std::string Buffer;
raw_string_ostream RSO(Buffer); raw_string_ostream RSO(Buffer);
json::OStream JOS(RSO); json::OStream JOS(RSO);
SerializeToJson(S, JOS); serializeToJson(S, JOS);
return Buffer; return Buffer;
} }
@ -54,14 +55,25 @@ MATCHER(EqualsCacheInfo, "") {
A, result_listener); A, result_listener);
} }
auto Equals(const HostState &H) -> auto { auto equals(const HostState &H) -> auto {
return AllOf( return AllOf(
Field(&HostState::CpuName, H.CpuName), Field(&HostState::CpuName, H.CpuName),
Field(&HostState::CpuFrequency, H.CpuFrequency), Field(&HostState::CpuFrequency, H.CpuFrequency),
Field(&HostState::Caches, Pointwise(EqualsCacheInfo(), H.Caches))); Field(&HostState::Caches, Pointwise(EqualsCacheInfo(), H.Caches)));
} }
auto Equals(const BenchmarkOptions &BO) -> auto { auto equals(const StudyConfiguration &SC) -> auto {
return AllOf(
Field(&StudyConfiguration::Function, SC.Function),
Field(&StudyConfiguration::NumTrials, SC.NumTrials),
Field(&StudyConfiguration::IsSweepMode, SC.IsSweepMode),
Field(&StudyConfiguration::SweepModeMaxSize, SC.SweepModeMaxSize),
Field(&StudyConfiguration::SizeDistributionName, SC.SizeDistributionName),
Field(&StudyConfiguration::AccessAlignment, SC.AccessAlignment),
Field(&StudyConfiguration::MemcmpMismatchAt, SC.MemcmpMismatchAt));
}
auto equals(const BenchmarkOptions &BO) -> auto {
return AllOf( return AllOf(
Field(&BenchmarkOptions::MinDuration, BO.MinDuration), Field(&BenchmarkOptions::MinDuration, BO.MinDuration),
Field(&BenchmarkOptions::MaxDuration, BO.MaxDuration), Field(&BenchmarkOptions::MaxDuration, BO.MaxDuration),
@ -74,58 +86,33 @@ auto Equals(const BenchmarkOptions &BO) -> auto {
Field(&BenchmarkOptions::Log, BO.Log)); Field(&BenchmarkOptions::Log, BO.Log));
} }
auto Equals(const SizeRange &SR) -> auto { auto equals(const Runtime &RI) -> auto {
return AllOf(Field(&SizeRange::From, SR.From), Field(&SizeRange::To, SR.To), return AllOf(Field(&Runtime::Host, equals(RI.Host)),
Field(&SizeRange::Step, SR.Step)); Field(&Runtime::BufferSize, RI.BufferSize),
Field(&Runtime::BatchParameterCount, RI.BatchParameterCount),
Field(&Runtime::BenchmarkOptions, equals(RI.BenchmarkOptions)));
} }
auto Equals(const StudyConfiguration &SC) -> auto { auto equals(const Study &S) -> auto {
return AllOf( return AllOf(Field(&Study::StudyName, S.StudyName),
Field(&StudyConfiguration::Runs, SC.Runs), Field(&Study::Runtime, equals(S.Runtime)),
Field(&StudyConfiguration::BufferSize, SC.BufferSize), Field(&Study::Configuration, equals(S.Configuration)),
Field(&StudyConfiguration::Size, Equals(SC.Size)), Field(&Study::Measurements, S.Measurements));
Field(&StudyConfiguration::AddressAlignment, SC.AddressAlignment),
Field(&StudyConfiguration::MemsetValue, SC.MemsetValue),
Field(&StudyConfiguration::MemcmpMismatchAt, SC.MemcmpMismatchAt));
}
MATCHER(EqualsMeasurement, "") {
const Measurement &A = ::testing::get<0>(arg);
const Measurement &B = ::testing::get<1>(arg);
return ExplainMatchResult(AllOf(Field(&Measurement::Size, B.Size),
Field(&Measurement::Runtime, B.Runtime)),
A, result_listener);
}
MATCHER(EqualsFunctions, "") {
const FunctionMeasurements &A = ::testing::get<0>(arg);
const FunctionMeasurements &B = ::testing::get<1>(arg);
return ExplainMatchResult(
AllOf(Field(&FunctionMeasurements::Name, B.Name),
Field(&FunctionMeasurements::Measurements,
Pointwise(EqualsMeasurement(), B.Measurements))),
A, result_listener);
}
auto Equals(const Study &S) -> auto {
return AllOf(
Field(&Study::Host, Equals(S.Host)),
Field(&Study::Options, Equals(S.Options)),
Field(&Study::Configuration, Equals(S.Configuration)),
Field(&Study::Functions, Pointwise(EqualsFunctions(), S.Functions)));
} }
TEST(JsonTest, RoundTrip) { TEST(JsonTest, RoundTrip) {
const Study S = getStudy(); const Study S = getStudy();
auto StudyOrError = ParseJsonStudy(SerializeToString(S)); const auto Serialized = serializeToString(S);
auto StudyOrError = parseJsonStudy(Serialized);
if (auto Err = StudyOrError.takeError()) if (auto Err = StudyOrError.takeError())
EXPECT_FALSE(Err) << "Unexpected error"; EXPECT_FALSE(Err) << "Unexpected error : " << Err << "\n" << Serialized;
const Study &Parsed = *StudyOrError; const Study &Parsed = *StudyOrError;
EXPECT_THAT(Parsed, Equals(S)); EXPECT_THAT(Parsed, equals(S)) << Serialized << "\n"
<< serializeToString(Parsed);
} }
TEST(JsonTest, SupplementaryField) { TEST(JsonTest, SupplementaryField) {
auto Failure = ParseJsonStudy(R"({ auto Failure = parseJsonStudy(R"({
"UnknownField": 10 "UnknownField": 10
} }
)"); )");
@ -133,17 +120,19 @@ TEST(JsonTest, SupplementaryField) {
} }
TEST(JsonTest, InvalidType) { TEST(JsonTest, InvalidType) {
auto Failure = ParseJsonStudy(R"({ auto Failure = parseJsonStudy(R"({
"Options": 1 "Runtime": 1
} }
)"); )");
EXPECT_EQ(toString(Failure.takeError()), "Expected JSON Object"); EXPECT_EQ(toString(Failure.takeError()), "Expected JSON Object");
} }
TEST(JsonTest, InvalidDuration) { TEST(JsonTest, InvalidDuration) {
auto Failure = ParseJsonStudy(R"({ auto Failure = parseJsonStudy(R"({
"Options": { "Runtime": {
"MinDuration": "Duration should be a Number" "BenchmarkOptions": {
"MinDuration": "Duration should be a Number"
}
} }
} }
)"); )");
@ -151,9 +140,9 @@ TEST(JsonTest, InvalidDuration) {
} }
TEST(JsonTest, InvalidAlignType) { TEST(JsonTest, InvalidAlignType) {
auto Failure = ParseJsonStudy(R"({ auto Failure = parseJsonStudy(R"({
"Configuration":{ "Configuration": {
"AddressAlignment": "Align should be an Integer" "AccessAlignment": "Align should be an Integer"
} }
} }
)"); )");
@ -161,9 +150,9 @@ TEST(JsonTest, InvalidAlignType) {
} }
TEST(JsonTest, InvalidAlign) { TEST(JsonTest, InvalidAlign) {
auto Failure = ParseJsonStudy(R"({ auto Failure = parseJsonStudy(R"({
"Configuration":{ "Configuration": {
"AddressAlignment":3 "AccessAlignment": 3
} }
} }
)"); )");
@ -172,9 +161,11 @@ TEST(JsonTest, InvalidAlign) {
} }
TEST(JsonTest, InvalidBenchmarkLogType) { TEST(JsonTest, InvalidBenchmarkLogType) {
auto Failure = ParseJsonStudy(R"({ auto Failure = parseJsonStudy(R"({
"Options":{ "Runtime": {
"Log": 3 "BenchmarkOptions":{
"Log": 3
}
} }
} }
)"); )");
@ -183,9 +174,11 @@ TEST(JsonTest, InvalidBenchmarkLogType) {
} }
TEST(JsonTest, InvalidBenchmarkLog) { TEST(JsonTest, InvalidBenchmarkLog) {
auto Failure = ParseJsonStudy(R"({ auto Failure = parseJsonStudy(R"({
"Options":{ "Runtime": {
"Log": "Unknown" "BenchmarkOptions":{
"Log": "Unknown"
}
} }
} }
)"); )");


@ -41,10 +41,6 @@
namespace llvm { namespace llvm {
namespace libc_benchmarks { namespace libc_benchmarks {
// Makes sure the binary was compiled in release mode and that frequency
// governor is set on performance.
void checkRequirements();
using Duration = std::chrono::duration<double>; using Duration = std::chrono::duration<double>;
enum class BenchmarkLog { enum class BenchmarkLog {
@ -318,6 +314,10 @@ CircularArrayRef<T> cycle(const std::array<T, N> &Container, size_t Size) {
return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size}; return {llvm::ArrayRef<T>(Container.cbegin(), Container.cend()), Size};
} }
// Makes sure the binary was compiled in release mode and that frequency
// governor is set on performance.
void checkRequirements();
} // namespace libc_benchmarks } // namespace libc_benchmarks
} // namespace llvm } // namespace llvm


@ -20,40 +20,42 @@ namespace libc_benchmarks {
// When alignment is set, the distribution is scaled down by `Factor` and scaled // When alignment is set, the distribution is scaled down by `Factor` and scaled
// up again by the same amount during sampling. // up again by the same amount during sampling.
static std::uniform_int_distribution<uint32_t> static std::uniform_int_distribution<uint32_t>
GetOffsetDistribution(const StudyConfiguration &Conf) { getOffsetDistribution(size_t BufferSize, size_t MaxSizeValue,
if (Conf.AddressAlignment && MaybeAlign AccessAlignment) {
*Conf.AddressAlignment > AlignedBuffer::Alignment) if (AccessAlignment && *AccessAlignment > AlignedBuffer::Alignment)
report_fatal_error( report_fatal_error(
"AddressAlignment must be less or equal to AlignedBuffer::Alignment"); "AccessAlignment must be less or equal to AlignedBuffer::Alignment");
if (!Conf.AddressAlignment) if (!AccessAlignment)
return std::uniform_int_distribution<uint32_t>(0, 0); // Always 0. return std::uniform_int_distribution<uint32_t>(0, 0); // Always 0.
// If we test up to Size bytes, the returned offset must stay under // If we test up to Size bytes, the returned offset must stay under
// BuffersSize - Size. // BuffersSize - Size.
int64_t MaxOffset = Conf.BufferSize; int64_t MaxOffset = BufferSize;
MaxOffset -= Conf.Size.To; MaxOffset -= MaxSizeValue;
MaxOffset -= 1; MaxOffset -= 1;
if (MaxOffset < 0) if (MaxOffset < 0)
report_fatal_error( report_fatal_error(
"BufferSize too small to exercise specified Size configuration"); "BufferSize too small to exercise specified Size configuration");
MaxOffset /= Conf.AddressAlignment->value(); MaxOffset /= AccessAlignment->value();
return std::uniform_int_distribution<uint32_t>(0, MaxOffset); return std::uniform_int_distribution<uint32_t>(0, MaxOffset);
} }
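// Worked example (illustrative values, not from the source): with
// BufferSize = 8192, MaxSizeValue = 16 and AccessAlignment = Align(16),
// MaxOffset = (8192 - 16 - 1) / 16 = 510, so the distribution yields a value
// in [0, 510] that the sampler scales back up by Factor = 16, producing
// 16-byte aligned offsets in [0, 8160].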
OffsetDistribution::OffsetDistribution(const StudyConfiguration &Conf) OffsetDistribution::OffsetDistribution(size_t BufferSize, size_t MaxSizeValue,
: Distribution(GetOffsetDistribution(Conf)), MaybeAlign AccessAlignment)
Factor(Conf.AddressAlignment.valueOrOne().value()) {} : Distribution(
getOffsetDistribution(BufferSize, MaxSizeValue, AccessAlignment)),
Factor(AccessAlignment.valueOrOne().value()) {}
// Precomputes offset where to insert mismatches between the two buffers. // Precomputes offset where to insert mismatches between the two buffers.
MismatchOffsetDistribution::MismatchOffsetDistribution( MismatchOffsetDistribution::MismatchOffsetDistribution(size_t BufferSize,
const StudyConfiguration &Conf) size_t MaxSizeValue,
: MismatchAt(Conf.MemcmpMismatchAt) { size_t MismatchAt)
: MismatchAt(MismatchAt) {
if (MismatchAt <= 1) if (MismatchAt <= 1)
return; return;
const auto ToSize = Conf.Size.To; for (size_t I = MaxSizeValue + 1; I < BufferSize; I += MaxSizeValue)
for (size_t I = ToSize + 1; I < Conf.BufferSize; I += ToSize)
MismatchIndices.push_back(I); MismatchIndices.push_back(I);
if (MismatchIndices.empty()) if (MismatchIndices.empty())
llvm::report_fatal_error("Unable to generate mismatch"); report_fatal_error("Unable to generate mismatch");
MismatchIndexSelector = MismatchIndexSelector =
std::uniform_int_distribution<size_t>(0, MismatchIndices.size() - 1); std::uniform_int_distribution<size_t>(0, MismatchIndices.size() - 1);
} }


@ -13,7 +13,6 @@
#define LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H #define LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_H
#include "LibcBenchmark.h" #include "LibcBenchmark.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Alignment.h" #include "llvm/Support/Alignment.h"
#include <cstdint> #include <cstdint>
@ -26,66 +25,79 @@ namespace libc_benchmarks {
// Configuration // Configuration
//-------------- //--------------
// Specifies a range of sizes to explore. struct StudyConfiguration {
struct SizeRange { // One of 'memcpy', 'memset', 'memcmp'.
uint32_t From = 0; // Inclusive // The underlying implementation is always the llvm libc one.
uint32_t To = 1024; // Inclusive // e.g. 'memcpy' will test '__llvm_libc::memcpy'
uint32_t Step = 1; std::string Function;
// The number of trials to run for this benchmark.
// If in SweepMode, each individual size is measured 'NumTrials' times,
// i.e. 'NumTrials' measurements for 0, 'NumTrials' measurements for 1, ...
uint32_t NumTrials = 1;
// Toggles between Sweep Mode and Distribution Mode (default).
// See 'SweepModeMaxSize' and 'SizeDistributionName' below.
bool IsSweepMode = false;
// Maximum size to use when measuring a ramp of size values (SweepMode).
// The benchmark measures all sizes from 0 to SweepModeMaxSize.
// Note: in sweep mode the same size is sampled several times in a row; this
// allows the processor to learn it and optimize the branching pattern.
// The resulting measurement is likely to be idealized.
uint32_t SweepModeMaxSize = 0; // inclusive
// The name of the distribution to be used to randomize the size parameter.
// This is used when SweepMode is false (default).
std::string SizeDistributionName;
// This parameter controls how the buffers are accessed during the
// benchmark:
// None : Use a fixed address that is at least cache line aligned,
// 1 : Use random address,
// >1 : Use random address aligned to value.
MaybeAlign AccessAlignment = None;
// When Function == 'memcmp', this is the buffers mismatch position.
// 0 : Buffers always compare equal,
// >0 : Buffers compare different at byte N-1.
uint32_t MemcmpMismatchAt = 0;
}; };
// An object to define how to test a memory function. struct Runtime {
struct StudyConfiguration { // Details about the Host (cpu name, cpu frequency, cache hierarchy).
// The number of run for the study. HostState Host;
uint32_t Runs = 1;
// The size of the buffers (1 buffer for memset but 2 for memcpy or memcmp). // The framework will populate this value so all data accessed during the
// When testing small sizes, it's important to keep the total allocated // benchmark will stay in L1 data cache. This includes bookkeeping data.
// size under the size of the L1 cache (usually 16 or 32KiB). The framework uint32_t BufferSize = 0;
// will also use 2KiB of additional L1 memory to store the function
// parameters.
uint32_t BufferSize = 8192;
// The range of sizes to exercise. // This is the number of distinct parameters used in a single batch.
SizeRange Size; // The framework always tests a batch of randomized parameter to prevent the
// cpu from learning branching patterns.
uint32_t BatchParameterCount = 0;
MaybeAlign AddressAlignment; // Unset : Use start of buffer which is at // The benchmark options that were used to perform the measurement.
// least cache line aligned) // This is decided by the framework.
// 1 : Use random address, BenchmarkOptions BenchmarkOptions;
// >1 : Use random address aligned to value.
// The value to use for memset.
uint8_t MemsetValue = 0;
// The mismatch position for memcmp.
uint32_t MemcmpMismatchAt = 0; // 0 : Buffer compare equal,
// >0 : Buffer compare different at byte N-1.
}; };
//-------- //--------
// Results // Results
//-------- //--------
// The time to run one iteration of the function under test for the specified
// Size.
struct Measurement {
uint32_t Size = 0;
Duration Runtime = {};
};
// The measurements for a specific function.
struct FunctionMeasurements {
std::string Name;
std::vector<Measurement> Measurements;
};
// The root object containing all the data (configuration and measurements). // The root object containing all the data (configuration and measurements).
struct Study { struct Study {
HostState Host; std::string StudyName;
BenchmarkOptions Options; Runtime Runtime;
StudyConfiguration Configuration; StudyConfiguration Configuration;
SmallVector<FunctionMeasurements, 4> Functions; std::vector<Duration> Measurements;
}; };
//------
// Utils
//------
// Provides an aligned, dynamically allocated buffer. // Provides an aligned, dynamically allocated buffer.
class AlignedBuffer { class AlignedBuffer {
char *const Buffer = nullptr; char *const Buffer = nullptr;
@ -95,7 +107,8 @@ public:
static constexpr size_t Alignment = 1024; static constexpr size_t Alignment = 1024;
explicit AlignedBuffer(size_t Size) explicit AlignedBuffer(size_t Size)
: Buffer(static_cast<char *>(aligned_alloc(1024, Size))), Size(Size) {} : Buffer(static_cast<char *>(aligned_alloc(Alignment, Size))),
Size(Size) {}
~AlignedBuffer() { free(Buffer); } ~AlignedBuffer() { free(Buffer); }
inline char *operator+(size_t Index) { return Buffer + Index; } inline char *operator+(size_t Index) { return Buffer + Index; }
@ -106,36 +119,6 @@ public:
inline char *end() { return Buffer + Size; } inline char *end() { return Buffer + Size; }
}; };
// Implements the ParameterProvider abstraction needed by the `benchmark`
// function. This implementation makes sure that all parameters will fit into
// `StorageSize` bytes. The total memory accessed during benchmark should be
// less than the data L1 cache, that is the storage for the ParameterProvider
// and the memory buffers.
template <typename Context, size_t StorageSize = 8 * 1024>
class SmallParameterProvider {
using ParameterType = typename Context::ParameterType;
ByteConstrainedArray<ParameterType, StorageSize> Parameters;
size_t LastIterations;
Context &Ctx;
public:
explicit SmallParameterProvider(Context &C) : Ctx(C) {}
SmallParameterProvider(const SmallParameterProvider &) = delete;
SmallParameterProvider &operator=(const SmallParameterProvider &) = delete;
// Useful to compute the histogram of the size parameter.
CircularArrayRef<ParameterType> getLastBatch() const {
return cycle(Parameters, LastIterations);
}
// Implements the interface needed by the `benchmark` function.
CircularArrayRef<ParameterType> generateBatch(size_t Iterations) {
LastIterations = Iterations;
Ctx.Randomize(Parameters);
return getLastBatch();
}
};
// Helper to generate random buffer offsets that satisfy the configuration // Helper to generate random buffer offsets that satisfy the configuration
// constraints. // constraints.
class OffsetDistribution { class OffsetDistribution {
@ -143,7 +126,8 @@ class OffsetDistribution {
uint32_t Factor; uint32_t Factor;
public: public:
explicit OffsetDistribution(const StudyConfiguration &Conf); explicit OffsetDistribution(size_t BufferSize, size_t MaxSizeValue,
MaybeAlign AccessAlignment);
template <class Generator> uint32_t operator()(Generator &G) { template <class Generator> uint32_t operator()(Generator &G) {
return Distribution(G) * Factor; return Distribution(G) * Factor;
@ -159,7 +143,8 @@ class MismatchOffsetDistribution {
const uint32_t MismatchAt; const uint32_t MismatchAt;
public: public:
explicit MismatchOffsetDistribution(const StudyConfiguration &Conf); explicit MismatchOffsetDistribution(size_t BufferSize, size_t MaxSizeValue,
size_t MismatchAt);
explicit operator bool() const { return !MismatchIndices.empty(); } explicit operator bool() const { return !MismatchIndices.empty(); }


@ -6,10 +6,10 @@
// //
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
#include "LibcMemoryBenchmarkMain.h"
#include "JSON.h" #include "JSON.h"
#include "LibcBenchmark.h" #include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h" #include "LibcMemoryBenchmark.h"
#include "MemorySizeDistributions.h"
#include "llvm/Support/CommandLine.h" #include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h" #include "llvm/Support/FileSystem.h"
@ -17,70 +17,310 @@
#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h" #include "llvm/Support/raw_ostream.h"
#include <string> namespace __llvm_libc {
extern void *memcpy(void *__restrict, const void *__restrict, size_t);
extern void *memset(void *, int, size_t);
} // namespace __llvm_libc
namespace llvm { namespace llvm {
namespace libc_benchmarks { namespace libc_benchmarks {
static cl::opt<std::string> enum Function { memcpy, memset };
Configuration("conf", cl::desc("Specify configuration filename"),
cl::value_desc("filename"), cl::init(""));
static cl::opt<std::string> Output("o", cl::desc("Specify output filename"), static cl::opt<std::string>
StudyName("study-name", cl::desc("The name for this study"), cl::Required);
static cl::opt<Function>
MemoryFunction("function", cl::desc("Sets the function to benchmark:"),
cl::values(clEnumVal(memcpy, "__llvm_libc::memcpy"),
clEnumVal(memset, "__llvm_libc::memset")),
cl::Required);
static cl::opt<std::string>
SizeDistributionName("size-distribution-name",
cl::desc("The name of the distribution to use"));
static cl::opt<bool>
SweepMode("sweep-mode",
cl::desc("If set, benchmark all sizes from 0 to sweep-max-size"));
static cl::opt<uint32_t>
SweepMaxSize("sweep-max-size",
cl::desc("The maximum size to use in sweep-mode"),
cl::init(256));
static cl::opt<uint32_t>
AlignedAccess("aligned-access",
cl::desc("The alignment to use when accessing the buffers\n"
"Default is unaligned\n"
"Use 0 to disable address randomization"),
cl::init(1));
static cl::opt<std::string> Output("output",
cl::desc("Specify output filename"),
cl::value_desc("filename"), cl::init("-")); cl::value_desc("filename"), cl::init("-"));
extern std::unique_ptr<BenchmarkRunner> static cl::opt<uint32_t>
getRunner(const StudyConfiguration &Conf); NumTrials("num-trials", cl::desc("The number of benchmarks run to perform"),
cl::init(1));
void Main() { static constexpr int64_t KiB = 1024;
#ifndef NDEBUG static constexpr int64_t ParameterStorageBytes = 4 * KiB;
static_assert( static constexpr int64_t L1LeftAsideBytes = 1 * KiB;
false,
"For reproducibility benchmarks should not be compiled in DEBUG mode.");
#endif
checkRequirements();
ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
MemoryBuffer::getFileOrSTDIN(Configuration);
if (!MB)
report_fatal_error(
Twine("Could not open configuration file: ").concat(Configuration));
auto ErrorOrStudy = ParseJsonStudy((*MB)->getBuffer());
if (!ErrorOrStudy)
report_fatal_error(ErrorOrStudy.takeError());
const auto StudyPrototype = *ErrorOrStudy; struct ParameterType {
unsigned OffsetBytes : 16; // max : 16 KiB - 1
unsigned SizeBytes : 16; // max : 16 KiB - 1
};
Study S; struct MemcpyBenchmark {
S.Host = HostState::get(); static constexpr auto GetDistributions = &getMemcpySizeDistributions;
S.Options = StudyPrototype.Options; static constexpr size_t BufferCount = 2;
S.Configuration = StudyPrototype.Configuration; static void amend(Study &S) { S.Configuration.Function = "memcpy"; }
const auto Runs = S.Configuration.Runs; MemcpyBenchmark(const size_t BufferSize)
const auto &SR = S.Configuration.Size; : SrcBuffer(BufferSize), DstBuffer(BufferSize) {}
std::unique_ptr<BenchmarkRunner> Runner = getRunner(S.Configuration);
const size_t TotalSteps = inline auto functor() {
Runner->getFunctionNames().size() * Runs * ((SR.To - SR.From) / SR.Step); return [this](ParameterType P) {
size_t Steps = 0; __llvm_libc::memcpy(DstBuffer + P.OffsetBytes, SrcBuffer + P.OffsetBytes,
for (auto FunctionName : Runner->getFunctionNames()) { P.SizeBytes);
FunctionMeasurements FM; return DstBuffer + P.OffsetBytes;
FM.Name = std::string(FunctionName); };
for (size_t Run = 0; Run < Runs; ++Run) {
for (uint32_t Size = SR.From; Size <= SR.To; Size += SR.Step) {
const auto Result = Runner->benchmark(S.Options, FunctionName, Size);
Measurement Measurement;
Measurement.Runtime = Result.BestGuess;
Measurement.Size = Size;
FM.Measurements.push_back(Measurement);
outs() << format("%3d%% run: %2d / %2d size: %5d ",
(Steps * 100 / TotalSteps), Run, Runs, Size)
<< FunctionName
<< " \r";
++Steps;
}
}
S.Functions.push_back(std::move(FM));
} }
AlignedBuffer SrcBuffer;
AlignedBuffer DstBuffer;
};
struct MemsetBenchmark {
static constexpr auto GetDistributions = &getMemsetSizeDistributions;
static constexpr size_t BufferCount = 1;
static void amend(Study &S) { S.Configuration.Function = "memset"; }
MemsetBenchmark(const size_t BufferSize) : DstBuffer(BufferSize) {}
inline auto functor() {
return [this](ParameterType P) {
__llvm_libc::memset(DstBuffer + P.OffsetBytes, P.OffsetBytes & 0xFF,
P.SizeBytes);
return DstBuffer + P.OffsetBytes;
};
}
AlignedBuffer DstBuffer;
};
template <typename Benchmark> struct Harness : Benchmark {
using Benchmark::functor;
Harness(const size_t BufferSize, size_t BatchParameterCount,
std::function<unsigned()> SizeSampler,
std::function<unsigned()> OffsetSampler)
: Benchmark(BufferSize), BufferSize(BufferSize),
BatchParameterCount(BatchParameterCount),
Parameters(BatchParameterCount), SizeSampler(SizeSampler),
OffsetSampler(OffsetSampler) {}
CircularArrayRef<ParameterType> generateBatch(size_t Iterations) {
for (auto &P : Parameters) {
P.OffsetBytes = OffsetSampler();
P.SizeBytes = SizeSampler();
if (P.OffsetBytes + P.SizeBytes >= BufferSize)
report_fatal_error("Call would result in buffer overflow");
}
return cycle(makeArrayRef(Parameters), Iterations);
}
private:
const size_t BufferSize;
const size_t BatchParameterCount;
std::vector<ParameterType> Parameters;
std::function<unsigned()> SizeSampler;
std::function<unsigned()> OffsetSampler;
};
struct IBenchmark {
virtual ~IBenchmark() {}
virtual Study run() = 0;
};
size_t getL1DataCacheSize() {
const std::vector<CacheInfo> &CacheInfos = HostState::get().Caches;
const auto IsL1DataCache = [](const CacheInfo &CI) {
return CI.Type == "Data" && CI.Level == 1;
};
const auto CacheIt = find_if(CacheInfos, IsL1DataCache);
if (CacheIt != CacheInfos.end())
return CacheIt->Size;
report_fatal_error("Unable to read L1 Cache Data Size");
}
template <typename Benchmark> struct MemfunctionBenchmark : IBenchmark {
MemfunctionBenchmark(int64_t L1Size = getL1DataCacheSize())
: AvailableSize(L1Size - L1LeftAsideBytes - ParameterStorageBytes),
BufferSize(AvailableSize / Benchmark::BufferCount),
BatchParameterCount(BufferSize / sizeof(ParameterType)) {
// Handling command line flags
if (AvailableSize <= 0 || BufferSize <= 0 || BatchParameterCount < 100)
report_fatal_error("Not enough L1 cache");
if (!isPowerOfTwoOrZero(AlignedAccess))
report_fatal_error(AlignedAccess.ArgStr +
Twine(" must be a power of two or zero"));
const bool HasDistributionName = !SizeDistributionName.empty();
if (SweepMode && HasDistributionName)
report_fatal_error("Select only one of `--" + Twine(SweepMode.ArgStr) +
"` or `--" + Twine(SizeDistributionName.ArgStr) + "`");
if (SweepMode) {
MaxSizeValue = SweepMaxSize;
} else {
std::map<StringRef, MemorySizeDistribution> Map;
for (MemorySizeDistribution Distribution : Benchmark::GetDistributions())
Map[Distribution.Name] = Distribution;
if (Map.count(SizeDistributionName) == 0) {
std::string Message;
raw_string_ostream Stream(Message);
Stream << "Unknown --" << SizeDistributionName.ArgStr << "='"
<< SizeDistributionName << "', available distributions:\n";
for (const auto &Pair : Map)
Stream << "'" << Pair.first << "'\n";
report_fatal_error(Stream.str());
}
SizeDistribution = Map[SizeDistributionName];
MaxSizeValue = SizeDistribution.Probabilities.size() - 1;
}
// Setup study.
Study.StudyName = StudyName;
Runtime &RI = Study.Runtime;
RI.Host = HostState::get();
RI.BufferSize = BufferSize;
RI.BatchParameterCount = BatchParameterCount;
BenchmarkOptions &BO = RI.BenchmarkOptions;
BO.MinDuration = std::chrono::milliseconds(1);
BO.MaxDuration = std::chrono::seconds(1);
BO.MaxIterations = 10'000'000U;
BO.MinSamples = 4;
BO.MaxSamples = 1000;
BO.Epsilon = 0.01; // 1%
BO.ScalingFactor = 1.4;
StudyConfiguration &SC = Study.Configuration;
SC.NumTrials = NumTrials;
SC.IsSweepMode = SweepMode;
if (SweepMode)
SC.SweepModeMaxSize = SweepMaxSize;
else
SC.SizeDistributionName = SizeDistributionName;
SC.AccessAlignment = MaybeAlign(AlignedAccess);
// Delegate specific flags and configuration.
Benchmark::amend(Study);
}
Study run() override {
if (SweepMode)
runSweepMode();
else
runDistributionMode();
return Study;
}
private:
const int64_t AvailableSize;
const int64_t BufferSize;
const size_t BatchParameterCount;
size_t MaxSizeValue = 0;
MemorySizeDistribution SizeDistribution;
Study Study;
std::mt19937_64 Gen;
static constexpr bool isPowerOfTwoOrZero(size_t Value) {
return (Value & (Value - 1U)) == 0;
}
std::function<unsigned()> geOffsetSampler() {
return [this]() {
static OffsetDistribution OD(BufferSize, MaxSizeValue,
Study.Configuration.AccessAlignment);
return OD(Gen);
};
}
std::function<unsigned()> getSizeSampler() {
return [this]() {
static std::discrete_distribution<unsigned> Distribution(
SizeDistribution.Probabilities.begin(),
SizeDistribution.Probabilities.end());
return Distribution(Gen);
};
}
void reportProgress(BenchmarkStatus BS) {
const size_t TotalSteps = Study.Measurements.capacity();
const size_t Steps = Study.Measurements.size();
const size_t Percent = 100 * Steps / TotalSteps;
size_t I = 0;
errs() << '[';
for (; I <= Percent; ++I)
errs() << '#';
for (; I <= 100; ++I)
errs() << '_';
errs() << "] " << Percent << "%\r";
}
void runTrials(const BenchmarkOptions &Options,
std::function<unsigned()> SizeSampler,
std::function<unsigned()> OffsetSampler) {
Harness<Benchmark> B(BufferSize, BatchParameterCount, SizeSampler,
OffsetSampler);
for (size_t i = 0; i < NumTrials; ++i) {
const BenchmarkResult Result = benchmark(Options, B, B.functor());
Study.Measurements.push_back(Result.BestGuess);
reportProgress(Result.TerminationStatus);
}
}
void runSweepMode() {
Study.Measurements.reserve(NumTrials * SweepMaxSize);
BenchmarkOptions &BO = Study.Runtime.BenchmarkOptions;
BO.MinDuration = std::chrono::milliseconds(1);
BO.InitialIterations = 100;
for (size_t Size = 0; Size <= SweepMaxSize; ++Size) {
const auto SizeSampler = [Size]() { return Size; };
runTrials(BO, SizeSampler, geOffsetSampler());
}
}
void runDistributionMode() {
Study.Measurements.reserve(NumTrials);
BenchmarkOptions &BO = Study.Runtime.BenchmarkOptions;
BO.MinDuration = std::chrono::milliseconds(10);
BO.InitialIterations = BatchParameterCount * 10;
runTrials(BO, getSizeSampler(), geOffsetSampler());
}
};
std::unique_ptr<IBenchmark> getMemfunctionBenchmark() {
switch (MemoryFunction) {
case memcpy:
return std::make_unique<MemfunctionBenchmark<MemcpyBenchmark>>();
case memset:
return std::make_unique<MemfunctionBenchmark<MemsetBenchmark>>();
}
}
void writeStudy(const Study &S) {
std::error_code EC; std::error_code EC;
raw_fd_ostream FOS(Output, EC); raw_fd_ostream FOS(Output, EC);
if (EC) if (EC)
@ -89,7 +329,13 @@ void Main() {
.concat(", ") .concat(", ")
.concat(Output)); .concat(Output));
json::OStream JOS(FOS); json::OStream JOS(FOS);
SerializeToJson(S, JOS); serializeToJson(S, JOS);
}
void main() {
checkRequirements();
auto MB = getMemfunctionBenchmark();
writeStudy(MB->run());
} }
} // namespace libc_benchmarks } // namespace libc_benchmarks
@ -97,6 +343,11 @@ void Main() {
int main(int argc, char **argv) { int main(int argc, char **argv) {
llvm::cl::ParseCommandLineOptions(argc, argv); llvm::cl::ParseCommandLineOptions(argc, argv);
llvm::libc_benchmarks::Main(); #ifndef NDEBUG
static_assert(
false,
"For reproducibility benchmarks should not be compiled in DEBUG mode.");
#endif
llvm::libc_benchmarks::main();
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }


@ -1,36 +0,0 @@
//===-- BenchmarkRunner interface -------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H
#define LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H
#include "LibcBenchmark.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
namespace llvm {
namespace libc_benchmarks {
// Each memory function benchmark implements this interface.
// It is used by the main function to run all benchmarks in a uniform manner.
class BenchmarkRunner {
public:
virtual ~BenchmarkRunner() {}
// Returns a list of all available functions to test.
virtual ArrayRef<StringRef> getFunctionNames() const = 0;
// Performs the benchmarking for a particular FunctionName and Size.
virtual BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) = 0;
};
} // namespace libc_benchmarks
} // namespace llvm
#endif // LLVM_LIBC_UTILS_BENCHMARK_MEMORY_BENCHMARK_MAIN_H


@ -34,22 +34,16 @@ TEST(AlignedBuffer, Empty) {
} }
TEST(OffsetDistribution, AlignToBegin) { TEST(OffsetDistribution, AlignToBegin) {
StudyConfiguration Conf; const size_t BufferSize = 8192;
Conf.BufferSize = 8192; OffsetDistribution OD(BufferSize, 1024, None);
Conf.AddressAlignment = None;
OffsetDistribution OD(Conf);
std::default_random_engine Gen; std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I) for (size_t I = 0; I <= 10; ++I)
EXPECT_EQ(OD(Gen), 0U); EXPECT_EQ(OD(Gen), 0U);
} }
TEST(OffsetDistribution, NoAlignment) { TEST(OffsetDistribution, NoAlignment) {
StudyConfiguration Conf; const size_t BufferSize = 8192;
Conf.BufferSize = 8192; OffsetDistribution OD(BufferSize, 1, Align(1));
Conf.Size.To = 1;
OffsetDistribution OD(Conf);
std::default_random_engine Gen; std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I) for (size_t I = 0; I <= 10; ++I)
EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U))); EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U)));
@ -61,49 +55,42 @@ MATCHER_P(IsDivisibleBy, n, "") {
} }
TEST(OffsetDistribution, Aligned) { TEST(OffsetDistribution, Aligned) {
StudyConfiguration Conf; const size_t BufferSize = 8192;
Conf.BufferSize = 8192; OffsetDistribution OD(BufferSize, 1, Align(16));
Conf.AddressAlignment = Align(16);
Conf.Size.To = 1;
OffsetDistribution OD(Conf);
std::default_random_engine Gen; std::default_random_engine Gen;
for (size_t I = 0; I <= 10; ++I) for (size_t I = 0; I <= 10; ++I)
EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U), IsDivisibleBy(16U))); EXPECT_THAT(OD(Gen), AllOf(Ge(0U), Lt(8192U), IsDivisibleBy(16U)));
} }
TEST(MismatchOffsetDistribution, EqualBufferDisablesDistribution) { TEST(MismatchOffsetDistribution, EqualBufferDisablesDistribution) {
StudyConfiguration Conf; const size_t BufferSize = 8192;
Conf.MemcmpMismatchAt = 0; // buffer are equal. const uint32_t MismatchAt = 0; // buffer are equal.
MismatchOffsetDistribution MOD(Conf); MismatchOffsetDistribution MOD(BufferSize, 1024, MismatchAt);
EXPECT_FALSE(MOD); EXPECT_FALSE(MOD);
} }
TEST(MismatchOffsetDistribution, DifferentBufferDisablesDistribution) { TEST(MismatchOffsetDistribution, DifferentBufferDisablesDistribution) {
StudyConfiguration Conf; const size_t BufferSize = 8192;
Conf.MemcmpMismatchAt = 1; // buffer are different. const uint32_t MismatchAt = 1; // buffer are different.
MismatchOffsetDistribution MOD(Conf); MismatchOffsetDistribution MOD(BufferSize, 1024, MismatchAt);
EXPECT_FALSE(MOD); EXPECT_FALSE(MOD);
} }
TEST(MismatchOffsetDistribution, MismatchAt2) { TEST(MismatchOffsetDistribution, MismatchAt2) {
const uint32_t MismatchAt = 2; const size_t BufferSize = 16;
const uint32_t ToSize = 4; const uint32_t MismatchAt = 2; // buffer are different at position 2.
StudyConfiguration Conf; const uint32_t MaxSize = 4;
Conf.BufferSize = 16;
Conf.MemcmpMismatchAt = MismatchAt; // buffer are different at position 2.
Conf.Size.To = ToSize;
MismatchOffsetDistribution MOD(Conf); MismatchOffsetDistribution MOD(BufferSize, MaxSize, MismatchAt);
EXPECT_TRUE(MOD); EXPECT_TRUE(MOD);
// We test equality up to ToSize (=4) so we need spans of 4 equal bytes spaced // We test equality up to MaxSize (=4) so we need spans of 4 equal bytes
// by one mismatch. // spaced by one mismatch.
EXPECT_THAT(MOD.getMismatchIndices(), ElementsAre(5, 9, 13)); EXPECT_THAT(MOD.getMismatchIndices(), ElementsAre(5, 9, 13));
std::default_random_engine Gen; std::default_random_engine Gen;
for (size_t Iterations = 0; Iterations <= 10; ++Iterations) { for (size_t Iterations = 0; Iterations <= 10; ++Iterations) {
for (size_t Size = Conf.Size.From; Size <= ToSize; ++Size) { for (size_t Size = 0; Size <= MaxSize; ++Size) {
if (Size >= MismatchAt) if (Size >= MismatchAt)
EXPECT_THAT(MOD(Gen, Size), EXPECT_THAT(MOD(Gen, Size),
AnyOf(5 - MismatchAt, 9 - MismatchAt, 13 - MismatchAt)); AnyOf(5 - MismatchAt, 9 - MismatchAt, 13 - MismatchAt));


@ -1,87 +0,0 @@
//===-- Benchmark memcmp implementation -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "LibcMemoryBenchmarkMain.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace libc_benchmarks {
// The context encapsulates the buffers, parameters and the measure.
struct MemcmpContext : public BenchmarkRunner {
using FunctionPrototype = int (*)(const void *, const void *, size_t);
struct ParameterType {
uint16_t Offset = 0;
};
explicit MemcmpContext(const StudyConfiguration &Conf)
: MOD(Conf), OD(Conf), ABuffer(Conf.BufferSize), BBuffer(Conf.BufferSize),
PP(*this) {
std::uniform_int_distribution<char> Dis;
// Generate random buffer A.
for (size_t I = 0; I < Conf.BufferSize; ++I)
ABuffer[I] = Dis(Gen);
// Copy buffer A to B.
::memcpy(BBuffer.begin(), ABuffer.begin(), Conf.BufferSize);
if (Conf.MemcmpMismatchAt == 0)
return; // all same.
else if (Conf.MemcmpMismatchAt == 1)
for (char &c : BBuffer)
++c; // all different.
else
for (const auto I : MOD.getMismatchIndices())
++BBuffer[I];
}
// Needed by the ParameterProvider to update the current batch of parameter.
void Randomize(MutableArrayRef<ParameterType> Parameters) {
if (MOD)
for (auto &P : Parameters)
P.Offset = MOD(Gen, CurrentSize);
else
for (auto &P : Parameters)
P.Offset = OD(Gen);
}
ArrayRef<StringRef> getFunctionNames() const override {
static std::array<StringRef, 1> kFunctionNames = {"memcmp"};
return kFunctionNames;
}
BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) override {
CurrentSize = Size;
// FIXME: Add `bcmp` once we're guaranteed that the function is provided.
FunctionPrototype Function =
StringSwitch<FunctionPrototype>(FunctionName).Case("memcmp", &::memcmp);
return llvm::libc_benchmarks::benchmark(
Options, PP, [this, Function, Size](ParameterType p) {
return Function(ABuffer + p.Offset, BBuffer + p.Offset, Size);
});
}
private:
std::default_random_engine Gen;
MismatchOffsetDistribution MOD;
OffsetDistribution OD;
size_t CurrentSize = 0;
AlignedBuffer ABuffer;
AlignedBuffer BBuffer;
SmallParameterProvider<MemcmpContext> PP;
};
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
return std::make_unique<MemcmpContext>(Conf);
}
} // namespace libc_benchmarks
} // namespace llvm


@ -1,73 +0,0 @@
//===-- Benchmark memcpy implementation -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "LibcMemoryBenchmarkMain.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
namespace __llvm_libc {
extern void *memcpy(void *__restrict, const void *__restrict, size_t);
} // namespace __llvm_libc
namespace llvm {
namespace libc_benchmarks {
// The context encapsulates the buffers, parameters and the measure.
struct MemcpyContext : public BenchmarkRunner {
using FunctionPrototype = void *(*)(void *, const void *, size_t);
struct ParameterType {
uint16_t SrcOffset = 0;
uint16_t DstOffset = 0;
};
explicit MemcpyContext(const StudyConfiguration &Conf)
: OD(Conf), SrcBuffer(Conf.BufferSize), DstBuffer(Conf.BufferSize),
PP(*this) {}
// Needed by the ParameterProvider to update the current batch of parameter.
void Randomize(MutableArrayRef<ParameterType> Parameters) {
for (auto &P : Parameters) {
P.DstOffset = OD(Gen);
P.SrcOffset = OD(Gen);
}
}
ArrayRef<StringRef> getFunctionNames() const override {
static std::array<StringRef, 1> kFunctionNames = {"memcpy"};
return kFunctionNames;
}
BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) override {
FunctionPrototype Function = StringSwitch<FunctionPrototype>(FunctionName)
.Case("memcpy", &__llvm_libc::memcpy);
return llvm::libc_benchmarks::benchmark(
Options, PP, [this, Function, Size](ParameterType p) {
Function(DstBuffer + p.DstOffset, SrcBuffer + p.SrcOffset, Size);
return DstBuffer + p.DstOffset;
});
}
private:
std::default_random_engine Gen;
OffsetDistribution OD;
AlignedBuffer SrcBuffer;
AlignedBuffer DstBuffer;
SmallParameterProvider<MemcpyContext> PP;
};
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
return std::make_unique<MemcpyContext>(Conf);
}
} // namespace libc_benchmarks
} // namespace llvm

File diff suppressed because one or more lines are too long


@ -1,70 +0,0 @@
//===-- Benchmark memset implementation -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "LibcBenchmark.h"
#include "LibcMemoryBenchmark.h"
#include "LibcMemoryBenchmarkMain.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/raw_ostream.h"
namespace __llvm_libc {
void *memset(void *, int, size_t);
} // namespace __llvm_libc
namespace llvm {
namespace libc_benchmarks {
// The context encapsulates the buffers, parameters and the measure.
struct MemsetContext : public BenchmarkRunner {
using FunctionPrototype = void *(*)(void *, int, size_t);
struct ParameterType {
uint16_t DstOffset = 0;
};
explicit MemsetContext(const StudyConfiguration &Conf)
: OD(Conf), DstBuffer(Conf.BufferSize), MemsetValue(Conf.MemsetValue),
PP(*this) {}
// Needed by the ParameterProvider to update the current batch of parameter.
void Randomize(MutableArrayRef<ParameterType> Parameters) {
for (auto &P : Parameters) {
P.DstOffset = OD(Gen);
}
}
ArrayRef<StringRef> getFunctionNames() const override {
static std::array<StringRef, 1> kFunctionNames = {"memset"};
return kFunctionNames;
}
BenchmarkResult benchmark(const BenchmarkOptions &Options,
StringRef FunctionName, size_t Size) override {
FunctionPrototype Function = StringSwitch<FunctionPrototype>(FunctionName)
.Case("memset", &__llvm_libc::memset);
return llvm::libc_benchmarks::benchmark(
Options, PP, [this, Function, Size](ParameterType p) {
Function(DstBuffer + p.DstOffset, MemsetValue, Size);
return DstBuffer + p.DstOffset;
});
}
private:
std::default_random_engine Gen;
OffsetDistribution OD;
AlignedBuffer DstBuffer;
const uint8_t MemsetValue;
SmallParameterProvider<MemsetContext> PP;
};
std::unique_ptr<BenchmarkRunner> getRunner(const StudyConfiguration &Conf) {
return std::make_unique<MemsetContext>(Conf);
}
} // namespace libc_benchmarks
} // namespace llvm
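Both runners draw `SrcOffset`/`DstOffset` from an `OffsetDistribution` built from the study configuration. A minimal Python sketch of that idea, with illustrative names and defaults (the real distribution lives in `LibcMemoryBenchmark.h` and honors the `--aligned-access` flag described in the README below):

```python
# Hedged sketch of offset randomization: choose a random, optionally aligned,
# offset that keeps the access inside the buffer. Illustrative only.
import random

def random_offset(buffer_size, access_size, align=1):
    max_offset = buffer_size - access_size
    return random.randrange(0, max_offset + 1, align)

# A batch of randomized destination offsets for 64-byte memsets in 8 KiB.
offsets = [random_offset(8192, 64) for _ in range(16)]
```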


@ -33,7 +33,7 @@ functions.
 ## Challenges
-As seen in the [README.md](README.md#benchmarking-regimes) the microbenchmarking
+As seen in the [README.md](README.md#stochastic-mode) the microbenchmarking
 facility should focus on measuring **low latency code**. If copying a few bytes
 takes in the order of a few cycles, the benchmark should be able to **measure
 accurately down to the cycle**.
@ -76,7 +76,7 @@ Each vendor decides which performance counters to implement and their exact
 meaning. Although we want to benchmark `llvm-libc` memory functions for all
 available [target
 triples](https://clang.llvm.org/docs/CrossCompilation.html#target-triple), there
 are **no guarantees that the counter we're interested in is available.**
 ### Additional imprecisions


@ -1,65 +1,59 @@
 # Libc mem* benchmarks
-This framework has been designed to evaluate and compare relative performance of
-memory function implementations on a particular host.
-It will also be use to track implementations performances over time.
+This framework has been designed to evaluate and compare relative performance of memory function implementations on a particular machine.
+It relies on two tools:
+- `libc-benchmark-main` a C++ benchmarking utility producing raw measurements,
+- `libc-benchmark-analysis.py3` a tool to process the measurements into reports.
-## Quick start
+## Benchmarking tool
 ### Setup
-**Python 2** [being deprecated](https://www.python.org/doc/sunset-python-2/) it is
-advised to used **Python 3**.
-Then make sure to have `matplotlib`, `scipy` and `numpy` setup correctly:
-```shell
-apt-get install python3-pip
-pip3 install matplotlib scipy numpy
-```
-You may need `python3-gtk` or similar package for displaying benchmark results.
-To get good reproducibility it is important to make sure that the system runs in
-`performance` mode. This is achieved by running:
-```shell
-cpupower frequency-set --governor performance
-```
-### Run and display `memcpy` benchmark
-The following commands will run the benchmark and display a 95 percentile
-confidence interval curve of **time per copied bytes**. It also features **host
-informations** and **benchmarking configuration**.
 ```shell
 cd llvm-project
 cmake -B/tmp/build -Sllvm -DLLVM_ENABLE_PROJECTS='clang;clang-tools-extra;libc' -DCMAKE_BUILD_TYPE=Release -G Ninja
-ninja -C /tmp/build display-libc-memcpy-benchmark-small
+ninja -C /tmp/build libc-benchmark-main
 ```
-The display target will attempt to open a window on the machine where you're
-running the benchmark. If this may not work for you then you may want `render`
-or `run` instead as detailed below.
+> Note: The machine should run in `performance` mode. This is achieved by running:
+```shell
+cpupower frequency-set --governor performance
+```
-## Benchmarking targets
-The benchmarking process occurs in two steps:
-1. Benchmark the functions and produce a `json` file
-2. Display (or renders) the `json` file
-Targets are of the form `<action>-libc-<function>-benchmark-<configuration>`
-- `action` is one of :
-  - `run`, runs the benchmark and writes the `json` file
-  - `display`, displays the graph on screen
-  - `render`, renders the graph on disk as a `png` file
-- `function` is one of : `memcpy`, `memcmp`, `memset`
-- `configuration` is one of : `small`, `big`
+### Usage
+`libc-benchmark-main` can run in two modes:
+- **stochastic mode** returns the average time per call for a particular size distribution,
+- **sweep mode** returns the average time per size over a range of sizes.
+The tool requires the following flags to be set:
+- `--study-name`: a name to identify a run and provide a label during analysis,
+- `--function`: the name of the function under test.
+It also provides optional flags:
+- `--num-trials`: repeats the benchmark several times so the analysis tool can compute confidence intervals.
+- `--output`: specifies a file to write the report to - or standard output if not set.
+- `--aligned-access`: the alignment to use when accessing the buffers; unaligned by default, 0 disables address randomization.
+> Note: `--function` takes a generic function name like `memcpy` or `memset` but the actual function being tested is the llvm-libc implementation (e.g. `__llvm_libc::memcpy`).
-## Benchmarking regimes
+### Stochastic mode
+This is the preferred mode to use. The function parameters are randomized, so the branch predictor is less likely to kick in.
+```shell
+/tmp/build/bin/libc-benchmark-main \
+    --study-name="new memcpy" \
+    --function=memcpy \
+    --size-distribution-name="memcpy Google A" \
+    --num-trials=30 \
+    --output=/tmp/benchmark_result.json
+```
+The `--size-distribution-name` flag is mandatory and points to one of the [predefined distributions](libc/benchmarks/MemorySizeDistributions.h).
+> Note: These distributions are gathered from several important binaries at Google (servers, databases, realtime and batch jobs) and reflect the importance of focusing on small sizes.
 Using a profiler to observe size distributions for calls into libc functions, it
 was found most operations act on a small number of bytes.
@ -70,37 +64,48 @@ memcpy | 96% | 99%
 memset | 91% | 99.9%
 memcmp<sup>1</sup> | 99.5% | ~100%
-Benchmarking configurations come in two flavors:
-- [small](libc/utils/benchmarks/configuration_small.json)
-  - Exercises sizes up to `1KiB`, representative of normal usage
-  - The data is kept in the `L1` cache to prevent measuring the memory
-    subsystem
-- [big](libc/utils/benchmarks/configuration_big.json)
-  - Exercises sizes up to `32MiB` to test large operations
-  - Caching effects can show up here which prevents comparing different hosts
 _<sup>1</sup> - The size refers to the size of the buffers to compare and not
 the number of bytes until the first difference._
-## Superposing curves
-It is possible to **merge** several `json` files into a single graph. This is
-useful to **compare** implementations.
-In the following example we superpose the curves for `memcpy`, `memset` and
-`memcmp`:
+### Sweep mode
+This mode measures call latency per size over a range of sizes. Because it exercises the same size over and over again the branch predictor can kick in. It can still be useful to compare strengths and weaknesses of particular implementations.
 ```shell
-> make -C /tmp/build run-libc-memcpy-benchmark-small run-libc-memcmp-benchmark-small run-libc-memset-benchmark-small
-> python libc/utils/benchmarks/render.py3 /tmp/last-libc-memcpy-benchmark-small.json /tmp/last-libc-memcmp-benchmark-small.json /tmp/last-libc-memset-benchmark-small.json
+/tmp/build/bin/libc-benchmark-main \
+    --study-name="new memcpy" \
+    --function=memcpy \
+    --sweep-mode \
+    --sweep-max-size=128 \
+    --output=/tmp/benchmark_result.json
 ```
-## Useful `render.py3` flags
-- To save the produced graph `--output=/tmp/benchmark_curve.png`.
-- To prevent the graph from appearing on the screen `--headless`.
+## Analysis tool
+### Setup
+Make sure to have `matplotlib`, `pandas` and `seaborn` set up correctly:
+```shell
+apt-get install python3-pip
+pip3 install matplotlib pandas seaborn
+```
+You may need `python3-gtk` or a similar package to display the graphs.
+### Usage
+```shell
+python3 libc/benchmarks/libc-benchmark-analysis.py3 /tmp/benchmark_result.json ...
+```
+When used with __multiple trials Sweep Mode data__ the tool displays the 95% confidence interval.
+When provided with multiple reports at the same time, all the graphs from the same machine are displayed side by side to allow for comparison.
+The Y-axis unit can be changed via the `--mode` flag (a worked example follows this file):
+- `time` displays the measured time (this is the default),
+- `cycles` displays the number of cycles computed from the cpu frequency,
+- `bytespercycle` displays the number of bytes per cycle (for `Sweep Mode` reports only).
 ## Under the hood
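To make the `--mode` conversions above concrete, here is the arithmetic the analysis tool applies, as a small Python sketch with invented numbers (a hypothetical 2 GHz host; real reports carry the measured `CpuFrequency`):

```python
# Worked example of the --mode conversions; every value below is invented.
cpu_frequency_hz = 2e9   # hypothetical 2 GHz host, taken from the report's CpuFrequency
size_bytes = 64          # the size exercised by one sweep-mode sample
time_seconds = 8e-9      # measured average time per call: 8 ns

cycles = time_seconds * cpu_frequency_hz  # --mode=cycles         -> 16.0
bytes_per_cycle = size_bytes / cycles     # --mode=bytespercycle  -> 4.0
print(cycles, bytes_per_cycle)
```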


@ -1,24 +0,0 @@
{
"Options":{
"MinDuration":0.001,
"MaxDuration":1,
"InitialIterations":100,
"MaxIterations":10000000,
"MinSamples":1,
"MaxSamples":1,
"Epsilon":0.01,
"ScalingFactor":1.4
},
"Configuration":{
"Runs":5,
"BufferSize":134217728,
"Size":{
"From":0,
"To":33554432,
"Step":1048576
},
"AddressAlignment":1,
"MemsetValue":0,
"MemcmpMismatchAt":0
}
}


@ -1,24 +0,0 @@
{
"Options":{
"MinDuration":0.001,
"MaxDuration":1,
"InitialIterations":100,
"MaxIterations":10000000,
"MinSamples":4,
"MaxSamples":1000,
"Epsilon":0.01,
"ScalingFactor":1.4
},
"Configuration":{
"Runs":10,
"BufferSize":8192,
"Size":{
"From":0,
"To":1024,
"Step":1
},
"AddressAlignment":1,
"MemsetValue":0,
"MemcmpMismatchAt":0
}
}
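The deleted configurations above encode the sweep as a `Size` ramp (`From`/`To`/`Step`), which the new tool expresses with `--sweep-mode`/`--sweep-max-size`. As a sketch, the two ramps are just:

```python
# Size ramp of configuration_small.json: 0..1024 inclusive, step 1.
small_sizes = list(range(0, 1024 + 1, 1))
# configuration_big.json swept 0..32 MiB in 1 MiB steps.
big_sizes = list(range(0, 33554432 + 1, 1048576))
```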


@ -0,0 +1,128 @@
"""Reads JSON files produced by the benchmarking framework and renders them.
Installation:
> apt-get install python3-pip
> pip3 install matplotlib pandas seaborn
Run:
> python3 libc/benchmarks/libc-benchmark-analysis.py3 <files>
"""
import argparse
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import EngFormatter
def formatUnit(value, unit):
return EngFormatter(unit, sep="").format_data(value)
def formatCache(cache):
letter = cache["Type"][0].lower()
level = cache["Level"]
size = formatUnit(cache["Size"], "B")
ways = cache["NumSharing"]
return F'{letter}L{level}:{size}/{ways}'
def getCpuFrequency(study):
return study["Runtime"]["Host"]["CpuFrequency"]
def getId(study):
CpuName = study["Runtime"]["Host"]["CpuName"]
CpuFrequency = formatUnit(getCpuFrequency(study), "Hz")
Mode = " (Sweep)" if study["Configuration"]["IsSweepMode"] else ""
CpuCaches = ", ".join(formatCache(c) for c in study["Runtime"]["Host"]["Caches"])
return F'{CpuName} {CpuFrequency}{Mode}\n{CpuCaches}'
def getFunction(study):
return study["Configuration"]["Function"]
def getLabel(study):
return F'{getFunction(study)} {study["StudyName"]}'
def displaySweepData(id, studies, mode):
df = None
for study in studies:
Measurements = study["Measurements"]
SweepModeMaxSize = study["Configuration"]["SweepModeMaxSize"]
NumSizes = SweepModeMaxSize + 1
NumTrials = study["Configuration"]["NumTrials"]
assert NumTrials * NumSizes == len(Measurements), 'not a multiple of NumSizes'
Index = pd.MultiIndex.from_product([range(NumSizes), range(NumTrials)], names=['size', 'trial'])
if df is None:
df = pd.DataFrame(Measurements, index=Index, columns=[getLabel(study)])
else:
df[getLabel(study)] = pd.Series(Measurements, index=Index)
df = df.reset_index(level='trial', drop=True)
if mode == "cycles":
df *= getCpuFrequency(study)
if mode == "bytespercycle":
df *= getCpuFrequency(study)
for col in df.columns:
df[col] = pd.Series(data=df.index, index=df.index).divide(df[col])
FormatterUnit = {"time":"s","cycles":"","bytespercycle":"B/cycle"}[mode]
Label = {"time":"Time","cycles":"Cycles","bytespercycle":"Byte/cycle"}[mode]
graph = sns.lineplot(data=df, palette="muted", ci=95)
graph.set_title(id)
graph.yaxis.set_major_formatter(EngFormatter(unit=FormatterUnit))
graph.yaxis.set_label_text(Label)
graph.xaxis.set_major_formatter(EngFormatter(unit="B"))
graph.xaxis.set_label_text("Copy Size")
_ = plt.xticks(rotation=90)
plt.show()
def displayDistributionData(id, studies, mode):
distributions = set()
df = None
for study in studies:
distribution = study["Configuration"]["SizeDistributionName"]
distributions.add(distribution)
local = pd.DataFrame(study["Measurements"], columns=["time"])
local["distribution"] = distribution
local["label"] = getLabel(study)
local["cycles"] = local["time"] * getCpuFrequency(study)
if df is None:
df = local
else:
df = df.append(local)
if mode == "bytespercycle":
mode = "time"
print("`--mode=bytespercycle` is ignored for distribution mode reports")
FormatterUnit = {"time":"s","cycles":""}[mode]
Label = {"time":"Time","cycles":"Cycles"}[mode]
graph = sns.violinplot(data=df, x="distribution", y=mode, palette="muted", hue="label", order=sorted(distributions))
graph.set_title(id)
graph.yaxis.set_major_formatter(EngFormatter(unit=FormatterUnit))
graph.yaxis.set_label_text(Label)
_ = plt.xticks(rotation=90)
plt.show()
def main():
parser = argparse.ArgumentParser(description="Process benchmark json files.")
parser.add_argument("--mode", choices=["time", "cycles", "bytespercycle"], default="time", help="Use to display either 'time', 'cycles' or 'bytes/cycle'.")
parser.add_argument("files", nargs="+", help="The json files to read from.")
args = parser.parse_args()
study_groups = dict()
for file in args.files:
with open(file) as json_file:
json_obj = json.load(json_file)
Id = getId(json_obj)
if Id in study_groups:
study_groups[Id].append(json_obj)
else:
study_groups[Id] = [json_obj]
plt.tight_layout()
sns.set_theme(style="ticks")
for id, study_collection in study_groups.items():
if "(Sweep)" in id:
displaySweepData(id, study_collection, args.mode)
else:
displayDistributionData(id, study_collection, args.mode)
if __name__ == "__main__":
main()
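A self-contained illustration of the sweep-mode layout `displaySweepData` expects (all numbers invented): measurements arrive as one flat list ordered size-major, with `NumTrials` entries per size:

```python
# Hedged illustration of the flat, size-major measurement layout assumed by
# displaySweepData. Three sizes, two trials each; timings are invented.
import pandas as pd

num_sizes, num_trials = 3, 2
measurements = [1.0, 1.1,   # size 0: trial 0, trial 1
                2.0, 2.1,   # size 1
                3.0, 3.1]   # size 2
index = pd.MultiIndex.from_product([range(num_sizes), range(num_trials)],
                                   names=["size", "trial"])
df = pd.DataFrame(measurements, index=index, columns=["time"])
print(df.groupby(level="size").mean())  # average across trials per size
```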


@ -1,194 +0,0 @@
"""Reads JSON files produced by the benchmarking framework and renders them.
Installation:
> apt-get install python3-pip
> pip3 install matplotlib scipy numpy
Run:
> python3 render.py3 <files>
Rendering can occur on disk by specifying the --output option or on screen if
the --headless flag is not set.
"""
import argparse
import collections
import json
import math
import pprint
import sys
import matplotlib.pyplot as plt
from matplotlib.ticker import EngFormatter
import numpy as np
import scipy.stats
def format_freq(number):
"""Returns a human readable frequency."""
magnitude = 0
while math.fabs(number) >= 1000:
number /= 1000.0
magnitude += 1
return "%g%sHz" % (number, ["", "k", "M", "G"][magnitude])
def format_size(number):
"""Returns number in human readable form."""
magnitude = 0
while number >= 1000 and number % 1000 == 0:
number /= 1000
magnitude += 1
return "%g%s" % (number, ["", "K", "M", "G"][magnitude])
def mean_confidence_interval(dataset, confidence=0.95):
"""Returns the mean and half confidence interval for the dataset."""
a = 1.0 * np.array(dataset)
n = len(a)
m, se = np.mean(a), scipy.stats.sem(a)
h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
return m, h
def add_plot(function_name, points):
"""Plots measurements for a function."""
n = len(points.keys())
x = np.zeros(n)
y = np.zeros(n)
yerr = np.zeros(n)
for i, key in enumerate(sorted(points.keys())):
values = points[key]
m, e = mean_confidence_interval(values)
x[i] = key
y[i] = m
yerr[i] = e
plt.plot(x, y, linewidth=1, label=function_name)
plt.fill_between(x, y - yerr, y + yerr, alpha=0.5)
def get_title(host):
"""Formats the Host object into a title for the plot."""
cpu_name = host["CpuName"]
cpu_freq = format_freq(host["CpuFrequency"])
cache_strings = []
for cache in host["Caches"]:
prefix = {
"Instruction": "i",
"Data": "d",
"Unified": "u",
}.get(cache["Type"])
cache_strings.append(r"%sL_%d %s_{/%d}" %
(prefix, cache["Level"], format_size(
cache["Size"]), cache["NumSharing"]))
title = "%s (%s)" % (cpu_name, cpu_freq)
subtitle = r"$" + ", ".join(sorted(cache_strings)) + r"$"
return title + "\n" + subtitle
def get_host(jsons):
"""Returns the host of the different json objects iff they are all the same.
"""
host = None
for root in jsons:
if host and host != root["Host"]:
sys.exit("The datasets are not coming from the same Host")
if not host:
host = root["Host"]
return host
def get_configuration(jsons):
"""Returns the configuration of the different json objects iff they are all
the same.
"""
config = None
for root in jsons:
if config and config != root["Configuration"]:
return None
if not config:
config = root["Configuration"]
return config
def setup_graphs(files, display):
"""Setups the graphs to render from the json files."""
jsons = []
for file in files:
with open(file) as json_file:
jsons.append(json.load(json_file))
if not jsons:
sys.exit("Nothing to process")
for root in jsons:
frequency = root["Host"]["CpuFrequency"]
for function in root["Functions"]:
function_name = function["Name"]
sizes = function["Sizes"]
runtimes = function["Runtimes"]
assert len(sizes) == len(runtimes)
values = collections.defaultdict(lambda: [])
for i in range(len(sizes)):
value = runtimes[i]
if display == "cycles":
value = value * frequency
if display == "bytespercycle":
value = value * frequency
value = sizes[i] / value
values[sizes[i]].append(value)
add_plot(function_name, values)
config = get_configuration(jsons)
if config:
plt.figtext(
0.95,
0.15,
pprint.pformat(config),
verticalalignment="bottom",
horizontalalignment="right",
multialignment="left",
fontsize="small",
bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5))
axes = plt.gca()
axes.set_title(get_title(get_host(jsons)))
axes.set_ylim(bottom=0)
axes.set_xlabel("Size")
axes.xaxis.set_major_formatter(EngFormatter(unit="B"))
if display == "cycles":
axes.set_ylabel("Cycles")
if display == "time":
axes.set_ylabel("Time")
axes.yaxis.set_major_formatter(EngFormatter(unit="s"))
if display == "bytespercycle":
axes.set_ylabel("bytes/cycle")
plt.legend()
plt.grid()
def main():
parser = argparse.ArgumentParser(
description="Process benchmark json files.")
parser.add_argument("files", nargs="+", help="The json files to read from.")
parser.add_argument("--output", help="The output file to write the graph.")
parser.add_argument(
"--headless",
help="If set do not display the graph.",
action="store_true")
parser.add_argument(
"--display",
choices= ["time", "cycles", "bytespercycle"],
default="time",
help="Use to display either 'time', 'cycles' or 'bytes/cycle'.")
args = parser.parse_args()
setup_graphs(args.files, args.display)
if args.output:
plt.savefig(args.output)
if not args.headless:
plt.show()
if __name__ == "__main__":
main()
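For completeness, the statistics in the deleted `render.py3` above reduce to a Student-t confidence interval; a tiny self-contained check of `mean_confidence_interval`'s math with invented samples:

```python
# Recomputes mean_confidence_interval's arithmetic on invented samples: the
# result is (mean, half-width) of a 95% confidence interval.
import numpy as np
import scipy.stats

samples = np.array([9.8, 10.1, 10.0, 9.9, 10.2])
mean, sem = samples.mean(), scipy.stats.sem(samples)
half_width = sem * scipy.stats.t.ppf((1 + 0.95) / 2.0, len(samples) - 1)
print(f"{mean:.3f} ± {half_width:.3f}")
```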