// Copyright 2015 Google Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "benchmark/benchmark.h"
#include "internal_macros.h"

#ifndef BENCHMARK_OS_WINDOWS
#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#endif

#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <algorithm>
#include <atomic>
#include <cmath>
#include <condition_variable>
#include <iostream>
#include <limits>
#include <memory>
#include <thread>

#include "check.h"
#include "commandlineflags.h"
#include "log.h"
#include "mutex.h"
#include "re.h"
#include "stat.h"
#include "string_util.h"
#include "sysinfo.h"
#include "walltime.h"

DEFINE_bool(benchmark_list_tests, false,
            "Print a list of benchmarks. This option overrides all other "
            "options.");

DEFINE_string(benchmark_filter, ".",
              "A regular expression that specifies the set of benchmarks "
              "to execute. If this flag is empty, no benchmarks are run. "
              "If this flag is the string \"all\", all benchmarks linked "
              "into the process are run.");

DEFINE_double(benchmark_min_time, 0.5,
              "Minimum number of seconds we should run benchmark before "
              "results are considered significant. For cpu-time based "
              "tests, this is the lower bound on the total cpu time "
              "used by all threads that make up the test. For real-time "
              "based tests, this is the lower bound on the elapsed time "
              "of the benchmark execution, regardless of number of "
              "threads.");

DEFINE_int32(benchmark_repetitions, 1,
             "The number of runs of each benchmark. If greater than 1, the "
             "mean and standard deviation of the runs will be reported.");

DEFINE_string(benchmark_format, "tabular",
              "The format to use for console output. Valid values are "
              "'tabular', 'json', or 'csv'.");

DEFINE_bool(color_print, true, "Enables colorized logging.");

DEFINE_int32(v, 0, "The level of verbose logging to output");

namespace benchmark {

namespace internal {

void UseCharPointer(char const volatile*) {}

// NOTE: This is a dummy "mutex" type used to denote the actual mutex
// returned by GetBenchmarkLock(). This is only used to placate the thread
// safety warnings by giving the return of GetBenchmarkLock() a name.
struct CAPABILITY("mutex") BenchmarkLockType {};
BenchmarkLockType BenchmarkLockVar;

} // end namespace internal

inline Mutex& RETURN_CAPABILITY(::benchmark::internal::BenchmarkLockVar)
GetBenchmarkLock() {
  static Mutex lock;
  return lock;
}

namespace {

bool IsZero(double n) {
  return std::abs(n) < std::numeric_limits<double>::epsilon();
}

// For non-dense Range, intermediate values are powers of kRangeMultiplier.
static const int kRangeMultiplier = 8;

static const size_t kMaxIterations = 1000000000;

bool running_benchmark = false;

// Global variable so that a benchmark can cause a little extra printing
std::string* GetReportLabel() {
  static std::string label GUARDED_BY(GetBenchmarkLock());
  return &label;
}

// TODO(ericwf): support MallocCounter.
//static benchmark::MallocCounter *benchmark_mc;
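// Per-benchmark totals of the counters that client code reports via
// State::SetBytesProcessed() / State::SetItemsProcessed(), e.g.
// (illustrative sketch; block_size is a client-defined value):
//
//   state.SetBytesProcessed(int64_t(state.iterations()) * block_size);
//
// RunInThread() below adds each worker thread's State counters into a
// single shared ThreadStats.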
struct ThreadStats {
  ThreadStats() : bytes_processed(0), items_processed(0) {}
  int64_t bytes_processed;
  int64_t items_processed;
};

// Timer management class
class TimerManager {
 public:
  TimerManager(int num_threads, Notification* done)
      : num_threads_(num_threads),
        done_(done),
        running_(false),
        real_time_used_(0),
        cpu_time_used_(0),
        num_finalized_(0),
        phase_number_(0),
        entered_(0) {
  }

  // Called by each thread
  void StartTimer() EXCLUDES(lock_) {
    bool last_thread = false;
    {
      MutexLock ml(lock_);
      last_thread = Barrier(ml);
      if (last_thread) {
        CHECK(!running_) << "Called StartTimer when timer is already running";
        running_ = true;
        start_real_time_ = walltime::Now();
        start_cpu_time_ = MyCPUUsage() + ChildrenCPUUsage();
      }
    }
    if (last_thread) {
      phase_condition_.notify_all();
    }
  }

  // Called by each thread
  void StopTimer() EXCLUDES(lock_) {
    bool last_thread = false;
    {
      MutexLock ml(lock_);
      last_thread = Barrier(ml);
      if (last_thread) {
        CHECK(running_) << "Called StopTimer when timer is already stopped";
        InternalStop();
      }
    }
    if (last_thread) {
      phase_condition_.notify_all();
    }
  }

  // Called by each thread
  void Finalize() EXCLUDES(lock_) {
    MutexLock l(lock_);
    num_finalized_++;
    if (num_finalized_ == num_threads_) {
      CHECK(!running_) <<
          "The timer should be stopped before the timer is finalized";
      done_->Notify();
    }
  }

  // REQUIRES: timer is not running
  double real_time_used() EXCLUDES(lock_) {
    MutexLock l(lock_);
    CHECK(!running_);
    return real_time_used_;
  }

  // REQUIRES: timer is not running
  double cpu_time_used() EXCLUDES(lock_) {
    MutexLock l(lock_);
    CHECK(!running_);
    return cpu_time_used_;
  }

 private:
  Mutex lock_;
  Condition phase_condition_;
  int num_threads_;
  Notification* done_;

  bool running_;            // Is the timer running
  double start_real_time_;  // If running_
  double start_cpu_time_;   // If running_

  // Accumulated time so far (does not contain current slice if running_)
  double real_time_used_;
  double cpu_time_used_;

  // How many threads have called Finalize()
  int num_finalized_;

  // State for barrier management
  int phase_number_;
  int entered_;  // Number of threads that have entered this barrier

  void InternalStop() REQUIRES(lock_) {
    CHECK(running_);
    running_ = false;
    real_time_used_ += walltime::Now() - start_real_time_;
    cpu_time_used_ += ((MyCPUUsage() + ChildrenCPUUsage()) - start_cpu_time_);
  }

  // Enter the barrier and wait until all other threads have also
  // entered the barrier. Returns true iff this is the last thread to
  // enter the barrier.
  bool Barrier(MutexLock& ml) REQUIRES(lock_) {
    CHECK_LT(entered_, num_threads_);
    entered_++;
    if (entered_ < num_threads_) {
      // Wait for all threads to enter
      int phase_number_cp = phase_number_;
      auto cb = [this, phase_number_cp]() {
        return this->phase_number_ > phase_number_cp;
      };
      phase_condition_.wait(ml.native_handle(), cb);
      return false;  // I was not the last one
    } else {
      // Last thread has reached the barrier
      phase_number_++;
      entered_ = 0;
      return true;
    }
  }
};

// TimerManager for current run.
static std::unique_ptr<TimerManager> timer_manager = nullptr;

} // end namespace

namespace internal {

// Information kept per benchmark we may want to run
struct Benchmark::Instance {
  std::string name;
  Benchmark* benchmark;
  bool has_arg1;
  int arg1;
  bool has_arg2;
  int arg2;
  bool use_real_time;
  double min_time;
  int threads;         // Number of concurrent threads to use
  bool multithreaded;  // Is benchmark multi-threaded?
};

// Class for managing registered benchmarks. Note that each registered
// benchmark identifies a family of related benchmarks to run.
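// For example, a single registration such as (illustrative client code;
// BM_memcpy is a hypothetical benchmark function):
//
//   BENCHMARK(BM_memcpy)->Range(8, 8<<10);
//
// is one family. FindBenchmarks() expands it into one Instance per argument
// produced by AddRange() (8, 64, 512, ... up to 8<<10), and additionally one
// per thread count when Threads()/ThreadRange()/ThreadPerCpu() was used.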
class BenchmarkFamilies {
 public:
  static BenchmarkFamilies* GetInstance();

  // Registers a benchmark family and returns the index assigned to it.
  size_t AddBenchmark(std::unique_ptr<Benchmark> family);

  // Extract the list of benchmark instances that match the specified
  // regular expression.
  bool FindBenchmarks(const std::string& re,
                      std::vector<Benchmark::Instance>* benchmarks);

 private:
  BenchmarkFamilies() {}

  std::vector<std::unique_ptr<Benchmark>> families_;
  Mutex mutex_;
};

class BenchmarkImp {
 public:
  explicit BenchmarkImp(const char* name);
  ~BenchmarkImp();

  void Arg(int x);
  void Range(int start, int limit);
  void DenseRange(int start, int limit);
  void ArgPair(int start, int limit);
  void RangePair(int lo1, int hi1, int lo2, int hi2);
  void MinTime(double n);
  void UseRealTime();
  void Threads(int t);
  void ThreadRange(int min_threads, int max_threads);
  void ThreadPerCpu();
  void SetName(const char* name);

  static void AddRange(std::vector<int>* dst, int lo, int hi, int mult);

 private:
  friend class BenchmarkFamilies;

  std::string name_;
  int arg_count_;
  std::vector< std::pair<int, int> > args_;  // Args for all benchmark runs
  double min_time_;
  bool use_real_time_;
  std::vector<int> thread_counts_;

  BenchmarkImp& operator=(BenchmarkImp const&);
};

BenchmarkFamilies* BenchmarkFamilies::GetInstance() {
  static BenchmarkFamilies instance;
  return &instance;
}

size_t BenchmarkFamilies::AddBenchmark(std::unique_ptr<Benchmark> family) {
  MutexLock l(mutex_);
  size_t index = families_.size();
  families_.push_back(std::move(family));
  return index;
}

bool BenchmarkFamilies::FindBenchmarks(
    const std::string& spec,
    std::vector<Benchmark::Instance>* benchmarks) {
  // Make regular expression out of command-line flag
  std::string error_msg;
  Regex re;
  if (!re.Init(spec, &error_msg)) {
    std::cerr << "Could not compile benchmark re: " << error_msg << std::endl;
    return false;
  }

  // Special list of thread counts to use when none are specified
  std::vector<int> one_thread;
  one_thread.push_back(1);

  MutexLock l(mutex_);
  for (std::unique_ptr<Benchmark>& bench_family : families_) {
    // Family was deleted or benchmark doesn't match
    if (!bench_family) continue;
    BenchmarkImp* family = bench_family->imp_;

    if (family->arg_count_ == -1) {
      family->arg_count_ = 0;
      family->args_.emplace_back(-1, -1);
    }
    for (auto const& args : family->args_) {
      const std::vector<int>* thread_counts = (family->thread_counts_.empty() ?
          &one_thread : &family->thread_counts_);
      for (int num_threads : *thread_counts) {
        Benchmark::Instance instance;
        instance.name = family->name_;
        instance.benchmark = bench_family.get();
        instance.has_arg1 = family->arg_count_ >= 1;
        instance.arg1 = args.first;
        instance.has_arg2 = family->arg_count_ == 2;
        instance.arg2 = args.second;
        instance.min_time = family->min_time_;
        instance.use_real_time = family->use_real_time_;
        instance.threads = num_threads;
        instance.multithreaded = !(family->thread_counts_.empty());

        // Add arguments to instance name
        if (family->arg_count_ >= 1) {
          AppendHumanReadable(instance.arg1, &instance.name);
        }
        if (family->arg_count_ >= 2) {
          AppendHumanReadable(instance.arg2, &instance.name);
        }
        if (!IsZero(family->min_time_)) {
          instance.name += StringPrintF("/min_time:%0.3f", family->min_time_);
        }
        if (family->use_real_time_) {
          instance.name += "/real_time";
        }

        // Add the number of threads used to the name
        if (!family->thread_counts_.empty()) {
          instance.name += StringPrintF("/threads:%d", instance.threads);
        }

        if (re.Match(instance.name)) {
          benchmarks->push_back(instance);
        }
      }
    }
  }
  return true;
}

BenchmarkImp::BenchmarkImp(const char* name)
    : name_(name), arg_count_(-1),
      min_time_(0.0), use_real_time_(false) {
}

BenchmarkImp::~BenchmarkImp() {
}

void BenchmarkImp::Arg(int x) {
  CHECK(arg_count_ == -1 || arg_count_ == 1);
  arg_count_ = 1;
  args_.emplace_back(x, -1);
}

void BenchmarkImp::Range(int start, int limit) {
  CHECK(arg_count_ == -1 || arg_count_ == 1);
  arg_count_ = 1;
  std::vector<int> arglist;
  AddRange(&arglist, start, limit, kRangeMultiplier);

  for (int i : arglist) {
    args_.emplace_back(i, -1);
  }
}

void BenchmarkImp::DenseRange(int start, int limit) {
  CHECK(arg_count_ == -1 || arg_count_ == 1);
  arg_count_ = 1;
  CHECK_GE(start, 0);
  CHECK_LE(start, limit);
  for (int arg = start; arg <= limit; arg++) {
    args_.emplace_back(arg, -1);
  }
}

void BenchmarkImp::ArgPair(int x, int y) {
  CHECK(arg_count_ == -1 || arg_count_ == 2);
  arg_count_ = 2;
  args_.emplace_back(x, y);
}

void BenchmarkImp::RangePair(int lo1, int hi1, int lo2, int hi2) {
  CHECK(arg_count_ == -1 || arg_count_ == 2);
  arg_count_ = 2;
  std::vector<int> arglist1, arglist2;
  AddRange(&arglist1, lo1, hi1, kRangeMultiplier);
  AddRange(&arglist2, lo2, hi2, kRangeMultiplier);

  for (int i : arglist1) {
    for (int j : arglist2) {
      args_.emplace_back(i, j);
    }
  }
}

void BenchmarkImp::MinTime(double t) {
  CHECK(t > 0.0);
  min_time_ = t;
}

void BenchmarkImp::UseRealTime() {
  use_real_time_ = true;
}

void BenchmarkImp::Threads(int t) {
  CHECK_GT(t, 0);
  thread_counts_.push_back(t);
}

void BenchmarkImp::ThreadRange(int min_threads, int max_threads) {
  CHECK_GT(min_threads, 0);
  CHECK_GE(max_threads, min_threads);
  AddRange(&thread_counts_, min_threads, max_threads, 2);
}

void BenchmarkImp::ThreadPerCpu() {
  static int num_cpus = NumCPUs();
  thread_counts_.push_back(num_cpus);
}

void BenchmarkImp::SetName(const char* name) {
  name_ = name;
}

void BenchmarkImp::AddRange(std::vector<int>* dst, int lo, int hi, int mult) {
  CHECK_GE(lo, 0);
  CHECK_GE(hi, lo);

  // Add "lo"
  dst->push_back(lo);

  static const int kint32max = std::numeric_limits<int32_t>::max();

  // Now space out the benchmarks in multiples of "mult"
  for (int32_t i = 1; i < kint32max/mult; i *= mult) {
    if (i >= hi) break;
    if (i > lo) {
      dst->push_back(i);
    }
  }
  // Add "hi" (if different from "lo")
  if (hi != lo) {
    dst->push_back(hi);
  }
}

Benchmark::Benchmark(const char* name)
    : imp_(new BenchmarkImp(name)) {
}

Benchmark::~Benchmark() {
  delete imp_;
}

Benchmark::Benchmark(Benchmark const& other)
    : imp_(new BenchmarkImp(*other.imp_)) {
}
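// The setters below forward to BenchmarkImp and return 'this' so that calls
// can be chained at registration time, e.g. (illustrative client code;
// BM_SetInsert is a hypothetical benchmark function):
//
//   BENCHMARK(BM_SetInsert)->ArgPair(1<<10, 128)->ThreadPerCpu();
//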
Benchmark* Benchmark::Arg(int x) {
  imp_->Arg(x);
  return this;
}

Benchmark* Benchmark::Range(int start, int limit) {
  imp_->Range(start, limit);
  return this;
}

Benchmark* Benchmark::DenseRange(int start, int limit) {
  imp_->DenseRange(start, limit);
  return this;
}

Benchmark* Benchmark::ArgPair(int x, int y) {
  imp_->ArgPair(x, y);
  return this;
}

Benchmark* Benchmark::RangePair(int lo1, int hi1, int lo2, int hi2) {
  imp_->RangePair(lo1, hi1, lo2, hi2);
  return this;
}

Benchmark* Benchmark::Apply(void (*custom_arguments)(Benchmark* benchmark)) {
  custom_arguments(this);
  return this;
}

Benchmark* Benchmark::MinTime(double t) {
  imp_->MinTime(t);
  return this;
}

Benchmark* Benchmark::UseRealTime() {
  imp_->UseRealTime();
  return this;
}

Benchmark* Benchmark::Threads(int t) {
  imp_->Threads(t);
  return this;
}

Benchmark* Benchmark::ThreadRange(int min_threads, int max_threads) {
  imp_->ThreadRange(min_threads, max_threads);
  return this;
}

Benchmark* Benchmark::ThreadPerCpu() {
  imp_->ThreadPerCpu();
  return this;
}

void Benchmark::SetName(const char* name) {
  imp_->SetName(name);
}

void FunctionBenchmark::Run(State& st) {
  func_(st);
}

} // end namespace internal

namespace {

// Execute one thread of benchmark b for the specified number of iterations.
// Adds the stats collected for the thread into *total.
void RunInThread(const benchmark::internal::Benchmark::Instance* b,
                 size_t iters, int thread_id,
                 ThreadStats* total) EXCLUDES(GetBenchmarkLock()) {
  State st(iters, b->has_arg1, b->arg1, b->has_arg2, b->arg2, thread_id,
           b->threads);
  b->benchmark->Run(st);
  CHECK(st.iterations() == st.max_iterations) <<
      "Benchmark returned before State::KeepRunning() returned false!";
  {
    MutexLock l(GetBenchmarkLock());
    total->bytes_processed += st.bytes_processed();
    total->items_processed += st.items_processed();
  }

  timer_manager->Finalize();
}

void RunBenchmark(const benchmark::internal::Benchmark::Instance& b,
                  BenchmarkReporter* br) EXCLUDES(GetBenchmarkLock()) {
  size_t iters = 1;

  std::vector<BenchmarkReporter::Run> reports;

  std::vector<std::thread> pool;
  if (b.multithreaded)
    pool.resize(b.threads);

  for (int i = 0; i < FLAGS_benchmark_repetitions; i++) {
    std::string mem;
    for (;;) {
      // Try benchmark
      VLOG(2) << "Running " << b.name << " for " << iters << "\n";

      {
        MutexLock l(GetBenchmarkLock());
        GetReportLabel()->clear();
      }

      Notification done;
      timer_manager = std::unique_ptr<TimerManager>(
          new TimerManager(b.threads, &done));

      ThreadStats total;
      running_benchmark = true;

      if (b.multithreaded) {
        // If this is our first iteration of the while(true) loop then the
        // threads haven't been started and can't be joined. Otherwise we need
        // to join the threads before replacing them.
        for (std::thread& thread : pool) {
          if (thread.joinable())
            thread.join();
        }
        for (std::size_t ti = 0; ti < pool.size(); ++ti) {
          pool[ti] = std::thread(&RunInThread, &b, iters, ti, &total);
        }
      } else {
        // Run directly in this thread
        RunInThread(&b, iters, 0, &total);
      }
      done.WaitForNotification();
      running_benchmark = false;

      const double cpu_accumulated_time = timer_manager->cpu_time_used();
      const double real_accumulated_time = timer_manager->real_time_used();
      timer_manager.reset();

      VLOG(2) << "Ran in " << cpu_accumulated_time << "/"
              << real_accumulated_time << "\n";

      // Base decisions off of real time if requested by this benchmark.
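      // (use_real_time is set through Benchmark::UseRealTime(); it makes the
      // minimum-time check and the per-second rates below use elapsed
      // wall-clock time rather than accumulated CPU time, which is usually
      // the more meaningful measure for multi-threaded or I/O-bound
      // benchmarks.)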
      double seconds = cpu_accumulated_time;
      if (b.use_real_time) {
        seconds = real_accumulated_time;
      }

      std::string label;
      {
        MutexLock l(GetBenchmarkLock());
        label = *GetReportLabel();
      }

      const double min_time = !IsZero(b.min_time) ? b.min_time
                                                  : FLAGS_benchmark_min_time;

      // If this was the first run, was elapsed time or cpu time large enough?
      // If this is not the first run, go with the current value of iters.
      if ((i > 0) ||
          (iters >= kMaxIterations) ||
          (seconds >= min_time) ||
          (real_accumulated_time >= 5*min_time)) {
        double bytes_per_second = 0;
        if (total.bytes_processed > 0 && seconds > 0.0) {
          bytes_per_second = (total.bytes_processed / seconds);
        }
        double items_per_second = 0;
        if (total.items_processed > 0 && seconds > 0.0) {
          items_per_second = (total.items_processed / seconds);
        }

        // Create report about this benchmark run.
        BenchmarkReporter::Run report;
        report.benchmark_name = b.name;
        report.report_label = label;
        // Report the total iterations across all threads.
        report.iterations = static_cast<int64_t>(iters) * b.threads;
        report.real_accumulated_time = real_accumulated_time;
        report.cpu_accumulated_time = cpu_accumulated_time;
        report.bytes_per_second = bytes_per_second;
        report.items_per_second = items_per_second;
        reports.push_back(report);
        break;
      }

      // See by how much the iteration count should be increased.
      // Note: Avoid division by zero with max(seconds, 1ns).
      double multiplier = min_time * 1.4 / std::max(seconds, 1e-9);
      // If our last run was at least 10% of FLAGS_benchmark_min_time then we
      // use the multiplier directly. Otherwise we use at most 10 times
      // expansion.
      // NOTE: When the last run was at least 10% of the min time the max
      // expansion should be 14x.
      bool is_significant = (seconds / min_time) > 0.1;
      multiplier = is_significant ? multiplier : std::min(10.0, multiplier);
      if (multiplier <= 1.0) multiplier = 2.0;
      double next_iters = std::max(multiplier * iters, iters + 1.0);
      if (next_iters > kMaxIterations) {
        next_iters = kMaxIterations;
      }
      VLOG(3) << "Next iters: " << next_iters << ", " << multiplier << "\n";
      iters = static_cast<int>(next_iters + 0.5);
    }
  }
  br->ReportRuns(reports);

  if (b.multithreaded) {
    for (std::thread& thread : pool)
      thread.join();
  }
}

} // namespace

State::State(size_t max_iters, bool has_x, int x, bool has_y, int y,
             int thread_i, int n_threads)
    : started_(false), total_iterations_(0),
      has_range_x_(has_x), range_x_(x),
      has_range_y_(has_y), range_y_(y),
      bytes_processed_(0), items_processed_(0),
      thread_index(thread_i),
      threads(n_threads),
      max_iterations(max_iters) {
  CHECK(max_iterations != 0) << "At least one iteration must be run";
  CHECK_LT(thread_index, threads) << "thread_index must be less than threads";
}

void State::PauseTiming() {
  // Add in time accumulated so far
  CHECK(running_benchmark);
  timer_manager->StopTimer();
}

void State::ResumeTiming() {
  CHECK(running_benchmark);
  timer_manager->StartTimer();
}

void State::SetLabel(const char* label) {
  CHECK(running_benchmark);
  MutexLock l(GetBenchmarkLock());
  *GetReportLabel() = label;
}

namespace internal {
namespace {

void PrintBenchmarkList() {
  std::vector<Benchmark::Instance> benchmarks;
  auto families = BenchmarkFamilies::GetInstance();
  if (!families->FindBenchmarks(".", &benchmarks)) return;

  for (const internal::Benchmark::Instance& benchmark : benchmarks) {
    std::cout << benchmark.name << "\n";
  }
}

void RunMatchingBenchmarks(const std::string& spec,
                           BenchmarkReporter* reporter) {
  CHECK(reporter != nullptr);
  if (spec.empty()) return;

  std::vector<Benchmark::Instance> benchmarks;
  auto families = BenchmarkFamilies::GetInstance();
  if (!families->FindBenchmarks(spec, &benchmarks)) return;

  // Determine the width of the name field using a minimum width of 10.
  size_t name_field_width = 10;
  for (const Benchmark::Instance& benchmark : benchmarks) {
    name_field_width =
        std::max<size_t>(name_field_width, benchmark.name.size());
  }
  if (FLAGS_benchmark_repetitions > 1)
    name_field_width += std::strlen("_stddev");

  // Print header here
  BenchmarkReporter::Context context;
  context.num_cpus = NumCPUs();
  context.mhz_per_cpu = CyclesPerSecond() / 1000000.0f;

  context.cpu_scaling_enabled = CpuScalingEnabled();
  context.name_field_width = name_field_width;

  if (reporter->ReportContext(context)) {
    for (const auto& benchmark : benchmarks) {
      RunBenchmark(benchmark, reporter);
    }
  }
}

std::unique_ptr<BenchmarkReporter> GetDefaultReporter() {
  typedef std::unique_ptr<BenchmarkReporter> PtrType;
  if (FLAGS_benchmark_format == "tabular") {
    return PtrType(new ConsoleReporter);
  } else if (FLAGS_benchmark_format == "json") {
    return PtrType(new JSONReporter);
  } else if (FLAGS_benchmark_format == "csv") {
    return PtrType(new CSVReporter);
  } else {
    std::cerr << "Unexpected format: '" << FLAGS_benchmark_format << "'\n";
    std::exit(1);
  }
}

} // end namespace
} // end namespace internal

void RunSpecifiedBenchmarks() {
  RunSpecifiedBenchmarks(nullptr);
}

void RunSpecifiedBenchmarks(BenchmarkReporter* reporter) {
  if (FLAGS_benchmark_list_tests) {
    internal::PrintBenchmarkList();
    return;
  }
  std::string spec = FLAGS_benchmark_filter;
  if (spec.empty() || spec == "all")
    spec = ".";  // Regexp that matches all benchmarks

  std::unique_ptr<BenchmarkReporter> default_reporter;
  if (!reporter) {
    default_reporter = internal::GetDefaultReporter();
    reporter = default_reporter.get();
  }
  internal::RunMatchingBenchmarks(spec, reporter);
  reporter->Finalize();
}

namespace internal {

void PrintUsageAndExit() {
  fprintf(stdout,
          "benchmark"
          " [--benchmark_list_tests={true|false}]\n"
          "          [--benchmark_filter=<regex>]\n"
          "          [--benchmark_min_time=<min_time>]\n"
          "          [--benchmark_repetitions=<num_repetitions>]\n"
          "          [--benchmark_format=<tabular|json|csv>]\n"
          "          [--color_print={true|false}]\n"
          "          [--v=<verbosity>]\n");
  exit(0);
}

void ParseCommandLineFlags(int* argc, char** argv) {
  using namespace benchmark;
  for (int i = 1; i < *argc; ++i) {
    if (ParseBoolFlag(argv[i], "benchmark_list_tests",
                      &FLAGS_benchmark_list_tests) ||
        ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
        ParseDoubleFlag(argv[i], "benchmark_min_time",
                        &FLAGS_benchmark_min_time) ||
        ParseInt32Flag(argv[i], "benchmark_repetitions",
                       &FLAGS_benchmark_repetitions) ||
        ParseStringFlag(argv[i], "benchmark_format", &FLAGS_benchmark_format) ||
        ParseBoolFlag(argv[i], "color_print", &FLAGS_color_print) ||
        ParseInt32Flag(argv[i], "v", &FLAGS_v)) {
      for (int j = i; j != *argc; ++j) argv[j] = argv[j + 1];

      --(*argc);
      --i;
    } else if (IsFlag(argv[i], "help")) {
      PrintUsageAndExit();
    }
  }
  if (FLAGS_benchmark_format != "tabular" &&
      FLAGS_benchmark_format != "json" &&
      FLAGS_benchmark_format != "csv") {
    PrintUsageAndExit();
  }
}

Benchmark* RegisterBenchmarkInternal(Benchmark* bench) {
  std::unique_ptr<Benchmark> bench_ptr(bench);
  BenchmarkFamilies* families = BenchmarkFamilies::GetInstance();
  families->AddBenchmark(std::move(bench_ptr));
  return bench;
}

} // end namespace internal

void Initialize(int* argc, char** argv) {
  internal::ParseCommandLineFlags(argc, argv);
  internal::SetLogLevel(FLAGS_v);
  // TODO remove this. It prints some output the first time it is called.
  // We don't want to have this output printed during benchmarking.
  MyCPUUsage();
  // The first call to walltime::Now initializes it. Call it once to
  // prevent the initialization from happening in a benchmark.
  walltime::Now();
}

} // end namespace benchmark
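// Typical client usage (illustrative sketch; BM_StringCopy is a hypothetical
// benchmark defined in client code):
//
//   static void BM_StringCopy(benchmark::State& state) {
//     std::string src = "hello";
//     while (state.KeepRunning())
//       std::string copy(src);
//   }
//   BENCHMARK(BM_StringCopy);
//
//   int main(int argc, char** argv) {
//     benchmark::Initialize(&argc, argv);
//     benchmark::RunSpecifiedBenchmarks();
//     return 0;
//   }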