Adapted performance counters for AMD EPYC and switched to using the TSC instead of std::chrono.

This commit is contained in:
Michael Mueller
2024-07-23 16:21:28 +02:00
parent 74befd9e3b
commit 5823c49317
4 changed files with 48 additions and 15 deletions

View File

@@ -23,9 +23,15 @@ Benchmark::Benchmark(benchmark::Cores &&cores, const std::uint16_t iterations, s
{ {
this->_chronometer.add(benchmark::Perf::CYCLES); this->_chronometer.add(benchmark::Perf::CYCLES);
this->_chronometer.add(benchmark::Perf::INSTRUCTIONS); this->_chronometer.add(benchmark::Perf::INSTRUCTIONS);
this->_chronometer.add(benchmark::Perf::STALLS_MEM_ANY); //this->_chronometer.add(benchmark::Perf::STALLS_MEM_ANY);
this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_NTA); //this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_NTA);
this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_WRITE); //this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_WRITE);
this->_chronometer.add(benchmark::Perf::LLC_MISSES);
this->_chronometer.add(benchmark::Perf::DTLB_READ_MISSES);
this->_chronometer.add(benchmark::Perf::DTLB_STORE_MISSES);
this->_chronometer.add(benchmark::Perf::ITLB_LOAD_MISSES);
this->_chronometer.add(benchmark::Perf::SW_PAGE_FAULTS_MAJOR);
this->_chronometer.add(benchmark::Perf::SW_PAGE_FAULTS_MINOR);
} }
std::cout << "core configuration: \n" << this->_cores.dump(2) << std::endl; std::cout << "core configuration: \n" << this->_cores.dump(2) << std::endl;
@@ -53,7 +59,10 @@ void Benchmark::start()
{ {
this->_request_scheduler.clear(); this->_request_scheduler.clear();
} }
auto *start_task = mx::tasking::runtime::new_task<StartMeasurementTask>(0U, *this);
mx::tasking::runtime::spawn(*start_task, 0U);
// Create one request scheduler per core. // Create one request scheduler per core.
for (auto core_index = 0U; core_index < this->_cores.current().size(); core_index++) for (auto core_index = 0U; core_index < this->_cores.current().size(); core_index++)
{ {
@@ -70,8 +79,8 @@ void Benchmark::start()
{ {
mx::tasking::runtime::profile(this->profile_file_name()); mx::tasking::runtime::profile(this->profile_file_name());
} }
this->_chronometer.start(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload)), //this->_chronometer.start(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload)),
this->_current_iteration + 1, this->_cores.current()); // this->_current_iteration + 1, this->_cores.current());
} }
const mx::util::core_set &Benchmark::core_set() const mx::util::core_set &Benchmark::core_set()
@@ -109,6 +118,14 @@ void Benchmark::requests_finished()
if (open_requests == 0U) // All request schedulers are done. if (open_requests == 0U) // All request schedulers are done.
{ {
std::uint16_t core_id = mx::system::topology::core_id();
if (core_id != 0) {
this->_open_requests++;
auto *stop_task = mx::tasking::runtime::new_task<StopMeasurementTask>(0U, *this);
stop_task->annotate(static_cast<mx::tasking::TaskInterface::channel>(0));
mx::tasking::runtime::spawn(*stop_task, core_id);
return;
}
// Stop and print time (and performance counter). // Stop and print time (and performance counter).
const auto result = this->_chronometer.stop(this->_workload.size()); const auto result = this->_chronometer.stop(this->_workload.size());
mx::tasking::runtime::stop(); mx::tasking::runtime::stop();
@@ -193,7 +210,7 @@ void Benchmark::requests_finished()
std::string Benchmark::profile_file_name() const std::string Benchmark::profile_file_name() const
{ {
return "profiling-" + std::to_string(this->_cores.current().size()) + "-cores" + "-phase-" + return "profiling-" + std::to_string(static_cast<int>(this->_cores.current().size())) + "-cores" + "-phase-" +
std::to_string(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload))) + "-iteration-" + std::to_string(static_cast<int>(static_cast<benchmark::phase>(this->_workload))) + "-iteration-" +
std::to_string(this->_current_iteration) + ".json"; std::to_string(static_cast<int>(this->_current_iteration)) + ".json";
} }

View File

@@ -7,6 +7,7 @@
#include <mx/tasking/config.h> #include <mx/tasking/config.h>
#include <mx/tasking/profiling/statistic.h> #include <mx/tasking/profiling/statistic.h>
#include <mx/tasking/runtime.h> #include <mx/tasking/runtime.h>
#include <mx/system/environment.h>
#include <mx/util/core_set.h> #include <mx/util/core_set.h>
#include <numeric> #include <numeric>
#include <ostream> #include <ostream>
@@ -177,15 +178,16 @@ public:
_core_set = core_set; _core_set = core_set;
_perf.start(); _perf.start();
_start = std::chrono::steady_clock::now(); _start = mx::system::Environment::timestamp(); // std::chrono::steady_clock::now();
} }
InterimResult<P> stop(const std::uint64_t count_operations) InterimResult<P> stop(const std::uint64_t count_operations)
{ {
const auto end = std::chrono::steady_clock::now(); const auto end = mx::system::Environment::timestamp(); // std::chrono::steady_clock::now();
_perf.stop(); _perf.stop();
const auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(end - _start); const auto milliseconds = std::chrono::milliseconds(
(end - _start) / 2000000UL); // std::chrono::duration_cast<std::chrono::milliseconds>(end - _start);
return {count_operations, return {count_operations,
_current_phase, _current_phase,
@@ -209,7 +211,8 @@ private:
P _current_phase; P _current_phase;
mx::util::core_set _core_set; mx::util::core_set _core_set;
alignas(64) Perf _perf; alignas(64) Perf _perf;
alignas(64) std::chrono::steady_clock::time_point _start; //alignas(64) std::chrono::steady_clock::time_point _start;
alignas(64) uint64_t _start;
std::unordered_map<std::uint16_t, std::uint64_t> statistic_map( std::unordered_map<std::uint16_t, std::uint64_t> statistic_map(
const mx::tasking::profiling::Statistic::Counter counter) const mx::tasking::profiling::Statistic::Counter counter)

View File

@@ -24,6 +24,13 @@ using namespace benchmark;
*/ */
[[maybe_unused]] PerfCounter Perf::LLC_MISSES = {"llc-miss", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES}; [[maybe_unused]] PerfCounter Perf::LLC_MISSES = {"llc-miss", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES};
[[maybe_unused]] PerfCounter Perf::DTLB_READ_MISSES = {"dtlb-read-miss", PERF_TYPE_HW_CACHE, 0x10003};
[[maybe_unused]] PerfCounter Perf::DTLB_STORE_MISSES = {"dtlb-store-miss", PERF_TYPE_HW_CACHE, 0x10103};
[[maybe_unused]] PerfCounter Perf::ITLB_LOAD_MISSES = {"itlb-load-miss", PERF_TYPE_HW_CACHE, 0x10004};
[[maybe_unused]] PerfCounter Perf::SW_PAGE_FAULTS = {"sw-page-faults", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS};
[[maybe_unused]] PerfCounter Perf::SW_PAGE_FAULTS_MINOR = {"sw-page-faults-minor", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN};
[[maybe_unused]] PerfCounter Perf::SW_PAGE_FAULTS_MAJOR = {"sw-page-faults-major", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ};
/** /**
* Counter "LLC Reference" * Counter "LLC Reference"
* Accesses to the LLC, in which the data is present(hit) or not present(miss) * Accesses to the LLC, in which the data is present(hit) or not present(miss)

View File

@@ -107,6 +107,12 @@ public:
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T0; [[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T0;
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T1_T2; [[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T1_T2;
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_WRITE; [[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_WRITE;
[[maybe_unused]] static PerfCounter DTLB_STORE_MISSES;
[[maybe_unused]] static PerfCounter DTLB_READ_MISSES;
[[maybe_unused]] static PerfCounter ITLB_LOAD_MISSES;
[[maybe_unused]] static PerfCounter SW_PAGE_FAULTS;
[[maybe_unused]] static PerfCounter SW_PAGE_FAULTS_MINOR;
[[maybe_unused]] static PerfCounter SW_PAGE_FAULTS_MAJOR;
Perf() noexcept = default; Perf() noexcept = default;
~Perf() noexcept = default; ~Perf() noexcept = default;
@@ -154,4 +160,4 @@ public:
private: private:
std::vector<PerfCounter> _counter; std::vector<PerfCounter> _counter;
}; };
} // namespace benchmark } // namespace benchmark