Adapted performance counters for AMD EPYC and switched timing from std::chrono to the TSC.

This commit is contained in:
Michael Mueller
2024-07-23 16:21:28 +02:00
parent 74befd9e3b
commit 5823c49317
4 changed files with 48 additions and 15 deletions

View File

@@ -23,9 +23,15 @@ Benchmark::Benchmark(benchmark::Cores &&cores, const std::uint16_t iterations, s
{
this->_chronometer.add(benchmark::Perf::CYCLES);
this->_chronometer.add(benchmark::Perf::INSTRUCTIONS);
this->_chronometer.add(benchmark::Perf::STALLS_MEM_ANY);
this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_NTA);
this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_WRITE);
//this->_chronometer.add(benchmark::Perf::STALLS_MEM_ANY);
//this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_NTA);
//this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_WRITE);
this->_chronometer.add(benchmark::Perf::LLC_MISSES);
this->_chronometer.add(benchmark::Perf::DTLB_READ_MISSES);
this->_chronometer.add(benchmark::Perf::DTLB_STORE_MISSES);
this->_chronometer.add(benchmark::Perf::ITLB_LOAD_MISSES);
this->_chronometer.add(benchmark::Perf::SW_PAGE_FAULTS_MAJOR);
this->_chronometer.add(benchmark::Perf::SW_PAGE_FAULTS_MINOR);
}
std::cout << "core configuration: \n" << this->_cores.dump(2) << std::endl;
@@ -53,7 +59,10 @@ void Benchmark::start()
{
this->_request_scheduler.clear();
}
auto *start_task = mx::tasking::runtime::new_task<StartMeasurementTask>(0U, *this);
mx::tasking::runtime::spawn(*start_task, 0U);
// Create one request scheduler per core.
for (auto core_index = 0U; core_index < this->_cores.current().size(); core_index++)
{
@@ -70,8 +79,8 @@ void Benchmark::start()
{
mx::tasking::runtime::profile(this->profile_file_name());
}
this->_chronometer.start(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload)),
this->_current_iteration + 1, this->_cores.current());
//this->_chronometer.start(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload)),
// this->_current_iteration + 1, this->_cores.current());
}
const mx::util::core_set &Benchmark::core_set()
@@ -109,6 +118,14 @@ void Benchmark::requests_finished()
if (open_requests == 0U) // All request schedulers are done.
{
std::uint16_t core_id = mx::system::topology::core_id();
if (core_id != 0) {
this->_open_requests++;
auto *stop_task = mx::tasking::runtime::new_task<StopMeasurementTask>(0U, *this);
stop_task->annotate(static_cast<mx::tasking::TaskInterface::channel>(0));
mx::tasking::runtime::spawn(*stop_task, core_id);
return;
}
// Stop and print time (and performance counter).
const auto result = this->_chronometer.stop(this->_workload.size());
mx::tasking::runtime::stop();
@@ -193,7 +210,7 @@ void Benchmark::requests_finished()
std::string Benchmark::profile_file_name() const
{
return "profiling-" + std::to_string(this->_cores.current().size()) + "-cores" + "-phase-" +
std::to_string(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload))) + "-iteration-" +
std::to_string(this->_current_iteration) + ".json";
}
return "profiling-" + std::to_string(static_cast<int>(this->_cores.current().size())) + "-cores" + "-phase-" +
std::to_string(static_cast<int>(static_cast<benchmark::phase>(this->_workload))) + "-iteration-" +
std::to_string(static_cast<int>(this->_current_iteration)) + ".json";
}

View File

@@ -7,6 +7,7 @@
#include <mx/tasking/config.h>
#include <mx/tasking/profiling/statistic.h>
#include <mx/tasking/runtime.h>
#include <mx/system/environment.h>
#include <mx/util/core_set.h>
#include <numeric>
#include <ostream>
@@ -177,15 +178,16 @@ public:
_core_set = core_set;
_perf.start();
_start = std::chrono::steady_clock::now();
_start = mx::system::Environment::timestamp(); // std::chrono::steady_clock::now();
}
InterimResult<P> stop(const std::uint64_t count_operations)
{
const auto end = std::chrono::steady_clock::now();
const auto end = mx::system::Environment::timestamp(); // std::chrono::steady_clock::now();
_perf.stop();
const auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(end - _start);
const auto milliseconds = std::chrono::milliseconds(
(end - _start) / 2000000UL); // std::chrono::duration_cast<std::chrono::milliseconds>(end - _start);
return {count_operations,
_current_phase,
@@ -209,7 +211,8 @@ private:
P _current_phase;
mx::util::core_set _core_set;
alignas(64) Perf _perf;
alignas(64) std::chrono::steady_clock::time_point _start;
//alignas(64) std::chrono::steady_clock::time_point _start;
alignas(64) uint64_t _start;
std::unordered_map<std::uint16_t, std::uint64_t> statistic_map(
const mx::tasking::profiling::Statistic::Counter counter)

View File

@@ -24,6 +24,13 @@ using namespace benchmark;
*/
[[maybe_unused]] PerfCounter Perf::LLC_MISSES = {"llc-miss", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES};
/**
 * Counter "dTLB Read Misses"
 * Raw PERF_TYPE_HW_CACHE config.
 * NOTE(review): with the perf_event_open(2) layout (cache id | op << 8 | result << 16),
 * 0x10003 decodes to DTLB / read / miss -- confirm against <linux/perf_event.h>.
 */
[[maybe_unused]] PerfCounter Perf::DTLB_READ_MISSES = {"dtlb-read-miss", PERF_TYPE_HW_CACHE, 0x10003};
/**
 * Counter "dTLB Store Misses"
 * Raw PERF_TYPE_HW_CACHE config.
 * NOTE(review): 0x10103 presumably decodes to DTLB / write / miss -- confirm.
 */
[[maybe_unused]] PerfCounter Perf::DTLB_STORE_MISSES = {"dtlb-store-miss", PERF_TYPE_HW_CACHE, 0x10103};
/**
 * Counter "iTLB Load Misses"
 * Raw PERF_TYPE_HW_CACHE config.
 * NOTE(review): 0x10004 presumably decodes to ITLB / read / miss -- confirm.
 */
[[maybe_unused]] PerfCounter Perf::ITLB_LOAD_MISSES = {"itlb-load-miss", PERF_TYPE_HW_CACHE, 0x10004};
/**
 * Counter "Page Faults"
 * Software event PERF_COUNT_SW_PAGE_FAULTS.
 */
[[maybe_unused]] PerfCounter Perf::SW_PAGE_FAULTS = {"sw-page-faults", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS};
/**
 * Counter "Minor Page Faults"
 * Software event PERF_COUNT_SW_PAGE_FAULTS_MIN.
 */
[[maybe_unused]] PerfCounter Perf::SW_PAGE_FAULTS_MINOR = {"sw-page-faults-minor", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN};
/**
 * Counter "Major Page Faults"
 * Software event PERF_COUNT_SW_PAGE_FAULTS_MAJ.
 */
[[maybe_unused]] PerfCounter Perf::SW_PAGE_FAULTS_MAJOR = {"sw-page-faults-major", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ};
/**
* Counter "LLC Reference"
* Accesses to the LLC, in which the data is present(hit) or not present(miss)

View File

@@ -107,6 +107,12 @@ public:
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T0;
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T1_T2;
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_WRITE;
[[maybe_unused]] static PerfCounter DTLB_STORE_MISSES;
[[maybe_unused]] static PerfCounter DTLB_READ_MISSES;
[[maybe_unused]] static PerfCounter ITLB_LOAD_MISSES;
[[maybe_unused]] static PerfCounter SW_PAGE_FAULTS;
[[maybe_unused]] static PerfCounter SW_PAGE_FAULTS_MINOR;
[[maybe_unused]] static PerfCounter SW_PAGE_FAULTS_MAJOR;
Perf() noexcept = default;
~Perf() noexcept = default;
@@ -154,4 +160,4 @@ public:
private:
std::vector<PerfCounter> _counter;
};
} // namespace benchmark
} // namespace benchmark