mirror of
https://github.com/mmueller41/mxtasking.git
synced 2026-01-21 12:42:57 +01:00
Adapted performance counters for AMD EPYC and switched to using the TSC instead of std::chrono.
This commit is contained in:
@@ -23,9 +23,15 @@ Benchmark::Benchmark(benchmark::Cores &&cores, const std::uint16_t iterations, s
|
|||||||
{
|
{
|
||||||
this->_chronometer.add(benchmark::Perf::CYCLES);
|
this->_chronometer.add(benchmark::Perf::CYCLES);
|
||||||
this->_chronometer.add(benchmark::Perf::INSTRUCTIONS);
|
this->_chronometer.add(benchmark::Perf::INSTRUCTIONS);
|
||||||
this->_chronometer.add(benchmark::Perf::STALLS_MEM_ANY);
|
//this->_chronometer.add(benchmark::Perf::STALLS_MEM_ANY);
|
||||||
this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_NTA);
|
//this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_NTA);
|
||||||
this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_WRITE);
|
//this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_WRITE);
|
||||||
|
this->_chronometer.add(benchmark::Perf::LLC_MISSES);
|
||||||
|
this->_chronometer.add(benchmark::Perf::DTLB_READ_MISSES);
|
||||||
|
this->_chronometer.add(benchmark::Perf::DTLB_STORE_MISSES);
|
||||||
|
this->_chronometer.add(benchmark::Perf::ITLB_LOAD_MISSES);
|
||||||
|
this->_chronometer.add(benchmark::Perf::SW_PAGE_FAULTS_MAJOR);
|
||||||
|
this->_chronometer.add(benchmark::Perf::SW_PAGE_FAULTS_MINOR);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::cout << "core configuration: \n" << this->_cores.dump(2) << std::endl;
|
std::cout << "core configuration: \n" << this->_cores.dump(2) << std::endl;
|
||||||
@@ -53,7 +59,10 @@ void Benchmark::start()
|
|||||||
{
|
{
|
||||||
this->_request_scheduler.clear();
|
this->_request_scheduler.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto *start_task = mx::tasking::runtime::new_task<StartMeasurementTask>(0U, *this);
|
||||||
|
mx::tasking::runtime::spawn(*start_task, 0U);
|
||||||
|
|
||||||
// Create one request scheduler per core.
|
// Create one request scheduler per core.
|
||||||
for (auto core_index = 0U; core_index < this->_cores.current().size(); core_index++)
|
for (auto core_index = 0U; core_index < this->_cores.current().size(); core_index++)
|
||||||
{
|
{
|
||||||
@@ -70,8 +79,8 @@ void Benchmark::start()
|
|||||||
{
|
{
|
||||||
mx::tasking::runtime::profile(this->profile_file_name());
|
mx::tasking::runtime::profile(this->profile_file_name());
|
||||||
}
|
}
|
||||||
this->_chronometer.start(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload)),
|
//this->_chronometer.start(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload)),
|
||||||
this->_current_iteration + 1, this->_cores.current());
|
// this->_current_iteration + 1, this->_cores.current());
|
||||||
}
|
}
|
||||||
|
|
||||||
const mx::util::core_set &Benchmark::core_set()
|
const mx::util::core_set &Benchmark::core_set()
|
||||||
@@ -109,6 +118,14 @@ void Benchmark::requests_finished()
|
|||||||
|
|
||||||
if (open_requests == 0U) // All request schedulers are done.
|
if (open_requests == 0U) // All request schedulers are done.
|
||||||
{
|
{
|
||||||
|
std::uint16_t core_id = mx::system::topology::core_id();
|
||||||
|
if (core_id != 0) {
|
||||||
|
this->_open_requests++;
|
||||||
|
auto *stop_task = mx::tasking::runtime::new_task<StopMeasurementTask>(0U, *this);
|
||||||
|
stop_task->annotate(static_cast<mx::tasking::TaskInterface::channel>(0));
|
||||||
|
mx::tasking::runtime::spawn(*stop_task, core_id);
|
||||||
|
return;
|
||||||
|
}
|
||||||
// Stop and print time (and performance counter).
|
// Stop and print time (and performance counter).
|
||||||
const auto result = this->_chronometer.stop(this->_workload.size());
|
const auto result = this->_chronometer.stop(this->_workload.size());
|
||||||
mx::tasking::runtime::stop();
|
mx::tasking::runtime::stop();
|
||||||
@@ -193,7 +210,7 @@ void Benchmark::requests_finished()
|
|||||||
|
|
||||||
std::string Benchmark::profile_file_name() const
|
std::string Benchmark::profile_file_name() const
|
||||||
{
|
{
|
||||||
return "profiling-" + std::to_string(this->_cores.current().size()) + "-cores" + "-phase-" +
|
return "profiling-" + std::to_string(static_cast<int>(this->_cores.current().size())) + "-cores" + "-phase-" +
|
||||||
std::to_string(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload))) + "-iteration-" +
|
std::to_string(static_cast<int>(static_cast<benchmark::phase>(this->_workload))) + "-iteration-" +
|
||||||
std::to_string(this->_current_iteration) + ".json";
|
std::to_string(static_cast<int>(this->_current_iteration)) + ".json";
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
#include <mx/tasking/config.h>
|
#include <mx/tasking/config.h>
|
||||||
#include <mx/tasking/profiling/statistic.h>
|
#include <mx/tasking/profiling/statistic.h>
|
||||||
#include <mx/tasking/runtime.h>
|
#include <mx/tasking/runtime.h>
|
||||||
|
#include <mx/system/environment.h>
|
||||||
#include <mx/util/core_set.h>
|
#include <mx/util/core_set.h>
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
#include <ostream>
|
#include <ostream>
|
||||||
@@ -177,15 +178,16 @@ public:
|
|||||||
_core_set = core_set;
|
_core_set = core_set;
|
||||||
|
|
||||||
_perf.start();
|
_perf.start();
|
||||||
_start = std::chrono::steady_clock::now();
|
_start = mx::system::Environment::timestamp(); // std::chrono::steady_clock::now();
|
||||||
}
|
}
|
||||||
|
|
||||||
InterimResult<P> stop(const std::uint64_t count_operations)
|
InterimResult<P> stop(const std::uint64_t count_operations)
|
||||||
{
|
{
|
||||||
const auto end = std::chrono::steady_clock::now();
|
const auto end = mx::system::Environment::timestamp(); // std::chrono::steady_clock::now();
|
||||||
_perf.stop();
|
_perf.stop();
|
||||||
|
|
||||||
const auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(end - _start);
|
const auto milliseconds = std::chrono::milliseconds(
|
||||||
|
(end - _start) / 2000000UL); // std::chrono::duration_cast<std::chrono::milliseconds>(end - _start);
|
||||||
|
|
||||||
return {count_operations,
|
return {count_operations,
|
||||||
_current_phase,
|
_current_phase,
|
||||||
@@ -209,7 +211,8 @@ private:
|
|||||||
P _current_phase;
|
P _current_phase;
|
||||||
mx::util::core_set _core_set;
|
mx::util::core_set _core_set;
|
||||||
alignas(64) Perf _perf;
|
alignas(64) Perf _perf;
|
||||||
alignas(64) std::chrono::steady_clock::time_point _start;
|
//alignas(64) std::chrono::steady_clock::time_point _start;
|
||||||
|
alignas(64) uint64_t _start;
|
||||||
|
|
||||||
std::unordered_map<std::uint16_t, std::uint64_t> statistic_map(
|
std::unordered_map<std::uint16_t, std::uint64_t> statistic_map(
|
||||||
const mx::tasking::profiling::Statistic::Counter counter)
|
const mx::tasking::profiling::Statistic::Counter counter)
|
||||||
|
|||||||
@@ -24,6 +24,13 @@ using namespace benchmark;
|
|||||||
*/
|
*/
|
||||||
[[maybe_unused]] PerfCounter Perf::LLC_MISSES = {"llc-miss", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES};
|
[[maybe_unused]] PerfCounter Perf::LLC_MISSES = {"llc-miss", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES};
|
||||||
|
|
||||||
|
[[maybe_unused]] PerfCounter Perf::DTLB_READ_MISSES = {"dtlb-read-miss", PERF_TYPE_HW_CACHE, 0x10003};
|
||||||
|
[[maybe_unused]] PerfCounter Perf::DTLB_STORE_MISSES = {"dtlb-store-miss", PERF_TYPE_HW_CACHE, 0x10103};
|
||||||
|
[[maybe_unused]] PerfCounter Perf::ITLB_LOAD_MISSES = {"itlb-load-miss", PERF_TYPE_HW_CACHE, 0x10004};
|
||||||
|
[[maybe_unused]] PerfCounter Perf::SW_PAGE_FAULTS = {"sw-page-faults", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS};
|
||||||
|
[[maybe_unused]] PerfCounter Perf::SW_PAGE_FAULTS_MINOR = {"sw-page-faults-minor", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MIN};
|
||||||
|
[[maybe_unused]] PerfCounter Perf::SW_PAGE_FAULTS_MAJOR = {"sw-page-faults-major", PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS_MAJ};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Counter "LLC Reference"
|
* Counter "LLC Reference"
|
||||||
* Accesses to the LLC, in which the data is present(hit) or not present(miss)
|
* Accesses to the LLC, in which the data is present(hit) or not present(miss)
|
||||||
|
|||||||
@@ -107,6 +107,12 @@ public:
|
|||||||
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T0;
|
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T0;
|
||||||
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T1_T2;
|
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T1_T2;
|
||||||
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_WRITE;
|
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_WRITE;
|
||||||
|
[[maybe_unused]] static PerfCounter DTLB_STORE_MISSES;
|
||||||
|
[[maybe_unused]] static PerfCounter DTLB_READ_MISSES;
|
||||||
|
[[maybe_unused]] static PerfCounter ITLB_LOAD_MISSES;
|
||||||
|
[[maybe_unused]] static PerfCounter SW_PAGE_FAULTS;
|
||||||
|
[[maybe_unused]] static PerfCounter SW_PAGE_FAULTS_MINOR;
|
||||||
|
[[maybe_unused]] static PerfCounter SW_PAGE_FAULTS_MAJOR;
|
||||||
|
|
||||||
Perf() noexcept = default;
|
Perf() noexcept = default;
|
||||||
~Perf() noexcept = default;
|
~Perf() noexcept = default;
|
||||||
@@ -154,4 +160,4 @@ public:
|
|||||||
private:
|
private:
|
||||||
std::vector<PerfCounter> _counter;
|
std::vector<PerfCounter> _counter;
|
||||||
};
|
};
|
||||||
} // namespace benchmark
|
} // namespace benchmark
|
||||||
|
|||||||
Reference in New Issue
Block a user