Merge branch 'ealan' into tasking-profiler

2026-01-21 12:32:56 +01:00 · 2023-06-01 16:37:09 +02:00
parent 5b62e72ec3 d72527809b
commit 4660cf7604
48 changed files with 2358 additions and 95 deletions
--- a/README.md
+++ b/README.md
@@ -0,0 +1,24 @@
+# EalánOS — An Operating System for Heterogeneous Many-core Systems
+
+EalánOS is a research operating system, based on the [Genode OS Framework](https://genode.org/), that explores new architectural designs and resource management strategies for many-core systems with heterogeneous computing and memory resources. It is a reference implementation of the [MxKernel](https://mxkernel.org/) architecture.
+
+## MxKernel Architecture
+The MxKernel is a new operating system architecture inspired by many-core operating systems, such as [FOS](https://dl.acm.org/doi/abs/10.1145/1531793.1531805) and [Tessellation](https://www.usenix.org/event/hotpar09/tech/full_papers/liu/liu_html/), as well as hypervisors, exokernels and unikernels.
+Novel approaches of the MxKernel include the use of tasks, short-lived closed units of work, instead of threads as control-flow abstraction, and the concept of elastic cells as process abstraction. The architecture was first described in the paper [MxKernel: Rethinking Operating System Architecture for Many-core Hardware](https://sites.google.com/site/sfma2019eurosys/Program/sfma-mxkernel.pdf?attredirects=0), presented at the [9th Workshop on Systems for Multi-core and Heterogeneous Architectures](https://sites.google.com/site/sfma2019eurosys/).
+
+## Task-based programming
+EalánOS promotes task-parallel programming by including the [MxTasking](https://github.com/jmuehlig/mxtasking.git) task-parallel runtime library. MxTasking improves on the common task-parallel programming paradigm by allowing tasks to be annotated with hints about the task's behavior, such as memory accesses. These annotations are used by the runtime environment to implement advanced features, like automatic prefetching of data and automatic synchronization of concurrent memory accesses.
+
+## Documentation
+Because EalánOS is based on Genode, the primary documentation, for now, can be found in the book [Genode Foundations](https://genode.org/documentation/genode-foundations-22-05.pdf).
+
+## Features added to Genode
+EalánOS extends the Genode OS framework by functionality needed and helpful for many-core systems with non-uniform memory access (NUMA), such as
+- A topology service that allows querying NUMA information from within a Genode component.
+- A port of [MxTasking](https://github.com/jmuehlig/mxtasking.git), a task-based framework designed to aid in developing parallel applications.
+- (WiP) An extension of Genode's RAM service that enables applications to allocate memory from a specific NUMA region, similar to libnuma's `numa_alloc_on_node`, and thus improve NUMA-locality of internal data objects.
+- (WiP) An interface for using Hardware Performance Monitoring Counters inside Genode components. Currently, performance counters are only implemented for AMD's Zen1 microarchitecture.
+
+### Acknowledgement
+The work on EalánOS and the MxKernel architecture is supported by the German Research Foundation (DFG) as part of the priority program 2037 "[Scalable Data Management on Future Hardware](https://dfg-spp2037.de/)" under Grant numbers SP968/9-1 and SP968/9-2. 
+The MxTasking framework is developed as part of the same DFG project at the [DBIS group at TU Dortmund University](http://dbis.cs.tu-dortmund.de/cms/de/home/index.html) and funded under Grant number TE1117/2-1.
--- a/repos/base-nova/include/nova/syscall-generic.h
+++ b/repos/base-nova/include/nova/syscall-generic.h
@@ -3,7 +3,8 @@
 * \author Norman Feske
 * \author Sebastian Sumpf
 * \author Alexander Boettcher
- * \date   2009-12-27
+ * \author Michael Müller
+ * \date   2022-12-13
 */

 /*
@@ -133,11 +134,19 @@ namespace Nova {
 		bool has_feature_svm() const { return feature_flags & (1 << 2); }

 		struct Cpu_desc {
+			enum Vendor
+			{
+				UNKNOWN,
+				INTEL,
+				AMD
+			};
+
 			uint8_t flags;
 			uint8_t thread;
 			uint8_t core;
 			uint8_t package;
 			uint8_t acpi_id;
+			uint8_t vendor;
 			uint8_t family;
 			uint8_t model;
 			uint8_t stepping:4;
@@ -255,6 +264,19 @@ namespace Nova {
 		SC_EC_TIME     = 3,
 	};

+	/**
+	 * Hpc operations
+	 * 
+	 */
+	enum Hpc_op
+	{
+		HPC_SETUP = 6U,
+		HPC_START = 7U,
+		HPC_STOP = 8U,
+		HPC_RESET = 9U,
+		HPC_READ = 10U,
+	};
+
 	/**
 	 * Pd operations
 	 */
--- a/repos/base-nova/include/spec/64bit/nova/syscalls.h
+++ b/repos/base-nova/include/spec/64bit/nova/syscalls.h
@@ -253,6 +253,36 @@ namespace Nova {
 		return util_time(NOVA_EC_CTRL, ec, Ec_op::EC_TIME, time);
 	}

+	ALWAYS_INLINE
+	inline uint8_t hpc_ctrl(Hpc_op op, mword_t sel, mword_t type, mword_t &p1, mword_t &p2, mword_t &p3)
+	{
+		uint8_t res = syscall_6(NOVA_EC_CTRL, op, sel, type, p1, p2, p3);
+		return res;
+	}
+
+	ALWAYS_INLINE
+	inline uint8_t hpc_read(mword_t sel, mword_t type, mword_t &value)
+	{
+		return syscall_5(NOVA_EC_CTRL, HPC_READ, sel, type, value);
+	}
+
+	ALWAYS_INLINE
+	inline uint8_t hpc_start(mword_t sel, mword_t type)
+	{
+		return syscall_1(NOVA_EC_CTRL, HPC_START, sel, type);
+	}
+
+	ALWAYS_INLINE
+	inline uint8_t hpc_stop(mword_t sel, mword_t type)
+	{
+		return syscall_1(NOVA_EC_CTRL, HPC_STOP, sel, type);
+	}
+
+	ALWAYS_INLINE
+	inline uint8_t hpc_reset(mword_t sel, mword_t type, mword_t val)
+	{
+		return syscall_2(NOVA_EC_CTRL, HPC_RESET, sel, type, val);
+	}

 	ALWAYS_INLINE
 	inline uint8_t create_sc(mword_t sc, mword_t pd, mword_t ec, Qpd qpd)
--- a/repos/base-nova/lib/mk/base-nova-common.mk
+++ b/repos/base-nova/lib/mk/base-nova-common.mk
@@ -14,3 +14,4 @@ SRC_CC += stack_area_addr.cc
 SRC_CC += cap_map.cc
 SRC_CC += capability.cc
 SRC_CC += signal_transmitter.cc
+SRC_CC += perf.cc
--- a/repos/base-nova/ports/nova.hash
+++ b/repos/base-nova/ports/nova.hash
@@ -1 +1 @@
-d850a1b6412ce630abedf7b9aa623b5caa994235
+52fcb4b19aa032eaba5484a69c3c4c491c2a6915
--- a/repos/base-nova/ports/nova.port
+++ b/repos/base-nova/ports/nova.port
@@ -4,7 +4,7 @@ DOWNLOADS := nova.git

 # feature/numa branch
 URL(nova) := https://github.com/mmueller41/NOVA.git
-REV(nova) := 6479677bd61db47bcdcb4bd796566f83b9f655ef
+REV(nova) := 4707840843206d63f72ba9238756355d16b52be3
 DIR(nova) := src/kernel/nova

 PATCHES   := $(sort $(wildcard $(REP_DIR)/patches/*.patch))
--- a/repos/base-nova/src/core/include/platform.h
+++ b/repos/base-nova/src/core/include/platform.h
@@ -20,6 +20,7 @@
 #include <core_mem_alloc.h>
 #include <address_space.h>
 #include <base/allocator.h>
+#include <nova/syscall-generic.h>

 namespace Genode {

@@ -51,9 +52,13 @@ namespace Genode {

 			/* map of virtual cpu ids in Genode to kernel cpu ids */
 			uint8_t map_cpu_ids[MAX_SUPPORTED_CPUS];
+
+			/* map of virtual cpu ids in Genode to kernel NUMA ids */
 			uint8_t cpu_numa_map[MAX_SUPPORTED_CPUS];
+			/* map of kernel NUMA region to Genode memory ranges */
 			Genode::Range_allocator::Range numa_mem_ranges[MAX_SUPPORTED_CPUS]; // TODO: Add new macro for max of numa regions

+
 			addr_t _map_pages(addr_t phys_page, addr_t pages,
 			                  bool guard_page = false);

@@ -164,6 +169,17 @@ namespace Genode {
 					}
 				}
 			}
+
+			/**
+			 * @brief Return NOVA-internal vendor code for CPU (read from the HIP entry of CPU 0)
+			 * 
+			 */
+			Nova::Hip::Cpu_desc::Vendor cpu_vendor() {
+				extern addr_t __initial_sp;
+				Nova::Hip const &hip = *(Nova::Hip *)__initial_sp;
+
+				return static_cast<Nova::Hip::Cpu_desc::Vendor>(hip.cpu_desc_of_cpu(0)->vendor);
+			}
 	};
 }

--- a/repos/base-nova/src/core/ram_dataspace_support.cc
+++ b/repos/base-nova/src/core/ram_dataspace_support.cc
@@ -56,30 +56,35 @@ static inline void * alloc_region(Dataspace_component &ds, const size_t size)

 void Ram_dataspace_factory::_clear_ds(Dataspace_component &ds)
 {
+	
 	size_t const page_rounded_size = align_addr(ds.size(), get_page_size_log2());

-	//size_t memset_count = page_rounded_size / 4;
-	//addr_t memset_ptr   = ds.core_local_addr();
+	size_t memset_count = page_rounded_size / 8; /* 'rep stosq' stores 8 bytes per count, so count = bytes / 8 */
+	addr_t memset_ptr   = ds.core_local_addr();

-	/*
-	if ((memset_count * 4 == page_rounded_size) && !(memset_ptr & 0x3))
-		asm volatile ("rep stosl" : "+D" (memset_ptr), "+c" (memset_count)
+	if ((memset_count * 8 == page_rounded_size) && !(memset_ptr & 0x7)) /* whole quadwords, 8-byte aligned start */
+	{
+		asm volatile ("rep stosq" : "+D" (memset_ptr), "+c" (memset_count)
 		                          : "a" (0)  : "memory");
-	else
+	} else
 		memset(reinterpret_cast<void *>(memset_ptr), 0, page_rounded_size);
-	*/
+}
+
+void Ram_dataspace_factory::_unmap_ds_from_core(Dataspace_component &ds)
+{
+	size_t const page_rounded_size = align_addr(ds.size(), get_page_size_log2());
+
 	/* we don't keep any core-local mapping */
 	unmap_local(*reinterpret_cast<Nova::Utcb *>(Thread::myself()->utcb()),
-	            ds.core_local_addr(),
-	            page_rounded_size >> get_page_size_log2());
+				ds.core_local_addr(),
+				page_rounded_size >> get_page_size_log2());

-	platform().region_alloc().free((void*)ds.core_local_addr(),
-	                               page_rounded_size);
+	platform().region_alloc().free((void *)ds.core_local_addr(),
+								   page_rounded_size);

 	ds.assign_core_local_addr(nullptr);
 }

-
 void Ram_dataspace_factory::_export_ram_ds(Dataspace_component &ds) {

 	size_t page_rounded_size = align_addr(ds.size(), get_page_size_log2());
--- a/repos/base-nova/src/kernel/nova/target.mk
+++ b/repos/base-nova/src/kernel/nova/target.mk
@@ -36,7 +36,7 @@ CC_OPT          += -mpreferred-stack-boundary=2 -mregparm=3
 else
 ifeq ($(filter-out $(SPECS),64bit),)
 override CC_MARCH = -m64
-CC_WARN         += -Wframe-larger-than=256
+CC_WARN         += -Wframe-larger-than=1024
 CC_OPT          += -mpreferred-stack-boundary=4 -mcmodel=kernel -mno-red-zone
 else
 $(error Unsupported environment)
--- a/repos/base-nova/src/lib/base/perf.cc
+++ b/repos/base-nova/src/lib/base/perf.cc
@@ -0,0 +1,86 @@
+
+/*
+ * \brief  Performance Counter infrastructure, NOVA-specific implementation
+ * \author Michael Müller
+ * \date   2022-12-15
+ */
+
+#include <base/trace/perf.h>
+
+#include <nova/syscall-generic.h>
+#include <nova/syscalls.h>
+#include <base/log.h>
+
+unsigned long Genode::Trace::Performance_counter::private_freemask { 0xffff };
+unsigned long Genode::Trace::Performance_counter::shared_freemask { 0xffff0000 };
+
+void Genode::Trace::Performance_counter::_init_masks()
+{
+    Nova::Hip::Cpu_desc::Vendor vendor = Nova::Hip::Cpu_desc::AMD;
+    if (vendor == Nova::Hip::Cpu_desc::AMD)
+    {
+        private_freemask = 0x3f; // 6 core performance counters
+        shared_freemask = 0x1f0000; // 5 L3 complex performance counters
+    }
+    else if (vendor == Nova::Hip::Cpu_desc::INTEL)
+    {
+        private_freemask = 0x7fff;
+        shared_freemask = 0x7fff0000; // 15 CBO performance counters
+    }
+}
+
+void Genode::Trace::Performance_counter::setup(unsigned counter, uint64_t event, uint64_t mask, uint64_t flags)
+{
+    Nova::mword_t evt = event;
+    Nova::mword_t msk = mask;
+    Nova::mword_t flg = flags;
+    Nova::uint8_t rc;
+    Nova::mword_t type = (counter >>4);
+    Nova::mword_t sel = type == Performance_counter::CORE ? counter : counter & 0xf;
+
+    if ((rc = (Nova::hpc_ctrl(Nova::HPC_SETUP, sel, type, evt, msk, flg))) != Nova::NOVA_OK)
+        throw  Genode::Trace::Pfc_access_error(rc);
+}
+
+/* Start 'counter'; throws Pfc_access_error if the kernel call does not return NOVA_OK. */
+void Genode::Trace::Performance_counter::start(unsigned counter)
+{
+    Nova::mword_t type = (counter >> 4);
+    /* counters of a non-core type are selected by their low nibble, as in setup()/stop()/reset()/read() */
+    Nova::mword_t sel = type == Performance_counter::CORE ? counter : counter & 0xf;
+    Nova::uint8_t rc = Nova::hpc_start(sel, type);
+    if (rc != Nova::NOVA_OK) throw  Genode::Trace::Pfc_access_error(rc);
+}
+
+void Genode::Trace::Performance_counter::stop(unsigned counter)
+{
+    Nova::uint8_t rc;
+    Nova::mword_t type = (counter >>4);
+    Nova::mword_t sel = type == Performance_counter::CORE ? counter : counter & 0xf;
+
+    if ((rc = Nova::hpc_stop(sel, type)) != Nova::NOVA_OK)
+        throw  Genode::Trace::Pfc_access_error(rc);
+}
+
+void Genode::Trace::Performance_counter::reset(unsigned counter, unsigned val)
+{
+    Nova::uint8_t rc;
+    Nova::mword_t type = (counter >>4);
+    Nova::mword_t sel = type == Performance_counter::CORE ? counter : counter & 0xf;
+
+    if ((rc = Nova::hpc_reset(sel, type, val)) != Nova::NOVA_OK)
+        throw  Genode::Trace::Pfc_access_error(rc);
+}
+
+Genode::uint64_t Genode::Trace::Performance_counter::read(unsigned counter)
+{
+    Nova::uint8_t rc;
+    Nova::mword_t value = 0;
+    Nova::mword_t type = (counter >>4);
+    Nova::mword_t sel = type == Performance_counter::CORE ? counter : counter & 0xf;
+
+    if ((rc = Nova::hpc_read(sel, type, value)) != Nova::NOVA_OK)
+        throw  Genode::Trace::Pfc_access_error(rc);
+
+    return static_cast<Genode::uint64_t>(value);
+}
--- a/repos/base/include/base/attached_ram_dataspace.h
+++ b/repos/base/include/base/attached_ram_dataspace.h
@@ -14,6 +14,7 @@
 #ifndef _INCLUDE__BASE__ATTACHED_RAM_DATASPACE_H_
 #define _INCLUDE__BASE__ATTACHED_RAM_DATASPACE_H_

+#include <util/string.h>
 #include <util/touch.h>
 #include <base/ram_allocator.h>
 #include <base/env.h>
@@ -105,6 +106,7 @@ class Genode::Attached_ram_dataspace
 			_size(size), _ram(&ram), _rm(&rm), _cache(cache)
 		{
 			_alloc_and_attach();
+			memset(_local_addr, 0, _size);
 		}

 		/**
--- a/repos/base/include/base/local_connection.h
+++ b/repos/base/include/base/local_connection.h
@@ -93,8 +93,15 @@ struct Genode::Local_connection_base : Noncopyable

 			if (_session_state->phase == Session_state::INSUFFICIENT_RAM_QUOTA
 			 || _session_state->phase == Session_state::INSUFFICIENT_CAP_QUOTA)
-				warning("giving up to increase session quota for ", service.name(), " session "
+			 {
+				warning("[", label, "] giving up to increase session quota for ", service.name(), " session "
 				        "after ", (int)NUM_ATTEMPTS, " attempts");
+				if (_session_state->phase == Session_state::INSUFFICIENT_RAM_QUOTA)
+					warning("Insufficient RAM quota: ", resources.ram_quota.value);
+				
+				if (_session_state->phase == Session_state::INSUFFICIENT_CAP_QUOTA)
+					warning("Insufficient CAP quota ", resources.cap_quota.value);
+			 }
 		}

 		void close()
--- a/repos/base/include/base/trace/perf.h
+++ b/repos/base/include/base/trace/perf.h
@@ -0,0 +1,93 @@
+/*
+ * \brief  Performance Counter infrastructure
+ * \author Michael Müller
+ * \date   2022-12-15
+ */
+
+#pragma once
+
+#include <base/stdint.h>
+
+namespace Genode { namespace Trace {
+
+    class Pfc_no_avail {
+    };
+
+    class Performance_counter
+    {
+
+    private:
+        static unsigned long private_freemask;
+        static unsigned long shared_freemask;
+
+        /* Atomically claim the lowest set bit of *free_mask and return its index; throws Pfc_no_avail if no bit is set. */
+        static unsigned _alloc(unsigned long *free_mask)
+        {
+            unsigned long current_mask, new_mask;
+            unsigned bit;
+
+            do
+            {
+                current_mask = __atomic_load_n(free_mask, __ATOMIC_RELAXED);
+                bit = __builtin_ffsl(current_mask);
+                if (!bit) // Allocation failed; checked first so we never shift by (bit - 1) == -1
+                    throw Pfc_no_avail();
+                new_mask = current_mask & ~(1UL << (bit - 1)); // 1UL: the mask is an unsigned long
+            } while (!__atomic_compare_exchange(free_mask, &current_mask, &new_mask, true, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED));
+
+            return bit - 1; // number of the allocated counter
+        }
+
+            static void _init_masks();
+
+        public:
+            typedef unsigned int Counter;
+            
+            enum Type
+            {
+                CORE = 0,
+                CACHE = 1
+            };
+            
+            static unsigned acquire(Type type) {
+                return (type == Type::CORE) ? alloc_core() : alloc_cbo();
+            }
+
+            static unsigned alloc_cbo() {
+                if (shared_freemask == 0xffff0000)
+                    _init_masks();
+                return _alloc(&shared_freemask);
+            }
+
+            static unsigned alloc_core() {
+                if (private_freemask == 0xffff)
+                    _init_masks();
+                return _alloc(&private_freemask);
+            }
+
+            static void release(unsigned counter) {
+                unsigned long const bit = 1UL << counter; // counter number == bit position in its free mask
+                if ((counter >> 4) == CORE) // type is (counter >> 4): 0 = CORE, otherwise a cache counter
+                    __atomic_fetch_or(&private_freemask, bit, __ATOMIC_RELEASE); // atomic, _alloc() uses CAS on this word
+                else
+                    __atomic_fetch_or(&shared_freemask, bit, __ATOMIC_RELEASE);
+            }
+
+            static void setup(unsigned counter, Genode::uint64_t event, Genode::uint64_t mask, Genode::uint64_t flags);
+            static void start(unsigned counter);
+            static void stop(unsigned counter);
+            static void reset(unsigned counter, unsigned val=0);
+            static uint64_t read(unsigned counter);
+    };
+
+    class Pfc_access_error {
+        private:
+            Genode::uint8_t _rc;
+
+        public:
+            Pfc_access_error(uint8_t rc) : _rc(rc) {}
+            Genode::uint8_t error_code() { return _rc; }
+    };
+
+    }
+}
--- a/repos/base/include/topo_session/connection.h
+++ b/repos/base/include/topo_session/connection.h
@@ -27,7 +27,7 @@ struct Genode::Topo_connection : Connection<Topo_session>, Topo_session_client
 {
    enum
    {
-        RAM_QUOTA = 262144
+        RAM_QUOTA = 2097152UL
    };

    Topo_connection(Env &env, const char *label = "", Affinity const &affinity = Affinity()) 
--- a/repos/base/lib/symbols/ld
+++ b/repos/base/lib/symbols/ld
@@ -54,6 +54,9 @@ _ZN5Timer10ConnectionC1ERN6Genode3EnvEPKc T
 _ZN5Timer10ConnectionC1ERN6Genode3EnvERNS1_10EntrypointEPKc T
 _ZN5Timer10ConnectionC2ERN6Genode3EnvEPKc T
 _ZN5Timer10ConnectionC2ERN6Genode3EnvERNS1_10EntrypointEPKc T
+_ZN6Genode5Trace19Performance_counter15shared_freemaskE D 8
+_ZN6Genode5Trace19Performance_counter16private_freemaskE D 8
+_ZN6Genode5Trace19Performance_counter11_init_masksEv T
 _ZN6Genode10Entrypoint16_dispatch_signalERNS_6SignalE T
 _ZN6Genode10Entrypoint16schedule_suspendEPFvvES2_ T
 _ZN6Genode10Entrypoint22Signal_proxy_component6signalEv T
@@ -274,6 +277,11 @@ _ZN6Genode5Trace6Logger17_evaluate_controlEv T
 _ZN6Genode5Trace6Logger3logEPKcm T
 _ZN6Genode5Trace6LoggerC1Ev T
 _ZN6Genode5Trace6LoggerC2Ev T
+_ZN6Genode5Trace19Performance_counter4readEj T
+_ZN6Genode5Trace19Performance_counter4stopEj T
+_ZN6Genode5Trace19Performance_counter5resetEjj T
+_ZN6Genode5Trace19Performance_counter5setupEjyyy T
+_ZN6Genode5Trace19Performance_counter5startEj T
 _ZN6Genode5Trace18Partitioned_buffer4initEm T
 _ZN6Genode5Trace18Partitioned_buffer6commitEm T
 _ZN6Genode5Trace18Partitioned_buffer7reserveEm T
--- a/repos/base/src/core/include/cpu_thread_component.h
+++ b/repos/base/src/core/include/cpu_thread_component.h
@@ -172,6 +172,7 @@ class Genode::Cpu_thread_component : public  Rpc_object<Cpu_thread>,

 			_address_space_region_map.add_client(_rm_client);
 			_platform_thread.pager(_rm_client);
+			_platform_thread.affinity(location);
 			_trace_sources.insert(&_trace_source);
 		}

--- a/repos/base/src/core/include/ram_dataspace_factory.h
+++ b/repos/base/src/core/include/ram_dataspace_factory.h
@@ -82,6 +82,11 @@ class Genode::Ram_dataspace_factory : public Ram_allocator,
 		 */
 		void _clear_ds(Dataspace_component &ds);

+		/**
+		 * Remove core-local mappings of dataspace
+		 */
+		void _unmap_ds_from_core(Dataspace_component &ds);
+
 	public:

 		Ram_dataspace_factory(Rpc_entrypoint  &ep,
--- a/repos/base/src/core/include/topo_session_component.h
+++ b/repos/base/src/core/include/topo_session_component.h
@@ -20,6 +20,7 @@
 #include <base/affinity.h>
 #include <base/heap.h>
 #include <topo_session/topo_session.h>
+#include <platform.h>
 #include <topo_session/node.h>

 namespace Genode {
@@ -32,7 +33,7 @@ class Genode::Topo_session_component : public Session_object<Topo_session>
        Genode::Affinity &_affinity;
        Sliced_heap _md_alloc;
        
-        Topology::Numa_region _node_affinities[64][64];
+        Topology::Numa_region _node_affinities[Genode::Platform::MAX_SUPPORTED_CPUS][Genode::Platform::MAX_SUPPORTED_CPUS];
        unsigned _node_count;
        Topology::Numa_region _nodes[64];

--- a/repos/base/src/core/main.cc
+++ b/repos/base/src/core/main.cc
@@ -286,7 +286,7 @@ int main()
 	size_t const avail_ram_quota = core_pd.avail_ram().value;
 	size_t const avail_cap_quota = core_pd.avail_caps().value;

-	size_t const preserved_ram_quota = 224*1024;
+	size_t const preserved_ram_quota = 224*1024+(1<<20);
 	size_t const preserved_cap_quota = 1000;

 	if (avail_ram_quota < preserved_ram_quota) {
--- a/repos/base/src/core/ram_dataspace_factory.cc
+++ b/repos/base/src/core/ram_dataspace_factory.cc
@@ -123,6 +123,7 @@ Ram_dataspace_factory::try_alloc(size_t ds_size, Cache cache)
 	Dataspace_component &ds = *ds_ptr;

 	/* create native shared memory representation of dataspace */
+#ifdef ZERO_AT_ALLOC
 	try { _export_ram_ds(ds); }
 	catch (Core_virtual_memory_exhausted) {
 		warning("could not export RAM dataspace of size ", ds.size());
@@ -137,8 +138,8 @@ Ram_dataspace_factory::try_alloc(size_t ds_size, Cache cache)
 	 * function must also make sure to flush all cache lines related to the
 	 * address range used by the dataspace.
 	 */
-	_clear_ds(ds);
-
+	_unmap_ds_from_core(ds);
+#endif 
 	Dataspace_capability ds_cap = _ep.manage(&ds);

 	phys_alloc_guard.keep = true;
@@ -181,8 +182,25 @@ void Ram_dataspace_factory::free(Ram_dataspace_capability ds_cap)
 	});

 	/* call dataspace destructor and free memory */
-	if (ds)
+	if (ds) {
+		try { _export_ram_ds(*ds); }
+		catch (Core_virtual_memory_exhausted) {
+			warning("could not export RAM dataspace of size ", ds->size());
+
+			/* cleanup unneeded resources */
+			destroy(_ds_slab, ds);
+			return;
+		}
+
+		/*
+		* Fill new dataspaces with zeros. For non-cached RAM dataspaces, this
+		* function must also make sure to flush all cache lines related to the
+		* address range used by the dataspace.
+		*/
+		_clear_ds(*ds);
+		_unmap_ds_from_core(*ds);
 		destroy(_ds_slab, ds);
+	}
 }


--- a/repos/libports/lib/import/import-libpfm4.mk
+++ b/repos/libports/lib/import/import-libpfm4.mk
@@ -0,0 +1 @@
+INC_DIR += $(call select_from_ports,libpfm4)/include
--- a/repos/libports/lib/mk/libpfm4.mk
+++ b/repos/libports/lib/mk/libpfm4.mk
@@ -0,0 +1,204 @@
+LIBPFM4_DIR := $(call select_from_ports,libpfm4)/src/lib/libpfm4
+
+CC_OPT += -D_REENTRANT -fvisibility=hidden
+
+SRC_CC = $(LIBPFM4_DIR)/lib/pfmlib_common.c
+
+# build libpfm only for x86_64 for now
+CONFIG_PFMLIB_ARCH_X86_64=y
+CONFIG_PFMLIB_ARCH_X86=y
+
+CONFIG_PFMLIB_SHARED?=n
+CONFIG_PFMLIB_DEBUG?=y
+CONFIG_PFMLIB_NOPYTHON?=y
+
+#
+# list all library support modules
+#
+ifeq ($(CONFIG_PFMLIB_ARCH_IA64),y)
+INCARCH = $(INC_IA64)
+#SRCS   += pfmlib_gen_ia64.c pfmlib_itanium.c pfmlib_itanium2.c pfmlib_montecito.c
+CFLAGS += -DCONFIG_PFMLIB_ARCH_IA64
+endif
+
+ifeq ($(CONFIG_PFMLIB_ARCH_X86),y)
+
+ifeq ($(SYS),Linux)
+SRCS += pfmlib_intel_x86_perf_event.c pfmlib_amd64_perf_event.c \
+	pfmlib_intel_netburst_perf_event.c \
+	pfmlib_intel_snbep_unc_perf_event.c
+endif
+
+INCARCH = $(INC_X86)
+SRCS   += pfmlib_amd64.c pfmlib_intel_core.c pfmlib_intel_x86.c \
+	  pfmlib_intel_x86_arch.c pfmlib_intel_atom.c \
+	  pfmlib_intel_nhm_unc.c pfmlib_intel_nhm.c \
+	  pfmlib_intel_wsm.c  \
+	  pfmlib_intel_snb.c pfmlib_intel_snb_unc.c \
+	  pfmlib_intel_ivb.c pfmlib_intel_ivb_unc.c \
+	  pfmlib_intel_hsw.c \
+	  pfmlib_intel_bdw.c \
+	  pfmlib_intel_skl.c \
+	  pfmlib_intel_icl.c \
+	  pfmlib_intel_spr.c \
+	  pfmlib_intel_rapl.c \
+	  pfmlib_intel_snbep_unc.c \
+	  pfmlib_intel_snbep_unc_cbo.c \
+	  pfmlib_intel_snbep_unc_ha.c \
+	  pfmlib_intel_snbep_unc_imc.c \
+	  pfmlib_intel_snbep_unc_pcu.c \
+	  pfmlib_intel_snbep_unc_qpi.c \
+	  pfmlib_intel_snbep_unc_ubo.c \
+	  pfmlib_intel_snbep_unc_r2pcie.c \
+	  pfmlib_intel_snbep_unc_r3qpi.c \
+	  pfmlib_intel_ivbep_unc_cbo.c \
+	  pfmlib_intel_ivbep_unc_ha.c \
+	  pfmlib_intel_ivbep_unc_imc.c \
+	  pfmlib_intel_ivbep_unc_pcu.c \
+	  pfmlib_intel_ivbep_unc_qpi.c \
+	  pfmlib_intel_ivbep_unc_ubo.c \
+	  pfmlib_intel_ivbep_unc_r2pcie.c \
+	  pfmlib_intel_ivbep_unc_r3qpi.c \
+	  pfmlib_intel_ivbep_unc_irp.c \
+	  pfmlib_intel_hswep_unc_cbo.c \
+	  pfmlib_intel_hswep_unc_ha.c \
+	  pfmlib_intel_hswep_unc_imc.c \
+	  pfmlib_intel_hswep_unc_pcu.c \
+	  pfmlib_intel_hswep_unc_qpi.c \
+	  pfmlib_intel_hswep_unc_ubo.c \
+	  pfmlib_intel_hswep_unc_r2pcie.c \
+	  pfmlib_intel_hswep_unc_r3qpi.c \
+	  pfmlib_intel_hswep_unc_irp.c \
+	  pfmlib_intel_hswep_unc_sbo.c \
+	  pfmlib_intel_bdx_unc_cbo.c \
+	  pfmlib_intel_bdx_unc_ubo.c \
+	  pfmlib_intel_bdx_unc_sbo.c \
+	  pfmlib_intel_bdx_unc_ha.c \
+	  pfmlib_intel_bdx_unc_imc.c \
+	  pfmlib_intel_bdx_unc_irp.c \
+	  pfmlib_intel_bdx_unc_pcu.c \
+	  pfmlib_intel_bdx_unc_qpi.c \
+	  pfmlib_intel_bdx_unc_r2pcie.c \
+	  pfmlib_intel_bdx_unc_r3qpi.c \
+	  pfmlib_intel_skx_unc_cha.c \
+	  pfmlib_intel_skx_unc_iio.c \
+	  pfmlib_intel_skx_unc_imc.c \
+	  pfmlib_intel_skx_unc_irp.c \
+	  pfmlib_intel_skx_unc_m2m.c \
+	  pfmlib_intel_skx_unc_m3upi.c \
+	  pfmlib_intel_skx_unc_pcu.c \
+	  pfmlib_intel_skx_unc_ubo.c \
+	  pfmlib_intel_skx_unc_upi.c \
+	  pfmlib_intel_knc.c \
+	  pfmlib_intel_slm.c \
+	  pfmlib_intel_tmt.c \
+	  pfmlib_intel_knl.c \
+	  pfmlib_intel_knl_unc_imc.c \
+	  pfmlib_intel_knl_unc_edc.c \
+	  pfmlib_intel_knl_unc_cha.c \
+	  pfmlib_intel_knl_unc_m2pcie.c \
+	  pfmlib_intel_glm.c \
+	  pfmlib_intel_netburst.c \
+	  pfmlib_amd64_k7.c pfmlib_amd64_k8.c pfmlib_amd64_fam10h.c \
+	  pfmlib_amd64_fam11h.c pfmlib_amd64_fam12h.c \
+	  pfmlib_amd64_fam14h.c pfmlib_amd64_fam15h.c \
+	  pfmlib_amd64_fam17h.c pfmlib_amd64_fam16h.c \
+	  pfmlib_amd64_fam19h.c pfmlib_amd64_rapl.c \
+	  pfmlib_amd64_fam19h_l3.c
+
+CFLAGS += -DCONFIG_PFMLIB_ARCH_X86
+
+ifeq ($(CONFIG_PFMLIB_ARCH_I386),y)
+SRCS += pfmlib_intel_coreduo.c pfmlib_intel_p6.c
+CFLAGS += -DCONFIG_PFMLIB_ARCH_I386
+endif
+
+ifeq ($(CONFIG_PFMLIB_ARCH_X86_64),y)
+CFLAGS += -DCONFIG_PFMLIB_ARCH_X86_64
+endif
+
+endif
+
+ifeq ($(CONFIG_PFMLIB_ARCH_POWERPC),y)
+
+ifeq ($(SYS),Linux)
+SRCS += pfmlib_powerpc_perf_event.c
+endif
+
+INCARCH = $(INC_POWERPC)
+SRCS   += pfmlib_powerpc.c pfmlib_power4.c pfmlib_ppc970.c pfmlib_power5.c \
+	pfmlib_power6.c pfmlib_power7.c pfmlib_torrent.c pfmlib_power8.c \
+	pfmlib_power9.c pfmlib_powerpc_nest.c pfmlib_power10.c
+CFLAGS += -DCONFIG_PFMLIB_ARCH_POWERPC
+endif
+
+ifeq ($(CONFIG_PFMLIB_ARCH_S390X),y)
+
+ifeq ($(SYS),Linux)
+SRCS += pfmlib_s390x_perf_event.c
+endif
+
+INCARCH = $(INC_S390X)
+SRCS   += pfmlib_s390x_cpumf.c
+CFLAGS += -DCONFIG_PFMLIB_ARCH_S390X
+endif
+
+ifeq ($(CONFIG_PFMLIB_ARCH_SPARC),y)
+
+ifeq ($(SYS),Linux)
+SRCS += pfmlib_sparc_perf_event.c
+endif
+
+INCARCH = $(INC_SPARC)
+SRCS   += pfmlib_sparc.c pfmlib_sparc_ultra12.c pfmlib_sparc_ultra3.c pfmlib_sparc_ultra4.c pfmlib_sparc_niagara.c
+CFLAGS += -DCONFIG_PFMLIB_ARCH_SPARC
+endif
+
+ifeq ($(CONFIG_PFMLIB_ARCH_ARM),y)
+
+ifeq ($(SYS),Linux)
+SRCS += pfmlib_arm_perf_event.c
+endif
+
+INCARCH = $(INC_ARM)
+SRCS   += pfmlib_arm.c pfmlib_arm_armv7_pmuv1.c pfmlib_arm_armv6.c pfmlib_arm_armv8.c pfmlib_tx2_unc_perf_event.c pfmlib_kunpeng_unc_perf_event.c
+CFLAGS += -DCONFIG_PFMLIB_ARCH_ARM
+endif
+
+ifeq ($(CONFIG_PFMLIB_ARCH_ARM64),y)
+
+ifeq ($(SYS),Linux)
+SRCS += pfmlib_arm_perf_event.c
+endif
+
+INCARCH = $(INC_ARM64)
+SRCS   += pfmlib_arm.c pfmlib_arm_armv8.c pfmlib_tx2_unc_perf_event.c pfmlib_kunpeng_unc_perf_event.c
+CFLAGS += -DCONFIG_PFMLIB_ARCH_ARM64
+endif
+
+ifeq ($(CONFIG_PFMLIB_ARCH_MIPS),y)
+
+ifeq ($(SYS),Linux)
+SRCS += pfmlib_mips_perf_event.c
+endif
+
+INCARCH = $(INC_MIPS)
+SRCS   += pfmlib_mips.c pfmlib_mips_74k.c
+CFLAGS += -DCONFIG_PFMLIB_ARCH_MIPS
+endif
+
+ifeq ($(CONFIG_PFMLIB_CELL),y)
+INCARCH = $(INC_CELL)
+#SRCS   += pfmlib_cell.c
+CFLAGS += -DCONFIG_PFMLIB_CELL
+endif
+
+SRC_CC += $(addprefix $(LIBPFM4_DIR)/lib/,$(SRCS))
+vpath %.c $(LIBPFM4_DIR)/lib
+
+CC_OPT += $(CFLAGS)
+
+INC_DIR += $(LIBPFM4_DIR)/include $(LIBPFM4_DIR)/lib/events
+vpath %.h $(INC_DIR)
+
+LIBS += base libm libc
--- a/repos/libports/ports/libpfm4.hash
+++ b/repos/libports/ports/libpfm4.hash
@@ -0,0 +1 @@
+b0ec09148c2be9f4a96203a3d2de4ebed6ce2da0
--- a/repos/libports/ports/libpfm4.port
+++ b/repos/libports/ports/libpfm4.port
@@ -0,0 +1,13 @@
+LICENSE := PD
+DOWNLOADS := libpfm4.git
+VERSION := git
+
+URL(libpfm4) := https://github.com/wcohen/libpfm4.git
+REV(libpfm4) := 8aaaf1747e96031a47ed6bd9337ff61a21f8cc64
+DIR(libpfm4) := src/lib/libpfm4
+
+DIRS += include
+DIRS += include/perfmon
+
+DIR_CONTENT(include) += src/lib/libpfm4/include/perfmon
+DIR_CONTENT(include/perfmon) += src/lib/libpfm4/include/perfmon/*.h 
--- a/repos/libports/ports/mxtasking.hash
+++ b/repos/libports/ports/mxtasking.hash
@@ -1 +1 @@
-07a3844690ae8eb15832d93e29567a5a8e6e45af
+03dc91ed3385b2a62dee0c4f20daf9b5cb29ba24
--- a/repos/libports/ports/mxtasking.port
+++ b/repos/libports/ports/mxtasking.port
@@ -3,7 +3,7 @@ DOWNLOADS := mxtasking.git
 VERSION := git

 URL(mxtasking) := https://github.com/mmueller41/mxtasking.git
-REV(mxtasking) := bfc90d4dcf88b7072c76d70e897cb4072f399248 
+REV(mxtasking) := fcf0a2810ba69d1017d6d7d9a5d6e60ac962f9f1 
 DIR(mxtasking) := src/lib/mxtasking

 DIRS += include/mx/memory
--- a/repos/libports/recipes/src/libpfm4/api
+++ b/repos/libports/recipes/src/libpfm4/api
@@ -0,0 +1 @@
+libpfm4
--- a/repos/libports/recipes/src/libpfm4/content.mk
+++ b/repos/libports/recipes/src/libpfm4/content.mk
@@ -0,0 +1,17 @@
+MIRROR_FROM_REP_DIR := lib/mk/libpfm4.mk lib/import/import-libpfm4.mk
+
+content: src/lib/libpfm4 LICENSE $(MIRROR_FROM_REP_DIR) # LICENSE is produced by the rule at the end of this file
+
+PORT_DIR := $(call port_dir,$(REP_DIR)/ports/libpfm4)
+
+src/lib/libpfm4:
+	mkdir -p $@
+	cp -r $(PORT_DIR)/src/lib/libpfm4/* $@
+	rm -rf $@/.git
+	echo "LIBS = libpfm4" > $@/target.mk
+
+$(MIRROR_FROM_REP_DIR):
+	$(mirror_from_rep_dir)
+
+LICENSE:
+	echo "libpfm license, see src/lib/libpfm4/COPYING" > $@
--- a/repos/libports/recipes/src/libpfm4/used_api
+++ b/repos/libports/recipes/src/libpfm4/used_api
@@ -0,0 +1,3 @@
+base
+libm
+libc
--- a/repos/libports/src/lib/libc/component.cc
+++ b/repos/libports/src/lib/libc/component.cc
@@ -79,4 +79,4 @@ void Component::construct(Genode::Env &env)
 * Default stack size for libc-using components
 */
 Genode::size_t Libc::Component::stack_size() __attribute__((weak));
-Genode::size_t Libc::Component::stack_size() { return 32UL*1024*sizeof(long); }
+Genode::size_t Libc::Component::stack_size() { return 96UL*1024*sizeof(long); }
--- a/repos/mml/run/hello_mxtask.run
+++ b/repos/mml/run/hello_mxtask.run
@@ -20,7 +20,9 @@ set config {
        <service name="IO_MEM"/>
        <service name="IO_PORT"/>
        <service name="RM"/>
+        <service name="TOPO"/>
    </parent-provides>
+        <affinity-space width="32" height="1"/> 
    <default-route>
        <any-service><parent/><any-child/></any-service>
    </default-route>
@@ -38,7 +40,8 @@ append_platform_drv_config

 append config {
    <start name="hello_mxtask">
-        <resource name="RAM" quantum="3G"/>
+        <resource name="RAM" quantum="16G"/>
+            <affinity xpos="0" ypos="0" width="16" height="1"/>
        <config>
            <vfs> <dir name="dev"> 
                <log/> 
@@ -60,6 +63,6 @@ set boot_modules {

 append_platform_drv_boot_modules
 build_boot_image $boot_modules
-append qemu_args "-nographic -m 64"
+append qemu_args "-nographic"

 run_genode_until forever
--- a/repos/mml/run/hpc_test.run
+++ b/repos/mml/run/hpc_test.run
@@ -0,0 +1,80 @@
+set build_components {
+    core init timer app/hpc_test
+}
+
+source ${genode_dir}/repos/base/run/platform_drv.inc 
+append_platform_drv_build_components
+
+build $build_components
+
+create_boot_directory
+
+set config {
+    <config>
+        <parent-provides>
+            <service name="LOG"/>
+            <service name="PD"/>
+            <service name="CPU"/>
+            <service name="ROM"/>
+            <service name="RAM"/>
+            <service name="IRQ"/>
+            <service name="IO_MEM"/>
+            <service name="IO_PORT"/>
+            <service name="CAP"/>
+            <service name="RM"/>
+            <service name="SIGNAL"/>
+            <service name="TOPO"/>
+        </parent-provides>
+        <default-route>
+            <any-service><parent/><any-child/></any-service>
+        </default-route>
+        <default caps="200"/>
+        <start name="timer">
+            <resource name="RAM" quantum="16M"/>
+            <provides><service name="Timer"/></provides>
+            <route>
+                <any-service><parent/><any-child/></any-service>
+            </route>
+        </start>
+}
+
+append config {
+        <start name="hpc_test1">
+            <binary name="hpc_test"/>
+            <resource name="RAM" quantum="64M"/>
+            <config>
+                <vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
+                <libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
+            </config>
+            <route>
+                <service name="Timer"><child name="timer"/></service>
+                <any-service><parent/><any-child/></any-service>
+            </route>
+        </start>
+        <start name="hpc_test2">
+            <binary name="hpc_test"/>
+            <resource name="RAM" quantum="64M"/>
+            <config>
+                <vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
+                <libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
+            </config>
+            <route>
+                <service name="Timer"><child name="timer"/></service>
+                <any-service><parent/><any-child/></any-service>
+            </route>
+        </start>
+    </config>
+}
+
+install_config $config
+
+set boot_modules {
+    core init timer vfs.lib.so ld.lib.so posix.lib.so libc.lib.so libm.lib.so stdcxx.lib.so hpc_test
+}
+
+append_platform_drv_boot_modules
+
+build_boot_image $boot_modules
+append qemu_args "-nographic "
+
+run_genode_until forever
--- a/repos/mml/run/libpfm_test.run
+++ b/repos/mml/run/libpfm_test.run
@@ -0,0 +1,68 @@
+set build_components {
+    core init timer app/libpfm_test
+}
+
+source ${genode_dir}/repos/base/run/platform_drv.inc 
+append_platform_drv_build_components
+
+build $build_components
+
+create_boot_directory
+
+set config {
+    <config>
+        <parent-provides>
+            <service name="LOG"/>
+            <service name="LOG"/>
+            <service name="PD"/>
+            <service name="CPU"/>
+            <service name="ROM"/>
+            <service name="RAM"/>
+            <service name="IRQ"/>
+            <service name="IO_MEM"/>
+            <service name="IO_PORT"/>
+            <service name="CAP"/>
+            <service name="RM"/>
+            <service name="SIGNAL"/>
+            <service name="TOPO"/>
+        </parent-provides>
+        <default-route>
+            <any-service><parent/><any-child/></any-service>
+        </default-route>
+        <default caps="200"/>
+        <start name="timer">
+            <resource name="RAM" quantum="1M"/>
+            <provides><service name="Timer"/></provides>
+            <route>
+                <any-service><parent/><any-child/></any-service>
+            </route>
+        </start>
+}
+
+append config {
+        <start name="libpfm_test">
+            <resource name="RAM" quantum="10M"/>
+            <config>
+                <vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
+                <libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
+            </config>
+            <route>
+                <service name="Timer"><child name="timer"/></service>
+                <any-service><parent/><any-child/></any-service>
+            </route>
+        </start>
+    </config>
+}
+
+install_config $config
+
+set boot_modules {
+    core init timer vfs.lib.so ld.lib.so posix.lib.so libc.lib.so libm.lib.so stdcxx.lib.so libpfm_test
+}
+
+append_platform_drv_boot_modules
+
+build_boot_image $boot_modules
+append qemu_args "-nographic "
+
+run_genode_until forever
--- a/repos/mml/run/livedemo.run
+++ b/repos/mml/run/livedemo.run
@@ -0,0 +1,116 @@
+set build_components {
+    core init timer app/blinktree
+}
+
+build $build_components
+
+create_boot_directory
+
+set config {
+    <config>
+        <default caps="200"/>
+        <default-route>
+            <any-service><parent/><any-child/></any-service>
+        </default-route>
+
+        <parent-provides>
+            <service name="PD"/>
+            <service name="CPU"/>
+            <service name="RAM"/>
+            <service name="ROM"/>
+            <service name="RM"/>
+            <service name="LOG"/>
+            <service name="TOPO"/>
+        </parent-provides>
+
+        <affinity-space width="32" height="1"/>
+
+        <start name="timer">
+            <provides><service name="Timer"/></provides>
+            <resource name="RAM" quantum="8M"/>
+            <route>
+                <any-service><parent/><any-child/></any-service>
+            </route>
+        </start>
+
+        <start name="blinktree1">
+            <binary name="blinktree"/>
+            <resource name="RAM" quantum="80G"/>
+            <affinity xpos="1" ypos="0" width="31" height="1"/>
+            <route>
+                <service name="Timer"><child name="timer"/></service>
+                <any-service><parent/><any-child/></any-service>
+            </route>
+            <config>
+            <vfs>
+                    <dir name="dev"> 
+                        <log/> 
+                        <inline name="rtc">2022-07-20 14:30</inline> 
+                    </dir>
+                    <dir name="workloads">
+                        <rom name="fill_randint_workloada"/>
+                        <rom name="mixed_randint_workloada"/>
+                    </dir>
+                </vfs>
+                <libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
+            </config>
+        </start>
+        <start name="blinktree2">
+            <binary name="blinktree"/>
+            <resource name="RAM" quantum="80G"/>
+            <affinity xpos="1" ypos="0" width="31" height="1"/>
+            <route>
+                <service name="Timer"><child name="timer"/></service>
+                <any-service><parent/><any-child/></any-service>
+            </route>
+            <config>
+            <vfs>
+                    <dir name="dev"> 
+                        <log/> 
+                        <inline name="rtc">2022-07-20 14:30</inline> 
+                    </dir>
+                    <dir name="workloads">
+                        <rom name="fill_randint_workloada"/>
+                        <rom name="mixed_randint_workloada"/>
+                    </dir>
+                </vfs>
+                <libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
+            </config>
+        </start>
+        <start name="blinktree3">
+            <binary name="blinktree"/>
+            <resource name="RAM" quantum="80G"/>
+            <affinity xpos="1" ypos="0" width="31" height="1"/>
+            <route>
+                <service name="Timer"><child name="timer"/></service>
+                <any-service><parent/><any-child/></any-service>
+            </route>
+            <config>
+            <vfs>
+                    <dir name="dev"> 
+                        <log/> 
+                        <inline name="rtc">2022-07-20 14:30</inline> 
+                    </dir>
+                    <dir name="workloads">
+                        <rom name="fill_randint_workloada"/>
+                        <rom name="mixed_randint_workloada"/>
+                    </dir>
+                </vfs>
+                <libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
+            </config>
+        </start>
+    </config>    
+             
+
+             
+}
+
+install_config $config
+
+set boot_modules { 
+    core init timer vfs.lib.so ld.lib.so libm.lib.so libc.lib.so stdcxx.lib.so posix.lib.so blinktree fill_randint_workloada mixed_randint_workloada
+}
+
+build_boot_image $boot_modules
+append qemu_args "-nographic"
+run_genode_until forever
--- a/repos/mml/src/app/blinktree/benchmark/chronometer.h
+++ b/repos/mml/src/app/blinktree/benchmark/chronometer.h
@@ -1,8 +1,6 @@
 #pragma once

-#ifdef PERF_SUPPORT
 #include "perf.h"
-#endif
 #include "phase.h"
 #include <chrono>
 #include <json.hpp>
@@ -51,7 +49,7 @@ template <typename P> class InterimResult
 public:
    InterimResult(const std::uint64_t operation_count, const P &phase, const std::uint16_t iteration,
                  const std::uint16_t core_count, const std::chrono::milliseconds time,
-                  /*std::vector<PerfCounter> &counter,*/ 
+                  std::vector<PerfCounter> &counter, 
                  std::unordered_map<std::uint16_t, std::uint64_t> executed_tasks,
                  std::unordered_map<std::uint16_t, std::uint64_t> executed_reader_tasks,
                  std::unordered_map<std::uint16_t, std::uint64_t> executed_writer_tasks,
@@ -65,12 +63,10 @@ public:
          _scheduled_tasks_on_core(std::move(scheduled_tasks_on_core)),
          _scheduled_tasks_off_core(std::move(scheduled_tasks_off_core)), _worker_fills(std::move(worker_fills))
    {
-#ifdef PERF_SUPPORT
        for (auto &c : counter)
        {
            _performance_counter.emplace_back(std::make_pair(c.name(), c.read()));
        }
-#endif
    }

    ~InterimResult() = default;
@@ -181,9 +177,7 @@ public:
        _current_phase = phase;
        _current_iteration = iteration;
        _core_set = core_set;
-#ifdef PERF_SUPPORT
        _perf.start();
-#endif
        
        //_start = std::chrono::steady_clock::now();
        _start = Genode::Trace::timestamp();
@@ -193,9 +187,7 @@ public:
    {
        const auto end = Genode::Trace::timestamp();
        //const auto end = std::chrono::steady_clock::now();
-#ifdef PERF_SUPPORT
        _perf.stop();
-#endif

        //const auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(end-_start);
        const auto milliseconds = std::chrono::milliseconds((end-_start)/2000000UL);
@@ -205,7 +197,7 @@ public:
                _current_iteration,
                _core_set.size(),
                milliseconds,
-                //_perf.counter(),
+                _perf.counter(),
                statistic_map(mx::tasking::profiling::Statistic::Executed),
                statistic_map(mx::tasking::profiling::Statistic::ExecutedReader),
                statistic_map(mx::tasking::profiling::Statistic::ExecutedWriter),
@@ -214,16 +206,12 @@ public:
                statistic_map(mx::tasking::profiling::Statistic::ScheduledOffChannel),
                statistic_map(mx::tasking::profiling::Statistic::Fill)};
    }
-#ifdef PERF_SUPPORT
    void add(PerfCounter &performance_counter) { _perf.add(performance_counter); }
-#endif
 private:
    std::uint16_t _current_iteration{0U};
    P _current_phase;
    mx::util::core_set _core_set;
-#ifdef PERF_SUPPORT
    alignas(64) Perf _perf;
-#endif
    //alignas(64) std::chrono::steady_clock::time_point _start;
    alignas(64) size_t _start;

--- a/repos/mml/src/app/blinktree/benchmark/perf.cpp
+++ b/repos/mml/src/app/blinktree/benchmark/perf.cpp
@@ -6,29 +6,27 @@ using namespace benchmark;
 * Counter "Instructions Retired"
 * Counts when the last uop of an instruction retires.
 */
-[[maybe_unused]] PerfCounter Perf::INSTRUCTIONS = {"instr", PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS};
+[[maybe_unused]] PerfCounter Perf::INSTRUCTIONS = {"instr", Genode::Trace::Performance_counter::Type::CORE, 0xc0, 0x0};

 /**
 */
-[[maybe_unused]] PerfCounter Perf::CYCLES = {"cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES};
+[[maybe_unused]] PerfCounter Perf::CYCLES = {"cycles", Genode::Trace::Performance_counter::Type::CORE, 0x76, 0x0};

 /**
 */
-[[maybe_unused]] PerfCounter Perf::L1_MISSES = {"l1-miss", PERF_TYPE_HW_CACHE,
-                                                PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) |
-                                                    (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)};
+[[maybe_unused]] PerfCounter Perf::L1_MISSES = {"l1-miss", Genode::Trace::Performance_counter::Type::CORE, 0x43, 0x5b};

 /**
 * Counter "LLC Misses"
 * Accesses to the LLC in which the data is not present(miss).
 */
-[[maybe_unused]] PerfCounter Perf::LLC_MISSES = {"llc-miss", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES};
+[[maybe_unused]] PerfCounter Perf::LLC_MISSES = {"llc-miss", Genode::Trace::Performance_counter::Type::CACHE, 0x6, 0xff};

 /**
 * Counter "LLC Reference"
 * Accesses to the LLC, in which the data is present(hit) or not present(miss)
 */
-[[maybe_unused]] PerfCounter Perf::LLC_REFERENCES = {"llc-ref", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES};
+[[maybe_unused]] PerfCounter Perf::LLC_REFERENCES = {"llc-ref", Genode::Trace::Performance_counter::Type::CACHE, 0x4, 0xff};

 /**
 * Micro architecture "Skylake"
@@ -36,7 +34,7 @@ using namespace benchmark;
 * EventSel=A3H,UMask=14H, CMask=20
 * Execution stalls while memory subsystem has an outstanding load.
 */
-PerfCounter Perf::STALLS_MEM_ANY = {"memory-stall", PERF_TYPE_RAW, 0x145314a3};
+//PerfCounter Perf::STALLS_MEM_ANY = {"memory-stall", PERF_TYPE_RAW, 0x145314a3};

 /**
 * Micro architecture "Skylake"
@@ -44,7 +42,7 @@ PerfCounter Perf::STALLS_MEM_ANY = {"memory-stall", PERF_TYPE_RAW, 0x145314a3};
 * EventSel=32H,UMask=01H
 * Number of PREFETCHNTA instructions executed.
 */
-[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_NTA = {"sw-prefetch-nta", PERF_TYPE_RAW, 0x530132};
+[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_NTA = {"sw-prefetch-nta", Genode::Trace::Performance_counter::Type::CORE, 0x4b, 0x4};

 /**
 * Micro architecture "Skylake"
@@ -52,7 +50,7 @@ PerfCounter Perf::STALLS_MEM_ANY = {"memory-stall", PERF_TYPE_RAW, 0x145314a3};
 * EventSel=32H,UMask=02H
 * Number of PREFETCHT0 instructions executed.
 */
-[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_T0 = {"sw-prefetch-t0", PERF_TYPE_RAW, 0x530232};
+//[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_T0 = {"sw-prefetch-t0", Genode::Trace::Performance_counter::Type::CORE, 0x4b, };

 /**
 * Micro architecture "Skylake"
@@ -60,7 +58,7 @@ PerfCounter Perf::STALLS_MEM_ANY = {"memory-stall", PERF_TYPE_RAW, 0x145314a3};
 * EventSel=32H,UMask=04H
 * Number of PREFETCHT1 or PREFETCHT2 instructions executed.
 */
-[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_T1_T2 = {"sw-prefetch-t1t2", PERF_TYPE_RAW, 0x530432};
+//[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_T1_T2 = {"sw-prefetch-t1t2", PERF_TYPE_RAW, 0x530432};

 /**
 * Micro architecture "Skylake"
@@ -68,4 +66,4 @@ PerfCounter Perf::STALLS_MEM_ANY = {"memory-stall", PERF_TYPE_RAW, 0x145314a3};
 * EventSel=32H,UMask=08H
 * Number of PREFETCHW instructions executed.
 */
-[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_WRITE = {"sw-prefetch-w", PERF_TYPE_RAW, 0x530832};
+[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_WRITE = {"sw-prefetch-w", Genode::Trace::Performance_counter::Type::CORE, 0x4b, 0x2};
--- a/repos/mml/src/app/blinktree/benchmark/perf.h
+++ b/repos/mml/src/app/blinktree/benchmark/perf.h
@@ -1,12 +1,11 @@
 #pragma once
 #include <algorithm>
-#include <asm/unistd.h>
 #include <cstring>
-#include <linux/perf_event.h> // TODO: Find Genode equivalent
+#include <iostream>
 #include <string>
-#include <sys/ioctl.h>
-#include <unistd.h>
 #include <vector>
+#include <base/trace/perf.h>
+

 /*
 * For more Performance Counter take a look into the Manual from Intel:
@@ -28,46 +27,62 @@ namespace benchmark {
 class PerfCounter
 {
 public:
-    PerfCounter(std::string &&name, const std::uint64_t type, const std::uint64_t event_id) : _name(std::move(name))
+    PerfCounter(std::string &&name, const Genode::Trace::Performance_counter::Type type, const std::uint64_t event_id, const std::uint64_t mask) : _name(std::move(name)), _type(type), _event_id(static_cast<Genode::uint64_t>(event_id)), _mask(static_cast<Genode::uint64_t>(mask))
    {
-        /*std::memset(&_perf_event_attribute, 0, sizeof(perf_event_attr));
-        _perf_event_attribute.type = type;
-        _perf_event_attribute.size = sizeof(perf_event_attr);
-        _perf_event_attribute.config = event_id;
-        _perf_event_attribute.disabled = true;
-        _perf_event_attribute.inherit = 1;
-        _perf_event_attribute.exclude_kernel = false;
-        _perf_event_attribute.exclude_hv = false;
-        _perf_event_attribute.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;*/
+        
    }

    ~PerfCounter() = default;

    bool open()
    {
-        /*_file_descriptor = syscall(__NR_perf_event_open, &_perf_event_attribute, 0, -1, -1, 0);*/
-        return _file_descriptor >= 0;
+        // Acquire a counter of the requested type, then program event and mask.
+        // Either failure aborts: _counter must never reach setup() uninitialized.
+        try {
+            _counter = Genode::Trace::Performance_counter::acquire(_type);
+        } catch (Genode::Trace::Pfc_no_avail) {
+            std::cerr << "Failed to open performance counters." << std::endl;
+            return false;
+        }
+        try {
+            Genode::Trace::Performance_counter::setup(_counter, _event_id, _mask, (_type == Genode::Trace::Performance_counter::Type::CORE ? 0x30000 : 0x550f000000000000));
+        } catch (Genode::Trace::Pfc_access_error &e) {
+            std::cerr << "Error while setting up performance counter: " << e.error_code() << std::endl;
+            return false;
+        }
+        return _counter >= 0;
    }

    bool start()
    {
-        //ioctl(_file_descriptor, PERF_EVENT_IOC_RESET, 0);
-        //ioctl(_file_descriptor, PERF_EVENT_IOC_ENABLE, 0);
-        return ::read(_file_descriptor, &_prev, sizeof(read_format)) == sizeof(read_format);
+        // Start counting and snapshot the baseline value that read() subtracts.
+        try {
+            Genode::Trace::Performance_counter::start(_counter);
+            _prev.value = static_cast<std::uint64_t>(Genode::Trace::Performance_counter::read(_counter));
+        } catch (Genode::Trace::Pfc_access_error &e) {
+            std::cerr << "Failed to start counter: " << e.error_code() << std::endl;
+            return false; // report the failure instead of a stale baseline
+        }
+        return _prev.value >= 0;
    }

    bool stop()
    {
-        //const auto is_read = ::read(_file_descriptor, &_data, sizeof(read_format)) == sizeof(read_format);
-        //ioctl(_file_descriptor, PERF_EVENT_IOC_DISABLE, 0);
-        return false; // is_read;
+        // Take the final sample before halting and resetting the counter;
+        // read() later reports the difference between this sample and the
+        // baseline captured in start().
+        try {
+            _data.value = Genode::Trace::Performance_counter::read(_counter);
+            Genode::Trace::Performance_counter::stop(_counter);
+            Genode::Trace::Performance_counter::reset(_counter);
+        } catch (Genode::Trace::Pfc_access_error &e) {
+            std::cerr << "Failed to stop counter: " << e.error_code() << std::endl;
+            return false; // _data may be stale, do not claim success
+        }
+        return _data.value >= 0;
    }

    [[nodiscard]] double read() const
    {
-        const auto multiplexing_correction = static_cast<double>(_data.time_enabled - _prev.time_enabled) /
-                                             static_cast<double>(_data.time_running - _prev.time_running);
-        return static_cast<double>(_data.value - _prev.value) * multiplexing_correction;
+        return static_cast<double>(_data.value - _prev.value);
    }

    [[nodiscard]] const std::string &name() const { return _name; }
@@ -84,8 +99,10 @@ private:
    };

    const std::string _name;
-    std::int32_t _file_descriptor = -1;
-    //perf_event_attr _perf_event_attribute{};
+    Genode::Trace::Performance_counter::Type _type;
+    Genode::uint64_t _event_id;
+    Genode::uint64_t _mask;
+    Genode::Trace::Performance_counter::Counter _counter;
    read_format _prev{};
    read_format _data{};
 };
@@ -101,11 +118,11 @@ public:
    [[maybe_unused]] static PerfCounter L1_MISSES;
    [[maybe_unused]] [[maybe_unused]] static PerfCounter LLC_MISSES;
    [[maybe_unused]] static PerfCounter LLC_REFERENCES;
-    [[maybe_unused]] static PerfCounter STALLED_CYCLES_BACKEND;
-    [[maybe_unused]] static PerfCounter STALLS_MEM_ANY;
+    //[[maybe_unused]] static PerfCounter STALLED_CYCLES_BACKEND;
+    //[[maybe_unused]] static PerfCounter STALLS_MEM_ANY;
    [[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_NTA;
-    [[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T0;
-    [[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T1_T2;
+    //[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T0;
+    //[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T1_T2;
    [[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_WRITE;

    Perf() noexcept = default;
--- a/repos/mml/src/app/blinktree/blinktree_benchmark/benchmark.cpp
+++ b/repos/mml/src/app/blinktree/blinktree_benchmark/benchmark.cpp
@@ -22,16 +22,14 @@ Benchmark::Benchmark(Libc::Env &env, benchmark::Cores &&cores, const std::uint16
      _result_file_name(std::move(result_file_name)), _statistic_file_name(std::move(statistic_file_name)),
      _tree_file_name(std::move(tree_file_name)), _profile(profile), _workload(env)
 {
-#ifdef PERF_SUPPORT
    if (use_performance_counter)
    {
        this->_chronometer.add(benchmark::Perf::CYCLES);
        this->_chronometer.add(benchmark::Perf::INSTRUCTIONS);
-        this->_chronometer.add(benchmark::Perf::STALLS_MEM_ANY);
+        //this->_chronometer.add(benchmark::Perf::STALLS_MEM_ANY);
        this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_NTA);
        this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_WRITE);
    }
-#endif
    std::cout << "core configuration: \n" << this->_cores.dump(2) << std::endl;

    this->_workload.build(fill_workload_file, mixed_workload_file);
@@ -117,7 +115,18 @@ void Benchmark::requests_finished()

    if (open_requests == 0U) // All request schedulers are done.
    {
+        std::uint16_t core_id = mx::system::topology::core_id();
+        if (core_id != 0)
+        {
+            this->_open_requests++;
+            auto *stop_task = mx::tasking::runtime::new_task<StopMeasurementTask>(0U, *this);
+            stop_task->annotate(static_cast<mx::tasking::TaskInterface::channel>(0));
+            mx::tasking::runtime::spawn(*stop_task, core_id);
+            return;
+        }
+
        // Stop and print time (and performance counter).
+        //Genode::log("Stopping timer");
        const auto result = this->_chronometer.stop(this->_workload.size());
        mx::tasking::runtime::stop();

@@ -126,7 +135,7 @@ void Benchmark::requests_finished()
        //std::cout << result << std::endl;
        //if (mx::system::topology::core_id() == 0)
        //std::cout << result << "\t " << (_end - _start) << " cycles" << std::endl;
-        std::cout << result.to_json().dump() << std::endl;
+        std::cout << "core: " << mx::system::topology::core_id() << result.to_json().dump() << std::endl;
        

          //  std::cout << result << std::endl;
--- a/repos/mml/src/app/blinktree/blinktree_benchmark/benchmark.h
+++ b/repos/mml/src/app/blinktree/blinktree_benchmark/benchmark.h
@@ -110,6 +110,7 @@ private:
    [[nodiscard]] std::string profile_file_name() const;

    friend class StartMeasurementTask;
+    friend class StopMeasurementTask;
 };

 class StartMeasurementTask : public mx::tasking::TaskInterface
@@ -123,9 +124,26 @@ class StartMeasurementTask : public mx::tasking::TaskInterface

        mx::tasking::TaskResult execute(const std::uint16_t core_id, const std::uint16_t channel_id) override 
        {
+            //Genode::log("Starting timer");
            _benchmark._chronometer.start(static_cast<std::uint16_t>(static_cast<benchmark::phase>(_benchmark._workload)), _benchmark._current_iteration + 1, _benchmark._cores.current());
            //_benchmark._start = Genode::Trace::timestamp();
            return mx::tasking::TaskResult::make_remove();
        }
 };
+
+class StopMeasurementTask : public mx::tasking::TaskInterface // task whose execute() re-invokes Benchmark::requests_finished()
+{
+    private:
+        Benchmark &_benchmark; // benchmark on which requests_finished() is called
+
+    public:
+        constexpr StopMeasurementTask(Benchmark& benchmark) : _benchmark(benchmark) {}
+        ~StopMeasurementTask() override = default;
+
+        mx::tasking::TaskResult execute(const std::uint16_t core_id, const std::uint16_t channel_id) override  // core_id and channel_id are not used
+        {
+            _benchmark.requests_finished(); // NOTE(review): presumably meant to run the stop path on core 0, where measurement started - confirm
+            return mx::tasking::TaskResult::make_remove(); // NOTE(review): presumably tells the runtime to discard this one-shot task - confirm
+        }
+};
 } // namespace application::blinktree_benchmark
--- a/repos/mml/src/app/blinktree/blinktree_benchmark/main.cpp
+++ b/repos/mml/src/app/blinktree/blinktree_benchmark/main.cpp
@@ -9,6 +9,7 @@
 #include <tuple>
 #include <libc/component.h>
 #include <cstring>
+#include <cstdio>

 using namespace application::blinktree_benchmark;

@@ -202,13 +203,13 @@ void Libc::Component::construct(Libc::Env &env) {
    std::uint16_t cores = env.cpu().affinity_space().total();

    char cores_arg[10];
-    snprintf(cores_arg, 9, "1:%d", cores);
+    snprintf(cores_arg, sizeof(cores_arg), "%d", cores);

-    char *args[] = {"blinktree_benchmark", "-i", "4", "-pd", "3", cores_arg};
+    char *args[] = {"blinktree_benchmark", "-i", "4", "-pd", "3", "-p", cores_arg};

    Libc::with_libc([&]()
                    { 
                        std::cout << "Starting B-link tree benchmark" << std::endl;
-                        bt_main(env, 6, args); 
+                        bt_main(env, 7, args); 
                    });
 }
--- a/repos/mml/src/app/blinktree/target.mk
+++ b/repos/mml/src/app/blinktree/target.mk
@@ -1,4 +1,5 @@
 MXINC_DIR=$(REP_DIR)/src/app/blinktree
+GENODE_GCC_TOOLCHAIN_DIR ?= /usr/local/genode/tool/21.05

 TARGET = blinktree
 # soure file for benchmark framework
@@ -6,11 +7,18 @@ SRC_MXBENCH = benchmark/workload_set.cpp
 SRC_MXBENCH += benchmark/workload.cpp
 SRC_MXBENCH += benchmark/cores.cpp
 SRC_MXBENCH += benchmark/string_util.cpp
+SRC_MXBENCH += benchmark/perf.cpp
 # source files for blinktree benchmark
 SRC_BTREE += blinktree_benchmark/main.cpp
 SRC_BTREE += blinktree_benchmark/benchmark.cpp

 SRC_CC = ${SRC_MXBENCH} ${SRC_BTREE}
 LIBS += base libc stdcxx mxtasking 
-CC_OPT += -Wno-error -fno-aligned-new -I$(MXINC_DIR)
+EXT_OBJECTS += /usr/local/genode/tool/lib/clang/14.0.5/lib/linux/libclang_rt.builtins-x86_64.a /usr/local/genode/tool/lib/libatomic.a 
+CUSTOM_CC = /usr/local/genode/tool/bin/clang
+CUSTOM_CXX = /usr/local/genode/tool/bin/clang++
+CC_OPT += --target=x86_64-genode --sysroot=/does/not/exist --gcc-toolchain=$(GENODE_GCC_TOOLCHAIN_DIR) -Wno-error -O2 -g -fno-aligned-new -DNDEBUG -I$(MXINC_DIR) -std=c++17 #-D_GLIBCXX_ATOMIC_BUILTINS_8 -D__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+CC_OPT +=  -femulated-tls -DCLANG_CXX11_ATOMICS
 CC_CXX_WARN_STRICT =
+CUSTOM_CXX_LIB := $(CROSS_DEV_PREFIX)g++
+#CXX_LD += $(CROSS_DEV_PREFIX)g++ 
--- a/repos/mml/src/app/hpc_test/main.cc
+++ b/repos/mml/src/app/hpc_test/main.cc
@@ -0,0 +1,89 @@
+/**
+ * @file main.cc
+ * @author Michael Müller (michael.mueller@uos.de)
+ * @brief Tests for programming hardware performance counters in NOVA
+ * @version 0.1
+ * @date 2022-12-14
+ * 
+ * @copyright Copyright (c) 2022
+ * 
+ */
+
+#include <nova/syscall-generic.h>
+#include <nova/syscalls.h>
+
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <x86intrin.h>
+
+int main(void)
+{
+    Nova::mword_t event = 0x26;
+    Nova::mword_t mask = 0x00;
+    Nova::mword_t flags = 0x70000;
+    Nova::uint8_t rc;
+    // Program counters 0..2, start them, then report their values every 2 s.
+    if ((rc = Nova::hpc_ctrl(Nova::HPC_SETUP, 0, 1, event, mask, flags)) != Nova::NOVA_OK) {
+        std::cerr << "Failed to setup performance counter 0" << std::endl;
+        return -1;
+    }
+
+    std::cout << "Counter 0 setup" << std::endl;
+    event = 0x60; // counter 1: read back below as l2_requests
+    mask = 0xfe;
+    if ((rc = Nova::hpc_ctrl(Nova::HPC_SETUP, 1, 1, event, mask, flags)) != Nova::NOVA_OK)
+    {
+        std::cerr << "Failed to setup performance counter 1, rc = " <<  static_cast<Nova::uint32_t>(rc) << std::endl;
+        return -1;
+    }
+
+    event = 0x62; // counter 2: read back below as latency
+    mask = 0x1;
+    if ((rc = Nova::hpc_ctrl(Nova::HPC_SETUP, 2, 1, event, mask, flags)) != Nova::NOVA_OK)
+    {
+        std::cerr << "Failed to setup performance counter 2, rc = " <<  static_cast<Nova::uint32_t>(rc) << std::endl;
+        return -1;
+    }
+    if ((rc = Nova::hpc_start(0, 1)) != Nova::NOVA_OK) {
+        std::cerr << "Failed to start counter 0" << std::endl;
+        return -2;
+    }
+
+    if ((rc = Nova::hpc_start(1, 1)) != Nova::NOVA_OK) {
+        std::cerr << "Failed to start counter 1" << std::endl;
+        return -2;
+    }
+
+    if ((rc = Nova::hpc_start(2, 1)) != Nova::NOVA_OK) {
+        std::cerr << "Failed to start counter 2" << std::endl;
+        return -2;
+    }
+
+    for (;;) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(2000));
+        Nova::mword_t count = 0;
+        // One explicit flush per iteration; presumably gives counter 0 an event to record.
+        _mm_clflush(&count);
+        if ((rc = Nova::hpc_read(0, 1, count)) != Nova::NOVA_OK)
+        {
+            std::cerr << "Failed to read counter 0" << std::endl;
+        }
+        std::cout << count << " cache line flushes" << std::endl;
+        // L2 latency estimate: counter 2 (latency) scaled by 4, divided by counter 1 (requests).
+        Nova::mword_t latency = 0;
+        if ((rc = Nova::hpc_read(2, 1, latency)) != Nova::NOVA_OK)
+        {
+            std::cerr << "Failed to read counter 2" << std::endl;
+        }
+        Nova::mword_t l2_requests = 0;
+        if ((rc = Nova::hpc_read(1, 1, l2_requests)) != Nova::NOVA_OK)
+        {
+            std::cerr << "Failed to read counter 1" << std::endl;
+        }
+        count = (l2_requests != 0) ? (latency * 4) / l2_requests : 0; // a failed or idle read leaves 0: do not divide by it
+        std::cout << "L2 latency:" << count << " cycles" << std::endl;
+    }
+
+    return 0;
+}
--- a/repos/mml/src/app/hpc_test/target.mk
+++ b/repos/mml/src/app/hpc_test/target.mk
@@ -0,0 +1,5 @@
+TARGET = hpc_test
+SRC_CC = trace_pfc.cc
+LIBS += base posix libm libc stdcxx 
+CC_OPT += -Wno-error -Wno-permissive -fpermissive -Wno-error=conversion
+
--- a/repos/mml/src/app/hpc_test/trace_pfc.cc
+++ b/repos/mml/src/app/hpc_test/trace_pfc.cc
@@ -0,0 +1,105 @@
+/**
+ * @file trace_pfc.cc
+ * @author Michael Müller (michael.mueller@uos.de)
+ * @brief Tests for Genode wrappers around Performance counter syscalls in NOVA
+ * @version 0.1
+ * @date 2022-12-15
+ * 
+ * @copyright Copyright (c) 2022
+ * 
+ */
+
+#include <base/trace/perf.h>
+
+#include <iostream>
+#include <chrono>
+#include <thread>
+#include <x86intrin.h>
+
+using namespace Genode;
+
+/**
+ * Smoke test for the Trace::Performance_counter wrappers.
+ *
+ * Acquires four performance counters (three via alloc_core(), one via
+ * acquire() with Type::CORE), programs each with a raw event/mask/flags
+ * triple and starts them. Afterwards it loops forever: sleep two seconds,
+ * execute two CLFLUSH instructions, read all four counters and print the
+ * cache-line-flush and L2-prefetch values.
+ *
+ * NOTE(review): the event, mask and flag values passed to setup() are raw
+ * hardware encodings; what they count is inferred from the variable names
+ * only -- confirm against the manual of the target processor.
+ *
+ * @return -1 if acquiring, configuring or starting a counter fails,
+ *          1 if reading a counter fails; otherwise the loop never exits.
+ */
+int main(void)
+{
+    Trace::Performance_counter::Counter ctr_clflush, ctr_l2_latency, ctr_l2_requests, /*ctr_l3_miss,*/ ctr_l2_prefetch;
+
+    /* Step 1: acquire the counters */
+    try {
+        ctr_clflush = Trace::Performance_counter::alloc_core();
+        ctr_l2_latency = Trace::Performance_counter::alloc_core();
+        ctr_l2_requests = Trace::Performance_counter::alloc_core();
+        ctr_l2_prefetch = Trace::Performance_counter::acquire(Trace::Performance_counter::Type::CORE);
+        // ctr_l3_miss = Trace::Performance_counter::alloc_cbo();
+    }
+    catch (const Trace::Pfc_no_avail &)
+    {
+        /* caught by const reference: no copy of the exception object is made */
+        std::cout << "Unable to allocate performance counters." << std::endl;
+        return -1;
+    }
+
+    std::cout << "Performance counter allocation successful." << std::endl;
+
+    /* Step 2: program each counter with (event, mask, flags) */
+    try {
+        Trace::Performance_counter::setup(ctr_clflush, 0x26, 0x00, 0x70000);
+        Trace::Performance_counter::setup(ctr_l2_latency, 0x62, 0x01, 0x30000);
+        Trace::Performance_counter::setup(ctr_l2_requests, 0x60, 0xfe, 0x30000);
+        Trace::Performance_counter::setup(ctr_l2_prefetch, 0xc0, 0x00, 0x30000);
+        //Trace::Performance_counter::setup(ctr_l3_miss, 0x6, 0xff, 0x550f000000000000);
+    } catch (Trace::Pfc_access_error &e) {
+        std::cerr << "PFC access failed. rc=" << e.error_code() << std::endl;
+        return -1;
+    }
+
+    std::cout << "Performance counters successfully set up." << std::endl;
+
+    /* Step 3: start counting */
+    try {
+        Trace::Performance_counter::start(ctr_clflush);
+        Trace::Performance_counter::start(ctr_l2_latency);
+        Trace::Performance_counter::start(ctr_l2_requests);
+        Trace::Performance_counter::start(ctr_l2_prefetch);
+        //Trace::Performance_counter::start(ctr_l3_miss);
+    } catch (Trace::Pfc_access_error &e) {
+        std::cerr << "PFC access failed. rc=" << e.error_code() << std::endl;
+        return -1;
+    }
+
+    std::cout << "Performance counters started." << std::endl;
+
+    /* Step 4: sample the counters every two seconds, forever */
+    for (;;) {
+        Genode::uint64_t clflushes, latency, requests, /*l3_misses,*/ l2_prefetches;
+        clflushes = latency = requests = l2_prefetches = 0;
+
+        std::this_thread::sleep_for(std::chrono::seconds(2));
+
+        /*
+         * Two explicit cache-line flushes per iteration; presumably they
+         * give ctr_clflush something to count -- TODO confirm.
+         */
+        _mm_clflush(&clflushes);
+        _mm_clflush(&clflushes);
+
+        try {
+            clflushes = Trace::Performance_counter::read(ctr_clflush);
+            latency = Trace::Performance_counter::read(ctr_l2_latency);
+            requests = Trace::Performance_counter::read(ctr_l2_requests);
+            l2_prefetches = Trace::Performance_counter::read(ctr_l2_prefetch);
+            //l3_misses = Trace::Performance_counter::read(ctr_l3_miss);
+        } catch (Trace::Pfc_access_error &e) {
+            std::cerr << "PFC access failed. rc=" << e.error_code() << std::endl;
+            return 1;
+        }
+
+        /* latency and requests are read but currently not reported */
+        std::cout << clflushes << " cache line flushes." << std::endl;
+        //std::cout << "L2 latency: " << (latency * 4) / requests << " cycles." << std::endl;
+        std::cout << l2_prefetches << " L2 prefetch requests." << std::endl;
+   /*
+        try {
+            Trace::Performance_counter::stop(ctr_l2_prefetch);
+            Trace::Performance_counter::reset(ctr_l2_prefetch, 0xdeadbeef);
+            Trace::Performance_counter::start(ctr_l2_prefetch);
+            std::cout << Trace::Performance_counter::read(ctr_l2_prefetch) << " L2 prefetches after context-switch" << std::endl;
+            Trace::Performance_counter::stop(ctr_l2_prefetch);
+            Trace::Performance_counter::reset(ctr_l2_prefetch, l2_prefetches);
+            Trace::Performance_counter::start(ctr_l2_prefetch);
+        } catch (Trace::Pfc_access_error &e) {
+            std::cerr << "PFC access failed. rc=" << e.error_code() << std::endl;
+        }
+*/
+        // std::cout << l3_misses << " L3 misses" << std::endl;
+    }
+
+    /* not reached: the loop above has no exit besides the error return */
+    return 0;
+}
--- a/repos/mml/src/app/libpfm_test/check_events.c
+++ b/repos/mml/src/app/libpfm_test/check_events.c
@@ -0,0 +1,174 @@
+/*
+ * check_events.c - show event encoding
+ *
+ * Copyright (c) 2009 Google, Inc
+ * Contributed by Stephane Eranian <eranian@gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do so,
+ * subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+ * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+ * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+ * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * This file is part of libpfm, a performance monitoring support library for
+ * applications on Linux.
+ */
+#include <sys/types.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <perfmon/err.h>
+
+#include <perfmon/pfmlib.h>
+
+/*
+ * Report whether PMU model `p` is flagged as present by libpfm.
+ *
+ * Returns the is_present field filled in by pfm_get_pmu_info(), or 0 when
+ * that query does not return PFM_SUCCESS.
+ */
+int pmu_is_present(pfm_pmu_t p)
+{
+    pfm_pmu_info_t details;
+
+    /* hand a fully zeroed structure to the library */
+    memset(&details, 0, sizeof(details));
+
+    if (pfm_get_pmu_info(p, &details) != PFM_SUCCESS)
+        return 0;
+
+    return details.is_present;
+}
+
+/*
+ * List the PMU models libpfm supports and those it detects, then print the
+ * raw encoding of each event named on the command line.
+ *
+ * When no event is passed and the perf_event PMU is present, the events
+ * PERF_COUNT_HW_CPU_CYCLES and PERF_COUNT_HW_INSTRUCTIONS are used instead.
+ *
+ * argc/argv: event names are taken from argv[1] onwards.
+ *
+ * Returns 0 after all events have been printed; every failure is reported
+ * through errx(1, ...).
+ */
+int main(int argc, const char **argv)
+{
+    pfm_pmu_info_t pinfo;
+    pfm_pmu_encode_arg_t e;
+    const char *arg[3];
+    const char **p = NULL;
+    char *fqstr;
+    pfm_event_info_t info;
+    int j, ret;
+    pfm_pmu_t i;
+    int total_supported_events = 0;
+    int total_available_events = 0;
+
+    /*
+     * Initialize pfm library (required before we can use it)
+     */
+    ret = pfm_initialize();
+    if (ret != PFM_SUCCESS)
+        errx(1, "cannot initialize library: %s\n", pfm_strerror(ret));
+
+    memset(&pinfo, 0, sizeof(pinfo));
+    memset(&info, 0, sizeof(info));
+
+    printf("Supported PMU models:\n");
+    for (i = PFM_PMU_NONE; i < PFM_PMU_MAX; i++)
+    {
+        ret = pfm_get_pmu_info(i, &pinfo);
+        if (ret != PFM_SUCCESS)
+            continue;
+
+        printf("\t[%d, %s, \"%s\"]\n", i, pinfo.name, pinfo.desc);
+    }
+
+    /*
+     * Second pass: list only the PMUs flagged as present and sum up the
+     * event counts (all known PMUs vs. present PMUs).
+     */
+    printf("Detected PMU models:\n");
+    for (i = PFM_PMU_NONE; i < PFM_PMU_MAX; i++)
+    {
+        ret = pfm_get_pmu_info(i, &pinfo);
+        if (ret != PFM_SUCCESS)
+            continue;
+        if (pinfo.is_present)
+        {
+            printf("\t[%d, %s, \"%s\"]\n", i, pinfo.name, pinfo.desc);
+            total_supported_events += pinfo.nevents;
+        }
+        total_available_events += pinfo.nevents;
+    }
+
+    printf("Total events: %d available, %d supported\n", total_available_events, total_supported_events);
+
+    /*
+     * Pick the event list. argv + 1 is only used when an argument really
+     * exists, so argv is never read past its end (argc may be 0).
+     * Without arguments: be nice to user and fall back to two default
+     * events if the perf_event PMU is available.
+     */
+    if (argc >= 2)
+    {
+        p = argv + 1;
+    }
+    else if (pmu_is_present(PFM_PMU_PERF_EVENT))
+    {
+        arg[0] = "PERF_COUNT_HW_CPU_CYCLES";
+        arg[1] = "PERF_COUNT_HW_INSTRUCTIONS";
+        arg[2] = NULL;
+        p = arg;
+    }
+
+    if (!p || !*p)
+        errx(1, "you must pass at least one event");
+
+    memset(&e, 0, sizeof(e));
+    while (*p)
+    {
+        /*
+         * extract raw event encoding
+         *
+         * For perf_event encoding, use
+         * #include <perfmon/pfmlib_perf_event.h>
+         * and the function:
+         * pfm_get_perf_event_encoding()
+         */
+        fqstr = NULL;
+        e.fstr = &fqstr;
+        ret = pfm_get_os_event_encoding(*p, PFM_PLM0 | PFM_PLM3, PFM_OS_NONE, &e);
+        if (ret != PFM_SUCCESS)
+        {
+            /*
+             * codes is too small for this event
+             * free and let the library resize
+             * (p is not advanced, so the same event is retried)
+             */
+            if (ret == PFM_ERR_TOOSMALL)
+            {
+                free(e.codes);
+                e.codes = NULL;
+                e.count = 0;
+                free(fqstr);
+                continue;
+            }
+            if (ret == PFM_ERR_NOTFOUND && strstr(*p, "::"))
+                errx(1, "%s: try setting LIBPFM_ENCODE_INACTIVE=1", pfm_strerror(ret));
+            errx(1, "cannot encode event %s: %s", *p, pfm_strerror(ret));
+        }
+        ret = pfm_get_event_info(e.idx, PFM_OS_NONE, &info);
+        if (ret != PFM_SUCCESS)
+            errx(1, "cannot get event info: %s", pfm_strerror(ret));
+
+        ret = pfm_get_pmu_info(info.pmu, &pinfo);
+        if (ret != PFM_SUCCESS)
+            errx(1, "cannot get PMU info: %s", pfm_strerror(ret));
+
+        printf("Requested Event: %s\n", *p);
+        printf("Actual    Event: %s\n", fqstr);
+        printf("PMU            : %s\n", pinfo.desc);
+        printf("IDX            : %d\n", e.idx);
+        printf("Codes          :");
+        for (j = 0; j < e.count; j++)
+            printf(" 0x%" PRIx64, e.codes[j]);
+        putchar('\n');
+
+        free(fqstr);
+        p++;
+    }
+
+    /* free(NULL) is a no-op, so no guard is needed */
+    free(e.codes);
+    return 0;
+}
--- a/repos/mml/src/app/libpfm_test/showevtinfo.c
+++ b/repos/mml/src/app/libpfm_test/showevtinfo.c
--- a/repos/mml/src/app/libpfm_test/target.mk
+++ b/repos/mml/src/app/libpfm_test/target.mk
@@ -0,0 +1,5 @@
+TARGET = libpfm_test
+SRC_CC = check_events.c
+LIBS += base posix libm libc stdcxx libpfm4 
+CC_OPT += -Wno-error -Wno-permissive -fpermissive
+
--- a/repos/mml/src/app/thread_test/target.mk
+++ b/repos/mml/src/app/thread_test/target.mk
@@ -1,4 +1,4 @@
 TARGET = thread_test
 SRC_CC = thread_test.cc 
-LIBS += base stdcxx 
+LIBS += base libc stdcxx 
 CXXFLAGS += -Wno-error
				`@@ -0,0 +1 @@`
				`INC_DIR += $(call select_from_ports,libpfm4)/include`