/*
* \brief Syscall bindings for the Tukija (originally NOVA) microhypervisor
* \author Norman Feske
* \author Sebastian Sumpf
* \author Alexander Boettcher
* \author Benjamin Lamowski
* \author Michael Müller
* \date 2009-12-27
*/
/*
* Copyright (c) 2009-2023 Genode Labs
* Copyright (c) 2025 Michael Müller, Osnabrück University
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _INCLUDE__NOVA__SYSCALL_GENERIC_H_
#define _INCLUDE__NOVA__SYSCALL_GENERIC_H_
#include <tukija/stdint.h>
#include <tukija/atomic.h>
#include <tukija/bits.h>
#include <base/affinity.h>
namespace Tukija {
enum {
PAGE_SIZE_LOG2 = 12,
PAGE_SIZE_BYTE = 1 << PAGE_SIZE_LOG2,
PAGE_MASK_ = ~(PAGE_SIZE_BYTE - 1)
};
/**
* NOVA and Tukija system-call IDs
*/
enum Syscall {
NOVA_CALL = 0x0,
NOVA_REPLY = 0x1,
NOVA_CREATE_PD = 0x2,
NOVA_CREATE_EC = 0x3,
NOVA_CREATE_SC = 0x4,
NOVA_CREATE_PT = 0x5,
NOVA_CREATE_SM = 0x6,
NOVA_REVOKE = 0x7,
NOVA_MISC = 0x8, /* lookup, delegate, acpi_suspend */
NOVA_EC_CTRL = 0x9,
NOVA_SC_CTRL = 0xa,
NOVA_PT_CTRL = 0xb,
NOVA_SM_CTRL = 0xc,
NOVA_ASSIGN_PCI = 0xd,
NOVA_ASSIGN_GSI = 0xe,
NOVA_PD_CTRL = 0xf,
TUKIJA_CREATE_CELL = 0x10,
TUKIJA_ALLOCATE = 0x11,
TUKIJA_CELL_CTRL = 0x12,
TUKIJA_RELEASE = 0x13,
};
/**
* Tukija operations
*/
enum Cell_control
{
UPDATE_AFFINITY = 0,
};
enum Resource_type
{
CPU_CORE = 0,
};
enum Release_op
{
RELEASE = 0,
RETURN_TO_OWNER = 1,
};
/**
* NOVA status codes returned by system-calls
*/
enum Status
{
NOVA_OK = 0,
NOVA_TIMEOUT = 1,
NOVA_IPC_ABORT = 2,
NOVA_INV_HYPERCALL = 3,
NOVA_INV_SELECTOR = 4,
NOVA_INV_PARAMETER = 5,
NOVA_INV_FEATURE = 6,
NOVA_INV_CPU = 7,
NOVA_INVD_DEVICE_ID = 8,
NOVA_PD_OOM = 9,
};
/**
* CPU Set
*/
class Cpuset
{
private:
enum {CPUS_PER_VALUE = sizeof(mword_t) * 8};
mword_t raw[1 + (256 - 1) / CPUS_PER_VALUE];
inline mword_t &value(unsigned const cpu) {
return raw[cpu / CPUS_PER_VALUE];
}
inline mword_t const &value(unsigned const cpu) const {
return raw[cpu / CPUS_PER_VALUE];
}
inline mword_t bit_cpu(unsigned const cpu) const {
return cpu % CPUS_PER_VALUE;
}
public:
inline explicit Cpuset(mword_t const v)
{
for (unsigned i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
raw[i] = v;
}
inline bool chk(unsigned const cpu) const {
return value(cpu) & (1UL << bit_cpu(cpu));
}
inline void set(unsigned const cpu) {
Atomic::test_set_bit(value(cpu), bit_cpu(cpu));
}
inline void clr(unsigned const cpu) {
Atomic::test_clr_bit(value(cpu), bit_cpu(cpu));
}
inline void merge(Cpuset const &s)
{
for (unsigned i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
Atomic::set_mask(value(i * CPUS_PER_VALUE), s.value(i * CPUS_PER_VALUE));
}
inline void clear()
{
for (unsigned i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
Atomic::clr_mask(value(i * CPUS_PER_VALUE), ~0UL); /* clear every bit of each word */
}
template <typename T>
void for_each(T const fn)
{
long cpu = 0;
for (unsigned i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
{
mword_t subset = raw[i];
while ((cpu = bit_scan_forward(subset)) != -1)
{
Atomic::test_clr_bit(subset, bit_cpu(static_cast<unsigned int>(cpu)));
fn((cpu+i*CPUS_PER_VALUE));
}
}
}
unsigned count()
{
unsigned count = 0;
for (unsigned i = 0; i < sizeof(raw) / sizeof(raw[0]); i++) {
count += static_cast<unsigned>(popcount(raw[i]));
}
return count;
}
};
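/*
* Usage sketch (illustrative only, not part of the kernel interface): maintaining
* and inspecting a CPU set. The CPU numbers used here are made up.
*
*   Cpuset set(0);                     // all words zeroed, i.e. empty set
*   set.set(2);                        // mark CPU 2 as member
*   set.set(5);                        // mark CPU 5 as member
*   if (set.chk(2)) { }                // test membership of CPU 2
*   set.for_each([] (long cpu) { });   // visit each member CPU once
*   unsigned const n = set.count();    // number of member CPUs
*/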
/**
* Cell information pages
*
*/
class Cip
{
public:
/**
* \brief Per-worker thread information
*
* \details Workers represent arbitrary computing resources (e.g. CPU cores).
* Tukija grants workers to or revokes them from a cell by adjusting the cell's
* CPU-core allocation. To allow a cell to react to the revocation of workers,
* Tukija employs a shared-memory structure containing a flag that signals a
* yield request. The cell is expected to poll this flag regularly in user space,
* e.g. after the execution of an MxTask.
*/
struct Worker {
volatile unsigned long yield_flag{0}; /* This flag will be set if a yield request has been filed */
unsigned long padding[3];
};
/**
* @brief Information about MxTasking channels
*
* @details A channel is the representation of a task queue, holding a set of tasks to execute.
* Every task is enqueued in exactly one channel and each channel is processed by at most one
* worker at any given time. To ensure the progress of a cell, Tukija expects cells to implement
* load-balancing strategies that ensure each channel's tasks are executed by a worker thread.
* However, Tukija does not require that every allocated core and its worker process a channel.
*/
struct Channels {
volatile unsigned short remainder{0}; /* Number of channels that remain unstolen, after each worker has stolen `limit` many channels*/
volatile unsigned short limit{0}; /* Number of channels each worker is allowed to steal when core allocation has changed */
unsigned int count{0}; /* Total number of channels this cell uses. */
};
alignas(64) Worker worker_info[256];
Channels channel_info;
/**
* @brief The set of CPU cores currently allocated to this cell.
*
*/
Cpuset cores_current;
/**
* @brief The set of CPU cores this cell may lay claim to.
*
* @details Hoitaja prepartitions all CPU cores among the currently running cells.
* Each of these partitions represents the optimal allocation according to the cell's priority.
* However, to achieve better utilization, Tukija may assign more or fewer cores to a cell
* than are contained in this set.
*/
Cpuset cores_reserved;
/**
* @brief The set of cores Tukija has chosen in response to the last allocation request.
*
*/
Cpuset cores_new;
unsigned idx_to_phys_cpu_id[256]; /* Mapping from pager index to kernel cpu ID */
Genode::Affinity::Space habitat_affinity; /* the affinity space the corresponding cell lives in */
Genode::Affinity::Location location; /* Location of the cell within its habitat */
/**
* @brief Return the sanitized kernel CPU ID for a given location
*
* @param loc - the location for which to request the kernel CPU ID
* @return unsigned - the kernel CPU ID of the given location
*/
unsigned location_to_kernel_cpu(Genode::Affinity::Location const &loc)
{
Genode::Affinity::Location loc_in_habitat = loc.transpose(location.xpos(), location.ypos());
unsigned idx = (loc_in_habitat.xpos() * habitat_affinity.height() + loc_in_habitat.ypos()) % habitat_affinity.total();
return idx_to_phys_cpu_id[idx];
}
/**
* @brief Return the worker-information structure for the worker at the given location
*
* @details Provides the kernel's worker information for the worker residing on the CPU
* specified by location. User space must use it to retrieve the yield_flag for that CPU.
* The kernel expects each user-space cell to regularly poll the yield_flag of each of its
* granted CPUs to ensure that borrowed CPU cores are returned to their rightful owner.
*
* @param location - location of the worker for which to request the information struct
* @return struct Worker& - the information struct for the worker
*/
struct Worker &worker_for_location(Genode::Affinity::Location const &location)
{
return worker_info[location_to_kernel_cpu(location)];
}
/**
* @brief Returns a pointer to the cell information pages from within the address space
* of the respective cell.
* @details This pointer is used by the user-space runtime environment of the cell to get
* information about pending yield requests, changes in core allocation, and parameters for
* adjusting the assignment of channels to workers.
*
* @return Cip* pointer to the cell's information pages
*/
static Cip *
cip()
{
return reinterpret_cast<Cip *>(0x7FFFBFFDC000);
}
};
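/*
* Usage sketch (illustrative only): how a cell's user-level runtime might poll its
* yield flag after finishing a unit of work. 'my_location' stands for the
* Affinity::Location of the CPU the worker currently runs on and is an assumption
* of this example.
*
*   Cip &cip = *Cip::cip();
*   Cip::Worker &worker = cip.worker_for_location(my_location);
*   if (worker.yield_flag) {
*       // stop picking up new tasks and return the borrowed core,
*       // e.g. by issuing a release request to the kernel
*   }
*/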
/**
* Topology information pages
*/
class Tip
{
public:
/**************/
/* Exceptions */
/**************/
/**
* @brief Exception indicating that the requested NUMA domain could not be found.
*
*/
struct Domain_not_found {
};
/**
* @brief Exception signaling that a NUMA domain has no memory region
*
*/
struct Domain_has_no_memory_regions {
};
/***
* Data structure definitions
*/
/**
* @brief Describes the physical address range of a NUMA domain
*
*/
struct Memory_region {
void *start;
void *end;
};
/**
* @brief Describes either an ACPI or a PCI device and its membership in a NUMA domain.
*
*/
struct Device {
enum dev_type
{
ACPI = 0,
PCI = 1
};
dev_type type;
union {
struct {
uint64_t hid;
uint32_t uid;
} acpi_handle;
struct {
uint16_t segment;
uint16_t bdf;
} pci_handle;
};
};
/**
* @brief Representation of a NUMA domain
*
* @details A NUMA domain consists of a (possibly empty) set of CPU cores, a physical address
* range, and a set of devices. The information provided by a NUMA domain structure can be
* used for NUMA-aware placement of data objects and tasks.
*
*/
struct Domain {
uint32_t id;
uint8_t num_mem_descriptors;
uint8_t num_devices;
Memory_region memory_regions[32];
Device devices[32];
Cpuset cpus{0};
template <typename T>
void for_each_mem(T const fn) {
for (uint8_t i = 0; i < num_mem_descriptors; i++)
fn(memory_regions[i]);
}
};
/** Page layout */
uint16_t length{8}; /* Length of the Topology information page */
uint32_t cpu_to_domain[256]; /* Mapping of CPU to NUMA IDs */
Domain nodes[]; /* Set of detected NUMA domains */
/* Parsing functions */
/**
* @brief Applies the function fn to all detected NUMA domain structures
*
* @tparam T - type parameter used for the function lambda provided by fn
* @param fn - a function (e.g. a lambda closure) to execute on each NUMA domain.
*/
template <typename T>
void for_each(T const fn) {
mword_t const node_cnt = (reinterpret_cast<mword_t>(&nodes) + length - reinterpret_cast<mword_t>(nodes)) / sizeof(Domain);
Domain *dom = nodes;
for (unsigned i = 0; i < node_cnt; i++) {
dom = nodes + i;
fn(*dom);
}
}
/**
* @brief Looks up the NUMA domain that the CPU with ID cpu belongs to.
*
* @param cpu | the ID of the CPU whose NUMA domain should be looked up
* @return Domain& | the NUMA domain the CPU belongs to
* @throws Domain_not_found if the CPU cannot be linked to a NUMA domain
*/
Domain const &dom_of_cpu(unsigned cpu) {
unsigned id = cpu_to_domain[cpu];
unsigned idx = 0;
mword_t const node_cnt = (reinterpret_cast<mword_t>(&nodes) + length - reinterpret_cast<mword_t>(nodes)) / sizeof(Domain);
for (; idx < node_cnt; idx++) {
if (nodes[idx].id == id)
break;
}
/* no domain with the requested id was found */
if (idx >= node_cnt)
throw Domain_not_found();
Domain const &d = nodes[idx];
if (!d.cpus.chk(cpu))
throw Domain_not_found();
return d;
}
/**
* @brief Applies a function on a NUMA domain structure
*
* @tparam T
* @param fn | the function to apply
* @param id | the ID of the NUMA domain
*/
template <typename T>
void on_node(T const fn, uint32_t id)
{
mword_t const node_cnt = (reinterpret_cast<mword_t>(nodes) + length - reinterpret_cast<mword_t>(nodes)) / sizeof(Domain);
Domain *dom = nodes;
for (unsigned i = 0; i < node_cnt; i++) {
dom = nodes + i;
if (dom->id == id)
break;
}
if (dom->id != id)
return;
fn(*dom);
}
/**
* @brief Returns the memory regions of a NUMA domain
*
* @param dom_id - the ID of the NUMA domain
* @param region_count - out parameter, set to the number of memory regions of the domain
* @return Memory_region& - the first memory region of this NUMA domain
*/
Memory_region &memory_for_domain(uint32_t dom_id, uint8_t *&region_count) {
Memory_region *mem = nullptr;
on_node([&](Domain &dom) {
mem = dom.memory_regions;
region_count = &dom.num_mem_descriptors;
}, dom_id);
if (!mem || *region_count == 0) {
throw Domain_has_no_memory_regions();
}
return *mem;
}
inline static Tip const *tip() {
return reinterpret_cast<Tip *>(0x7fffbffe0000);
}
};
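/*
* Usage sketch (illustrative only): walking the NUMA topology published in the
* topology information page. The const_cast is needed because tip() returns a
* const pointer while the traversal helpers are non-const.
*
*   Tip *tip = const_cast<Tip *>(Tip::tip());
*   tip->for_each([] (Tip::Domain &dom) {
*       dom.for_each_mem([] (Tip::Memory_region &region) {
*           // region.start .. region.end belongs to NUMA domain dom.id
*       });
*   });
*/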
/**
* Hypervisor information page
*/
struct Hip
{
struct Mem_desc
{
enum Type {
EFI_SYSTEM_TABLE = -7,
HYPERVISOR_LOG = -6,
FRAMEBUFFER = -5,
ACPI_XSDT = -4,
ACPI_RSDT = -3,
MULTIBOOT_MODULE = -2,
MICROHYPERVISOR = -1,
AVAILABLE_MEMORY = 1,
RESERVED_MEMORY = 2,
ACPI_RECLAIM_MEMORY = 3,
ACPI_NVS_MEMORY = 4
};
uint64_t const addr;
uint64_t const size;
Type const type;
uint32_t const aux;
};
uint32_t const signature; /* magic value 0x41564f4e */
uint16_t const hip_checksum;
uint16_t const hip_length;
uint16_t const cpu_desc_offset;
uint16_t const cpu_desc_size;
uint16_t const mem_desc_offset;
uint16_t const mem_desc_size;
uint32_t const feature_flags;
uint32_t const api_version;
uint32_t const sel; /* number of cap selectors */
uint32_t const sel_exc; /* number of cap selectors for exceptions */
uint32_t const sel_vm; /* number of cap selectors for VM handling */
uint32_t const sel_gsi; /* number of global system interrupts */
uint32_t const page_sizes; /* supported page sizes */
uint32_t const utcb_sizes; /* supported utcb sizes */
uint32_t const tsc_freq; /* time-stamp counter frequency in kHz */
uint32_t const bus_freq; /* bus frequency in kHz */
mword_t const topo_model;
Genode::addr_t const topo_phys;
bool has_feature_iommu() const { return feature_flags & (1 << 0); }
bool has_feature_vmx() const { return feature_flags & (1 << 1); }
bool has_feature_svm() const { return feature_flags & (1 << 2); }
struct Cpu_desc {
uint8_t flags;
uint8_t thread;
uint8_t core;
uint8_t package;
uint8_t acpi_id;
uint8_t family;
uint8_t model;
uint8_t stepping:4;
uint8_t platform:3;
uint8_t reserved:1;
uint32_t patch;
bool p_core() const { return flags & 0x2; }
bool e_core() const { return flags & 0x4; }
} __attribute__((packed));
unsigned cpu_max() const {
return (mem_desc_offset - cpu_desc_offset) / cpu_desc_size; }
unsigned cpus() const {
unsigned cpu_num = 0;
for (unsigned i = 0; i < cpu_max(); i++)
if (is_cpu_enabled(i))
cpu_num++;
return cpu_num;
}
Cpu_desc const * cpu_desc_of_cpu(unsigned i) const {
if (i >= cpu_max())
return nullptr;
unsigned long desc_addr = reinterpret_cast<unsigned long>(this) +
cpu_desc_offset + i * cpu_desc_size;
return reinterpret_cast<Cpu_desc const *>(desc_addr);
}
bool is_cpu_enabled(unsigned i) const {
Cpu_desc const * const desc = cpu_desc_of_cpu(i);
return desc ? desc->flags & 0x1 : false;
}
/**
* Re-sort CPU ids such that
* - the boot CPU always gets logical CPU id 0
* - SMT threads of one CPU get logical CPU ids close together
* - P-cores get smaller logical CPU ids than E-cores
*
* Returns true if the re-mapping succeeded, otherwise false.
*
* In case of failure, map_cpus will contain a 1:1 fallback mapping
* without any of the sorting mentioned above.
*/
bool remap_cpu_ids(uint16_t *map_cpus, unsigned const max_cpus,
unsigned const boot_cpu) const
{
unsigned const num_cpus = cpus();
bool too_many_cpus = false;
unsigned cpu_i = 0;
/* fallback lambda in case re-ordering fails */
auto remap_failure = [&] {
for (uint16_t i = 0; i < max_cpus; i++) { map_cpus[i] = i; }
return false;
};
/* always assign the boot CPU the virtual CPU id 0 */
Cpu_desc const * const boot = cpu_desc_of_cpu(boot_cpu);
if (!boot)
return remap_failure();
map_cpus[cpu_i++] = (uint16_t)boot_cpu;
if (cpu_i >= num_cpus)
return true;
if (cpu_i >= max_cpus)
return remap_failure();
/* assign cores + SMT threads first and skip E-cores */
bool done = for_all_cpus([&](auto const &cpu, auto const kernel_cpu_id) {
if (kernel_cpu_id == boot_cpu)
return false;
/* handle normal or P-core */
if (cpu.e_core())
return false;
map_cpus[cpu_i++] = (uint16_t)kernel_cpu_id;
too_many_cpus = !!(cpu_i >= max_cpus);
return (cpu_i >= num_cpus || too_many_cpus);
});
if (done)
return too_many_cpus ? remap_failure() : true;
/* assign remaining E-cores */
done = for_all_cpus([&](auto &cpu, auto &kernel_cpu_id) {
if (kernel_cpu_id == boot_cpu)
return false;
/* handle solely E-core */
if (!cpu.e_core())
return false;
map_cpus[cpu_i++] = (uint16_t)kernel_cpu_id;
too_many_cpus = !!(cpu_i >= max_cpus);
return (cpu_i >= num_cpus || too_many_cpus);
});
return too_many_cpus ? remap_failure() : done;
}
/**
* Iterate over all CPUs in an _always_ _consistent_ order.
*/
bool for_all_cpus(auto const &fn) const
{
for (uint16_t package = 0; package <= 255; package++) {
for (uint16_t core = 0; core <= 255; core++) {
for (uint16_t thread = 0; thread <= 255; thread++) {
for (unsigned i = 0; i < cpu_max(); i++) {
if (!is_cpu_enabled(i))
continue;
auto const cpu = cpu_desc_of_cpu(i);
if (!cpu)
continue;
if (!(cpu->package == package && cpu->core == core &&
cpu->thread == thread))
continue;
bool done = fn(*cpu, i);
if (done)
return done;
}
}
}
}
return false;
}
void for_each_enabled_cpu(auto const &fn) const
{
for (unsigned i = 0; i < cpu_max(); i++) {
Cpu_desc const * cpu = cpu_desc_of_cpu(i);
if (!is_cpu_enabled(i)) continue;
if (!cpu) return;
fn(*cpu, i);
}
}
} __attribute__((packed));
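/*
* Usage sketch (illustrative only): deriving a logical CPU-id mapping from the
* hypervisor information page. 'hip' and 'boot_cpu' are assumed to be known from
* platform startup; MAX_CPUS is an arbitrary upper bound chosen for this example.
*
*   enum { MAX_CPUS = 64 };
*   uint16_t map_cpus[MAX_CPUS];
*   bool const remapped = hip->remap_cpu_ids(map_cpus, MAX_CPUS, boot_cpu);
*
*   hip->for_each_enabled_cpu([&] (Hip::Cpu_desc const &cpu, unsigned kernel_id) {
*       // cpu.package, cpu.core, cpu.thread describe the CPU's topology position
*   });
*/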
/**
* Semaphore operations
*/
enum Sem_op { SEMAPHORE_UP = 0U, SEMAPHORE_DOWN = 1U, SEMAPHORE_DOWNZERO = 0x3U };
/**
* Ec operations
*/
enum Ec_op {
EC_RECALL = 0U,
EC_YIELD = 1U,
EC_DONATE_SC = 2U,
EC_RESCHEDULE = 3U,
EC_MIGRATE = 4U,
EC_TIME = 5U,
EC_GET_VCPU_STATE = 6U,
EC_SET_VCPU_STATE = 7U,
EC_MSR_ACCESS = 8U
};
enum Sc_op {
SC_TIME_IDLE = 0,
SC_TIME_CROSS = 1,
SC_TIME_KILLED = 2,
SC_EC_TIME = 3,
};
/**
* Pd operations
*/
enum Pd_op { TRANSFER_QUOTA = 0U, PD_DEBUG = 2U };
/**
* Hpc operations
*
*/
enum Hpc_op
{
HPC_SETUP = 9U,
HPC_START = 10U,
HPC_STOP = 11U,
HPC_RESET = 12U,
HPC_READ = 13U,
};
/**
* Cell operations
*/
enum Cell_op
{
SHRINK = 0,
GROW = 1,
};
class Gsi_flags
{
private:
uint8_t _value { 0 };
public:
enum Mode { HIGH, LOW, EDGE };
Gsi_flags() { }
Gsi_flags(Mode m)
{
switch (m) {
case HIGH: _value = 0b110; break; /* level-high */
case LOW: _value = 0b111; break; /* level-low */
case EDGE: _value = 0b100; break; /* edge-triggered */
}
}
uint8_t value() const { return _value; }
};
class Descriptor
{
protected:
mword_t _value { 0 };
/**
* Assign bitfield to descriptor
*/
template<mword_t MASK, mword_t SHIFT>
void _assign(mword_t new_bits)
{
_value &= ~(MASK << SHIFT);
_value |= (new_bits & MASK) << SHIFT;
}
/**
* Query bitfield from descriptor
*/
template<mword_t MASK, mword_t SHIFT>
mword_t _query() const { return (_value >> SHIFT) & MASK; }
public:
mword_t value() const { return _value; }
} __attribute__((packed));
/**
* Message-transfer descriptor
*/
class Mtd
{
private:
mword_t const _value;
public:
enum {
ACDB = 1U << 0, /* eax, ecx, edx, ebx */
EBSD = 1U << 1, /* ebp, esi, edi */
ESP = 1U << 2,
EIP = 1U << 3,
EFL = 1U << 4, /* eflags */
ESDS = 1U << 5,
FSGS = 1U << 6,
CSSS = 1U << 7,
TR = 1U << 8,
LDTR = 1U << 9,
GDTR = 1U << 10,
IDTR = 1U << 11,
CR = 1U << 12,
DR = 1U << 13, /* DR7 */
SYS = 1U << 14, /* Sysenter MSRs CS, ESP, EIP */
QUAL = 1U << 15, /* exit qualification */
CTRL = 1U << 16, /* execution controls */
INJ = 1U << 17, /* injection info */
STA = 1U << 18, /* interruptibility state */
TSC = 1U << 19, /* time-stamp counter */
EFER = 1U << 20, /* EFER MSR */
PDPTE = 1U << 21, /* PDPTE0 .. PDPTE3 */
R8_R15 = 1U << 22, /* R8 .. R15 */
SYSCALL_SWAPGS = 1U << 23, /* SYSCALL and SWAPGS MSRs */
TPR = 1U << 24, /* TPR and TPR threshold */
TSC_AUX = 1U << 25, /* IA32_TSC_AUX used by rdtscp */
XSAVE = 1U << 26, /* XCR and XSS used with XSAVE */
FPU = 1U << 31, /* FPU state */
IRQ = EFL | STA | INJ | TSC,
ALL = (0x000fffff & ~CTRL) | EFER | R8_R15 | SYSCALL_SWAPGS | TPR,
};
Mtd(mword_t value) : _value(value) { }
mword_t value() const { return _value; }
};
class Crd : public Descriptor
{
protected:
/**
* Bitfield holding the descriptor type
*/
enum {
TYPE_MASK = 0x3, TYPE_SHIFT = 0,
BASE_SHIFT = 12, RIGHTS_MASK = 0x1f,
ORDER_MASK = 0x1f, ORDER_SHIFT = 7,
BASE_MASK = (~0UL) >> BASE_SHIFT,
RIGHTS_SHIFT= 2
};
/**
* Capability-range-descriptor types
*/
enum {
NULL_CRD_TYPE = 0,
MEM_CRD_TYPE = 1,
IO_CRD_TYPE = 2,
OBJ_CRD_TYPE = 3,
RIGHTS_ALL = 0x1f,
};
void _base(mword_t base)
{ _assign<BASE_MASK, BASE_SHIFT>(base); }
void _order(mword_t order)
{ _assign<ORDER_MASK, ORDER_SHIFT>(order); }
public:
Crd(mword_t base, mword_t order) {
_value = 0; _base(base), _order(order); }
Crd(mword_t value) { _value = value; }
mword_t hotspot(mword_t sel_hotspot) const
{
if ((value() & TYPE_MASK) == MEM_CRD_TYPE)
return sel_hotspot & PAGE_MASK_;
return sel_hotspot << 12;
}
mword_t addr() const { return base() << BASE_SHIFT; }
mword_t base() const { return _query<BASE_MASK, BASE_SHIFT>(); }
mword_t order() const { return _query<ORDER_MASK, ORDER_SHIFT>(); }
bool is_null() const { return (_value & TYPE_MASK) == NULL_CRD_TYPE; }
uint8_t type() const { return (uint8_t)_query<TYPE_MASK, TYPE_SHIFT>(); }
uint8_t rights() const { return (uint8_t)_query<RIGHTS_MASK, RIGHTS_SHIFT>(); }
} __attribute__((packed));
class Rights
{
private:
bool const _readable, _writeable, _executable;
public:
Rights(bool readable, bool writeable, bool executable)
: _readable(readable), _writeable(writeable),
_executable(executable) { }
Rights() : _readable(false), _writeable(false), _executable(false) {}
bool readable() const { return _readable; }
bool writeable() const { return _writeable; }
bool executable() const { return _executable; }
};
/**
* Memory-capability-range descriptor
*/
class Mem_crd : public Crd
{
private:
enum {
EXEC_MASK = 0x1, EXEC_SHIFT = 4,
WRITE_MASK = 0x1, WRITE_SHIFT = 3,
READ_MASK = 0x1, READ_SHIFT = 2
};
void _rights(Rights r)
{
_assign<EXEC_MASK, EXEC_SHIFT>(r.executable());
_assign<WRITE_MASK, WRITE_SHIFT>(r.writeable());
_assign<READ_MASK, READ_SHIFT>(r.readable());
}
public:
Mem_crd(mword_t base, mword_t order, Rights rights = Rights())
: Crd(base, order)
{
_rights(rights);
_assign<TYPE_MASK, TYPE_SHIFT>(MEM_CRD_TYPE);
}
Rights rights() const
{
return Rights(_query<READ_MASK, READ_SHIFT>(),
_query<WRITE_MASK, WRITE_SHIFT>(),
_query<EXEC_MASK, EXEC_SHIFT>());
}
};
/**
* I/O-capability-range descriptor
*/
class Io_crd : public Crd
{
public:
Io_crd(mword_t base, mword_t order)
: Crd(base, order)
{
_assign<TYPE_MASK, TYPE_SHIFT>(IO_CRD_TYPE);
_assign<RIGHTS_MASK, RIGHTS_SHIFT>(RIGHTS_ALL);
}
};
class Obj_crd : public Crd
{
public:
enum {
RIGHT_EC_RECALL = 0x1U,
RIGHT_PT_CALL = 0x2U,
RIGHT_PT_CTRL = 0x1U,
RIGHT_PT_XCPU = 0x10U,
RIGHT_SM_UP = 0x1U,
RIGHT_SM_DOWN = 0x2U
};
Obj_crd() : Crd(0, 0)
{
_assign<TYPE_MASK, TYPE_SHIFT>(NULL_CRD_TYPE);
}
Obj_crd(mword_t base, mword_t order,
mword_t rights = RIGHTS_ALL)
: Crd(base, order)
{
_assign<TYPE_MASK, TYPE_SHIFT>(OBJ_CRD_TYPE);
_assign<RIGHTS_MASK, RIGHTS_SHIFT>(rights);
}
};
/**
* Quantum-priority descriptor
*/
class Qpd : public Descriptor
{
private:
enum {
PRIORITY_MASK = 0xff, PRIORITY_SHIFT = 0,
QUANTUM_SHIFT = 12,
QUANTUM_MASK = (~0UL) >> QUANTUM_SHIFT
};
void _quantum(mword_t quantum)
{ _assign<QUANTUM_MASK, QUANTUM_SHIFT>(quantum); }
void _priority(mword_t priority)
{ _assign<PRIORITY_MASK, PRIORITY_SHIFT>(priority); }
public:
enum { DEFAULT_QUANTUM = 10000, DEFAULT_PRIORITY = 64 };
Qpd(mword_t quantum = DEFAULT_QUANTUM,
mword_t priority = DEFAULT_PRIORITY)
{
_value = 0;
_quantum(quantum), _priority(priority);
}
mword_t quantum() const { return _query<QUANTUM_MASK, QUANTUM_SHIFT>(); }
mword_t priority() const { return _query<PRIORITY_MASK, PRIORITY_SHIFT>(); }
};
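/*
* Usage sketch (illustrative only): encoding a quantum-priority descriptor, e.g. to
* be handed to the SC-creation syscall binding. The quantum value is arbitrary.
*
*   Qpd qpd(20000, Qpd::DEFAULT_PRIORITY);   // 20000-tick quantum, default priority
*   mword_t const encoded = qpd.value();     // raw value as passed to the kernel
*/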
/**
* User-level thread-control block
*/
struct Utcb
{
/**
* Return physical size of UTCB in bytes
*/
static constexpr mword_t size() { return 4096; }
/**
* The number of untyped items occupies the lowest 16 bits, the number of
* typed items occupies bits 16-31; bits 32+ are ignored on 64 bit
*/
mword_t items;
Crd crd_xlt; /* receive capability-range descriptor for translation */
Crd crd_rcv; /* receive capability-range descriptor for delegation */
mword_t tls;
/**
* Data area
*
* The UTCB entries following the header hold message payload (normal
* IDC operations) or architectural state (exception handling).
*/
union {
/* exception state */
struct {
mword_t mtd, instr_len, ip, flags;
unsigned intr_state, actv_state, inj_info, inj_error;
mword_t ax, cx, dx, bx;
mword_t sp, bp, si, di;
#ifdef __x86_64__
mword_t r8, r9, r10, r11, r12, r13, r14, r15;
#endif
unsigned long long qual[2]; /* exit qualification */
unsigned ctrl[2];
mword_t cr0, cr2, cr3, cr4;
unsigned long long xcr0, xss;
mword_t pdpte[4];
#ifdef __x86_64__
mword_t cr8, efer;
unsigned long long star;
unsigned long long lstar;
unsigned long long cstar;
unsigned long long fmask;
unsigned long long kernel_gs_base;
unsigned tpr;
unsigned tpr_threshold;
#endif
mword_t dr7, sysenter_cs, sysenter_sp, sysenter_ip;
struct {
unsigned short sel, ar;
unsigned limit;
mword_t base;
#ifndef __x86_64__
mword_t reserved;
#endif
} es, cs, ss, ds, fs, gs, ldtr, tr;
struct {
unsigned reserved0;
unsigned limit;
mword_t base;
#ifndef __x86_64__
mword_t reserved1;
#endif
} gdtr, idtr;
unsigned long long tsc_val, tsc_off, tsc_aux;
unsigned long long exit_reason;
uint8_t fpu[2560];
} __attribute__((packed));
mword_t mr[(4096 - 4 * sizeof(mword_t)) / sizeof(mword_t)];
};
/* message payload */
mword_t * msg() { return mr; }
struct Item {
mword_t crd;
mword_t hotspot;
bool is_del() const { return hotspot & 0x1; }
};
#ifdef __x86_64__
uint64_t read_r8() const { return r8; }
uint64_t read_r9() const { return r9; }
uint64_t read_r10() const { return r10; }
uint64_t read_r11() const { return r11; }
uint64_t read_r12() const { return r12; }
uint64_t read_r13() const { return r13; }
uint64_t read_r14() const { return r14; }
uint64_t read_r15() const { return r15; }
mword_t read_efer() const { return efer; }
uint64_t read_star() const { return star; }
uint64_t read_lstar() const { return lstar; }
uint64_t read_cstar() const { return cstar; }
uint64_t read_fmask() const { return fmask; }
uint64_t read_kernel_gs_base() const { return kernel_gs_base; }
uint32_t read_tpr() const { return tpr; }
uint32_t read_tpr_threshold() const { return tpr_threshold; }
void write_r8 (uint64_t value) { r8 = value; }
void write_r9 (uint64_t value) { r9 = value; }
void write_r10 (uint64_t value) { r10 = value; }
void write_r11 (uint64_t value) { r11 = value; }
void write_r12 (uint64_t value) { r12 = value; }
void write_r13 (uint64_t value) { r13 = value; }
void write_r14 (uint64_t value) { r14 = value; }
void write_r15 (uint64_t value) { r15 = value; }
void write_efer (mword_t value) { efer = value; }
void write_star (uint64_t value) { star = value; }
void write_lstar (uint64_t value) { lstar = value; }
void write_cstar (uint64_t value) { cstar = value; }
void write_fmask (uint64_t value) { fmask = value; }
void write_kernel_gs_base (uint64_t value) { kernel_gs_base = value; }
void write_tpr (uint32_t value) { tpr = value; }
void write_tpr_threshold (uint32_t value) { tpr_threshold = value; }
#else
uint64_t read_r8() const { return 0; }
uint64_t read_r9() const { return 0; }
uint64_t read_r10() const { return 0; }
uint64_t read_r11() const { return 0; }
uint64_t read_r12() const { return 0; }
uint64_t read_r13() const { return 0; }
uint64_t read_r14() const { return 0; }
uint64_t read_r15() const { return 0; }
mword_t read_efer() const { return 0; }
uint64_t read_star() const { return 0; }
uint64_t read_lstar() const { return 0; }
uint64_t read_cstar() const { return 0; }
uint64_t read_fmask() const { return 0; }
uint64_t read_kernel_gs_base() const { return 0; }
uint32_t read_tpr() const { return 0; }
uint32_t read_tpr_threshold() const { return 0; }
void write_r8 (uint64_t) { }
void write_r9 (uint64_t) { }
void write_r10 (uint64_t) { }
void write_r11 (uint64_t) { }
void write_r12 (uint64_t) { }
void write_r13 (uint64_t) { }
void write_r14 (uint64_t) { }
void write_r15 (uint64_t) { }
void write_efer (mword_t) { }
void write_star (uint64_t) { }
void write_lstar (uint64_t) { }
void write_cstar (uint64_t) { }
void write_fmask (uint64_t) { }
void write_kernel_gs_base (uint64_t) { }
void write_tpr (uint32_t) { }
void write_tpr_threshold (uint32_t) { }
#endif
/**
* Set number of untyped message words
*
* Calling this function has the side effect of removing all typed
* message items from the message buffer.
*/
void set_msg_word(mword_t const num) { items = num; }
/**
* Return current number of message words in UTCB
*/
unsigned msg_words() { return items & 0xffffU; }
/**
* Return current number of message items on UTCB
*/
unsigned msg_items() { return (unsigned)(items >> 16); }
/**
* Append a message-transfer item to the message buffer
*
* \param crd capability-range descriptor to transfer
* \param sel_hotspot hotspot denoting where the transferred range should appear at the receiver
* \return true on success, false if the UTCB has no space left for the item
*/
__attribute__((warn_unused_result))
bool append_item(Crd crd, mword_t sel_hotspot,
bool kern_pd = false,
bool update_guest_pt = false,
bool translate_map = false,
bool dma_mem = false,
bool write_combined = false)
{
/* transfer items start at the end of the UTCB */
items += 1 << 16;
Item *item = reinterpret_cast<Item *>(this);
item += (PAGE_SIZE_BYTE / sizeof(struct Item)) - msg_items();
/* check that there is enough space left on UTCB */
if (msg() + msg_words() >= reinterpret_cast<mword_t *>(item)) {
items -= 1 << 16;
return false;
}
/* map from hypervisor or current pd */
unsigned h = kern_pd ? (1 << 11) : 0;
/* map write-combined */
unsigned wc = write_combined ? (1 << 10) : 0;
/* update guest page table */
unsigned g = update_guest_pt ? (1 << 9) : 0;
/* mark memory dma able */
unsigned d = dma_mem ? (1 << 8) : 0;
/* set type of delegation, either 'map' or 'translate and map' */
unsigned m = translate_map ? 2 : 1;
item->hotspot = crd.hotspot(sel_hotspot) | g | h | wc | d | m;
item->crd = crd.value();
return true;
}
/**
* Return typed item at position i in UTCB
*
* \param i position of the requested item, starting at 0
*/
Item * get_item(const unsigned i) {
if (i > (PAGE_SIZE_BYTE / sizeof(struct Item))) return 0;
Item * item = reinterpret_cast<Item *>(this) + (PAGE_SIZE_BYTE / sizeof(struct Item)) - i - 1;
if (reinterpret_cast<mword_t *>(item) < this->msg()) return 0;
return item;
}
mword_t mtd_value() const { return static_cast<Mtd>(mtd).value(); }
/**
* Return fault address and type of page-fault message
*/
mword_t pf_addr() const { return (mword_t)qual[1]; }
uint8_t pf_type() const { return (uint8_t)qual[0]; }
};
static_assert(sizeof(Utcb) == 4096, "Unexpected size of UTCB");
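/*
* Usage sketch (illustrative only): delegating one writeable page via the UTCB.
* 'utcb' is assumed to point to the caller's UTCB and 'page' to a page-aligned
* virtual address; both names are assumptions of this example.
*
*   utcb->set_msg_word(0);   // no untyped words, implicitly drops stale items
*   Mem_crd const crd(page >> PAGE_SIZE_LOG2, 0, Rights(true, true, false));
*   if (!utcb->append_item(crd, page)) {
*       // UTCB ran out of space for the transfer item
*   }
*/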
/**
* Size of event-specific portal window mapped at PD creation time
*/
enum {
NUM_INITIAL_PT_LOG2 = 5,
NUM_INITIAL_PT = 1UL << NUM_INITIAL_PT_LOG2,
NUM_INITIAL_PT_RESERVED = 2 * NUM_INITIAL_PT,
NUM_INITIAL_VCPU_PT_LOG2 = 8,
NUM_INITIAL_VCPU_PT = 1UL << NUM_INITIAL_VCPU_PT_LOG2,
};
/**
* Event-specific capability selectors
*/
enum {
PT_SEL_PAGE_FAULT = 0xe,
PT_SEL_PARENT = 0x1a, /* convention on Genode */
EC_SEL_THREAD = 0x1c, /* convention on Genode */
PT_SEL_STARTUP = 0x1e,
SM_SEL_SIGNAL = 0x1e, /* alias of PT_SEL_STARTUP */
PT_SEL_RECALL = 0x1f,
SM_SEL_EC = 0x1d, /* convention on Genode */
};
}
namespace Genode {
static inline void print(Output &out, Tukija::Tip::Memory_region &mem) {
print(out, " | ", Hex(reinterpret_cast<unsigned long long>(mem.start)), " - ", Hex(reinterpret_cast<unsigned long long>(mem.end)));
}
static inline void print(Output &out, Tukija::Cpuset &cpus) {
print(out, "CPUs [ ");
cpus.for_each([&](long cpu)
{ print(out, cpu, " "); });
print(out, "]");
}
static inline void print(Output &out, Tukija::Tip::Domain &dom) {
print(out, "Domain ", dom.id, ": ");
print(out, dom.cpus);
print(out, " ", dom.num_mem_descriptors, " memory regions");
dom.for_each_mem([&](Tukija::Tip::Memory_region &mem)
{ print(out, mem); });
}
}
#endif /* _INCLUDE__NOVA__SYSCALL_GENERIC_H_ */