/*
* \brief Syscall bindings for the Tukija (originally NOVA) microhypervisor
* \author Norman Feske
* \author Sebastian Sumpf
* \author Alexander Boettcher
* \author Benjamin Lamowski
* \author Michael Müller
* \date 2009-12-27
*/
/*
* Copyright (c) 2009-2023 Genode Labs
* Copyright (c) 2025 Michael Müller, Osnabrück University
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _INCLUDE__NOVA__SYSCALL_GENERIC_H_
#define _INCLUDE__NOVA__SYSCALL_GENERIC_H_
#include <tukija/stdint.h>
#include <tukija/atomic.h>
#include <tukija/bits.h>
#include <base/affinity.h>
namespace Tukija {
enum {
PAGE_SIZE_LOG2 = 12,
PAGE_SIZE_BYTE = 1 << PAGE_SIZE_LOG2,
PAGE_MASK_ = ~(PAGE_SIZE_BYTE - 1)
};
/**
* NOVA and Tukija system-call IDs
*/
enum Syscall {
NOVA_CALL = 0x0,
NOVA_REPLY = 0x1,
NOVA_CREATE_PD = 0x2,
NOVA_CREATE_EC = 0x3,
NOVA_CREATE_SC = 0x4,
NOVA_CREATE_PT = 0x5,
NOVA_CREATE_SM = 0x6,
NOVA_REVOKE = 0x7,
NOVA_MISC = 0x8, /* lookup, delegate, acpi_suspend */
NOVA_EC_CTRL = 0x9,
NOVA_SC_CTRL = 0xa,
NOVA_PT_CTRL = 0xb,
NOVA_SM_CTRL = 0xc,
NOVA_ASSIGN_PCI = 0xd,
NOVA_ASSIGN_GSI = 0xe,
NOVA_PD_CTRL = 0xf,
TUKIJA_CREATE_CELL = 0x10,
TUKIJA_ALLOCATE = 0x11,
TUKIJA_CELL_CTRL = 0x12,
TUKIJA_RELEASE = 0x13,
};
/**
* Tukija operations
*/
enum Cell_control
{
UPDATE_AFFINITY = 0,
};
enum Resource_type
{
CPU_CORE = 0,
};
enum Release_op
{
RELEASE = 0,
RETURN_TO_OWNER = 1,
};
/**
* NOVA status codes returned by system-calls
*/
enum Status
{
NOVA_OK = 0,
NOVA_TIMEOUT = 1,
NOVA_IPC_ABORT = 2,
NOVA_INV_HYPERCALL = 3,
NOVA_INV_SELECTOR = 4,
NOVA_INV_PARAMETER = 5,
NOVA_INV_FEATURE = 6,
NOVA_INV_CPU = 7,
NOVA_INVD_DEVICE_ID = 8,
NOVA_PD_OOM = 9,
};
/**
* CPU Set
*/
class Cpuset
{
private:
enum {CPUS_PER_VALUE = sizeof(mword_t) * 8};
mword_t raw[1 + (256 - 1) / CPUS_PER_VALUE];
inline mword_t &value(unsigned const cpu) {
return raw[cpu / CPUS_PER_VALUE];
}
inline mword_t const &value(unsigned const cpu) const {
return raw[cpu / CPUS_PER_VALUE];
}
inline mword_t bit_cpu(unsigned const cpu) const {
return cpu % CPUS_PER_VALUE;
}
public:
inline explicit Cpuset(mword_t const v)
{
for (unsigned i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
raw[i] = v;
}
inline bool chk(unsigned const cpu) const {
return value(cpu) & (1UL << bit_cpu(cpu));
}
inline void set(unsigned const cpu) {
Atomic::test_set_bit(value(cpu), bit_cpu(cpu));
}
inline void clr(unsigned const cpu) {
Atomic::test_clr_bit(value(cpu), bit_cpu(cpu));
}
inline void merge(Cpuset const &s)
{
for (unsigned i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
Atomic::set_mask(value(i * CPUS_PER_VALUE), s.value(i * CPUS_PER_VALUE));
}
inline void clear()
{
for (unsigned i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
Atomic::clr_mask(value(i * CPUS_PER_VALUE), ~0UL); /* clear every bit of each word */
}
template <typename T>
void for_each(T const fn)
{
long cpu = 0;
for (unsigned i = 0; i < sizeof(raw) / sizeof(raw[0]); i++)
{
mword_t subset = raw[i];
while ((cpu = bit_scan_forward(subset)) != -1)
{
Atomic::test_clr_bit(subset, bit_cpu(static_cast<unsigned int>(cpu)));
fn((cpu+i*CPUS_PER_VALUE));
}
}
}
unsigned count()
{
unsigned count = 0;
for (unsigned i = 0; i < sizeof(raw) / sizeof(raw[0]); i++) {
count += static_cast<unsigned>(popcount(raw[i]));
}
return count;
}
};
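/*
* Usage sketch (illustrative only, not part of the kernel interface): maintaining
* and inspecting a CPU set. The CPU numbers used here are made up.
*
*   Cpuset set(0);                     // all words zeroed, i.e. empty set
*   set.set(2);                        // mark CPU 2 as member
*   set.set(5);                        // mark CPU 5 as member
*   if (set.chk(2)) { }                // test membership of CPU 2
*   set.for_each([] (long cpu) { });   // visit each member CPU once
*   unsigned const n = set.count();    // number of member CPUs
*/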
/**
* Cell information pages
*
*/
class Cip
{
public:
/**
* \brief Per-worker thread information
*
* \details Workers represent arbitrary computing resources (e.g. CPU cores).
* Tukija grants workers to or revokes them from a cell by adjusting the cell's
* CPU-core allocation. To allow a cell to react to the revocation of workers,
* Tukija employs a shared-memory structure containing a flag that signals a
* yield request. The cell is expected to poll this flag regularly in user space,
* e.g. after the execution of an MxTask.
*/
struct Worker {
volatile unsigned long yield_flag{0}; /* This flag will be set if a yield request has been filed */
unsigned long padding[3];
};
/**
* @brief Information about MxTasking channels
*
* @details A channel is the representation of a task queue, holding a set of tasks to execute.
* Every task is enqueued in exactly one channel and each channel is processed by at most one
* worker at any given time. To ensure the progress of a cell, Tukija expects cells to implement
* load-balancing strategies that ensure each channel's tasks are executed by a worker thread.
* However, Tukija does not require that every allocated core and its worker process a channel.
*/
struct Channels {
volatile unsigned short remainder{0}; /* Number of channels that remain unstolen, after each worker has stolen `limit` many channels*/
volatile unsigned short limit{0}; /* Number of channels each worker is allowed to steal when core allocation has changed */
unsigned int count{0}; /* Total number of channels this cell uses. */
};
alignas(64) Worker worker_info[256];
Channels channel_info;
/**
* @brief The set of CPU cores currently allocated to this cell.
*
*/
Cpuset cores_current;
/**
* @brief The set of CPU cores this cell may lay claim to.
*
* @details Hoitaja prepartitions all CPU cores among the currently running cells.
* Each of these partitions represents the optimal allocation according to the cell's priority.
* However, to achieve better utilization, Tukija may assign more or fewer cores to a cell
* than are contained in this set.
*/
Cpuset cores_reserved;
/**
* @brief The set of cores Tukija has chosen in response to the last allocation request.
*
*/
Cpuset cores_new;
unsigned idx_to_phys_cpu_id[256]; /* Mapping from pager index to kernel cpu ID */
Genode::Affinity::Space habitat_affinity; /* the affinity space the corresponding cell lives in */
Genode::Affinity::Location location; /* Location of the cell within its habitat */
/**
* @brief Return the sanitized kernel CPU ID for a given location
*
* @param loc - the location for which to request the kernel CPU ID
* @return unsigned - the kernel CPU ID of the given location
*/
unsigned location_to_kernel_cpu(Genode::Affinity::Location const &loc)
{
Genode::Affinity::Location loc_in_habitat = loc.transpose(location.xpos(), location.ypos());
unsigned idx = (loc_in_habitat.xpos() * habitat_affinity.height() + loc_in_habitat.ypos()) % habitat_affinity.total();
return idx_to_phys_cpu_id[idx];
}
/**
* @brief Return the worker-information structure for the worker at the given location
*
* @details Provides the kernel's worker information for the worker residing on the CPU
* specified by location. User space must use it to retrieve the yield_flag for that CPU.
* The kernel expects each user-space cell to regularly poll the yield_flag of each of its
* granted CPUs to ensure that borrowed CPU cores are returned to their rightful owner.
*
* @param location - location of the worker for which to request the information struct
* @return struct Worker& - the information struct for the worker
*/
struct Worker &worker_for_location(Genode::Affinity::Location const &location)
{
return worker_info[location_to_kernel_cpu(location)];
}
/**
* @brief Returns a pointer to the cell information pages from within the address space
* of the respective cell.
* @details This pointer is used by the user-space runtime environment of the cell to get
* information about pending yield requests, changes in core allocation, and parameters for
* adjusting the assignment of channels to workers.
*
* @return Cip* pointer to the cell's information pages
*/
static Cip *
cip()
{
return reinterpret_cast<Cip *>(0x7FFFBFFDC000);
}
};
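/*
* Usage sketch (illustrative only): how a cell's user-level runtime might poll its
* yield flag after finishing a unit of work. 'my_location' stands for the
* Affinity::Location of the CPU the worker currently runs on and is an assumption
* of this example.
*
*   Cip &cip = *Cip::cip();
*   Cip::Worker &worker = cip.worker_for_location(my_location);
*   if (worker.yield_flag) {
*       // stop picking up new tasks and return the borrowed core,
*       // e.g. by issuing a release request to the kernel
*   }
*/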
/**
* Topology information pages
*/
class Tip
{
public:
/**************/
/* Exceptions */
/**************/
/**
* @brief Exception indicating that the requested NUMA domain could not be found.
*
*/
struct Domain_not_found {
};
/**
* @brief Exception signaling that a NUMA domain has no memory region
*
*/
struct Domain_has_no_memory_regions {
};
/***
* Data structure definitions
*/
/**
* @brief Describes the physical address range of a NUMA domain
*
*/
struct Memory_region {
void *start;
void *end;
};
/**
* @brief Describes either an ACPI or a PCI device and its membership in a NUMA domain.
*
*/
struct Device {
enum dev_type
{
ACPI = 0,
PCI = 1
};
dev_type type;
union {
struct {
uint64_t hid;
uint32_t uid;
} acpi_handle;
struct {
uint16_t segment;
uint16_t bdf;
} pci_handle;
};
};
/**
* @brief Representation of a NUMA domain
*
* @details A NUMA domain consists of a (possibly empty) set of CPU cores, a physical address
* range, and a set of devices. The information provided by a NUMA domain structure can be
* used for NUMA-aware placement of data objects and tasks.
*
*/
struct Domain {
uint32_t id;
uint8_t num_mem_descriptors;
uint8_t num_devices;
Memory_region memory_regions[32];
Device devices[32];
Cpuset cpus{0};
template <typename T>
void for_each_mem(T const fn) {
for (uint8_t i = 0; i < num_mem_descriptors; i++)
fn(memory_regions[i]);
}
};
/** Page layout */
uint16_t length{8}; /* Length of the Topology information page */
uint32_t cpu_to_domain[256]; /* Mapping of CPU to NUMA IDs */
Domain nodes[]; /* Set of detected NUMA domains */
/* Parsing functions */
/**
* @brief Applies the function fn to all detected NUMA domain structures
*
* @tparam T - type parameter used for the function lambda provided by fn
* @param fn - a function (e.g. a lambda closure) to execute on each NUMA domain.
*/
template <typename T>
void for_each(T const fn) {
mword_t const node_cnt = (reinterpret_cast<mword_t>(&nodes) + length - reinterpret_cast<mword_t>(nodes)) / sizeof(Domain);
Domain *dom = nodes;
for (unsigned i = 0; i < node_cnt; i++) {
dom = nodes + i;
fn(*dom);
}
}
/**
* @brief Looks up the NUMA domain that the CPU with ID cpu belongs to.
*
* @param cpu | the ID of the CPU whose NUMA domain should be looked up
* @return Domain& | the NUMA domain the CPU belongs to
* @throws Domain_not_found if the CPU cannot be linked to a NUMA domain
*/
Domain const &dom_of_cpu(unsigned cpu) {
unsigned id = cpu_to_domain[cpu];
unsigned idx = 0;
mword_t const node_cnt = (reinterpret_cast<mword_t>(&nodes) + length - reinterpret_cast<mword_t>(nodes)) / sizeof(Domain);
for (; idx < node_cnt; idx++) {
if (nodes[idx].id == id)
break;
}
/* no domain with the requested id was found */
if (idx >= node_cnt)
throw Domain_not_found();
Domain const &d = nodes[idx];
if (!d.cpus.chk(cpu))
throw Domain_not_found();
return d;
}
/**
* @brief Applies a function on a NUMA domain structure
*
* @tparam T
* @param fn | the function to apply
* @param id | the ID of the NUMA domain
*/
template <typename T>
void on_node(T const fn, uint32_t id)
{
mword_t const node_cnt = (reinterpret_cast<mword_t>(nodes) + length - reinterpret_cast<mword_t>(nodes)) / sizeof(Domain);
Domain *dom = nodes;
for (unsigned i = 0; i < node_cnt; i++) {
dom = nodes + i;
if (dom->id == id)
break;
}
if (dom->id != id)
return;
fn(*dom);
}
/**
* @brief Returns the memory regions of a NUMA domain
*
* @param dom_id - the ID of the NUMA domain
* @param region_count - out parameter, set to the number of memory regions of the domain
* @return Memory_region& - the first memory region of this NUMA domain
*/
Memory_region &memory_for_domain(uint32_t dom_id, uint8_t *&region_count) {
Memory_region *mem = nullptr;
on_node([&](Domain &dom) {
mem = dom.memory_regions;
region_count = &dom.num_mem_descriptors;
}, dom_id);
if (!mem || *region_count == 0) {
throw Domain_has_no_memory_regions();
}
return *mem;
}
inline static Tip const *tip() {
return reinterpret_cast<Tip *>(0x7fffbffe0000);
}
};
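/*
* Usage sketch (illustrative only): walking the NUMA topology published in the
* topology information page. The const_cast is needed because tip() returns a
* const pointer while the traversal helpers are non-const.
*
*   Tip *tip = const_cast<Tip *>(Tip::tip());
*   tip->for_each([] (Tip::Domain &dom) {
*       dom.for_each_mem([] (Tip::Memory_region &region) {
*           // region.start .. region.end belongs to NUMA domain dom.id
*       });
*   });
*/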
/**
* Hypervisor information page
*/
struct Hip
{
struct Mem_desc
{
enum Type {
EFI_SYSTEM_TABLE = -7,
HYPERVISOR_LOG = -6,
FRAMEBUFFER = -5,
ACPI_XSDT = -4,
ACPI_RSDT = -3,
MULTIBOOT_MODULE = -2,
MICROHYPERVISOR = -1,
AVAILABLE_MEMORY = 1,
RESERVED_MEMORY = 2,
ACPI_RECLAIM_MEMORY = 3,
ACPI_NVS_MEMORY = 4
};
uint64_t const addr;
uint64_t const size;
Type const type;
uint32_t const aux;
};
uint32_t const signature; /* magic value 0x41564f4e */
uint16_t const hip_checksum;
uint16_t const hip_length;
uint16_t const cpu_desc_offset;
uint16_t const cpu_desc_size;
uint16_t const mem_desc_offset;
uint16_t const mem_desc_size;
uint32_t const feature_flags;
uint32_t const api_version;
uint32_t const sel; /* number of cap selectors */
uint32_t const sel_exc; /* number of cap selectors for exceptions */
uint32_t const sel_vm; /* number of cap selectors for VM handling */
uint32_t const sel_gsi; /* number of global system interrupts */
uint32_t const page_sizes; /* supported page sizes */
uint32_t const utcb_sizes; /* supported utcb sizes */
uint32_t const tsc_freq; /* time-stamp counter frequency in kHz */
uint32_t const bus_freq; /* bus frequency in kHz */
mword_t const topo_model;
Genode::addr_t const topo_phys;
bool has_feature_iommu() const { return feature_flags & (1 << 0); }
bool has_feature_vmx() const { return feature_flags & (1 << 1); }
bool has_feature_svm() const { return feature_flags & (1 << 2); }
struct Cpu_desc {
uint8_t flags;
uint8_t thread;
uint8_t core;
uint8_t package;
uint8_t acpi_id;
uint8_t family;
uint8_t model;
uint8_t stepping:4;
uint8_t platform:3;
uint8_t reserved:1;
uint32_t patch;
bool p_core() const { return flags & 0x2; }
bool e_core() const { return flags & 0x4; }
} __attribute__((packed));
unsigned cpu_max() const {
return (mem_desc_offset - cpu_desc_offset) / cpu_desc_size; }
unsigned cpus() const {
unsigned cpu_num = 0;
for (unsigned i = 0; i < cpu_max(); i++)
if (is_cpu_enabled(i))
cpu_num++;
return cpu_num;
}
Cpu_desc const * cpu_desc_of_cpu(unsigned i) const {
if (i >= cpu_max())
return nullptr;
unsigned long desc_addr = reinterpret_cast<unsigned long>(this) +
cpu_desc_offset + i * cpu_desc_size;
return reinterpret_cast<Cpu_desc const *>(desc_addr);
}
bool is_cpu_enabled(unsigned i) const {
Cpu_desc const * const desc = cpu_desc_of_cpu(i);
return desc ? desc->flags & 0x1 : false;
}
/**
* Re-sort CPU ids such that
* - the boot CPU always gets logical CPU id 0
* - SMT threads of one CPU get logical CPU ids close together
* - P-cores get smaller logical CPU ids than E-cores
*
* Returns true if the re-mapping succeeded, otherwise false.
*
* In case of failure, map_cpus will contain a 1:1 fallback mapping
* without any of the sorting mentioned above.
*/
bool remap_cpu_ids(uint16_t *map_cpus, unsigned const max_cpus,
unsigned const boot_cpu) const
{
unsigned const num_cpus = cpus();
bool too_many_cpus = false;
unsigned cpu_i = 0;
/* fallback lambda in case re-ordering fails */
auto remap_failure = [&] {
for (uint16_t i = 0; i < max_cpus; i++) { map_cpus[i] = i; }
return false;
};
/* always assign the boot CPU the virtual CPU id 0 */
Cpu_desc const * const boot = cpu_desc_of_cpu(boot_cpu);
if (!boot)
return remap_failure();
map_cpus[cpu_i++] = (uint16_t)boot_cpu;
if (cpu_i >= num_cpus)
return true;
if (cpu_i >= max_cpus)
return remap_failure();
/* assign cores + SMT threads first and skip E-cores */
bool done = for_all_cpus([&](auto const &cpu, auto const kernel_cpu_id) {
if (kernel_cpu_id == boot_cpu)
return false;
/* handle normal or P-core */
if (cpu.e_core())
return false;
map_cpus[cpu_i++] = (uint16_t)kernel_cpu_id;
too_many_cpus = !!(cpu_i >= max_cpus);
return (cpu_i >= num_cpus || too_many_cpus);
});
if (done)
return too_many_cpus ? remap_failure() : true;
/* assign remaining E-cores */
done = for_all_cpus([&](auto &cpu, auto &kernel_cpu_id) {
if (kernel_cpu_id == boot_cpu)
return false;
/* handle solely E-core */
if (!cpu.e_core())
return false;
map_cpus[cpu_i++] = (uint16_t)kernel_cpu_id;
too_many_cpus = !!(cpu_i >= max_cpus);
return (cpu_i >= num_cpus || too_many_cpus);
});
return too_many_cpus ? remap_failure() : done;
}
/**
* Iterate over all CPUs in an _always_ _consistent_ order.
*/
bool for_all_cpus(auto const &fn) const
{
for (uint16_t package = 0; package <= 255; package++) {
for (uint16_t core = 0; core <= 255; core++) {
for (uint16_t thread = 0; thread <= 255; thread++) {
for (unsigned i = 0; i < cpu_max(); i++) {
if (!is_cpu_enabled(i))
continue;
auto const cpu = cpu_desc_of_cpu(i);
if (!cpu)
continue;
if (!(cpu->package == package && cpu->core == core &&
cpu->thread == thread))
continue;
bool done = fn(*cpu, i);
if (done)
return done;
}
}
}
}
return false;
}
void for_each_enabled_cpu(auto const &fn) const
{
for (unsigned i = 0; i < cpu_max(); i++) {
Cpu_desc const * cpu = cpu_desc_of_cpu(i);
if (!is_cpu_enabled(i)) continue;
if (!cpu) return;
fn(*cpu, i);
}
}
} __attribute__((packed));
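/*
* Usage sketch (illustrative only): deriving a logical CPU-id mapping from the
* hypervisor information page. 'hip' and 'boot_cpu' are assumed to be known from
* platform startup; MAX_CPUS is an arbitrary upper bound chosen for this example.
*
*   enum { MAX_CPUS = 64 };
*   uint16_t map_cpus[MAX_CPUS];
*   bool const remapped = hip->remap_cpu_ids(map_cpus, MAX_CPUS, boot_cpu);
*
*   hip->for_each_enabled_cpu([&] (Hip::Cpu_desc const &cpu, unsigned kernel_id) {
*       // cpu.package, cpu.core, cpu.thread describe the CPU's topology position
*   });
*/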
/**
* Semaphore operations
*/
enum Sem_op { SEMAPHORE_UP = 0U, SEMAPHORE_DOWN = 1U, SEMAPHORE_DOWNZERO = 0x3U };
/**
* Ec operations
*/
enum Ec_op {
EC_RECALL = 0U,
EC_YIELD = 1U,
EC_DONATE_SC = 2U,
EC_RESCHEDULE = 3U,
EC_MIGRATE = 4U,
EC_TIME = 5U,
EC_GET_VCPU_STATE = 6U,
EC_SET_VCPU_STATE = 7U,
EC_MSR_ACCESS = 8U
};
enum Sc_op {
SC_TIME_IDLE = 0,
SC_TIME_CROSS = 1,
SC_TIME_KILLED = 2,
SC_EC_TIME = 3,
};
/**
* Pd operations
*/
enum Pd_op { TRANSFER_QUOTA = 0U, PD_DEBUG = 2U };
/**
* Hpc operations
*
*/
enum Hpc_op
{
HPC_SETUP = 9U,
HPC_START = 10U,
HPC_STOP = 11U,
HPC_RESET = 12U,
HPC_READ = 13U,
};
/**
* Cell operations
*/
enum Cell_op
{
SHRINK = 0,
GROW = 1,
};
class Gsi_flags
{
private:
uint8_t _value { 0 };
public:
enum Mode { HIGH, LOW, EDGE };
Gsi_flags() { }
Gsi_flags(Mode m)
{
switch (m) {
case HIGH: _value = 0b110; break; /* level-high */
case LOW: _value = 0b111; break; /* level-low */
case EDGE: _value = 0b100; break; /* edge-triggered */
}
}
uint8_t value() const { return _value; }
};
class Descriptor
{
protected:
mword_t _value { 0 };
/**
* Assign bitfield to descriptor
*/
template<mword_t MASK, mword_t SHIFT>
void _assign(mword_t new_bits)
{
_value &= ~(MASK << SHIFT);
_value |= (new_bits & MASK) << SHIFT;
}
/**
* Query bitfield from descriptor
*/
template<mword_t MASK, mword_t SHIFT>
mword_t _query() const { return (_value >> SHIFT) & MASK; }
public:
mword_t value() const { return _value; }
} __attribute__((packed));
/**
* Message-transfer descriptor
*/
class Mtd
{
private:
mword_t const _value;
public:
enum {
ACDB = 1U << 0, /* eax, ecx, edx, ebx */
EBSD = 1U << 1, /* ebp, esi, edi */
ESP = 1U << 2,
EIP = 1U << 3,
EFL = 1U << 4, /* eflags */
ESDS = 1U << 5,
FSGS = 1U << 6,
CSSS = 1U << 7,
TR = 1U << 8,
LDTR = 1U << 9,
GDTR = 1U << 10,
IDTR = 1U << 11,
CR = 1U << 12,
DR = 1U << 13, /* DR7 */
SYS = 1U << 14, /* Sysenter MSRs CS, ESP, EIP */
QUAL = 1U << 15, /* exit qualification */
CTRL = 1U << 16, /* execution controls */
INJ = 1U << 17, /* injection info */
STA = 1U << 18, /* interruptibility state */
TSC = 1U << 19, /* time-stamp counter */
EFER = 1U << 20, /* EFER MSR */
PDPTE = 1U << 21, /* PDPTE0 .. PDPTE3 */
R8_R15 = 1U << 22, /* R8 .. R15 */
SYSCALL_SWAPGS = 1U << 23, /* SYSCALL and SWAPGS MSRs */
TPR = 1U << 24, /* TPR and TPR threshold */
TSC_AUX = 1U << 25, /* IA32_TSC_AUX used by rdtscp */
XSAVE = 1U << 26, /* XCR and XSS used with XSAVE */
FPU = 1U << 31, /* FPU state */
IRQ = EFL | STA | INJ | TSC,
ALL = (0x000fffff & ~CTRL) | EFER | R8_R15 | SYSCALL_SWAPGS | TPR,
};
Mtd(mword_t value) : _value(value) { }
mword_t value() const { return _value; }
};
class Crd : public Descriptor
{
protected:
/**
* Bitfield holding the descriptor type
*/
enum {
TYPE_MASK = 0x3, TYPE_SHIFT = 0,
BASE_SHIFT = 12, RIGHTS_MASK = 0x1f,
ORDER_MASK = 0x1f, ORDER_SHIFT = 7,
BASE_MASK = (~0UL) >> BASE_SHIFT,
RIGHTS_SHIFT= 2
};
/**
* Capability-range-descriptor types
*/
enum {
NULL_CRD_TYPE = 0,
MEM_CRD_TYPE = 1,
IO_CRD_TYPE = 2,
OBJ_CRD_TYPE = 3,
RIGHTS_ALL = 0x1f,
};
void _base(mword_t base)
{ _assign<BASE_MASK, BASE_SHIFT>(base); }
void _order(mword_t order)
{ _assign<ORDER_MASK, ORDER_SHIFT>(order); }
public:
Crd(mword_t base, mword_t order) {
_value = 0; _base(base), _order(order); }
Crd(mword_t value) { _value = value; }
mword_t hotspot(mword_t sel_hotspot) const
{
if ((value() & TYPE_MASK) == MEM_CRD_TYPE)
return sel_hotspot & PAGE_MASK_;
return sel_hotspot << 12;
}
mword_t addr() const { return base() << BASE_SHIFT; }
mword_t base() const { return _query<BASE_MASK, BASE_SHIFT>(); }
mword_t order() const { return _query<ORDER_MASK, ORDER_SHIFT>(); }
bool is_null() const { return (_value & TYPE_MASK) == NULL_CRD_TYPE; }
uint8_t type() const { return (uint8_t)_query<TYPE_MASK, TYPE_SHIFT>(); }
uint8_t rights() const { return (uint8_t)_query<RIGHTS_MASK, RIGHTS_SHIFT>(); }
} __attribute__((packed));
class Rights
{
private:
bool const _readable, _writeable, _executable;
public:
Rights(bool readable, bool writeable, bool executable)
: _readable(readable), _writeable(writeable),
_executable(executable) { }
Rights() : _readable(false), _writeable(false), _executable(false) {}
bool readable() const { return _readable; }
bool writeable() const { return _writeable; }
bool executable() const { return _executable; }
};
/**
* Memory-capability-range descriptor
*/
class Mem_crd : public Crd
{
private:
enum {
EXEC_MASK = 0x1, EXEC_SHIFT = 4,
WRITE_MASK = 0x1, WRITE_SHIFT = 3,
READ_MASK = 0x1, READ_SHIFT = 2
};
void _rights(Rights r)
{
_assign<EXEC_MASK, EXEC_SHIFT>(r.executable());
_assign<WRITE_MASK, WRITE_SHIFT>(r.writeable());
_assign<READ_MASK, READ_SHIFT>(r.readable());
}
public:
Mem_crd(mword_t base, mword_t order, Rights rights = Rights())
: Crd(base, order)
{
_rights(rights);
_assign<TYPE_MASK, TYPE_SHIFT>(MEM_CRD_TYPE);
}
Rights rights() const
{
return Rights(_query<READ_MASK, READ_SHIFT>(),
_query<WRITE_MASK, WRITE_SHIFT>(),
_query<EXEC_MASK, EXEC_SHIFT>());
}
};
/**
* I/O-capability-range descriptor
*/
class Io_crd : public Crd
{
public:
Io_crd(mword_t base, mword_t order)
: Crd(base, order)
{
_assign<TYPE_MASK, TYPE_SHIFT>(IO_CRD_TYPE);
_assign<RIGHTS_MASK, RIGHTS_SHIFT>(RIGHTS_ALL);
}
};
class Obj_crd : public Crd
{
public:
enum {
RIGHT_EC_RECALL = 0x1U,
RIGHT_PT_CALL = 0x2U,
RIGHT_PT_CTRL = 0x1U,
RIGHT_PT_XCPU = 0x10U,
RIGHT_SM_UP = 0x1U,
RIGHT_SM_DOWN = 0x2U
};
Obj_crd() : Crd(0, 0)
{
_assign<TYPE_MASK, TYPE_SHIFT>(NULL_CRD_TYPE);
}
Obj_crd(mword_t base, mword_t order,
mword_t rights = RIGHTS_ALL)
: Crd(base, order)
{
_assign<TYPE_MASK, TYPE_SHIFT>(OBJ_CRD_TYPE);
_assign<RIGHTS_MASK, RIGHTS_SHIFT>(rights);
}
};
/**
* Quantum-priority descriptor
*/
class Qpd : public Descriptor
{
private:
enum {
PRIORITY_MASK = 0xff, PRIORITY_SHIFT = 0,
QUANTUM_SHIFT = 12,
QUANTUM_MASK = (~0UL) >> QUANTUM_SHIFT
};
void _quantum(mword_t quantum)
{ _assign<QUANTUM_MASK, QUANTUM_SHIFT>(quantum); }
void _priority(mword_t priority)
{ _assign<PRIORITY_MASK, PRIORITY_SHIFT>(priority); }
public:
enum { DEFAULT_QUANTUM = 10000, DEFAULT_PRIORITY = 64 };
Qpd(mword_t quantum = DEFAULT_QUANTUM,
mword_t priority = DEFAULT_PRIORITY)
{
_value = 0;
_quantum(quantum), _priority(priority);
}
mword_t quantum() const { return _query<QUANTUM_MASK, QUANTUM_SHIFT>(); }
mword_t priority() const { return _query<PRIORITY_MASK, PRIORITY_SHIFT>(); }
};
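/*
* Usage sketch (illustrative only): encoding a quantum-priority descriptor, e.g. to
* be handed to the SC-creation syscall binding. The quantum value is arbitrary.
*
*   Qpd qpd(20000, Qpd::DEFAULT_PRIORITY);   // 20000-tick quantum, default priority
*   mword_t const encoded = qpd.value();     // raw value as passed to the kernel
*/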
/**
* User-level thread-control block
*/
struct Utcb
{
/**
* Return physical size of UTCB in bytes
*/
static constexpr mword_t size() { return 4096; }
/**
* The number of untyped items occupies the lowest 16 bits, the number of
* typed items occupies bits 16-31; bits 32+ are ignored on 64 bit
*/
mword_t items;
Crd crd_xlt; /* receive capability-range descriptor for translation */
Crd crd_rcv; /* receive capability-range descriptor for delegation */
mword_t tls;
/**
* Data area
*
* The UTCB entries following the header hold message payload (normal
* IDC operations) or architectural state (exception handling).
*/
union {
/* exception state */
struct {
mword_t mtd, instr_len, ip, flags;
unsigned intr_state, actv_state, inj_info, inj_error;
mword_t ax, cx, dx, bx;
mword_t sp, bp, si, di;
#ifdef __x86_64__
mword_t r8, r9, r10, r11, r12, r13, r14, r15;
#endif
unsigned long long qual[2]; /* exit qualification */
unsigned ctrl[2];
mword_t cr0, cr2, cr3, cr4;
unsigned long long xcr0, xss;
mword_t pdpte[4];
#ifdef __x86_64__
mword_t cr8, efer;
unsigned long long star;
unsigned long long lstar;
unsigned long long cstar;
unsigned long long fmask;
unsigned long long kernel_gs_base;
unsigned tpr;
unsigned tpr_threshold;
#endif
mword_t dr7, sysenter_cs, sysenter_sp, sysenter_ip;
struct {
unsigned short sel, ar;
unsigned limit;
mword_t base;
#ifndef __x86_64__
mword_t reserved;
#endif
} es, cs, ss, ds, fs, gs, ldtr, tr;
struct {
unsigned reserved0;
unsigned limit;
mword_t base;
#ifndef __x86_64__
mword_t reserved1;
#endif
} gdtr, idtr;
unsigned long long tsc_val, tsc_off, tsc_aux;
unsigned long long exit_reason;
uint8_t fpu[2560];
} __attribute__((packed));
mword_t mr[(4096 - 4 * sizeof(mword_t)) / sizeof(mword_t)];
};
/* message payload */
mword_t * msg() { return mr; }
struct Item {
mword_t crd;
mword_t hotspot;
bool is_del() const { return hotspot & 0x1; }
};
#ifdef __x86_64__
uint64_t read_r8() const { return r8; }
uint64_t read_r9() const { return r9; }
uint64_t read_r10() const { return r10; }
uint64_t read_r11() const { return r11; }
uint64_t read_r12() const { return r12; }
uint64_t read_r13() const { return r13; }
uint64_t read_r14() const { return r14; }
uint64_t read_r15() const { return r15; }
mword_t read_efer() const { return efer; }
uint64_t read_star() const { return star; }
uint64_t read_lstar() const { return lstar; }
uint64_t read_cstar() const { return cstar; }
uint64_t read_fmask() const { return fmask; }
uint64_t read_kernel_gs_base() const { return kernel_gs_base; }
uint32_t read_tpr() const { return tpr; }
uint32_t read_tpr_threshold() const { return tpr_threshold; }
void write_r8 (uint64_t value) { r8 = value; }
void write_r9 (uint64_t value) { r9 = value; }
void write_r10 (uint64_t value) { r10 = value; }
void write_r11 (uint64_t value) { r11 = value; }
void write_r12 (uint64_t value) { r12 = value; }
void write_r13 (uint64_t value) { r13 = value; }
void write_r14 (uint64_t value) { r14 = value; }
void write_r15 (uint64_t value) { r15 = value; }
void write_efer (mword_t value) { efer = value; }
void write_star (uint64_t value) { star = value; }
void write_lstar (uint64_t value) { lstar = value; }
void write_cstar (uint64_t value) { cstar = value; }
void write_fmask (uint64_t value) { fmask = value; }
void write_kernel_gs_base (uint64_t value) { kernel_gs_base = value; }
void write_tpr (uint32_t value) { tpr = value; }
void write_tpr_threshold (uint32_t value) { tpr_threshold = value; }
#else
uint64_t read_r8() const { return 0; }
uint64_t read_r9() const { return 0; }
uint64_t read_r10() const { return 0; }
uint64_t read_r11() const { return 0; }
uint64_t read_r12() const { return 0; }
uint64_t read_r13() const { return 0; }
uint64_t read_r14() const { return 0; }
uint64_t read_r15() const { return 0; }
mword_t read_efer() const { return 0; }
uint64_t read_star() const { return 0; }
uint64_t read_lstar() const { return 0; }
uint64_t read_cstar() const { return 0; }
uint64_t read_fmask() const { return 0; }
uint64_t read_kernel_gs_base() const { return 0; }
uint32_t read_tpr() const { return 0; }
uint32_t read_tpr_threshold() const { return 0; }
void write_r8 (uint64_t) { }
void write_r9 (uint64_t) { }
void write_r10 (uint64_t) { }
void write_r11 (uint64_t) { }
void write_r12 (uint64_t) { }
void write_r13 (uint64_t) { }
void write_r14 (uint64_t) { }
void write_r15 (uint64_t) { }
void write_efer (mword_t) { }
void write_star (uint64_t) { }
void write_lstar (uint64_t) { }
void write_cstar (uint64_t) { }
void write_fmask (uint64_t) { }
void write_kernel_gs_base (uint64_t) { }
void write_tpr (uint32_t) { }
void write_tpr_threshold (uint32_t) { }
#endif
/**
* Set number of untyped message words
*
* Calling this function has the side effect of removing all typed
* message items from the message buffer.
*/
void set_msg_word(mword_t const num) { items = num; }
/**
* Return current number of message words in UTCB
*/
unsigned msg_words() { return items & 0xffffU; }
/**
* Return current number of message items on UTCB
*/
unsigned msg_items() { return (unsigned)(items >> 16); }
/**
* Append a message-transfer item to the message buffer
*
* \param crd capability-range descriptor to transfer
* \param sel_hotspot hotspot denoting where the transferred range should appear at the receiver
* \return true on success, false if the UTCB has no space left for the item
*/
__attribute__((warn_unused_result))
bool append_item(Crd crd, mword_t sel_hotspot,
bool kern_pd = false,
bool update_guest_pt = false,
bool translate_map = false,
bool dma_mem = false,
bool write_combined = false)
{
/* transfer items start at the end of the UTCB */
items += 1 << 16;
Item *item = reinterpret_cast<Item *>(this);
item += (PAGE_SIZE_BYTE / sizeof(struct Item)) - msg_items();
/* check that there is enough space left on UTCB */
if (msg() + msg_words() >= reinterpret_cast<mword_t *>(item)) {
items -= 1 << 16;
return false;
}
/* map from hypervisor or current pd */
unsigned h = kern_pd ? (1 << 11) : 0;
/* map write-combined */
unsigned wc = write_combined ? (1 << 10) : 0;
/* update guest page table */
unsigned g = update_guest_pt ? (1 << 9) : 0;
/* mark memory dma able */
unsigned d = dma_mem ? (1 << 8) : 0;
/* set type of delegation, either 'map' or 'translate and map' */
unsigned m = translate_map ? 2 : 1;
item->hotspot = crd.hotspot(sel_hotspot) | g | h | wc | d | m;
item->crd = crd.value();
return true;
}
/**
* Return typed item at position i in UTCB
*
* \param i position of the requested item, starting at 0
*/
Item * get_item(const unsigned i) {
if (i > (PAGE_SIZE_BYTE / sizeof(struct Item))) return 0;
Item * item = reinterpret_cast<Item *>(this) + (PAGE_SIZE_BYTE / sizeof(struct Item)) - i - 1;
if (reinterpret_cast<mword_t *>(item) < this->msg()) return 0;
return item;
}
mword_t mtd_value() const { return static_cast<Mtd>(mtd).value(); }
/**
* Return fault address and type of page-fault message
*/
mword_t pf_addr() const { return (mword_t)qual[1]; }
uint8_t pf_type() const { return (uint8_t)qual[0]; }
};
static_assert(sizeof(Utcb) == 4096, "Unexpected size of UTCB");
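/*
* Usage sketch (illustrative only): delegating one writeable page via the UTCB.
* 'utcb' is assumed to point to the caller's UTCB and 'page' to a page-aligned
* virtual address; both names are assumptions of this example.
*
*   utcb->set_msg_word(0);   // no untyped words, implicitly drops stale items
*   Mem_crd const crd(page >> PAGE_SIZE_LOG2, 0, Rights(true, true, false));
*   if (!utcb->append_item(crd, page)) {
*       // UTCB ran out of space for the transfer item
*   }
*/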
/**
* Size of event-specific portal window mapped at PD creation time
*/
enum {
NUM_INITIAL_PT_LOG2 = 5,
NUM_INITIAL_PT = 1UL << NUM_INITIAL_PT_LOG2,
NUM_INITIAL_PT_RESERVED = 2 * NUM_INITIAL_PT,
NUM_INITIAL_VCPU_PT_LOG2 = 8,
NUM_INITIAL_VCPU_PT = 1UL << NUM_INITIAL_VCPU_PT_LOG2,
};
/**
* Event-specific capability selectors
*/
enum {
PT_SEL_PAGE_FAULT = 0xe,
PT_SEL_PARENT = 0x1a, /* convention on Genode */
EC_SEL_THREAD = 0x1c, /* convention on Genode */
PT_SEL_STARTUP = 0x1e,
SM_SEL_SIGNAL = 0x1e, /* alias of PT_SEL_STARTUP */
PT_SEL_RECALL = 0x1f,
SM_SEL_EC = 0x1d, /* convention on Genode */
};
}
namespace Genode {
static inline void print(Output &out, Tukija::Tip::Memory_region &mem) {
print(out, " | ", Hex(reinterpret_cast<unsigned long long>(mem.start)), " - ", Hex(reinterpret_cast<unsigned long long>(mem.end)));
}
static inline void print(Output &out, Tukija::Cpuset &cpus) {
print(out, "CPUs [ ");
cpus.for_each([&](long cpu)
{ print(out, cpu, " "); });
print(out, "]");
}
static inline void print(Output &out, Tukija::Tip::Domain &dom) {
print(out, "Domain ", dom.id, ": ");
print(out, dom.cpus);
print(out, " ", dom.num_mem_descriptors, " memory regions");
dom.for_each_mem([&](Tukija::Tip::Memory_region &mem)
{ print(out, mem); });
}
}
#endif /* _INCLUDE__NOVA__SYSCALL_GENERIC_H_ */