mirror of
https://github.com/mmueller41/mxtasking.git
synced 2026-01-21 12:42:57 +01:00
Initial commit
This commit is contained in:
127
.clang-format
Normal file
127
.clang-format
Normal file
@@ -0,0 +1,127 @@
|
||||
---
|
||||
Language: Cpp
|
||||
# BasedOnStyle: Microsoft
|
||||
AccessModifierOffset: -4
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveMacros: false
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlines: Right
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: true
|
||||
AllowAllArgumentsOnNextLine: true
|
||||
AllowAllConstructorInitializersOnNextLine: true
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: false
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: InlineOnly
|
||||
AllowShortLambdasOnASingleLine: All
|
||||
AllowShortIfStatementsOnASingleLine: Never
|
||||
AllowShortLoopsOnASingleLine: false
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: false
|
||||
AlwaysBreakTemplateDeclarations: MultiLine
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BraceWrapping:
|
||||
AfterCaseLabel: false
|
||||
AfterClass: true
|
||||
AfterControlStatement: true
|
||||
AfterEnum: true
|
||||
AfterFunction: true
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: true
|
||||
AfterStruct: true
|
||||
AfterUnion: false
|
||||
AfterExternBlock: true
|
||||
BeforeCatch: true
|
||||
BeforeElse: true
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Custom
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakInheritanceList: BeforeColon
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: BeforeColon
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: true
|
||||
ColumnLimit: 120
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: false
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DerivePointerAlignment: false
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: true
|
||||
ForEachMacros:
|
||||
- foreach
|
||||
- Q_FOREACH
|
||||
- BOOST_FOREACH
|
||||
IncludeBlocks: Preserve
|
||||
IncludeCategories:
|
||||
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
|
||||
Priority: 2
|
||||
- Regex: '^(<|"(gtest|gmock|isl|json)/)'
|
||||
Priority: 3
|
||||
- Regex: '.*'
|
||||
Priority: 1
|
||||
IncludeIsMainRegex: '(Test)?$'
|
||||
IndentCaseLabels: false
|
||||
IndentPPDirectives: None
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: true
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBinPackProtocolList: Auto
|
||||
ObjCBlockIndentWidth: 2
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PenaltyBreakAssignment: 2
|
||||
PenaltyBreakBeforeFirstCallParameter: 19
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyBreakTemplateDeclaration: 10
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 1000
|
||||
PointerAlignment: Right
|
||||
ReflowComments: true
|
||||
SortIncludes: true
|
||||
SortUsingDeclarations: true
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterLogicalNot: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCpp11BracedList: false
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 1
|
||||
SpacesInAngles: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
Standard: Cpp11
|
||||
StatementMacros:
|
||||
- Q_UNUSED
|
||||
- QT_REQUIRE_VERSION
|
||||
TabWidth: 4
|
||||
UseTab: Never
|
||||
...
|
||||
|
||||
14
.clang-tidy
Normal file
14
.clang-tidy
Normal file
@@ -0,0 +1,14 @@
|
||||
---
|
||||
Checks: '
|
||||
-*,
|
||||
clang-analyzer-*,
|
||||
modernize-*,
|
||||
performance-*,
|
||||
readability-*,
|
||||
concurrency-*,
|
||||
-modernize-use-trailing-return-type,
|
||||
-google-build-using-namespace,
|
||||
-readability-simplify-boolean-expr,
|
||||
-readability-magic-numbers,
|
||||
-clang-analyzer-core.DivideZero,
|
||||
'
|
||||
12
.gitignore
vendored
Normal file
12
.gitignore
vendored
Normal file
@@ -0,0 +1,12 @@
|
||||
workloads/
|
||||
YCSB/
|
||||
cmake-build-debug/
|
||||
cmake_install.cmake
|
||||
CMakeCache.txt
|
||||
Makefile
|
||||
CMakeFiles/
|
||||
bin/
|
||||
ycsb_binding/bin
|
||||
lib/*.so
|
||||
*.cbp
|
||||
./idea
|
||||
107
CMakeLists.txt
Normal file
107
CMakeLists.txt
Normal file
@@ -0,0 +1,107 @@
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
project(mxtasking)
|
||||
|
||||
# Check SSE is available
|
||||
INCLUDE(scripts/FindSSE.cmake)
|
||||
FindSSE()
|
||||
|
||||
# Set compile flags
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_C_COMPILER clang)
|
||||
set(CMAKE_CXX_COMPILER clang++)
|
||||
set(CMAKE_CXX_CLANG_TIDY "clang-tidy;--extra-arg-before=-std=c++17 --system-headers=0")
|
||||
set(CMAKE_CXX_FLAGS "-pedantic -Wall -Wextra -Werror \
|
||||
-Wno-invalid-offsetof -Wcast-align -Wcast-qual -Wctor-dtor-privacy -Wdisabled-optimization \
|
||||
-Wformat=2 -Winit-self -Wmissing-declarations -Wmissing-include-dirs -Woverloaded-virtual \
|
||||
-Wredundant-decls -Wshadow -Wsign-promo -Wstrict-overflow=5 -Wswitch-default -Wundef \
|
||||
-Wno-unused -Wold-style-cast -Wno-uninitialized")
|
||||
|
||||
# Set compile flag for x86_64
|
||||
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native")
|
||||
endif()
|
||||
|
||||
# Set SSE flag if available
|
||||
IF(SSE4_2_FOUND)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -DUSE_SSE2")
|
||||
ENDIF(SSE4_2_FOUND)
|
||||
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG -flto")
|
||||
set(CMAKE_BUILD_TYPE RELEASE)
|
||||
|
||||
# Directories for output binaries and libraries
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
|
||||
|
||||
# External libraries
|
||||
find_library(GTEST gtest)
|
||||
|
||||
# Include folders
|
||||
include_directories(src/ lib/)
|
||||
|
||||
# Source files
|
||||
SET(MX_TASKING_SRC
|
||||
src/mx/resource/builder.cpp
|
||||
src/mx/tasking/scheduler.cpp
|
||||
src/mx/tasking/worker.cpp
|
||||
src/mx/tasking/profiling/profiling_task.cpp
|
||||
src/mx/util/core_set.cpp
|
||||
src/mx/util/random.cpp
|
||||
src/mx/memory/dynamic_size_allocator.cpp
|
||||
src/mx/memory/reclamation/epoch_manager.cpp
|
||||
)
|
||||
|
||||
SET(MX_BENCHMARKING_SRC
|
||||
src/benchmark/workload_set.cpp
|
||||
src/benchmark/workload.cpp
|
||||
src/benchmark/cores.cpp
|
||||
src/benchmark/perf.cpp
|
||||
src/benchmark/string_util.cpp
|
||||
)
|
||||
|
||||
# Build libraries
|
||||
add_library(mxtasking SHARED ${MX_TASKING_SRC})
|
||||
add_library(mxbenchmarking SHARED ${MX_BENCHMARKING_SRC})
|
||||
|
||||
# Build executables
|
||||
add_executable(blinktree_benchmark
|
||||
src/application/blinktree_benchmark/main.cpp
|
||||
src/application/blinktree_benchmark/benchmark.cpp
|
||||
)
|
||||
|
||||
add_executable(hashjoin_benchmark
|
||||
src/application/hashjoin_benchmark/main.cpp
|
||||
src/application/hashjoin_benchmark/benchmark.cpp
|
||||
src/application/hashjoin_benchmark/merge_task.cpp
|
||||
src/application/hashjoin_benchmark/tpch_table_reader.cpp
|
||||
src/application/hashjoin_benchmark/notifier.cpp
|
||||
)
|
||||
|
||||
# Link executables
|
||||
target_link_libraries(blinktree_benchmark pthread numa atomic mxtasking mxbenchmarking)
|
||||
target_link_libraries(hashjoin_benchmark pthread numa atomic mxtasking mxbenchmarking)
|
||||
|
||||
# Add tests
|
||||
if (GTEST)
|
||||
set(TESTS
|
||||
test/mx/memory/alignment_helper.test.cpp
|
||||
test/mx/memory/dynamic_size_allocator.test.cpp
|
||||
test/mx/memory/fixed_size_allocator.test.cpp
|
||||
test/mx/memory/tagged_ptr.test.cpp
|
||||
test/mx/util/aligned_t.test.cpp
|
||||
test/mx/util/mpsc_queue.test.cpp
|
||||
test/mx/util/queue.test.cpp
|
||||
test/mx/util/core_set.test.cpp
|
||||
test/mx/util/vector.test.cpp
|
||||
)
|
||||
|
||||
add_executable(mxtests test/test.cpp ${TESTS})
|
||||
target_link_libraries(mxtests pthread numa atomic mxtasking mxbenchmarking gtest)
|
||||
else()
|
||||
message("Library 'gtest' not found. Please install 'libgtest-dev' for unit tests.")
|
||||
endif()
|
||||
|
||||
# Custom targets
|
||||
add_custom_target(ycsb-a ${CMAKE_SOURCE_DIR}/scripts/generate_ycsb a randint)
|
||||
add_custom_target(ycsb-c ${CMAKE_SOURCE_DIR}/scripts/generate_ycsb c randint)
|
||||
50
README.md
Normal file
50
README.md
Normal file
@@ -0,0 +1,50 @@
|
||||
# MxTasking: Task-based framework with built-in prefetching and synchronization
|
||||
|
||||
MxTasking is a task-based framework that assists the design of latch-free and parallel data structures.
|
||||
MxTasking eases the information exchange between applications and the operating system, resulting in novel opportunities to manage resources in a truly hardware- and application-conscious way.
|
||||
|
||||
# Cite
|
||||
The code was used for our SIGMOD'21 paper.
|
||||
|
||||
Jan Mühlig and Jens Teubner. 2021. MxTasks: How to Make Efficient Synchronization and Prefetching Easy. In Proceedings of the 2021 International Conference on Management of Data. [[PDF]](http://dbis.cs.tu-dortmund.de/TODO)
|
||||
|
||||
@inproceedings{muehlig2021mxtasks,
|
||||
author = {Jan Mühlig and Jens Teubner},
|
||||
title = {MxTasks: How to Make Efficient Synchronization and Prefetching Easy},
|
||||
booktitle = {Proceedings of the 2021 International Conference on Management of Data},
|
||||
year = {2021}
|
||||
}
|
||||
|
||||
## Dependencies
|
||||
### For building
|
||||
#### Required
|
||||
* `cmake` `>= 3.10`
|
||||
* `clang` `>= 10`
|
||||
* `clang-tidy` `>= 10`
|
||||
* `libnuma` or `libnuma-dev`
|
||||
|
||||
#### Optional
|
||||
* `libgtest-dev` for tests in `test/`
|
||||
|
||||
### For generating the YCSB workload
|
||||
* `python` `>= 3`
|
||||
* `java`
|
||||
* `curl`
|
||||
|
||||
## How to build
|
||||
* Call `cmake .` to generate `Makefile`.
|
||||
* Call `make` to generate all binaries.
|
||||
|
||||
## How to run
|
||||
For detailed information please see README files in `src/application/<app>` folders:
|
||||
* [B Link Tree benchmark](src/application/blinktree_benchmark/README.md) (`src/application/blinktree_benchmark`)
|
||||
* [Hash Join benchmark](src/application/hashjoin_benchmark/README.md) (`src/application/hashjoin_benchmark`)
|
||||
|
||||
### Simple example for B Link Tree
|
||||
* Call `make ycsb-a` to generate the default workload
|
||||
* Call `./bin/blinktree_benchmark 1:4` to run benchmark for one to four cores.
|
||||
|
||||
## External Libraries
|
||||
* `argparse` ([view on github](https://github.com/p-ranav/argparse)) under MIT license
|
||||
* `json` ([view on github](https://github.com/nlohmann/json)) under MIT license
|
||||
* Yahoo! Cloud Serving Benchmark ([view on github](https://github.com/brianfrankcooper/YCSB)) under Apache License 2.0
|
||||
619
lib/argparse.hpp
Normal file
619
lib/argparse.hpp
Normal file
@@ -0,0 +1,619 @@
|
||||
/*
|
||||
__ _ _ __ __ _ _ __ __ _ _ __ ___ ___
|
||||
/ _` | '__/ _` | '_ \ / _` | '__/ __|/ _ \ Argument Parser for Modern C++
|
||||
| (_| | | | (_| | |_) | (_| | | \__ \ __/ http://github.com/p-ranav/argparse
|
||||
\__,_|_| \__, | .__/ \__,_|_| |___/\___|
|
||||
|___/|_|
|
||||
|
||||
Licensed under the MIT License <http://opensource.org/licenses/MIT>.
|
||||
SPDX-License-Identifier: MIT
|
||||
Copyright (c) 2019 Pranav Srinivas Kumar <pranav.srinivas.kumar@gmail.com>.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <any>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
namespace argparse {
|
||||
|
||||
namespace details { // namespace for helper methods
|
||||
|
||||
template <typename... Ts> struct is_container_helper {};
|
||||
|
||||
template <typename T, typename _ = void>
|
||||
struct is_container : std::false_type {};
|
||||
|
||||
template <> struct is_container<std::string> : std::false_type {};
|
||||
|
||||
template <typename T>
|
||||
struct is_container<
|
||||
T,
|
||||
std::conditional_t<false,
|
||||
is_container_helper<typename T::value_type,
|
||||
decltype(std::declval<T>().begin()),
|
||||
decltype(std::declval<T>().end()),
|
||||
decltype(std::declval<T>().size())>,
|
||||
void>> : std::true_type {};
|
||||
|
||||
template <typename T>
|
||||
static constexpr bool is_container_v = is_container<T>::value;
|
||||
|
||||
template <typename T>
|
||||
struct is_string_like
|
||||
: std::conjunction<std::is_constructible<std::string, T>,
|
||||
std::is_convertible<T, std::string_view>> {};
|
||||
|
||||
template <class F, class Tuple, class Extra, size_t... I>
|
||||
constexpr decltype(auto) apply_plus_one_impl(F &&f, Tuple &&t, Extra &&x,
|
||||
std::index_sequence<I...>) {
|
||||
return std::invoke(std::forward<F>(f), std::get<I>(std::forward<Tuple>(t))...,
|
||||
std::forward<Extra>(x));
|
||||
}
|
||||
|
||||
template <class F, class Tuple, class Extra>
|
||||
constexpr decltype(auto) apply_plus_one(F &&f, Tuple &&t, Extra &&x) {
|
||||
return details::apply_plus_one_impl(
|
||||
std::forward<F>(f), std::forward<Tuple>(t), std::forward<Extra>(x),
|
||||
std::make_index_sequence<
|
||||
std::tuple_size_v<std::remove_reference_t<Tuple>>>{});
|
||||
}
|
||||
|
||||
} // namespace details
|
||||
|
||||
class ArgumentParser;
|
||||
|
||||
class Argument {
|
||||
friend class ArgumentParser;
|
||||
friend auto operator<<(std::ostream &, ArgumentParser const &)
|
||||
-> std::ostream &;
|
||||
|
||||
template <size_t N, size_t... I>
|
||||
explicit Argument(std::string(&&a)[N], std::index_sequence<I...>)
|
||||
: mIsOptional((is_optional(a[I]) || ...)), mIsRequired(false),
|
||||
mIsUsed(false) {
|
||||
((void)mNames.push_back(std::move(a[I])), ...);
|
||||
std::sort(
|
||||
mNames.begin(), mNames.end(), [](const auto &lhs, const auto &rhs) {
|
||||
return lhs.size() == rhs.size() ? lhs < rhs : lhs.size() < rhs.size();
|
||||
});
|
||||
}
|
||||
|
||||
public:
|
||||
Argument() = default;
|
||||
|
||||
template <typename... Args,
|
||||
std::enable_if_t<
|
||||
std::conjunction_v<details::is_string_like<Args>...>, int> = 0>
|
||||
explicit Argument(Args &&... args)
|
||||
: Argument({std::string(std::forward<Args>(args))...},
|
||||
std::make_index_sequence<sizeof...(Args)>{}) {}
|
||||
|
||||
Argument &help(std::string aHelp) {
|
||||
mHelp = std::move(aHelp);
|
||||
return *this;
|
||||
}
|
||||
|
||||
Argument &default_value(std::any aDefaultValue) {
|
||||
mDefaultValue = std::move(aDefaultValue);
|
||||
return *this;
|
||||
}
|
||||
|
||||
Argument &required() {
|
||||
mIsRequired = true;
|
||||
return *this;
|
||||
}
|
||||
|
||||
Argument &implicit_value(std::any aImplicitValue) {
|
||||
mImplicitValue = std::move(aImplicitValue);
|
||||
mNumArgs = 0;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <class F, class... Args>
|
||||
auto action(F &&aAction, Args &&... aBound)
|
||||
-> std::enable_if_t<std::is_invocable_v<F, Args..., std::string const>,
|
||||
Argument &> {
|
||||
using action_type = std::conditional_t<
|
||||
std::is_void_v<std::invoke_result_t<F, Args..., std::string const>>,
|
||||
void_action, valued_action>;
|
||||
if constexpr (sizeof...(Args) == 0)
|
||||
mAction.emplace<action_type>(std::forward<F>(aAction));
|
||||
else
|
||||
mAction.emplace<action_type>(
|
||||
[f = std::forward<F>(aAction),
|
||||
tup = std::make_tuple(std::forward<Args>(aBound)...)](
|
||||
std::string const &opt) mutable {
|
||||
return details::apply_plus_one(f, tup, opt);
|
||||
});
|
||||
return *this;
|
||||
}
|
||||
|
||||
Argument &nargs(int aNumArgs) {
|
||||
if (aNumArgs < 0)
|
||||
throw std::logic_error("Number of arguments must be non-negative");
|
||||
mNumArgs = aNumArgs;
|
||||
return *this;
|
||||
}
|
||||
|
||||
Argument &remaining() {
|
||||
mNumArgs = -1;
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <typename Iterator>
|
||||
Iterator consume(Iterator start, Iterator end, std::string usedName = {}) {
|
||||
if (mIsUsed) {
|
||||
throw std::runtime_error("Duplicate argument");
|
||||
}
|
||||
mIsUsed = true;
|
||||
mUsedName = std::move(usedName);
|
||||
if (mNumArgs == 0) {
|
||||
mValues.emplace_back(mImplicitValue);
|
||||
return start;
|
||||
} else if (mNumArgs <= std::distance(start, end)) {
|
||||
if (auto expected = maybe_nargs()) {
|
||||
end = std::next(start, *expected);
|
||||
if (std::any_of(start, end, Argument::is_optional)) {
|
||||
throw std::runtime_error("optional argument in parameter sequence");
|
||||
}
|
||||
}
|
||||
|
||||
struct action_apply {
|
||||
void operator()(valued_action &f) {
|
||||
std::transform(start, end, std::back_inserter(self.mValues), f);
|
||||
}
|
||||
|
||||
void operator()(void_action &f) {
|
||||
std::for_each(start, end, f);
|
||||
if (!self.mDefaultValue.has_value()) {
|
||||
if (auto expected = self.maybe_nargs())
|
||||
self.mValues.resize(*expected);
|
||||
}
|
||||
}
|
||||
|
||||
Iterator start, end;
|
||||
Argument &self;
|
||||
};
|
||||
std::visit(action_apply{start, end, *this}, mAction);
|
||||
return end;
|
||||
} else if (mDefaultValue.has_value()) {
|
||||
return start;
|
||||
} else {
|
||||
throw std::runtime_error("Too few arguments");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* @throws std::runtime_error if argument values are not valid
|
||||
*/
|
||||
void validate() const {
|
||||
if (auto expected = maybe_nargs()) {
|
||||
if (mIsOptional) {
|
||||
if (mIsUsed && mValues.size() != *expected &&
|
||||
!mDefaultValue.has_value()) {
|
||||
std::stringstream stream;
|
||||
stream << mUsedName << ": expected " << *expected << " argument(s). "
|
||||
<< mValues.size() << " provided.";
|
||||
throw std::runtime_error(stream.str());
|
||||
} else {
|
||||
// TODO: check if an implicit value was programmed for this argument
|
||||
if (!mIsUsed && !mDefaultValue.has_value() && mIsRequired) {
|
||||
std::stringstream stream;
|
||||
stream << mNames[0] << ": required.";
|
||||
throw std::runtime_error(stream.str());
|
||||
}
|
||||
if (mIsUsed && mIsRequired && mValues.size() == 0) {
|
||||
std::stringstream stream;
|
||||
stream << mUsedName << ": no value provided.";
|
||||
throw std::runtime_error(stream.str());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (mValues.size() != expected && !mDefaultValue.has_value()) {
|
||||
std::stringstream stream;
|
||||
stream << mUsedName << ": expected " << *expected << " argument(s). "
|
||||
<< mValues.size() << " provided.";
|
||||
throw std::runtime_error(stream.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto maybe_nargs() const -> std::optional<size_t> {
|
||||
if (mNumArgs < 0)
|
||||
return std::nullopt;
|
||||
else
|
||||
return static_cast<size_t>(mNumArgs);
|
||||
}
|
||||
|
||||
size_t get_arguments_length() const {
|
||||
return std::accumulate(std::begin(mNames), std::end(mNames), size_t(0),
|
||||
[](const auto &sum, const auto &s) {
|
||||
return sum + s.size() +
|
||||
1; // +1 for space between names
|
||||
});
|
||||
}
|
||||
|
||||
friend std::ostream &operator<<(std::ostream &stream,
|
||||
const Argument &argument) {
|
||||
std::stringstream nameStream;
|
||||
std::copy(std::begin(argument.mNames), std::end(argument.mNames),
|
||||
std::ostream_iterator<std::string>(nameStream, " "));
|
||||
stream << nameStream.str() << "\t" << argument.mHelp;
|
||||
if (argument.mIsRequired)
|
||||
stream << "[Required]";
|
||||
stream << "\n";
|
||||
return stream;
|
||||
}
|
||||
|
||||
template <typename T> bool operator!=(const T &aRhs) const {
|
||||
return !(*this == aRhs);
|
||||
}
|
||||
|
||||
/*
|
||||
* Compare to an argument value of known type
|
||||
* @throws std::logic_error in case of incompatible types
|
||||
*/
|
||||
template <typename T> bool operator==(const T &aRhs) const {
|
||||
if constexpr (!details::is_container_v<T>) {
|
||||
return get<T>() == aRhs;
|
||||
} else {
|
||||
using ValueType = typename T::value_type;
|
||||
auto tLhs = get<T>();
|
||||
return std::equal(std::begin(tLhs), std::end(tLhs), std::begin(aRhs),
|
||||
std::end(aRhs), [](const auto &lhs, const auto &rhs) {
|
||||
return std::any_cast<const ValueType &>(lhs) == rhs;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static bool is_integer(const std::string &aValue) {
|
||||
if (aValue.empty() ||
|
||||
((!isdigit(aValue[0])) && (aValue[0] != '-') && (aValue[0] != '+')))
|
||||
return false;
|
||||
char *tPtr;
|
||||
strtol(aValue.c_str(), &tPtr, 10);
|
||||
return (*tPtr == 0);
|
||||
}
|
||||
|
||||
static bool is_float(const std::string &aValue) {
|
||||
std::istringstream tStream(aValue);
|
||||
float tFloat;
|
||||
// noskipws considers leading whitespace invalid
|
||||
tStream >> std::noskipws >> tFloat;
|
||||
// Check the entire string was consumed
|
||||
// and if either failbit or badbit is set
|
||||
return tStream.eof() && !tStream.fail();
|
||||
}
|
||||
|
||||
// If an argument starts with "-" or "--", then it's optional
|
||||
static bool is_optional(const std::string &aName) {
|
||||
return (aName.size() > 1 && aName[0] == '-' && !is_integer(aName) &&
|
||||
!is_float(aName));
|
||||
}
|
||||
|
||||
static bool is_positional(const std::string &aName) {
|
||||
return !is_optional(aName);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get argument value given a type
|
||||
* @throws std::logic_error in case of incompatible types
|
||||
*/
|
||||
template <typename T> T get() const {
|
||||
if (!mValues.empty()) {
|
||||
if constexpr (details::is_container_v<T>)
|
||||
return any_cast_container<T>(mValues);
|
||||
else
|
||||
return std::any_cast<T>(mValues.front());
|
||||
}
|
||||
if (mDefaultValue.has_value()) {
|
||||
return std::any_cast<T>(mDefaultValue);
|
||||
}
|
||||
throw std::logic_error("No value provided");
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static auto any_cast_container(const std::vector<std::any> &aOperand) -> T {
|
||||
using ValueType = typename T::value_type;
|
||||
|
||||
T tResult;
|
||||
std::transform(
|
||||
begin(aOperand), end(aOperand), std::back_inserter(tResult),
|
||||
[](const auto &value) { return std::any_cast<ValueType>(value); });
|
||||
return tResult;
|
||||
}
|
||||
|
||||
std::vector<std::string> mNames;
|
||||
std::string mUsedName;
|
||||
std::string mHelp;
|
||||
std::any mDefaultValue;
|
||||
std::any mImplicitValue;
|
||||
using valued_action = std::function<std::any(const std::string &)>;
|
||||
using void_action = std::function<void(const std::string &)>;
|
||||
std::variant<valued_action, void_action> mAction{
|
||||
std::in_place_type<valued_action>,
|
||||
[](const std::string &aValue) { return aValue; }};
|
||||
std::vector<std::any> mValues;
|
||||
int mNumArgs = 1;
|
||||
bool mIsOptional : 1;
|
||||
bool mIsRequired : 1;
|
||||
bool mIsUsed : 1; // True if the optional argument is used by user
|
||||
|
||||
static constexpr auto mHelpOption = "-h";
|
||||
static constexpr auto mHelpOptionLong = "--help";
|
||||
};
|
||||
|
||||
class ArgumentParser {
|
||||
public:
|
||||
explicit ArgumentParser(std::string aProgramName = {})
|
||||
: mProgramName(std::move(aProgramName)) {
|
||||
add_argument(Argument::mHelpOption, Argument::mHelpOptionLong)
|
||||
.help("show this help message and exit")
|
||||
.nargs(0)
|
||||
.default_value(false)
|
||||
.implicit_value(true);
|
||||
}
|
||||
|
||||
ArgumentParser(ArgumentParser &&) noexcept = default;
|
||||
ArgumentParser &operator=(ArgumentParser &&) = default;
|
||||
|
||||
ArgumentParser(const ArgumentParser &other)
|
||||
: mProgramName(other.mProgramName),
|
||||
mPositionalArguments(other.mPositionalArguments),
|
||||
mOptionalArguments(other.mOptionalArguments) {
|
||||
for (auto it = begin(mPositionalArguments); it != end(mPositionalArguments);
|
||||
++it)
|
||||
index_argument(it);
|
||||
for (auto it = begin(mOptionalArguments); it != end(mOptionalArguments);
|
||||
++it)
|
||||
index_argument(it);
|
||||
}
|
||||
|
||||
ArgumentParser &operator=(const ArgumentParser &other) {
|
||||
auto tmp = other;
|
||||
std::swap(*this, tmp);
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Parameter packing
|
||||
// Call add_argument with variadic number of string arguments
|
||||
template <typename... Targs> Argument &add_argument(Targs... Fargs) {
|
||||
auto tArgument = mOptionalArguments.emplace(cend(mOptionalArguments),
|
||||
std::move(Fargs)...);
|
||||
|
||||
if (!tArgument->mIsOptional)
|
||||
mPositionalArguments.splice(cend(mPositionalArguments),
|
||||
mOptionalArguments, tArgument);
|
||||
|
||||
index_argument(tArgument);
|
||||
return *tArgument;
|
||||
}
|
||||
|
||||
// Parameter packed add_parents method
|
||||
// Accepts a variadic number of ArgumentParser objects
|
||||
template <typename... Targs> void add_parents(const Targs &... Fargs) {
|
||||
for (const ArgumentParser &tParentParser : {std::ref(Fargs)...}) {
|
||||
for (auto &tArgument : tParentParser.mPositionalArguments) {
|
||||
auto it =
|
||||
mPositionalArguments.insert(cend(mPositionalArguments), tArgument);
|
||||
index_argument(it);
|
||||
}
|
||||
for (auto &tArgument : tParentParser.mOptionalArguments) {
|
||||
auto it =
|
||||
mOptionalArguments.insert(cend(mOptionalArguments), tArgument);
|
||||
index_argument(it);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Call parse_args_internal - which does all the work
|
||||
* Then, validate the parsed arguments
|
||||
* This variant is used mainly for testing
|
||||
* @throws std::runtime_error in case of any invalid argument
|
||||
*/
|
||||
void parse_args(const std::vector<std::string> &aArguments) {
|
||||
parse_args_internal(aArguments);
|
||||
parse_args_validate();
|
||||
}
|
||||
|
||||
/* Main entry point for parsing command-line arguments using this
|
||||
* ArgumentParser
|
||||
* @throws std::runtime_error in case of any invalid argument
|
||||
*/
|
||||
void parse_args(int argc, const char *const argv[]) {
|
||||
std::vector<std::string> arguments;
|
||||
std::copy(argv, argv + argc, std::back_inserter(arguments));
|
||||
parse_args(arguments);
|
||||
}
|
||||
|
||||
/* Getter enabled for all template types other than std::vector and std::list
|
||||
* @throws std::logic_error in case of an invalid argument name
|
||||
* @throws std::logic_error in case of incompatible types
|
||||
*/
|
||||
template <typename T = std::string> T get(std::string_view aArgumentName) {
|
||||
return (*this)[aArgumentName].get<T>();
|
||||
}
|
||||
|
||||
/* Indexing operator. Return a reference to an Argument object
|
||||
* Used in conjuction with Argument.operator== e.g., parser["foo"] == true
|
||||
* @throws std::logic_error in case of an invalid argument name
|
||||
*/
|
||||
Argument &operator[](std::string_view aArgumentName) {
|
||||
auto tIterator = mArgumentMap.find(aArgumentName);
|
||||
if (tIterator != mArgumentMap.end()) {
|
||||
return *(tIterator->second);
|
||||
}
|
||||
throw std::logic_error("No such argument");
|
||||
}
|
||||
|
||||
// Print help message
|
||||
friend auto operator<<(std::ostream &stream, const ArgumentParser &parser)
|
||||
-> std::ostream & {
|
||||
if (auto sen = std::ostream::sentry(stream)) {
|
||||
stream.setf(std::ios_base::left);
|
||||
stream << "Usage: " << parser.mProgramName << " [options] ";
|
||||
size_t tLongestArgumentLength = parser.get_length_of_longest_argument();
|
||||
|
||||
for (const auto &argument : parser.mPositionalArguments) {
|
||||
stream << argument.mNames.front() << " ";
|
||||
}
|
||||
stream << "\n\n";
|
||||
|
||||
if (!parser.mPositionalArguments.empty())
|
||||
stream << "Positional arguments:\n";
|
||||
|
||||
for (const auto &mPositionalArgument : parser.mPositionalArguments) {
|
||||
stream.width(tLongestArgumentLength);
|
||||
stream << mPositionalArgument;
|
||||
}
|
||||
|
||||
if (!parser.mOptionalArguments.empty())
|
||||
stream << (parser.mPositionalArguments.empty() ? "" : "\n")
|
||||
<< "Optional arguments:\n";
|
||||
|
||||
for (const auto &mOptionalArgument : parser.mOptionalArguments) {
|
||||
stream.width(tLongestArgumentLength);
|
||||
stream << mOptionalArgument;
|
||||
}
|
||||
}
|
||||
|
||||
return stream;
|
||||
}
|
||||
|
||||
// Format help message
|
||||
auto help() const -> std::stringstream {
|
||||
std::stringstream out;
|
||||
out << *this;
|
||||
return out;
|
||||
}
|
||||
|
||||
// Printing the one and only help message
|
||||
// I've stuck with a simple message format, nothing fancy.
|
||||
[[deprecated("Use cout << program; instead. See also help().")]] std::string
|
||||
print_help() {
|
||||
auto out = help();
|
||||
std::cout << out.rdbuf();
|
||||
return out.str();
|
||||
}
|
||||
|
||||
private:
|
||||
/*
|
||||
* @throws std::runtime_error in case of any invalid argument
|
||||
*/
|
||||
void parse_args_internal(const std::vector<std::string> &aArguments) {
|
||||
if (mProgramName.empty() && !aArguments.empty()) {
|
||||
mProgramName = aArguments.front();
|
||||
}
|
||||
auto end = std::end(aArguments);
|
||||
auto positionalArgumentIt = std::begin(mPositionalArguments);
|
||||
for (auto it = std::next(std::begin(aArguments)); it != end;) {
|
||||
const auto &tCurrentArgument = *it;
|
||||
if (tCurrentArgument == Argument::mHelpOption ||
|
||||
tCurrentArgument == Argument::mHelpOptionLong) {
|
||||
throw std::runtime_error("help called");
|
||||
}
|
||||
if (Argument::is_positional(tCurrentArgument)) {
|
||||
if (positionalArgumentIt == std::end(mPositionalArguments)) {
|
||||
throw std::runtime_error(
|
||||
"Maximum number of positional arguments exceeded");
|
||||
}
|
||||
auto tArgument = positionalArgumentIt++;
|
||||
it = tArgument->consume(it, end);
|
||||
} else if (auto tIterator = mArgumentMap.find(tCurrentArgument);
|
||||
tIterator != mArgumentMap.end()) {
|
||||
auto tArgument = tIterator->second;
|
||||
it = tArgument->consume(std::next(it), end, tCurrentArgument);
|
||||
} else if (const auto &tCompoundArgument = tCurrentArgument;
|
||||
tCompoundArgument.size() > 1 && tCompoundArgument[0] == '-' &&
|
||||
tCompoundArgument[1] != '-') {
|
||||
++it;
|
||||
for (size_t j = 1; j < tCompoundArgument.size(); j++) {
|
||||
auto tHypotheticalArgument = std::string{'-', tCompoundArgument[j]};
|
||||
auto tIterator2 = mArgumentMap.find(tHypotheticalArgument);
|
||||
if (tIterator2 != mArgumentMap.end()) {
|
||||
auto tArgument = tIterator2->second;
|
||||
it = tArgument->consume(it, end, tHypotheticalArgument);
|
||||
} else {
|
||||
throw std::runtime_error("Unknown argument");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error("Unknown argument");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* @throws std::runtime_error in case of any invalid argument
|
||||
*/
|
||||
void parse_args_validate() {
|
||||
// Check if all arguments are parsed
|
||||
std::for_each(std::begin(mArgumentMap), std::end(mArgumentMap),
|
||||
[](const auto &argPair) {
|
||||
const auto &tArgument = argPair.second;
|
||||
tArgument->validate();
|
||||
});
|
||||
}
|
||||
|
||||
// Used by print_help.
|
||||
size_t get_length_of_longest_argument() const {
|
||||
if (mArgumentMap.empty())
|
||||
return 0;
|
||||
std::vector<size_t> argumentLengths(mArgumentMap.size());
|
||||
std::transform(std::begin(mArgumentMap), std::end(mArgumentMap),
|
||||
std::begin(argumentLengths), [](const auto &argPair) {
|
||||
const auto &tArgument = argPair.second;
|
||||
return tArgument->get_arguments_length();
|
||||
});
|
||||
return *std::max_element(std::begin(argumentLengths),
|
||||
std::end(argumentLengths));
|
||||
}
|
||||
|
||||
using list_iterator = std::list<Argument>::iterator;
|
||||
|
||||
void index_argument(list_iterator argIt) {
|
||||
for (auto &mName : std::as_const(argIt->mNames))
|
||||
mArgumentMap.emplace(mName, argIt);
|
||||
}
|
||||
|
||||
std::string mProgramName;
|
||||
std::list<Argument> mPositionalArguments;
|
||||
std::list<Argument> mOptionalArguments;
|
||||
std::map<std::string_view, list_iterator, std::less<>> mArgumentMap;
|
||||
};
|
||||
|
||||
} // namespace argparse
|
||||
20842
lib/json.hpp
Normal file
20842
lib/json.hpp
Normal file
File diff suppressed because it is too large
Load Diff
110
scripts/FindSSE.cmake
Normal file
110
scripts/FindSSE.cmake
Normal file
@@ -0,0 +1,110 @@
|
||||
MACRO (FindSSE)
|
||||
|
||||
IF(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO)
|
||||
|
||||
STRING(REGEX REPLACE "^.*(sse2).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "sse2" "${SSE_THERE}" SSE2_TRUE)
|
||||
IF (SSE2_TRUE)
|
||||
set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
|
||||
ELSE (SSE2_TRUE)
|
||||
set(SSE2_FOUND false CACHE BOOL "SSE2 available on host")
|
||||
ENDIF (SSE2_TRUE)
|
||||
|
||||
# /proc/cpuinfo apparently omits sse3 :(
|
||||
STRING(REGEX REPLACE "^.*[^s](sse3).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "sse3" "${SSE_THERE}" SSE3_TRUE)
|
||||
IF (NOT SSE3_TRUE)
|
||||
STRING(REGEX REPLACE "^.*(T2300).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "T2300" "${SSE_THERE}" SSE3_TRUE)
|
||||
ENDIF (NOT SSE3_TRUE)
|
||||
|
||||
STRING(REGEX REPLACE "^.*(ssse3).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "ssse3" "${SSE_THERE}" SSSE3_TRUE)
|
||||
IF (SSE3_TRUE OR SSSE3_TRUE)
|
||||
set(SSE3_FOUND true CACHE BOOL "SSE3 available on host")
|
||||
ELSE (SSE3_TRUE OR SSSE3_TRUE)
|
||||
set(SSE3_FOUND false CACHE BOOL "SSE3 available on host")
|
||||
ENDIF (SSE3_TRUE OR SSSE3_TRUE)
|
||||
IF (SSSE3_TRUE)
|
||||
set(SSSE3_FOUND true CACHE BOOL "SSSE3 available on host")
|
||||
ELSE (SSSE3_TRUE)
|
||||
set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
|
||||
ENDIF (SSSE3_TRUE)
|
||||
|
||||
STRING(REGEX REPLACE "^.*(sse4_1).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "sse4_1" "${SSE_THERE}" SSE41_TRUE)
|
||||
IF (SSE41_TRUE)
|
||||
set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host")
|
||||
ELSE (SSE41_TRUE)
|
||||
set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
|
||||
ENDIF (SSE41_TRUE)
|
||||
|
||||
STRING(REGEX REPLACE "^.*(sse4_2).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "sse4_2" "${SSE_THERE}" SSE42_TRUE)
|
||||
IF (SSE42_TRUE)
|
||||
set(SSE4_2_FOUND true CACHE BOOL "SSE4.2 available on host")
|
||||
ELSE (SSE42_TRUE)
|
||||
set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
|
||||
ENDIF (SSE42_TRUE)
|
||||
|
||||
ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Darwin")
|
||||
EXEC_PROGRAM("/usr/sbin/sysctl -n machdep.cpu.features" OUTPUT_VARIABLE
|
||||
CPUINFO)
|
||||
|
||||
STRING(REGEX REPLACE "^.*[^S](SSE2).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "SSE2" "${SSE_THERE}" SSE2_TRUE)
|
||||
IF (SSE2_TRUE)
|
||||
set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
|
||||
ELSE (SSE2_TRUE)
|
||||
set(SSE2_FOUND false CACHE BOOL "SSE2 available on host")
|
||||
ENDIF (SSE2_TRUE)
|
||||
|
||||
STRING(REGEX REPLACE "^.*[^S](SSE3).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "SSE3" "${SSE_THERE}" SSE3_TRUE)
|
||||
IF (SSE3_TRUE)
|
||||
set(SSE3_FOUND true CACHE BOOL "SSE3 available on host")
|
||||
ELSE (SSE3_TRUE)
|
||||
set(SSE3_FOUND false CACHE BOOL "SSE3 available on host")
|
||||
ENDIF (SSE3_TRUE)
|
||||
|
||||
STRING(REGEX REPLACE "^.*(SSSE3).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "SSSE3" "${SSE_THERE}" SSSE3_TRUE)
|
||||
IF (SSSE3_TRUE)
|
||||
set(SSSE3_FOUND true CACHE BOOL "SSSE3 available on host")
|
||||
ELSE (SSSE3_TRUE)
|
||||
set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
|
||||
ENDIF (SSSE3_TRUE)
|
||||
|
||||
STRING(REGEX REPLACE "^.*(SSE4.1).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "SSE4.1" "${SSE_THERE}" SSE41_TRUE)
|
||||
IF (SSE41_TRUE)
|
||||
set(SSE4_1_FOUND true CACHE BOOL "SSE4.1 available on host")
|
||||
ELSE (SSE41_TRUE)
|
||||
set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
|
||||
ENDIF (SSE41_TRUE)
|
||||
|
||||
STRING(REGEX REPLACE "^.*(SSE4.2).*$" "\\1" SSE_THERE ${CPUINFO})
|
||||
STRING(COMPARE EQUAL "SSE4.2" "${SSE_THERE}" SSE42_TRUE)
|
||||
IF (SSE42_TRUE)
|
||||
set(SSE4_2_FOUND true CACHE BOOL "SSE4.2 available on host")
|
||||
ELSE (SSE42_TRUE)
|
||||
set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
|
||||
ENDIF (SSE42_TRUE)
|
||||
|
||||
ELSEIF(CMAKE_SYSTEM_NAME MATCHES "Windows")
|
||||
# TODO
|
||||
set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
|
||||
set(SSE3_FOUND false CACHE BOOL "SSE3 available on host")
|
||||
set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
|
||||
set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
|
||||
set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
|
||||
ELSE(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
set(SSE2_FOUND true CACHE BOOL "SSE2 available on host")
|
||||
set(SSE3_FOUND false CACHE BOOL "SSE3 available on host")
|
||||
set(SSSE3_FOUND false CACHE BOOL "SSSE3 available on host")
|
||||
set(SSE4_1_FOUND false CACHE BOOL "SSE4.1 available on host")
|
||||
set(SSE4_2_FOUND false CACHE BOOL "SSE4.2 available on host")
|
||||
ENDIF(CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
|
||||
ENDMACRO(FindSSE)
|
||||
115
scripts/generate_ycsb
Executable file
115
scripts/generate_ycsb
Executable file
@@ -0,0 +1,115 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
script_dir = Path(os.path.realpath(__file__))
|
||||
ycsb_dir = "YCSB/"
|
||||
workload_dir = str(script_dir.parent.parent) + "/workloads_specification/"
|
||||
output_dir = "workloads/"
|
||||
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
if not os.path.exists(ycsb_dir):
|
||||
os.system("curl -O --location https://github.com/brianfrankcooper/YCSB/releases/download/0.16.0/ycsb-0.16.0.tar.gz")
|
||||
os.system("tar xfvz ycsb-0.16.0.tar.gz")
|
||||
os.system("mv ycsb-0.16.0 " + ycsb_dir)
|
||||
os.system("rm ycsb-0.16.0.tar.gz")
|
||||
|
||||
workload = "workloada"
|
||||
key_type = "randint"
|
||||
|
||||
if len(sys.argv) == 1:
|
||||
print("Usage: python", sys.argv[0], "<workload_type> <key_type>")
|
||||
print("Example: 'python", sys.argv[0], "a randint' for generating workloada with randint.")
|
||||
|
||||
if len(sys.argv) > 1:
|
||||
workload = str(sys.argv[1])
|
||||
if not workload.startswith("workload"):
|
||||
workload = "workload" + workload
|
||||
|
||||
if len(sys.argv) > 2:
|
||||
key_type = sys.argv[2]
|
||||
|
||||
print("Generaring YCSB workload", workload, "with key type", key_type)
|
||||
|
||||
out_ycsb_load = output_dir + 'ycsb_load_' + key_type + '_' + workload
|
||||
out_ycsb_txn = output_dir + 'ycsb_txn_' + key_type + '_' + workload
|
||||
out_load_ycsbkey = output_dir + 'load_' + 'ycsbkey' + '_' + workload
|
||||
out_txn_ycsbkey = output_dir + 'txn_' + 'ycsbkey' + '_' + workload
|
||||
out_load = output_dir + 'fill_' + key_type + '_' + workload
|
||||
out_txn = output_dir + 'mixed_' + key_type + '_' + workload
|
||||
|
||||
cmd_ycsb_load = ycsb_dir + 'bin/ycsb.sh load basic -P ' + workload_dir + workload + ' -s > ' + out_ycsb_load
|
||||
cmd_ycsb_txn = ycsb_dir + 'bin/ycsb.sh run basic -P ' + workload_dir + workload + ' -s > ' + out_ycsb_txn
|
||||
|
||||
os.system(cmd_ycsb_load)
|
||||
os.system(cmd_ycsb_txn)
|
||||
|
||||
#####################################################################################
|
||||
|
||||
with open(out_ycsb_load, 'r') as f_load:
|
||||
with open(out_load_ycsbkey, 'w') as f_load_out:
|
||||
for line in f_load :
|
||||
cols = line.split()
|
||||
if len(cols) > 0 and cols[0] == "INSERT":
|
||||
f_load_out.write (cols[0] + " " + cols[2][4:] + "\n")
|
||||
|
||||
with open (out_ycsb_txn, 'r') as f_txn:
|
||||
with open (out_txn_ycsbkey, 'w') as f_txn_out:
|
||||
for line in f_txn :
|
||||
cols = line.split()
|
||||
if (cols[0] == 'SCAN') or (cols[0] == 'INSERT') or (cols[0] == 'READ') or (cols[0] == 'UPDATE'):
|
||||
startkey = cols[2][4:]
|
||||
if cols[0] == 'SCAN' :
|
||||
numkeys = cols[3]
|
||||
f_txn_out.write (cols[0] + ' ' + startkey + ' ' + numkeys + '\n')
|
||||
else :
|
||||
f_txn_out.write (cols[0] + ' ' + startkey + '\n')
|
||||
cmd = 'rm -f ' + out_ycsb_load
|
||||
os.system(cmd)
|
||||
cmd = 'rm -f ' + out_ycsb_txn
|
||||
os.system(cmd)
|
||||
|
||||
if key_type == 'randint' :
|
||||
with open (out_load_ycsbkey, 'r') as f_load:
|
||||
with open (out_load, 'w') as f_load_out:
|
||||
for line in f_load :
|
||||
f_load_out.write (line)
|
||||
|
||||
with open (out_txn_ycsbkey, 'r') as f_txn:
|
||||
with open (out_txn, 'w') as f_txn_out:
|
||||
for line in f_txn :
|
||||
f_txn_out.write (line)
|
||||
|
||||
elif key_type == 'monoint' :
|
||||
keymap = {}
|
||||
with open (out_load_ycsbkey, 'r') as f_load:
|
||||
with open (out_load, 'w') as f_load_out:
|
||||
count = 0
|
||||
for line in f_load :
|
||||
cols = line.split()
|
||||
keymap[int(cols[1])] = count
|
||||
f_load_out.write (cols[0] + ' ' + str(count) + '\n')
|
||||
count += 1
|
||||
|
||||
with open (out_txn_ycsbkey, 'r') as f_txn:
|
||||
with open (out_txn, 'w') as f_txn_out:
|
||||
for line in f_txn :
|
||||
cols = line.split()
|
||||
if cols[0] == 'SCAN' :
|
||||
f_txn_out.write (cols[0] + ' ' + str(keymap[int(cols[1])]) + ' ' + cols[2] + '\n')
|
||||
elif cols[0] == 'INSERT' :
|
||||
keymap[int(cols[1])] = count
|
||||
f_txn_out.write (cols[0] + ' ' + str(count) + '\n')
|
||||
count += 1
|
||||
else :
|
||||
f_txn_out.write (cols[0] + ' ' + str(keymap[int(cols[1])]) + '\n')
|
||||
|
||||
|
||||
cmd = 'rm -f ' + out_load_ycsbkey
|
||||
os.system(cmd)
|
||||
cmd = 'rm -f ' + out_txn_ycsbkey
|
||||
os.system(cmd)
|
||||
79
src/application/blinktree_benchmark/README.md
Normal file
79
src/application/blinktree_benchmark/README.md
Normal file
@@ -0,0 +1,79 @@
|
||||
# BLinkTree Benchmark
|
||||
The BLinkTree-benchmark stores `8` byte numeric keys and values.
|
||||
Call `./bin/blinktree_benchmark -h` for help and parameters.
|
||||
|
||||
## How to generate YCSB workload
|
||||
* Workload specifications are done by files in `workloads_specification/`.
|
||||
* Call `make ycsb-a` and `make ycsb-c` to generate workloads **A** and **C**.
|
||||
* Workload files are stored in `workloads/`
|
||||
* Use `./bin/blinktree_benchmark -f <fill-file> <mixed-file>` to pass the desired workload.
|
||||
* Default (if not specified) is `-f workloads/fill_randint_workloada workloads/mixed_randint_workloada`.
|
||||
|
||||
## Important CLI arguments
|
||||
* The first argument is the number of cores:
|
||||
* `./bin/blinktree_benchmark 1` for using a single core.
|
||||
* `./bin/blinktree_benchmark 1:24` for using cores `1` up to `24`.
|
||||
* `-i <NUMBER>` specifies the number of repetitions of each workload.
|
||||
* `-s <NUMBER>` steps of the cores:
|
||||
* `-s 1` will increase the used cores by one (core ids: `0,1,2,3,4,5,6,7,..,23`).
|
||||
* `-s 2` will skip every second core (core ids: `0,1,3,5,7,..23`).
|
||||
* `-pd <NUMBER>` specifies the prefetch distance.
|
||||
* `-p` or `--perf` will activate performance counter (result will be printed to console and output file).
|
||||
* `--latched` will enable latches for synchronization (default off).
|
||||
* `--exclusive` forces the tasks to access tree nodes exclusively (e.g. by using spinlocks or core-based sequencing) (default off).
|
||||
* `--sync4me` will use built-in synchronization selection to choose the matching primitive based on annotations.
|
||||
* `-o <FILE>` will write the results in **json** format to the given file.
|
||||
|
||||
## Understanding the output
|
||||
After started, the benchmark will print a summary of configured cores and workload:
|
||||
|
||||
core configuration:
|
||||
1: 0
|
||||
2: 0 1
|
||||
4: 0 1 2 3
|
||||
workload: fill: 5m / readonly: 5m
|
||||
|
||||
Here, we configured the benchmark to use one to four cores; each line of the core configuration displays the number of cores and the core identifiers.
|
||||
|
||||
Following, the benchmark will be started and print the results for every iteration:
|
||||
|
||||
1 1 0 1478 ms 3.38295e+06 op/s
|
||||
1 1 1 1237 ms 4.04204e+06 op/s
|
||||
2 1 0 964 ms 5.18672e+06 op/s
|
||||
2 1 1 675 ms 7.40741e+06 op/s
|
||||
4 1 0 935 ms 5.34759e+06 op/s
|
||||
4 1 1 532 ms 9.3985e+06 op/s
|
||||
|
||||
* The first column is the number of used cores.
|
||||
* The second column displays the iteration of the benchmark (configured by `-i X`).
|
||||
* Thirdly, the phase-identifier will be printed: `0` for initialization phase (which will be only inserts) and `1` for the workload phase (which is read-only here).
|
||||
* After that, the time and throughput are written.
|
||||
* If `--perf` is enabled, the output will be extended by some perf counters, which are labeled (like throughput).
|
||||
|
||||
## Plot the results
|
||||
When using `-o FILE`, the results will be written to the given file, using `JSON` format.
|
||||
The plot script `scripts/plot_blinktree_benchmark INPUT_FILE [INPUT_FILE ...]` will aggregate and plot the results using one or more of those `JSON` files.
|
||||
|
||||
## Examples
|
||||
|
||||
###### Running workload A using optimistic synchronization
|
||||
|
||||
./bin/blinktree_benchmark 1: -s 2 -i 3 -pd 3 -p -f workloads/fill_randint_workloada workloads/mixed_randint_workloada -o optimistic.json
|
||||
|
||||
###### Running workload A using best matching synchronization
|
||||
|
||||
./bin/blinktree_benchmark 1: -s 2 -i 3 -pd 3 -p --sync4me -f workloads/fill_randint_workloada workloads/mixed_randint_workloada -o sync4me.json
|
||||
|
||||
###### Running workload A using reader/writer-locks
|
||||
|
||||
./bin/blinktree_benchmark 1: -s 2 -i 3 -pd 3 -p --latched -f workloads/fill_randint_workloada workloads/mixed_randint_workloada -o rwlocked.json
|
||||
|
||||
###### Running workload A using core-based sequencing
|
||||
|
||||
./bin/blinktree_benchmark 1: -s 2 -i 3 -pd 3 -p --exclusive -f workloads/fill_randint_workloada workloads/mixed_randint_workloada -o core-sequenced.json
|
||||
|
||||
###### Running workload A using spin-locks
|
||||
|
||||
./bin/blinktree_benchmark 1: -s 2 -i 3 -pd 3 -p --latched --exclusive -f workloads/fill_randint_workloada workloads/mixed_randint_workloada -o spinlocked.json
|
||||
|
||||
|
||||
199
src/application/blinktree_benchmark/benchmark.cpp
Normal file
199
src/application/blinktree_benchmark/benchmark.cpp
Normal file
@@ -0,0 +1,199 @@
|
||||
#include "benchmark.h"
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <json.hpp>
|
||||
#include <memory>
|
||||
#include <mx/memory/global_heap.h>
|
||||
|
||||
using namespace application::blinktree_benchmark;
|
||||
|
||||
Benchmark::Benchmark(benchmark::Cores &&cores, const std::uint16_t iterations, std::string &&fill_workload_file,
|
||||
std::string &&mixed_workload_file, const bool use_performance_counter,
|
||||
const mx::synchronization::isolation_level node_isolation_level,
|
||||
const mx::synchronization::protocol preferred_synchronization_method,
|
||||
const bool print_tree_statistics, const bool check_tree, std::string &&result_file_name,
|
||||
std::string &&statistic_file_name, std::string &&tree_file_name, const bool profile)
|
||||
: _cores(std::move(cores)), _iterations(iterations), _node_isolation_level(node_isolation_level),
|
||||
_preferred_synchronization_method(preferred_synchronization_method),
|
||||
_print_tree_statistics(print_tree_statistics), _check_tree(check_tree),
|
||||
_result_file_name(std::move(result_file_name)), _statistic_file_name(std::move(statistic_file_name)),
|
||||
_tree_file_name(std::move(tree_file_name)), _profile(profile)
|
||||
{
|
||||
if (use_performance_counter)
|
||||
{
|
||||
this->_chronometer.add(benchmark::Perf::CYCLES);
|
||||
this->_chronometer.add(benchmark::Perf::INSTRUCTIONS);
|
||||
this->_chronometer.add(benchmark::Perf::STALLS_MEM_ANY);
|
||||
this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_NTA);
|
||||
this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_WRITE);
|
||||
}
|
||||
|
||||
std::cout << "core configuration: \n" << this->_cores.dump(2) << std::endl;
|
||||
|
||||
this->_workload.build(fill_workload_file, mixed_workload_file);
|
||||
if (this->_workload.empty(benchmark::phase::FILL) && this->_workload.empty(benchmark::phase::MIXED))
|
||||
{
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
std::cout << "workload: " << this->_workload << "\n" << std::endl;
|
||||
}
|
||||
|
||||
void Benchmark::start()
|
||||
{
|
||||
// Reset tree.
|
||||
if (this->_tree == nullptr)
|
||||
{
|
||||
this->_tree = std::make_unique<db::index::blinktree::BLinkTree<std::uint64_t, std::int64_t>>(
|
||||
this->_node_isolation_level, this->_preferred_synchronization_method);
|
||||
}
|
||||
|
||||
// Reset request scheduler.
|
||||
if (this->_request_scheduler.empty() == false)
|
||||
{
|
||||
this->_request_scheduler.clear();
|
||||
}
|
||||
|
||||
// Create one request scheduler per core.
|
||||
for (auto core_index = 0U; core_index < this->_cores.current().size(); core_index++)
|
||||
{
|
||||
const auto channel_id = core_index;
|
||||
auto *request_scheduler = mx::tasking::runtime::new_task<RequestSchedulerTask>(
|
||||
0U, core_index, channel_id, this->_workload, this->_cores.current(), this->_tree.get(), this);
|
||||
mx::tasking::runtime::spawn(*request_scheduler, 0U);
|
||||
this->_request_scheduler.push_back(request_scheduler);
|
||||
}
|
||||
this->_open_requests = this->_request_scheduler.size();
|
||||
|
||||
// Start measurement.
|
||||
if (this->_profile)
|
||||
{
|
||||
mx::tasking::runtime::profile(this->profile_file_name());
|
||||
}
|
||||
this->_chronometer.start(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload)),
|
||||
this->_current_iteration + 1, this->_cores.current());
|
||||
}
|
||||
|
||||
const mx::util::core_set &Benchmark::core_set()
|
||||
{
|
||||
if (this->_current_iteration == std::numeric_limits<std::uint16_t>::max())
|
||||
{
|
||||
// This is the very first time we start the benchmark.
|
||||
this->_current_iteration = 0U;
|
||||
return this->_cores.next();
|
||||
}
|
||||
|
||||
// Switch from fill to mixed phase.
|
||||
if (this->_workload == benchmark::phase::FILL && this->_workload.empty(benchmark::phase::MIXED) == false)
|
||||
{
|
||||
this->_workload.reset(benchmark::phase::MIXED);
|
||||
return this->_cores.current();
|
||||
}
|
||||
this->_workload.reset(benchmark::phase::FILL);
|
||||
|
||||
// Run the next iteration.
|
||||
if (++this->_current_iteration < this->_iterations)
|
||||
{
|
||||
return this->_cores.current();
|
||||
}
|
||||
this->_current_iteration = 0U;
|
||||
|
||||
// At this point, all phases and all iterations for the current core configuration
|
||||
// are done. Increase the cores.
|
||||
return this->_cores.next();
|
||||
}
|
||||
|
||||
void Benchmark::requests_finished()
|
||||
{
|
||||
const auto open_requests = --this->_open_requests;
|
||||
|
||||
if (open_requests == 0U) // All request schedulers are done.
|
||||
{
|
||||
// Stop and print time (and performance counter).
|
||||
const auto result = this->_chronometer.stop(this->_workload.size());
|
||||
mx::tasking::runtime::stop();
|
||||
std::cout << result << std::endl;
|
||||
|
||||
// Dump results to file.
|
||||
if (this->_result_file_name.empty() == false)
|
||||
{
|
||||
std::ofstream result_file_stream(this->_result_file_name, std::ofstream::app);
|
||||
result_file_stream << result.to_json().dump() << std::endl;
|
||||
}
|
||||
|
||||
// Dump statistics to file.
|
||||
if constexpr (mx::tasking::config::task_statistics())
|
||||
{
|
||||
if (this->_statistic_file_name.empty() == false)
|
||||
{
|
||||
std::ofstream statistic_file_stream(this->_statistic_file_name, std::ofstream::app);
|
||||
nlohmann::json statistic_json;
|
||||
statistic_json["iteration"] = result.iteration();
|
||||
statistic_json["cores"] = result.core_count();
|
||||
statistic_json["phase"] = result.phase();
|
||||
statistic_json["scheduled"] = nlohmann::json();
|
||||
statistic_json["scheduled-on-channel"] = nlohmann::json();
|
||||
statistic_json["scheduled-off-channel"] = nlohmann::json();
|
||||
statistic_json["executed"] = nlohmann::json();
|
||||
statistic_json["executed-reader"] = nlohmann::json();
|
||||
statistic_json["executed-writer"] = nlohmann::json();
|
||||
statistic_json["buffer-fills"] = nlohmann::json();
|
||||
for (auto i = 0U; i < this->_cores.current().size(); i++)
|
||||
{
|
||||
const auto core_id = std::int32_t{this->_cores.current()[i]};
|
||||
const auto core_id_string = std::to_string(core_id);
|
||||
statistic_json["scheduled"][core_id_string] =
|
||||
result.scheduled_tasks(core_id) / double(result.operation_count());
|
||||
statistic_json["scheduled-on-core"][core_id_string] =
|
||||
result.scheduled_tasks_on_core(core_id) / double(result.operation_count());
|
||||
statistic_json["scheduled-off-core"][core_id_string] =
|
||||
result.scheduled_tasks_off_core(core_id) / double(result.operation_count());
|
||||
statistic_json["executed"][core_id_string] =
|
||||
result.executed_tasks(core_id) / double(result.operation_count());
|
||||
statistic_json["executed-reader"][core_id_string] =
|
||||
result.executed_reader_tasks(core_id) / double(result.operation_count());
|
||||
statistic_json["executed-writer"][core_id_string] =
|
||||
result.executed_writer_tasks(core_id) / double(result.operation_count());
|
||||
statistic_json["fill"][core_id_string] =
|
||||
result.worker_fills(core_id) / double(result.operation_count());
|
||||
}
|
||||
|
||||
statistic_file_stream << statistic_json.dump(2) << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Check and print the tree.
|
||||
if (this->_check_tree)
|
||||
{
|
||||
this->_tree->check();
|
||||
}
|
||||
|
||||
if (this->_print_tree_statistics)
|
||||
{
|
||||
this->_tree->print_statistics();
|
||||
}
|
||||
|
||||
const auto is_last_phase =
|
||||
this->_workload == benchmark::phase::MIXED || this->_workload.empty(benchmark::phase::MIXED);
|
||||
|
||||
// Dump the tree.
|
||||
if (this->_tree_file_name.empty() == false && is_last_phase)
|
||||
{
|
||||
std::ofstream tree_file_stream(this->_tree_file_name);
|
||||
tree_file_stream << static_cast<nlohmann::json>(*(this->_tree)).dump() << std::endl;
|
||||
}
|
||||
|
||||
// Delete the tree to free the hole memory.
|
||||
if (is_last_phase)
|
||||
{
|
||||
this->_tree.reset(nullptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::string Benchmark::profile_file_name() const
|
||||
{
|
||||
return "profiling-" + std::to_string(this->_cores.current().size()) + "-cores" + "-phase-" +
|
||||
std::to_string(static_cast<std::uint16_t>(static_cast<benchmark::phase>(this->_workload))) + "-iteration-" +
|
||||
std::to_string(this->_current_iteration) + ".json";
|
||||
}
|
||||
103
src/application/blinktree_benchmark/benchmark.h
Normal file
103
src/application/blinktree_benchmark/benchmark.h
Normal file
@@ -0,0 +1,103 @@
|
||||
#pragma once
|
||||
|
||||
#include "listener.h"
|
||||
#include "request_scheduler.h"
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <benchmark/chronometer.h>
|
||||
#include <benchmark/cores.h>
|
||||
#include <benchmark/workload.h>
|
||||
#include <cstdint>
|
||||
#include <db/index/blinktree/b_link_tree.h>
|
||||
#include <memory>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace application::blinktree_benchmark {
|
||||
/**
|
||||
* Benchmark executing the task-based BLink-Tree.
|
||||
*/
|
||||
class Benchmark final : public Listener
|
||||
{
|
||||
public:
|
||||
Benchmark(benchmark::Cores &&, std::uint16_t iterations, std::string &&fill_workload_file,
|
||||
std::string &&mixed_workload_file, bool use_performance_counter,
|
||||
mx::synchronization::isolation_level node_isolation_level,
|
||||
mx::synchronization::protocol preferred_synchronization_method, bool print_tree_statistics,
|
||||
bool check_tree, std::string &&result_file_name, std::string &&statistic_file_name,
|
||||
std::string &&tree_file_name, bool profile);
|
||||
|
||||
~Benchmark() noexcept override = default;
|
||||
|
||||
/**
|
||||
* @return Core set the benchmark should run in the current iteration.
|
||||
*/
|
||||
const mx::util::core_set &core_set();
|
||||
|
||||
/**
|
||||
* Callback for request tasks to notify they are out of
|
||||
* new requests.
|
||||
*/
|
||||
void requests_finished() override;
|
||||
|
||||
/**
|
||||
* Starts the benchmark after initialization.
|
||||
*/
|
||||
void start();
|
||||
|
||||
private:
|
||||
// Collection of cores the benchmark should run on.
|
||||
benchmark::Cores _cores;
|
||||
|
||||
// Number of iterations the benchmark should use.
|
||||
const std::uint16_t _iterations;
|
||||
|
||||
// Current iteration within the actual core set.
|
||||
std::uint16_t _current_iteration = std::numeric_limits<std::uint16_t>::max();
|
||||
|
||||
// Workload to get requests from.
|
||||
benchmark::Workload _workload;
|
||||
|
||||
// Tree to run requests on.
|
||||
std::unique_ptr<db::index::blinktree::BLinkTree<std::uint64_t, std::int64_t>> _tree;
|
||||
|
||||
// The synchronization mechanism to use for tree nodes.
|
||||
const mx::synchronization::isolation_level _node_isolation_level;
|
||||
|
||||
// Preferred synchronization method.
|
||||
const mx::synchronization::protocol _preferred_synchronization_method;
|
||||
|
||||
// If true, the tree statistics (height, number of nodes, ...) will be printed.
|
||||
const bool _print_tree_statistics;
|
||||
|
||||
// If true, the tree will be checked for consistency after each iteration.
|
||||
const bool _check_tree;
|
||||
|
||||
// Name of the file to print results to.
|
||||
const std::string _result_file_name;
|
||||
|
||||
// Name of the file to print further statistics.
|
||||
const std::string _statistic_file_name;
|
||||
|
||||
// Name of the file to serialize the tree to.
|
||||
const std::string _tree_file_name;
|
||||
|
||||
// If true, use idle profiling.
|
||||
const bool _profile;
|
||||
|
||||
// Number of open request tasks; used for tracking the benchmark.
|
||||
alignas(64) std::atomic_uint16_t _open_requests = 0;
|
||||
|
||||
// List of request schedulers.
|
||||
alignas(64) std::vector<RequestSchedulerTask *> _request_scheduler;
|
||||
|
||||
// Chronometer for starting/stopping time and performance counter.
|
||||
alignas(64) benchmark::Chronometer<std::uint16_t> _chronometer;
|
||||
|
||||
/**
|
||||
* @return Name of the file to write profiling results to.
|
||||
*/
|
||||
[[nodiscard]] std::string profile_file_name() const;
|
||||
};
|
||||
} // namespace application::blinktree_benchmark
|
||||
17
src/application/blinktree_benchmark/config.h
Normal file
17
src/application/blinktree_benchmark/config.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
namespace application::blinktree_benchmark {
|
||||
class config
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* @return Number of requests that will be started at a time by the request scheduler.
|
||||
*/
|
||||
static constexpr auto batch_size() noexcept { return 500U; }
|
||||
|
||||
/**
|
||||
* @return Number of maximal open requests, system-wide.
|
||||
*/
|
||||
static constexpr auto max_parallel_requests() noexcept { return 1500U; }
|
||||
};
|
||||
} // namespace application::blinktree_benchmark
|
||||
15
src/application/blinktree_benchmark/listener.h
Normal file
15
src/application/blinktree_benchmark/listener.h
Normal file
@@ -0,0 +1,15 @@
|
||||
#pragma once
|
||||
|
||||
namespace application::blinktree_benchmark {
|
||||
/**
|
||||
* The listener will be used to notify the benchmark that request tasks are
|
||||
* done and no more work is available.
|
||||
*/
|
||||
class Listener
|
||||
{
|
||||
public:
|
||||
constexpr Listener() = default;
|
||||
virtual ~Listener() = default;
|
||||
virtual void requests_finished() = 0;
|
||||
};
|
||||
} // namespace application::blinktree_benchmark
|
||||
179
src/application/blinktree_benchmark/main.cpp
Normal file
179
src/application/blinktree_benchmark/main.cpp
Normal file
@@ -0,0 +1,179 @@
|
||||
#include "benchmark.h"
|
||||
#include <argparse.hpp>
|
||||
#include <benchmark/cores.h>
|
||||
#include <mx/system/environment.h>
|
||||
#include <mx/system/thread.h>
|
||||
#include <mx/tasking/runtime.h>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <tuple>
|
||||
|
||||
using namespace application::blinktree_benchmark;
|
||||
|
||||
/**
|
||||
* Instantiates the BLink-Tree benchmark with CLI arguments.
|
||||
* @param count_arguments Number of CLI arguments.
|
||||
* @param arguments Arguments itself.
|
||||
*
|
||||
* @return Instance of the benchmark and parameters for tasking runtime.
|
||||
*/
|
||||
std::tuple<Benchmark *, std::uint16_t, bool> create_benchmark(int count_arguments, char **arguments);
|
||||
|
||||
/**
|
||||
* Starts the benchmark.
|
||||
*
|
||||
* @param count_arguments Number of CLI arguments.
|
||||
* @param arguments Arguments itself.
|
||||
*
|
||||
* @return Return code of the application.
|
||||
*/
|
||||
int main(int count_arguments, char **arguments)
|
||||
{
|
||||
if (mx::system::Environment::is_numa_balancing_enabled())
|
||||
{
|
||||
std::cout << "[Warn] NUMA balancing may be enabled, set '/proc/sys/kernel/numa_balancing' to '0'" << std::endl;
|
||||
}
|
||||
|
||||
auto [benchmark, prefetch_distance, use_system_allocator] = create_benchmark(count_arguments, arguments);
|
||||
if (benchmark == nullptr)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
mx::util::core_set cores{};
|
||||
|
||||
while ((cores = benchmark->core_set()))
|
||||
{
|
||||
mx::tasking::runtime_guard _(use_system_allocator, cores, prefetch_distance);
|
||||
benchmark->start();
|
||||
}
|
||||
|
||||
delete benchmark;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::tuple<Benchmark *, std::uint16_t, bool> create_benchmark(int count_arguments, char **arguments)
|
||||
{
|
||||
// Set up arguments.
|
||||
argparse::ArgumentParser argument_parser("blinktree_benchmark");
|
||||
argument_parser.add_argument("cores")
|
||||
.help("Range of the number of cores (1 for using 1 core, 1: for using 1 up to available cores, 1:4 for using "
|
||||
"cores from 1 to 4).")
|
||||
.default_value(std::string("1"));
|
||||
/* Not used for the moment.
|
||||
argument_parser.add_argument("-c", "--channels-per-core")
|
||||
.help("Number of how many channels used per core.")
|
||||
.default_value(std::uint16_t(1))
|
||||
.action([](const std::string &value) { return std::uint16_t(std::stoi(value)); });
|
||||
*/
|
||||
argument_parser.add_argument("-s", "--steps")
|
||||
.help("Steps, how number of cores is increased (1,2,4,6,.. for -s 2).")
|
||||
.default_value(std::uint16_t(2))
|
||||
.action([](const std::string &value) { return std::uint16_t(std::stoi(value)); });
|
||||
argument_parser.add_argument("-i", "--iterations")
|
||||
.help("Number of iterations for each workload")
|
||||
.default_value(std::uint16_t(1))
|
||||
.action([](const std::string &value) { return std::uint16_t(std::stoi(value)); });
|
||||
argument_parser.add_argument("-sco", "--system-core-order")
|
||||
.help("Use systems core order. If not, cores are ordered by node id (should be preferred).")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("-p", "--perf")
|
||||
.help("Use performance counter.")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("--exclusive")
|
||||
.help("Are all node accesses exclusive?")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("--latched")
|
||||
.help("Prefer latch for synchronization?")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("--olfit")
|
||||
.help("Prefer OLFIT for synchronization?")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("--sync4me")
|
||||
.help("Let the tasking layer decide the synchronization primitive.")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("--print-stats")
|
||||
.help("Print tree statistics after every iteration.")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("--disable-check")
|
||||
.help("Disable tree check while benchmarking.")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("-f", "--workload-files")
|
||||
.help("Files containing the workloads (workloads/fill workloads/mixed for example).")
|
||||
.nargs(2)
|
||||
.default_value(
|
||||
std::vector<std::string>{"workloads/fill_randint_workloada", "workloads/mixed_randint_workloada"});
|
||||
argument_parser.add_argument("-pd", "--prefetch-distance")
|
||||
.help("Distance of prefetched data objects (0 = disable prefetching).")
|
||||
.default_value(std::uint16_t(0))
|
||||
.action([](const std::string &value) { return std::uint16_t(std::stoi(value)); });
|
||||
argument_parser.add_argument("--system-allocator")
|
||||
.help("Use the systems malloc interface to allocate tasks (default disabled).")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("-ot", "--out-tree")
|
||||
.help("Name of the file, the tree will be written in json format.")
|
||||
.default_value(std::string(""));
|
||||
argument_parser.add_argument("-os", "--out-statistics")
|
||||
.help("Name of the file, the task statistics will be written in json format.")
|
||||
.default_value(std::string(""));
|
||||
argument_parser.add_argument("-o", "--out")
|
||||
.help("Name of the file, the results will be written to.")
|
||||
.default_value(std::string(""));
|
||||
argument_parser.add_argument("--profiling")
|
||||
.help("Enable profiling (default disabled).")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
|
||||
// Parse arguments.
|
||||
try
|
||||
{
|
||||
argument_parser.parse_args(count_arguments, arguments);
|
||||
}
|
||||
catch (std::runtime_error &e)
|
||||
{
|
||||
std::cout << argument_parser << std::endl;
|
||||
return {nullptr, 0U, false};
|
||||
}
|
||||
|
||||
auto order =
|
||||
argument_parser.get<bool>("-sco") ? mx::util::core_set::Order::Ascending : mx::util::core_set::Order::NUMAAware;
|
||||
auto cores =
|
||||
benchmark::Cores({argument_parser.get<std::string>("cores"), argument_parser.get<std::uint16_t>("-s"), order});
|
||||
auto workload_files = argument_parser.get<std::vector<std::string>>("-f");
|
||||
const auto isolation_level = argument_parser.get<bool>("--exclusive")
|
||||
? mx::synchronization::isolation_level::Exclusive
|
||||
: mx::synchronization::isolation_level::ExclusiveWriter;
|
||||
auto preferred_synchronization_method = mx::synchronization::protocol::Queue;
|
||||
if (argument_parser.get<bool>("--latched"))
|
||||
{
|
||||
preferred_synchronization_method = mx::synchronization::protocol::Latch;
|
||||
}
|
||||
else if (argument_parser.get<bool>("--olfit"))
|
||||
{
|
||||
preferred_synchronization_method = mx::synchronization::protocol::OLFIT;
|
||||
}
|
||||
else if (argument_parser.get<bool>("--sync4me"))
|
||||
{
|
||||
preferred_synchronization_method = mx::synchronization::protocol::None;
|
||||
}
|
||||
|
||||
// Create the benchmark.
|
||||
auto *benchmark =
|
||||
new Benchmark(std::move(cores), argument_parser.get<std::uint16_t>("-i"), std::move(workload_files[0]),
|
||||
std::move(workload_files[1]), argument_parser.get<bool>("-p"), isolation_level,
|
||||
preferred_synchronization_method, argument_parser.get<bool>("--print-stats"),
|
||||
argument_parser.get<bool>("--disable-check") == false, argument_parser.get<std::string>("-o"),
|
||||
argument_parser.get<std::string>("-os"), argument_parser.get<std::string>("-ot"),
|
||||
argument_parser.get<bool>("--profiling"));
|
||||
|
||||
return {benchmark, argument_parser.get<std::uint16_t>("-pd"), argument_parser.get<bool>("--system-allocator")};
|
||||
}
|
||||
252
src/application/blinktree_benchmark/request_scheduler.h
Normal file
252
src/application/blinktree_benchmark/request_scheduler.h
Normal file
@@ -0,0 +1,252 @@
|
||||
#pragma once
|
||||
|
||||
#include "config.h"
|
||||
#include "listener.h"
|
||||
#include <atomic>
|
||||
#include <benchmark/workload.h>
|
||||
#include <cstdint>
|
||||
#include <db/index/blinktree/b_link_tree.h>
|
||||
#include <db/index/blinktree/config.h>
|
||||
#include <db/index/blinktree/insert_value_task.h>
|
||||
#include <db/index/blinktree/lookup_task.h>
|
||||
#include <db/index/blinktree/update_task.h>
|
||||
#include <mx/resource/resource.h>
|
||||
#include <mx/tasking/runtime.h>
|
||||
#include <mx/tasking/task.h>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <mx/util/reference_counter.h>
|
||||
|
||||
namespace application::blinktree_benchmark {
|
||||
|
||||
class RequestIndex
|
||||
{
|
||||
public:
|
||||
static RequestIndex make_finished() { return RequestIndex{std::numeric_limits<decltype(_index)>::max(), 0UL}; }
|
||||
static RequestIndex make_no_new() { return RequestIndex{0UL, 0UL}; }
|
||||
|
||||
RequestIndex(const std::uint64_t index, const std::uint64_t count) noexcept : _index(index), _count(count) {}
|
||||
explicit RequestIndex(std::pair<std::uint64_t, std::uint64_t> &&index_and_count) noexcept
|
||||
: _index(std::get<0>(index_and_count)), _count(std::get<1>(index_and_count))
|
||||
{
|
||||
}
|
||||
RequestIndex(RequestIndex &&) noexcept = default;
|
||||
RequestIndex(const RequestIndex &) = default;
|
||||
~RequestIndex() noexcept = default;
|
||||
|
||||
RequestIndex &operator=(RequestIndex &&) noexcept = default;
|
||||
|
||||
[[nodiscard]] std::uint64_t index() const noexcept { return _index; }
|
||||
[[nodiscard]] std::uint64_t count() const noexcept { return _count; }
|
||||
|
||||
[[nodiscard]] bool is_finished() const noexcept { return _index == std::numeric_limits<decltype(_index)>::max(); }
|
||||
[[nodiscard]] bool has_new() const noexcept { return _count > 0UL; }
|
||||
|
||||
RequestIndex &operator-=(const std::uint64_t count) noexcept
|
||||
{
|
||||
_count -= count;
|
||||
_index += count;
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
std::uint64_t _index;
|
||||
std::uint64_t _count;
|
||||
};
|
||||
|
||||
/**
|
||||
* The RequestContainer manages the workload and allocates new batches of requests
|
||||
* that will be scheduled by the request scheduler.
|
||||
*/
|
||||
class RequestContainer
|
||||
{
|
||||
public:
|
||||
RequestContainer(const std::uint16_t core_id, const std::uint64_t max_open_requests,
|
||||
benchmark::Workload &workload) noexcept
|
||||
: _finished_requests(core_id), _local_buffer(workload.next(config::batch_size())),
|
||||
_max_pending_requests(max_open_requests), _workload(workload)
|
||||
{
|
||||
}
|
||||
|
||||
~RequestContainer() noexcept = default;
|
||||
|
||||
/**
|
||||
* Allocates the next requests to spawn.
|
||||
*
|
||||
* @return Pair of workload-index and number of tuples to request.
|
||||
* When the number is negative, no more requests are available.
|
||||
*/
|
||||
RequestIndex next() noexcept
|
||||
{
|
||||
const auto finished_requests = _finished_requests.load();
|
||||
|
||||
const auto pending_requests = _scheduled_requests - finished_requests;
|
||||
if (pending_requests >= _max_pending_requests)
|
||||
{
|
||||
// Too many open requests somewhere in the system.
|
||||
return RequestIndex::make_no_new();
|
||||
}
|
||||
|
||||
if (_local_buffer.has_new() == false)
|
||||
{
|
||||
_local_buffer = RequestIndex{_workload.next(config::batch_size())};
|
||||
}
|
||||
|
||||
if (_local_buffer.has_new())
|
||||
{
|
||||
// How many requests can be scheduled without reaching the request limit?
|
||||
const auto free_requests = _max_pending_requests - pending_requests;
|
||||
|
||||
// Try to spawn all free requests, but at least those in the local buffer.
|
||||
const auto count = std::min(free_requests, _local_buffer.count());
|
||||
|
||||
_scheduled_requests += count;
|
||||
|
||||
const auto index = RequestIndex{_local_buffer.index(), count};
|
||||
_local_buffer -= count;
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
// Do we have to wait for pending requests or are we finished?
|
||||
return pending_requests > 0UL ? RequestIndex::make_no_new() : RequestIndex::make_finished();
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback after inserted a value.
|
||||
*/
|
||||
void inserted(const std::uint16_t core_id, const std::uint64_t /*key*/, const std::int64_t /*value*/) noexcept
|
||||
{
|
||||
task_finished(core_id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback after updated a value.
|
||||
*/
|
||||
void updated(const std::uint16_t core_id, const std::uint64_t /*key*/, const std::int64_t /*value*/) noexcept
|
||||
{
|
||||
task_finished(core_id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback after removed a value.
|
||||
*/
|
||||
void removed(const std::uint16_t core_id, const std::uint64_t /*key*/) noexcept { task_finished(core_id); }
|
||||
|
||||
/**
|
||||
* Callback after found a value.
|
||||
*/
|
||||
void found(const std::uint16_t core_id, const std::uint64_t /*key*/, const std::int64_t /*value*/) noexcept
|
||||
{
|
||||
task_finished(core_id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback on missing a value.
|
||||
*/
|
||||
void missing(const std::uint16_t core_id, const std::uint64_t /*key*/) noexcept { task_finished(core_id); }
|
||||
|
||||
const benchmark::NumericTuple &operator[](const std::size_t index) const noexcept { return _workload[index]; }
|
||||
|
||||
private:
|
||||
// Number of requests finished by tasks.
|
||||
mx::util::reference_counter_64 _finished_requests;
|
||||
|
||||
// Number of tasks scheduled by the owning request scheduler.
|
||||
std::uint64_t _scheduled_requests = 0UL;
|
||||
|
||||
// Local buffer holding not scheduled, but from global worker owned request items.
|
||||
RequestIndex _local_buffer;
|
||||
|
||||
// Number of requests that can be distributed by this scheduler,
|
||||
// due to system-wide maximal parallel requests.
|
||||
const std::uint64_t _max_pending_requests;
|
||||
|
||||
// Workload to get requests from.
|
||||
benchmark::Workload &_workload;
|
||||
|
||||
/**
|
||||
* Updates the counter of finished requests.
|
||||
*/
|
||||
void task_finished(const std::uint16_t core_id) { _finished_requests.add(core_id); }
|
||||
};
|
||||
|
||||
/**
|
||||
* The RequestScheduler own its own request container and sets up requests for the BLink-Tree.
|
||||
*/
|
||||
class RequestSchedulerTask final : public mx::tasking::TaskInterface
|
||||
{
|
||||
public:
|
||||
RequestSchedulerTask(const std::uint16_t core_id, const std::uint16_t channel_id, benchmark::Workload &workload,
|
||||
const mx::util::core_set &core_set,
|
||||
db::index::blinktree::BLinkTree<std::uint64_t, std::int64_t> *tree, Listener *listener)
|
||||
: _tree(tree), _listener(listener)
|
||||
{
|
||||
this->annotate(mx::tasking::priority::low);
|
||||
this->is_readonly(false);
|
||||
|
||||
const auto container = mx::tasking::runtime::new_resource<RequestContainer>(
|
||||
sizeof(RequestContainer), mx::resource::hint{channel_id}, core_id,
|
||||
config::max_parallel_requests() / core_set.size(), workload);
|
||||
this->annotate(container, sizeof(RequestContainer));
|
||||
}
|
||||
|
||||
~RequestSchedulerTask() final = default;
|
||||
|
||||
mx::tasking::TaskResult execute(const std::uint16_t core_id, const std::uint16_t channel_id) override
|
||||
{
|
||||
// Get some new requests from the container.
|
||||
auto &request_container = *mx::resource::ptr_cast<RequestContainer>(this->annotated_resource());
|
||||
const auto next_requests = request_container.next();
|
||||
|
||||
if (next_requests.has_new())
|
||||
{
|
||||
for (auto i = next_requests.index(); i < next_requests.index() + next_requests.count(); ++i)
|
||||
{
|
||||
mx::tasking::TaskInterface *task{nullptr};
|
||||
const auto &tuple = request_container[i];
|
||||
if (tuple == benchmark::NumericTuple::INSERT)
|
||||
{
|
||||
task = mx::tasking::runtime::new_task<
|
||||
db::index::blinktree::InsertValueTask<std::uint64_t, std::int64_t, RequestContainer>>(
|
||||
core_id, tuple.key(), tuple.value(), _tree, request_container);
|
||||
task->is_readonly(_tree->height() > 1U);
|
||||
}
|
||||
else if (tuple == benchmark::NumericTuple::LOOKUP)
|
||||
{
|
||||
task = mx::tasking::runtime::new_task<
|
||||
db::index::blinktree::LookupTask<std::uint64_t, std::int64_t, RequestContainer>>(
|
||||
core_id, tuple.key(), request_container);
|
||||
|
||||
task->is_readonly(true);
|
||||
}
|
||||
else if (tuple == benchmark::NumericTuple::UPDATE)
|
||||
{
|
||||
task = mx::tasking::runtime::new_task<
|
||||
db::index::blinktree::UpdateTask<std::uint64_t, std::int64_t, RequestContainer>>(
|
||||
core_id, tuple.key(), tuple.value(), request_container);
|
||||
task->is_readonly(_tree->height() > 1U);
|
||||
}
|
||||
|
||||
task->annotate(_tree->root(), db::index::blinktree::config::node_size() / 4U);
|
||||
mx::tasking::runtime::spawn(*task, channel_id);
|
||||
}
|
||||
}
|
||||
else if (next_requests.is_finished())
|
||||
{
|
||||
// All requests are done. Notify the benchmark and die.
|
||||
_listener->requests_finished();
|
||||
mx::tasking::runtime::delete_resource<RequestContainer>(this->annotated_resource());
|
||||
return mx::tasking::TaskResult::make_remove();
|
||||
}
|
||||
|
||||
return mx::tasking::TaskResult::make_succeed(this);
|
||||
}
|
||||
|
||||
private:
|
||||
// The tree to send requests to.
|
||||
db::index::blinktree::BLinkTree<std::uint64_t, std::int64_t> *_tree;
|
||||
|
||||
// Benchmark listener to notify on requests are done.
|
||||
Listener *_listener;
|
||||
};
|
||||
} // namespace application::blinktree_benchmark
|
||||
49
src/application/hashjoin_benchmark/README.md
Normal file
49
src/application/hashjoin_benchmark/README.md
Normal file
@@ -0,0 +1,49 @@
|
||||
# HashJoin Benchmark
|
||||
Benchmark of a parallel, task-based HashJoin.
|
||||
|
||||
## How to generate workload
|
||||
* Download TPC-H benchmark and generate tables
|
||||
* Specify joined tables and key-indices via CLI arguments
|
||||
|
||||
## Important CLI arguments
|
||||
* The first argument is the number of cores:
|
||||
* `./bin/hashjoin_benchmark 1` for using a single core.
|
||||
* `./bin/hashjoin_benchmark 1:24` for using cores `1` up to `24`.
|
||||
* `-i <NUMBER>` specifies the number of repetitions of each workload.
|
||||
* `-s <NUMBER>` steps of the cores:
|
||||
* `-s 1` will increase the used cores by one (core ids: `0,1,2,3,4,5,6,7,..,23`).
|
||||
* `-s 2` will skip every second core (core ids: `0,1,3,5,7,..23`).
|
||||
* `-pd <NUMBER>` specifies the prefetch distance.
|
||||
* `-p` or `--perf` will activate performance counter (result will be printed to console and output file).
|
||||
* `-R` specifies the TPC-H table file for the left relation.
|
||||
* `-R-key` specifies the index of the join key for `R`.
|
||||
* `-S` specifies the TPC-H table file for the right relation.
|
||||
* `-S-key` specifies the index of the join key for `S`.
|
||||
* `--batch` specifies the records per task (comma separated: `8,16,64,256`)
|
||||
|
||||
## Understanding the output
|
||||
After started, the benchmark will print a summary of configured cores and workload:
|
||||
|
||||
core configuration:
|
||||
1: 0
|
||||
2: 0 1
|
||||
4: 0 1 2 3
|
||||
workload: customer.tbl.0 (#3000000) JOIN orders.tbl.1 (#30000000)
|
||||
|
||||
Here, we configured the benchmark to use one to four cores; each line of the core configuration displays the number of cores and the core identifiers.
|
||||
|
||||
Following, the benchmark will be started and print the results for every iteration:
|
||||
|
||||
1 1 64 1478 ms 3.38295e+06 op/s
|
||||
2 1 64 964 ms 5.18672e+06 op/s
|
||||
4 1 64 935 ms 5.34759e+06 op/s
|
||||
|
||||
* The first column is the number of used cores.
|
||||
* The second column displays the iteration of the benchmark (configured by `-i X`).
|
||||
* Thirdly, the granularity of how many records per task will be processed.
|
||||
* After that, the time and throughput are written.
|
||||
* If `--perf` is enabled, the output will be extended by some perf counters, which are labeled (like throughput).
|
||||
|
||||
## Plot the results
|
||||
When using `-o FILE`, the results will be written to the given file, using `JSON` format.
|
||||
The plot script `scripts/plot_hashjoin_benchmark INPUT_FILE` will aggregate and plot the results using one `JSON` file.
|
||||
196
src/application/hashjoin_benchmark/benchmark.cpp
Normal file
196
src/application/hashjoin_benchmark/benchmark.cpp
Normal file
@@ -0,0 +1,196 @@
|
||||
#include "benchmark.h"
|
||||
#include "build_task.h"
|
||||
#include "inline_hashtable.h"
|
||||
#include "partition_task.h"
|
||||
#include "tpch_table_reader.h"
|
||||
#include <mx/memory/global_heap.h>
|
||||
#include <mx/tasking/runtime.h>
|
||||
|
||||
using namespace application::hash_join;
|
||||
|
||||
Benchmark::Benchmark(
|
||||
benchmark::Cores &&cores, const std::uint16_t iterations, std::vector<std::uint32_t> &&batches,
|
||||
std::tuple<std::pair<std::string, std::uint16_t>, std::pair<std::string, std::uint16_t>> &&join_table_files,
|
||||
const bool use_performance_counter, std::string &&result_file_name)
|
||||
: _cores(std::move(cores)), _iterations(iterations), _batches(std::move(batches)),
|
||||
_result_file_name(std::move(result_file_name))
|
||||
{
|
||||
if (use_performance_counter)
|
||||
{
|
||||
this->_chronometer.add(benchmark::Perf::CYCLES);
|
||||
this->_chronometer.add(benchmark::Perf::INSTRUCTIONS);
|
||||
this->_chronometer.add(benchmark::Perf::STALLS_MEM_ANY);
|
||||
this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_NTA);
|
||||
this->_chronometer.add(benchmark::Perf::SW_PREFETCH_ACCESS_WRITE);
|
||||
}
|
||||
|
||||
std::cout << "core configuration: \n" << this->_cores.dump(2) << std::endl;
|
||||
|
||||
std::vector<std::uint32_t> left_keys;
|
||||
const auto &left_table = std::get<0>(std::get<0>(join_table_files));
|
||||
const auto left_column_index = std::get<1>(std::get<0>(join_table_files));
|
||||
application::hash_join::TPCHTableReader::read(
|
||||
left_table, [&left_keys, left_column_index](const std::uint16_t index, const std::string &value) {
|
||||
if (index == left_column_index)
|
||||
{
|
||||
left_keys.emplace_back(std::stoul(value));
|
||||
}
|
||||
});
|
||||
|
||||
std::vector<std::uint32_t> right_keys;
|
||||
const auto &right_table = std::get<0>(std::get<1>(join_table_files));
|
||||
const auto right_column_index = std::get<1>(std::get<1>(join_table_files));
|
||||
application::hash_join::TPCHTableReader::read(
|
||||
right_table, [&right_keys, right_column_index](const std::uint16_t index, const std::string &value) {
|
||||
if (index == right_column_index)
|
||||
{
|
||||
right_keys.emplace_back(std::stoul(value));
|
||||
}
|
||||
});
|
||||
|
||||
this->_join_keys = std::make_tuple(std::move(left_keys), std::move(right_keys));
|
||||
|
||||
std::cout << "workload: " << left_table << "." << left_column_index << " (#" << std::get<0>(this->_join_keys).size()
|
||||
<< ")"
|
||||
<< " JOIN " << right_table << "." << right_column_index << " (#" << std::get<1>(this->_join_keys).size()
|
||||
<< ")"
|
||||
<< "\n"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
void Benchmark::start()
|
||||
{
|
||||
const auto count_cores = this->_cores.current().size();
|
||||
const auto count_left_keys = std::get<0>(this->_join_keys).size();
|
||||
const auto count_left_keys_per_core = Benchmark::tuples_per_core(count_left_keys, count_cores);
|
||||
const auto count_right_keys = std::get<1>(this->_join_keys).size();
|
||||
const auto count_right_keys_per_core = Benchmark::tuples_per_core(count_right_keys, count_cores);
|
||||
|
||||
this->_merge_task = std::make_unique<MergeTask>(this->_cores.current(), this, count_right_keys_per_core);
|
||||
|
||||
// Clear notifications.
|
||||
this->_build_notification = BuildFinishedNotifier{count_cores};
|
||||
this->_probe_notification = ProbeFinishedNotifier{this->_merge_task.get()};
|
||||
this->_build_listener = std::make_unique<Listener<BuildFinishedNotifier>>(count_cores, this->_build_notification);
|
||||
this->_probe_listener = std::make_unique<Listener<ProbeFinishedNotifier>>(count_cores, this->_probe_notification);
|
||||
|
||||
// Build hash_tables.
|
||||
this->_hash_tables.reset(new mx::resource::ptr[count_cores]); // NOLINT
|
||||
|
||||
for (auto channel_id = 0U; channel_id < count_cores; ++channel_id)
|
||||
{
|
||||
const auto needed_keys = std::size_t(count_left_keys_per_core * 1.5);
|
||||
const auto needed_bytes = InlineHashtable<std::uint32_t, std::size_t>::needed_bytes(needed_keys);
|
||||
this->_hash_tables.get()[channel_id] =
|
||||
mx::tasking::runtime::new_resource<InlineHashtable<std::uint32_t, std::size_t>>(
|
||||
needed_bytes,
|
||||
mx::resource::hint{std::uint16_t(channel_id), mx::synchronization::isolation_level::Exclusive,
|
||||
mx::synchronization::protocol::Queue},
|
||||
needed_bytes);
|
||||
}
|
||||
|
||||
/// Dispatch left table
|
||||
auto partition_build_tasks = std::array<mx::tasking::TaskInterface *, mx::tasking::config::max_cores()>{nullptr};
|
||||
|
||||
for (auto i = 0U; i < count_cores; ++i)
|
||||
{
|
||||
const auto count_left_keys_for_core = i < count_cores - 1U
|
||||
? count_left_keys_per_core
|
||||
: (count_left_keys - (count_cores - 1U) * count_left_keys_per_core);
|
||||
const auto count_right_keys_for_core =
|
||||
i < count_cores - 1U ? count_right_keys_per_core
|
||||
: (count_right_keys - (count_cores - 1U) * count_right_keys_per_core);
|
||||
|
||||
// Build chunk for local dispatching
|
||||
auto left_chunk = mx::tasking::runtime::to_resource(
|
||||
&std::get<0>(this->_join_keys)[i * count_left_keys_per_core],
|
||||
mx::resource::hint{std::uint16_t(i), mx::synchronization::isolation_level::Exclusive,
|
||||
mx::synchronization::protocol::Queue});
|
||||
auto right_chunk = mx::tasking::runtime::to_resource(
|
||||
&std::get<1>(this->_join_keys)[i * count_right_keys_per_core],
|
||||
mx::resource::hint{std::uint16_t(i), mx::synchronization::isolation_level::Exclusive,
|
||||
mx::synchronization::protocol::Queue});
|
||||
|
||||
// Run dispatcher task.
|
||||
auto *partition_probe_task = mx::tasking::runtime::new_task<PartitionTask<ProbeTask>>(
|
||||
0U, *this->_probe_listener, this->_batches[this->_current_batch_index], count_right_keys_for_core,
|
||||
this->_hash_tables.get());
|
||||
partition_probe_task->annotate(right_chunk, 64U);
|
||||
this->_build_notification.dispatch_probe_task(i, partition_probe_task);
|
||||
|
||||
auto *partition_build_task = mx::tasking::runtime::new_task<PartitionTask<BuildTask>>(
|
||||
0U, *this->_build_listener, this->_batches[this->_current_batch_index], count_left_keys_for_core,
|
||||
this->_hash_tables.get());
|
||||
partition_build_task->annotate(left_chunk, 64U);
|
||||
partition_build_tasks[i] = partition_build_task;
|
||||
}
|
||||
|
||||
// Here we go
|
||||
this->_chronometer.start(this->_batches[this->_current_batch_index], this->_current_iteration,
|
||||
this->_cores.current());
|
||||
for (auto i = 0U; i < count_cores; ++i)
|
||||
{
|
||||
mx::tasking::runtime::spawn(*(partition_build_tasks[i]), 0U);
|
||||
}
|
||||
}
|
||||
|
||||
void Benchmark::stop()
|
||||
{
|
||||
// Stop and print time (and performance counter).
|
||||
const auto result = this->_chronometer.stop(this->_merge_task->count_tuples());
|
||||
mx::tasking::runtime::stop();
|
||||
|
||||
std::cout << result << std::endl;
|
||||
|
||||
// Dump results to file.
|
||||
if (this->_result_file_name.empty() == false)
|
||||
{
|
||||
std::ofstream result_file_stream(this->_result_file_name, std::ofstream::app);
|
||||
result_file_stream << result.to_json().dump() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
const mx::util::core_set &Benchmark::core_set()
|
||||
{
|
||||
if (this->_current_iteration == std::numeric_limits<std::uint16_t>::max())
|
||||
{
|
||||
// This is the very first time we start the benchmark.
|
||||
this->_current_iteration = 0U;
|
||||
return this->_cores.next();
|
||||
}
|
||||
|
||||
for (auto i = 0U; i < this->_cores.current().size(); ++i)
|
||||
{
|
||||
mx::tasking::runtime::delete_resource<InlineHashtable<std::uint32_t, std::size_t>>(this->_hash_tables.get()[i]);
|
||||
}
|
||||
|
||||
// Run the next iteration.
|
||||
if (++this->_current_iteration < this->_iterations)
|
||||
{
|
||||
return this->_cores.current();
|
||||
}
|
||||
this->_current_iteration = 0U;
|
||||
|
||||
if (++this->_current_batch_index < this->_batches.size())
|
||||
{
|
||||
return this->_cores.current();
|
||||
}
|
||||
this->_current_batch_index = 0U;
|
||||
|
||||
// At this point, all phases and all iterations for the current core configuration
|
||||
// are done. Increase the cores.
|
||||
return this->_cores.next();
|
||||
}
|
||||
|
||||
std::uint64_t Benchmark::tuples_per_core(const std::uint64_t count_join_keys, const std::uint16_t count_cores) noexcept
|
||||
{
|
||||
const auto cache_lines = (count_join_keys * sizeof(std::uint32_t)) / 64U;
|
||||
const auto cache_lines_per_core = cache_lines / count_cores;
|
||||
auto p = 1U;
|
||||
while (p < cache_lines_per_core)
|
||||
{
|
||||
p += 64U;
|
||||
}
|
||||
|
||||
return p * (64U / sizeof(std::uint32_t));
|
||||
}
|
||||
69
src/application/hashjoin_benchmark/benchmark.h
Normal file
69
src/application/hashjoin_benchmark/benchmark.h
Normal file
@@ -0,0 +1,69 @@
|
||||
#pragma once
|
||||
|
||||
#include "listener.h"
|
||||
#include "merge_task.h"
|
||||
#include "notifier.h"
|
||||
#include <benchmark/chronometer.h>
|
||||
#include <benchmark/cores.h>
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
|
||||
namespace application::hash_join {
|
||||
|
||||
class Benchmark
|
||||
{
|
||||
public:
|
||||
Benchmark(
|
||||
benchmark::Cores &&cores, std::uint16_t iterations, std::vector<std::uint32_t> &&batches,
|
||||
std::tuple<std::pair<std::string, std::uint16_t>, std::pair<std::string, std::uint16_t>> &&join_table_files,
|
||||
bool use_performance_counter, std::string &&result_file_name);
|
||||
|
||||
~Benchmark() = default;
|
||||
|
||||
/**
|
||||
* @return Core set the benchmark should run in the current iteration.
|
||||
*/
|
||||
const mx::util::core_set &core_set();
|
||||
|
||||
void start();
|
||||
|
||||
void stop();
|
||||
|
||||
private:
|
||||
// Collection of cores the benchmark should run on.
|
||||
benchmark::Cores _cores;
|
||||
|
||||
// Number of iterations the benchmark should use.
|
||||
const std::uint16_t _iterations;
|
||||
|
||||
// Current iteration within the actual core set.
|
||||
std::uint16_t _current_iteration = std::numeric_limits<std::uint16_t>::max();
|
||||
|
||||
// Number of tuples that are probed/build together.
|
||||
const std::vector<std::uint32_t> _batches;
|
||||
std::uint16_t _current_batch_index{0U};
|
||||
|
||||
// Name of the file to print results to.
|
||||
const std::string _result_file_name;
|
||||
|
||||
// Keys to join.
|
||||
std::tuple<std::vector<std::uint32_t>, std::vector<std::uint32_t>> _join_keys;
|
||||
|
||||
std::unique_ptr<mx::resource::ptr> _hash_tables;
|
||||
|
||||
std::unique_ptr<Listener<BuildFinishedNotifier>> _build_listener;
|
||||
std::unique_ptr<Listener<ProbeFinishedNotifier>> _probe_listener;
|
||||
|
||||
std::unique_ptr<MergeTask> _merge_task;
|
||||
|
||||
alignas(64) BuildFinishedNotifier _build_notification;
|
||||
ProbeFinishedNotifier _probe_notification;
|
||||
|
||||
// Chronometer for starting/stopping time and performance counter.
|
||||
alignas(64) benchmark::Chronometer<std::uint32_t> _chronometer;
|
||||
|
||||
static std::uint64_t tuples_per_core(std::uint64_t count_join_keys, std::uint16_t count_cores) noexcept;
|
||||
};
|
||||
} // namespace application::hash_join
|
||||
42
src/application/hashjoin_benchmark/build_task.h
Normal file
42
src/application/hashjoin_benchmark/build_task.h
Normal file
@@ -0,0 +1,42 @@
|
||||
#pragma once
|
||||
#include "inline_hashtable.h"
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <mx/tasking/task.h>
|
||||
#include <vector>
|
||||
|
||||
namespace application::hash_join {
|
||||
/**
|
||||
* The build task builds the hash table.
|
||||
*/
|
||||
class BuildTask final : public mx::tasking::TaskInterface
|
||||
{
|
||||
public:
|
||||
BuildTask(const std::size_t size, const std::uint8_t /*numa_node_id*/) { _keys.reserve(size); }
|
||||
~BuildTask() override = default;
|
||||
|
||||
mx::tasking::TaskResult execute(const std::uint16_t /*core_id*/, const std::uint16_t /*channel_id*/) override
|
||||
{
|
||||
auto *hashtable = this->annotated_resource().get<InlineHashtable<std::uint32_t, std::size_t>>();
|
||||
|
||||
for (const auto &row : _keys)
|
||||
{
|
||||
hashtable->insert(std::get<1>(row), std::get<0>(row));
|
||||
}
|
||||
|
||||
return mx::tasking::TaskResult::make_remove();
|
||||
}
|
||||
|
||||
void emplace_back(const std::size_t row_id, const std::uint32_t key) noexcept
|
||||
{
|
||||
_keys.emplace_back(std::make_pair(row_id, key));
|
||||
}
|
||||
|
||||
[[nodiscard]] std::uint64_t size() const noexcept { return _keys.size(); }
|
||||
[[nodiscard]] bool empty() const noexcept { return _keys.empty(); }
|
||||
|
||||
private:
|
||||
// Keys and row ids to insert into the hashtable.
|
||||
std::vector<std::pair<std::size_t, std::uint32_t>> _keys;
|
||||
};
|
||||
} // namespace application::hash_join
|
||||
97
src/application/hashjoin_benchmark/inline_hashtable.h
Normal file
97
src/application/hashjoin_benchmark/inline_hashtable.h
Normal file
@@ -0,0 +1,97 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <mx/memory/alignment_helper.h>
|
||||
#include <utility>
|
||||
|
||||
namespace application::hash_join {
|
||||
/**
|
||||
* Hashtable for hashjoin.
|
||||
*/
|
||||
template <typename K, typename V> class InlineHashtable
{
private:
    // A single slot. key == std::numeric_limits<K>::max() marks the slot as
    // empty, so that value must never be used as a real join key.
    struct Entry
    {
        constexpr Entry() noexcept : key(std::numeric_limits<K>::max()), value(0) {}
        Entry(Entry &&other) noexcept = default;
        ~Entry() noexcept = default;

        Entry &operator=(Entry &&) noexcept = default;
        K key;
        V value;
    };

public:
    // Bytes to allocate for a table with at least `slots` slots: header plus
    // entry array, with the slot count rounded up to the next power of two as
    // required by the mask-based probing in insert()/get().
    static std::size_t needed_bytes(const std::size_t slots) noexcept
    {
        return sizeof(InlineHashtable<K, V>) +
               sizeof(InlineHashtable<K, V>::Entry) * mx::memory::alignment_helper::next_power_of_two(slots);
    }

    // Constructs the table inside a buffer of `size` bytes and clears all
    // slots. The entries live inline, directly behind this object (see at()).
    // The buffer should be sized via needed_bytes() so _slots is a power of two.
    InlineHashtable(const std::size_t size)
        : _slots((size - sizeof(InlineHashtable<K, V>)) / sizeof(InlineHashtable<K, V>::Entry))
    {
        for (auto i = 0U; i < _slots; ++i)
        {
            at(i) = Entry{};
        }
    }
    ~InlineHashtable() = default;

    // Inserts (or overwrites) the value for the given key using linear probing.
    // NOTE(review): loops forever if the table is completely full — callers
    // must allocate with headroom via needed_bytes().
    void insert(const K key, const V value) noexcept
    {
        for (auto index = hash(key);; ++index)
        {
            index &= _slots - 1U; // wrap around; assumes _slots is a power of two

            auto &entry = at(index);
            // Keep probing while the slot holds a different, non-empty key.
            if (entry.key != key && entry.key != std::numeric_limits<K>::max())
            {
                continue;
            }

            entry.key = key;
            entry.value = value;
            return;
        }
    }

    // Returns the value stored for `key`, or std::numeric_limits<V>::max() as
    // a "not found" sentinel when the probe chain hits an empty slot.
    V get(const K key) const noexcept
    {
        for (auto index = hash(key);; ++index)
        {
            index &= _slots - 1U;

            const auto &entry = at(index);
            if (entry.key == key)
            {
                return entry.value;
            }

            if (entry.key == std::numeric_limits<K>::max())
            {
                return std::numeric_limits<V>::max();
            }
        }
    }

    // Slot access: the entry array starts immediately after this object.
    const Entry &at(const std::size_t slot) const noexcept { return reinterpret_cast<const Entry *>(this + 1)[slot]; }

    Entry &at(const std::size_t slot) noexcept { return reinterpret_cast<Entry *>(this + 1)[slot]; }

private:
    // Number of slots; expected to be a power of two (see insert()/get()).
    const std::size_t _slots;

    // Integer mixing function (murmur3-finalizer-style constants) spreading
    // key bits before masking. Mutates its by-value copy of the key only.
    std::size_t hash(K key) const
    {
        key ^= key >> 16;
        key *= 0x85ebca6b;
        key ^= key >> 13;
        key *= 0xc2b2ae35;
        key ^= key >> 16;
        return std::size_t(key);
    }
};
|
||||
} // namespace application::hash_join
|
||||
42
src/application/hashjoin_benchmark/listener.h
Normal file
42
src/application/hashjoin_benchmark/listener.h
Normal file
@@ -0,0 +1,42 @@
|
||||
#pragma once
|
||||
#include "merge_task.h"
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <mx/tasking/runtime.h>
|
||||
#include <mx/tasking/task.h>
|
||||
#include <mx/util/aligned_t.h>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <mx/util/reference_counter.h>
|
||||
#include <vector>
|
||||
|
||||
namespace application::hash_join {
|
||||
template <class N> class Listener
|
||||
{
|
||||
public:
|
||||
Listener(const std::uint16_t count_cores, N ¬ificator) : _count_cores(count_cores), _notificator(notificator)
|
||||
{
|
||||
_pending_local_notifications.fill(mx::util::aligned_t<std::uint32_t>{0U});
|
||||
std::fill_n(_pending_local_notifications.begin(), count_cores, mx::util::aligned_t<std::uint32_t>{count_cores});
|
||||
|
||||
_pending_global_notifications.store(count_cores);
|
||||
}
|
||||
|
||||
~Listener() = default;
|
||||
|
||||
std::uint16_t count_cores() const noexcept { return _count_cores; }
|
||||
N ¬ificator() noexcept { return _notificator; }
|
||||
std::uint32_t &pending_local(const std::uint16_t channel_id) noexcept
|
||||
{
|
||||
return _pending_local_notifications[channel_id].value();
|
||||
}
|
||||
std::atomic_uint32_t &pending_global() noexcept { return _pending_global_notifications; }
|
||||
|
||||
private:
|
||||
const std::uint16_t _count_cores;
|
||||
N &_notificator;
|
||||
std::array<mx::util::aligned_t<std::uint32_t>, mx::tasking::config::max_cores()> _pending_local_notifications{};
|
||||
alignas(64) std::atomic_uint32_t _pending_global_notifications{0U};
|
||||
};
|
||||
} // namespace application::hash_join
|
||||
117
src/application/hashjoin_benchmark/main.cpp
Normal file
117
src/application/hashjoin_benchmark/main.cpp
Normal file
@@ -0,0 +1,117 @@
|
||||
#include "benchmark.h"
|
||||
#include <argparse.hpp>
|
||||
#include <iostream>
|
||||
#include <mx/system/environment.h>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
using namespace application::hash_join;
|
||||
|
||||
std::pair<Benchmark *, std::uint16_t> create_benchmark(int count_arguments, char **arguments);
|
||||
|
||||
int main(int count_arguments, char **arguments)
|
||||
{
|
||||
auto [benchmark, prefetch_distance] = create_benchmark(count_arguments, arguments);
|
||||
|
||||
if (mx::system::Environment::is_numa_balancing_enabled())
|
||||
{
|
||||
std::cout << "[Warn] NUMA balancing may be enabled, set '/proc/sys/kernel/numa_balancing' to '0'" << std::endl;
|
||||
}
|
||||
|
||||
if (benchmark == nullptr)
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
|
||||
mx::util::core_set cores{};
|
||||
|
||||
while ((cores = benchmark->core_set()))
|
||||
{
|
||||
mx::tasking::runtime_guard _(false, cores, prefetch_distance);
|
||||
benchmark->start();
|
||||
}
|
||||
|
||||
delete benchmark;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::pair<Benchmark *, std::uint16_t> create_benchmark(int count_arguments, char **arguments)
|
||||
{
|
||||
argparse::ArgumentParser argument_parser("hashjoin_benchmark");
|
||||
argument_parser.add_argument("cores")
|
||||
.help("Range of the number of cores (1 for using 1 core, 1: for using 1 up to available cores, 1:4 for using "
|
||||
"cores from 1 to 4).")
|
||||
.default_value(std::string("1"));
|
||||
argument_parser.add_argument("-s", "--steps")
|
||||
.help("Steps, how number of cores is increased (1,2,4,6,.. for -s 2).")
|
||||
.default_value(std::uint16_t(2U))
|
||||
.action([](const std::string &value) { return std::uint16_t(std::stoi(value)); });
|
||||
argument_parser.add_argument("-i", "--iterations")
|
||||
.help("Number of iterations for each workload")
|
||||
.default_value(std::uint16_t(1U))
|
||||
.action([](const std::string &value) { return std::uint16_t(std::stoi(value)); });
|
||||
argument_parser.add_argument("-sco", "--system-core-order")
|
||||
.help("Use systems core order. If not, cores are ordered by node id (should be preferred).")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("-p", "--perf")
|
||||
.help("Use performance counter.")
|
||||
.implicit_value(true)
|
||||
.default_value(false);
|
||||
argument_parser.add_argument("-pd", "--prefetch-distance")
|
||||
.help("Distance of prefetched data objects (0 = disable prefetching).")
|
||||
.default_value(std::uint16_t(0U))
|
||||
.action([](const std::string &value) { return std::uint16_t(std::stoi(value)); });
|
||||
argument_parser.add_argument("-o", "--out")
|
||||
.help("Name of the file, the results will be written to.")
|
||||
.default_value(std::string(""));
|
||||
argument_parser.add_argument("--batch")
|
||||
.help("Number of tuples build/probed together; comma separated as string (e.g. \"64,128,256\")")
|
||||
.default_value(std::string("128"));
|
||||
argument_parser.add_argument("-R").help("Data file of left relation.").default_value(std::string("customer.tbl"));
|
||||
argument_parser.add_argument("-R-key")
|
||||
.help("Index of join key of R")
|
||||
.default_value(std::uint16_t(0U))
|
||||
.action([](const std::string &value) { return std::uint16_t(std::stoi(value)); });
|
||||
argument_parser.add_argument("-S").help("Data file of right relation.").default_value(std::string("orders.tbl"));
|
||||
argument_parser.add_argument("-S-key")
|
||||
.help("Index of join key of S")
|
||||
.default_value(std::uint16_t(1U))
|
||||
.action([](const std::string &value) { return std::uint16_t(std::stoi(value)); });
|
||||
|
||||
// Parse arguments.
|
||||
try
|
||||
{
|
||||
argument_parser.parse_args(count_arguments, arguments);
|
||||
}
|
||||
catch (std::runtime_error &e)
|
||||
{
|
||||
std::cout << argument_parser << std::endl;
|
||||
return std::make_pair(nullptr, 0U);
|
||||
}
|
||||
|
||||
auto order =
|
||||
argument_parser.get<bool>("-sco") ? mx::util::core_set::Order::Ascending : mx::util::core_set::Order::NUMAAware;
|
||||
auto cores =
|
||||
benchmark::Cores({argument_parser.get<std::string>("cores"), argument_parser.get<std::uint16_t>("-s"), order});
|
||||
|
||||
std::vector<std::uint32_t> build_probe_batches;
|
||||
auto batches = std::stringstream{argument_parser.get<std::string>("--batch")};
|
||||
std::string batch;
|
||||
while (std::getline(batches, batch, ','))
|
||||
{
|
||||
build_probe_batches.emplace_back(std::stoul(batch));
|
||||
}
|
||||
|
||||
// Join relations
|
||||
auto r = std::make_pair(argument_parser.get<std::string>("-R"), argument_parser.get<std::uint16_t>("-R-key"));
|
||||
auto s = std::make_pair(argument_parser.get<std::string>("-S"), argument_parser.get<std::uint16_t>("-S-key"));
|
||||
|
||||
// Create the benchmark.
|
||||
auto *benchmark = new Benchmark(std::move(cores), argument_parser.get<std::uint16_t>("-i"),
|
||||
std::move(build_probe_batches), std::make_tuple(std::move(r), std::move(s)),
|
||||
argument_parser.get<bool>("-p"), argument_parser.get<std::string>("-o"));
|
||||
|
||||
return {benchmark, argument_parser.get<std::uint16_t>("-pd")};
|
||||
}
|
||||
31
src/application/hashjoin_benchmark/merge_task.cpp
Normal file
31
src/application/hashjoin_benchmark/merge_task.cpp
Normal file
@@ -0,0 +1,31 @@
|
||||
#include "merge_task.h"
|
||||
#include "benchmark.h"
|
||||
|
||||
using namespace application::hash_join;
|
||||
|
||||
// Allocates one cache-line-aligned result vector per channel and reserves
// `output_per_core` entries in each, placed on the channel's NUMA node.
MergeTask::MergeTask(const mx::util::core_set &cores, Benchmark *benchmark, const std::uint64_t output_per_core)
    : _benchmark(benchmark), _count_cores(cores.size())
{
    // Raw array owned by this task; released in the destructor.
    this->_result_sets = new mx::util::aligned_t<mx::util::vector<std::pair<std::size_t, std::size_t>>>[cores.size()];

    for (auto channel_id = 0U; channel_id < cores.size(); ++channel_id)
    {
        // reserve() takes the NUMA node so the backing memory is local to the channel's core.
        this->_result_sets[channel_id].value().reserve(cores.numa_node_id(channel_id), output_per_core);
    }
}
|
||||
|
||||
// Releases the per-channel result-set array allocated in the constructor.
MergeTask::~MergeTask()
{
    delete[] this->_result_sets;
}
|
||||
|
||||
// Sums the join-result sizes of all channels into _count_output_tuples and
// stops the benchmark run. Returns make_null(): the task is not removed here
// (presumably it is reused or freed by its owner — TODO confirm against Benchmark).
mx::tasking::TaskResult MergeTask::execute(const std::uint16_t /*core_id*/, const std::uint16_t /*channel_id*/)
{
    for (auto channel = 0U; channel < _count_cores; ++channel)
    {
        _count_output_tuples += result_set(channel).size();
    }

    _benchmark->stop();
    return mx::tasking::TaskResult::make_null();
}
|
||||
41
src/application/hashjoin_benchmark/merge_task.h
Normal file
41
src/application/hashjoin_benchmark/merge_task.h
Normal file
@@ -0,0 +1,41 @@
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <functional>
|
||||
#include <mx/memory/global_heap.h>
|
||||
#include <mx/tasking/task.h>
|
||||
#include <mx/util/aligned_t.h>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <mx/util/vector.h>
|
||||
#include <utility>
|
||||
|
||||
namespace application::hash_join {
|
||||
class Benchmark;
|
||||
class MergeTask final : public mx::tasking::TaskInterface
|
||||
{
|
||||
public:
|
||||
MergeTask(const mx::util::core_set &cores, Benchmark *benchmark, std::uint64_t output_per_core);
|
||||
~MergeTask() override;
|
||||
|
||||
mx::tasking::TaskResult execute(std::uint16_t /*core_id*/, std::uint16_t /*channel_id*/) override;
|
||||
|
||||
mx::util::vector<std::pair<std::size_t, std::size_t>> &result_set(const std::uint16_t channel_id)
|
||||
{
|
||||
return _result_sets[channel_id].value();
|
||||
}
|
||||
|
||||
[[nodiscard]] const mx::util::vector<std::pair<std::size_t, std::size_t>> &result_set(
|
||||
const std::uint16_t channel_id) const
|
||||
{
|
||||
return _result_sets[channel_id].value();
|
||||
}
|
||||
|
||||
[[nodiscard]] std::size_t count_tuples() const noexcept { return _count_output_tuples; }
|
||||
|
||||
private:
|
||||
Benchmark *_benchmark;
|
||||
const std::uint16_t _count_cores;
|
||||
std::size_t _count_output_tuples{0U};
|
||||
mx::util::aligned_t<mx::util::vector<std::pair<std::size_t, std::size_t>>> *_result_sets;
|
||||
};
|
||||
} // namespace application::hash_join
|
||||
31
src/application/hashjoin_benchmark/notification_task.h
Normal file
31
src/application/hashjoin_benchmark/notification_task.h
Normal file
@@ -0,0 +1,31 @@
|
||||
#pragma once
|
||||
#include "listener.h"
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <mx/tasking/task.h>
|
||||
|
||||
namespace application::hash_join {
|
||||
/**
 * Marker task spawned after all build/probe tasks of a channel were
 * dispatched. It drives the listener's two-level countdown and fires the
 * notificator once every channel has finished.
 */
template <class N> class NotificationTask final : public mx::tasking::TaskInterface
{
public:
    /// `explicit` prevents accidental implicit conversion from a Listener.
    explicit NotificationTask(Listener<N> &listener) : _listener(listener) {}

    ~NotificationTask() override = default;

    /**
     * Decrements the channel-local countdown; the last notification of a
     * channel decrements the global countdown, and the last channel overall
     * invokes the listener's notificator.
     */
    mx::tasking::TaskResult execute(const std::uint16_t /*core_id*/, const std::uint16_t channel_id) override
    {
        if (--_listener.pending_local(channel_id) == 0U)
        {
            if (--_listener.pending_global() == 0U)
            {
                _listener.notificator()(channel_id);
            }
        }

        return mx::tasking::TaskResult::make_remove();
    }

private:
    Listener<N> &_listener;
};
|
||||
} // namespace application::hash_join
|
||||
17
src/application/hashjoin_benchmark/notifier.cpp
Normal file
17
src/application/hashjoin_benchmark/notifier.cpp
Normal file
@@ -0,0 +1,17 @@
|
||||
#include "notifier.h"
|
||||
#include <mx/tasking/runtime.h>
|
||||
|
||||
using namespace application::hash_join;
|
||||
|
||||
void BuildFinishedNotifier::operator()(const std::uint16_t channel_id)
|
||||
{
|
||||
for (auto target_channel_id = 0U; target_channel_id < this->_count_cores; ++target_channel_id)
|
||||
{
|
||||
mx::tasking::runtime::spawn(*this->_probe_tasks[target_channel_id], channel_id);
|
||||
}
|
||||
}
|
||||
|
||||
// The probe phase is complete: spawn the final merge task on the channel that
// finished last.
void ProbeFinishedNotifier::operator()(const std::uint16_t channel_id)
{
    mx::tasking::runtime::spawn(*this->_merge_task, channel_id);
}
|
||||
56
src/application/hashjoin_benchmark/notifier.h
Normal file
56
src/application/hashjoin_benchmark/notifier.h
Normal file
@@ -0,0 +1,56 @@
|
||||
#pragma once
|
||||
|
||||
#include "merge_task.h"
|
||||
#include <array>
|
||||
#include <mx/tasking/task.h>
|
||||
#include <mx/util/vector.h>
|
||||
|
||||
namespace application::hash_join {
|
||||
|
||||
// Fired when the build phase finished; spawns the registered probe tasks
// (see notifier.cpp).
class BuildFinishedNotifier
{
public:
    constexpr BuildFinishedNotifier() = default;
    // NOTE(review): single-argument constructor is implicit; consider marking
    // it `explicit` after checking no caller relies on the conversion.
    constexpr BuildFinishedNotifier(const std::uint16_t count_cores) : _count_cores(count_cores)
    {
        _probe_tasks.fill(nullptr);
    }

    // Declaring only move-assignment: copy operations are implicitly deleted.
    BuildFinishedNotifier &operator=(BuildFinishedNotifier &&) = default;

    ~BuildFinishedNotifier() = default;

    // Registers the probe task to be spawned for the given slot once the
    // build phase completes (pointers are not owned by this notifier).
    void dispatch_probe_task(const std::uint16_t index, mx::tasking::TaskInterface *task) noexcept
    {
        _probe_tasks[index] = task;
    }

    // Invoked by the notification machinery on build completion; spawns all
    // registered probe tasks (defined in notifier.cpp).
    void operator()(std::uint16_t channel_id);

private:
    std::uint16_t _count_cores{0U};
    std::array<mx::tasking::TaskInterface *, mx::tasking::config::max_cores()> _probe_tasks{};
};
|
||||
|
||||
// Fired when the probe phase finished; spawns the merge task (see notifier.cpp).
class ProbeFinishedNotifier
{
public:
    constexpr ProbeFinishedNotifier() = default;
    // NOTE(review): single-argument constructor is implicit; consider `explicit`.
    // The merge task pointer is borrowed, not owned.
    constexpr ProbeFinishedNotifier(MergeTask *merge_task) : _merge_task(merge_task) {}

    // Declaring only move-assignment: copy operations are implicitly deleted.
    ProbeFinishedNotifier &operator=(ProbeFinishedNotifier &&) = default;

    ~ProbeFinishedNotifier() = default;

    // Invoked on probe completion; spawns the merge task (defined in notifier.cpp).
    void operator()(std::uint16_t channel_id);

    // Per-channel join-result vector, forwarded from the merge task.
    mx::util::vector<std::pair<std::size_t, std::size_t>> &result_set(const std::uint16_t channel_id) noexcept
    {
        return _merge_task->result_set(channel_id);
    }

private:
    MergeTask *_merge_task{nullptr};
};
|
||||
|
||||
} // namespace application::hash_join
|
||||
117
src/application/hashjoin_benchmark/partition_task.h
Normal file
117
src/application/hashjoin_benchmark/partition_task.h
Normal file
@@ -0,0 +1,117 @@
|
||||
#pragma once
|
||||
#include "build_task.h"
|
||||
#include "listener.h"
|
||||
#include "notification_task.h"
|
||||
#include "notifier.h"
|
||||
#include "probe_task.h"
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <mx/tasking/runtime.h>
|
||||
#include <mx/tasking/task.h>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <vector>
|
||||
|
||||
namespace application::hash_join {
|
||||
|
||||
// Maps a task type to the notifier fired when its phase completes.
// Primary template (build phase tasks) -> BuildFinishedNotifier.
template <class T> struct notifier_type
{
    using value = BuildFinishedNotifier;
};

// Probe phase -> ProbeFinishedNotifier.
template <> struct notifier_type<ProbeTask>
{
    using value = ProbeFinishedNotifier;
};
|
||||
|
||||
/**
 * Partitions the key column annotated to this task across all cores.
 *
 * T is either BuildTask or ProbeTask. Keys are hashed to a target channel and
 * buffered in per-channel batch tasks of `_batch_size` tuples; a full batch is
 * spawned and replaced by a fresh task. Finally, the remaining partial batches
 * and one NotificationTask per channel are spawned.
 */
template <typename T> class PartitionTask final : public mx::tasking::TaskInterface
{
public:
    constexpr PartitionTask(Listener<typename notifier_type<T>::value> &listener, const std::uint32_t batch_size,
                            const std::size_t count, const mx::resource::ptr *hash_tables) noexcept
        : _listener(listener), _batch_size(batch_size), _count(count), _hash_tables(hash_tables)
    {
    }

    ~PartitionTask() override = default;

    mx::tasking::TaskResult execute(const std::uint16_t core_id, const std::uint16_t channel_id) override
    {
        const auto count_cores = _listener.count_cores();

        // One open batch task per target channel; ProbeTask additionally gets
        // the channel's result set, BuildTask does not.
        auto build_probe_tasks = std::array<T *, mx::tasking::config::max_cores()>{nullptr};
        for (auto target_channel_id = 0U; target_channel_id < count_cores; ++target_channel_id)
        {
            if constexpr (std::is_same<T, BuildTask>::value)
            {
                build_probe_tasks[target_channel_id] = mx::tasking::runtime::new_task<T>(
                    core_id, _batch_size, mx::tasking::runtime::numa_node_id(target_channel_id));
            }
            else
            {
                build_probe_tasks[target_channel_id] = mx::tasking::runtime::new_task<T>(
                    core_id, _listener.notificator().result_set(target_channel_id), _batch_size,
                    mx::tasking::runtime::numa_node_id(target_channel_id));
            }

            // Annotate the target hash table; 64 is presumably a size/hint in
            // bytes (cache line) — TODO confirm against annotate()'s contract.
            build_probe_tasks[target_channel_id]->annotate(_hash_tables[target_channel_id], 64U);
        }

        // The annotated resource holds this partition's slice of the key column.
        auto *data = this->annotated_resource().template get<std::uint32_t>();
        const auto offset = channel_id * _count; // global row id base of this slice
        for (auto data_index = 0U; data_index < _count; ++data_index)
        {
            const auto key = data[data_index];

            // Distribute key to core
            const auto target_channel_id = PartitionTask::hash(key) % count_cores;
            build_probe_tasks[target_channel_id]->emplace_back(offset + data_index, key);

            // Run specific task and create new.
            if (build_probe_tasks[target_channel_id]->size() == _batch_size)
            {
                mx::tasking::runtime::spawn(*build_probe_tasks[target_channel_id], channel_id);

                if constexpr (std::is_same<T, BuildTask>::value)
                {
                    build_probe_tasks[target_channel_id] = mx::tasking::runtime::new_task<T>(
                        core_id, _batch_size, mx::tasking::runtime::numa_node_id(target_channel_id));
                }
                else
                {
                    build_probe_tasks[target_channel_id] = mx::tasking::runtime::new_task<T>(
                        core_id, _listener.notificator().result_set(target_channel_id), _batch_size,
                        mx::tasking::runtime::numa_node_id(target_channel_id));
                }

                build_probe_tasks[target_channel_id]->annotate(_hash_tables[target_channel_id], 64U);
            }
        }

        for (auto target_channel_id = 0U; target_channel_id < count_cores; ++target_channel_id)
        {
            // Run last build/probe tasks that are not "full".
            mx::tasking::runtime::spawn(*build_probe_tasks[target_channel_id], channel_id);

            // Run notification tasks for every core, indicating that all
            // build/probe tasks of this core are dispatched.
            auto *notification_task =
                mx::tasking::runtime::new_task<NotificationTask<typename notifier_type<T>::value>>(core_id, _listener);
            notification_task->annotate(std::uint16_t(target_channel_id));
            mx::tasking::runtime::spawn(*notification_task, channel_id);
        }

        return mx::tasking::TaskResult::make_remove();
    }

private:
    Listener<typename notifier_type<T>::value> &_listener;
    const std::uint32_t _batch_size;      // tuples buffered per batch task
    const std::size_t _count;             // number of keys in this partition's slice
    const mx::resource::ptr *_hash_tables; // one hash table per target channel (not owned)

    // NOTE(review): the return type truncates std::hash's result to 16 bits
    // before the caller applies `% count_cores`; verify the distribution is
    // acceptable for the deployed std::hash implementation.
    static std::uint16_t hash(const std::uint32_t key) { return std::hash<std::uint32_t>()(key); }
};
|
||||
} // namespace application::hash_join
|
||||
51
src/application/hashjoin_benchmark/probe_task.h
Normal file
51
src/application/hashjoin_benchmark/probe_task.h
Normal file
@@ -0,0 +1,51 @@
|
||||
#pragma once
|
||||
|
||||
#include "inline_hashtable.h"
|
||||
#include <iostream>
|
||||
#include <mx/tasking/task.h>
|
||||
#include <mx/util/vector.h>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace application::hash_join {
|
||||
class ProbeTask final : public mx::tasking::TaskInterface
|
||||
{
|
||||
public:
|
||||
ProbeTask(mx::util::vector<std::pair<std::size_t, std::size_t>> &result_set, const std::size_t size,
|
||||
const std::uint8_t /*numa_node_id*/)
|
||||
: _result_set(result_set)
|
||||
{
|
||||
_keys.reserve(size);
|
||||
}
|
||||
|
||||
~ProbeTask() override = default;
|
||||
|
||||
mx::tasking::TaskResult execute(const std::uint16_t /*core_id*/, const std::uint16_t /*channel_id*/) override
|
||||
{
|
||||
auto *hashtable = this->annotated_resource().get<InlineHashtable<std::uint32_t, std::size_t>>();
|
||||
|
||||
for (const auto &[row_id, key] : _keys)
|
||||
{
|
||||
const auto row = hashtable->get(key);
|
||||
if (row != std::numeric_limits<std::size_t>::max())
|
||||
{
|
||||
_result_set.emplace_back(std::make_pair(row_id, row));
|
||||
}
|
||||
}
|
||||
|
||||
return mx::tasking::TaskResult::make_remove();
|
||||
}
|
||||
|
||||
void emplace_back(const std::size_t row_id, const std::uint32_t key) noexcept
|
||||
{
|
||||
_keys.emplace_back(std::make_pair(row_id, key));
|
||||
}
|
||||
|
||||
[[nodiscard]] std::uint64_t size() const noexcept { return _keys.size(); }
|
||||
[[nodiscard]] bool empty() const noexcept { return _keys.empty(); }
|
||||
|
||||
private:
|
||||
std::vector<std::pair<std::size_t, std::uint32_t>> _keys;
|
||||
mx::util::vector<std::pair<std::size_t, std::size_t>> &_result_set;
|
||||
};
|
||||
} // namespace application::hash_join
|
||||
24
src/application/hashjoin_benchmark/tpch_table_reader.cpp
Normal file
24
src/application/hashjoin_benchmark/tpch_table_reader.cpp
Normal file
@@ -0,0 +1,24 @@
|
||||
#include "tpch_table_reader.h"
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
using namespace application::hash_join;
|
||||
// Reads a '|'-separated TPC-H table file and invokes the callback once per
// column value with (column index, raw column text). Silently does nothing if
// the file cannot be opened.
void TPCHTableReader::read(const std::string &file_name,
                           std::function<void(const std::uint16_t, const std::string &)> &&callback)
{
    std::ifstream tpch_file{file_name};
    if (tpch_file.good() == false)
    {
        return;
    }

    std::string line;
    while (std::getline(tpch_file, line))
    {
        auto line_stream = std::stringstream{line};

        std::string column;
        auto column_index = 0U;
        while (std::getline(line_stream, column, '|'))
        {
            callback(column_index++, column);
        }
    }
}
|
||||
13
src/application/hashjoin_benchmark/tpch_table_reader.h
Normal file
13
src/application/hashjoin_benchmark/tpch_table_reader.h
Normal file
@@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
|
||||
namespace application::hash_join {
|
||||
// Reads '|'-separated TPC-H table files (implementation in tpch_table_reader.cpp).
class TPCHTableReader
{
public:
    // Invokes `callback` with (column index, raw column text) for every column
    // of every line of the file.
    static void read(const std::string &file_name,
                     std::function<void(const std::uint16_t, const std::string &)> &&callback);
};
|
||||
} // namespace application::hash_join
|
||||
225
src/benchmark/chronometer.h
Normal file
225
src/benchmark/chronometer.h
Normal file
@@ -0,0 +1,225 @@
|
||||
#pragma once
|
||||
|
||||
#include "perf.h"
|
||||
#include "phase.h"
|
||||
#include <chrono>
|
||||
#include <json.hpp>
|
||||
#include <mx/tasking/config.h>
|
||||
#include <mx/tasking/profiling/statistic.h>
|
||||
#include <mx/tasking/runtime.h>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <numeric>
|
||||
#include <ostream>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace benchmark {
|
||||
/**
|
||||
* The InterimResult is part of the chronometer, which in turn holds
|
||||
* all results during a benchmark.
|
||||
*/
|
||||
template <typename P> class InterimResult
|
||||
{
|
||||
friend std::ostream &operator<<(std::ostream &stream, const InterimResult &result)
|
||||
{
|
||||
stream << result.core_count() << "\t" << result.iteration() << "\t" << result.phase() << "\t"
|
||||
<< result.time().count() << " ms"
|
||||
<< "\t" << result.throughput() << " op/s";
|
||||
|
||||
for (const auto &[name, value] : result.performance_counter())
|
||||
{
|
||||
const auto value_per_operation = value / double(result.operation_count());
|
||||
stream << "\t" << value_per_operation << " " << name << "/op";
|
||||
}
|
||||
|
||||
if constexpr (mx::tasking::config::task_statistics())
|
||||
{
|
||||
stream << "\t" << result.executed_writer_tasks() / double(result.operation_count()) << " writer/op";
|
||||
stream << "\t" << result.executed_reader_tasks() / double(result.operation_count()) << " reader/op";
|
||||
stream << "\t" << result.scheduled_tasks_on_core() / double(result.operation_count()) << " on-channel/op";
|
||||
stream << "\t" << result.scheduled_tasks_off_core() / double(result.operation_count()) << " off-channel/op";
|
||||
stream << "\t" << result.worker_fills() / double(result.operation_count()) << " fills/op";
|
||||
}
|
||||
|
||||
return stream << std::flush;
|
||||
}
|
||||
|
||||
public:
|
||||
InterimResult(const std::uint64_t operation_count, const P &phase, const std::uint16_t iteration,
|
||||
const std::uint16_t core_count, const std::chrono::milliseconds time,
|
||||
std::vector<PerfCounter> &counter, std::unordered_map<std::uint16_t, std::uint64_t> executed_tasks,
|
||||
std::unordered_map<std::uint16_t, std::uint64_t> executed_reader_tasks,
|
||||
std::unordered_map<std::uint16_t, std::uint64_t> executed_writer_tasks,
|
||||
std::unordered_map<std::uint16_t, std::uint64_t> scheduled_tasks,
|
||||
std::unordered_map<std::uint16_t, std::uint64_t> scheduled_tasks_on_core,
|
||||
std::unordered_map<std::uint16_t, std::uint64_t> scheduled_tasks_off_core,
|
||||
std::unordered_map<std::uint16_t, std::uint64_t> worker_fills)
|
||||
: _operation_count(operation_count), _phase(phase), _iteration(iteration), _core_count(core_count), _time(time),
|
||||
_executed_tasks(std::move(executed_tasks)), _executed_reader_tasks(std::move(executed_reader_tasks)),
|
||||
_executed_writer_tasks(std::move(executed_writer_tasks)), _scheduled_tasks(std::move(scheduled_tasks)),
|
||||
_scheduled_tasks_on_core(std::move(scheduled_tasks_on_core)),
|
||||
_scheduled_tasks_off_core(std::move(scheduled_tasks_off_core)), _worker_fills(std::move(worker_fills))
|
||||
{
|
||||
for (auto &c : counter)
|
||||
{
|
||||
_performance_counter.emplace_back(std::make_pair(c.name(), c.read()));
|
||||
}
|
||||
}
|
||||
|
||||
~InterimResult() = default;
|
||||
|
||||
std::uint64_t operation_count() const noexcept { return _operation_count; }
|
||||
const P &phase() const noexcept { return _phase; }
|
||||
std::uint16_t iteration() const noexcept { return _iteration; }
|
||||
std::uint16_t core_count() const noexcept { return _core_count; }
|
||||
std::chrono::milliseconds time() const noexcept { return _time; }
|
||||
double throughput() const { return _operation_count / (_time.count() / 1000.0); }
|
||||
const std::vector<std::pair<std::string, double>> &performance_counter() const noexcept
|
||||
{
|
||||
return _performance_counter;
|
||||
}
|
||||
|
||||
[[maybe_unused]] std::uint64_t executed_tasks() const noexcept { return sum(_executed_tasks); }
|
||||
[[maybe_unused]] std::uint64_t executed_reader_tasks() const noexcept { return sum(_executed_reader_tasks); }
|
||||
[[maybe_unused]] std::uint64_t executed_writer_tasks() const noexcept { return sum(_executed_writer_tasks); }
|
||||
[[maybe_unused]] std::uint64_t scheduled_tasks() const noexcept { return sum(_scheduled_tasks); }
|
||||
[[maybe_unused]] std::uint64_t scheduled_tasks_on_core() const noexcept { return sum(_scheduled_tasks_on_core); }
|
||||
[[maybe_unused]] std::uint64_t scheduled_tasks_off_core() const noexcept { return sum(_scheduled_tasks_off_core); }
|
||||
[[maybe_unused]] std::uint64_t worker_fills() const noexcept { return sum(_worker_fills); }
|
||||
|
||||
std::uint64_t executed_tasks(const std::uint16_t channel_id) const noexcept
|
||||
{
|
||||
return _executed_tasks.at(channel_id);
|
||||
}
|
||||
std::uint64_t executed_reader_tasks(const std::uint16_t channel_id) const noexcept
|
||||
{
|
||||
return _executed_reader_tasks.at(channel_id);
|
||||
}
|
||||
std::uint64_t executed_writer_tasks(const std::uint16_t channel_id) const noexcept
|
||||
{
|
||||
return _executed_writer_tasks.at(channel_id);
|
||||
}
|
||||
std::uint64_t scheduled_tasks(const std::uint16_t channel_id) const noexcept
|
||||
{
|
||||
return _scheduled_tasks.at(channel_id);
|
||||
}
|
||||
std::uint64_t scheduled_tasks_on_core(const std::uint16_t channel_id) const noexcept
|
||||
{
|
||||
return _scheduled_tasks_on_core.at(channel_id);
|
||||
}
|
||||
std::uint64_t scheduled_tasks_off_core(const std::uint16_t channel_id) const noexcept
|
||||
{
|
||||
return _scheduled_tasks_off_core.at(channel_id);
|
||||
}
|
||||
std::uint64_t worker_fills(const std::uint16_t channel_id) const noexcept { return _worker_fills.at(channel_id); }
|
||||
|
||||
[[nodiscard]] nlohmann::json to_json() const noexcept
|
||||
{
|
||||
auto json = nlohmann::json{};
|
||||
json["iteration"] = iteration();
|
||||
json["cores"] = core_count();
|
||||
json["phase"] = phase();
|
||||
json["throughput"] = throughput();
|
||||
for (const auto &[name, value] : performance_counter())
|
||||
{
|
||||
json[name] = value / double(operation_count());
|
||||
}
|
||||
|
||||
if constexpr (mx::tasking::config::task_statistics())
|
||||
{
|
||||
json["executed-writer-tasks"] = executed_writer_tasks() / double(operation_count());
|
||||
json["executed-reader-tasks"] = executed_reader_tasks() / double(operation_count());
|
||||
json["scheduled-tasks-on-channel"] = scheduled_tasks_on_core() / double(operation_count());
|
||||
json["scheduled-tasks-off-channel"] = scheduled_tasks_off_core() / double(operation_count());
|
||||
json["buffer-fills"] = worker_fills() / double(operation_count());
|
||||
}
|
||||
|
||||
return json;
|
||||
}
|
||||
|
||||
private:
    // Number of operations of this run; divisor for all normalized values.
    const std::uint64_t _operation_count;
    // Benchmark phase this result belongs to.
    // NOTE(review): reference member - the referenced phase object must
    // outlive this result; confirm at the construction site.
    const P &_phase;
    // Iteration index of the benchmark run.
    const std::uint16_t _iteration;
    // Number of cores used in this run.
    const std::uint16_t _core_count;
    // Wall-clock duration of the run.
    const std::chrono::milliseconds _time;
    // (counter name, value) pairs sampled from the perf subsystem.
    std::vector<std::pair<std::string, double>> _performance_counter;
    // Per-channel task statistics (channel id -> counter value).
    const std::unordered_map<std::uint16_t, std::uint64_t> _executed_tasks;
    const std::unordered_map<std::uint16_t, std::uint64_t> _executed_reader_tasks;
    const std::unordered_map<std::uint16_t, std::uint64_t> _executed_writer_tasks;
    const std::unordered_map<std::uint16_t, std::uint64_t> _scheduled_tasks;
    const std::unordered_map<std::uint16_t, std::uint64_t> _scheduled_tasks_on_core;
    const std::unordered_map<std::uint16_t, std::uint64_t> _scheduled_tasks_off_core;
    const std::unordered_map<std::uint16_t, std::uint64_t> _worker_fills;
|
||||
|
||||
std::uint64_t sum(const std::unordered_map<std::uint16_t, std::uint64_t> &map) const noexcept
|
||||
{
|
||||
return std::accumulate(map.begin(), map.end(), 0U,
|
||||
[](const auto ¤t, const auto &item) { return current + item.second; });
|
||||
}
|
||||
};
|
||||
/**
|
||||
* The Chronometer is the "benchmark clock", which will be started and stopped
|
||||
* before and after each benchmark run. On stopping, the chronometer will calculate
|
||||
* used time, persist performance counter values, and mx::tasking statistics.
|
||||
*/
|
||||
template <typename P> class Chronometer
{
public:
    Chronometer() = default;
    ~Chronometer() = default;

    /**
     * Starts the clock for one benchmark run: remembers phase, iteration,
     * and core set, enables the perf counters, and takes the start timestamp.
     *
     * @param phase Benchmark phase of this run.
     * @param iteration Iteration index of this run.
     * @param core_set Cores used for this run.
     */
    void start(const P phase, const std::uint16_t iteration, const mx::util::core_set &core_set)
    {
        _current_phase = phase;
        _current_iteration = iteration;
        _core_set = core_set;

        // Start the counters before taking the timestamp so the measured
        // window does not include the (comparatively slow) perf ioctls.
        _perf.start();
        _start = std::chrono::steady_clock::now();
    }

    /**
     * Stops the clock: takes the end timestamp, stops the perf counters, and
     * collects all statistics into an InterimResult.
     *
     * @param count_operations Number of operations executed during the run.
     * @return Interim result describing this run.
     */
    InterimResult<P> stop(const std::uint64_t count_operations)
    {
        // Timestamp first, then stop counters - mirrors the order in start().
        const auto end = std::chrono::steady_clock::now();
        _perf.stop();

        const auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(end - _start);

        return {count_operations,
                _current_phase,
                _current_iteration,
                _core_set.size(),
                milliseconds,
                _perf.counter(),
                statistic_map(mx::tasking::profiling::Statistic::Executed),
                statistic_map(mx::tasking::profiling::Statistic::ExecutedReader),
                statistic_map(mx::tasking::profiling::Statistic::ExecutedWriter),
                statistic_map(mx::tasking::profiling::Statistic::Scheduled),
                statistic_map(mx::tasking::profiling::Statistic::ScheduledOnChannel),
                statistic_map(mx::tasking::profiling::Statistic::ScheduledOffChannel),
                statistic_map(mx::tasking::profiling::Statistic::Fill)};
    }

    /// Registers a performance counter to be sampled on start()/stop().
    void add(PerfCounter &performance_counter) { _perf.add(performance_counter); }

private:
    std::uint16_t _current_iteration{0U};
    P _current_phase;
    mx::util::core_set _core_set;
    // 64-byte alignment - presumably to keep these on their own cache lines
    // and avoid false sharing; confirm against the target architecture.
    alignas(64) Perf _perf;
    alignas(64) std::chrono::steady_clock::time_point _start;

    /**
     * Collects the given mx::tasking statistic for every channel.
     *
     * @param counter Statistic to collect.
     * @return Map from channel id to counter value.
     */
    std::unordered_map<std::uint16_t, std::uint64_t> statistic_map(
        const mx::tasking::profiling::Statistic::Counter counter)
    {
        std::unordered_map<std::uint16_t, std::uint64_t> statistics;
        for (auto i = 0U; i < mx::tasking::runtime::channels(); ++i)
        {
            statistics[i] = mx::tasking::runtime::statistic(counter, i);
        }
        return statistics;
    }
};
|
||||
} // namespace benchmark
|
||||
100
src/benchmark/cores.cpp
Normal file
100
src/benchmark/cores.cpp
Normal file
@@ -0,0 +1,100 @@
|
||||
#include "cores.h"
|
||||
#include <mx/system/topology.h>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
|
||||
using namespace benchmark;
|
||||
|
||||
/**
 * Builds core sets for every step between min_cores and max_cores.
 *
 * @param min_cores Lower bound of the core range (0 = only max_cores).
 * @param max_cores Upper bound of the core range.
 * @param steps Step width between generated core counts.
 * @param order Order in which physical cores are assigned.
 */
Cores::Cores(const std::uint16_t min_cores, const std::uint16_t max_cores, const std::uint16_t steps,
             const mx::util::core_set::Order order)
{
    this->add_for_range(min_cores, max_cores, steps, order);
}
|
||||
|
||||
/**
 * Builds core sets from a textual specification: a ';'-separated list of
 * tokens of the form "n" (single count), "n:" (n up to all cores on the
 * machine), or "n:m" (range from n to m).
 */
Cores::Cores(const std::string &cores, const std::uint16_t steps, const mx::util::core_set::Order order)
{
    // Patterns for the three supported token formats.
    const std::regex single_core_regex("(\\d+)$");
    const std::regex from_core_regex("(\\d+):$");
    const std::regex core_range_regex("(\\d+):(\\d+)");

    std::stringstream token_stream{cores};
    for (std::string token; std::getline(token_stream, token, ';');)
    {
        std::smatch token_match;

        if (std::regex_match(token, token_match, single_core_regex))
        {
            const auto single_core = std::stoi(token_match[1].str());
            this->add_for_range(single_core, single_core, steps, order);
        }
        else if (std::regex_match(token, token_match, from_core_regex))
        {
            this->add_for_range(std::stoi(token_match[1].str()), mx::system::topology::count_cores(), steps, order);
        }
        else if (std::regex_match(token, token_match, core_range_regex))
        {
            this->add_for_range(std::stoi(token_match[1].str()), std::stoi(token_match[2].str()), steps, order);
        }
    }
}
|
||||
|
||||
/**
 * Generates core sets for every multiple of `steps` within [min_cores,
 * max_cores]; the range bounds themselves are always included.
 *
 * NOTE(review): steps == 0 would divide by zero in the modulo checks below;
 * callers must pass steps >= 1.
 *
 * @param min_cores Lower bound of the core range (0 = only max_cores).
 * @param max_cores Upper bound of the core range.
 * @param steps Step width between generated core counts.
 * @param order Order in which physical cores are assigned.
 */
void Cores::add_for_range(const std::uint16_t min_cores, const std::uint16_t max_cores, const std::uint16_t steps,
                          const mx::util::core_set::Order order)
{
    // Degenerate range (or no lower bound given): a single core set.
    if (min_cores == 0U || min_cores == max_cores)
    {
        this->_core_sets.push_back(mx::util::core_set::build(max_cores, order));
    }
    else
    {
        auto cores = min_cores;
        // Include the lower bound even when it is not on the step grid.
        if (cores % steps != 0U)
        {
            this->_core_sets.push_back(mx::util::core_set::build(cores, order));
            cores++;
        }

        // Every multiple of `steps` within the range.
        for (auto count_cores = cores; count_cores <= max_cores; count_cores++)
        {
            if (count_cores % steps == 0U)
            {
                this->_core_sets.push_back(mx::util::core_set::build(count_cores, order));
            }
        }

        // Include the upper bound even when it is not on the step grid.
        if (max_cores % steps != 0U)
        {
            this->_core_sets.push_back(mx::util::core_set::build(max_cores, order));
        }
    }
}
|
||||
|
||||
std::string Cores::dump(const std::uint8_t indent) const
|
||||
{
|
||||
std::stringstream stream;
|
||||
|
||||
for (auto i = 0U; i < this->_core_sets.size(); ++i)
|
||||
{
|
||||
if (i > 0U)
|
||||
{
|
||||
stream << "\n";
|
||||
}
|
||||
const auto &core_set = this->_core_sets[i];
|
||||
if (indent > 0U)
|
||||
{
|
||||
stream << std::string(indent, ' ');
|
||||
}
|
||||
stream << core_set.size() << ": " << core_set;
|
||||
}
|
||||
stream << std::flush;
|
||||
|
||||
return stream.str();
|
||||
}
|
||||
|
||||
namespace benchmark {
/// Streams the full core-set listing (without indentation) and flushes.
std::ostream &operator<<(std::ostream &stream, const Cores &cores)
{
    return stream << cores.dump(0U) << std::endl;
}
} // namespace benchmark
|
||||
53
src/benchmark/cores.h
Normal file
53
src/benchmark/cores.h
Normal file
@@ -0,0 +1,53 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace benchmark {
|
||||
/**
|
||||
* Set of core_sets used for a benchmark that should be performed over
|
||||
* different core counts to benchmark scalability.
|
||||
* Can be created from min and max cores (i.e. 1 core to 32 cores) or from
|
||||
* string identifying the cores (i.e. "1:32").
|
||||
*/
|
||||
class Cores
|
||||
{
|
||||
friend std::ostream &operator<<(std::ostream &stream, const Cores &cores);
|
||||
|
||||
public:
|
||||
Cores(std::uint16_t min_cores, std::uint16_t max_cores, std::uint16_t steps, mx::util::core_set::Order order);
|
||||
Cores(const std::string &cores, std::uint16_t steps, mx::util::core_set::Order order);
|
||||
Cores(Cores &&) noexcept = default;
|
||||
|
||||
~Cores() = default;
|
||||
|
||||
const mx::util::core_set &next()
|
||||
{
|
||||
const auto current_index = _current_index++;
|
||||
if (current_index < _core_sets.size())
|
||||
{
|
||||
return _core_sets[current_index];
|
||||
}
|
||||
|
||||
return _empty_core_set;
|
||||
}
|
||||
|
||||
[[nodiscard]] const mx::util::core_set ¤t() const noexcept { return _core_sets[_current_index - 1]; }
|
||||
[[nodiscard]] std::size_t size() const noexcept { return _core_sets.size(); }
|
||||
|
||||
void reset() { _current_index = 0U; }
|
||||
|
||||
[[nodiscard]] std::string dump(std::uint8_t indent) const;
|
||||
|
||||
private:
|
||||
std::vector<mx::util::core_set> _core_sets;
|
||||
std::uint16_t _current_index = 0U;
|
||||
const mx::util::core_set _empty_core_set;
|
||||
|
||||
void add_for_range(std::uint16_t min_cores, std::uint16_t max_cores, std::uint16_t steps,
|
||||
mx::util::core_set::Order order);
|
||||
};
|
||||
} // namespace benchmark
|
||||
71
src/benchmark/perf.cpp
Normal file
71
src/benchmark/perf.cpp
Normal file
@@ -0,0 +1,71 @@
|
||||
#include "perf.h"

using namespace benchmark;

// NOTE(review): Perf::STALLED_CYCLES_BACKEND is declared in perf.h but has no
// definition in this file - any use of it will fail to link; confirm.

/**
 * Counter "Instructions Retired"
 * Counts when the last uop of an instruction retires.
 */
[[maybe_unused]] PerfCounter Perf::INSTRUCTIONS = {"instr", PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS};

/**
 * Counter "CPU Cycles"
 * Counts CPU clock cycles (PERF_COUNT_HW_CPU_CYCLES).
 */
[[maybe_unused]] PerfCounter Perf::CYCLES = {"cycles", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES};

/**
 * Counter "L1D Read Misses"
 * L1 data cache read accesses that miss
 * (cache id L1D, op READ, result MISS encoded into the config value).
 */
[[maybe_unused]] PerfCounter Perf::L1_MISSES = {"l1-miss", PERF_TYPE_HW_CACHE,
                                                PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                                                    (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)};

/**
 * Counter "LLC Misses"
 * Accesses to the LLC in which the data is not present (miss).
 */
[[maybe_unused]] PerfCounter Perf::LLC_MISSES = {"llc-miss", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES};

/**
 * Counter "LLC Reference"
 * Accesses to the LLC, in which the data is present (hit) or not present (miss).
 */
[[maybe_unused]] PerfCounter Perf::LLC_REFERENCES = {"llc-ref", PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES};

/**
 * Micro architecture "Skylake"
 * Counter "CYCLE_ACTIVITY.STALLS_MEM_ANY"
 * EventSel=A3H,UMask=14H, CMask=20
 * Execution stalls while memory subsystem has an outstanding load.
 */
PerfCounter Perf::STALLS_MEM_ANY = {"memory-stall", PERF_TYPE_RAW, 0x145314a3};

/**
 * Micro architecture "Skylake"
 * Counter "SW_PREFETCH_ACCESS.NTA"
 * EventSel=32H,UMask=01H
 * Number of PREFETCHNTA instructions executed.
 */
[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_NTA = {"sw-prefetch-nta", PERF_TYPE_RAW, 0x530132};

/**
 * Micro architecture "Skylake"
 * Counter "SW_PREFETCH_ACCESS.T0"
 * EventSel=32H,UMask=02H
 * Number of PREFETCHT0 instructions executed.
 */
[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_T0 = {"sw-prefetch-t0", PERF_TYPE_RAW, 0x530232};

/**
 * Micro architecture "Skylake"
 * Counter "SW_PREFETCH_ACCESS.T1_T2"
 * EventSel=32H,UMask=04H
 * Number of PREFETCHT1 or PREFETCHT2 instructions executed.
 */
[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_T1_T2 = {"sw-prefetch-t1t2", PERF_TYPE_RAW, 0x530432};

/**
 * Micro architecture "Skylake"
 * Counter "SW_PREFETCH_ACCESS.PREFETCHW"
 * EventSel=32H,UMask=08H
 * Number of PREFETCHW instructions executed.
 */
[[maybe_unused]] PerfCounter Perf::SW_PREFETCH_ACCESS_WRITE = {"sw-prefetch-w", PERF_TYPE_RAW, 0x530832};
|
||||
157
src/benchmark/perf.h
Normal file
157
src/benchmark/perf.h
Normal file
@@ -0,0 +1,157 @@
|
||||
#pragma once
|
||||
#include <algorithm>
|
||||
#include <asm/unistd.h>
|
||||
#include <cstring>
|
||||
#include <linux/perf_event.h>
|
||||
#include <string>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
#include <vector>
|
||||
|
||||
/*
|
||||
* For more Performance Counter take a look into the Manual from Intel:
|
||||
* https://software.intel.com/sites/default/files/managed/8b/6e/335279_performance_monitoring_events_guide.pdf
|
||||
*
|
||||
* To get event ids from manual specification see libpfm4:
|
||||
* http://www.bnikolic.co.uk/blog/hpc-prof-events.html
|
||||
* Clone, Make, use examples/check_events to generate event id code from event:
|
||||
* ./check_events <category>:<umask>[:c=<cmask>]
|
||||
* Example:
|
||||
 * ./check_events cycle_activity:0x14:c=20
|
||||
*/
|
||||
|
||||
namespace benchmark {
|
||||
|
||||
/**
|
||||
* Represents a Linux Performance Counter.
|
||||
*/
|
||||
/**
 * Represents a Linux Performance Counter, opened via perf_event_open and
 * sampled with start()/stop(); read() returns the multiplexing-corrected
 * event count of the last start/stop window.
 */
class PerfCounter
{
public:
    PerfCounter(std::string &&name, const std::uint64_t type, const std::uint64_t event_id) : _name(std::move(name))
    {
        std::memset(&_perf_event_attribute, 0, sizeof(perf_event_attr));
        _perf_event_attribute.type = type;
        _perf_event_attribute.size = sizeof(perf_event_attr);
        _perf_event_attribute.config = event_id;
        _perf_event_attribute.disabled = true;
        _perf_event_attribute.inherit = 1;
        _perf_event_attribute.exclude_kernel = false;
        _perf_event_attribute.exclude_hv = false;
        // time_enabled/time_running allow correcting for counter multiplexing.
        _perf_event_attribute.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
    }

    ~PerfCounter() = default;

    /// Opens the perf event for the calling process on any CPU.
    /// @return True on success (file descriptor valid).
    bool open()
    {
        _file_descriptor = syscall(__NR_perf_event_open, &_perf_event_attribute, 0, -1, -1, 0);
        return _file_descriptor >= 0;
    }

    /// Resets and enables the counter and samples the starting values.
    /// @return True when the starting sample was read successfully.
    bool start()
    {
        ioctl(_file_descriptor, PERF_EVENT_IOC_RESET, 0);
        ioctl(_file_descriptor, PERF_EVENT_IOC_ENABLE, 0);
        return ::read(_file_descriptor, &_prev, sizeof(read_format)) == sizeof(read_format);
    }

    /// Samples the final values and disables the counter.
    /// @return True when the final sample was read successfully.
    bool stop()
    {
        const auto is_read = ::read(_file_descriptor, &_data, sizeof(read_format)) == sizeof(read_format);
        ioctl(_file_descriptor, PERF_EVENT_IOC_DISABLE, 0);
        return is_read;
    }

    /**
     * @return Events counted between start() and stop(), scaled by
     *         time_enabled/time_running to correct for multiplexing.
     *         Returns 0.0 when the counter never ran (the original code
     *         divided by zero here and produced NaN).
     */
    [[nodiscard]] double read() const
    {
        const auto time_running = _data.time_running - _prev.time_running;
        if (time_running == 0U)
        {
            return 0.0;
        }

        const auto multiplexing_correction =
            static_cast<double>(_data.time_enabled - _prev.time_enabled) / static_cast<double>(time_running);
        return static_cast<double>(_data.value - _prev.value) * multiplexing_correction;
    }

    [[nodiscard]] const std::string &name() const { return _name; }
    explicit operator const std::string &() const { return name(); }

    bool operator==(const std::string &name) const { return _name == name; }

private:
    // Layout matches read_format = TOTAL_TIME_ENABLED | TOTAL_TIME_RUNNING.
    struct read_format
    {
        std::uint64_t value = 0;
        std::uint64_t time_enabled = 0;
        std::uint64_t time_running = 0;
    };

    const std::string _name;
    std::int32_t _file_descriptor = -1;
    perf_event_attr _perf_event_attribute{};
    read_format _prev{};
    read_format _data{};
};
|
||||
|
||||
/**
|
||||
* Holds a set of performance counter and starts/stops them together.
|
||||
*/
|
||||
class Perf
|
||||
{
|
||||
public:
|
||||
[[maybe_unused]] static PerfCounter INSTRUCTIONS;
|
||||
[[maybe_unused]] static PerfCounter CYCLES;
|
||||
[[maybe_unused]] static PerfCounter L1_MISSES;
|
||||
[[maybe_unused]] [[maybe_unused]] static PerfCounter LLC_MISSES;
|
||||
[[maybe_unused]] static PerfCounter LLC_REFERENCES;
|
||||
[[maybe_unused]] static PerfCounter STALLED_CYCLES_BACKEND;
|
||||
[[maybe_unused]] static PerfCounter STALLS_MEM_ANY;
|
||||
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_NTA;
|
||||
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T0;
|
||||
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_T1_T2;
|
||||
[[maybe_unused]] static PerfCounter SW_PREFETCH_ACCESS_WRITE;
|
||||
|
||||
Perf() noexcept = default;
|
||||
~Perf() noexcept = default;
|
||||
|
||||
bool add(PerfCounter &counter_)
|
||||
{
|
||||
if (counter_.open())
|
||||
{
|
||||
_counter.push_back(counter_);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void start()
|
||||
{
|
||||
for (auto &counter_ : _counter)
|
||||
{
|
||||
counter_.start();
|
||||
}
|
||||
}
|
||||
|
||||
void stop()
|
||||
{
|
||||
for (auto &counter_ : _counter)
|
||||
{
|
||||
counter_.stop();
|
||||
}
|
||||
}
|
||||
|
||||
double operator[](const std::string &name) const
|
||||
{
|
||||
auto counter_iterator = std::find(_counter.begin(), _counter.end(), name);
|
||||
if (counter_iterator != _counter.end())
|
||||
{
|
||||
return counter_iterator->read();
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
std::vector<PerfCounter> &counter() { return _counter; }
|
||||
|
||||
private:
|
||||
std::vector<PerfCounter> _counter;
|
||||
};
|
||||
} // namespace benchmark
|
||||
9
src/benchmark/phase.h
Normal file
9
src/benchmark/phase.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#pragma once
#include <cstdint>
namespace benchmark {
/**
 * Benchmark phases: FILL loads the initial data, MIXED runs the measured
 * workload. The numeric values double as indices into per-phase arrays.
 */
enum class phase : std::uint8_t
{
    FILL = 0U,
    MIXED = 1U
};
}
|
||||
15
src/benchmark/string_util.cpp
Normal file
15
src/benchmark/string_util.cpp
Normal file
@@ -0,0 +1,15 @@
|
||||
#include "string_util.h"
#include <sstream>

using namespace benchmark;

/**
 * Splits the text at every delimiter and invokes the callback per token.
 */
void string_util::split(const std::string &text, const char delimiter,
                        const std::function<void(const std::string &line)> &callback)
{
    std::stringstream input{text};
    for (std::string line; std::getline(input, line, delimiter);)
    {
        callback(line);
    }
}
|
||||
13
src/benchmark/string_util.h
Normal file
13
src/benchmark/string_util.h
Normal file
@@ -0,0 +1,13 @@
|
||||
#pragma once

#include <functional>
#include <string>

namespace benchmark {
/**
 * Small string helpers for the benchmark tooling.
 */
class string_util
{
public:
    /**
     * Splits the text at every occurrence of the delimiter and invokes the
     * callback once per token. Empty tokens between consecutive delimiters
     * are reported; a trailing delimiter produces no extra empty token.
     */
    static void split(const std::string &text, char delimiter,
                      const std::function<void(const std::string &line)> &callback);
};
} // namespace benchmark
|
||||
20
src/benchmark/workload.cpp
Normal file
20
src/benchmark/workload.cpp
Normal file
@@ -0,0 +1,20 @@
|
||||
#include "workload.h"
|
||||
#include <limits>
|
||||
|
||||
using namespace benchmark;
|
||||
|
||||
/**
 * Atomically claims up to `count` tuples of the current phase's workload.
 *
 * @param count Number of tuples the caller wants to process.
 * @return (start index, number of tuples actually available); when the
 *         workload is exhausted, (uint64 max, 0) is returned.
 */
std::pair<std::uint64_t, std::uint64_t> Workload::next(const std::uint64_t count) noexcept
{
    // Relaxed is sufficient: the index is the only shared state claimed here.
    const auto index = this->_current_index.fetch_add(count, std::memory_order_relaxed);
    const auto workload_size = this->_workload_set[this->_current_phase].size();

    // Clamp the claimed range to the end of the workload.
    return index < workload_size ? std::make_pair(index, std::min(count, workload_size - index))
                                 : std::make_pair(std::numeric_limits<std::uint64_t>::max(), 0UL);
}
|
||||
|
||||
namespace benchmark {
/// Streams the summary of the underlying workload set.
std::ostream &operator<<(std::ostream &stream, const Workload &workload)
{
    return stream << workload._workload_set << std::flush;
}
} // namespace benchmark
|
||||
58
src/benchmark/workload.h
Normal file
58
src/benchmark/workload.h
Normal file
@@ -0,0 +1,58 @@
|
||||
#pragma once
|
||||
|
||||
#include "phase.h"
|
||||
#include "workload_set.h"
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <utility>
|
||||
|
||||
namespace benchmark {
|
||||
/**
 * Concurrent cursor over a NumericWorkloadSet: worker threads claim batches
 * of tuples via next() while the benchmark switches phases via reset().
 */
class Workload
{
    friend std::ostream &operator<<(std::ostream &stream, const Workload &workload);

public:
    Workload() noexcept = default;
    ~Workload() noexcept = default;

    /// Builds the workload from YCSB-style files (one per phase).
    [[maybe_unused]] void build(const std::string &fill_workload_file, const std::string &mixed_workload_file)
    {
        _workload_set.build(fill_workload_file, mixed_workload_file);
    }

    /// Builds a synthetic workload from per-operation counts.
    [[maybe_unused]] void build(const std::uint64_t fill_inserts, const std::uint64_t mixed_inserts,
                                const std::uint64_t mixed_lookups, const std::uint64_t mixed_updates,
                                const std::uint64_t mixed_deletes)
    {
        _workload_set.build(fill_inserts, mixed_inserts, mixed_lookups, mixed_updates, mixed_deletes);
    }

    /// Shuffles the tuples of both phases.
    [[maybe_unused]] void shuffle() { _workload_set.shuffle(); }

    /// Atomically claims up to `count` tuples (see workload.cpp).
    std::pair<std::uint64_t, std::uint64_t> next(std::uint64_t count) noexcept;

    /// @return Number of tuples in the current phase.
    [[nodiscard]] std::uint64_t size() const noexcept { return _workload_set[_current_phase].size(); }
    /// @return True when the current phase has no tuples.
    [[nodiscard]] bool empty() const noexcept { return _workload_set[_current_phase].empty(); }
    /// @return True when the given phase has no tuples.
    [[nodiscard]] bool empty(const phase phase) const noexcept { return _workload_set[phase].empty(); }

    /// Switches to the given phase and rewinds the cursor.
    void reset(const phase phase) noexcept
    {
        _current_phase = phase;
        _current_index = 0;
    }

    /// @return Tuple at `index` within the current phase.
    const NumericTuple &operator[](const std::size_t index) const noexcept
    {
        return _workload_set[_current_phase][index];
    }
    bool operator==(const phase phase) const noexcept { return _current_phase == phase; }
    explicit operator phase() const noexcept { return _current_phase; }

private:
    NumericWorkloadSet _workload_set;
    phase _current_phase = phase::FILL;

    // 64-byte alignment - presumably its own cache line, since every worker
    // thread increments this counter; confirm against the target hardware.
    alignas(64) std::atomic_uint64_t _current_index{0U};
};
|
||||
168
src/benchmark/workload_set.cpp
Normal file
168
src/benchmark/workload_set.cpp
Normal file
@@ -0,0 +1,168 @@
|
||||
#include "workload_set.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <random>
|
||||
#include <thread>
|
||||
|
||||
using namespace benchmark;
|
||||
|
||||
/**
 * Parses both phases from YCSB-style workload files ("<OP> <key>" per line),
 * one file per phase, on two concurrent threads.
 *
 * @param fill_workload_file File with the fill-phase operations.
 * @param mixed_workload_file File with the mixed-phase operations.
 */
void NumericWorkloadSet::build(const std::string &fill_workload_file, const std::string &mixed_workload_file)
{
    // Parses one file into the given data set; returns true when the parsed
    // workload mutates the tree (INSERT or UPDATE seen).
    // NOTE(review): std::srand/std::rand share global state and are not
    // guaranteed thread-safe - this lambda runs on two threads concurrently,
    // so the generated values may race; confirm whether that matters.
    auto parse = [](auto &file_stream, std::vector<NumericTuple> &data_set) -> bool {
        std::srand(1337);
        std::string op_name;
        std::uint64_t key{};

        bool contains_update = false;

        while (file_stream >> op_name >> key)
        {
            if (op_name == "INSERT")
            {
                contains_update = true;
                data_set.emplace_back(NumericTuple{NumericTuple::INSERT, key, std::rand()});
            }
            else if (op_name == "READ")
            {
                data_set.emplace_back(NumericTuple{NumericTuple::LOOKUP, key});
            }
            else if (op_name == "UPDATE")
            {
                contains_update = true;
                data_set.emplace_back(NumericTuple{NumericTuple::UPDATE, key, std::rand()});
            }
        }

        return contains_update;
    };

    // The mutex only serializes error output of the two parser threads; each
    // thread writes to a different element of _data_sets.
    std::mutex out_mutex;
    std::thread fill_thread{[this, &out_mutex, &parse, &fill_workload_file]() {
        std::ifstream fill_file(fill_workload_file);
        if (fill_file.good())
        {
            parse(fill_file, this->_data_sets[static_cast<std::size_t>(phase::FILL)]);
        }
        else
        {
            std::lock_guard lock{out_mutex};
            std::cerr << "Could not open workload file '" << fill_workload_file << "'." << std::endl;
        }
    }};

    std::thread mixed_thread{[this, &out_mutex, &parse, &mixed_workload_file]() {
        std::ifstream mixed_file(mixed_workload_file);
        if (mixed_file.good())
        {
            // Remember whether the mixed phase mutates the tree (used when printing).
            this->_mixed_phase_contains_update =
                parse(mixed_file, this->_data_sets[static_cast<std::size_t>(phase::MIXED)]);
        }
        else
        {
            std::lock_guard lock{out_mutex};
            std::cerr << "Could not open workload file '" << mixed_workload_file << "'." << std::endl;
        }
    }};

    fill_thread.join();
    mixed_thread.join();
}
|
||||
|
||||
void NumericWorkloadSet::build(const std::uint64_t fill_inserts, const std::uint64_t mixed_inserts,
|
||||
const std::uint64_t mixed_lookups, const std::uint64_t mixed_updates,
|
||||
const std::uint64_t mixed_deletes)
|
||||
{
|
||||
std::srand(1337);
|
||||
this->_data_sets[static_cast<std::uint8_t>(phase::FILL)].reserve(fill_inserts);
|
||||
this->_data_sets[static_cast<std::uint8_t>(phase::MIXED)].reserve(mixed_inserts + mixed_lookups + mixed_updates +
|
||||
mixed_deletes);
|
||||
|
||||
for (auto i = 0U; i < fill_inserts; ++i)
|
||||
{
|
||||
this->_data_sets[static_cast<std::uint8_t>(phase::FILL)].emplace_back(
|
||||
NumericTuple{NumericTuple::INSERT, i + 1U, std::rand()});
|
||||
}
|
||||
|
||||
this->_mixed_phase_contains_update = mixed_inserts > 0U || mixed_deletes > 0U || mixed_updates > 0U;
|
||||
|
||||
for (auto i = fill_inserts; i < fill_inserts + mixed_inserts; ++i)
|
||||
{
|
||||
this->_data_sets[static_cast<std::uint8_t>(phase::MIXED)].emplace_back(
|
||||
NumericTuple{NumericTuple::INSERT, i + 1U, std::rand()});
|
||||
}
|
||||
|
||||
for (auto i = 0U; i < mixed_lookups; ++i)
|
||||
{
|
||||
this->_data_sets[static_cast<std::uint8_t>(phase::MIXED)].push_back(
|
||||
{NumericTuple::LOOKUP, this->_data_sets[static_cast<std::uint16_t>(phase::FILL)][i % fill_inserts].key()});
|
||||
}
|
||||
|
||||
for (auto i = 0U; i < mixed_updates; ++i)
|
||||
{
|
||||
this->_data_sets[static_cast<std::size_t>(phase::MIXED)].push_back(
|
||||
{NumericTuple::UPDATE, this->_data_sets[static_cast<std::uint16_t>(phase::FILL)][i % fill_inserts].key(),
|
||||
std::rand()});
|
||||
}
|
||||
|
||||
for (auto i = 0U; i < mixed_deletes; ++i)
|
||||
{
|
||||
this->_data_sets[static_cast<std::uint8_t>(phase::MIXED)].push_back(
|
||||
{NumericTuple::DELETE, this->_data_sets[static_cast<std::uint16_t>(phase::FILL)][i % fill_inserts].key()});
|
||||
}
|
||||
}
|
||||
|
||||
void NumericWorkloadSet::shuffle()
|
||||
{
|
||||
std::srand(1337U + 42U);
|
||||
std::random_device random_device;
|
||||
std::mt19937 mersenne_twister_engine(random_device());
|
||||
|
||||
std::shuffle(this->_data_sets[static_cast<std::uint8_t>(phase::FILL)].begin(),
|
||||
this->_data_sets[static_cast<std::uint8_t>(phase::FILL)].end(), mersenne_twister_engine);
|
||||
std::shuffle(this->_data_sets[static_cast<std::uint8_t>(phase::MIXED)].begin(),
|
||||
this->_data_sets[static_cast<std::uint8_t>(phase::MIXED)].end(), mersenne_twister_engine);
|
||||
}
|
||||
|
||||
/**
 * Prints the number in compact form: integer millions ("m"), thousands
 * ("k"), or the plain value.
 *
 * @param stream Stream to print to.
 * @param number Number to print.
 * @return The stream.
 */
std::ostream &NumericWorkloadSet::nice_print(std::ostream &stream, const std::size_t number) noexcept
{
    constexpr auto million = 1000000U;
    constexpr auto thousand = 1000U;

    if (number >= million)
    {
        stream << (number / million) << "m";
    }
    else if (number >= thousand)
    {
        stream << (number / thousand) << "k";
    }
    else
    {
        stream << number;
    }

    return stream;
}
|
||||
|
||||
namespace benchmark {
/// Prints a compact summary such as "fill: 1m / mixed: 500k"; the separator
/// appears only when both phases contain tuples.
std::ostream &operator<<(std::ostream &stream, const NumericWorkloadSet &workload)
{
    const auto has_fill_and_mixed = workload[phase::FILL].empty() == false && workload[phase::MIXED].empty() == false;

    if (workload[phase::FILL].empty() == false)
    {
        stream << "fill: ";
        NumericWorkloadSet::nice_print(stream, workload[phase::FILL].size());
    }

    if (has_fill_and_mixed)
    {
        stream << " / ";
    }

    if (workload[phase::MIXED].empty() == false)
    {
        // A mixed phase without any mutating operation is effectively read-only.
        stream << (workload._mixed_phase_contains_update ? "mixed: " : "read-only: ");
        NumericWorkloadSet::nice_print(stream, workload[phase::MIXED].size());
    }

    return stream << std::flush;
}
} // namespace benchmark
|
||||
74
src/benchmark/workload_set.h
Normal file
74
src/benchmark/workload_set.h
Normal file
@@ -0,0 +1,74 @@
|
||||
#pragma once
|
||||
|
||||
#include "phase.h"
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace benchmark {
|
||||
/**
 * A single workload operation: an operation type, a key, and (for mutating
 * operations) a value.
 */
class NumericTuple
{
public:
    /// Operation type of a workload tuple.
    enum Type
    {
        INSERT,
        LOOKUP,
        UPDATE,
        DELETE
    };

    constexpr NumericTuple(const Type type, const std::uint64_t key) : _type(type), _key(key) {}
    constexpr NumericTuple(const Type type, const std::uint64_t key, const std::int64_t value)
        : _type(type), _key(key), _value(value)
    {
    }

    NumericTuple(NumericTuple &&) noexcept = default;
    NumericTuple(const NumericTuple &) = default;

    ~NumericTuple() = default;

    NumericTuple &operator=(NumericTuple &&) noexcept = default;

    // Rule of five: the user-declared move assignment above implicitly
    // deleted the copy assignment; restore it so tuples stay copy-assignable.
    NumericTuple &operator=(const NumericTuple &) = default;

    /// @return Key of this operation.
    [[nodiscard]] std::uint64_t key() const { return _key; }
    /// @return Value of this operation (0 for non-mutating operations).
    [[nodiscard]] std::int64_t value() const { return _value; }

    /// @return True when this tuple is of the given operation type.
    bool operator==(const Type type) const { return _type == type; }

private:
    Type _type;
    std::uint64_t _key;
    std::int64_t _value = 0;
};
|
||||
|
||||
/**
 * Holds the fill- and mixed-phase tuple sets of a numeric workload.
 */
class NumericWorkloadSet
{
    friend std::ostream &operator<<(std::ostream &stream, const NumericWorkloadSet &workload_set);

public:
    NumericWorkloadSet() = default;
    ~NumericWorkloadSet() = default;

    /// Parses both phases from YCSB-style workload files (see workload_set.cpp).
    void build(const std::string &fill_workload_file, const std::string &mixed_workload_file);
    /// Generates both phases synthetically from per-operation counts.
    void build(std::uint64_t fill_inserts, std::uint64_t mixed_inserts, std::uint64_t mixed_lookups,
               std::uint64_t mixed_updates, std::uint64_t mixed_deletes);
    /// Shuffles the tuples of both phases.
    void shuffle();

    /// @return Tuples of the fill phase.
    [[nodiscard]] const std::vector<NumericTuple> &fill() const noexcept { return _data_sets[0]; }
    /// @return Tuples of the mixed phase.
    [[nodiscard]] const std::vector<NumericTuple> &mixed() const noexcept { return _data_sets[1]; }
    /// @return Tuples of the given phase.
    const std::vector<NumericTuple> &operator[](const phase phase) const noexcept
    {
        return _data_sets[static_cast<std::uint16_t>(phase)];
    }

    /// @return True when at least one phase contains tuples.
    explicit operator bool() const { return fill().empty() == false || mixed().empty() == false; }

private:
    // Index 0 = fill phase, index 1 = mixed phase (matches enum phase values).
    std::array<std::vector<NumericTuple>, 2> _data_sets;
    // True when the mixed phase contains inserts, updates, or deletes.
    bool _mixed_phase_contains_update = false;

    // Prints the number with an "m"/"k" suffix for millions/thousands.
    static std::ostream &nice_print(std::ostream &stream, std::size_t number) noexcept;
};
|
||||
} // namespace benchmark
|
||||
366
src/db/index/blinktree/b_link_tree.h
Normal file
366
src/db/index/blinktree/b_link_tree.h
Normal file
@@ -0,0 +1,366 @@
|
||||
#pragma once
|
||||
|
||||
#include "config.h"
|
||||
#include "node.h"
|
||||
#include "node_consistency_checker.h"
|
||||
#include "node_iterator.h"
|
||||
#include "node_statistics.h"
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <json.hpp>
|
||||
#include <mx/resource/resource.h>
|
||||
#include <mx/tasking/runtime.h>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace db::index::blinktree {
|
||||
|
||||
template <typename K, typename V> class BLinkTree
|
||||
{
|
||||
public:
|
||||
/**
 * Creates a B-link tree whose initial root is a single (empty) leaf node.
 *
 * @param isolation_level Isolation level used for node access.
 * @param preferred_synchronization_protocol Preferred protocol for node
 *        synchronization.
 */
BLinkTree(const mx::synchronization::isolation_level isolation_level,
          const mx::synchronization::protocol preferred_synchronization_protocol)
    : _isolation_level(isolation_level), _preferred_synchronization_protocol(preferred_synchronization_protocol),
      _root(create_node(NodeType::Leaf, mx::resource::ptr{}, true))
{
}

/// Releases the root node.
/// NOTE(review): only _root is deleted here; nodes created via
/// create_inner_node()/create_leaf_node() are presumably reclaimed
/// elsewhere - confirm the tree does not leak on destruction.
~BLinkTree() { mx::tasking::runtime::delete_resource<Node<K, V>>(_root); }
|
||||
|
||||
/**
 * @return Root node of the tree.
 */
[[nodiscard]] mx::resource::ptr root() const { return _root; }

/**
 * @return Height of the tree.
 */
[[nodiscard]] std::uint16_t height() const { return _height; }

/**
 * @return True, when the tree does not contain any value (no root set,
 *         or the root node is empty).
 */
[[nodiscard]] bool empty() const
{
    return static_cast<bool>(_root) == false || _root.template get<Node<K, V>>()->size() == 0;
}

/**
 * Creates a node of type inner.
 *
 * @param is_branch True, when the children of the new inner node will be leaf nodes.
 * @param parent Parent of the new inner node.
 * @param is_root True, then the new inner node will be the root.
 * @return Inner node.
 */
[[nodiscard]] mx::resource::ptr create_inner_node(const bool is_branch, const mx::resource::ptr parent,
                                                  const bool is_root = false) const
{
    // Branch nodes are the lowest inner level (their children are leaves).
    const auto inner_type = is_branch ? NodeType::Inner | NodeType::Branch : NodeType::Inner;
    return create_node(inner_type, parent, is_root);
}

/**
 * Creates a node of type leaf.
 *
 * @param parent Parent of the new leaf node.
 * @return Leaf node.
 */
[[nodiscard]] mx::resource::ptr create_leaf_node(const mx::resource::ptr parent) const
{
    return create_node(NodeType::Leaf, parent, false);
}
|
||||
|
||||
/**
|
||||
* Creates a new root node, containing two separators (to the left and right).
|
||||
* The new root node will be set in the tree.
|
||||
*
|
||||
* @param left Link to the "smaller" child node.
|
||||
* @param right Link to the "greater" child node.
|
||||
* @param key Separator key.
|
||||
*/
|
||||
void create_new_root(mx::resource::ptr left, mx::resource::ptr right, K key);
|
||||
|
||||
/**
|
||||
* Splits an inner node.
|
||||
*
|
||||
* @param inner_node Node to split.
|
||||
* @param key Key to insert after split.
|
||||
* @param separator Separator to insert after split.
|
||||
* @return Pointer and high key of the new node.
|
||||
*/
|
||||
std::pair<mx::resource::ptr, K> split(mx::resource::ptr inner_node, K key, mx::resource::ptr separator) const;
|
||||
|
||||
/**
|
||||
* Splits a leaf node.
|
||||
*
|
||||
* @param leaf_node Node to split.
|
||||
* @param key Key to insert after split.
|
||||
* @param value Value to insert after split.
|
||||
* @return Pointer to the leaf node and key for parent.
|
||||
*/
|
||||
std::pair<mx::resource::ptr, K> split(mx::resource::ptr leaf_node, K key, V value) const;
|
||||
|
||||
/**
|
||||
* @return Begin iterator for iterating ofer nodes.
|
||||
*/
|
||||
NodeIterator<K, V> begin() const { return NodeIterator(mx::resource::ptr_cast<Node<K, V>>(_root)); }
|
||||
|
||||
/**
|
||||
* @return End iterator (aka empty node iterator).
|
||||
*/
|
||||
NodeIterator<K, V> end() const { return {}; }
|
||||
|
||||
/**
|
||||
* Checks the consistency of the tree.
|
||||
*/
|
||||
void check() const;
|
||||
|
||||
/**
|
||||
* Dumps the statistics like height, number of (inner/leaf) nodes, number of records,... .
|
||||
*/
|
||||
void print_statistics() const;
|
||||
|
||||
explicit operator nlohmann::json() const
|
||||
{
|
||||
nlohmann::json out;
|
||||
out["height"] = _height;
|
||||
out["root"] = node_to_json(_root);
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
protected:
|
||||
// Height of the tree.
|
||||
std::uint8_t _height = 1;
|
||||
|
||||
// Isolation of tasks accessing a node.
|
||||
const mx::synchronization::isolation_level _isolation_level;
|
||||
|
||||
// Select a preferred method for synchronization.
|
||||
const mx::synchronization::protocol _preferred_synchronization_protocol;
|
||||
|
||||
// Pointer to the root.
|
||||
alignas(64) mx::resource::ptr _root;
|
||||
|
||||
/**
|
||||
* Creates a new node.
|
||||
*
|
||||
* @param node_type Type of the node.
|
||||
* @param parent Parent of the node.
|
||||
* @param is_root True, if the new node will be the root.
|
||||
* @return Pointer to the new node.
|
||||
*/
|
||||
[[nodiscard]] mx::resource::ptr create_node(const NodeType node_type, const mx::resource::ptr parent,
|
||||
const bool is_root) const
|
||||
{
|
||||
const auto is_inner = static_cast<bool>(node_type & NodeType::Inner);
|
||||
return mx::tasking::runtime::new_resource<Node<K, V>>(
|
||||
config::node_size(),
|
||||
mx::resource::hint{_isolation_level, _preferred_synchronization_protocol,
|
||||
predict_access_frequency(is_inner, is_root), predict_read_write_ratio(is_inner)},
|
||||
node_type, parent);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a hint for tasking regarding usage of the node.
|
||||
*
|
||||
* @param is_inner True, of the node is an inner node.
|
||||
* @param is_root True, of the node is the root.
|
||||
* @return Hint for usage prediction which will be used for allocating resources.
|
||||
*/
|
||||
[[nodiscard]] static mx::resource::hint::expected_access_frequency predict_access_frequency(const bool is_inner,
|
||||
const bool is_root)
|
||||
{
|
||||
if (is_root)
|
||||
{
|
||||
return mx::resource::hint::expected_access_frequency::excessive;
|
||||
}
|
||||
|
||||
if (is_inner)
|
||||
{
|
||||
return mx::resource::hint::expected_access_frequency::high;
|
||||
}
|
||||
|
||||
return mx::resource::hint::expected_access_frequency::normal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a hint for the read/write ratio.
|
||||
* Inner nodes will be written very little while
|
||||
* leaf nodes will be written more often.
|
||||
*
|
||||
* @param is_inner True, when the node is an inner node.
|
||||
* @return Predicted read/write ratio.
|
||||
*/
|
||||
[[nodiscard]] static mx::resource::hint::expected_read_write_ratio predict_read_write_ratio(const bool is_inner)
|
||||
{
|
||||
return is_inner ? mx::resource::hint::expected_read_write_ratio::heavy_read
|
||||
: mx::resource::hint::expected_read_write_ratio::balanced;
|
||||
}
|
||||
|
||||
/**
|
||||
* Serializes a tree node to json format.
|
||||
*
|
||||
* @param node Node to serialize.
|
||||
* @return JSON representation of the node.
|
||||
*/
|
||||
[[nodiscard]] nlohmann::json node_to_json(mx::resource::ptr node) const
|
||||
{
|
||||
auto out = nlohmann::json();
|
||||
auto node_ptr = mx::resource::ptr_cast<Node<K, V>>(node);
|
||||
|
||||
out["channel_id"] = node.channel_id();
|
||||
out["is_leaf"] = node_ptr->is_leaf();
|
||||
out["size"] = node_ptr->size();
|
||||
|
||||
if (node_ptr->is_inner())
|
||||
{
|
||||
auto children = nlohmann::json::array();
|
||||
for (auto i = 0U; i <= node_ptr->size(); ++i)
|
||||
{
|
||||
children.push_back(node_to_json(node_ptr->separator(i)));
|
||||
}
|
||||
out["children"] = children;
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Grows the tree by one level: a fresh inner node becomes the root, with
 * "left" (the old root) as separator 0 and "right" inserted under "key".
 */
template <typename K, typename V>
void BLinkTree<K, V>::create_new_root(const mx::resource::ptr left, const mx::resource::ptr right, const K key)
{
    // The old root loses its root status: downgrade its predicted access
    // frequency from (inner?, root) to (inner?, non-root).
    const auto is_left_inner = mx::resource::ptr_cast<Node<K, V>>(left)->is_inner();
    mx::tasking::runtime::modify_predicted_usage(left, predict_access_frequency(is_left_inner, true),
                                                 predict_access_frequency(is_left_inner, false));

    // The new root is a branch node exactly when its children are leaves.
    auto root = this->create_inner_node(mx::resource::ptr_cast<Node<K, V>>(left)->is_leaf(), mx::resource::ptr{}, true);

    left.template get<Node<K, V>>()->parent(root);
    right.template get<Node<K, V>>()->parent(root);

    // separator(0) = left covers keys below "key"; insert places "right" as
    // the separator for keys from "key" upwards.
    root.template get<Node<K, V>>()->separator(0, left);
    root.template get<Node<K, V>>()->insert(0, right, key);

    this->_height++;
    this->_root = root;
}
|
||||
|
||||
/**
 * Splits a full inner node into a left and a right half and inserts the
 * pending (key, separator) pair into the half it belongs to. Three cases are
 * distinguished by where the key falls relative to the split point; in every
 * case the key that moves up to the parent becomes the left node's new high
 * key and is returned together with the new right node.
 */
template <typename K, typename V>
std::pair<mx::resource::ptr, K> BLinkTree<K, V>::split(const mx::resource::ptr inner_node, const K key,
                                                       const mx::resource::ptr separator) const
{
    constexpr std::uint16_t left_size = InnerNode<K, V>::max_keys / 2;
    constexpr std::uint16_t right_size = InnerNode<K, V>::max_keys - left_size;

    auto node_ptr = mx::resource::ptr_cast<Node<K, V>>(inner_node);

    K key_up;
    auto new_inner_node = this->create_inner_node(node_ptr->is_branch(), node_ptr->parent());
    auto new_node_ptr = mx::resource::ptr_cast<Node<K, V>>(new_inner_node);

    // The right node inherits the old high key; the left node is re-capped
    // with key_up at the end.
    new_node_ptr->high_key(node_ptr->high_key());

    if (key < node_ptr->inner_key(left_size - 1))
    {
        // Case 1: the new key belongs into the left node. Move the right half
        // out, let the key at left_size - 1 move up, then insert locally.
        node_ptr->move(new_inner_node, left_size, right_size);
        new_node_ptr->separator(0, node_ptr->separator(left_size));
        new_node_ptr->size(right_size);
        node_ptr->size(left_size - 1);
        key_up = node_ptr->inner_key(left_size - 1);
        const auto index = node_ptr->index(key);
        separator.template get<Node<K, V>>()->parent(inner_node);
        node_ptr->insert(index, separator, key);
    }
    else if (key < node_ptr->inner_key(left_size))
    {
        // Case 2: the new key is exactly the split point. It moves up and the
        // pending separator becomes the right node's leftmost child.
        node_ptr->move(new_inner_node, left_size, right_size);
        new_node_ptr->separator(0, separator);
        key_up = key;
        node_ptr->size(left_size);
        new_node_ptr->size(right_size);
    }
    else
    {
        // Case 3: the new key belongs into the right node.
        node_ptr->move(new_inner_node, left_size + 1, right_size - 1);
        new_node_ptr->separator(0, node_ptr->separator(left_size + 1));
        node_ptr->size(left_size);
        new_node_ptr->size(right_size - 1);
        key_up = node_ptr->inner_key(left_size);

        const auto index = new_node_ptr->index(key);
        new_node_ptr->insert(index, separator, key);
    }

    // Link the new node into the sibling chain and cap the left node.
    new_node_ptr->right_sibling(node_ptr->right_sibling());
    node_ptr->right_sibling(new_inner_node);
    node_ptr->high_key(key_up);

    // Re-parent every child that moved to the new node (size() + 1 separators).
    for (auto index = 0U; index <= new_node_ptr->size(); ++index)
    {
        new_node_ptr->separator(index).template get<Node<K, V>>()->parent(new_inner_node);
    }

    return {new_inner_node, key_up};
}
|
||||
|
||||
/**
 * Splits a full leaf node, inserts the pending (key, value) record into the
 * correct half, and returns the new right leaf together with its smallest key
 * (which the caller has to insert into the parent).
 */
template <typename K, typename V>
std::pair<mx::resource::ptr, K> BLinkTree<K, V>::split(const mx::resource::ptr leaf_node_ptr, const K key,
                                                       const V value) const
{
    auto *leaf_node = mx::resource::ptr_cast<Node<K, V>>(leaf_node_ptr);

    constexpr std::uint16_t left_size = LeafNode<K, V>::max_items / 2;
    constexpr std::uint16_t right_size = LeafNode<K, V>::max_items - left_size;

    auto new_leaf_node_ptr = this->create_leaf_node(leaf_node->parent());
    auto *new_leaf_node = mx::resource::ptr_cast<Node<K, V>>(new_leaf_node_ptr);

    // Move the upper half into the new leaf and splice it into the sibling
    // chain; it also inherits the old high key.
    leaf_node->move(new_leaf_node_ptr, left_size, right_size);
    if (leaf_node->right_sibling() != nullptr)
    {
        new_leaf_node->right_sibling(leaf_node->right_sibling());
    }
    new_leaf_node->high_key(leaf_node->high_key());
    new_leaf_node->size(right_size);
    leaf_node->size(left_size);
    leaf_node->right_sibling(new_leaf_node_ptr);

    // Insert the pending record into whichever half covers its key.
    if (key < new_leaf_node->leaf_key(0))
    {
        leaf_node->insert(leaf_node->index(key), value, key);
    }
    else
    {
        new_leaf_node->insert(new_leaf_node->index(key), value, key);
    }

    // The left leaf is now capped by the right leaf's smallest key.
    leaf_node->high_key(new_leaf_node->leaf_key(0));

    return {new_leaf_node_ptr, new_leaf_node->leaf_key(0)};
}
|
||||
|
||||
/**
 * Aggregates per-node statistics over every node of the tree and writes the
 * summary to stdout.
 */
template <typename K, typename V> void BLinkTree<K, V>::print_statistics() const
{
    auto collected = NodeStatistics<K, V>(this->height());

    for (auto current : *this)
    {
        collected += current;
    }

    std::cout << collected << std::endl;
}
|
||||
|
||||
/**
 * Runs the consistency checker over every node; violations are reported to
 * stderr.
 */
template <typename K, typename V> void BLinkTree<K, V>::check() const
{
    for (auto current : *this)
    {
        NodeConsistencyChecker<K, V>::check_and_print_errors(current, std::cerr);
    }
}
|
||||
} // namespace db::index::blinktree
|
||||
11
src/db/index/blinktree/config.h
Normal file
11
src/db/index/blinktree/config.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#pragma once
|
||||
|
||||
#include <mx/synchronization/synchronization.h>
|
||||
|
||||
namespace db::index::blinktree {
|
||||
/**
 * Compile-time configuration of the B-Link-Tree.
 */
class config
{
public:
    /// Size of a single tree node in bytes; Node<K, V> must fit into this.
    static constexpr auto node_size()
    {
        return 1024U;
    }
};
|
||||
} // namespace db::index::blinktree
|
||||
63
src/db/index/blinktree/insert_separator_task.h
Normal file
63
src/db/index/blinktree/insert_separator_task.h
Normal file
@@ -0,0 +1,63 @@
|
||||
#pragma once
|
||||
|
||||
#include "b_link_tree.h"
|
||||
#include "node.h"
|
||||
#include "task.h"
|
||||
#include <mx/tasking/runtime.h>
|
||||
|
||||
namespace db::index::blinktree {
|
||||
/**
 * Task that inserts a separator (produced by a node split) into a parent
 * inner node, following right siblings when the annotated node became stale
 * and splitting/ascending further when the parent is full.
 *
 * @tparam K Key type.
 * @tparam V Value type.
 * @tparam L Listener notified when the insert finished.
 */
template <typename K, typename V, class L> class InsertSeparatorTask final : public Task<K, V, L>
{
public:
    constexpr InsertSeparatorTask(const K key, const mx::resource::ptr separator, BLinkTree<K, V> *tree,
                                  L &listener) noexcept
        : Task<K, V, L>(key, listener), _tree(tree), _separator(separator)
    {
    }

    ~InsertSeparatorTask() override = default;

    mx::tasking::TaskResult execute(std::uint16_t core_id, std::uint16_t channel_id) override;

private:
    // Tree the separator belongs to (not owned).
    BLinkTree<K, V> *_tree;

    // Child node the new separator will point to.
    mx::resource::ptr _separator;
};
|
||||
|
||||
/**
 * Inserts the separator into the annotated inner node. A stale node (split
 * concurrently) re-annotates the task to the right sibling; a full node is
 * split and the task ascends to the parent, creating a new root at the top.
 */
template <typename K, typename V, class L>
mx::tasking::TaskResult InsertSeparatorTask<K, V, L>::execute(const std::uint16_t core_id,
                                                              const std::uint16_t /*channel_id*/)
{
    auto *annotated_node = this->annotated_resource().template get<Node<K, V>>();

    // Is the node related to the key? If not, chase the right-sibling link.
    if (annotated_node->high_key() <= this->_key)
    {
        this->annotate(annotated_node->right_sibling(), config::node_size() / 4U);
        return mx::tasking::TaskResult::make_succeed(this);
    }

    // At this point, we are accessing the related node and we are in writer mode.
    if (!annotated_node->full())
    {
        const auto index = annotated_node->index(this->_key);
        annotated_node->insert(index, this->_separator, this->_key);
        this->_separator.template get<Node<K, V>>()->parent(this->annotated_resource());
        this->_listener.inserted(core_id, this->_key, 0U);
        return mx::tasking::TaskResult::make_remove();
    }

    // Node is full: split it and carry the resulting separator one level up
    // by re-annotating this task to the parent.
    auto [right, key] = this->_tree->split(this->annotated_resource(), this->_key, this->_separator);
    if (annotated_node->parent() != nullptr)
    {
        this->_separator = right;
        this->_key = key;
        this->annotate(annotated_node->parent(), config::node_size() / 4U);
        return mx::tasking::TaskResult::make_succeed(this);
    }

    // The split node was the root: grow the tree by one level.
    this->_tree->create_new_root(this->annotated_resource(), right, key);
    this->_listener.inserted(core_id, this->_key, 0U);
    return mx::tasking::TaskResult::make_remove();
}
|
||||
} // namespace db::index::blinktree
|
||||
85
src/db/index/blinktree/insert_value_task.h
Normal file
85
src/db/index/blinktree/insert_value_task.h
Normal file
@@ -0,0 +1,85 @@
|
||||
#pragma once
|
||||
|
||||
#include "b_link_tree.h"
|
||||
#include "insert_separator_task.h"
|
||||
#include "node.h"
|
||||
#include "task.h"
|
||||
#include <mx/tasking/runtime.h>
|
||||
#include <vector>
|
||||
|
||||
namespace db::index::blinktree {
|
||||
/**
 * Task that inserts a key/value record, descending from the annotated node to
 * the correct leaf; a full leaf is split and the resulting separator is handed
 * to an InsertSeparatorTask.
 *
 * @tparam K Key type.
 * @tparam V Value type.
 * @tparam L Listener notified when the insert finished.
 */
template <typename K, typename V, class L> class InsertValueTask final : public Task<K, V, L>
{
public:
    constexpr InsertValueTask(const K key, const V value, BLinkTree<K, V> *tree, L &listener) noexcept
        : Task<K, V, L>(key, listener), _tree(tree), _value(value)
    {
    }

    ~InsertValueTask() override = default;

    mx::tasking::TaskResult execute(std::uint16_t core_id, std::uint16_t channel_id) override;

private:
    // Tree the record is inserted into (not owned).
    BLinkTree<K, V> *_tree;

    // Payload to insert.
    const V _value;
};
|
||||
|
||||
/**
 * One hop of the insert descent: chase stale nodes to the right, descend
 * through inner nodes (upgrading to writer just before the leaf), then insert
 * into the leaf — splitting it and scheduling a separator insert when full.
 */
template <typename K, typename V, class L>
mx::tasking::TaskResult InsertValueTask<K, V, L>::execute(const std::uint16_t core_id,
                                                          const std::uint16_t /*channel_id*/)
{
    auto *annotated_node = this->annotated_resource().template get<Node<K, V>>();

    // Is the node related to the key? If not, chase the right-sibling link.
    if (annotated_node->high_key() <= this->_key)
    {
        this->annotate(annotated_node->right_sibling(), config::node_size() / 4U);
        return mx::tasking::TaskResult::make_succeed(this);
    }

    // If we are accessing an inner node, pick the next related child.
    if (annotated_node->is_inner())
    {
        const auto child = annotated_node->child(this->_key);
        this->annotate(child, config::node_size() / 4U);
        // Stay read-only unless this node is a branch, i.e. the child is the leaf.
        this->is_readonly(!annotated_node->is_branch());
        return mx::tasking::TaskResult::make_succeed(this);
    }

    // Is it a leaf, but we are still reading? Upgrade to writer.
    if (annotated_node->is_leaf() && this->is_readonly())
    {
        this->is_readonly(false);
        return mx::tasking::TaskResult::make_succeed(this);
    }

    // At this point, we are accessing the related leaf and we are in writer mode.
    const auto index = annotated_node->index(this->_key);
    if (index < annotated_node->size() && annotated_node->leaf_key(index) == this->_key)
    {
        // Key already present. NOTE(review): the existing value is kept, yet
        // the listener is told "inserted" — confirm this duplicate handling.
        this->_listener.inserted(core_id, this->_key, this->_value);
        return mx::tasking::TaskResult::make_remove();
    }

    if (annotated_node->full() == false)
    {
        annotated_node->insert(index, this->_value, this->_key);
        this->_listener.inserted(core_id, this->_key, this->_value);
        return mx::tasking::TaskResult::make_remove();
    }

    // Leaf is full: split it and hand the new separator to the parent via a
    // follow-up task (or grow a new root when the leaf was the root).
    auto [right, key] = this->_tree->split(this->annotated_resource(), this->_key, this->_value);
    if (annotated_node->parent() != nullptr)
    {
        auto *task = mx::tasking::runtime::new_task<InsertSeparatorTask<K, V, L>>(core_id, key, right, this->_tree,
                                                                                  this->_listener);
        task->annotate(annotated_node->parent(), config::node_size() / 4U);
        return mx::tasking::TaskResult::make_succeed_and_remove(task);
    }

    this->_tree->create_new_root(this->annotated_resource(), right, key);
    this->_listener.inserted(core_id, this->_key, this->_value);
    return mx::tasking::TaskResult::make_remove();
}
|
||||
} // namespace db::index::blinktree
|
||||
13
src/db/index/blinktree/listener.h
Normal file
13
src/db/index/blinktree/listener.h
Normal file
@@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
namespace db::index::blinktree {
|
||||
/**
 * Callback interface notified about the outcome of tree operations; each
 * callback receives the core that finished the operation.
 *
 * @tparam K Key type.
 * @tparam V Value type.
 */
template <typename K, typename V> class Listener
{
public:
    // Polymorphic interface: a virtual destructor makes deletion through a
    // Listener pointer well-defined (was missing before).
    virtual ~Listener() = default;

    /// Called after a key/value pair was inserted.
    virtual void inserted(std::uint16_t core_id, K key, V value) = 0;

    /// Called after the value for an existing key was updated.
    virtual void updated(std::uint16_t core_id, K key, V value) = 0;

    /// Called after a key was removed.
    virtual void removed(std::uint16_t core_id, K key) = 0;

    /// Called when a lookup finished with a value.
    virtual void found(std::uint16_t core_id, K key, V value) = 0;

    /// Called when a lookup did not find the key.
    virtual void missing(std::uint16_t core_id, K key) = 0;
};
|
||||
} // namespace db::index::blinktree
|
||||
54
src/db/index/blinktree/lookup_task.h
Normal file
54
src/db/index/blinktree/lookup_task.h
Normal file
@@ -0,0 +1,54 @@
|
||||
#pragma once
|
||||
|
||||
#include "b_link_tree.h"
|
||||
#include "insert_separator_task.h"
|
||||
#include "node.h"
|
||||
#include "task.h"
|
||||
#include <optional>
|
||||
|
||||
namespace db::index::blinktree {
|
||||
/**
 * Task that looks up the value for a key, descending from the annotated node
 * to the correct leaf (chasing right siblings when a node became stale).
 *
 * @tparam K Key type.
 * @tparam V Value type.
 * @tparam L Listener notified about the result.
 */
template <typename K, typename V, class L> class LookupTask final : public Task<K, V, L>
{
public:
    LookupTask(const K key, L &listener) noexcept : Task<K, V, L>(key, listener) {}

    // NOTE(review): found() is reported even when the key was absent (the
    // value stays default-constructed); Listener::missing() is never used
    // here — confirm this is intended.
    ~LookupTask() override { this->_listener.found(_core_id, this->_key, _value); }

    mx::tasking::TaskResult execute(std::uint16_t core_id, std::uint16_t channel_id) override;

private:
    // Value-initialized: without the `{}`, a lookup that misses would hand an
    // uninitialized _value to the listener (undefined behavior for POD V).
    V _value{};

    // Core the final step of the lookup ran on; reported to the listener.
    std::uint16_t _core_id{0U};
};
|
||||
|
||||
template <typename K, typename V, typename L>
|
||||
mx::tasking::TaskResult LookupTask<K, V, L>::execute(const std::uint16_t core_id, const std::uint16_t /*channel_id*/)
|
||||
{
|
||||
auto *annotated_node = this->annotated_resource().template get<Node<K, V>>();
|
||||
|
||||
// Is the node related to the key?
|
||||
if (annotated_node->high_key() <= this->_key)
|
||||
{
|
||||
this->annotate(annotated_node->right_sibling(), config::node_size() / 4U);
|
||||
return mx::tasking::TaskResult::make_succeed(this);
|
||||
}
|
||||
|
||||
// If we are accessing an inner node, pick the next related child.
|
||||
if (annotated_node->is_inner())
|
||||
{
|
||||
const auto child = annotated_node->child(this->_key);
|
||||
this->annotate(child, config::node_size() / 4U);
|
||||
return mx::tasking::TaskResult::make_succeed(this);
|
||||
}
|
||||
|
||||
// We are accessing the correct leaf.
|
||||
const auto index = annotated_node->index(this->_key);
|
||||
if (annotated_node->leaf_key(index) == this->_key)
|
||||
{
|
||||
this->_value = annotated_node->value(index);
|
||||
}
|
||||
_core_id = core_id;
|
||||
|
||||
return mx::tasking::TaskResult::make_remove();
|
||||
}
|
||||
} // namespace db::index::blinktree
|
||||
388
src/db/index/blinktree/node.h
Normal file
388
src/db/index/blinktree/node.h
Normal file
@@ -0,0 +1,388 @@
|
||||
#pragma once
|
||||
#include "config.h"

#include <algorithm>
#include <array>
#include <atomic>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <limits>

#include <mx/resource/resource.h>
#include <mx/resource/resource_interface.h>
#include <mx/tasking/runtime.h>
|
||||
|
||||
namespace db::index::blinktree {
|
||||
|
||||
template <typename K, typename V> class Node;
|
||||
|
||||
/**
 * Node type; the flags may be combined (an inner node whose children are
 * leaves is Inner | Branch).
 */
enum NodeType : std::uint8_t
{
    Leaf = 1U << 0U,
    Inner = 1U << 1U,
    Branch = 1U << 2U
};

// Combines two node-type flags (plain enums have no built-in operator|).
inline NodeType operator|(const NodeType a, const NodeType b) noexcept
{
    return static_cast<NodeType>(static_cast<std::uint8_t>(a) | static_cast<std::uint8_t>(b));
}
|
||||
|
||||
/**
 * Header for every node: type, key range bound, sibling/parent links, and
 * record count.
 */
template <typename K, typename V> struct NodeHeader
{
    // Payload bytes remaining for keys/values once the header and the
    // resource interface are accounted for within a config::node_size() node.
    static constexpr std::uint16_t node_size =
        config::node_size() - sizeof(NodeHeader<K, V>) - sizeof(mx::resource::ResourceInterface);

    // Type of the node.
    const NodeType node_type;

    // High key: keys greater than or equal to it live in a right sibling.
    K high_key{std::numeric_limits<K>::max()};

    // Link to the right sibling.
    mx::resource::ptr right_sibling;

    // Link to the parent. Alignment needed by some CPU architectures (e.g. arm) because of atomicity.
    alignas(8) std::atomic<mx::resource::ptr> parent;

    // Number of records in the node.
    std::uint16_t size{0U};

    [[maybe_unused]] NodeHeader(const NodeType node_type_, const mx::resource::ptr parent_) : node_type(node_type_)
    {
        this->parent.store(parent_);
    }

    ~NodeHeader() = default;
#ifdef __GNUG__
};
#else
// NOTE(review): __attribute__((packed)) is a GNU extension, yet it is applied
// here only when __GNUG__ is NOT defined — the branches look inverted (the
// attribute will not compile on e.g. MSVC). Confirm the intended compilers.
} __attribute__((packed));
#endif
|
||||
|
||||
/**
 * Representation of an inner node: n keys and n + 1 child separators, sized
 * to fill the node payload.
 */
template <typename K, typename V> struct InnerNode
{
    // max_keys keys plus (max_keys + 1) separators must fit the payload.
    static constexpr std::uint16_t max_keys =
        (NodeHeader<K, V>::node_size - sizeof(mx::resource::ptr)) / (sizeof(K) + sizeof(mx::resource::ptr));
    static constexpr std::uint16_t max_separators = max_keys + 1;

    // Memory for keys.
    std::array<K, InnerNode::max_keys> keys;

    // Memory for separators; separators[i + 1] covers keys >= keys[i].
    std::array<mx::resource::ptr, InnerNode::max_separators> separators;
};
|
||||
|
||||
/**
 * Representation of a leaf node: parallel key and payload arrays, sized to
 * fill the node payload.
 */
template <typename K, typename V> struct LeafNode
{
    static constexpr std::uint16_t max_items = NodeHeader<K, V>::node_size / (sizeof(K) + sizeof(V));

    // Memory for keys.
    std::array<K, LeafNode::max_items> keys;

    // Memory for payloads; values[i] belongs to keys[i].
    std::array<V, LeafNode::max_items> values;
};
|
||||
|
||||
/**
 * Abstract node representation: a header followed by either the inner-node or
 * the leaf-node payload, sharing storage through a union. _header.node_type
 * decides which union member a node uses throughout its lifetime.
 */
template <typename K, typename V> class Node final : public mx::resource::ResourceInterface
{
public:
    constexpr Node(const NodeType node_type, const mx::resource::ptr parent) : _header(node_type, parent)
    {
        // The node must fit the resource allocation made with node_size().
        static_assert(sizeof(Node<K, V>) <= config::node_size());
    }

    // An inner node owns its children and deletes them recursively; the
    // destructor walks all size() + 1 separators.
    ~Node() override
    {
        if (is_inner())
        {
            for (auto i = 0U; i <= _header.size; ++i)
            {
                if (_inner_node.separators[i] != nullptr)
                {
                    mx::tasking::runtime::delete_resource<Node<K, V>>(_inner_node.separators[i]);
                }
            }
        }
    }

    // Explicit destructor call: the runtime reclaims the raw memory; we only
    // have to run the destructor logic here.
    void on_reclaim() override { this->~Node(); }

    /**
     * @return True, if this node is a leaf node.
     */
    [[nodiscard]] bool is_leaf() const noexcept { return _header.node_type & NodeType::Leaf; }

    /**
     * @return True, if this node is an inner node.
     */
    [[nodiscard]] bool is_inner() const noexcept { return _header.node_type & NodeType::Inner; }

    /**
     * @return True, if this node is an inner node and children are leaf nodes.
     */
    [[nodiscard]] bool is_branch() const noexcept { return _header.node_type & NodeType::Branch; }

    /**
     * @return Number of records stored in the node.
     */
    [[nodiscard]] std::uint16_t size() const noexcept { return _header.size; }

    /**
     * Updates the number of records stored in the node.
     * @param size New number of records.
     */
    void size(const std::uint16_t size) noexcept { _header.size = size; }

    /**
     * @return High key of the node.
     */
    K high_key() const noexcept { return _header.high_key; }

    /**
     * Updates the high key.
     * @param high_key New high key.
     */
    [[maybe_unused]] void high_key(const K high_key) noexcept { _header.high_key = high_key; }

    /**
     * @return Pointer to the right sibling.
     */
    [[nodiscard]] mx::resource::ptr right_sibling() const noexcept { return _header.right_sibling; }

    /**
     * Updates the right sibling.
     * @param right_sibling Pointer to the new right sibling.
     */
    [[maybe_unused]] void right_sibling(const mx::resource::ptr right_sibling) noexcept
    {
        _header.right_sibling = right_sibling;
    }

    /**
     * @return Pointer to the parent node.
     */
    [[nodiscard]] mx::resource::ptr parent() const noexcept { return _header.parent; }

    /**
     * Updates the parent node.
     * @param parent Pointer to the new parent node.
     */
    void parent(const mx::resource::ptr parent) noexcept { _header.parent = parent; }

    /**
     * Read the value at a given index (leaf nodes only; not bounds-checked).
     * @param index Index.
     * @return Value at the index.
     */
    V value(const std::uint16_t index) const noexcept { return _leaf_node.values[index]; }

    /**
     * Update the value at a given index.
     * @param index Index.
     * @param value New value.
     */
    void value(const std::uint16_t index, const V value) noexcept { _leaf_node.values[index] = value; }

    /**
     * Read the separator at a given index (inner nodes only; not bounds-checked).
     * @param index Index.
     * @return Separator at the index.
     */
    [[nodiscard]] mx::resource::ptr separator(const std::uint16_t index) const noexcept
    {
        return _inner_node.separators[index];
    }

    /**
     * Update the separator for a given index.
     * @param index Index.
     * @param separator New separator for the index.
     */
    void separator(const std::uint16_t index, const mx::resource::ptr separator) noexcept
    {
        _inner_node.separators[index] = separator;
    }

    /**
     * Read the key from the leaf node.
     * @param index Index.
     * @return Key at the index.
     */
    K leaf_key(const std::uint16_t index) const noexcept { return _leaf_node.keys[index]; }

    /**
     * Read the key from the inner node.
     * @param index Index.
     * @return Key at the index.
     */
    K inner_key(const std::uint16_t index) const noexcept { return _inner_node.keys[index]; }

    /**
     * @return True, if the node can not store further records.
     */
    [[nodiscard]] bool full() const noexcept
    {
        const auto max_size = is_leaf() ? LeafNode<K, V>::max_items : InnerNode<K, V>::max_keys;
        return _header.size >= max_size;
    }

    /**
     * Calculates the index for a given key (first slot whose key is not less
     * than the given key; may be size() when the key is greater than all keys).
     * @param key Key.
     * @return Index for the key.
     */
    std::uint16_t index(K key) const noexcept;

    /**
     * Calculates the child for a given key using binary search.
     * @param key Key.
     * @return Child for the key.
     */
    mx::resource::ptr child(K key) const noexcept;

    /**
     * Inserts a record into an inner node.
     * @param index Index.
     * @param separator Separator.
     * @param key Key.
     */
    void insert(std::uint16_t index, mx::resource::ptr separator, K key);

    /**
     * Inserts a record into a leaf node.
     * @param index Index.
     * @param value Payload.
     * @param key Key.
     */
    void insert(std::uint16_t index, V value, K key);

    /**
     * Moves a range of records into another node.
     * @param destination Other node.
     * @param from_index Start index.
     * @param count Number of records to move.
     */
    void move(mx::resource::ptr destination, std::uint16_t from_index, std::uint16_t count);

    /**
     * Searches a separator within an inner node.
     * @param separator Separator to search.
     * @return True, if the separator was found.
     */
    [[nodiscard]] bool contains(mx::resource::ptr separator) const noexcept;

private:
    NodeHeader<K, V> _header;
    // Inner and leaf payloads overlay each other; only the member matching
    // _header.node_type is ever used for a given node.
    union {
        InnerNode<K, V> _inner_node;
        LeafNode<K, V> _leaf_node;
    };
};
|
||||
|
||||
/**
 * Finds the insertion slot for a key: the first position whose stored key is
 * not less than the given key (size() when the key is greater than all keys).
 */
template <typename K, typename V> std::uint16_t Node<K, V>::index(const K key) const noexcept
{
    const auto *first = this->is_leaf() ? this->_leaf_node.keys.data() : this->_inner_node.keys.data();
    const auto *last = first + this->size();

    return static_cast<std::uint16_t>(std::lower_bound(first, last, key) - first);
}
|
||||
|
||||
/**
 * Binary search for the child covering a key: after the loop, "high" is the
 * position of the last key <= the search key (or -1 when all keys are
 * greater), so separators[high + 1] is the matching child.
 */
template <typename K, typename V> mx::resource::ptr Node<K, V>::child(const K key) const noexcept
{
    // Signed indices: "high" may legitimately reach -1.
    std::int16_t low = 0;
    std::int16_t high = size() - 1;
    while (low <= high)
    {
        const auto mid = (low + high) >> 1U; // Will work for size() - 1 < max(std::int32_t)/2
        if (this->inner_key(mid) <= key)
        {
            low = mid + 1;
        }
        else
        {
            high = mid - 1;
        }
    }

    return this->_inner_node.separators[high + 1U];
}
|
||||
|
||||
/**
 * Inserts a (key, separator) pair into this inner node: keys[index] receives
 * the key and separators[index + 1] the separator, i.e. the new separator
 * covers the keys from the inserted key upwards.
 *
 * @param index Insertion slot, as produced by index(key).
 * @param separator Child pointer to insert.
 * @param key Key to insert.
 */
template <typename K, typename V>
void Node<K, V>::insert(const std::uint16_t index, const mx::resource::ptr separator, const K key)
{
    if (index < this->size())
    {
        // Shift keys [index, size) and separators [index + 1, size + 1) one
        // slot to the right to make room.
        const auto offset = this->size() - index;
        std::memmove(static_cast<void *>(&this->_inner_node.keys[index + 1]),
                     static_cast<void *>(&this->_inner_node.keys[index]), offset * sizeof(K));
        std::memmove(static_cast<void *>(&this->_inner_node.separators[index + 2]),
                     static_cast<void *>(&this->_inner_node.separators[index + 1]), offset * sizeof(mx::resource::ptr));
    }

    this->_inner_node.keys[index] = key;
    this->_inner_node.separators[index + 1] = separator;
    ++this->_header.size;
}
|
||||
|
||||
/**
 * Inserts a (key, value) record into this leaf node at the given slot,
 * shifting the tail of both parallel arrays one position to the right.
 *
 * @param index Insertion slot, as produced by index(key).
 * @param value Payload to insert.
 * @param key Key to insert.
 */
template <typename K, typename V> void Node<K, V>::insert(const std::uint16_t index, const V value, const K key)
{
    if (index < this->size())
    {
        // Shift keys and values [index, size) one slot to the right.
        const auto offset = this->size() - index;
        std::memmove(static_cast<void *>(&this->_leaf_node.keys[index + 1]),
                     static_cast<void *>(&this->_leaf_node.keys[index]), offset * sizeof(K));
        std::memmove(static_cast<void *>(&this->_leaf_node.values[index + 1]),
                     static_cast<void *>(&this->_leaf_node.values[index]), offset * sizeof(V));
    }

    this->_leaf_node.keys[index] = key;
    this->_leaf_node.values[index] = value;
    ++this->_header.size;
}
|
||||
|
||||
/**
 * Copies "count" records starting at "from_index" into the destination node.
 * For leaves, keys and values land at position 0. For inner nodes, keys land
 * at 0 while the matching separators land at position 1 — the caller is
 * responsible for filling separator 0 and for adjusting both nodes' sizes.
 *
 * @param destination Node receiving the records.
 * @param from_index First record to copy.
 * @param count Number of records to copy.
 */
template <typename K, typename V>
void Node<K, V>::move(const mx::resource::ptr destination, const std::uint16_t from_index, const std::uint16_t count)
{
    auto *node = mx::resource::ptr_cast<Node<K, V>>(destination);
    if (this->is_leaf())
    {
        std::memcpy(static_cast<void *>(&node->_leaf_node.keys[0]),
                    static_cast<void *>(&this->_leaf_node.keys[from_index]), count * sizeof(K));
        std::memcpy(static_cast<void *>(&node->_leaf_node.values[0]),
                    static_cast<void *>(&this->_leaf_node.values[from_index]), count * sizeof(V));
    }
    else
    {
        std::memcpy(static_cast<void *>(&node->_inner_node.keys[0]),
                    static_cast<void *>(&this->_inner_node.keys[from_index]), count * sizeof(K));
        std::memcpy(static_cast<void *>(&node->_inner_node.separators[1]),
                    static_cast<void *>(&this->_inner_node.separators[from_index + 1]),
                    count * sizeof(mx::resource::ptr));
    }
}
|
||||
|
||||
/**
 * Checks whether this (inner) node holds the given separator.
 * An inner node with n keys has n + 1 separators, hence the inclusive bound.
 *
 * @param separator Separator to look for.
 * @return True, when the separator is stored in this node.
 */
template <typename K, typename V> bool Node<K, V>::contains(const mx::resource::ptr separator) const noexcept
{
    const auto count_separators = this->size() + 1U;
    for (auto index = 0U; index < count_separators; ++index)
    {
        if (this->_inner_node.separators[index] == separator)
        {
            return true;
        }
    }

    return false;
}
|
||||
} // namespace db::index::blinktree
|
||||
185
src/db/index/blinktree/node_consistency_checker.h
Normal file
185
src/db/index/blinktree/node_consistency_checker.h
Normal file
@@ -0,0 +1,185 @@
|
||||
#pragma once
|
||||
#include <ostream>
|
||||
|
||||
#include "node.h"
|
||||
|
||||
namespace db::index::blinktree {
|
||||
/**
|
||||
* Validates tree nodes and checks consistency.
|
||||
*/
|
||||
template <typename K, typename V> class NodeConsistencyChecker
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* Validates the node and prints errors to the given stream.
|
||||
* @param node Node to validate.
|
||||
* @param stream Stream to print errors.
|
||||
*/
|
||||
static void check_and_print_errors(Node<K, V> *node, std::ostream &stream);
|
||||
|
||||
private:
|
||||
static void check_high_key_valid(Node<K, V> *node, std::ostream &stream);
|
||||
static void check_key_order_valid(Node<K, V> *node, std::ostream &stream);
|
||||
static void check_no_null_separator(Node<K, V> *node, std::ostream &stream);
|
||||
static void check_children_order_valid(Node<K, V> *node, std::ostream &stream);
|
||||
static void check_level_valid(Node<K, V> *node, std::ostream &stream);
|
||||
static void check_and_print_parent(Node<K, V> *node, std::ostream &stream);
|
||||
};
|
||||
|
||||
template <typename K, typename V>
|
||||
void NodeConsistencyChecker<K, V>::check_and_print_errors(Node<K, V> *node, std::ostream &stream)
|
||||
{
|
||||
check_high_key_valid(node, stream);
|
||||
check_key_order_valid(node, stream);
|
||||
check_no_null_separator(node, stream);
|
||||
check_children_order_valid(node, stream);
|
||||
check_level_valid(node, stream);
|
||||
|
||||
// check_and_print_parent(node, stream);
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
void NodeConsistencyChecker<K, V>::check_high_key_valid(Node<K, V> *node, std::ostream &stream)
|
||||
{
|
||||
if (node->is_leaf())
|
||||
{
|
||||
if (node->leaf_key(node->size() - 1) >= node->high_key())
|
||||
{
|
||||
stream << "[HighKey ] Leaf " << node << ": Key[" << node->size() - 1
|
||||
<< "] (=" << node->leaf_key(node->size() - 1) << ") >= " << node->high_key() << std::endl;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (node->inner_key(node->size() - 1) >= node->high_key())
|
||||
{
|
||||
stream << "[HighKey ] Inner " << node << ": Key[" << node->size() - 1
|
||||
<< "] (=" << node->inner_key(node->size() - 1) << ") >= " << node->high_key() << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
void NodeConsistencyChecker<K, V>::check_key_order_valid(Node<K, V> *node, std::ostream &stream)
|
||||
{
|
||||
for (auto index = 1U; index < node->size(); index++)
|
||||
{
|
||||
if (node->is_leaf())
|
||||
{
|
||||
if (node->leaf_key(index - 1U) >= node->leaf_key(index))
|
||||
{
|
||||
stream << "[KeyOrder ] Leaf " << node << ": Key[" << index - 1U << "] (=" << node->leaf_key(index - 1U)
|
||||
<< ") >= Key[" << index << "] (=" << node->leaf_key(index) << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (node->inner_key(index - 1) >= node->inner_key(index))
|
||||
{
|
||||
stream << "[KeyOrder ] Inner " << node << ": Key[" << index - 1 << "] (=" << node->inner_key(index - 1)
|
||||
<< ") >= Key[" << index << "] (=" << node->inner_key(index) << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
void NodeConsistencyChecker<K, V>::check_no_null_separator(Node<K, V> *node, std::ostream &stream)
|
||||
{
|
||||
if (node->is_inner())
|
||||
{
|
||||
for (auto index = 0U; index <= node->size(); index++)
|
||||
{
|
||||
if (node->separator(index) == nullptr)
|
||||
{
|
||||
stream << "[Separator ] Inner " << node << ": Separator[" << index << "] is empty." << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
void NodeConsistencyChecker<K, V>::check_children_order_valid(Node<K, V> *node, std::ostream &stream)
|
||||
{
|
||||
if (node->is_inner())
|
||||
{
|
||||
for (auto index = 0U; index < node->size(); index++)
|
||||
{
|
||||
auto child = node->separator(index).template get<Node<K, V>>();
|
||||
const auto child_last_key =
|
||||
child->is_leaf() ? child->leaf_key(child->size() - 1U) : child->inner_key(child->size() - 1U);
|
||||
if (child_last_key >= node->inner_key(index))
|
||||
{
|
||||
stream << "[ChildOrder] Inner " << node << ": Key[" << index << "] (=" << node->inner_key(index)
|
||||
<< ") <= Separator[" << index << "].Key[" << child->size() - 1U << "] (=" << child_last_key
|
||||
<< ")" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
void NodeConsistencyChecker<K, V>::check_level_valid(Node<K, V> *node, std::ostream &stream)
|
||||
{
|
||||
if (node->right_sibling() && node->is_leaf() != node->right_sibling().template get<Node<K, V>>()->is_leaf())
|
||||
{
|
||||
stream << "[Level ] Leaf " << node << ": Is marked as leaf, but right sibling is not" << std::endl;
|
||||
}
|
||||
|
||||
if (node->is_inner())
|
||||
{
|
||||
for (auto index = 0U; index < node->size(); index++)
|
||||
{
|
||||
if (node->separator(index).template get<Node<K, V>>()->is_leaf() !=
|
||||
node->separator(index + 1U).template get<Node<K, V>>()->is_leaf())
|
||||
{
|
||||
stream << "[Level ] Inner " << node << ": Separator[" << index
|
||||
<< "] is marked as is_leaf = " << node->separator(index).template get<Node<K, V>>()->is_leaf()
|
||||
<< " but Separator[" << index + 1U << "] is not" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename K, typename V>
|
||||
void NodeConsistencyChecker<K, V>::check_and_print_parent(Node<K, V> *node, std::ostream &stream)
|
||||
{
|
||||
const auto parent = node->parent();
|
||||
if (parent)
|
||||
{
|
||||
if (parent.template get<Node<K, V>>()->contains(mx::resource::ptr(node)) == false)
|
||||
{
|
||||
stream << "Wrong parent(1) for node " << node << " (leaf: " << node->is_leaf() << ")" << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
auto index = 0U;
|
||||
for (; index <= parent.template get<Node<K, V>>()->size(); index++)
|
||||
{
|
||||
if (parent.template get<Node<K, V>>()->separator(index).template get<Node<K, V>>() == node)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (index < parent.template get<Node<K, V>>()->size())
|
||||
{
|
||||
const auto key =
|
||||
node->is_leaf() ? node->leaf_key(node->size() - 1U) : node->inner_key(node->size() - 1);
|
||||
if ((key < parent.template get<Node<K, V>>()->inner_key(index)) == false)
|
||||
{
|
||||
stream << "Wrong parent(2) for node " << node << " (leaf: " << node->is_leaf() << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto key = node->is_leaf() ? node->leaf_key(0U) : node->inner_key(0U);
|
||||
if ((key >= parent.template get<Node<K, V>>()->inner_key(index - 1U)) == false)
|
||||
{
|
||||
stream << "Wrong parent(3) for node " << node << " (leaf: " << node->is_leaf() << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace db::index::blinktree
|
||||
44
src/db/index/blinktree/node_iterator.h
Normal file
44
src/db/index/blinktree/node_iterator.h
Normal file
@@ -0,0 +1,44 @@
|
||||
#pragma once
|
||||
|
||||
#include "node.h"
|
||||
#include <mx/resource/resource.h>
|
||||
|
||||
namespace db::index::blinktree {
|
||||
/**
|
||||
* Iterator for iterating over nodes of a tree.
|
||||
*/
|
||||
template <typename K, typename V> class NodeIterator
|
||||
{
|
||||
public:
|
||||
NodeIterator() = default;
|
||||
explicit NodeIterator(Node<K, V> *root) : _current_node(root), _first_node_in_level(root) {}
|
||||
~NodeIterator() = default;
|
||||
|
||||
Node<K, V> *&operator*() { return _current_node; }
|
||||
|
||||
NodeIterator<K, V> &operator++()
|
||||
{
|
||||
if (_current_node->right_sibling())
|
||||
{
|
||||
_current_node = _current_node->right_sibling().template get<Node<K, V>>();
|
||||
}
|
||||
else if (_current_node->is_inner())
|
||||
{
|
||||
_first_node_in_level = _first_node_in_level->separator(0).template get<Node<K, V>>();
|
||||
_current_node = _first_node_in_level;
|
||||
}
|
||||
else
|
||||
{
|
||||
_current_node = nullptr;
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
bool operator!=(const NodeIterator<K, V> &other) const { return _current_node != other._current_node; }
|
||||
|
||||
private:
|
||||
Node<K, V> *_current_node = nullptr;
|
||||
Node<K, V> *_first_node_in_level = nullptr;
|
||||
};
|
||||
} // namespace db::index::blinktree
|
||||
72
src/db/index/blinktree/node_statistics.h
Normal file
72
src/db/index/blinktree/node_statistics.h
Normal file
@@ -0,0 +1,72 @@
|
||||
#pragma once
|
||||
|
||||
#include "config.h"
|
||||
#include "node.h"
|
||||
#include <cstdint>
|
||||
#include <ostream>
|
||||
|
||||
namespace db::index::blinktree {
|
||||
/**
|
||||
* Collects and prints statistics of a set of nodes.
|
||||
*/
|
||||
template <typename K, typename V> class NodeStatistics
|
||||
{
|
||||
public:
|
||||
explicit NodeStatistics(const std::uint16_t height) : _tree_height(height) {}
|
||||
~NodeStatistics() = default;
|
||||
|
||||
NodeStatistics &operator+=(Node<K, V> *node)
|
||||
{
|
||||
this->_count_inner_nodes += node->is_inner();
|
||||
this->_count_leaf_nodes += node->is_leaf();
|
||||
|
||||
if (node->is_leaf())
|
||||
{
|
||||
this->_count_leaf_node_keys += node->size();
|
||||
}
|
||||
else
|
||||
{
|
||||
this->_count_inner_node_keys += node->size();
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
friend std::ostream &operator<<(std::ostream &stream, const NodeStatistics<K, V> &tree_statistics)
|
||||
{
|
||||
const auto count_nodes = tree_statistics._count_leaf_nodes + tree_statistics._count_inner_nodes;
|
||||
const auto size_in_bytes = count_nodes * config::node_size();
|
||||
stream << "Statistics of the Tree: \n"
|
||||
<< " Node size: " << sizeof(Node<K, V>) << " B\n"
|
||||
<< " Header size: " << sizeof(NodeHeader<K, V>) << " B\n"
|
||||
<< " Inner keys: " << InnerNode<K, V>::max_keys << " (" << sizeof(K) * InnerNode<K, V>::max_keys
|
||||
<< " B)\n"
|
||||
<< " Leaf keys: " << LeafNode<K, V>::max_items << " (" << sizeof(K) * LeafNode<K, V>::max_items
|
||||
<< " B)\n"
|
||||
<< " Tree height: " << tree_statistics._tree_height << "\n"
|
||||
<< " Inner nodes: " << tree_statistics._count_inner_nodes << "\n"
|
||||
<< " Inner entries: " << tree_statistics._count_inner_node_keys << "\n"
|
||||
<< " Leaf nodes: " << tree_statistics._count_leaf_nodes << "\n"
|
||||
<< " Leaf entries: " << tree_statistics._count_leaf_node_keys << "\n"
|
||||
<< " Tree size: " << size_in_bytes / 1024.0 / 1024.0 << " MB";
|
||||
|
||||
return stream;
|
||||
}
|
||||
|
||||
private:
|
||||
// Number of inner nodes.
|
||||
std::uint64_t _count_inner_nodes = 0U;
|
||||
|
||||
// Number of leaf nodes.
|
||||
std::uint64_t _count_leaf_nodes = 0U;
|
||||
|
||||
// Number of records located in inner nodes.
|
||||
std::uint64_t _count_inner_node_keys = 0U;
|
||||
|
||||
// Number of records located in leaf nodes.
|
||||
std::uint64_t _count_leaf_node_keys = 0U;
|
||||
|
||||
// Hight of the tree.
|
||||
const std::uint16_t _tree_height;
|
||||
};
|
||||
} // namespace db::index::blinktree
|
||||
16
src/db/index/blinktree/task.h
Normal file
16
src/db/index/blinktree/task.h
Normal file
@@ -0,0 +1,16 @@
|
||||
#pragma once
|
||||
|
||||
#include <mx/tasking/task.h>
|
||||
|
||||
namespace db::index::blinktree {
|
||||
template <typename K, typename V, class L> class Task : public mx::tasking::TaskInterface
|
||||
{
|
||||
public:
|
||||
constexpr Task(const K key, L &listener) : _listener(listener), _key(key) {}
|
||||
~Task() override = default;
|
||||
|
||||
protected:
|
||||
L &_listener;
|
||||
K _key;
|
||||
};
|
||||
} // namespace db::index::blinktree
|
||||
69
src/db/index/blinktree/update_task.h
Normal file
69
src/db/index/blinktree/update_task.h
Normal file
@@ -0,0 +1,69 @@
|
||||
#pragma once
|
||||
|
||||
#include "b_link_tree.h"
|
||||
#include "insert_separator_task.h"
|
||||
#include "node.h"
|
||||
#include "task.h"
|
||||
#include <iostream>
|
||||
|
||||
namespace db::index::blinktree {
|
||||
template <typename K, typename V, class L> class UpdateTask final : public Task<K, V, L>
|
||||
{
|
||||
public:
|
||||
constexpr UpdateTask(const K key, const V value, L &listener) noexcept : Task<K, V, L>(key, listener), _value(value)
|
||||
{
|
||||
}
|
||||
|
||||
~UpdateTask() override = default;
|
||||
|
||||
mx::tasking::TaskResult execute(std::uint16_t core_id, std::uint16_t channel_id) override;
|
||||
|
||||
private:
|
||||
const V _value;
|
||||
};
|
||||
|
||||
template <typename K, typename V, typename L>
|
||||
mx::tasking::TaskResult UpdateTask<K, V, L>::execute(const std::uint16_t core_id, const std::uint16_t /*channel_id*/)
|
||||
{
|
||||
auto *node = this->annotated_resource().template get<Node<K, V>>();
|
||||
|
||||
// Is the node related to the key?
|
||||
if (node->high_key() <= this->_key)
|
||||
{
|
||||
this->annotate(node->right_sibling(), config::node_size() / 4U);
|
||||
return mx::tasking::TaskResult::make_succeed(this);
|
||||
}
|
||||
|
||||
// If we are accessing an inner node, pick the next related child.
|
||||
if (node->is_inner())
|
||||
{
|
||||
const auto child = node->child(this->_key);
|
||||
this->annotate(child, config::node_size() / 4U);
|
||||
this->is_readonly(!node->is_branch());
|
||||
return mx::tasking::TaskResult::make_succeed(this);
|
||||
}
|
||||
|
||||
// If the task is still reading, but this is a leaf,
|
||||
// spawn again as writer.
|
||||
if (node->is_leaf() && this->is_readonly())
|
||||
{
|
||||
this->is_readonly(false);
|
||||
return mx::tasking::TaskResult::make_succeed(this);
|
||||
}
|
||||
|
||||
// We are accessing the correct leaf.
|
||||
const auto index = node->index(this->_key);
|
||||
const auto key = node->leaf_key(index);
|
||||
if (key == this->_key)
|
||||
{
|
||||
node->value(index, this->_value);
|
||||
this->_listener.updated(core_id, key, this->_value);
|
||||
}
|
||||
else
|
||||
{
|
||||
this->_listener.missing(core_id, key);
|
||||
}
|
||||
|
||||
return mx::tasking::TaskResult::make_remove();
|
||||
}
|
||||
} // namespace db::index::blinktree
|
||||
42
src/mx/README.md
Normal file
42
src/mx/README.md
Normal file
@@ -0,0 +1,42 @@
|
||||
# How to use `MxTasking`
|
||||
|
||||
## Build a simple _Hello World_ task
|
||||
Every task inherits from `mx::tasking::TaskInterface` and implements the `execute` method, which is called when the task gets executed by the runtime.
|
||||
|
||||
#include <mx/tasking/task.h>
|
||||
#include <iostream>
|
||||
class HelloWorldTask : public mx::tasking::TaskInterface
|
||||
{
|
||||
public:
|
||||
HelloWorldTask() = default;
|
||||
virtual ~HelloWorldTask() = default;
|
||||
|
||||
virtual TaskInterface *execute(const std::uint16_t, const std::uint16_t)
|
||||
{
|
||||
std::cout << "Hello world from MxTasking!" << std::endl;
|
||||
return nullptr;
|
||||
}
|
||||
};
|
||||
|
||||
## Run the _Hello World_ task
|
||||
|
||||
#include <mx/tasking/runtime.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
// Define which cores will be used (1 core here).
|
||||
auto cores = mx::util::core_set::build(1);
|
||||
|
||||
// Create an instance of the task with the current core as first
|
||||
// parameter (we assume that we start at core 0).
|
||||
auto *task = mx::tasking::runtime::new_task<HelloWorldTask>(0);
|
||||
|
||||
// Create a runtime for the given cores.
|
||||
mx::tasking::runtime_guard runtime { cores };
|
||||
|
||||
// Schedule the task.
|
||||
mx::tasking::runtime::spawn(*task);
|
||||
|
||||
// Will print: "Hello world from MxTasking!"
|
||||
return 0;
|
||||
}
|
||||
39
src/mx/memory/alignment_helper.h
Normal file
39
src/mx/memory/alignment_helper.h
Normal file
@@ -0,0 +1,39 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace mx::memory {
|
||||
/**
 * Helper for setting the correct size on aligned allocation:
 * The allocation size has to be a multiple of the alignment.
 */
class alignment_helper
{
public:
    /**
     * Rounds the given value up to a multiple of base.
     * Values smaller than base (including zero) yield base itself.
     *
     * @param value Value to round up.
     * @param base Base the result has to be a multiple of.
     * @return Smallest multiple of base that is >= max(value, base).
     */
    template <typename T> static constexpr T next_multiple(const T value, const T base)
    {
        if (value > base)
        {
            const auto mod = value % base;
            if (mod == 0U)
            {
                return value;
            }

            return value + base - mod;
        }

        return base;
    }

    /**
     * @param value Value to test.
     * @return True, when the value is a power of two (zero is not).
     */
    static constexpr bool is_power_of_two(const std::uint64_t value)
    {
        return ((value != 0U) && ((value & (value - 1U)) == 0U));
    }

    /**
     * Rounds the given value up to the next power of two.
     * Zero (and one) yield one: __builtin_clzll has undefined behavior
     * for a zero argument, so that input must never reach the shift.
     * Values above 2^63 are not supported (the shift would overflow).
     *
     * @param value Value to round up.
     * @return Smallest power of two that is >= value (1 for value == 0).
     */
    static constexpr std::uint64_t next_power_of_two(const std::uint64_t value)
    {
        if (value <= 1U)
        {
            // Guard: __builtin_clzll(0) is undefined behavior.
            return 1U;
        }

        return is_power_of_two(value) ? value : 1ULL << (sizeof(std::uint64_t) * 8 - __builtin_clzll(value));
    }
};
|
||||
} // namespace mx::memory
|
||||
29
src/mx/memory/config.h
Normal file
29
src/mx/memory/config.h
Normal file
@@ -0,0 +1,29 @@
|
||||
#pragma once
|
||||
#include <chrono>
|
||||
|
||||
namespace mx::memory {
|
||||
class config
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* @return Number of maximal provided NUMA regions.
|
||||
*/
|
||||
static constexpr auto max_numa_nodes() { return 2U; }
|
||||
|
||||
/**
|
||||
* Decreases the use of memory of external NUMA regions within the allocator.
|
||||
* @return True, when memory usage of external NUMA regions should be less.
|
||||
*/
|
||||
static constexpr auto low_priority_for_external_numa() { return false; }
|
||||
|
||||
/**
|
||||
* @return Interval of each epoch, if memory reclamation is used.
|
||||
*/
|
||||
static constexpr auto epoch_interval() { return std::chrono::milliseconds(50U); }
|
||||
|
||||
/**
|
||||
* @return True, if garbage is removed local.
|
||||
*/
|
||||
static constexpr auto local_garbage_collection() { return false; }
|
||||
};
|
||||
} // namespace mx::memory
|
||||
326
src/mx/memory/dynamic_size_allocator.cpp
Normal file
326
src/mx/memory/dynamic_size_allocator.cpp
Normal file
@@ -0,0 +1,326 @@
|
||||
#include "dynamic_size_allocator.h"
|
||||
#include "global_heap.h"
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <mx/system/topology.h>
|
||||
|
||||
using namespace mx::memory::dynamic;
|
||||
|
||||
AllocationBlock::AllocationBlock(const std::uint32_t id, const std::uint8_t numa_node_id, const std::size_t size)
|
||||
: _id(id), _numa_node_id(numa_node_id), _size(size), _available_size(size)
|
||||
{
|
||||
this->_allocated_block = GlobalHeap::allocate(numa_node_id, size);
|
||||
this->_free_elements.emplace_back(FreeHeader{reinterpret_cast<std::uintptr_t>(this->_allocated_block), size});
|
||||
}
|
||||
|
||||
AllocationBlock::AllocationBlock(AllocationBlock &&other) noexcept
|
||||
: _id(other._id), _numa_node_id(other._numa_node_id), _size(other._size), _allocated_block(other._allocated_block),
|
||||
_free_elements(std::move(other._free_elements)), _available_size(other._available_size)
|
||||
{
|
||||
other._allocated_block = nullptr;
|
||||
}
|
||||
|
||||
AllocationBlock &AllocationBlock::operator=(AllocationBlock &&other) noexcept
|
||||
{
|
||||
this->_id = other._id;
|
||||
this->_numa_node_id = other._numa_node_id;
|
||||
this->_size = other._size;
|
||||
this->_allocated_block = std::exchange(other._allocated_block, nullptr);
|
||||
this->_free_elements = std::move(other._free_elements);
|
||||
this->_available_size = other._available_size;
|
||||
return *this;
|
||||
}
|
||||
|
||||
AllocationBlock::~AllocationBlock()
|
||||
{
|
||||
if (this->_allocated_block != nullptr)
|
||||
{
|
||||
GlobalHeap::free(this->_allocated_block, this->_size);
|
||||
}
|
||||
}
|
||||
|
||||
void *AllocationBlock::allocate(const std::size_t alignment, const std::size_t size) noexcept
|
||||
{
|
||||
assert(alignment && (!(alignment & (alignment - 1))) && "Alignment must be > 0 and power of 2");
|
||||
this->_lock.lock();
|
||||
|
||||
if (this->_available_size < size)
|
||||
{
|
||||
this->_lock.unlock();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto [free_element_iterator, aligned_size_including_header] = this->find_block(alignment, size);
|
||||
if (free_element_iterator == this->_free_elements.end())
|
||||
{
|
||||
this->_lock.unlock();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const auto free_block_start = free_element_iterator->start();
|
||||
const auto free_block_end = free_block_start + free_element_iterator->size();
|
||||
const auto remaining_size = free_element_iterator->size() - aligned_size_including_header;
|
||||
|
||||
std::uint16_t size_before_header{0U};
|
||||
if (remaining_size >= 256U)
|
||||
{
|
||||
const auto index = std::distance(this->_free_elements.begin(), free_element_iterator);
|
||||
this->_free_elements[index].contract(aligned_size_including_header);
|
||||
this->_available_size -= aligned_size_including_header;
|
||||
}
|
||||
else
|
||||
{
|
||||
size_before_header = remaining_size;
|
||||
this->_free_elements.erase(free_element_iterator);
|
||||
this->_available_size -= free_element_iterator->size();
|
||||
}
|
||||
this->_lock.unlock();
|
||||
|
||||
const auto allocation_header_address = free_block_end - aligned_size_including_header;
|
||||
new (reinterpret_cast<void *>(allocation_header_address)) AllocatedHeader(
|
||||
aligned_size_including_header - sizeof(AllocatedHeader), size_before_header, this->_numa_node_id, this->_id);
|
||||
assert((allocation_header_address + sizeof(AllocatedHeader)) % alignment == 0 && "Not aligned");
|
||||
|
||||
return reinterpret_cast<void *>(allocation_header_address + sizeof(AllocatedHeader));
|
||||
}
|
||||
|
||||
void AllocationBlock::free(AllocatedHeader *allocation_header) noexcept
|
||||
{
|
||||
const auto allocated_size = allocation_header->size;
|
||||
const auto unused_size_before_header = allocation_header->unused_size_before_header;
|
||||
const auto block_address = reinterpret_cast<std::uintptr_t>(allocation_header) - unused_size_before_header;
|
||||
const auto size = allocated_size + unused_size_before_header + sizeof(AllocatedHeader);
|
||||
|
||||
const auto free_element = FreeHeader{block_address, size};
|
||||
|
||||
this->_lock.lock();
|
||||
|
||||
if (this->_free_elements.empty())
|
||||
{
|
||||
this->_free_elements.push_back(free_element);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto lower_bound_iterator =
|
||||
std::lower_bound(this->_free_elements.begin(), this->_free_elements.end(), free_element);
|
||||
const auto index = std::distance(this->_free_elements.begin(), lower_bound_iterator);
|
||||
assert(index >= 0 && "Index is negative");
|
||||
const auto real_index = std::size_t(index);
|
||||
|
||||
// Try merge to the right.
|
||||
if (real_index < this->_free_elements.size() && free_element.borders(this->_free_elements[real_index]))
|
||||
{
|
||||
this->_free_elements[real_index].merge(free_element);
|
||||
|
||||
// Okay, we inserted the new free element as merge, we do not insert it "real".
|
||||
// Try to merge the expanded right with the left.
|
||||
if (real_index > 0U && this->_free_elements[real_index - 1U].borders(this->_free_elements[real_index]))
|
||||
{
|
||||
this->_free_elements[real_index - 1].merge(this->_free_elements[real_index]);
|
||||
this->_free_elements.erase(this->_free_elements.begin() + real_index);
|
||||
}
|
||||
}
|
||||
else if (real_index > 0U && this->_free_elements[real_index - 1U].borders(free_element))
|
||||
{
|
||||
// In this case, we could not merge with the right, but can we merge
|
||||
// to the left? By this, we could save up the real insert.
|
||||
this->_free_elements[real_index - 1U].merge(free_element);
|
||||
}
|
||||
else
|
||||
{
|
||||
// We could not merge anything. Just insert.
|
||||
this->_free_elements.insert(this->_free_elements.begin() + real_index, free_element);
|
||||
}
|
||||
}
|
||||
this->_available_size += free_element.size();
|
||||
|
||||
this->_lock.unlock();
|
||||
}
|
||||
|
||||
std::pair<std::vector<FreeHeader>::iterator, std::size_t> AllocationBlock::find_block(const std::size_t alignment,
|
||||
const std::size_t size) noexcept
|
||||
{
|
||||
/**
|
||||
* Check each block of the free list for enough space to include the wanted space.
|
||||
* If enough, check the alignment (starting at the end).
|
||||
*
|
||||
* +----------------------------+
|
||||
* | 2000byte |
|
||||
* +----------------------------+
|
||||
* => wanted: 700byte
|
||||
* => align border -> 1300 is not aligned, expand to 720byte -> 1280 is aligned
|
||||
* +----------------------------+
|
||||
* | 1280byte | 720byte |
|
||||
* +----------------------------+
|
||||
*
|
||||
*/
|
||||
|
||||
const auto size_including_header = size + sizeof(AllocatedHeader);
|
||||
|
||||
for (auto iterator = this->_free_elements.begin(); iterator != this->_free_elements.end(); iterator++)
|
||||
{
|
||||
const auto &free_element = *iterator;
|
||||
if (free_element >= size_including_header)
|
||||
{
|
||||
const auto start = free_element.start();
|
||||
|
||||
// The free block ends here.
|
||||
const auto end = start + free_element.size();
|
||||
|
||||
// This is where we would start the memory block on allocation
|
||||
// But this may be not aligned.
|
||||
const auto possible_block_begin = end - size;
|
||||
|
||||
// This is the size we need to start the block aligned.
|
||||
const auto aligned_size = size + (possible_block_begin & (alignment - 1U));
|
||||
|
||||
// This is the size we need aligned and for header.
|
||||
const auto aligned_size_including_header = aligned_size + sizeof(AllocatedHeader);
|
||||
|
||||
if (free_element >= aligned_size_including_header)
|
||||
{
|
||||
// aligned_size_including_header
|
||||
return std::make_pair(iterator, aligned_size_including_header);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return std::make_pair(this->_free_elements.end(), 0U);
|
||||
}
|
||||
|
||||
Allocator::Allocator()
|
||||
{
|
||||
this->initialize_empty();
|
||||
}
|
||||
|
||||
void *Allocator::allocate(const std::uint8_t numa_node_id, const std::size_t alignment, const std::size_t size) noexcept
|
||||
{
|
||||
auto &allocation_blocks = this->_numa_allocation_blocks[numa_node_id];
|
||||
|
||||
auto *memory = allocation_blocks.back().allocate(alignment, size);
|
||||
if (memory == nullptr)
|
||||
{
|
||||
// This will be allocated default...
|
||||
constexpr auto default_alloc_size = 1UL << 28U;
|
||||
|
||||
// ... but if the requested size is higher, allocate more.
|
||||
const auto size_to_alloc = std::max(default_alloc_size, alignment_helper::next_multiple(size, 64UL));
|
||||
|
||||
// Try to allocate until allocation was successful.
|
||||
// It is possible, that another core tries to allocate at the
|
||||
// same time, therefore we capture the allocation flag (one per region)
|
||||
auto &flag = this->_numa_allocation_flags[numa_node_id].value();
|
||||
while (memory == nullptr)
|
||||
{
|
||||
allocate_new_block(numa_node_id, size_to_alloc, allocation_blocks, flag);
|
||||
memory = allocation_blocks.back().allocate(alignment, size);
|
||||
}
|
||||
}
|
||||
|
||||
return memory;
|
||||
}
|
||||
|
||||
void Allocator::allocate_new_block(const std::uint8_t numa_node_id, const std::size_t size,
|
||||
std::vector<AllocationBlock> &blocks, std::atomic<bool> &flag)
|
||||
{
|
||||
// Acquire the allocation flag to ensure only one thread to allocate.
|
||||
auto expected = false;
|
||||
const auto can_allocate = flag.compare_exchange_strong(expected, true);
|
||||
|
||||
if (can_allocate)
|
||||
{
|
||||
// If that was this thread go for it...
|
||||
const auto next_id = this->_next_allocation_id[numa_node_id].value().fetch_add(1U, std::memory_order_acq_rel);
|
||||
blocks.emplace_back(AllocationBlock{next_id, numa_node_id, size});
|
||||
|
||||
// .. but release the allocation flag afterward.
|
||||
flag.store(false);
|
||||
}
|
||||
else
|
||||
{
|
||||
// If that was another thread, wait until he finished.
|
||||
while (flag.load())
|
||||
{
|
||||
system::builtin::pause();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Allocator::free(void *pointer) noexcept
|
||||
{
|
||||
// Every allocated memory belongs to one allocation block.
|
||||
// The reason is, that we can only return full blocks to
|
||||
// the global heap that is managed by the operating system.
|
||||
const auto address = reinterpret_cast<std::uintptr_t>(pointer);
|
||||
|
||||
// Access the header to identify the allocation block.
|
||||
const auto header_address = address - sizeof(AllocatedHeader);
|
||||
auto *allocation_header = reinterpret_cast<AllocatedHeader *>(header_address);
|
||||
|
||||
// Check all blocks to find the matching one.
|
||||
for (auto &block : this->_numa_allocation_blocks[allocation_header->numa_node_id])
|
||||
{
|
||||
if (allocation_header->allocation_block_id == block.id())
|
||||
{
|
||||
block.free(allocation_header);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Allocator::defragment() noexcept
|
||||
{
|
||||
// Remove all blocks that are unused to free as much memory as possible.
|
||||
for (auto i = 0U; i <= system::topology::max_node_id(); ++i)
|
||||
{
|
||||
auto &numa_blocks = this->_numa_allocation_blocks[i];
|
||||
numa_blocks.erase(
|
||||
std::remove_if(numa_blocks.begin(), numa_blocks.end(), [](const auto &block) { return block.is_free(); }),
|
||||
numa_blocks.end());
|
||||
}
|
||||
|
||||
// If all memory was released, acquire new.
|
||||
this->initialize_empty();
|
||||
}
|
||||
|
||||
void Allocator::initialize_empty()
|
||||
{
|
||||
// For performance reasons: Each list must contain at least
|
||||
// one block. This way, we do not have to check every time.
|
||||
for (auto i = 0U; i <= system::topology::max_node_id(); ++i)
|
||||
{
|
||||
auto &blocks = this->_numa_allocation_blocks[i];
|
||||
if (blocks.empty())
|
||||
{
|
||||
const auto next_id = this->_next_allocation_id[i].value().fetch_add(1U, std::memory_order_relaxed);
|
||||
blocks.emplace_back(AllocationBlock{next_id, std::uint8_t(i), 4096U * 4096U});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool Allocator::is_free() const noexcept
|
||||
{
|
||||
for (auto i = 0U; i <= system::topology::max_node_id(); ++i)
|
||||
{
|
||||
const auto &numa_blocks = this->_numa_allocation_blocks[i];
|
||||
const auto iterator = std::find_if(numa_blocks.cbegin(), numa_blocks.cend(), [](const auto &allocation_block) {
|
||||
return allocation_block.is_free() == false;
|
||||
});
|
||||
|
||||
if (iterator != numa_blocks.cend())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void Allocator::release_allocated_memory() noexcept
|
||||
{
|
||||
for (auto i = 0U; i <= system::topology::max_node_id(); ++i)
|
||||
{
|
||||
this->_numa_allocation_blocks[i].clear();
|
||||
this->_next_allocation_id[i].value().store(0U);
|
||||
}
|
||||
}
|
||||
185
src/mx/memory/dynamic_size_allocator.h
Normal file
185
src/mx/memory/dynamic_size_allocator.h
Normal file
@@ -0,0 +1,185 @@
|
||||
#pragma once
|
||||
|
||||
#include "config.h"
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <mx/synchronization/spinlock.h>
|
||||
#include <mx/util/aligned_t.h>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace mx::memory::dynamic {
|
||||
|
||||
/**
 * Represents free space within an allocation block.
 * Holds the start and the size of a free object.
 */
class FreeHeader
{
public:
    constexpr FreeHeader(const std::uintptr_t start, const std::size_t size) noexcept : _start(start), _size(size) {}
    constexpr FreeHeader(const FreeHeader &other) noexcept = default;
    ~FreeHeader() noexcept = default;

    /// Shrinks the free range by the given number of bytes.
    void contract(const std::size_t size) noexcept { _size -= size; }

    /// @return Address where the free range begins.
    [[nodiscard]] std::uintptr_t start() const noexcept { return _start; }

    /// @return Size of the free range in bytes.
    /// (Return type fixed to std::size_t to match the stored member;
    /// it was declared std::uintptr_t before.)
    [[nodiscard]] std::size_t size() const noexcept { return _size; }

    /// Orders free ranges by their start address.
    bool operator<(const FreeHeader &other) const noexcept { return _start < other._start; }

    /// @return True, if this free range is large enough for the given size.
    bool operator>=(const std::size_t size) const noexcept { return _size >= size; }

    /// @return True, if this range ends exactly where the other one begins.
    [[nodiscard]] bool borders(const FreeHeader &other) const noexcept { return (_start + _size) == other._start; }

    /**
     * Merges an adjacent free range into this one. The ranges must be
     * direct neighbors (asserted); afterwards this range covers both.
     *
     * @param other Adjacent free range to absorb.
     */
    void merge(const FreeHeader &other) noexcept
    {
        if (other._start < _start)
        {
            assert(other.borders(*this) && "Can not merge: Elements are not next to each other");
            _start = other._start;
            _size += other._size;
        }
        else
        {
            assert(borders(other) && "Can not merge: Elements are not next to each other");
            _size += other._size;
        }
    }

private:
    // Address of the first free byte.
    std::uintptr_t _start;

    // Number of free bytes starting at _start.
    std::size_t _size;
};
|
||||
|
||||
/**
 * Header in front of allocated memory, storing the
 * size, the size which is unused because of alignment,
 * the ID of the NUMA node the memory is allocated on,
 * and the source allocation block of this memory.
 */
struct AllocatedHeader
{
    constexpr AllocatedHeader(const std::size_t allocation_size, const std::uint16_t alignment_padding,
                              const std::uint8_t node_id, const std::uint32_t block_id) noexcept
        : size(allocation_size), unused_size_before_header(alignment_padding), numa_node_id(node_id),
          allocation_block_id(block_id)
    {
    }

    // Size of the allocation in bytes.
    const std::size_t size;

    // Bytes left unused in front of this header because of alignment.
    const std::uint16_t unused_size_before_header;

    // NUMA node the memory was allocated on.
    const std::uint8_t numa_node_id;

    // Identifier of the AllocationBlock this memory came from.
    const std::uint32_t allocation_block_id;
};
|
||||
|
||||
/**
 * Set of one or more free tiles, that can be allocated.
 * One block owns one contiguous memory region (_allocated_block) and
 * tracks the free ranges within it as a list of FreeHeaders.
 */
class AllocationBlock
{
public:
    AllocationBlock(std::uint32_t id, std::uint8_t numa_node_id, std::size_t size);
    AllocationBlock(const AllocationBlock &other) = delete;
    AllocationBlock(AllocationBlock &&other) noexcept;
    AllocationBlock &operator=(AllocationBlock &&other) noexcept;
    ~AllocationBlock();

    /**
     * Allocates memory from the allocation block.
     *
     * @param alignment Requested alignment.
     * @param size Requested size.
     * @return Pointer to the allocated memory.
     */
    void *allocate(std::size_t alignment, std::size_t size) noexcept;

    /**
     * Frees memory.
     *
     * @param allocation_header Pointer to the header of the freed memory.
     */
    void free(AllocatedHeader *allocation_header) noexcept;

    /**
     * @return Unique number of this allocation block.
     */
    [[nodiscard]] std::uint32_t id() const noexcept { return _id; }

    /**
     * @return True, if the full block is free.
     */
    [[nodiscard]] bool is_free() const noexcept
    {
        // Either nothing was ever carved out (empty list) or everything was
        // freed and merged back into one range that spans the whole block.
        return _free_elements.empty() || (_free_elements.size() == 1 && _free_elements[0].size() == _size);
    }

private:
    // Unique number of this block (first cache line: read-mostly fields).
    alignas(64) std::uint32_t _id;

    // NUMA node the block's memory lives on.
    std::uint8_t _numa_node_id;

    // Total size of the block in bytes.
    std::size_t _size;

    // Start of the owned memory region.
    void *_allocated_block;

    // Free ranges within the block, ordered by start address.
    std::vector<FreeHeader> _free_elements;

    // Bytes currently available (separate cache line: mutated on alloc/free).
    alignas(64) std::size_t _available_size;

    // Protects concurrent allocate()/free() calls.
    synchronization::Spinlock _lock;

    // Finds a free range that can satisfy an aligned request; returns the
    // range's iterator and the offset needed for alignment (see .cpp).
    std::pair<std::vector<FreeHeader>::iterator, std::size_t> find_block(std::size_t alignment,
                                                                         std::size_t size) noexcept;
};
|
||||
|
||||
/**
 * Allocator which holds a set of allocation blocks separated
 * for each numa node region.
 */
class Allocator
{
public:
    Allocator();
    ~Allocator() = default;

    // Allocates `size` bytes with the given alignment from the blocks of
    // the given NUMA region (definition in the .cpp file).
    void *allocate(std::uint8_t numa_node_id, std::size_t alignment, std::size_t size) noexcept;

    // Returns memory previously handed out by allocate().
    void free(void *pointer) noexcept;

    /**
     * Frees unused allocation blocks.
     */
    void defragment() noexcept;

    /**
     * Releases all allocated memory.
     */
    void release_allocated_memory() noexcept;

    /**
     * Adds minimal memory to all numa node regions.
     */
    void initialize_empty();

    /**
     * @return True, if all blocks of all numa regions are free.
     */
    [[nodiscard]] bool is_free() const noexcept;

private:
    // Allocation blocks per numa node region.
    std::array<std::vector<AllocationBlock>, config::max_numa_nodes()> _numa_allocation_blocks;

    // Allocation flags, used for synchronization when allocating, per numa node region.
    std::array<util::aligned_t<std::atomic<bool>>, config::max_numa_nodes()> _numa_allocation_flags;

    // Sequence for block allocation per numa node region.
    std::array<util::aligned_t<std::atomic_uint32_t>, config::max_numa_nodes()> _next_allocation_id;

    /**
     * Allocates (thread-safe) a block of fresh memory
     * @param numa_node_id NUMA node to allocate the block on.
     * @param size Size of the new block in bytes.
     * @param blocks Block list of that node the new block is appended to.
     * @param flag Per-node flag used for mutual exclusion.
     */
    void allocate_new_block(std::uint8_t numa_node_id, std::size_t size, std::vector<AllocationBlock> &blocks,
                            std::atomic<bool> &flag);
};
|
||||
|
||||
} // namespace mx::memory::dynamic
|
||||
356
src/mx/memory/fixed_size_allocator.h
Normal file
356
src/mx/memory/fixed_size_allocator.h
Normal file
@@ -0,0 +1,356 @@
|
||||
#pragma once
|
||||
|
||||
#include "alignment_helper.h"
#include "config.h"
#include "global_heap.h"
#include "task_allocator_interface.h"
#include <array>
#include <atomic>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <mx/synchronization/spinlock.h>
#include <mx/system/cache.h>
#include <mx/system/topology.h>
#include <mx/tasking/config.h>
#include <mx/util/core_set.h>
#include <unordered_map>
#include <vector>
||||
|
||||
namespace mx::memory::fixed {
|
||||
/**
 * Represents a free memory object: an intrusive node of the per-core
 * free list, tagged with the NUMA node it was allocated on.
 */
class FreeHeader
{
public:
    constexpr FreeHeader() noexcept = default;
    ~FreeHeader() noexcept = default;

    /// @return Successor in the free list, nullptr at the end.
    [[nodiscard]] FreeHeader *next() const noexcept { return _next; }

    /// Links the given object as successor in the free list.
    void next(FreeHeader *next) noexcept { _next = next; }

    /// Remembers the NUMA node this object belongs to.
    void numa_node_id(const std::uint8_t numa_node_id) noexcept { _numa_node_id = numa_node_id; }

    /// @return NUMA node this object belongs to.
    [[nodiscard]] std::uint8_t numa_node_id() const noexcept { return _numa_node_id; }

private:
    // Next free object in the intrusive list.
    FreeHeader *_next = nullptr;

    // NUMA node of the underlying memory.
    std::uint8_t _numa_node_id = 0U;
};
|
||||
|
||||
/**
 * The Chunk holds a fixed size of memory: a non-owning handle to a
 * 16 MiB region handed out by the ProcessorHeap.
 */
class Chunk
{
public:
    Chunk() noexcept = default;
    explicit Chunk(void *memory) noexcept : _memory(memory) {}
    ~Chunk() noexcept = default;

    /// @return Fixed chunk size in bytes (16 MiB).
    static constexpr auto size() { return 4096 * 4096; /* 16mb */ }

    /// @return True, if the chunk references memory.
    explicit operator bool() const noexcept { return _memory != nullptr; }

    /// @return The referenced memory as a raw pointer.
    explicit operator void *() const noexcept { return _memory; }

    /// @return The referenced memory as an integer address.
    explicit operator std::uintptr_t() const noexcept { return reinterpret_cast<std::uintptr_t>(_memory); }

private:
    // Referenced (not owned) memory region.
    void *_memory = nullptr;
};
|
||||
|
||||
/**
|
||||
* The ProcessorHeap holds memory for a single socket.
|
||||
* All cores sitting on this socket can allocate memory.
|
||||
* Internal, the ProcessorHeap bufferes allocated memory
|
||||
* to minimize access to the global heap.
|
||||
*/
|
||||
class ProcessorHeap
|
||||
{
|
||||
public:
|
||||
explicit ProcessorHeap(const std::uint8_t numa_node_id) noexcept : _numa_node_id(numa_node_id)
|
||||
{
|
||||
_allocated_chunks.reserve(1024);
|
||||
fill_buffer<true>();
|
||||
}
|
||||
|
||||
~ProcessorHeap() noexcept
|
||||
{
|
||||
for (const auto allocated_chunk : _allocated_chunks)
|
||||
{
|
||||
GlobalHeap::free(static_cast<void *>(allocated_chunk), Chunk::size());
|
||||
}
|
||||
|
||||
for (const auto free_chunk : _free_chunk_buffer)
|
||||
{
|
||||
GlobalHeap::free(static_cast<void *>(free_chunk), Chunk::size());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return ID of the NUMA node the memory is allocated on.
|
||||
*/
|
||||
[[nodiscard]] std::uint8_t numa_node_id() const noexcept { return _numa_node_id; }
|
||||
|
||||
/**
|
||||
* Allocates a chunk of memory from the internal buffer.
|
||||
* In case the buffer is empty, new Chunks from the GlobalHeap
|
||||
* will be allocated.
|
||||
*
|
||||
* @return A chunk of allocated memory.
|
||||
*/
|
||||
Chunk allocate() noexcept
|
||||
{
|
||||
const auto next_free_chunk = _next_free_chunk.fetch_add(1, std::memory_order_relaxed);
|
||||
if (next_free_chunk < _free_chunk_buffer.size())
|
||||
{
|
||||
return _free_chunk_buffer[next_free_chunk];
|
||||
}
|
||||
|
||||
auto expect = false;
|
||||
const auto can_fill = _fill_buffer_flag.compare_exchange_strong(expect, true);
|
||||
if (can_fill)
|
||||
{
|
||||
fill_buffer<false>();
|
||||
_fill_buffer_flag = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
while (_fill_buffer_flag)
|
||||
{
|
||||
system::builtin::pause();
|
||||
}
|
||||
}
|
||||
|
||||
return allocate();
|
||||
}
|
||||
|
||||
private:
|
||||
// Size of the internal chunk buffer.
|
||||
inline static constexpr auto CHUNKS = 128U;
|
||||
|
||||
// ID of the NUMA node of this ProcessorHeap.
|
||||
alignas(64) const std::uint8_t _numa_node_id;
|
||||
|
||||
// Buffer for free chunks.
|
||||
std::array<Chunk, CHUNKS> _free_chunk_buffer;
|
||||
|
||||
// Pointer to the next free chunk in the buffer.
|
||||
alignas(64) std::atomic_uint8_t _next_free_chunk{0U};
|
||||
|
||||
// Flag, used for allocation from the global Heap for mutual exclusion.
|
||||
std::atomic_bool _fill_buffer_flag{false};
|
||||
|
||||
// List of all allocated chunks, they will be freed later.
|
||||
std::vector<Chunk> _allocated_chunks;
|
||||
|
||||
/**
|
||||
* Allocates a very big chunk from the GlobalHeap and
|
||||
* splits it into smaller chunks to store them in the
|
||||
* internal buffer.
|
||||
*/
|
||||
template <bool IS_FIRST = false> void fill_buffer() noexcept
|
||||
{
|
||||
if constexpr (IS_FIRST == false)
|
||||
{
|
||||
for (const auto &chunk : _free_chunk_buffer)
|
||||
{
|
||||
_allocated_chunks.push_back(chunk);
|
||||
}
|
||||
}
|
||||
|
||||
auto *heap_memory = GlobalHeap::allocate(_numa_node_id, Chunk::size() * _free_chunk_buffer.size());
|
||||
auto heap_memory_address = reinterpret_cast<std::uintptr_t>(heap_memory);
|
||||
for (auto i = 0U; i < _free_chunk_buffer.size(); ++i)
|
||||
{
|
||||
_free_chunk_buffer[i] = Chunk(reinterpret_cast<void *>(heap_memory_address + (i * Chunk::size())));
|
||||
}
|
||||
|
||||
_next_free_chunk.store(0U);
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * The CoreHeap represents the allocator on a single core.
 * By this, allocations are latch-free: each core works on its own
 * intrusive free list and only falls back to the shared ProcessorHeap
 * when the list runs empty.
 */
template <std::size_t S> class alignas(64) CoreHeap
{
public:
    explicit CoreHeap(ProcessorHeap *processor_heap) noexcept
        : _processor_heap(processor_heap), _numa_node_id(processor_heap->numa_node_id())
    {
    }

    CoreHeap() noexcept = default;

    ~CoreHeap() noexcept = default;

    /**
     * Allocates new memory from the CoreHeap.
     * When the internal buffer is empty, the CoreHeap
     * will allocate new chunks from the ProcessorHeap.
     *
     * @return Pointer to the new allocated memory.
     */
    void *allocate() noexcept
    {
        if (empty())
        {
            fill_buffer();
        }

        // Pop the first object from the free list.
        auto *free_object = _first;
        _first = free_object->next();

        // NOTE(review): _last is not updated when the popped object was the
        // last one; free() on the remote-NUMA path dereferences _last —
        // confirm that path can never see a stale/null _last.
        if constexpr (config::low_priority_for_external_numa())
        {
            // A 64-byte FreeHeader prefix stays in front of the usable
            // memory to remember the object's NUMA node.
            free_object->numa_node_id(_numa_node_id);

            return reinterpret_cast<void *>(reinterpret_cast<std::uintptr_t>(free_object) + 64U);
        }
        else
        {
            return static_cast<void *>(free_object);
        }
    }

    /**
     * Frees a memory object. The new available memory location
     * will be placed in front of the "available"-list. By this,
     * the next allocation will use the just freed object, which
     * may be still in the CPU cache.
     *
     * @param pointer Pointer to the memory object to be freed.
     */
    void free(void *pointer) noexcept
    {
        if constexpr (config::low_priority_for_external_numa())
        {
            // Recover the FreeHeader that sits 64 bytes before the
            // pointer handed out by allocate().
            const auto address = reinterpret_cast<std::uintptr_t>(pointer);
            auto *free_object = reinterpret_cast<FreeHeader *>(address - 64U);

            if (free_object->numa_node_id() == _numa_node_id)
            {
                // Local memory: push to the front (cache-warm reuse).
                free_object->next(_first);
                _first = free_object;
            }
            else
            {
                // Remote NUMA memory: append to the back so local memory
                // is preferred on subsequent allocations.
                _last->next(free_object);
                free_object->next(nullptr);
                _last = free_object;
            }
        }
        else
        {
            auto *free_object = static_cast<FreeHeader *>(pointer);
            free_object->next(_first);
            _first = free_object;
        }
    }

    /**
     * Fills the buffer by asking the ProcessorHeap for more memory.
     * This is latch-free since just a single core calls this method.
     */
    void fill_buffer()
    {
        auto chunk = _processor_heap->allocate();
        const auto chunk_address = static_cast<std::uintptr_t>(chunk);

        // With the NUMA-priority option each object carries a 64-byte
        // FreeHeader prefix in addition to its payload of S bytes.
        constexpr auto object_size = config::low_priority_for_external_numa() ? S + 64U : S;
        constexpr auto count_objects = std::uint64_t{Chunk::size() / object_size};

        auto *first_free = reinterpret_cast<FreeHeader *>(chunk_address);
        auto *last_free = reinterpret_cast<FreeHeader *>(chunk_address + ((count_objects - 1) * object_size));

        // Thread all objects of the chunk into one singly-linked free list.
        auto *current_free = first_free;
        for (auto i = 0U; i < count_objects - 1U; ++i)
        {
            auto *next = reinterpret_cast<FreeHeader *>(chunk_address + ((i + 1U) * object_size));
            current_free->next(next);
            current_free = next;
        }

        last_free->next(nullptr);
        _first = first_free;
        _last = last_free;
    }

private:
    // Processor heap to allocate new chunks.
    ProcessorHeap *_processor_heap = nullptr;

    // ID of the NUMA node the core is placed in.
    std::uint8_t _numa_node_id = 0U;

    // First element of the list of free memory objects.
    FreeHeader *_first = nullptr;

    // Last element of the list of free memory objects.
    FreeHeader *_last = nullptr;

    /**
     * @return True, when the buffer is empty.
     */
    [[nodiscard]] bool empty() const noexcept { return _first == nullptr; }
};
|
||||
|
||||
/**
|
||||
* The Allocator is the interface to the internal CoreHeaps.
|
||||
*/
|
||||
template <std::size_t S> class Allocator final : public TaskAllocatorInterface
|
||||
{
|
||||
public:
|
||||
explicit Allocator(const util::core_set &core_set) : _core_heaps(core_set.size())
|
||||
{
|
||||
_processor_heaps.fill(nullptr);
|
||||
|
||||
for (auto i = 0U; i < core_set.size(); ++i)
|
||||
{
|
||||
const auto core_id = core_set[i];
|
||||
const auto node_id = system::topology::node_id(core_id);
|
||||
if (_processor_heaps[node_id] == nullptr)
|
||||
{
|
||||
_processor_heaps[node_id] =
|
||||
new (GlobalHeap::allocate_cache_line_aligned(sizeof(ProcessorHeap))) ProcessorHeap(node_id);
|
||||
}
|
||||
|
||||
auto core_heap = CoreHeap<S>{_processor_heaps[node_id]};
|
||||
core_heap.fill_buffer();
|
||||
_core_heaps.insert(std::make_pair(core_id, std::move(core_heap)));
|
||||
}
|
||||
}
|
||||
|
||||
~Allocator() override
|
||||
{
|
||||
for (auto *processor_heap : _processor_heaps)
|
||||
{
|
||||
delete processor_heap;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocates memory from the given CoreHeap.
|
||||
*
|
||||
* @param core_id ID of the core.
|
||||
* @return Allocated memory object.
|
||||
*/
|
||||
void *allocate(const std::uint16_t core_id) override { return _core_heaps[core_id].allocate(); }
|
||||
|
||||
/**
|
||||
* Frees memory.
|
||||
*
|
||||
* @param core_id ID of the core to place the free object in.
|
||||
* @param address Pointer to the memory object.
|
||||
*/
|
||||
void free(const std::uint16_t core_id, void *address) noexcept override { _core_heaps[core_id].free(address); }
|
||||
|
||||
private:
|
||||
// Heap for every processor socket/NUMA region.
|
||||
std::array<ProcessorHeap *, config::max_numa_nodes()> _processor_heaps;
|
||||
|
||||
// Map from core_id to core-local allocator.
|
||||
std::unordered_map<std::uint16_t, CoreHeap<S>> _core_heaps;
|
||||
};
|
||||
} // namespace mx::memory::fixed
|
||||
46
src/mx/memory/global_heap.h
Normal file
46
src/mx/memory/global_heap.h
Normal file
@@ -0,0 +1,46 @@
|
||||
#pragma once
|
||||
#include "alignment_helper.h"
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <numa.h>
|
||||
|
||||
namespace mx::memory {
|
||||
/**
 * The global heap represents the heap, provided by the OS.
 */
class GlobalHeap
{
public:
    /**
     * Allocates the given size on the given NUMA node.
     *
     * @param numa_node_id ID of the NUMA node the memory should be allocated on.
     * @param size Size of the memory to be allocated.
     * @return Pointer to allocated memory.
     */
    static void *allocate(const std::uint8_t numa_node_id, const std::size_t size)
    {
        // NOTE(review): numa_alloc_onnode returns nullptr on failure and the
        // callers in this file do not check — confirm this is acceptable.
        return numa_alloc_onnode(size, numa_node_id);
    }

    /**
     * Allocates the given memory aligned to the cache line
     * with a multiple of the alignment as a size.
     * The allocated memory is not NUMA aware.
     * @param size Size to be allocated.
     * @return Allocated memory
     */
    static void *allocate_cache_line_aligned(const std::size_t size)
    {
        // Memory from std::aligned_alloc must be released via std::free;
        // GlobalHeap::free (numa_free) does NOT match this allocation.
        return std::aligned_alloc(64U, alignment_helper::next_multiple(size, 64UL));
    }

    /**
     * Frees the given memory (counterpart of allocate(), i.e. numa_free).
     *
     * @param memory Pointer to memory.
     * @param size Size of the allocated object.
     */
    static void free(void *memory, const std::size_t size) { numa_free(memory, size); }
};
|
||||
} // namespace mx::memory
|
||||
155
src/mx/memory/reclamation/epoch_manager.cpp
Normal file
155
src/mx/memory/reclamation/epoch_manager.cpp
Normal file
@@ -0,0 +1,155 @@
|
||||
#include "epoch_manager.h"
|
||||
#include <mx/system/topology.h>
|
||||
#include <mx/tasking/runtime.h>
|
||||
#include <mx/util/queue.h>
|
||||
#include <thread>
|
||||
|
||||
using namespace mx::memory::reclamation;
|
||||
|
||||
// Entry point of the dedicated epoch thread: advances the global epoch and
// triggers garbage collection in fixed intervals until shutdown.
void EpochManager::enter_epoch_periodically()
{
    // Wait until the scheduler starts the system.
    while (this->_is_running == false)
    {
        system::builtin::pause();
    }

    // Enter new epochs and collect garbage periodically
    // while the system is running.
    while (this->_is_running)
    {
        // Enter new epoch.
        this->_global_epoch.fetch_add(1U);

        if constexpr (config::local_garbage_collection())
        {
            // Collect local garbage: spawn one reclamation task per channel
            // (each drains its channel-local queue), all created on this
            // thread's core.
            const auto core_id = mx::system::topology::core_id();
            for (auto channel_id = 0U; channel_id < this->_count_channels; ++channel_id)
            {
                auto *garbage_task =
                    mx::tasking::runtime::new_task<ReclaimEpochGarbageTask>(core_id, *this, this->_allocator);
                garbage_task->annotate(std::uint16_t(channel_id));
                mx::tasking::runtime::spawn(*garbage_task);
            }
        }
        else
        {
            // Collect global garbage of finished epochs.
            this->reclaim_epoch_garbage();
        }

        // Wait some time until next epoch.
        std::this_thread::sleep_for(config::epoch_interval()); // NOLINT: sleep_for seems to crash clang-tidy
    }
}
|
||||
|
||||
// Drains the global garbage queue once: resources removed before every
// channel's local epoch are freed, the rest are re-queued.
void EpochManager::reclaim_epoch_garbage() noexcept
{
    // Items logically removed in an epoch leq than
    // this epoch can be removed physically.
    const auto min_epoch = this->min_local_epoch();

    // Items that could not be physically removed in this epoch
    // and therefore have to be scheduled to the next one.
    util::Queue<resource::ResourceInterface> deferred_resources{};

    resource::ResourceInterface *resource;
    while ((resource = reinterpret_cast<resource::ResourceInterface *>(this->_global_garbage_queue.pop_front())) !=
           nullptr)
    {
        if (resource->remove_epoch() < min_epoch)
        {
            // No reader can still hold a reference: reclaim and free.
            resource->on_reclaim();
            this->_allocator.free(static_cast<void *>(resource));
        }
        else
        {
            deferred_resources.push_back(resource);
        }
    }

    // Resources that could not be deleted physically
    // need to be deleted in next epochs.
    if (deferred_resources.empty() == false)
    {
        this->_global_garbage_queue.push_back(deferred_resources.begin(), deferred_resources.end());
    }
}
|
||||
|
||||
// Unconditionally reclaims every queued resource, ignoring epochs; per the
// header this runs right before tasking shuts down.
void EpochManager::reclaim_all() noexcept
{
    if constexpr (config::local_garbage_collection())
    {
        // Drain every channel-local garbage queue.
        for (auto channel_id = 0U; channel_id < this->_count_channels; ++channel_id)
        {
            resource::ResourceInterface *resource;
            while ((resource = reinterpret_cast<resource::ResourceInterface *>(
                        this->_local_garbage_queues[channel_id].value().pop_front())) != nullptr)
            {
                resource->on_reclaim();
                this->_allocator.free(static_cast<void *>(resource));
            }
        }
    }
    else
    {
        // Drain the single global garbage queue.
        resource::ResourceInterface *resource;
        while ((resource = reinterpret_cast<resource::ResourceInterface *>(this->_global_garbage_queue.pop_front())) !=
               nullptr)
        {
            resource->on_reclaim();
            this->_allocator.free(static_cast<void *>(resource));
        }
    }
}
|
||||
|
||||
void EpochManager::reset() noexcept
|
||||
{
|
||||
if (this->_allocator.is_free())
|
||||
{
|
||||
this->_global_epoch.store(0U);
|
||||
for (auto channel_id = 0U; channel_id < tasking::config::max_cores(); ++channel_id)
|
||||
{
|
||||
_local_epochs[channel_id] = std::numeric_limits<epoch_t>::max();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Drains the garbage queue of the channel this task was annotated with;
// counterpart of EpochManager::reclaim_epoch_garbage for local collection.
mx::tasking::TaskResult ReclaimEpochGarbageTask::execute(const std::uint16_t /*core_id*/,
                                                         const std::uint16_t channel_id)
{
    // Items logically removed in an epoch leq than
    // this epoch can be removed physically.
    const auto min_epoch = this->_epoch_manager.min_local_epoch();

    // Items that could not be physically removed in this epoch
    // and therefore have to be scheduled to the next one.
    util::Queue<resource::ResourceInterface> deferred_resources{};

    // Queue with channel-local garbage.
    auto &garbage_queue = this->_epoch_manager.local_garbage(channel_id);

    resource::ResourceInterface *resource;
    while ((resource = reinterpret_cast<resource::ResourceInterface *>(garbage_queue.pop_front())) != nullptr)
    {
        if (resource->remove_epoch() < min_epoch)
        {
            // No reader can still hold a reference: reclaim and free.
            resource->on_reclaim();
            this->_allocator.free(static_cast<void *>(resource));
        }
        else
        {
            deferred_resources.push_back(resource);
        }
    }

    // Resources that could not be deleted physically
    // need to be deleted in next epochs.
    if (deferred_resources.empty() == false)
    {
        garbage_queue.push_back(deferred_resources.begin(), deferred_resources.end());
    }

    // One-shot task: remove it after execution.
    return tasking::TaskResult::make_remove();
}
|
||||
183
src/mx/memory/reclamation/epoch_manager.h
Normal file
183
src/mx/memory/reclamation/epoch_manager.h
Normal file
@@ -0,0 +1,183 @@
|
||||
#pragma once
|
||||
|
||||
#include "epoch_t.h"
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <mx/memory/config.h>
|
||||
#include <mx/memory/dynamic_size_allocator.h>
|
||||
#include <mx/resource/resource_interface.h>
|
||||
#include <mx/system/builtin.h>
|
||||
#include <mx/tasking/config.h>
|
||||
#include <mx/tasking/task.h>
|
||||
#include <mx/util/aligned_t.h>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <mx/util/maybe_atomic.h>
|
||||
#include <mx/util/mpsc_queue.h>
|
||||
#include <thread>
|
||||
|
||||
namespace mx::memory::reclamation {
|
||||
class alignas(64) LocalEpoch
|
||||
{
|
||||
public:
|
||||
constexpr LocalEpoch() noexcept = default;
|
||||
~LocalEpoch() noexcept = default;
|
||||
|
||||
LocalEpoch &operator=(const epoch_t epoch) noexcept
|
||||
{
|
||||
_epoch = epoch;
|
||||
return *this;
|
||||
}
|
||||
|
||||
void enter(const std::atomic<epoch_t> &global_epoch) noexcept
|
||||
{
|
||||
_epoch.store(global_epoch.load(std::memory_order_seq_cst), std::memory_order_seq_cst);
|
||||
}
|
||||
void leave() noexcept { _epoch.store(std::numeric_limits<epoch_t>::max()); }
|
||||
|
||||
[[nodiscard]] epoch_t operator()() const noexcept { return _epoch.load(std::memory_order_seq_cst); }
|
||||
|
||||
private:
|
||||
std::atomic<epoch_t> _epoch{std::numeric_limits<epoch_t>::max()};
|
||||
};
|
||||
|
||||
/**
|
||||
* The Epoch Manager manages periodic epochs which
|
||||
* are used to protect reads against concurrent
|
||||
* delete operations. Therefore, a global epoch
|
||||
* will be incremented every 50ms (configurable).
|
||||
* Read operations, on the other hand, will update
|
||||
* their local epoch every time before reading an
|
||||
* optimistic resource.
|
||||
* When (logically) deleting an optimistic resource,
|
||||
* the resource will be deleted physically, when
|
||||
* every local epoch is greater than the epoch
|
||||
* when the resource is deleted.
|
||||
*/
|
||||
class EpochManager
|
||||
{
|
||||
public:
|
||||
EpochManager(const std::uint16_t count_channels, dynamic::Allocator &allocator,
|
||||
util::maybe_atomic<bool> &is_running) noexcept
|
||||
: _count_channels(count_channels), _is_running(is_running), _allocator(allocator)
|
||||
{
|
||||
}
|
||||
|
||||
EpochManager(const EpochManager &) = delete;
|
||||
|
||||
~EpochManager() = default;
|
||||
|
||||
LocalEpoch &operator[](const std::uint16_t channel_id) noexcept { return _local_epochs[channel_id]; }
|
||||
|
||||
/**
|
||||
* @return Access to read to global epoch.
|
||||
*/
|
||||
[[nodiscard]] const std::atomic<epoch_t> &global_epoch() const noexcept { return _global_epoch; }
|
||||
|
||||
/**
|
||||
* @return The minimal epoch of all channels.
|
||||
*/
|
||||
[[nodiscard]] epoch_t min_local_epoch() const noexcept
|
||||
{
|
||||
auto min_epoch = _local_epochs[0U]();
|
||||
for (auto channel_id = 1U; channel_id < _count_channels; ++channel_id)
|
||||
{
|
||||
min_epoch = std::min(min_epoch, _local_epochs[channel_id]());
|
||||
}
|
||||
|
||||
return min_epoch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds an optimistic resource to garbage collection.
|
||||
* @param resource Resource to logically delete.
|
||||
*/
|
||||
void add_to_garbage_collection(resource::ResourceInterface *resource,
|
||||
[[maybe_unused]] const std::uint16_t owning_channel_id) noexcept
|
||||
{
|
||||
resource->remove_epoch(_global_epoch.load(std::memory_order_acq_rel));
|
||||
|
||||
if constexpr (config::local_garbage_collection())
|
||||
{
|
||||
_local_garbage_queues[owning_channel_id].value().push_back(resource);
|
||||
}
|
||||
else
|
||||
{
|
||||
_global_garbage_queue.push_back(resource);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Called periodically by a separate thread.
|
||||
*/
|
||||
void enter_epoch_periodically();
|
||||
|
||||
/**
|
||||
* Reclaims all garbage, mainly right before shut down tasking.
|
||||
*/
|
||||
void reclaim_all() noexcept;
|
||||
|
||||
/**
|
||||
* Grants access to the local garbage queue of a specific channel.
|
||||
*
|
||||
* @param channel_id Channel Id.
|
||||
* @return Local garbage queue.
|
||||
*/
|
||||
[[nodiscard]] util::MPSCQueue<resource::ResourceInterface> &local_garbage(const std::uint16_t channel_id) noexcept
|
||||
{
|
||||
return _local_garbage_queues[channel_id].value();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset all local and the global epoch to initial values
|
||||
* if no memory is in use.
|
||||
*/
|
||||
void reset() noexcept;
|
||||
|
||||
private:
|
||||
// Number of used channels; important for min-calculation.
|
||||
const std::uint16_t _count_channels;
|
||||
|
||||
// Flag of the scheduler indicating the state of the system.
|
||||
util::maybe_atomic<bool> &_is_running;
|
||||
|
||||
// Allocator to free collected resources.
|
||||
dynamic::Allocator &_allocator;
|
||||
|
||||
// Global epoch, incremented periodically.
|
||||
std::atomic<epoch_t> _global_epoch{0U};
|
||||
|
||||
// Local epochs, one for every channel.
|
||||
alignas(64) std::array<LocalEpoch, tasking::config::max_cores()> _local_epochs;
|
||||
|
||||
// Queue that holds all logically deleted objects in a global space.
|
||||
alignas(64) util::MPSCQueue<resource::ResourceInterface> _global_garbage_queue;
|
||||
|
||||
// Queues for every worker thread. Logically deleted objects are stored here
|
||||
// whenever local garbage collection is used.
|
||||
alignas(64) std::array<util::aligned_t<util::MPSCQueue<resource::ResourceInterface>>,
|
||||
tasking::config::max_cores()> _local_garbage_queues;
|
||||
|
||||
/**
|
||||
* Reclaims resources with regard to the epoch.
|
||||
*/
|
||||
void reclaim_epoch_garbage() noexcept;
|
||||
};
|
||||
|
||||
/**
 * Task that drains one channel-local garbage queue (the channel it was
 * annotated with): resources removed before the minimal local epoch are
 * reclaimed and freed, the rest re-queued — see execute() in the .cpp file.
 */
class ReclaimEpochGarbageTask final : public tasking::TaskInterface
{
public:
    constexpr ReclaimEpochGarbageTask(EpochManager &epoch_manager, dynamic::Allocator &allocator) noexcept
        : _epoch_manager(epoch_manager), _allocator(allocator)
    {
    }
    ~ReclaimEpochGarbageTask() noexcept override = default;

    tasking::TaskResult execute(std::uint16_t core_id, std::uint16_t channel_id) override;

private:
    // Manager providing min_local_epoch() and the local garbage queues.
    EpochManager &_epoch_manager;

    // Allocator used to physically free reclaimed resources.
    dynamic::Allocator &_allocator;
};
|
||||
} // namespace mx::memory::reclamation
|
||||
5
src/mx/memory/reclamation/epoch_t.h
Normal file
5
src/mx/memory/reclamation/epoch_t.h
Normal file
@@ -0,0 +1,5 @@
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
namespace mx::memory::reclamation {
// Counter type for the global epoch and the per-channel local epochs used
// by the epoch-based memory reclamation scheme.
using epoch_t = std::uint32_t;
}
|
||||
101
src/mx/memory/tagged_ptr.h
Normal file
101
src/mx/memory/tagged_ptr.h
Normal file
@@ -0,0 +1,101 @@
|
||||
#pragma once
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
|
||||
namespace mx::memory {
|
||||
/**
 * Holds the memory address of an instance of the class T
 * and encodes a 16bit tag within the memory address.
 * Only the lower 48 bits store the pointer (canonical x86-64 address);
 * the remaining 16 bits store an instance of I.
 * The size of the tagged_ptr<T> is equal to T*.
 */
template <class T, typename I> class tagged_ptr
{
public:
    constexpr tagged_ptr() noexcept : _object_pointer(0U)
    {
        // The tag must fill exactly the 16 bits left over by the 48bit pointer.
        static_assert(sizeof(I) == 2U);
        static_assert(sizeof(tagged_ptr) == 8U);
    }

    constexpr explicit tagged_ptr(T *pointer) noexcept : _object_pointer(std::uintptr_t(pointer)) {}

    constexpr explicit tagged_ptr(T *pointer, const I information) noexcept
        : _object_pointer(std::uintptr_t(pointer)), _information(information)
    {
    }

    ~tagged_ptr() noexcept = default;

    /**
     * @return The decoded info.
     */
    inline I info() const noexcept { return _information; }

    /**
     * @return The memory address without the info.
     */
    template <typename S = T> inline S *get() const noexcept { return reinterpret_cast<S *>(_object_pointer); }

    /**
     * Decodes the given info within the pointer.
     *
     * @param information Info to store in the tagged pointer.
     */
    inline void reset(const I information) noexcept { _information = information; }

    /**
     * Replaces the internal pointer by a new one.
     *
     * @param new_pointer Pointer to the new memory object.
     */
    inline void reset(T *new_pointer = nullptr) noexcept { _object_pointer = std::uintptr_t(new_pointer); }

    T *operator->() const noexcept { return get(); }

    explicit operator T *() const noexcept { return get(); }

    explicit operator bool() const noexcept { return _object_pointer > 0U; }

    tagged_ptr<T, I> &operator=(const tagged_ptr<T, I> &other) noexcept = default;

    // Equality compares only the address bits; the tag is ignored.
    bool operator==(const tagged_ptr<T, I> &other) const noexcept { return other._object_pointer == _object_pointer; }

    bool operator==(const T *other) const noexcept { return other == get(); }

    bool operator==(std::nullptr_t) const noexcept { return _object_pointer == 0U; }

    bool operator!=(const tagged_ptr<T, I> &other) const noexcept { return other.get() != get(); }

    bool operator!=(std::nullptr_t) const noexcept { return _object_pointer != 0U; }

    // Relational operators order by address. The previous implementations
    // compared in the wrong direction (a < b evaluated b.get() < a.get());
    // fixed here and const-qualified.
    bool operator<(const tagged_ptr<T, I> &other) const noexcept { return get() < other.get(); }

    bool operator<=(const tagged_ptr<T, I> &other) const noexcept { return get() <= other.get(); }

    bool operator>(const tagged_ptr<T, I> &other) const noexcept { return get() > other.get(); }

    bool operator>=(const tagged_ptr<T, I> &other) const noexcept { return get() >= other.get(); }

private:
    /**
     * Pointer to the instance of T, only 48bit are used.
     */
    std::uintptr_t _object_pointer : 48;

    /**
     * Information stored within this pointer, remaining 16bit are used.
     */
    I _information{};
} __attribute__((packed));
|
||||
} // namespace mx::memory
|
||||
|
||||
namespace std {
|
||||
template <class T, typename I> struct hash<mx::memory::tagged_ptr<T, I>>
|
||||
{
|
||||
std::size_t operator()(const mx::memory::tagged_ptr<T, I> &ptr) const noexcept
|
||||
{
|
||||
return std::hash<T *>().operator()(ptr.get());
|
||||
}
|
||||
};
|
||||
} // namespace std
|
||||
52
src/mx/memory/task_allocator_interface.h
Normal file
52
src/mx/memory/task_allocator_interface.h
Normal file
@@ -0,0 +1,52 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
|
||||
namespace mx::memory {
|
||||
/**
 * Interface for task allocators (e.g. using systems malloc
 * or the internal allocator).
 * The core id presumably identifies the calling worker so implementations
 * can keep per-core free lists — confirm against the scheduler's usage.
 */
class TaskAllocatorInterface
{
public:
    constexpr TaskAllocatorInterface() noexcept = default;
    virtual ~TaskAllocatorInterface() noexcept = default;

    /**
     * Allocates memory for the given core.
     * @param core_id Core to allocate memory for.
     * @return Allocated memory.
     */
    virtual void *allocate(std::uint16_t core_id) = 0;

    /**
     * Frees the memory at the given core.
     * @param core_id Core to store free memory.
     * @param address Address to free.
     */
    virtual void free(std::uint16_t core_id, void *address) noexcept = 0;
};
|
||||
|
||||
/**
|
||||
* Task allocator using the systems (aligned_)malloc/free interface.
|
||||
*/
|
||||
template <std::size_t S> class SystemTaskAllocator final : public TaskAllocatorInterface
|
||||
{
|
||||
public:
|
||||
constexpr SystemTaskAllocator() noexcept = default;
|
||||
virtual ~SystemTaskAllocator() noexcept = default;
|
||||
|
||||
/**
|
||||
* @return Allocated memory using systems malloc (but aligned).
|
||||
*/
|
||||
void *allocate(const std::uint16_t /*core_id*/) override { return std::aligned_alloc(64U, S); }
|
||||
|
||||
/**
|
||||
* Frees the given memory using systems free.
|
||||
* @param address Memory to free.
|
||||
*/
|
||||
void free(const std::uint16_t /*core_id*/, void *address) noexcept override { std::free(address); }
|
||||
};
|
||||
} // namespace mx::memory
|
||||
66
src/mx/resource/builder.cpp
Normal file
66
src/mx/resource/builder.cpp
Normal file
@@ -0,0 +1,66 @@
|
||||
#include "builder.h"
|
||||
#include <mx/synchronization/primitive_matrix.h>
|
||||
|
||||
using namespace mx::resource;
|
||||
|
||||
std::pair<std::uint16_t, std::uint8_t> Builder::schedule(const resource::hint &hint)
{
    // A channel fixed by the hint wins over any scheduling policy.
    if (hint.has_channel_id())
    {
        const auto hinted_channel = hint.channel_id();
        this->_scheduler.predict_usage(hinted_channel, hint.access_frequency());
        return std::make_pair(hinted_channel, this->_scheduler.numa_node_id(hinted_channel));
    }

    // No channel hinted: distribute resources round robin over all channels.
    const auto available_channels = this->_scheduler.count_channels();
    auto selected_channel = this->_round_robin_channel_id.fetch_add(1U, std::memory_order_relaxed) % available_channels;

    // Skip a channel that already hosts an excessively accessed resource, but
    // only when there are enough channels and the resource needs exclusive access.
    const auto needs_exclusive = hint.isolation_level() == synchronization::isolation_level::Exclusive;
    if (available_channels > 2U && needs_exclusive && this->_scheduler.has_excessive_usage_prediction(selected_channel))
    {
        selected_channel = this->_round_robin_channel_id.fetch_add(1U, std::memory_order_relaxed) % available_channels;
    }
    this->_scheduler.predict_usage(selected_channel, hint.access_frequency());

    // Prefer the NUMA node from the hint; fall back to the channel's node.
    if (hint.has_numa_node_id())
    {
        return std::make_pair(selected_channel, hint.numa_node_id());
    }
    return std::make_pair(selected_channel, this->_scheduler.numa_node_id(selected_channel));
}
|
||||
|
||||
mx::synchronization::primitive Builder::isolation_level_to_synchronization_primitive(const hint &hint) noexcept
{
    // The developer did not define any fixed protocol for synchronization;
    // pick one via the primitive matrix, driven by the remaining hints.
    if (hint == synchronization::protocol::None)
    {
        return synchronization::PrimitiveMatrix::select_primitive(hint.isolation_level(), hint.access_frequency(),
                                                                  hint.read_write_ratio());
    }

    // A specific protocol (latched, queued, ...) was hinted together with
    // the relaxed (single writer) isolation level.
    if (hint == synchronization::isolation_level::ExclusiveWriter)
    {
        if (hint == synchronization::protocol::Latch)
        {
            return synchronization::primitive::ReaderWriterLatch;
        }
        if (hint == synchronization::protocol::OLFIT)
        {
            return synchronization::primitive::OLFIT;
        }
        return synchronization::primitive::ScheduleWriter;
    }

    // A specific protocol (latched, queued, ...) was hinted together with
    // the strict isolation level.
    if (hint == synchronization::isolation_level::Exclusive)
    {
        if (hint == synchronization::protocol::Latch)
        {
            return synchronization::primitive::ExclusiveLatch;
        }
        return synchronization::primitive::ScheduleAll;
    }

    return mx::synchronization::primitive::None;
}
|
||||
144
src/mx/resource/builder.h
Normal file
144
src/mx/resource/builder.h
Normal file
@@ -0,0 +1,144 @@
|
||||
#pragma once
|
||||
#include "resource.h"
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <mx/memory/dynamic_size_allocator.h>
|
||||
#include <mx/memory/global_heap.h>
|
||||
#include <mx/tasking/config.h>
|
||||
#include <mx/tasking/scheduler.h>
|
||||
#include <mx/util/aligned_t.h>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
namespace mx::resource {
|
||||
/**
 * The Builder constructs and deletes data objects.
 * Besides, the Builder schedules data objects to
 * channels.
 */
class Builder
{
public:
    Builder(tasking::Scheduler &scheduler, memory::dynamic::Allocator &allocator) noexcept
        : _allocator(allocator), _scheduler(scheduler)
    {
    }

    ~Builder() noexcept = default;

    /**
     * Build a data object of given type with given
     * size and arguments. The hint defines the synchronization
     * requirements and affects scheduling.
     *
     * @param size Size of the data object.
     * @param hint Hint for scheduling and synchronization.
     * @param arguments Arguments to the constructor.
     * @return Tagged pointer holding the synchronization, assigned channel and pointer.
     */
    template <typename T, typename... Args>
    ptr build(const std::size_t size, resource::hint &&hint, Args &&... arguments) noexcept
    {
#ifndef NDEBUG
        // Synchronized resources must derive from ResourceInterface. The only
        // exempt combination is Exclusive isolation with the Queue protocol:
        // the disjunction below is De Morgan of "not (Exclusive and Queue)" —
        // note the two != compare against different hint fields
        // (isolation level vs. protocol).
        if (hint != synchronization::isolation_level::None &&
            (hint != synchronization::isolation_level::Exclusive || hint != synchronization::protocol::Queue))
        {
            if constexpr (std::is_base_of<ResourceInterface, T>::value == false)
            {
                assert(false && "Type must be inherited from mx::resource::ResourceInterface");
            }
        }
#endif

        // Derive the synchronization primitive from the hint.
        const auto synchronization_method = Builder::isolation_level_to_synchronization_primitive(hint);

        // Pick a channel (and its NUMA node) for the new resource.
        const auto [channel_id, numa_node_id] = schedule(hint);
        const auto resource_information = information{channel_id, synchronization_method};

        // Allocate cache-line-aligned memory on the chosen NUMA node and
        // construct the object in place; the tag carries channel + primitive.
        return ptr{new (_allocator.allocate(numa_node_id, 64U, size)) T(std::forward<Args>(arguments)...),
                   resource_information};
    }

    /**
     * Builds data resourced from an existing pointer.
     * The hint defines the synchronization
     * requirements and affects scheduling.
     * @param object Already-constructed object to wrap (not copied).
     * @param hint Hint for scheduling and synchronization.
     * @return Tagged pointer holding the synchronization, assigned channel and pointer.
     */
    template <typename T> ptr build(T *object, resource::hint &&hint) noexcept
    {
#ifndef NDEBUG
        // Same derivation check as the allocating overload above.
        if (hint != synchronization::isolation_level::None &&
            (hint != synchronization::isolation_level::Exclusive || hint != synchronization::protocol::Queue))
        {
            if constexpr (std::is_base_of<ResourceInterface, T>::value == false)
            {
                assert(false && "Type must be inherited from mx::resource::ResourceInterface");
            }
        }
#endif

        const auto synchronization_method = Builder::isolation_level_to_synchronization_primitive(hint);
        const auto [channel_id, _] = schedule(hint);

        return ptr{object, information{channel_id, synchronization_method}};
    }

    /**
     * Destroys the given data object.
     * Optimistically synchronized resources are handed to epoch-based
     * garbage collection (readers may still hold versions); everything
     * else is destructed and freed immediately.
     * @param resource Tagged pointer to the data object.
     */
    template <typename T> void destroy(const ptr resource)
    {
        // TODO: Revoke usage prediction?
        if (resource != nullptr)
        {
            if constexpr (tasking::config::memory_reclamation() != tasking::config::None)
            {
                // Defer the free until the epoch manager proves no reader
                // can still observe the object.
                if (synchronization::is_optimistic(resource.synchronization_primitive()))
                {
                    _scheduler.epoch_manager().add_to_garbage_collection(resource.get<resource::ResourceInterface>(),
                                                                         resource.channel_id());
                    return;
                }
            }

            // No need to reclaim memory.
            resource.get<T>()->~T();
            _allocator.free(resource.get<void>());
        }
    }

private:
    // Internal allocator for dynamic sized allocation.
    memory::dynamic::Allocator &_allocator;

    // Scheduler of MxTasking to get access to channels.
    tasking::Scheduler &_scheduler;

    // Next channel id for round-robin scheduling; cache-line aligned to
    // avoid false sharing with the references above.
    alignas(64) std::atomic_uint16_t _round_robin_channel_id{0U};

    /**
     * Schedules the resource to a channel, affected by the given hint.
     *
     * @param hint Hint for scheduling.
     * @return Pair of Channel and NUMA node IDs.
     */
    std::pair<std::uint16_t, std::uint8_t> schedule(const resource::hint &hint);

    /**
     * Determines the best synchronization method based on
     * synchronization requirement.
     *
     * @param hint Hint carrying isolation level, protocol and access pattern.
     * @return Chosen synchronization method.
     */
    static synchronization::primitive isolation_level_to_synchronization_primitive(const hint &hint) noexcept;
};
|
||||
} // namespace mx::resource
|
||||
223
src/mx/resource/resource.h
Normal file
223
src/mx/resource/resource.h
Normal file
@@ -0,0 +1,223 @@
|
||||
#pragma once
|
||||
|
||||
#include "resource_interface.h"
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <mx/memory/alignment_helper.h>
|
||||
#include <mx/memory/tagged_ptr.h>
|
||||
#include <mx/synchronization/synchronization.h>
|
||||
#include <mx/util/random.h>
|
||||
#include <new>
|
||||
|
||||
namespace mx::resource {
|
||||
/**
 * Hint for creating resources by the resource interface.
 * Encapsulates the requested numa region, synchronization requirements
 * and expected access frequency.
 */
class hint
{
public:
    // Predicted access frequency; consumed by the primitive matrix and by
    // the scheduler's usage prediction.
    enum expected_access_frequency : std::uint8_t
    {
        excessive = 0U,
        high = 1U,
        normal = 2U,
        unused = 3U,
    };

    // Predicted read/write ratio; consumed by the primitive matrix.
    enum expected_read_write_ratio : std::uint8_t
    {
        heavy_read = 0U,
        mostly_read = 1U,
        balanced = 2U,
        mostly_written = 3U,
        heavy_written = 4U
    };

    // The constructors cover the useful hint combinations; fields that are
    // not given keep their "no preference" defaults (see members below).
    constexpr explicit hint(const std::uint8_t node_id) noexcept : _numa_node_id(node_id) {}
    constexpr explicit hint(const std::uint16_t channel_id) noexcept : _channel_id(channel_id) {}
    constexpr explicit hint(const synchronization::isolation_level isolation_level) noexcept
        : _isolation_level(isolation_level)
    {
    }
    constexpr explicit hint(const expected_access_frequency access_frequency) noexcept
        : _access_frequency(access_frequency)
    {
    }
    constexpr hint(const std::uint16_t channel_id, const synchronization::isolation_level isolation_level) noexcept
        : _channel_id(channel_id), _isolation_level(isolation_level)
    {
    }
    constexpr hint(const std::uint8_t node_id, const synchronization::isolation_level isolation_level) noexcept
        : _numa_node_id(node_id), _isolation_level(isolation_level)
    {
    }
    constexpr hint(const std::uint8_t node_id, const synchronization::isolation_level isolation_level,
                   const synchronization::protocol preferred_protocol) noexcept
        : _numa_node_id(node_id), _isolation_level(isolation_level), _preferred_protocol(preferred_protocol)
    {
    }

    constexpr hint(const std::uint16_t channel_id, const synchronization::isolation_level isolation_level,
                   const synchronization::protocol preferred_protocol) noexcept
        : _channel_id(channel_id), _isolation_level(isolation_level), _preferred_protocol(preferred_protocol)
    {
    }

    constexpr hint(const std::uint8_t node_id, const expected_access_frequency access_frequency) noexcept
        : _numa_node_id(node_id), _access_frequency(access_frequency)
    {
    }
    constexpr hint(const synchronization::isolation_level isolation_level,
                   const expected_access_frequency access_frequency) noexcept
        : _access_frequency(access_frequency), _isolation_level(isolation_level)
    {
    }
    constexpr hint(const synchronization::isolation_level isolation_level,
                   const synchronization::protocol preferred_protocol,
                   const expected_access_frequency access_frequency) noexcept
        : _access_frequency(access_frequency), _isolation_level(isolation_level),
          _preferred_protocol(preferred_protocol)
    {
    }
    constexpr hint(const synchronization::isolation_level isolation_level,
                   const synchronization::protocol preferred_protocol, const expected_access_frequency access_frequency,
                   const expected_read_write_ratio read_write_ratio) noexcept
        : _access_frequency(access_frequency), _read_write_ratio(read_write_ratio), _isolation_level(isolation_level),
          _preferred_protocol(preferred_protocol)
    {
    }
    constexpr hint(const std::uint8_t node_id, const synchronization::isolation_level isolation_level,
                   const expected_access_frequency access_frequency) noexcept
        : _numa_node_id(node_id), _access_frequency(access_frequency), _isolation_level(isolation_level)
    {
    }
    constexpr hint(const std::uint8_t node_id, const synchronization::isolation_level isolation_level,
                   const synchronization::protocol preferred_protocol,
                   const expected_access_frequency access_frequency) noexcept
        : _numa_node_id(node_id), _access_frequency(access_frequency), _isolation_level(isolation_level),
          _preferred_protocol(preferred_protocol)
    {
    }

    // Members are const, so hints are move-only in name: the defaulted move
    // copies the values.
    constexpr hint(hint &&) noexcept = default;

    ~hint() = default;

    // max() serves as the "no preference" sentinel for the NUMA node.
    [[nodiscard]] bool has_numa_node_id() const noexcept
    {
        return _numa_node_id < std::numeric_limits<std::uint8_t>::max();
    }
    [[nodiscard]] std::uint8_t numa_node_id() const noexcept { return _numa_node_id; }

    // max() serves as the "no preference" sentinel for the channel.
    [[nodiscard]] bool has_channel_id() const noexcept
    {
        return _channel_id < std::numeric_limits<std::uint16_t>::max();
    }
    [[nodiscard]] std::uint16_t channel_id() const noexcept { return _channel_id; }
    [[nodiscard]] expected_access_frequency access_frequency() const noexcept { return _access_frequency; }
    [[nodiscard]] expected_read_write_ratio read_write_ratio() const noexcept { return _read_write_ratio; }
    [[nodiscard]] synchronization::isolation_level isolation_level() const noexcept { return _isolation_level; }
    [[nodiscard]] synchronization::protocol preferred_protocol() const noexcept { return _preferred_protocol; }

    // Comparison against an isolation level compares ONLY the isolation
    // level field; comparison against a protocol compares ONLY the
    // preferred protocol field (see Builder's debug check).
    bool operator==(const synchronization::isolation_level isolation_level) const noexcept
    {
        return _isolation_level == isolation_level;
    }

    bool operator!=(const synchronization::isolation_level isolation_level) const noexcept
    {
        return _isolation_level != isolation_level;
    }

    bool operator==(const synchronization::protocol protocol) const noexcept { return _preferred_protocol == protocol; }

    bool operator!=(const synchronization::protocol protocol) const noexcept { return _preferred_protocol != protocol; }

private:
    hint() = default;

    // Preferred NUMA region; no preference by default.
    const std::uint8_t _numa_node_id{std::numeric_limits<std::uint8_t>::max()};

    // Preferred channel; no preference by default.
    const std::uint16_t _channel_id{std::numeric_limits<std::uint16_t>::max()};

    // Expected access frequency; normal by default.
    const enum expected_access_frequency _access_frequency { expected_access_frequency::normal };

    // Expected read/write ratio; normal by default.
    const expected_read_write_ratio _read_write_ratio{expected_read_write_ratio::balanced};

    // Preferred isolation level; no synchronization by default.
    const synchronization::isolation_level _isolation_level{synchronization::isolation_level::None};

    // Preferred synchronization protocol (queue, latch, ...); no synchronization by default.
    const synchronization::protocol _preferred_protocol{synchronization::protocol::None};
};
|
||||
|
||||
/**
 * Information of a resource, stored within
 * the pointer to the resource.
 * Packed into 16 bits so it fits the tag of a tagged_ptr.
 */
class information
{
public:
    constexpr information() noexcept : _channel_id(0U), _synchronization_primitive(0U) {}
    explicit information(const std::uint16_t channel_id,
                         const synchronization::primitive synchronization_primitive) noexcept
        : _channel_id(channel_id), _synchronization_primitive(static_cast<std::uint16_t>(synchronization_primitive))
    {
    }

    ~information() = default;

    [[nodiscard]] std::uint16_t channel_id() const noexcept { return _channel_id; }
    [[nodiscard]] synchronization::primitive synchronization_primitive() const noexcept
    {
        return static_cast<synchronization::primitive>(_synchronization_primitive);
    }

    information &operator=(const information &other) = default;

private:
    // 12 bits for the channel (assumes < 4096 channels — TODO confirm against
    // tasking::config::max_cores()); 4 bits for the primitive (assumes < 16
    // enumerators in synchronization::primitive — TODO confirm).
    std::uint16_t _channel_id : 12;
    std::uint16_t _synchronization_primitive : 4;
} __attribute__((packed));
|
||||
|
||||
/**
 * Pointer to a resource, stores information about
 * that resource (channel and synchronization primitive)
 * in the 16bit tag of a tagged_ptr.
 */
class ptr final : public memory::tagged_ptr<void, information>
{
public:
    constexpr ptr() noexcept = default;
    // Wraps a raw object address together with its resource information.
    explicit ptr(void *ptr_, const information info = {}) noexcept : memory::tagged_ptr<void, information>(ptr_, info)
    {
    }
    ~ptr() = default;

    ptr &operator=(const ptr &other) noexcept = default;

    // Channel the resource was scheduled to (decoded from the tag).
    [[nodiscard]] std::uint16_t channel_id() const noexcept { return info().channel_id(); }
    // Synchronization primitive chosen for the resource (decoded from the tag).
    [[nodiscard]] synchronization::primitive synchronization_primitive() const noexcept
    {
        return info().synchronization_primitive();
    }
} __attribute__((packed));
|
||||
|
||||
/**
 * Casts the internal pointer of the resource pointer
 * to a pointer typed by the given template parameter.
 *
 * @param resource Resource to cast.
 * @return Pointer to the requested type.
 */
template <typename S> static auto *ptr_cast(const ptr resource) noexcept
{
    // Delegates to tagged_ptr::get<S>(), which strips the 16bit tag.
    return resource.template get<S>();
}
|
||||
|
||||
} // namespace mx::resource
|
||||
154
src/mx/resource/resource_interface.h
Normal file
154
src/mx/resource/resource_interface.h
Normal file
@@ -0,0 +1,154 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <mx/memory/reclamation/epoch_t.h>
|
||||
#include <mx/synchronization/optimistic_lock.h>
|
||||
#include <mx/synchronization/rw_spinlock.h>
|
||||
#include <mx/synchronization/spinlock.h>
|
||||
|
||||
namespace mx::resource {
|
||||
/**
 * The resource interface represents resources that
 * needs to be synchronized by the tasking engine.
 * Supported synchronizations are:
 *  - Latches (Spinlock, R/W-lock)
 *  - Optimistic latches + memory reclamation
 */
class ResourceInterface
{
public:
    // Latch flavor, selected at compile time by scoped_latch<T>.
    enum SynchronizationType : std::uint8_t
    {
        Exclusive,
        SharedRead,
        SharedWrite,
        Optimistic,
        OLFIT,
    };

    constexpr ResourceInterface() noexcept = default;
    // Resources are address-identified (latches, garbage links);
    // copying or moving them is forbidden.
    ResourceInterface(const ResourceInterface &) = delete;
    ResourceInterface(ResourceInterface &&) = delete;
    virtual ~ResourceInterface() = default;

    /**
     * Called by the epoch manager on safe reclaiming this resource.
     */
    virtual void on_reclaim() = 0;

    /**
     * Set the next resource in garbage list.
     * @param next Next resource in garbage list.
     */
    void next(ResourceInterface *next) noexcept { _next_garbage = next; }

    /**
     * @return Next resource in garbage list.
     */
    [[nodiscard]] ResourceInterface *next() const noexcept { return _next_garbage; }

    /**
     * @return The current version of the resource
     *         (blocks until the optimistic latch is unlocked).
     */
    [[nodiscard]] synchronization::OptimisticLock::version_t version() const noexcept
    {
        return _optimistic_latch.read_valid();
    }

    /**
     * Checks whether the given version is still valid.
     *
     * @param version Version to check.
     * @return True, when the version is valid.
     */
    [[nodiscard]] bool is_version_valid(const synchronization::OptimisticLock::version_t version) const noexcept
    {
        return _optimistic_latch.is_valid(version);
    }

    /**
     * Tries to acquire the optimistic latch.
     * @return True, when latch was acquired.
     */
    [[nodiscard]] bool try_acquire_optimistic_latch() noexcept { return _optimistic_latch.try_lock(); }

    /**
     * Set the epoch-timestamp this resource was removed.
     * @param epoch Epoch where this resource was removed.
     */
    void remove_epoch(const memory::reclamation::epoch_t epoch) noexcept { _remove_epoch = epoch; }

    /**
     * @return The epoch this resource was removed.
     */
    [[nodiscard]] memory::reclamation::epoch_t remove_epoch() const noexcept { return _remove_epoch; }

    /**
     * RAII guard: acquires the latch selected by T on construction
     * and releases it on destruction.
     */
    template <SynchronizationType T> class scoped_latch
    {
    public:
        constexpr inline explicit scoped_latch(ResourceInterface *resource) noexcept : _resource(resource)
        {
            if constexpr (T == SynchronizationType::Exclusive)
            {
                _resource->_exclusive_latch.lock();
            }
            else if constexpr (T == SynchronizationType::SharedRead)
            {
                _resource->_rw_latch.lock_shared();
            }
            else if constexpr (T == SynchronizationType::SharedWrite)
            {
                _resource->_rw_latch.lock();
            }
            else if constexpr (T == SynchronizationType::Optimistic)
            {
                // lock<true>: optimistic lock in single-writer mode.
                _resource->_optimistic_latch.lock<true>();
            }
            else if constexpr (T == SynchronizationType::OLFIT)
            {
                // lock<false>: optimistic lock with CAS loop for contending writers.
                _resource->_optimistic_latch.lock<false>();
            }
        }

        inline ~scoped_latch() noexcept
        {
            if constexpr (T == SynchronizationType::Exclusive)
            {
                _resource->_exclusive_latch.unlock();
            }
            else if constexpr (T == SynchronizationType::SharedRead)
            {
                _resource->_rw_latch.unlock_shared();
            }
            else if constexpr (T == SynchronizationType::SharedWrite)
            {
                _resource->_rw_latch.unlock();
            }
            else if constexpr (T == SynchronizationType::Optimistic || T == SynchronizationType::OLFIT)
            {
                _resource->_optimistic_latch.unlock();
            }
        }

    private:
        ResourceInterface *_resource;
    };

    // Convenience aliases for the latch guards.
    using scoped_exclusive_latch = scoped_latch<SynchronizationType::Exclusive>;
    using scoped_optimistic_latch = scoped_latch<SynchronizationType::Optimistic>;
    using scoped_olfit_latch = scoped_latch<SynchronizationType::OLFIT>;
    template <bool WRITER>
    using scoped_rw_latch = scoped_latch<WRITER ? SynchronizationType::SharedWrite : SynchronizationType::SharedRead>;

private:
    // Encapsulated synchronization primitives.
    synchronization::Spinlock _exclusive_latch;
    synchronization::RWSpinLock _rw_latch;
    synchronization::OptimisticLock _optimistic_latch;

    // Epoch and Garbage management.
    memory::reclamation::epoch_t _remove_epoch{0U};
    ResourceInterface *_next_garbage{nullptr};
};
|
||||
} // namespace mx::resource
|
||||
89
src/mx/synchronization/optimistic_lock.h
Normal file
89
src/mx/synchronization/optimistic_lock.h
Normal file
@@ -0,0 +1,89 @@
|
||||
#pragma once
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <mx/system/builtin.h>
|
||||
#include <mx/tasking/config.h>
|
||||
|
||||
namespace mx::synchronization {
|
||||
/**
 * Version-based optimistic lock. Bit 0b10 of the version word is the lock
 * flag: every lock() and every unlock() adds 0b10, so the flag toggles and
 * the version advances by 0b100 per completed write. Readers snapshot a
 * version via read_valid() and re-check it with is_valid().
 */
class OptimisticLock
{
public:
    using version_t = std::uint32_t;

    constexpr OptimisticLock() = default;
    ~OptimisticLock() = default;

    /**
     * Guarantees to read a valid version by blocking until
     * the version is not locked.
     * @return The current version.
     */
    [[nodiscard]] version_t read_valid() const noexcept
    {
        auto version = _version.load(std::memory_order_seq_cst);
        while (OptimisticLock::is_locked(version))
        {
            // Spin politely until the writer publishes a new version.
            system::builtin::pause();
            version = _version.load(std::memory_order_seq_cst);
        }
        return version;
    }

    /**
     * Validates the version.
     *
     * @param version The version to validate.
     * @return True, if the version is valid.
     */
    [[nodiscard]] bool is_valid(const version_t version) const noexcept
    {
        return version == _version.load(std::memory_order_seq_cst);
    }

    /**
     * Tries to acquire the lock. Succeeds only if no writer intervened
     * between reading the unlocked version and the CAS.
     * @return True, when lock was acquired.
     */
    [[nodiscard]] bool try_lock() noexcept
    {
        auto version = read_valid();

        // Adding 0b10 sets the lock flag on an unlocked version.
        return _version.compare_exchange_strong(version, version + 0b10);
    }

    /**
     * Waits until the lock is successfully acquired.
     * SINGLE_WRITER == true assumes this thread is the only writer and
     * sets the lock flag unconditionally; otherwise a CAS loop with a
     * linearly growing pause backoff is used.
     */
    template <bool SINGLE_WRITER> void lock() noexcept
    {
        if constexpr (SINGLE_WRITER)
        {
            _version.fetch_add(0b10, std::memory_order_seq_cst);
        }
        else
        {
            auto tries = std::uint64_t{1U};
            while (this->try_lock() == false)
            {
                // Back off longer after each failed attempt to reduce contention.
                const auto wait = tries++;
                for (auto i = 0U; i < wait * 32U; ++i)
                {
                    system::builtin::pause();
                    std::atomic_thread_fence(std::memory_order_seq_cst);
                }
            }
        }
    }

    /**
     * Unlocks the version lock: clears the lock flag and advances the
     * version in a single addition.
     */
    void unlock() noexcept { _version.fetch_add(0b10, std::memory_order_seq_cst); }

private:
    // Starts at 0b100: lock flag clear (unlocked) with a non-zero version.
    std::atomic<version_t> _version{0b100};

    // A version is locked while bit 0b10 is set.
    [[nodiscard]] static bool is_locked(const version_t version) noexcept { return (version & 0b10) == 0b10; }
};
|
||||
} // namespace mx::synchronization
|
||||
68
src/mx/synchronization/primitive_matrix.h
Normal file
68
src/mx/synchronization/primitive_matrix.h
Normal file
@@ -0,0 +1,68 @@
|
||||
#pragma once
#include "synchronization.h"
#include <algorithm>
#include <array>
#include <cstdint>
#include <mx/resource/resource.h>
|
||||
|
||||
namespace mx::synchronization {
|
||||
/**
 * Static decision table that maps a resource's isolation level and its
 * access hints to the concrete synchronization primitive to use.
 */
class PrimitiveMatrix
{
public:
    /**
     * Selects the synchronization primitive for a resource.
     *
     * @param isolation_level Desired isolation level; isolation_level::None
     *        short-circuits to primitive::None without a table lookup.
     * @param access_frequency Predicted access frequency of the resource.
     * @param read_write_ratio Predicted ratio between reads and writes.
     * @return Primitive chosen from the matrix.
     */
    static primitive select_primitive(const isolation_level isolation_level,
                                      const resource::hint::expected_access_frequency access_frequency,
                                      const resource::hint::expected_read_write_ratio read_write_ratio) noexcept
    {
        // Index order matters: [isolation level][read/write ratio][access frequency].
        return isolation_level != isolation_level::None
                   ? matrix()[static_cast<std::uint8_t>(isolation_level)][static_cast<std::uint8_t>(read_write_ratio)]
                             [static_cast<std::uint8_t>(access_frequency)]
                   : primitive::None;
    }

private:
    // Table dimensions: 2 isolation levels x 5 read/write ratios x 4 access
    // frequencies. Rows/columns must stay in sync with the enum values of
    // isolation_level, expected_read_write_ratio and expected_access_frequency.
    constexpr static std::array<std::array<std::array<primitive, 4>, 5>, 2> matrix() noexcept
    {
        return {{// For isolation_level::ExclusiveWriter
                 {{
                     // For predicted_read_write_ratio::heavy_read
                     {{primitive::ScheduleWriter, primitive::ScheduleWriter, primitive::ScheduleWriter,
                       primitive::ScheduleWriter}},

                     // For predicted_read_write_ratio::mostly_read
                     {{primitive::ScheduleWriter, primitive::ScheduleWriter, primitive::OLFIT, primitive::OLFIT}},

                     // For predicted_read_write_ratio::balanced
                     {{primitive::OLFIT, primitive::OLFIT, primitive::OLFIT, primitive::OLFIT}},

                     // For predicted_read_write_ratio::mostly_written
                     {{primitive::OLFIT, primitive::OLFIT, primitive::ReaderWriterLatch, primitive::ReaderWriterLatch}},

                     // For predicted_read_write_ratio::heavy_written
                     {{primitive::ScheduleAll, primitive::ScheduleAll, primitive::ReaderWriterLatch,
                       primitive::ReaderWriterLatch}},
                 }},

                 // For isolation_level::Exclusive
                 {{
                     // For predicted_read_write_ratio::heavy_read
                     {{primitive::ScheduleAll, primitive::ScheduleAll, primitive::ExclusiveLatch,
                       primitive::ExclusiveLatch}},

                     // For predicted_read_write_ratio::mostly_read
                     {{primitive::ScheduleAll, primitive::ScheduleAll, primitive::ExclusiveLatch,
                       primitive::ExclusiveLatch}},

                     // For predicted_read_write_ratio::balanced
                     {{primitive::ScheduleAll, primitive::ScheduleAll, primitive::ExclusiveLatch,
                       primitive::ExclusiveLatch}},

                     // For predicted_read_write_ratio::mostly_written
                     {{primitive::ScheduleAll, primitive::ScheduleAll, primitive::ExclusiveLatch,
                       primitive::ExclusiveLatch}},

                     // For predicted_read_write_ratio::heavy_written
                     {{primitive::ScheduleAll, primitive::ScheduleAll, primitive::ExclusiveLatch,
                       primitive::ExclusiveLatch}},
                 }}}};
    }
};
|
||||
} // namespace mx::synchronization
|
||||
292
src/mx/synchronization/rw_spinlock.h
Normal file
292
src/mx/synchronization/rw_spinlock.h
Normal file
@@ -0,0 +1,292 @@
|
||||
/*
|
||||
* Copyright (c) Facebook, Inc. and its affiliates.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
|
||||
* N.B. You most likely do _not_ want to use RWSpinLock or any other
|
||||
* kind of spinlock. Use SharedMutex instead.
|
||||
*
|
||||
* In short, spinlocks in preemptive multi-tasking operating systems
|
||||
* have serious problems and fast mutexes like SharedMutex are almost
|
||||
* certainly the better choice, because letting the OS scheduler put a
|
||||
* thread to sleep is better for system responsiveness and throughput
|
||||
* than wasting a timeslice repeatedly querying a lock held by a
|
||||
* thread that's blocked, and you can't prevent userspace
|
||||
* programs blocking.
|
||||
*
|
||||
* Spinlocks in an operating system kernel make much more sense than
|
||||
* they do in userspace.
|
||||
*
|
||||
* -------------------------------------------------------------------
|
||||
*
|
||||
* Two Read-Write spin lock implementations.
|
||||
*
|
||||
* Ref: http://locklessinc.com/articles/locks
|
||||
*
|
||||
* Both locks here are faster than pthread_rwlock and have very low
|
||||
* overhead (usually 20-30ns). They don't use any system mutexes and
|
||||
* are very compact (4/8 bytes), so are suitable for per-instance
|
||||
* based locking, particularly when contention is not expected.
|
||||
*
|
||||
* For a spinlock, RWSpinLock is a reasonable choice. (See the note
|
||||
* about for why a spin lock is frequently a bad idea generally.)
|
||||
* RWSpinLock has minimal overhead, and comparable contention
|
||||
* performance when the number of competing threads is less than or
|
||||
* equal to the number of logical CPUs. Even as the number of
|
||||
* threads gets larger, RWSpinLock can still be very competitive in
|
||||
* READ, although it is slower on WRITE, and also inherently unfair
|
||||
* to writers.
|
||||
*
|
||||
* RWTicketSpinLock shows more balanced READ/WRITE performance. If
|
||||
* your application really needs a lot more threads, and a
|
||||
* higher-priority writer, prefer one of the RWTicketSpinLock locks.
|
||||
*
|
||||
* Caveats:
|
||||
*
|
||||
* RWTicketSpinLock locks can only be used with GCC on x86/x86-64
|
||||
* based systems.
|
||||
*
|
||||
* RWTicketSpinLock<32> only allows up to 2^8 - 1 concurrent
|
||||
* readers and writers.
|
||||
*
|
||||
* RWTicketSpinLock<64> only allows up to 2^16 - 1 concurrent
|
||||
* readers and writers.
|
||||
*
|
||||
* RWTicketSpinLock<..., true> (kFavorWriter = true, that is, strict
|
||||
* writer priority) is NOT reentrant, even for lock_shared().
|
||||
*
|
||||
* The lock will not grant any new shared (read) accesses while a thread
|
||||
* attempting to acquire the lock in write mode is blocked. (That is,
|
||||
* if the lock is held in shared mode by N threads, and a thread attempts
|
||||
* to acquire it in write mode, no one else can acquire it in shared mode
|
||||
* until these N threads release the lock and then the blocked thread
|
||||
* acquires and releases the exclusive lock.) This also applies for
|
||||
* attempts to reacquire the lock in shared mode by threads that already
|
||||
* hold it in shared mode, making the lock non-reentrant.
|
||||
*
|
||||
* RWSpinLock handles 2^30 - 1 concurrent readers.
|
||||
*
|
||||
* @author Xin Liu <xliux@fb.com>
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
/*
|
||||
========================================================================
|
||||
Benchmark on (Intel(R) Xeon(R) CPU L5630 @ 2.13GHz) 8 cores(16 HTs)
|
||||
========================================================================
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
1. Single thread benchmark (read/write lock + unlock overhead)
|
||||
Benchmark Iters Total t t/iter iter/sec
|
||||
-------------------------------------------------------------------------------
|
||||
* BM_RWSpinLockRead 100000 1.786 ms 17.86 ns 53.4M
|
||||
+30.5% BM_RWSpinLockWrite 100000 2.331 ms 23.31 ns 40.91M
|
||||
+85.7% BM_RWTicketSpinLock32Read 100000 3.317 ms 33.17 ns 28.75M
|
||||
+96.0% BM_RWTicketSpinLock32Write 100000 3.5 ms 35 ns 27.25M
|
||||
+85.6% BM_RWTicketSpinLock64Read 100000 3.315 ms 33.15 ns 28.77M
|
||||
+96.0% BM_RWTicketSpinLock64Write 100000 3.5 ms 35 ns 27.25M
|
||||
+85.7% BM_RWTicketSpinLock32FavorWriterRead 100000 3.317 ms 33.17 ns 28.75M
|
||||
+29.7% BM_RWTicketSpinLock32FavorWriterWrite 100000 2.316 ms 23.16 ns 41.18M
|
||||
+85.3% BM_RWTicketSpinLock64FavorWriterRead 100000 3.309 ms 33.09 ns 28.82M
|
||||
+30.2% BM_RWTicketSpinLock64FavorWriterWrite 100000 2.325 ms 23.25 ns 41.02M
|
||||
+ 175% BM_PThreadRWMutexRead 100000 4.917 ms 49.17 ns 19.4M
|
||||
+ 166% BM_PThreadRWMutexWrite 100000 4.757 ms 47.57 ns 20.05M
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
2. Contention Benchmark 90% read 10% write
|
||||
Benchmark hits average min max sigma
|
||||
------------------------------------------------------------------------------
|
||||
---------- 8 threads ------------
|
||||
RWSpinLock Write 142666 220ns 78ns 40.8us 269ns
|
||||
RWSpinLock Read 1282297 222ns 80ns 37.7us 248ns
|
||||
RWTicketSpinLock Write 85692 209ns 71ns 17.9us 252ns
|
||||
RWTicketSpinLock Read 769571 215ns 78ns 33.4us 251ns
|
||||
pthread_rwlock_t Write 84248 2.48us 99ns 269us 8.19us
|
||||
pthread_rwlock_t Read 761646 933ns 101ns 374us 3.25us
|
||||
|
||||
---------- 16 threads ------------
|
||||
RWSpinLock Write 124236 237ns 78ns 261us 801ns
|
||||
RWSpinLock Read 1115807 236ns 78ns 2.27ms 2.17us
|
||||
RWTicketSpinLock Write 81781 231ns 71ns 31.4us 351ns
|
||||
RWTicketSpinLock Read 734518 238ns 78ns 73.6us 379ns
|
||||
pthread_rwlock_t Write 83363 7.12us 99ns 785us 28.1us
|
||||
pthread_rwlock_t Read 754978 2.18us 101ns 1.02ms 14.3us
|
||||
|
||||
---------- 50 threads ------------
|
||||
RWSpinLock Write 131142 1.37us 82ns 7.53ms 68.2us
|
||||
RWSpinLock Read 1181240 262ns 78ns 6.62ms 12.7us
|
||||
RWTicketSpinLock Write 83045 397ns 73ns 7.01ms 31.5us
|
||||
RWTicketSpinLock Read 744133 386ns 78ns 11ms 31.4us
|
||||
pthread_rwlock_t Write 80849 112us 103ns 4.52ms 263us
|
||||
pthread_rwlock_t Read 728698 24us 101ns 7.28ms 194us
|
||||
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <mx/system/builtin.h>
|
||||
#include <thread>
|
||||
|
||||
namespace mx::synchronization {
|
||||
|
||||
/*
|
||||
* A simple, small (4-bytes), but unfair rwlock. Use it when you want
|
||||
* a nice writer and don't expect a lot of write/read contention, or
|
||||
* when you need small rwlocks since you are creating a large number
|
||||
* of them.
|
||||
*
|
||||
* Note that the unfairness here is extreme: if the lock is
|
||||
* continually accessed for read, writers will never get a chance. If
|
||||
* the lock can be that highly contended this class is probably not an
|
||||
* ideal choice anyway.
|
||||
*
|
||||
* It currently implements most of the Lockable, SharedLockable and
|
||||
* UpgradeLockable concepts except the TimedLockable related locking/unlocking
|
||||
* interfaces.
|
||||
*/
|
||||
class RWSpinLock
{
    // Lock word layout (one 32-bit atomic): bit 0 = WRITER, bit 1 = UPGRADED
    // (writer intent), and every reader adds READER (4), so the reader count
    // occupies bits 2..31 -- up to 2^30 - 1 concurrent readers.
    enum : int32_t
    {
        READER = 4,
        UPGRADED = 2,
        WRITER = 1
    };

public:
    constexpr RWSpinLock() : bits_(0) {}

    RWSpinLock(RWSpinLock const &) = delete;
    RWSpinLock &operator=(RWSpinLock const &) = delete;

    // Lockable Concept
    // Acquire exclusive (write) ownership, spinning until the lock word is 0.
    void lock() noexcept
    {
        while (!try_lock())
        {
            mx::system::builtin::pause();
        }
    }

    // Writer is responsible for clearing up both the UPGRADED and WRITER bits.
    void unlock() noexcept
    {
        static_assert(READER > WRITER + UPGRADED, "wrong bits!");
        bits_.fetch_and(~(WRITER | UPGRADED), std::memory_order_release);
    }

    // SharedLockable Concept
    // Acquire shared (read) ownership; spins while a writer or upgrader is active.
    void lock_shared() noexcept
    {
        while (!try_lock_shared())
        {
            mx::system::builtin::pause();
        }
    }

    void unlock_shared() noexcept { bits_.fetch_add(-READER, std::memory_order_release); }

    // Downgrade the lock from writer status to reader status.
    void unlock_and_lock_shared() noexcept
    {
        // Add the reader first, then drop WRITER|UPGRADED, so no other writer
        // can sneak in between the two steps.
        bits_.fetch_add(READER, std::memory_order_acquire);
        unlock();
    }

    // UpgradeLockable Concept
    void lock_upgrade() noexcept
    {
        while (!try_lock_upgrade())
        {
            system::builtin::pause();
        }
    }

    void unlock_upgrade() noexcept { bits_.fetch_add(-UPGRADED, std::memory_order_acq_rel); }

    // unlock upgrade and try to acquire write lock
    void unlock_upgrade_and_lock() noexcept
    {
        // Spins until all readers have drained (word == UPGRADED).
        while (!try_unlock_upgrade_and_lock())
        {
            system::builtin::pause();
        }
    }

    // unlock upgrade and read lock atomically
    void unlock_upgrade_and_lock_shared() noexcept { bits_.fetch_add(READER - UPGRADED, std::memory_order_acq_rel); }

    // write unlock and upgrade lock atomically
    void unlock_and_lock_upgrade() noexcept
    {
        // need to do it in two steps here -- as the UPGRADED bit might be OR-ed at
        // the same time when other threads are trying do try_lock_upgrade().
        bits_.fetch_or(UPGRADED, std::memory_order_acquire);
        bits_.fetch_add(-WRITER, std::memory_order_release);
    }

    // Attempt to acquire writer permission. Return false if we didn't get it.
    bool try_lock() noexcept
    {
        // Succeeds only when no reader, upgrader or writer is present.
        int32_t expect = 0;
        return bits_.compare_exchange_strong(expect, WRITER, std::memory_order_acq_rel);
    }

    // Try to get reader permission on the lock. This can fail if we
    // find out someone is a writer or upgrader.
    // Setting the UPGRADED bit would allow a writer-to-be to indicate
    // its intention to write and block any new readers while waiting
    // for existing readers to finish and release their read locks. This
    // helps avoid starving writers (promoted from upgraders).
    bool try_lock_shared() noexcept
    {
        // fetch_add is considerably (100%) faster than compare_exchange,
        // so here we are optimizing for the common (lock success) case.
        int32_t value = bits_.fetch_add(READER, std::memory_order_acquire);
        if (value & (WRITER | UPGRADED))
        {
            // Optimistic increment lost: roll the reader count back.
            bits_.fetch_add(-READER, std::memory_order_release);
            return false;
        }
        return true;
    }

    // try to unlock upgrade and write lock atomically
    bool try_unlock_upgrade_and_lock() noexcept
    {
        int32_t expect = UPGRADED;
        return bits_.compare_exchange_strong(expect, WRITER, std::memory_order_acq_rel);
    }

    // try to acquire an upgradable lock.
    bool try_lock_upgrade() noexcept
    {
        int32_t value = bits_.fetch_or(UPGRADED, std::memory_order_acquire);

        // Note: when failed, we cannot flip the UPGRADED bit back,
        // as in this case there is either another upgrade lock or a write lock.
        // If it's a write lock, the bit will get cleared up when that lock's done
        // with unlock().
        return ((value & (UPGRADED | WRITER)) == 0);
    }

    // mainly for debugging purposes.
    [[nodiscard]] int32_t bits() const noexcept { return bits_.load(std::memory_order_acquire); }

private:
    std::atomic<int32_t> bits_;
};
|
||||
} // namespace mx::synchronization
|
||||
59
src/mx/synchronization/spinlock.h
Normal file
59
src/mx/synchronization/spinlock.h
Normal file
@@ -0,0 +1,59 @@
|
||||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <mx/system/builtin.h>
|
||||
|
||||
namespace mx::synchronization {
|
||||
/**
|
||||
* Simple spinlock for mutual exclusion.
|
||||
*/
|
||||
class Spinlock
|
||||
{
|
||||
public:
|
||||
constexpr Spinlock() noexcept = default;
|
||||
~Spinlock() = default;
|
||||
|
||||
/**
|
||||
* Locks the spinlock by spinning until it is lockable.
|
||||
*/
|
||||
void lock() noexcept
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
while (_flag.load(std::memory_order_relaxed))
|
||||
{
|
||||
system::builtin::pause();
|
||||
}
|
||||
|
||||
if (try_lock())
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to lock the lock.
|
||||
* @return True, when successfully locked.
|
||||
*/
|
||||
bool try_lock() noexcept
|
||||
{
|
||||
bool expected = false;
|
||||
return _flag.compare_exchange_weak(expected, true, std::memory_order_acquire);
|
||||
}
|
||||
|
||||
/**
|
||||
* Unlocks the spinlock.
|
||||
*/
|
||||
void unlock() noexcept { _flag.store(false, std::memory_order_acquire); }
|
||||
|
||||
/**
|
||||
* @return True, if the lock is in use.
|
||||
*/
|
||||
[[nodiscard]] bool is_locked() const noexcept { return _flag.load(std::memory_order_relaxed); }
|
||||
|
||||
private:
|
||||
std::atomic_bool _flag{false};
|
||||
};
|
||||
} // namespace mx::synchronization
|
||||
57
src/mx/synchronization/synchronization.h
Normal file
57
src/mx/synchronization/synchronization.h
Normal file
@@ -0,0 +1,57 @@
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
|
||||
namespace mx::synchronization {
|
||||
/**
 * Desired isolation level of a resource.
 */
enum class isolation_level : std::uint8_t
{
    ExclusiveWriter = 0U, // Reads can be parallel, writes will be synchronized
    Exclusive = 1U,       // All accesses will be synchronized
    None = 2U,            // Nothing will be synchronized
};

/**
 * Desired protocol of synchronization.
 */
enum class protocol : std::uint8_t
{
    None = 0U,               // System is free to choose
    Queue = 1U,              // Choose primitive with queues with respect to isolation level
    Latch = 2U,              // Choose primitive with latches with respect to isolation level
    OLFIT = 3U,              // Try to choose olfit
    TransactionalMemory = 4U // Try to choose htm
};

/**
 * Real method, based on the isolation level
 * and decision by the tasking layer.
 *
 * Attention: Even if the primitive is 8bit long,
 *            it is stored within the tagged_ptr as
 *            using only 4bit! Therefore, the max.
 *            value can be 15.
 */
enum class primitive : std::uint8_t
{
    None = 0U,              // Nothing will be synchronized
    ExclusiveLatch = 1U,    // All accesses will use a spinlock
    ScheduleAll = 2U,       // All accesses will be scheduled to the mapped channel
    ReaderWriterLatch = 3U, // Use a reader/writer latch to enable parallel reads
    ScheduleWriter = 4U,    // Reads can perform anywhere, writes are scheduled to the mapped channel
    OLFIT = 5U              // Read/write anywhere but use a latch for writers
};

/**
 * Checks whether the given primitive is kind of optimistic synchronization
 * or not. Optimistic primitives let readers proceed without blocking and
 * validate afterwards.
 *
 * @param primitive_ Primitive to check.
 * @return True, if the given primitive is optimistic.
 */
static inline bool is_optimistic(const primitive primitive_) noexcept
{
    switch (primitive_)
    {
    case primitive::ScheduleWriter:
    case primitive::OLFIT:
        return true;
    default:
        return false;
    }
}
|
||||
|
||||
} // namespace mx::synchronization
|
||||
35
src/mx/system/builtin.h
Normal file
35
src/mx/system/builtin.h
Normal file
@@ -0,0 +1,35 @@
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
|
||||
namespace mx::system {
|
||||
/**
|
||||
* Encapsulates compiler builtins.
|
||||
*/
|
||||
/**
 * Encapsulates compiler builtins.
 */
class builtin
{
public:
    /**
     * Generates a pause/yield cpu instruction, independently
     * of the hardware. Used in spin-wait loops to be friendlier
     * to the sibling hyper-thread and save power. No-op on
     * architectures without such a hint.
     */
    static void pause() noexcept
    {
#if defined(__x86_64__) || defined(__amd64__)
        __builtin_ia32_pause();
#elif defined(__aarch64__)
        // Fix: 64 bit ARM defines __aarch64__ (not __arm__) and previously
        // fell through to a no-op, turning spin loops into hot busy waits.
        asm volatile("yield");
#elif defined(__arm__)
        asm("YIELD");
#endif
    }

    /**
     * Hints the compiler that the expression is expected to be false
     * (cold path), to improve branch layout.
     *
     * @param expression Evaluated condition.
     * @return The condition, unchanged.
     */
    [[maybe_unused]] static bool expect_false(const bool expression) noexcept
    {
        return __builtin_expect(expression, false);
    }

    /**
     * Hints the compiler that the expression is expected to be true
     * (hot path), to improve branch layout.
     *
     * @param expression Evaluated condition.
     * @return The condition, unchanged.
     */
    [[maybe_unused]] static bool expect_true(const bool expression) noexcept
    {
        return __builtin_expect(expression, true);
    }
};
|
||||
} // namespace mx::system
|
||||
187
src/mx/system/cache.h
Normal file
187
src/mx/system/cache.h
Normal file
@@ -0,0 +1,187 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace mx::system {
|
||||
/**
|
||||
* Encapsulates cache operations like prefetching.
|
||||
*
|
||||
* Further documentation on Intel: https://www.felixcloutier.com/x86/prefetchh
|
||||
*/
|
||||
class cache
{
public:
    /// Cache level a prefetch should target.
    enum level : std::uint8_t
    {
        L1 = 1U,
        L2 = 2U,
        LLC = 3U
    };
    /// Intended access to the prefetched line.
    enum access : std::uint8_t
    {
        read = 0U,
        write = 1U
    };

    /**
     * Prefetches a single cache line into a given prefetch level.
     * No-op on architectures other than x86-64 and aarch64.
     *
     * @tparam L Wanted cache level.
     * @tparam A Access to the cache line whether read or write.
     * @param address Address of the memory which should be prefetched.
     */
    template <level L, access A = access::read> static void prefetch(void *address) noexcept
    {
#ifdef __x86_64
        if constexpr (A == access::write)
        {
            asm volatile("PREFETCHW (%0)\n" ::"r"(address));
        }
        else if constexpr (L == level::L1)
        {
            // NOTE(review): on most x86 parts PREFETCHT0 is the "closest to
            // the core" (L1) hint and PREFETCHT1 targets L2 -- confirm this
            // L1 -> T1 mapping is intentional before changing it.
            asm volatile("PREFETCHT1 (%0)\n" ::"r"(address));
        }
        else if constexpr (L == level::L2)
        {
            asm volatile("PREFETCHT2 (%0)\n" ::"r"(address));
        }
        else
        {
            asm volatile("PREFETCHNTA (%0)\n" ::"r"(address));
        }
#elif defined(__aarch64__)
        if constexpr (L == L1)
        {
            if constexpr (A == access::read)
            {
                asm volatile("prfm pldl1keep, %a0\n" : : "p"(address));
            }
            else
            {
                asm volatile("prfm pstl1keep, %a0\n" : : "p"(address));
            }
        }
        else if constexpr (L == L2)
        {
            if constexpr (A == access::read)
            {
                asm volatile("prfm pldl2keep, %a0\n" : : "p"(address));
            }
            else
            {
                asm volatile("prfm pstl2keep, %a0\n" : : "p"(address));
            }
        }
        else
        {
            if constexpr (A == access::read)
            {
                asm volatile("prfm pldl3keep, %a0\n" : : "p"(address));
            }
            else
            {
                asm volatile("prfm pstl3keep, %a0\n" : : "p"(address));
            }
        }
#endif
    }

    /**
     * Prefetches a range of cache lines into the given cache level.
     * Picks the largest fixed-size helper that evenly divides the range to
     * minimize loop overhead; falls back to a 64-byte stride otherwise.
     *
     * @tparam L Wanted cache level.
     * @tparam A Access to the cache line whether read or write.
     * @param address Address of the memory which should be prefetched.
     * @param size Size of the accessed memory.
     */
    template <level L, access A = access::read>
    static void prefetch_range(void *address, const std::uint32_t size) noexcept
    {
        auto addr = std::uintptr_t(address);
        const auto end = addr + size;

        if ((size & 1023U) == 0U)
        {
            for (; addr < end; addr += 1024U)
            {
                prefetch_range<L, 1024U, A>(reinterpret_cast<void *>(addr));
            }
        }
        else if ((size & 511U) == 0U)
        {
            for (; addr < end; addr += 512U)
            {
                prefetch_range<L, 512U, A>(reinterpret_cast<void *>(addr));
            }
        }
        else if ((size & 255U) == 0U)
        {
            for (; addr < end; addr += 256U)
            {
                prefetch_range<L, 256U, A>(reinterpret_cast<void *>(addr));
            }
        }
        else if ((size & 127U) == 0U)
        {
            for (; addr < end; addr += 128U)
            {
                prefetch_range<L, 128U, A>(reinterpret_cast<void *>(addr));
            }
        }
        else
        {
            for (; addr < end; addr += 64U)
            {
                prefetch<L, A>(reinterpret_cast<void *>(addr));
            }
        }
    }

    /**
     * Prefetches a compile-time-sized range of cache lines into the given
     * cache level, fully unrolled via template recursion.
     *
     * @tparam L Wanted cache level.
     * @tparam S Size of the accessed memory; a non-zero multiple of 64 bytes.
     * @tparam A Access to the cache line whether read or write.
     * @param address Address of the accessed memory.
     */
    template <level L, std::uint32_t S, access A = access::read> static void prefetch_range(void *address) noexcept
    {
        // Fix: the previous assertion demanded a power of two, which made the
        // explicit S == 192U branch below unreachable (192 is not a power of
        // two). The real requirement is a whole number of 64-byte cache
        // lines; any other multiple falls back to the runtime loop.
        static_assert(S != 0U && (S % 64U) == 0U, "Size must be a non-zero multiple of a cache line (64 bytes).");
        const auto addr = std::uintptr_t(address);
        if constexpr (S <= 64U)
        {
            prefetch<L, A>(address);
        }
        else if constexpr (S == 128U)
        {
            prefetch<L, A>(address);
            prefetch<L, A>(reinterpret_cast<void *>(addr + 64U));
        }
        else if constexpr (S == 192U)
        {
            prefetch_range<L, 128U, A>(address);
            prefetch<L, A>(reinterpret_cast<void *>(addr + 128U));
        }
        else if constexpr (S == 256U)
        {
            prefetch_range<L, 128U, A>(address);
            prefetch_range<L, 128U, A>(reinterpret_cast<void *>(addr + 128U));
        }
        else if constexpr (S == 512U)
        {
            prefetch_range<L, 256U, A>(address);
            prefetch_range<L, 256U, A>(reinterpret_cast<void *>(addr + 256U));
        }
        else if constexpr (S == 1024U)
        {
            prefetch_range<L, 512U, A>(address);
            prefetch_range<L, 512U, A>(reinterpret_cast<void *>(addr + 512U));
        }
        else
        {
            prefetch_range<L, A>(address, S);
        }
    }
};
|
||||
} // namespace mx::system
|
||||
27
src/mx/system/cpuid.h
Normal file
27
src/mx/system/cpuid.h
Normal file
@@ -0,0 +1,27 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace mx::system {
|
||||
/**
|
||||
* Encapsulates methods for checking features
|
||||
* of the system by calling cpuid instruction.
|
||||
*/
|
||||
/**
 * Encapsulates methods for checking features
 * of the system by calling cpuid instruction.
 */
class cpuid
{
public:
    /**
     * Queries CPUID leaf 0x7 (sub-leaf 0) and tests EBX bit 11, which
     * reports Restricted Transactional Memory (Intel TSX RTM) support.
     *
     * @return True, when restricted transactional memory is enabled.
     */
    static bool is_rtm_provided()
    {
#if defined(__x86_64__) || defined(__i386__)
        std::uint32_t eax = 0x7;
        std::uint32_t ebx = 0;
        std::uint32_t ecx = 0x0;
        std::uint32_t edx = 0;

        // Fix: CPUID overwrites EAX, EBX, ECX and EDX. The previous
        // constraint list only declared EBX as an output, so EAX/ECX/EDX
        // were silently clobbered behind the compiler's back (undefined
        // behavior for surrounding code). Declare them all.
        asm volatile("cpuid" : "+a"(eax), "=b"(ebx), "+c"(ecx), "=d"(edx));

        return (ebx & 0b100000000000) != 0U;
#else
        // CPUID/RTM are x86-only features.
        return false;
#endif
    }
};
|
||||
} // namespace mx::system
|
||||
36
src/mx/system/environment.h
Normal file
36
src/mx/system/environment.h
Normal file
@@ -0,0 +1,36 @@
|
||||
#pragma once
|
||||
|
||||
#include <fstream>
|
||||
|
||||
namespace mx::system {
|
||||
/**
|
||||
* Encapsulates functionality of the (Linux) system.
|
||||
*/
|
||||
/**
 * Encapsulates functionality of the (Linux) system.
 */
class Environment
{
public:
    /**
     * Reads /proc/sys/kernel/numa_balancing to determine whether the kernel
     * performs automatic NUMA balancing.
     *
     * @return True, if NUMA balancing is enabled by the system (or if the
     *         flag could not be read).
     */
    static bool is_numa_balancing_enabled()
    {
        std::ifstream numa_balancing_file{"/proc/sys/kernel/numa_balancing"};
        std::int32_t flag{};

        if (numa_balancing_file >> flag)
        {
            return flag != 0;
        }

        // File missing or unreadable: conservatively assume balancing is on.
        return true;
    }

    /**
     * @return True, when the build was configured with SSE2 support.
     */
    static constexpr auto is_sse2()
    {
#ifdef USE_SSE2
        return true;
#else
        return false;
#endif
    }
};
|
||||
} // namespace mx::system
|
||||
36
src/mx/system/thread.h
Normal file
36
src/mx/system/thread.h
Normal file
@@ -0,0 +1,36 @@
|
||||
#pragma once
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
|
||||
namespace mx::system {
|
||||
/**
|
||||
* Encapsulates methods for thread access.
|
||||
*/
|
||||
/**
 * Encapsulates methods for thread access.
 */
class thread
{
public:
    /**
     * Pins a thread to a given core.
     *
     * @param thread Thread to pin.
     * @param core_id Core where the thread should be pinned.
     * @return True, when pinning was successful.
     */
    static bool pin(std::thread &thread, const std::uint16_t core_id)
    {
        // Build an affinity mask that contains only the requested core.
        cpu_set_t allowed_cores;
        CPU_ZERO(&allowed_cores);
        CPU_SET(core_id, &allowed_cores);

        const auto error = pthread_setaffinity_np(thread.native_handle(), sizeof(cpu_set_t), &allowed_cores);
        if (error == 0)
        {
            return true;
        }

        std::cerr << "Can not pin thread!" << std::endl;
        return false;
    }
};
|
||||
} // namespace mx::system
|
||||
40
src/mx/system/topology.h
Normal file
40
src/mx/system/topology.h
Normal file
@@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <numa.h>
|
||||
#include <sched.h>
|
||||
#include <thread>
|
||||
|
||||
namespace mx::system {
|
||||
/**
|
||||
* Encapsulates methods for retrieving information
|
||||
* about the hardware landscape.
|
||||
*/
|
||||
class topology
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* @return Core where the caller is running.
|
||||
*/
|
||||
static std::uint16_t core_id() { return std::uint16_t(sched_getcpu()); }
|
||||
|
||||
/**
|
||||
* Reads the NUMA region identifier of the given core.
|
||||
*
|
||||
* @param core_id Id of the core.
|
||||
* @return Id of the NUMA region the core stays in.
|
||||
*/
|
||||
static std::uint8_t node_id(const std::uint16_t core_id) { return std::max(numa_node_of_cpu(core_id), 0); }
|
||||
|
||||
/**
|
||||
* @return The greatest NUMA region identifier.
|
||||
*/
|
||||
static std::uint8_t max_node_id() { return std::uint8_t(numa_max_node()); }
|
||||
|
||||
/**
|
||||
* @return Number of available cores.
|
||||
*/
|
||||
static std::uint16_t count_cores() { return std::uint16_t(std::thread::hardware_concurrency()); }
|
||||
};
|
||||
} // namespace mx::system
|
||||
186
src/mx/tasking/channel.h
Normal file
186
src/mx/tasking/channel.h
Normal file
@@ -0,0 +1,186 @@
|
||||
#pragma once
|
||||
|
||||
#include "channel_occupancy.h"
|
||||
#include "load.h"
|
||||
#include "task.h"
|
||||
#include "task_buffer.h"
|
||||
#include <array>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <mx/memory/config.h>
|
||||
#include <mx/system/cache.h>
|
||||
#include <mx/util/mpsc_queue.h>
|
||||
#include <mx/util/queue.h>
|
||||
|
||||
namespace mx::tasking {
|
||||
/**
|
||||
* The channel is the central data structure where tasks are scheduled to pass tasks
|
||||
* between worker threads. Every worker thread owns his own channel, where tasks
|
||||
* are popped by only this channel. Every worker thread (or task) can push further
|
||||
* tasks to the channel.
|
||||
*
|
||||
* Every channel consists of a handful of queues, where the tasks are really stored,
|
||||
* different queues have different guarantees regarding concurrency and locality.
|
||||
*
|
||||
* In addition, every channel has its own buffer, where tasks are transferred from the
|
||||
* queues. If the buffer is empty, the worker thread will fill it with tasks from backend
|
||||
* queues.
|
||||
*
|
||||
* The buffer enables the worker thread to have a view to tasks that are ready for execution;
|
||||
* this is used e.g. for prefetching.
|
||||
*/
|
||||
class Channel
|
||||
{
|
||||
public:
|
||||
constexpr Channel(const std::uint16_t id, const std::uint8_t numa_node_id,
|
||||
const std::uint8_t prefetch_distance) noexcept
|
||||
: _remote_queues({}), _local_queues({}), _task_buffer(prefetch_distance), _id(id), _numa_node_id(numa_node_id)
|
||||
{
|
||||
}
|
||||
~Channel() noexcept = default;
|
||||
|
||||
/**
|
||||
* @return Identifier of the channel.
|
||||
*/
|
||||
[[nodiscard]] std::uint16_t id() const noexcept { return _id; }
|
||||
|
||||
/**
|
||||
* @return The next task to be executed.
|
||||
*/
|
||||
TaskInterface *next() noexcept { return _task_buffer.next(); }
|
||||
|
||||
/**
|
||||
* Schedules the task to thread-safe queue with regard to the NUMA region
|
||||
* of the producer. Producer of different NUMA regions should not share
|
||||
* a single queue.
|
||||
* @param task Task to be scheduled.
|
||||
* @param numa_node_id NUMA region of the producer.
|
||||
*/
|
||||
void push_back_remote(TaskInterface *task, const std::uint8_t numa_node_id) noexcept
|
||||
{
|
||||
_remote_queues[task->priority()][numa_node_id].push_back(task);
|
||||
}
|
||||
|
||||
/**
|
||||
* Schedules a task to the local queue, which is not thread-safe. Only
|
||||
* the channel owner should spawn tasks this way.
|
||||
* @param task Task to be scheduled.
|
||||
*/
|
||||
void push_back_local(TaskInterface *task) noexcept { _local_queues[task->priority()].push_back(task); }
|
||||
|
||||
/**
|
||||
* Fill the task buffer with tasks from the backend queues.
|
||||
* @return Size of the buffer after filling it.
|
||||
*/
|
||||
std::uint16_t fill() noexcept
|
||||
{
|
||||
// Fill with normal prioritized.
|
||||
auto size = fill<priority::normal>(_task_buffer.available_slots());
|
||||
|
||||
// Fill with low prioritized.
|
||||
if (this->_task_buffer.empty())
|
||||
{
|
||||
size = fill<priority::low>(config::task_buffer_size());
|
||||
}
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills the task buffer with tasks scheduled with a given priority.
|
||||
*
|
||||
* @tparam P Priority of the tasks.
|
||||
* @return Size of the task buffer after filling.
|
||||
*/
|
||||
template <priority P> std::uint16_t fill() noexcept { return fill<P>(_task_buffer.available_slots()); }
|
||||
|
||||
/**
|
||||
* @return Number of tasks available in the buffer and ready for execution.
|
||||
*/
|
||||
[[nodiscard]] std::uint16_t size() const noexcept { return _task_buffer.size(); }
|
||||
|
||||
/**
|
||||
* @return True, when the task buffer is empty. Backend queues may be have tasks.
|
||||
*/
|
||||
[[nodiscard]] bool empty() const noexcept { return _task_buffer.empty(); }
|
||||
|
||||
/**
|
||||
* Adds usage prediction of a resource to this channel.
|
||||
* @param usage Predicted usage.
|
||||
*/
|
||||
/**
 * Registers a predicted resource usage with this channel.
 * @param usage Predicted usage.
 */
void predict_usage(const resource::hint::expected_access_frequency usage) noexcept
{
    this->_occupancy.predict(usage);
}
|
||||
|
||||
/**
|
||||
* Updates the usage prediction of this channel.
|
||||
* @param old_prediction So far predicted usage.
|
||||
* @param new_prediction New predicted usage.
|
||||
*/
|
||||
/**
 * Replaces a previously registered usage prediction by a new one:
 * the old prediction is revoked first, then the new one registered.
 * @param old_prediction Usage predicted so far.
 * @param new_prediction New predicted usage.
 */
void modify_predicted_usage(const resource::hint::expected_access_frequency old_prediction,
                            const resource::hint::expected_access_frequency new_prediction) noexcept
{
    this->_occupancy.revoke(old_prediction);
    this->_occupancy.predict(new_prediction);
}
|
||||
|
||||
/**
|
||||
* @return Aggregated predicted usage.
|
||||
*/
|
||||
/**
 * @return Aggregated predicted usage of this channel (the occupancy's
 *         conversion operator yields the highest registered prediction).
 */
[[nodiscard]] resource::hint::expected_access_frequency predicted_usage() const noexcept
{
    return static_cast<resource::hint::expected_access_frequency>(this->_occupancy);
}
|
||||
|
||||
/**
|
||||
* @return True, whenever min. one prediction was "excessive".
|
||||
*/
|
||||
[[nodiscard]] bool has_excessive_usage_prediction() const noexcept
|
||||
{
|
||||
return _occupancy.has_excessive_usage_prediction();
|
||||
}
|
||||
|
||||
private:
    // Backend queues for multiple producers in different NUMA regions and different priorities.
    // Indexed [priority][producer NUMA node]; 64-byte aligned to keep the MPSC queues on their
    // own cache line (avoids false sharing with the fields below).
    alignas(64)
        std::array<std::array<util::MPSCQueue<TaskInterface>, memory::config::max_numa_nodes()>, 2> _remote_queues{};

    // Backend queues for a single producer (owning worker thread) and different priorities.
    // Indexed by priority; not thread-safe by design.
    alignas(64) std::array<util::Queue<TaskInterface>, 2> _local_queues{};

    // Buffer for ready-to-execute tasks, refilled from the queues above by fill().
    alignas(64) TaskBuffer<config::task_buffer_size()> _task_buffer;

    // Id of this channel (immutable after construction).
    const std::uint16_t _id;

    // NUMA id of the worker thread owning this channel; used by fill() to
    // drain the NUMA-local remote queue first.
    const std::uint8_t _numa_node_id;

    // Holder of resource predictions of this channel (updated concurrently).
    alignas(64) ChannelOccupancy _occupancy{};
|
||||
|
||||
/**
|
||||
* Fills the task buffer with tasks scheduled with a given priority.
|
||||
*
|
||||
* @tparam P Priority.
|
||||
* @param available Number of maximal tasks to fill the task buffer.
|
||||
* @return Size of the task buffer after filling.
|
||||
*/
|
||||
/**
 * Fills the task buffer with tasks scheduled with a given priority.
 * Drains the owner's local queue first, then the remote queues in a
 * ring starting at the NUMA-local one.
 *
 * @tparam P Priority.
 * @param available Number of maximal tasks to fill the task buffer;
 *        callers pass the buffer's free-slot count, so the return value
 *        equals the buffer size afterwards.
 * @return Size of the task buffer after filling.
 */
template <priority P> std::uint16_t fill(std::uint16_t available) noexcept
{
    // 1) Fill up from the local queue.
    available -= _task_buffer.fill(_local_queues[P], available);

    if (available > 0U)
    {
        // 2) Fill up from remote queues; start with the NUMA-local one.
        for (auto i = 0U; i < _remote_queues[P].max_size(); ++i)
        {
            // Wrap-around via bitmask; assumes max_numa_nodes() is a power of two — TODO confirm.
            const auto numa_node_id = (_numa_node_id + i) & (_remote_queues[P].max_size() - 1U);
            available -= _task_buffer.fill(_remote_queues[P][numa_node_id], available);
        }
    }

    // 'available' now holds the remaining free slots; the difference to the
    // buffer capacity is the current buffer size (given 'available' started
    // as the free-slot count).
    return _task_buffer.max_size() - available;
}
|
||||
};
|
||||
} // namespace mx::tasking
|
||||
78
src/mx/tasking/channel_occupancy.h
Normal file
78
src/mx/tasking/channel_occupancy.h
Normal file
@@ -0,0 +1,78 @@
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <mx/resource/resource.h>
|
||||
|
||||
namespace mx::tasking {
|
||||
/**
|
||||
* Stores usage predictions.
|
||||
*/
|
||||
class ChannelOccupancy
{
public:
    constexpr ChannelOccupancy() = default;
    ~ChannelOccupancy() = default;

    /**
     * Adds the given predicted usage (thread-safe; relaxed counter increment,
     * no ordering is established with other memory operations).
     * @param predicted_usage Predicted usage.
     */
    void predict(const resource::hint::expected_access_frequency predicted_usage) noexcept
    {
        _predicted_usage_counter[static_cast<std::uint8_t>(predicted_usage)].fetch_add(1, std::memory_order_relaxed);
    }

    /**
     * Subtracts the given predicted usage (thread-safe; relaxed counter decrement).
     * @param predicted_usage Predicted usage.
     */
    void revoke(const resource::hint::expected_access_frequency predicted_usage) noexcept
    {
        _predicted_usage_counter[static_cast<std::uint8_t>(predicted_usage)].fetch_sub(1, std::memory_order_relaxed);
    }

    /**
     * @return True, when at least one prediction was "excessive".
     */
    [[nodiscard]] bool has_excessive_usage_prediction() const noexcept
    {
        return has_at_least_one<resource::hint::expected_access_frequency::excessive>();
    }

    /**
     * @return The highest predicted usage, checked from "excessive" down
     *         to "unused" (the fallback when no counter is set).
     */
    explicit operator resource::hint::expected_access_frequency() const noexcept
    {
        if (has_at_least_one<resource::hint::expected_access_frequency::excessive>())
        {
            return resource::hint::expected_access_frequency::excessive;
        }

        if (has_at_least_one<resource::hint::expected_access_frequency::high>())
        {
            return resource::hint::expected_access_frequency::high;
        }

        if (has_at_least_one<resource::hint::expected_access_frequency::normal>())
        {
            return resource::hint::expected_access_frequency::normal;
        }

        return resource::hint::expected_access_frequency::unused;
    }

private:
    // Counter of predicted usages, one slot per expected_access_frequency value.
    // NOTE(review): aggregate init `{0U}` explicitly zeroes only the first atomic;
    // before C++20 the remaining elements are value-initialized atomics whose value
    // is formally unspecified — confirm the build targets C++20 or zero all slots.
    std::array<std::atomic_uint64_t, 4U> _predicted_usage_counter{0U};

    /**
     * @return True, when at least one usage as given by the template was predicted.
     */
    template <resource::hint::expected_access_frequency U>[[nodiscard]] bool has_at_least_one() const noexcept
    {
        return _predicted_usage_counter[static_cast<std::uint8_t>(U)].load(std::memory_order_relaxed) > 0;
    }
};
|
||||
} // namespace mx::tasking
|
||||
33
src/mx/tasking/config.h
Normal file
33
src/mx/tasking/config.h
Normal file
@@ -0,0 +1,33 @@
|
||||
#pragma once
|
||||
|
||||
namespace mx::tasking {
|
||||
/**
 * Central compile-time configuration of the tasking layer.
 * All knobs are constexpr functions so they can parameterize templates.
 */
class config
{
public:
    /// Strategy for reclaiming memory freed under optimistic synchronization.
    enum memory_reclamation_scheme
    {
        None = 0U,
        UpdateEpochOnRead = 1U,
        UpdateEpochPeriodically = 2U
    };

    /// @return Maximal number of supported cores.
    static constexpr auto max_cores() { return 64U; }

    /// @return Maximal size of a single task in bytes; used for task allocation.
    static constexpr auto task_size() { return 64U; }

    /// @return Capacity of the per-channel buffer holding tasks fetched from the queues.
    static constexpr auto task_buffer_size() { return 64U; }

    /// @return True when counting of executed/scheduled tasks, readers, writers etc. is enabled.
    static constexpr auto task_statistics() { return false; }

    /// @return Chosen epoch-based reclamation scheme; with None, freeing memory
    ///         under optimistic synchronization is unsafe.
    static constexpr auto memory_reclamation() { return memory_reclamation_scheme::UpdateEpochPeriodically; }
};
|
||||
} // namespace mx::tasking
|
||||
40
src/mx/tasking/load.h
Normal file
40
src/mx/tasking/load.h
Normal file
@@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
#include <bitset>
|
||||
#include <cstdint>
|
||||
|
||||
namespace mx::tasking {
|
||||
/**
|
||||
* Persists the channel load for the last 64 requests.
|
||||
*/
|
||||
/**
 * Sliding window over the outcome of the last 64 requests of a channel;
 * each bit records whether one request was a hit.
 */
class Load
{
public:
    constexpr Load() = default;
    ~Load() = default;

    /// Records one request: shifts the history and stores the newest outcome in bit 0.
    Load &operator+=(const bool hit) noexcept
    {
        _hits <<= 1;
        _hits.set(0, hit);
        return *this;
    }

    /// Merges another load history into this one (bitwise or).
    Load &operator|=(const Load &other) noexcept
    {
        _hits |= other._hits;
        return *this;
    }

    /**
     * @return Number of successful requests within the window.
     */
    [[nodiscard]] std::size_t count() const noexcept { return _hits.count(); }

    /// Orders two loads by their hit count.
    bool operator<(const Load &other) const noexcept { return count() < other.count(); }

    /// Compares the hit count against a plain number.
    bool operator<(const std::size_t other) const noexcept { return count() < other; }

private:
    // Bitvector of the last 64 requests; bit 0 is the most recent.
    std::bitset<64> _hits{0U};
};
|
||||
} // namespace mx::tasking
|
||||
49
src/mx/tasking/prefetch_slot.h
Normal file
49
src/mx/tasking/prefetch_slot.h
Normal file
@@ -0,0 +1,49 @@
|
||||
#pragma once
|
||||
#include "task.h"
|
||||
#include <mx/system/cache.h>
|
||||
#include <utility>
|
||||
|
||||
namespace mx::tasking {
|
||||
/**
|
||||
* A prefetch slot is part of the prefetch buffer used for task
|
||||
* and resource prefetching
|
||||
* A slot can contain up to one task and one resource that are
|
||||
* prefetched by the channel.
|
||||
*/
|
||||
class PrefetchSlot
|
||||
{
|
||||
public:
|
||||
constexpr PrefetchSlot() noexcept = default;
|
||||
~PrefetchSlot() = default;
|
||||
|
||||
PrefetchSlot &operator=(TaskInterface *task) noexcept
|
||||
{
|
||||
_task = task;
|
||||
if (task->has_resource_annotated())
|
||||
{
|
||||
_resource = std::make_pair(task->annotated_resource().get(), task->annotated_resource_size());
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
void operator()() noexcept
|
||||
{
|
||||
if (_task != nullptr)
|
||||
{
|
||||
system::cache::prefetch<system::cache::L1, system::cache::write>(_task);
|
||||
_task = nullptr;
|
||||
}
|
||||
|
||||
if (std::get<0>(_resource) != nullptr)
|
||||
{
|
||||
system::cache::prefetch_range<system::cache::LLC, system::cache::read>(std::get<0>(_resource),
|
||||
std::get<1>(_resource));
|
||||
std::get<0>(_resource) = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void *_task = nullptr;
|
||||
std::pair<void *, std::uint16_t> _resource = std::make_pair(nullptr, 0U);
|
||||
};
|
||||
} // namespace mx::tasking
|
||||
107
src/mx/tasking/profiling/profiling_task.cpp
Normal file
107
src/mx/tasking/profiling/profiling_task.cpp
Normal file
@@ -0,0 +1,107 @@
|
||||
#include "profiling_task.h"
|
||||
#include <fstream>
|
||||
#include <json.hpp>
|
||||
#include <mx/memory/global_heap.h>
|
||||
#include <mx/tasking/runtime.h>
|
||||
|
||||
using namespace mx::tasking::profiling;
|
||||
|
||||
/// Builds a profiling task bound to the scheduler's running flag and one channel.
/// Reserves a generous number of idle-range slots up front so recording on the
/// hot path does not reallocate.
ProfilingTask::ProfilingTask(mx::util::maybe_atomic<bool> &is_running, mx::tasking::Channel &channel)
    : _is_running(is_running), _channel(channel)
{
    constexpr auto initial_capacity = std::size_t{1U} << 16U;
    _idle_ranges.reserve(initial_capacity);
}
|
||||
|
||||
// Busy-waits while the channel has no work, measuring how long the channel
// stayed idle. Re-spawns itself as long as the runtime keeps running, so
// there is always one low-priority profiling task per channel.
mx::tasking::TaskResult ProfilingTask::execute(const std::uint16_t /*core_id*/, const std::uint16_t /*channel_id*/)
{
    // Range starts timing at construction.
    IdleRange range;

    // Spin until either the runtime stops or the channel got new tasks;
    // fill() pulls work from the backend queues into the buffer.
    while (this->_is_running && this->_channel.empty())
    {
        this->_channel.fill();
    }

    range.stop();

    // Ignore ranges of <= 10ns: those are scheduling noise, not real idling.
    if (range.nanoseconds() > 10U)
    {
        this->_idle_ranges.emplace_back(std::move(range));
    }

    // Keep profiling while the runtime is alive: succeed with 'this' to re-spawn.
    if (this->_is_running)
    {
        return tasking::TaskResult::make_succeed(this);
    }

    return tasking::TaskResult::make_null();
}
|
||||
|
||||
/// Releases all profiling tasks still owned by the profiler.
Profiler::~Profiler()
{
    for (auto *profiling_task : this->_tasks)
    {
        delete profiling_task;
    }
}
|
||||
|
||||
void Profiler::profile(const std::string &profiling_output_file)
|
||||
{
|
||||
for (auto *task : this->_tasks)
|
||||
{
|
||||
delete task;
|
||||
}
|
||||
this->_tasks.clear();
|
||||
|
||||
this->_profiling_output_file.emplace(profiling_output_file);
|
||||
this->_start = std::chrono::steady_clock::now();
|
||||
}
|
||||
|
||||
// Creates one idle/profiling task for the given channel, pins it to that
// channel with low priority, and spawns it.
void Profiler::profile(util::maybe_atomic<bool> &is_running, Channel &channel)
{
    // Placement-new into cache-line-aligned GlobalHeap memory.
    // NOTE(review): ~Profiler releases these tasks with plain `delete`, which
    // only matches if TaskInterface provides a fitting operator delete — verify.
    auto *task =
        new (memory::GlobalHeap::allocate_cache_line_aligned(sizeof(ProfilingTask))) ProfilingTask(is_running, channel);
    // Bind the task to this channel and give it the lowest priority so it
    // only runs when no other work is available.
    task->annotate(channel.id());
    task->annotate(mx::tasking::priority::low);
    this->_tasks.push_back(task);
    mx::tasking::runtime::spawn(*task);
}
|
||||
|
||||
// Finishes a profiling run: normalizes every recorded idle range to the run's
// start time and dumps everything as JSON to the configured output file.
// Disables profiling afterwards.
void Profiler::stop()
{
    const auto end = std::chrono::steady_clock::now();
    // Total run length relative to the recorded start; appended as the last entry.
    const auto end_relative_nanoseconds =
        std::chrono::duration_cast<std::chrono::nanoseconds>(end - this->_start).count();
    if (this->_profiling_output_file.has_value())
    {
        auto output = nlohmann::json{};
        for (auto *task : this->_tasks)
        {
            // Only emit channels that actually recorded idle time.
            if (task != nullptr && task->idle_ranges().empty() == false)
            {
                nlohmann::json channel_output;
                channel_output["channel"] = task->annotated_channel();
                nlohmann::json ranges{};
                for (const auto &range : task->idle_ranges())
                {
                    // (start, end) nanoseconds relative to the profiling start.
                    const auto normalized = range.normalize(this->_start);
                    auto normalized_json = nlohmann::json{};
                    normalized_json["s"] = std::get<0>(normalized);
                    normalized_json["e"] = std::get<1>(normalized);
                    ranges.push_back(std::move(normalized_json));
                }

                channel_output["ranges"] = std::move(ranges);
                output.push_back(std::move(channel_output));
            }
        }

        nlohmann::json end_output;
        end_output["end"] = end_relative_nanoseconds;
        output.push_back(std::move(end_output));

        std::ofstream out_file{this->_profiling_output_file.value()};
        out_file << output.dump() << std::endl;
    }

    // Mark profiling as disabled until profile() is called again.
    this->_profiling_output_file = std::nullopt;
}
|
||||
119
src/mx/tasking/profiling/profiling_task.h
Normal file
119
src/mx/tasking/profiling/profiling_task.h
Normal file
@@ -0,0 +1,119 @@
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <mx/tasking/channel.h>
|
||||
#include <mx/tasking/task.h>
|
||||
#include <mx/util/maybe_atomic.h>
|
||||
#include <optional>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace mx::tasking::profiling {
|
||||
/**
|
||||
* Time range (from -- to) for idled time of a single channel.
|
||||
*/
|
||||
/**
 * Time range (from -- to) describing one idle phase of a single channel.
 * Timing starts at construction and ends when stop() is called.
 */
class IdleRange
{
public:
    IdleRange() : _start(std::chrono::steady_clock::now()) {}
    IdleRange(IdleRange &&) = default;
    ~IdleRange() = default;

    /// Marks the current point in time as the end of this idle phase.
    void stop() noexcept { _end = std::chrono::steady_clock::now(); }

    /**
     * @return Length of the idle phase in nanoseconds.
     */
    [[nodiscard]] std::uint64_t nanoseconds() const noexcept
    {
        const auto idle_duration = _end - _start;
        return std::chrono::duration_cast<std::chrono::nanoseconds>(idle_duration).count();
    }

    /**
     * Expresses this range relative to a given point in time.
     * @param global_start Point in time to normalize against.
     * @return Pair of (start, stop) in nanoseconds since global_start.
     */
    [[nodiscard]] std::pair<std::uint64_t, std::uint64_t> normalize(
        const std::chrono::steady_clock::time_point global_start) const noexcept
    {
        const auto relative_start = std::chrono::duration_cast<std::chrono::nanoseconds>(_start - global_start);
        const auto relative_end = std::chrono::duration_cast<std::chrono::nanoseconds>(_end - global_start);
        return {relative_start.count(), relative_end.count()};
    }

private:
    // Start of idling (set at construction).
    std::chrono::steady_clock::time_point _start;

    // End of idling (set by stop()).
    std::chrono::steady_clock::time_point _end;
};
|
||||
|
||||
/**
|
||||
* Task, that is scheduled with low priority and gets CPU time,
|
||||
* whenever no other task is available.
|
||||
* Every time the task gets executed, it will record the time range,
|
||||
* until the channel has new tasks for execution.
|
||||
*/
|
||||
class ProfilingTask final : public TaskInterface
{
public:
    /// @param is_running Scheduler's running flag; execute() loops while it is set.
    /// @param channel Channel whose idle phases are recorded.
    ProfilingTask(util::maybe_atomic<bool> &is_running, Channel &channel);
    ~ProfilingTask() override = default;

    /// Records one idle phase of the channel; re-spawns itself while the runtime runs.
    TaskResult execute(std::uint16_t core_id, std::uint16_t channel_id) override;

    /// @return All idle phases recorded so far for this channel.
    [[nodiscard]] const std::vector<IdleRange> &idle_ranges() const noexcept { return _idle_ranges; }

private:
    // Scheduler's running flag (shared; not owned).
    util::maybe_atomic<bool> &_is_running;
    // Channel that is observed (not owned).
    Channel &_channel;
    // Recorded idle phases of the observed channel.
    std::vector<IdleRange> _idle_ranges;
};
|
||||
|
||||
/**
|
||||
* Schedules the idle/profiling task to every channel and
|
||||
* writes the memory to a given file.
|
||||
*/
|
||||
class Profiler
{
public:
    Profiler() noexcept = default;
    ~Profiler();

    /**
     * Enable profiling and set the result file.
     * Discards tasks of any previous run and records the start timestamp.
     * @param profiling_output_file File, where results should be written to.
     */
    void profile(const std::string &profiling_output_file);

    /**
     * Schedules a new idle/profile task to the given channel.
     * @param is_running Reference to the schedulers "is_running" flag.
     * @param channel Channel to spawn the task to.
     */
    void profile(util::maybe_atomic<bool> &is_running, Channel &channel);

    /**
     * Normalizes all time ranges, writes them as JSON to the configured
     * file, and disables profiling again.
     */
    void stop();

private:
    // File to write the output; empty while profiling is disabled.
    std::optional<std::string> _profiling_output_file{std::nullopt};

    // Time point of the runtime start; all ranges are normalized to it.
    std::chrono::steady_clock::time_point _start;

    // List of all idle/profile tasks; owned and deleted by this profiler.
    std::vector<ProfilingTask *> _tasks;
};
|
||||
|
||||
} // namespace mx::tasking::profiling
|
||||
92
src/mx/tasking/profiling/statistic.h
Normal file
92
src/mx/tasking/profiling/statistic.h
Normal file
@@ -0,0 +1,92 @@
|
||||
#pragma once
#include <array>
#include <cstdint>
#include <cstring>
#include <mx/memory/global_heap.h>
#include <mx/tasking/config.h>
#include <mx/util/aligned_t.h>
|
||||
namespace mx::tasking::profiling {
|
||||
/**
|
||||
* Collector for tasking statistics (scheduled tasks, executed tasks, ...).
|
||||
*/
|
||||
class Statistic
|
||||
{
|
||||
public:
|
||||
using counter_line_t = util::aligned_t<std::array<std::uint64_t, 7>>;
|
||||
|
||||
enum Counter : std::uint8_t
|
||||
{
|
||||
Scheduled,
|
||||
ScheduledOnChannel,
|
||||
ScheduledOffChannel,
|
||||
Executed,
|
||||
ExecutedReader,
|
||||
ExecutedWriter,
|
||||
Fill
|
||||
};
|
||||
|
||||
explicit Statistic(const std::uint16_t count_channels) noexcept : _count_channels(count_channels)
|
||||
{
|
||||
this->_counter = new (memory::GlobalHeap::allocate_cache_line_aligned(sizeof(counter_line_t) * count_channels))
|
||||
counter_line_t[count_channels];
|
||||
std::memset(static_cast<void *>(this->_counter), 0, sizeof(counter_line_t) * count_channels);
|
||||
}
|
||||
|
||||
Statistic(const Statistic &) = delete;
|
||||
|
||||
~Statistic() noexcept { delete[] this->_counter; }
|
||||
|
||||
Statistic &operator=(const Statistic &) = delete;
|
||||
|
||||
/**
|
||||
* Clears all collected statistics.
|
||||
*/
|
||||
void clear() noexcept
|
||||
{
|
||||
std::memset(static_cast<void *>(this->_counter), 0, sizeof(counter_line_t) * this->_count_channels);
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the template-given counter by one for the given channel.
|
||||
* @param channel_id Channel to increment the statistics for.
|
||||
*/
|
||||
template <Counter C> void increment(const std::uint16_t channel_id) noexcept
|
||||
{
|
||||
_counter[channel_id].value()[static_cast<std::uint8_t>(C)] += 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the given counter for a given channel.
|
||||
* @param counter Counter to read.
|
||||
* @param channel_id Channel the counter is for.
|
||||
* @return Value of the counter.
|
||||
*/
|
||||
[[nodiscard]] std::uint64_t get(const Counter counter, const std::uint16_t channel_id) const noexcept
|
||||
{
|
||||
return _counter[channel_id].value()[static_cast<std::uint8_t>(counter)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Read and aggregate the counter for all channels.
|
||||
* @param counter Counter to read.
|
||||
* @return Value of the counter for all channels.
|
||||
*/
|
||||
[[nodiscard]] std::uint64_t get(const Counter counter) const noexcept
|
||||
{
|
||||
std::uint64_t sum = 0U;
|
||||
for (auto i = 0U; i < _count_channels; ++i)
|
||||
{
|
||||
sum += get(counter, i);
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
private:
|
||||
// Number of channels to monitor.
|
||||
const std::uint16_t _count_channels;
|
||||
|
||||
// Memory for storing the counter.
|
||||
counter_line_t *_counter = nullptr;
|
||||
};
|
||||
} // namespace mx::tasking::profiling
|
||||
265
src/mx/tasking/runtime.h
Normal file
265
src/mx/tasking/runtime.h
Normal file
@@ -0,0 +1,265 @@
|
||||
#pragma once
|
||||
#include "scheduler.h"
|
||||
#include "task.h"
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <mx/memory/dynamic_size_allocator.h>
|
||||
#include <mx/memory/fixed_size_allocator.h>
|
||||
#include <mx/memory/task_allocator_interface.h>
|
||||
#include <mx/resource/builder.h>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <utility>
|
||||
|
||||
namespace mx::tasking {
|
||||
/**
|
||||
* The runtime is the central access structure to MxTasking.
|
||||
* Here, we can initialize MxTasking, spawn and allocate tasks, allocate
|
||||
* data objects.
|
||||
*/
|
||||
/**
 * The runtime is the central access structure to MxTasking.
 * Here, we can initialize MxTasking, spawn and allocate tasks, allocate
 * data objects. All state is held in inline static members, i.e. the
 * runtime is a process-wide singleton.
 */
class runtime
{
public:
    /**
     * Initializes the MxTasking runtime. Reuses allocators and scheduler
     * from a previous run where possible.
     * @param core_set Cores, where the runtime should execute on.
     * @param prefetch_distance Distance for prefetching.
     * @param use_system_allocator Should we use the systems malloc interface or our allocator?
     * @return True, when the runtime was started successfully; false when a
     *         scheduler is still running and re-initialization is refused.
     */
    static bool init(const util::core_set &core_set, const std::uint16_t prefetch_distance,
                     const bool use_system_allocator)
    {
        // Are we ready to re-initialize the scheduler?
        if (_scheduler != nullptr && _scheduler->is_running())
        {
            return false;
        }

        // Create a new resource allocator.
        // NOTE(review): placement-new into GlobalHeap memory is handed to
        // std::unique_ptr's default deleter here and below — this only matches
        // if a fitting operator delete exists; verify.
        if (_resource_allocator == nullptr)
        {
            _resource_allocator.reset(new (memory::GlobalHeap::allocate_cache_line_aligned(
                sizeof(memory::dynamic::Allocator))) memory::dynamic::Allocator());
        }
        else if (_resource_allocator->is_free())
        {
            // Allocator exists but holds no live allocations: rebuild it from scratch.
            _resource_allocator->release_allocated_memory();
            _resource_allocator->initialize_empty();
        }
        else
        {
            // Live allocations remain: only defragment.
            _resource_allocator->defragment();
        }

        // Create a new task allocator (system malloc or fixed-size pool).
        if (use_system_allocator)
        {
            _task_allocator.reset(new (memory::GlobalHeap::allocate_cache_line_aligned(sizeof(
                memory::SystemTaskAllocator<config::task_size()>))) memory::SystemTaskAllocator<config::task_size()>());
        }
        else
        {
            _task_allocator.reset(new (
                memory::GlobalHeap::allocate_cache_line_aligned(sizeof(memory::fixed::Allocator<config::task_size()>)))
                memory::fixed::Allocator<config::task_size()>(core_set));
        }

        // Create a new scheduler; reuse the old one when the core set is unchanged.
        const auto need_new_scheduler = _scheduler == nullptr || *_scheduler != core_set;
        if (need_new_scheduler)
        {
            _scheduler.reset(new (memory::GlobalHeap::allocate_cache_line_aligned(sizeof(Scheduler)))
                Scheduler(core_set, prefetch_distance, *_resource_allocator));
        }
        else
        {
            _scheduler->reset();
        }

        // Create a new resource builder (depends on scheduler and resource allocator).
        if (_resource_builder == nullptr || need_new_scheduler)
        {
            _resource_builder = std::make_unique<resource::Builder>(*_scheduler, *_resource_allocator);
        }

        return true;
    }

    /**
     * Start profiling of idle times. Results will be written to the given file.
     * @param output_file File for idle-time results.
     */
    static void profile(const std::string &output_file) noexcept { _scheduler->profile(output_file); }

    /**
     * Spawns the given task.
     * @param task Task to be scheduled.
     * @param current_channel_id Channel, the spawn request came from.
     */
    static void spawn(TaskInterface &task, const std::uint16_t current_channel_id) noexcept
    {
        _scheduler->schedule(task, current_channel_id);
    }

    /**
     * Spawns the given task (without a requesting channel).
     * @param task Task to be scheduled.
     */
    static void spawn(TaskInterface &task) noexcept { _scheduler->schedule(task); }

    /**
     * @return Number of available channels.
     */
    static std::uint16_t channels() noexcept { return _scheduler->count_channels(); }

    /**
     * Starts the runtime and suspends the starting thread until MxTasking is stopped.
     */
    static void start_and_wait() { _scheduler->start_and_wait(); }

    /**
     * Instructs all worker threads to stop their work.
     * After all worker threads are stopped, the starting
     * thread will be resumed.
     */
    static void stop() noexcept { _scheduler->interrupt(); }

    /**
     * Creates a new task in memory of the task allocator.
     * @param core_id Core to allocate memory from.
     * @param arguments Arguments for the task.
     * @return The new task.
     */
    template <typename T, typename... Args> static T *new_task(const std::uint16_t core_id, Args &&... arguments)
    {
        // Tasks are pooled in fixed-size slots; larger tasks would overflow their slot.
        static_assert(sizeof(T) <= config::task_size() && "Task must be leq defined task size.");
        return new (_task_allocator->allocate(core_id)) T(std::forward<Args>(arguments)...);
    }

    /**
     * Destroys a task created by new_task and returns its memory.
     * @param core_id Core id to return the memory to.
     * @param task Task to be freed.
     */
    template <typename T> static void delete_task(const std::uint16_t core_id, T *task) noexcept
    {
        task->~T();
        _task_allocator->free(core_id, static_cast<void *>(task));
    }

    /**
     * Creates a resource.
     * @param size Size of the data object.
     * @param hint Hints for allocation and scheduling.
     * @param arguments Arguments for the data object.
     * @return The resource pointer.
     */
    template <typename T, typename... Args>
    static resource::ptr new_resource(const std::size_t size, resource::hint &&hint, Args &&... arguments) noexcept
    {
        return _resource_builder->build<T>(size, std::move(hint), std::forward<Args>(arguments)...);
    }

    /**
     * Creates a resource from a given pointer.
     * @param object Pointer to the existing object.
     * @param hint Hints for allocation and scheduling.
     * @return The resource pointer.
     */
    template <typename T> static resource::ptr to_resource(T *object, resource::hint &&hint) noexcept
    {
        return _resource_builder->build<T>(object, std::move(hint));
    }

    /**
     * Deletes the given data object.
     * @param resource Data object to be deleted.
     */
    template <typename T> static void delete_resource(const resource::ptr resource) noexcept
    {
        _resource_builder->destroy<T>(resource);
    }

    /// Allocates raw memory from the resource allocator on the given NUMA node.
    static void *allocate(const std::uint8_t numa_node_id, const std::size_t alignment, const std::size_t size) noexcept
    {
        return _resource_allocator->allocate(numa_node_id, alignment, size);
    }

    /// Returns memory obtained from allocate() to the resource allocator.
    static void free(void *pointer) noexcept { _resource_allocator->free(pointer); }

    /**
     * Updates the prediction of a data object.
     * @param resource Data object, whose usage should be predicted.
     * @param old_prediction Prediction so far.
     * @param new_prediction New usage prediction.
     */
    static void modify_predicted_usage(const resource::ptr resource,
                                       const resource::hint::expected_access_frequency old_prediction,
                                       const resource::hint::expected_access_frequency new_prediction) noexcept
    {
        _scheduler->modify_predicted_usage(resource.channel_id(), old_prediction, new_prediction);
    }

    /**
     * ID of the NUMA region of a channel.
     * @param channel_id Channel.
     * @return ID of the NUMA region.
     */
    static std::uint8_t numa_node_id(const std::uint16_t channel_id) noexcept
    {
        return _scheduler->numa_node_id(channel_id);
    }

    /**
     * Reads the task statistics for a given counter and all channels.
     * @param counter Counter to be read.
     * @return Aggregated value of all channels.
     */
    static std::uint64_t statistic(const profiling::Statistic::Counter counter) noexcept
    {
        return _scheduler->statistic(counter);
    }

    /**
     * Reads the task statistic for a given counter on a given channel.
     * @param counter Counter to be read.
     * @param channel_id Channel.
     * @return Value of the counter of the given channel.
     */
    static std::uint64_t statistic(const profiling::Statistic::Counter counter, const std::uint16_t channel_id) noexcept
    {
        return _scheduler->statistic(counter, channel_id);
    }

private:
    // Scheduler to spawn tasks.
    inline static std::unique_ptr<Scheduler> _scheduler = {nullptr};

    // Allocator to allocate tasks (could be systems malloc or our Multi-level allocator).
    inline static std::unique_ptr<memory::TaskAllocatorInterface> _task_allocator = {nullptr};

    // Allocator to allocate resources.
    inline static std::unique_ptr<memory::dynamic::Allocator> _resource_allocator = {nullptr};

    // Allocator to allocate data objects.
    inline static std::unique_ptr<resource::Builder> _resource_builder = {nullptr};
};
|
||||
|
||||
/**
|
||||
* The runtime_guard initializes the runtime at initialization and starts
|
||||
* the runtime when the object is deleted. This allows MxTasking to execute
|
||||
* within a specific scope.
|
||||
*/
|
||||
/**
 * The runtime_guard initializes the runtime at construction and starts
 * the runtime (blocking until it is stopped) when the object is destroyed.
 * This allows MxTasking to execute within a specific scope.
 */
class runtime_guard
{
public:
    /// Initializes the runtime for the given cores; does not start it yet.
    runtime_guard(const bool use_system_allocator, const util::core_set &core_set,
                  const std::uint16_t prefetch_distance = 0U) noexcept
    {
        runtime::init(core_set, prefetch_distance, use_system_allocator);
    }

    /// Starts the runtime and blocks until runtime::stop() is called.
    ~runtime_guard() noexcept { runtime::start_and_wait(); }
};
|
||||
} // namespace mx::tasking
|
||||
203
src/mx/tasking/scheduler.cpp
Normal file
203
src/mx/tasking/scheduler.cpp
Normal file
@@ -0,0 +1,203 @@
|
||||
#include "scheduler.h"
|
||||
#include <cassert>
|
||||
#include <mx/memory/global_heap.h>
|
||||
#include <mx/synchronization/synchronization.h>
|
||||
#include <mx/system/thread.h>
|
||||
#include <mx/system/topology.h>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
using namespace mx::tasking;
|
||||
|
||||
// Builds one worker per entry of the core set. Each worker is constructed
// via placement-new into memory allocated on its own NUMA node, so worker
// state stays NUMA-local to the thread that uses it.
Scheduler::Scheduler(const mx::util::core_set &core_set, const std::uint16_t prefetch_distance,
                     memory::dynamic::Allocator &resource_allocator) noexcept
    : _core_set(core_set), _count_channels(core_set.size()), _worker({}), _channel_numa_node_map({0U}),
      _epoch_manager(core_set.size(), resource_allocator, _is_running), _statistic(_count_channels)
{
    // Only the first _count_channels slots get real workers; the rest stay nullptr.
    this->_worker.fill(nullptr);
    this->_channel_numa_node_map.fill(0U);

    for (auto worker_id = 0U; worker_id < this->_count_channels; ++worker_id)
    {
        const auto core_id = this->_core_set[worker_id];
        // Remember which NUMA node this channel's core belongs to.
        this->_channel_numa_node_map[worker_id] = system::topology::node_id(core_id);
        // Allocate the worker on its own NUMA node and construct it in place.
        this->_worker[worker_id] =
            new (memory::GlobalHeap::allocate(this->_channel_numa_node_map[worker_id], sizeof(Worker)))
                Worker(worker_id, core_id, this->_channel_numa_node_map[worker_id], this->_is_running,
                       prefetch_distance, this->_epoch_manager[worker_id], this->_epoch_manager.global_epoch(),
                       this->_statistic);
    }
}
|
||||
|
||||
// Destroys all workers that were constructed in the constructor and returns
// their NUMA-local memory to the global heap.
//
// Bug fix: _worker is a fixed-size array of config::max_cores() entries, but
// only the first _count_channels entries hold constructed Worker objects; the
// rest are nullptr. The previous version invoked worker->~Worker() on every
// entry, which is undefined behavior for the nullptr slots.
Scheduler::~Scheduler() noexcept
{
    for (auto *worker : this->_worker)
    {
        if (worker != nullptr)
        {
            worker->~Worker();
            memory::GlobalHeap::free(worker, sizeof(Worker));
        }
    }
}
|
||||
|
||||
// Spawns one thread per worker (plus one for the epoch manager when memory
// reclamation is compiled in), releases them via the _is_running flag, and
// blocks until every thread has joined.
void Scheduler::start_and_wait()
{
    // Create threads for worker...
    // Sized for one thread per channel plus (optionally) the epoch-manager thread.
    std::vector<std::thread> worker_threads(this->_core_set.size() +
                                            static_cast<std::uint16_t>(config::memory_reclamation() != config::None));
    for (auto channel_id = 0U; channel_id < this->_core_set.size(); ++channel_id)
    {
        // The worker spins on _is_running inside execute() and only starts
        // pulling tasks once the flag is flipped below.
        worker_threads[channel_id] = std::thread([this, channel_id] { this->_worker[channel_id]->execute(); });

        // Pin each worker thread to its dedicated core.
        system::thread::pin(worker_threads[channel_id], this->_worker[channel_id]->core_id());
    }

    // ... and epoch management (if enabled).
    if constexpr (config::memory_reclamation() != config::None)
    {
        // In case we enable memory reclamation: Use an additional thread.
        worker_threads[this->_core_set.size()] =
            std::thread([this] { this->_epoch_manager.enter_epoch_periodically(); });
    }

    // Turning the flag on starts all worker threads to execute tasks.
    this->_is_running = true;

    // Wait for the worker threads to end. This will only
    // reached when the _is_running flag is set to false
    // from somewhere in the application.
    for (auto &worker_thread : worker_threads)
    {
        worker_thread.join();
    }

    if constexpr (config::memory_reclamation() != config::None)
    {
        // At this point, no task will execute on any resource;
        // but the epoch manager has joined, too. Therefore,
        // we will reclaim all memory manually.
        this->_epoch_manager.reclaim_all();
    }
}
|
||||
|
||||
// Dispatches a task that was spawned from within a worker channel. The
// decision is driven by the task's annotation: annotated resource first,
// then an explicitly annotated channel, then (unimplemented) NUMA node,
// and finally the local channel as default.
void Scheduler::schedule(TaskInterface &task, const std::uint16_t current_channel_id) noexcept
{
    // Push into the calling channel's unsynchronized queue and count it as on-channel.
    const auto push_local = [this, current_channel_id](TaskInterface *scheduled_task) {
        this->_worker[current_channel_id]->channel().push_back_local(scheduled_task);
        if constexpr (config::task_statistics())
        {
            this->_statistic.increment<profiling::Statistic::ScheduledOnChannel>(current_channel_id);
        }
    };

    // Push into a remote channel's synchronized queue (tagged with the caller's
    // NUMA node) and count it as off-channel.
    const auto push_remote = [this, current_channel_id](TaskInterface *scheduled_task,
                                                        const std::uint16_t target_channel_id) {
        this->_worker[target_channel_id]->channel().push_back_remote(scheduled_task,
                                                                    this->numa_node_id(current_channel_id));
        if constexpr (config::task_statistics())
        {
            this->_statistic.increment<profiling::Statistic::ScheduledOffChannel>(current_channel_id);
        }
    };

    // Scheduling is based on the annotated resource of the given task.
    if (task.has_resource_annotated())
    {
        const auto annotated_resource = task.annotated_resource();
        const auto resource_channel_id = annotated_resource.channel_id();

        // For performance reasons, we prefer the local (not synchronized) queue
        // whenever possible. The decision depends on the resource's
        // synchronization primitive and the task's access mode (reader/writer).
        if (Scheduler::keep_task_local(task.is_readonly(), annotated_resource.synchronization_primitive(),
                                       resource_channel_id, current_channel_id))
        {
            push_local(&task);
        }
        else
        {
            push_remote(&task, resource_channel_id);
        }
    }

    // The developer assigned a fixed channel to the task.
    else if (task.has_channel_annotated())
    {
        const auto target_channel_id = task.annotated_channel();

        if (target_channel_id == current_channel_id)
        {
            push_local(&task);
        }
        else
        {
            push_remote(&task, target_channel_id);
        }
    }

    // The developer assigned a fixed NUMA region to the task.
    else if (task.has_node_annotated())
    {
        // TODO: Select random channel @ node, based on load
        assert(false && "NOT IMPLEMENTED: Task scheduling for node.");
    }

    // The task can run everywhere: keep it on the calling channel.
    else
    {
        push_local(&task);
    }

    if constexpr (config::task_statistics())
    {
        this->_statistic.increment<profiling::Statistic::Scheduled>(current_channel_id);
    }
}
|
||||
|
||||
// Dispatches a task spawned from outside any worker channel. Without a
// calling channel there is no local queue, so every push goes through the
// synchronized remote path, attributed to NUMA node 0.
void Scheduler::schedule(TaskInterface &task) noexcept
{
    const auto push_remote = [this](TaskInterface *scheduled_task, const std::uint16_t target_channel_id) {
        this->_worker[target_channel_id]->channel().push_back_remote(scheduled_task, 0U);
        if constexpr (config::task_statistics())
        {
            this->_statistic.increment<profiling::Statistic::ScheduledOffChannel>(target_channel_id);
        }
    };

    if (task.has_resource_annotated())
    {
        push_remote(&task, task.annotated_resource().channel_id());
    }
    else if (task.has_channel_annotated())
    {
        push_remote(&task, task.annotated_channel());
    }
    else if (task.has_node_annotated())
    {
        // TODO: Select random channel @ node, based on load
        assert(false && "NOT IMPLEMENTED: Task scheduling for node.");
    }
    else
    {
        assert(false && "NOT IMPLEMENTED: Task scheduling without channel.");
    }
}
|
||||
|
||||
void Scheduler::reset() noexcept
|
||||
{
|
||||
this->_statistic.clear();
|
||||
this->_epoch_manager.reset();
|
||||
}
|
||||
|
||||
void Scheduler::profile(const std::string &output_file)
|
||||
{
|
||||
this->_profiler.profile(output_file);
|
||||
for (auto i = 0U; i < this->_count_channels; ++i)
|
||||
{
|
||||
this->_profiler.profile(this->_is_running, this->_worker[i]->channel());
|
||||
}
|
||||
}
|
||||
220
src/mx/tasking/scheduler.h
Normal file
220
src/mx/tasking/scheduler.h
Normal file
@@ -0,0 +1,220 @@
|
||||
#pragma once
|
||||
#include "channel.h"
|
||||
#include "task.h"
|
||||
#include "worker.h"
|
||||
#include <array>
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <mx/memory/config.h>
|
||||
#include <mx/memory/dynamic_size_allocator.h>
|
||||
#include <mx/memory/reclamation/epoch_manager.h>
|
||||
#include <mx/resource/resource.h>
|
||||
#include <mx/tasking/profiling/profiling_task.h>
|
||||
#include <mx/tasking/profiling/statistic.h>
|
||||
#include <mx/util/core_set.h>
|
||||
#include <mx/util/random.h>
|
||||
#include <string>
|
||||
|
||||
namespace mx::tasking {
|
||||
/**
|
||||
* The scheduler is the central (but hidden by the runtime) data structure to spawn
|
||||
* tasks between worker threads.
|
||||
*/
|
||||
class Scheduler
|
||||
{
|
||||
public:
|
||||
Scheduler(const util::core_set &core_set, std::uint16_t prefetch_distance,
|
||||
memory::dynamic::Allocator &resource_allocator) noexcept;
|
||||
~Scheduler() noexcept;
|
||||
|
||||
/**
|
||||
* Schedules a given task.
|
||||
* @param task Task to be scheduled.
|
||||
* @param current_channel_id Channel, the request came from.
|
||||
*/
|
||||
void schedule(TaskInterface &task, std::uint16_t current_channel_id) noexcept;
|
||||
|
||||
/**
|
||||
* Schedules a given task.
|
||||
* @param task Task to be scheduled.
|
||||
*/
|
||||
void schedule(TaskInterface &task) noexcept;
|
||||
|
||||
/**
|
||||
* Starts all worker threads and waits until they finish.
|
||||
*/
|
||||
void start_and_wait();
|
||||
|
||||
/**
|
||||
* Interrupts the worker threads. They will finish after executing
|
||||
* their current tasks.
|
||||
*/
|
||||
void interrupt() noexcept
|
||||
{
|
||||
_is_running = false;
|
||||
this->_profiler.stop();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Core set of this instance.
|
||||
*/
|
||||
[[nodiscard]] const util::core_set &core_set() const noexcept { return _core_set; }
|
||||
|
||||
/**
|
||||
* @return True, when the worker threads are not interrupted.
|
||||
*/
|
||||
[[nodiscard]] bool is_running() const noexcept { return _is_running; }
|
||||
|
||||
/**
|
||||
* @return The global epoch manager.
|
||||
*/
|
||||
[[nodiscard]] memory::reclamation::EpochManager &epoch_manager() noexcept { return _epoch_manager; }
|
||||
|
||||
/**
|
||||
* @return Number of all channels.
|
||||
*/
|
||||
[[nodiscard]] std::uint16_t count_channels() const noexcept { return _count_channels; }
|
||||
|
||||
/**
|
||||
* Reads the NUMA region of a given channel/worker thread.
|
||||
* @param channel_id Channel.
|
||||
* @return NUMA region of the given channel.
|
||||
*/
|
||||
[[nodiscard]] std::uint8_t numa_node_id(const std::uint16_t channel_id) const noexcept
|
||||
{
|
||||
return _channel_numa_node_map[channel_id];
|
||||
}
|
||||
|
||||
/**
|
||||
* Predicts usage for a given channel.
|
||||
* @param channel_id Channel.
|
||||
* @param usage Usage to predict.
|
||||
*/
|
||||
void predict_usage(const std::uint16_t channel_id, const resource::hint::expected_access_frequency usage) noexcept
|
||||
{
|
||||
_worker[channel_id]->channel().predict_usage(usage);
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the predicted usage of a channel.
|
||||
* @param channel_id Channel.
|
||||
* @param old_prediction So far predicted usage.
|
||||
* @param new_prediction New prediction.
|
||||
*/
|
||||
void modify_predicted_usage(const std::uint16_t channel_id,
|
||||
const resource::hint::expected_access_frequency old_prediction,
|
||||
const resource::hint::expected_access_frequency new_prediction) noexcept
|
||||
{
|
||||
_worker[channel_id]->channel().modify_predicted_usage(old_prediction, new_prediction);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param channel_id Channel.
|
||||
* @return True, when a least one usage was predicted to be "excessive" for the given channel.
|
||||
*/
|
||||
[[nodiscard]] bool has_excessive_usage_prediction(const std::uint16_t channel_id) const noexcept
|
||||
{
|
||||
return _worker[channel_id]->channel().has_excessive_usage_prediction();
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the statistics.
|
||||
*/
|
||||
void reset() noexcept;
|
||||
|
||||
/**
|
||||
* Aggregates the counter for all cores.
|
||||
* @param counter Statistic counter.
|
||||
* @return Aggregated value.
|
||||
*/
|
||||
[[nodiscard]] std::uint64_t statistic([[maybe_unused]] const profiling::Statistic::Counter counter) const noexcept
|
||||
{
|
||||
if constexpr (config::task_statistics())
|
||||
{
|
||||
return this->_statistic.get(counter);
|
||||
}
|
||||
else
|
||||
{
|
||||
return 0U;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the statistics for a given counter on a given channel.
|
||||
* @param counter Statistic counter.
|
||||
* @param channel_id Channel.
|
||||
* @return Value of the counter for the given channel.
|
||||
*/
|
||||
[[nodiscard]] std::uint64_t statistic([[maybe_unused]] const profiling::Statistic::Counter counter,
|
||||
[[maybe_unused]] const std::uint16_t channel_id) const noexcept
|
||||
{
|
||||
if constexpr (config::task_statistics())
|
||||
{
|
||||
return this->_statistic.get(counter, channel_id);
|
||||
}
|
||||
else
|
||||
{
|
||||
return 0U;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Starts profiling of idle times and specifies the results file.
|
||||
* @param output_file File to write idle times after stopping MxTasking.
|
||||
*/
|
||||
void profile(const std::string &output_file);
|
||||
|
||||
bool operator==(const util::core_set &cores) const noexcept { return _core_set == cores; }
|
||||
|
||||
bool operator!=(const util::core_set &cores) const noexcept { return _core_set != cores; }
|
||||
|
||||
private:
|
||||
// Cores to run the worker threads on.
|
||||
const util::core_set _core_set;
|
||||
|
||||
// Number of all channels.
|
||||
const std::uint16_t _count_channels;
|
||||
|
||||
// Flag for the worker threads. If false, the worker threads will stop.
|
||||
// This is atomic for hardware that does not guarantee atomic reads/writes of booleans.
|
||||
alignas(64) util::maybe_atomic<bool> _is_running{false};
|
||||
|
||||
// All initialized workers.
|
||||
alignas(64) std::array<Worker *, config::max_cores()> _worker{nullptr};
|
||||
|
||||
// Map of channel id to NUMA region id.
|
||||
alignas(64) std::array<std::uint8_t, config::max_cores()> _channel_numa_node_map{0U};
|
||||
|
||||
// Epoch manager for memory reclamation,
|
||||
alignas(64) memory::reclamation::EpochManager _epoch_manager;
|
||||
|
||||
// Profiler for task statistics.
|
||||
profiling::Statistic _statistic;
|
||||
|
||||
// Profiler for idle times.
|
||||
profiling::Profiler _profiler{};
|
||||
|
||||
/**
|
||||
* Make a decision whether a task should be scheduled to the local
|
||||
* channel or a remote.
|
||||
*
|
||||
* @param is_readonly Access mode of the task.
|
||||
* @param primitive The synchronization primitive of the task annotated resource.
|
||||
* @param resource_channel_id Channel id of the task annotated resource.
|
||||
* @param current_channel_id Channel id where the spawn() operation is called.
|
||||
* @return True, if the task should be scheduled local.
|
||||
*/
|
||||
[[nodiscard]] static inline bool keep_task_local(const bool is_readonly, const synchronization::primitive primitive,
|
||||
const std::uint16_t resource_channel_id,
|
||||
const std::uint16_t current_channel_id)
|
||||
{
|
||||
return (resource_channel_id == current_channel_id) ||
|
||||
(is_readonly && primitive != synchronization::primitive::ScheduleAll) ||
|
||||
(primitive != synchronization::primitive::None && primitive != synchronization::primitive::ScheduleAll &&
|
||||
primitive != synchronization::primitive::ScheduleWriter);
|
||||
}
|
||||
};
|
||||
} // namespace mx::tasking
|
||||
208
src/mx/tasking/task.h
Normal file
208
src/mx/tasking/task.h
Normal file
@@ -0,0 +1,208 @@
|
||||
#pragma once
|
||||
|
||||
#include "config.h"
|
||||
#include "task_stack.h"
|
||||
#include <bitset>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <mx/resource/resource.h>
|
||||
#include <variant>
|
||||
|
||||
namespace mx::tasking {
|
||||
// Run priority a task can be annotated with; stored in a single byte to keep
// the packed annotation small.
enum priority : std::uint8_t
{
    // Executed with lower precedence.
    low = 0,

    // Default precedence.
    normal = 1
};
|
||||
|
||||
class TaskInterface;

/**
 * Value returned by TaskInterface::execute(): combines an optional successor
 * task (spawned next) with a flag telling the runtime to free the finished
 * task. Instances are created through the named factory functions.
 */
class TaskResult
{
public:
    /// Continue with the given successor; keep the finished task alive.
    static TaskResult make_succeed(TaskInterface *successor_task) noexcept { return TaskResult{successor_task, false}; }

    /// No successor; free the finished task.
    static TaskResult make_remove() noexcept { return TaskResult{nullptr, true}; }

    /// Continue with the given successor AND free the finished task.
    static TaskResult make_succeed_and_remove(TaskInterface *successor_task) noexcept
    {
        return TaskResult{successor_task, true};
    }

    /// No successor, no removal.
    static TaskResult make_null() noexcept { return TaskResult{nullptr, false}; }

    constexpr TaskResult() = default;
    ~TaskResult() = default;

    TaskResult &operator=(const TaskResult &) = default;

    /// @return The successor task (nullptr if none).
    explicit operator TaskInterface *() const noexcept { return _successor_task; }

    /// @return True, when the finished task should be freed.
    [[nodiscard]] bool is_remove() const noexcept { return _remove_task; }

    /// @return True, when a successor task is attached.
    [[nodiscard]] bool has_successor() const noexcept { return _successor_task != nullptr; }

private:
    constexpr TaskResult(TaskInterface *successor_task, const bool remove) noexcept
        : _successor_task(successor_task), _remove_task(remove)
    {
    }

    // Task to be spawned after this one finished (nullptr if none).
    TaskInterface *_successor_task = nullptr;

    // Should the finished task be freed by the runtime?
    bool _remove_task = false;
};
|
||||
|
||||
/**
|
||||
* The task is the central execution unit of mxtasking.
|
||||
* Every task that should be executed has to derive
|
||||
* from this class.
|
||||
*/
|
||||
class TaskInterface
|
||||
{
|
||||
public:
|
||||
using channel = std::uint16_t;
|
||||
using node = std::uint8_t;
|
||||
using resource_and_size = std::pair<mx::resource::ptr, std::uint16_t>;
|
||||
|
||||
constexpr TaskInterface() = default;
|
||||
virtual ~TaskInterface() = default;
|
||||
|
||||
/**
|
||||
* Will be executed by a worker when the task gets CPU time.
|
||||
*
|
||||
* @param core_id (System-)ID of the core, the task is executed on.
|
||||
* @param channel_id Channel ID the task is executed on.
|
||||
* @return Pointer to the follow up task.
|
||||
*/
|
||||
virtual TaskResult execute(std::uint16_t core_id, std::uint16_t channel_id) = 0;
|
||||
|
||||
/**
|
||||
* Annotate the task with a resource the task will work on.
|
||||
*
|
||||
* @param resource Pointer to the resource.
|
||||
* @param size Size of the resource (that will be prefetched).
|
||||
*/
|
||||
void annotate(const mx::resource::ptr resource_, const std::uint16_t size) noexcept
|
||||
{
|
||||
_annotation.target = std::make_pair(resource_, size);
|
||||
}
|
||||
|
||||
/**
|
||||
* Annotate the task with a desired channel the task should be executed on.
|
||||
*
|
||||
* @param channel_id ID of the channel.
|
||||
*/
|
||||
void annotate(const channel channel_id) noexcept { _annotation.target = channel_id; }
|
||||
|
||||
/**
|
||||
* Annotate the task with a desired NUMA node id the task should executed on.
|
||||
*
|
||||
* @param node_id ID of the NUMA node.
|
||||
*/
|
||||
void annotate(const node node_id) noexcept { _annotation.target = node_id; }
|
||||
|
||||
/**
|
||||
* Annotate the task with a run priority (low, normal, high).
|
||||
*
|
||||
* @param priority_ Priority the task should run with.
|
||||
*/
|
||||
void annotate(const priority priority_) noexcept { _annotation.priority = priority_; }
|
||||
|
||||
/**
|
||||
* Annotate the task whether it is a reading or writing task.
|
||||
*
|
||||
* @param is_readonly True, when the task is read only (false by default).
|
||||
*/
|
||||
void is_readonly(const bool is_readonly) noexcept { _annotation.is_readonly = is_readonly; }
|
||||
|
||||
/**
|
||||
* @return The annotated resource.
|
||||
*/
|
||||
[[nodiscard]] mx::resource::ptr annotated_resource() const noexcept
|
||||
{
|
||||
return std::get<0>(std::get<resource_and_size>(_annotation.target));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The annotated resource size.
|
||||
*/
|
||||
[[nodiscard]] std::uint16_t annotated_resource_size() const noexcept
|
||||
{
|
||||
return std::get<1>(std::get<resource_and_size>(_annotation.target));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The annotated channel.
|
||||
*/
|
||||
[[nodiscard]] channel annotated_channel() const noexcept { return std::get<channel>(_annotation.target); }
|
||||
|
||||
/**
|
||||
* @return The annotated NUMA node id.
|
||||
*/
|
||||
[[nodiscard]] node annotated_node() const noexcept { return std::get<node>(_annotation.target); }
|
||||
|
||||
/**
|
||||
* @return Annotated priority.
|
||||
*/
|
||||
[[nodiscard]] enum priority priority() const noexcept { return _annotation.priority; }
|
||||
|
||||
/**
|
||||
* @return True, when the task is a read only task.
|
||||
*/
|
||||
[[nodiscard]] bool is_readonly() const noexcept { return _annotation.is_readonly; }
|
||||
|
||||
/**
|
||||
* @return True, when the task has a resource annotated.
|
||||
*/
|
||||
[[nodiscard]] bool has_resource_annotated() const noexcept
|
||||
{
|
||||
return std::holds_alternative<resource_and_size>(_annotation.target);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return True, when the task has a channel annotated.
|
||||
*/
|
||||
[[nodiscard]] bool has_channel_annotated() const noexcept
|
||||
{
|
||||
return std::holds_alternative<channel>(_annotation.target);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return True, when the task has a NUMA node annotated.
|
||||
*/
|
||||
[[nodiscard]] bool has_node_annotated() const noexcept { return std::holds_alternative<node>(_annotation.target); }
|
||||
|
||||
/**
|
||||
* @return Pointer to the next task in spawn queue.
|
||||
*/
|
||||
[[nodiscard]] TaskInterface *next() const noexcept { return _next; }
|
||||
|
||||
/**
|
||||
* Set the next task for scheduling.
|
||||
* @param next Task scheduled after this task.
|
||||
*/
|
||||
void next(TaskInterface *next) noexcept { _next = next; }
|
||||
|
||||
private:
|
||||
/**
|
||||
* Annotation of a task.
|
||||
*/
|
||||
class annotation
|
||||
{
|
||||
public:
|
||||
constexpr annotation() noexcept = default;
|
||||
~annotation() = default;
|
||||
|
||||
// Is the task just reading?
|
||||
bool is_readonly{false};
|
||||
|
||||
// Priority of a task.
|
||||
enum priority priority
|
||||
{
|
||||
priority::normal
|
||||
};
|
||||
|
||||
// Target the task will run on.
|
||||
std::variant<channel, node, resource_and_size, bool> target{false};
|
||||
} __attribute__((packed));
|
||||
|
||||
// Pointer for next task in queue.
|
||||
TaskInterface *_next{nullptr};
|
||||
|
||||
// Tasks annotations.
|
||||
annotation _annotation;
|
||||
};
|
||||
} // namespace mx::tasking
|
||||
163
src/mx/tasking/task_buffer.h
Normal file
163
src/mx/tasking/task_buffer.h
Normal file
@@ -0,0 +1,163 @@
|
||||
#pragma once
|
||||
#include "load.h"
|
||||
#include "prefetch_slot.h"
|
||||
#include "task.h"
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include <mx/system/cache.h>
|
||||
#include <utility>
|
||||
|
||||
namespace mx::tasking {
|
||||
/**
|
||||
* The task buffer holds tasks that are ready to execute.
|
||||
* The buffer is realized as a ring buffer with a fixed size.
|
||||
* All empty slots are null pointers.
|
||||
*/
|
||||
template <std::size_t S> class TaskBuffer
|
||||
{
|
||||
private:
|
||||
class Slot
|
||||
{
|
||||
public:
|
||||
constexpr Slot() noexcept = default;
|
||||
~Slot() noexcept = default;
|
||||
|
||||
void task(TaskInterface *task) noexcept { _task = task; }
|
||||
[[nodiscard]] TaskInterface *consume_task() noexcept { return std::exchange(_task, nullptr); }
|
||||
|
||||
void prefetch() noexcept { _prefetch_slot(); }
|
||||
void prefetch(TaskInterface *task) noexcept { _prefetch_slot = task; }
|
||||
|
||||
bool operator==(std::nullptr_t) const noexcept { return _task == nullptr; }
|
||||
bool operator!=(std::nullptr_t) const noexcept { return _task != nullptr; }
|
||||
|
||||
private:
|
||||
TaskInterface *_task{nullptr};
|
||||
PrefetchSlot _prefetch_slot{};
|
||||
};
|
||||
|
||||
public:
|
||||
constexpr explicit TaskBuffer(const std::uint8_t prefetch_distance) noexcept : _prefetch_distance(prefetch_distance)
|
||||
{
|
||||
}
|
||||
~TaskBuffer() noexcept = default;
|
||||
|
||||
/**
|
||||
* @return True, when the buffer is empty.
|
||||
*/
|
||||
[[nodiscard]] bool empty() const noexcept { return _buffer[_head] == nullptr; }
|
||||
|
||||
/**
|
||||
* @return Number of tasks in the buffer.
|
||||
*/
|
||||
[[nodiscard]] std::uint16_t size() const noexcept
|
||||
{
|
||||
return _tail >= _head ? (_tail - _head) : (S - (_head - _tail));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Number of maximal tasks of the buffer.
|
||||
*/
|
||||
constexpr auto max_size() const noexcept { return S; }
|
||||
|
||||
/**
|
||||
* @return Number of free slots.
|
||||
*/
|
||||
[[nodiscard]] std::uint16_t available_slots() const noexcept { return S - size(); }
|
||||
|
||||
/**
|
||||
* @return The next task in the buffer; the slot will be available after.
|
||||
*/
|
||||
TaskInterface *next() noexcept;
|
||||
|
||||
/**
|
||||
* Takes out tasks from the given queue and inserts them into the buffer.
|
||||
* @param from_queue Queue to take tasks from.
|
||||
* @param count Number of maximal tasks to take out of the queue.
|
||||
* @return Number of retrieved tasks.
|
||||
*/
|
||||
template <class Q> std::uint16_t fill(Q &from_queue, std::uint16_t count) noexcept;
|
||||
|
||||
private:
|
||||
// Prefetch distance.
|
||||
const std::uint8_t _prefetch_distance;
|
||||
|
||||
// Index of the first element in the buffer.
|
||||
std::uint16_t _head{0U};
|
||||
|
||||
// Index of the last element in the buffer.
|
||||
std::uint16_t _tail{0U};
|
||||
|
||||
// Array with task-slots.
|
||||
std::array<Slot, S> _buffer{};
|
||||
|
||||
/**
|
||||
* Normalizes the index with respect to the size.
|
||||
* @param index Index.
|
||||
* @return Normalized index.
|
||||
*/
|
||||
static std::uint16_t normalize(const std::uint16_t index) noexcept { return index & (S - 1U); }
|
||||
|
||||
/**
|
||||
* Normalizes the index backwards with respect to the given offset.
|
||||
* @param index Index.
|
||||
* @param offset Offset to index.
|
||||
* @return Normalized index.
|
||||
*/
|
||||
static std::uint16_t normalize_backward(const std::uint16_t index, const std::uint16_t offset) noexcept
|
||||
{
|
||||
return index >= offset ? index - offset : S - (offset - index);
|
||||
}
|
||||
};
|
||||
|
||||
// Pops the task at the head of the ring (or nullptr when the buffer is
// empty). Fires the head slot's scheduled prefetch before handing the
// task out.
template <std::size_t S> TaskInterface *TaskBuffer<S>::next() noexcept
{
    auto &head_slot = this->_buffer[this->_head];
    if (head_slot == nullptr)
    {
        return nullptr;
    }

    head_slot.prefetch();
    this->_head = TaskBuffer<S>::normalize(this->_head + 1U);
    return head_slot.consume_task();
}
|
||||
|
||||
// Drains up to `count` tasks from the given queue into the ring and schedules
// prefetches `_prefetch_distance` slots ahead of consumption.
template <std::size_t S>
template <class Q>
std::uint16_t TaskBuffer<S>::fill(Q &from_queue, const std::uint16_t count) noexcept
{
    if (count == 0U || from_queue.empty())
    {
        return 0U;
    }

    // Number of tasks already buffered — assumes the caller passes
    // count == available_slots(); TODO confirm at call sites.
    const auto size = S - count;
    const auto is_prefetching = this->_prefetch_distance > 0U;
    // Slot trailing _tail by the prefetch distance: the task stored at _tail
    // is prefetched when that trailing slot's task is consumed.
    auto prefetch_tail = TaskBuffer<S>::normalize_backward(this->_tail, this->_prefetch_distance);

    for (auto i = 0U; i < count; ++i)
    {
        auto *task = static_cast<TaskInterface *>(from_queue.pop_front());
        if (task == nullptr)
        {
            // Queue drained early: report the number of tasks we actually got.
            return i;
        }

        // Schedule prefetch instruction <prefetch_distance> slots before.
        // Only once at least prefetch_distance tasks precede the new one.
        if (is_prefetching && (size + i) >= this->_prefetch_distance)
        {
            this->_buffer[prefetch_tail].prefetch(task);
        }

        // Schedule task.
        this->_buffer[this->_tail].task(task);

        // Increment tail.
        this->_tail = TaskBuffer<S>::normalize(this->_tail + 1U);
        prefetch_tail = TaskBuffer<S>::normalize(prefetch_tail + 1U);
    }

    return count;
}
|
||||
} // namespace mx::tasking
|
||||
162
src/mx/tasking/task_stack.h
Normal file
162
src/mx/tasking/task_stack.h
Normal file
@@ -0,0 +1,162 @@
|
||||
#pragma once
|
||||
|
||||
#include "config.h"
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
#include <cstring>
|
||||
#include <mx/system/environment.h>
|
||||
#ifdef USE_SSE2
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
namespace mx::tasking {
|
||||
/**
|
||||
* Stack to save/restore tasks before/after optimistic synchronization.
|
||||
* In case of failed read, the task will be restored to re-run.
|
||||
*/
|
||||
class TaskInterface;
|
||||
class TaskStack
|
||||
{
|
||||
public:
|
||||
constexpr TaskStack() : _data({}) { _data.fill(std::byte{'\0'}); }
|
||||
~TaskStack() = default;
|
||||
|
||||
/**
 * Saves the full task image on the stack.
 * Chooses the fastest available copy routine for the compile-time task size:
 * SSE2 block copy, unrolled 64-bit word copy, or generic memcpy.
 * @param task Task to save.
 */
void save(const TaskInterface *task) noexcept
{
    if constexpr (system::Environment::is_sse2() && (config::task_size() == 64U || config::task_size() == 128U))
    {
        // SSE2 available and the size is a multiple of 16 bytes: 128-bit copy.
        TaskStack::memcpy_simd<config::task_size()>(_data.data(), static_cast<const void *>(task));
    }
    else if constexpr (config::task_size() == 64U || config::task_size() == 128U)
    {
        // Known small size without SSE2: unrolled 64-bit word copy.
        TaskStack::memcpy_tiny<config::task_size()>(_data.data(), static_cast<const void *>(task));
    }
    else
    {
        // Generic fallback for arbitrary task sizes.
        std::memcpy(_data.data(), static_cast<const void *>(task), config::task_size());
    }
}
|
||||
|
||||
/**
 * Restores the full task image from the stack (inverse of save()).
 * Uses the same size-dependent copy-routine selection as save().
 * @param task Task to restore into.
 */
void restore(TaskInterface *task) const noexcept
{
    if constexpr (system::Environment::is_sse2() && (config::task_size() == 64U || config::task_size() == 128U))
    {
        // SSE2 available and the size is a multiple of 16 bytes: 128-bit copy.
        TaskStack::memcpy_simd<config::task_size()>(static_cast<void *>(task), _data.data());
    }
    else if constexpr (config::task_size() == 64U || config::task_size() == 128U)
    {
        // Known small size without SSE2: unrolled 64-bit word copy.
        TaskStack::memcpy_tiny<config::task_size()>(static_cast<void *>(task), _data.data());
    }
    else
    {
        // Generic fallback for arbitrary task sizes.
        std::memcpy(static_cast<void *>(task), _data.data(), config::task_size());
    }
}
|
||||
|
||||
/**
 * Saves some data on the stack.
 *
 * @param index Byte offset into the stack where to store.
 * @param data Data to store.
 */
template <typename T> void store(const std::uint16_t index, const T &data)
{
    // NOTE(review): writing through a reinterpret_cast into the byte buffer
    // assumes T is trivially copyable and that `index` is suitably aligned
    // for T — confirm at call sites.
    *reinterpret_cast<T *>(&_data[index]) = data;
}
|
||||
|
||||
/**
 * Restores some data from the stack.
 *
 * @param index Byte offset where the data was stored via store().
 * @return Pointer into the stack buffer, reinterpreted as T.
 */
template <typename T> const T *read(const std::uint16_t index) const
{
    // NOTE(review): the returned pointer aliases the internal byte buffer and
    // is only valid as long as this TaskStack lives and is not overwritten;
    // alignment of `index` for T is assumed — confirm at call sites.
    return reinterpret_cast<const T *>(&_data[index]);
}
|
||||
|
||||
private:
|
||||
// Data to store tasks or single data on the stack.
|
||||
std::array<std::byte, config::task_size()> _data;
|
||||
|
||||
/**
 * Copies S bytes (S == 64 or 128) via unaligned 128-bit SSE2 loads/stores.
 * When USE_SSE2 is not defined this compiles to a no-op; callers guard the
 * call with system::Environment::is_sse2(), so the empty body is presumably
 * never reached at runtime — verify that guard stays in sync.
 */
template <std::size_t S>
static inline void memcpy_simd([[maybe_unused]] void *destination, [[maybe_unused]] const void *src)
{
#ifdef USE_SSE2
    if constexpr (S == 64U)
    {
        // Issue all loads before the stores so the compiler can schedule them
        // without aliasing stalls.
        __m128i m0 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 0U);
        __m128i m1 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 1U);
        __m128i m2 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 2U);
        __m128i m3 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 3U);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 0U, m0);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 1U, m1);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 2U, m2);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 3U, m3);
    }
    else if constexpr (S == 128U)
    {
        __m128i m0 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 0U);
        __m128i m1 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 1U);
        __m128i m2 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 2U);
        __m128i m3 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 3U);
        __m128i m4 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 4U);
        __m128i m5 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 5U);
        __m128i m6 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 6U);
        __m128i m7 = _mm_loadu_si128(static_cast<const __m128i *>(src) + 7U);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 0U, m0);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 1U, m1);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 2U, m2);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 3U, m3);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 4U, m4);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 5U, m5);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 6U, m6);
        _mm_storeu_si128(static_cast<__m128i *>(destination) + 7U, m7);
    }
#endif
}
|
||||
|
||||
template <std::size_t S>
static inline void memcpy_tiny([[maybe_unused]] void *destination, [[maybe_unused]] const void *src)
{
    // Scalar fallback copy of S bytes, performed as 8-byte word assignments.
    // Only the sizes 64 and 128 are supported; any other S is a silent no-op.
    // NOTE(review): reading/writing through std::int64_t* relies on the
    // buffers actually holding 8-byte-accessible data (strict aliasing /
    // alignment) -- confirm callers only pass suitably aligned node buffers.
    if constexpr (S == 64U || S == 128U)
    {
        auto *out = static_cast<std::int64_t *>(destination);
        const auto *in = static_cast<const std::int64_t *>(src);

        // 8 words for S == 64, 16 words for S == 128; the trip count is a
        // compile-time constant, so the compiler fully unrolls this loop.
        constexpr auto word_count = S / sizeof(std::int64_t);
        for (std::size_t word = 0U; word < word_count; ++word)
        {
            out[word] = in[word];
        }
    }
}
|
||||
};
|
||||
} // namespace mx::tasking
|
||||
231
src/mx/tasking/worker.cpp
Normal file
231
src/mx/tasking/worker.cpp
Normal file
@@ -0,0 +1,231 @@
|
||||
#include "worker.h"
|
||||
#include "config.h"
|
||||
#include "runtime.h"
|
||||
#include "task.h"
|
||||
#include <cassert>
|
||||
#include <mx/system/builtin.h>
|
||||
#include <mx/system/topology.h>
|
||||
#include <mx/util/random.h>
|
||||
|
||||
using namespace mx::tasking;
|
||||
|
||||
// Initializes the worker: the channel is bound to the given id and NUMA node
// and keeps the configured prefetch distance; references to the running flag,
// epochs, and statistics are stored for use by execute(). No thread is
// started here -- the owner is expected to run execute() on a pinned thread.
Worker::Worker(const std::uint16_t id, const std::uint16_t target_core_id, const std::uint16_t target_numa_node_id,
               const util::maybe_atomic<bool> &is_running, const std::uint16_t prefetch_distance,
               memory::reclamation::LocalEpoch &local_epoch,
               const std::atomic<memory::reclamation::epoch_t> &global_epoch, profiling::Statistic &statistic) noexcept
    : _target_core_id(target_core_id), _prefetch_distance(prefetch_distance),
      _channel(id, target_numa_node_id, prefetch_distance), _local_epoch(local_epoch), _global_epoch(global_epoch),
      _statistic(statistic), _is_running(is_running)
{
}
|
||||
|
||||
// Main worker loop: spins until the global running flag becomes true, then
// repeatedly fills the local task buffer from the channel and executes tasks
// with the synchronization scheme selected per task, until the flag is reset.
void Worker::execute()
{
    // Busy-wait (with pause hint) until the runtime signals start, so that all
    // workers begin executing tasks at roughly the same time.
    while (this->_is_running == false)
    {
        system::builtin::pause();
    }

    TaskInterface *task;
    const auto core_id = system::topology::core_id();
    assert(this->_target_core_id == core_id && "Worker not pinned to correct core.");
    const auto channel_id = this->_channel.id();

    while (this->_is_running)
    {
        if constexpr (config::memory_reclamation() == config::UpdateEpochPeriodically)
        {
            this->_local_epoch.enter(this->_global_epoch);
        }

        // Pull a batch of tasks from the channel into the local buffer.
        this->_channel_size = this->_channel.fill();

        if constexpr (config::task_statistics())
        {
            this->_statistic.increment<profiling::Statistic::Fill>(channel_id);
        }

        while ((task = this->_channel.next()) != nullptr)
        {
            // Whenever the worker-local task-buffer falls under
            // the prefetch distance, we re-fill the buffer to avoid
            // empty slots in the prefetch-buffer.
            if (--this->_channel_size <= this->_prefetch_distance)
            {
                if constexpr (config::memory_reclamation() == config::UpdateEpochPeriodically)
                {
                    this->_local_epoch.enter(this->_global_epoch);
                }

                this->_channel_size = this->_channel.fill();
                if constexpr (config::task_statistics())
                {
                    this->_statistic.increment<profiling::Statistic::Fill>(channel_id);
                }
            }

            // Bookkeeping (compiled out unless statistics are enabled):
            // count every execution, split by reader/writer for annotated tasks.
            if constexpr (config::task_statistics())
            {
                this->_statistic.increment<profiling::Statistic::Executed>(channel_id);
                if (task->has_resource_annotated())
                {
                    if (task->is_readonly())
                    {
                        this->_statistic.increment<profiling::Statistic::ExecutedReader>(channel_id);
                    }
                    else
                    {
                        this->_statistic.increment<profiling::Statistic::ExecutedWriter>(channel_id);
                    }
                }
            }

            // Based on the annotated resource and its synchronization
            // primitive, we choose the fitting execution context.
            auto result = TaskResult{};
            switch (Worker::synchronization_primitive(task))
            {
            case synchronization::primitive::ScheduleWriter:
                result = this->execute_optimistic(core_id, channel_id, task);
                break;
            case synchronization::primitive::OLFIT:
                result = this->execute_olfit(core_id, channel_id, task);
                break;
            case synchronization::primitive::ScheduleAll:
            case synchronization::primitive::None:
                // No synchronization required; run the task directly.
                result = task->execute(core_id, channel_id);
                break;
            case synchronization::primitive::ReaderWriterLatch:
                result = Worker::execute_reader_writer_latched(core_id, channel_id, task);
                break;
            case synchronization::primitive::ExclusiveLatch:
                result = Worker::execute_exclusive_latched(core_id, channel_id, task);
                break;
            }

            // The task-chain may be finished at time the
            // task has no successor. Otherwise, we spawn
            // the successor task.
            if (result.has_successor())
            {
                runtime::spawn(*static_cast<TaskInterface *>(result), channel_id);
            }

            // A task may request its own deletion after execution.
            if (result.is_remove())
            {
                runtime::delete_task(core_id, task);
            }
        }
    }
}
|
||||
|
||||
// Executes the task while holding an exclusive latch on its annotated
// resource; the RAII guard releases the latch when the scope is left.
TaskResult Worker::execute_exclusive_latched(const std::uint16_t core_id, const std::uint16_t channel_id,
                                             mx::tasking::TaskInterface *const task)
{
    auto *latched_resource = resource::ptr_cast<resource::ResourceInterface>(task->annotated_resource());

    resource::ResourceInterface::scoped_exclusive_latch latch_guard{latched_resource};
    return task->execute(core_id, channel_id);
}
|
||||
|
||||
// Executes the task under a reader/writer latch on its annotated resource:
// read-only tasks take the shared ("read") side, all others the exclusive
// ("write") side. The RAII guards release the latch on scope exit.
TaskResult Worker::execute_reader_writer_latched(const std::uint16_t core_id, const std::uint16_t channel_id,
                                                 mx::tasking::TaskInterface *const task)
{
    auto *latched_resource = resource::ptr_cast<resource::ResourceInterface>(task->annotated_resource());

    if (task->is_readonly())
    {
        // Readers share the latch in "read-only" mode.
        resource::ResourceInterface::scoped_rw_latch<false> read_latch{latched_resource};
        return task->execute(core_id, channel_id);
    }
    else
    {
        // Writers need the latch exclusively.
        resource::ResourceInterface::scoped_rw_latch<true> write_latch{latched_resource};
        return task->execute(core_id, channel_id);
    }
}
|
||||
|
||||
// Executes a task on an optimistically versioned resource for the
// "ScheduleWriter" scheme: writers are serialized on the resource's channel,
// readers validate via the resource version only when running elsewhere.
TaskResult Worker::execute_optimistic(const std::uint16_t core_id, const std::uint16_t channel_id,
                                      mx::tasking::TaskInterface *const task)
{
    auto *optimistic_resource = resource::ptr_cast<resource::ResourceInterface>(task->annotated_resource());

    if (task->is_readonly() == false)
    {
        // Writers acquire the version to tell readers that the resource is
        // being modified: the version is made odd before writing and even
        // afterwards. A simple fetch_add suffices here because all writers
        // are serialized on the resource's channel.
        resource::ResourceInterface::scoped_optimistic_latch version_guard{optimistic_resource};
        return task->execute(core_id, channel_id);
    }

    // Readers running on a different channel than the writers must validate
    // the resource's version. That implies saving the task's state on a stack
    // and re-running the task whenever the version check fails.
    if (task->annotated_resource().channel_id() != channel_id)
    {
        return this->execute_optimistic_read(core_id, channel_id, optimistic_resource, task);
    }

    // A reader on the writers' own channel needs no synchronization at all:
    // no concurrent write can happen on the same channel.
    return task->execute(core_id, channel_id);
}
|
||||
|
||||
// Executes a task under the OLFIT scheme: readers always validate
// optimistically; writers flag their modification via the resource version.
TaskResult Worker::execute_olfit(const std::uint16_t core_id, const std::uint16_t channel_id, TaskInterface *const task)
{
    auto *optimistic_resource = resource::ptr_cast<resource::ResourceInterface>(task->annotated_resource());

    if (task->is_readonly() == false)
    {
        // Writers acquire the version to tell readers that the resource is
        // being modified (odd while writing, even afterwards). Unlike the
        // "ScheduleWriter" path, writers may appear on every channel, so the
        // latch has to use compare-exchange internally.
        resource::ResourceInterface::scoped_olfit_latch version_guard{optimistic_resource};
        return task->execute(core_id, channel_id);
    }

    // Readers validate the resource version and retry on conflict.
    return this->execute_optimistic_read(core_id, channel_id, optimistic_resource, task);
}
|
||||
|
||||
// Runs a read-only task optimistically: the task state is snapshotted, the
// task executed, and the resource version validated afterwards. On a version
// mismatch the task state is restored and the read is retried until it
// observes a stable version.
TaskResult Worker::execute_optimistic_read(const std::uint16_t core_id, const std::uint16_t channel_id,
                                           resource::ResourceInterface *optimistic_resource, TaskInterface *const task)
{
    // Enter the epoch before touching the resource so concurrent reclamation
    // cannot free memory this read may still dereference.
    if constexpr (config::memory_reclamation() == config::UpdateEpochOnRead)
    {
        this->_local_epoch.enter(this->_global_epoch);
    }

    // The current state of the task is saved for
    // restoring if the read operation failed, but
    // the task was maybe modified.
    this->_task_stack.save(task);

    do
    {
        // Read the version BEFORE executing; validation after execution only
        // succeeds if no writer intervened in between.
        const auto version = optimistic_resource->version();
        const auto result = task->execute(core_id, channel_id);

        if (optimistic_resource->is_version_valid(version))
        {
            // Leave the epoch only on the successful path; retries stay
            // inside the epoch to keep the resource protected.
            if constexpr (config::memory_reclamation() == config::UpdateEpochOnRead)
            {
                this->_local_epoch.leave();
            }
            return result;
        }

        // At this point, the version check failed and we need
        // to re-run the read operation.
        this->_task_stack.restore(task);
    } while (true);
}
|
||||
130
src/mx/tasking/worker.h
Normal file
130
src/mx/tasking/worker.h
Normal file
@@ -0,0 +1,130 @@
|
||||
#pragma once
|
||||
|
||||
#include "channel.h"
|
||||
#include "config.h"
|
||||
#include "profiling/statistic.h"
|
||||
#include "task.h"
|
||||
#include "task_stack.h"
|
||||
#include <atomic>
|
||||
#include <cstddef>
|
||||
#include <memory>
|
||||
#include <mx/memory/reclamation/epoch_manager.h>
|
||||
#include <mx/util/maybe_atomic.h>
|
||||
#include <variant>
|
||||
#include <vector>
|
||||
|
||||
namespace mx::tasking {
|
||||
/**
 * A worker executes tasks from its own channel until the shared "running"
 * flag becomes false. Each worker is expected to be pinned to one logical
 * core; the class is cache-line aligned to avoid false sharing between
 * workers placed in an array.
 */
class alignas(64) Worker
{
public:
    Worker(std::uint16_t id, std::uint16_t target_core_id, std::uint16_t target_numa_node_id,
           const util::maybe_atomic<bool> &is_running, std::uint16_t prefetch_distance,
           memory::reclamation::LocalEpoch &local_epoch, const std::atomic<memory::reclamation::epoch_t> &global_epoch,
           profiling::Statistic &statistic) noexcept;

    ~Worker() noexcept = default;

    /**
     * Starts the worker loop (typically called in the worker's own thread);
     * returns only after the "running" flag was reset.
     */
    void execute();

    /**
     * @return Id of the logical core this worker runs on.
     */
    [[nodiscard]] std::uint16_t core_id() const noexcept { return _target_core_id; }

    /**
     * @return The channel tasks are dispatched to for this worker.
     */
    [[nodiscard]] Channel &channel() noexcept { return _channel; }
    [[nodiscard]] const Channel &channel() const noexcept { return _channel; }

private:
    // Id of the logical core this worker is pinned to.
    const std::uint16_t _target_core_id;

    // Distance of prefetching tasks; execute() re-fills the local buffer
    // whenever fewer tasks than this remain.
    const std::uint16_t _prefetch_distance;

    // Number of tasks currently buffered from the channel (updated by execute()).
    std::int32_t _channel_size{0U};

    // Stack for persisting tasks in optimistic execution. Optimistically
    // executed tasks may fail and be restored after execution.
    // alignas(64): placed on its own cache line.
    alignas(64) TaskStack _task_stack;

    // Channel where tasks are stored for execution (own cache line as well,
    // since other cores write into it).
    alignas(64) Channel _channel;

    // Local epoch of this worker (memory reclamation).
    memory::reclamation::LocalEpoch &_local_epoch;

    // Global epoch the local epoch is synchronized against.
    const std::atomic<memory::reclamation::epoch_t> &_global_epoch;

    // Statistics container (only used when task statistics are enabled).
    profiling::Statistic &_statistic;

    // Flag for "running" state of MxTasking; shared by all workers.
    const util::maybe_atomic<bool> &_is_running;

    /**
     * Analyzes the given task and chooses the execution method regarding synchronization.
     * @param task Task to be executed.
     * @return Synchronization method; None for tasks without an annotated resource.
     */
    static synchronization::primitive synchronization_primitive(TaskInterface *task) noexcept
    {
        return task->has_resource_annotated() ? task->annotated_resource().synchronization_primitive()
                                              : synchronization::primitive::None;
    }

    /**
     * Executes a task while holding an exclusive latch on its resource.
     * @param core_id Id of the core.
     * @param channel_id Id of the channel.
     * @param task Task to be executed.
     * @return Task to be scheduled after execution.
     */
    static TaskResult execute_exclusive_latched(std::uint16_t core_id, std::uint16_t channel_id, TaskInterface *task);

    /**
     * Executes a task with a reader/writer latch (shared for readers,
     * exclusive for writers).
     * @param core_id Id of the core.
     * @param channel_id Id of the channel.
     * @param task Task to be executed.
     * @return Task to be scheduled after execution.
     */
    static TaskResult execute_reader_writer_latched(std::uint16_t core_id, std::uint16_t channel_id,
                                                    TaskInterface *task);

    /**
     * Executes the task optimistically ("ScheduleWriter" scheme: writers
     * serialized on the resource's channel).
     * @param core_id Id of the core.
     * @param channel_id Id of the channel.
     * @param task Task to be executed.
     * @return Task to be scheduled after execution.
     */
    TaskResult execute_optimistic(std::uint16_t core_id, std::uint16_t channel_id, TaskInterface *task);

    /**
     * Executes the task using the OLFIT protocol (writers may appear on any
     * channel; version acquired via compare-exchange).
     * @param core_id Id of the core.
     * @param channel_id Id of the channel.
     * @param task Task to be executed.
     * @return Task to be scheduled after execution.
     */
    TaskResult execute_olfit(std::uint16_t core_id, std::uint16_t channel_id, TaskInterface *task);

    /**
     * Executes the read-only task optimistically: snapshots the task state,
     * validates the resource version after execution, and retries on conflict.
     * @param core_id Id of the core.
     * @param channel_id Id of the channel.
     * @param resource Resource the task reads.
     * @param task Task to be executed.
     * @return Task to be scheduled after execution.
     */
    TaskResult execute_optimistic_read(std::uint16_t core_id, std::uint16_t channel_id,
                                       resource::ResourceInterface *resource, TaskInterface *task);
};
|
||||
} // namespace mx::tasking
|
||||
64
src/mx/util/aligned_t.h
Normal file
64
src/mx/util/aligned_t.h
Normal file
@@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
#include <type_traits>
|
||||
|
||||
namespace mx::util {
|
||||
/**
 * Wraps a value of type T and aligns it to a 64-byte boundary (the typical
 * cache-line size) to avoid false sharing between adjacent objects, e.g. in
 * per-worker arrays.
 */
template <typename T> class alignas(64) aligned_t
{
public:
    constexpr aligned_t() noexcept = default;

    explicit constexpr aligned_t(const T &value) noexcept : _value(value) {}
    constexpr aligned_t(const aligned_t<T> &other) = default;

    // A user-declared copy constructor suppresses the implicit move
    // constructor; default it explicitly so moves do not degrade to copies.
    constexpr aligned_t(aligned_t<T> &&other) noexcept = default;

    /**
     * Constructs the wrapped value in place from the given arguments.
     * Disabled when any argument is an aligned_t so that this forwarding
     * overload cannot hijack copy/move construction (it would otherwise be a
     * better match than the copy constructor for non-const lvalues).
     */
    template <typename... Args,
              typename = std::enable_if_t<
                  std::disjunction_v<std::is_same<std::decay_t<Args>, aligned_t<T>>...> == false>>
    explicit aligned_t(Args &&... args) noexcept : _value(std::forward<Args>(args)...)
    {
    }

    ~aligned_t() noexcept = default;

    aligned_t<T> &operator=(const aligned_t<T> &) = default;
    aligned_t<T> &operator=(aligned_t<T> &&) noexcept = default;

    /// Conversion to the wrapped value; returns a copy.
    explicit operator T() const noexcept { return _value; }

    /// Dereference-style access to the wrapped value.
    T &operator*() noexcept { return _value; }
    const T &operator*() const noexcept { return _value; }

    /// Named access to the wrapped value.
    T &value() noexcept { return _value; }
    const T &value() const noexcept { return _value; }

    /// Assigns a new wrapped value.
    aligned_t<T> &operator=(const T &value) noexcept
    {
        _value = value;
        return *this;
    }

    /// nullptr comparison; only meaningful when T is a pointer type
    /// (always false for non-pointer T).
    bool operator==(std::nullptr_t) const noexcept
    {
        if constexpr (std::is_pointer<T>::value)
        {
            return _value == nullptr;
        }
        else
        {
            return false;
        }
    }

    /// nullptr inequality; always true for non-pointer T.
    bool operator!=(std::nullptr_t) const noexcept
    {
        if constexpr (std::is_pointer<T>::value)
        {
            return _value != nullptr;
        }
        else
        {
            return true;
        }
    }

private:
    // The wrapped, cache-line-aligned value.
    T _value = T();
};
|
||||
} // namespace mx::util
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user