shared gpu memory across cells

2026-01-21 12:32:56 +01:00 · 2025-08-18 10:36:05 +02:00
parent 9d22ca68ab
commit c028d5d838
25 changed files with 37213 additions and 10 deletions
--- a/repos/dde_uos-intel-gpgpu/include/gpgpu_virt/client.h
+++ b/repos/dde_uos-intel-gpgpu/include/gpgpu_virt/client.h
@@ -23,6 +23,16 @@ struct Session_client : Genode::Rpc_client<Session>
 		call<Rpc_register_vm>(size, ram_cap);
 	}

+	void register_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap) override
+	{
+		call<Rpc_register_shm>(size, ram_cap);
+	}
+
+	void ask_shm(int id, Genode::size_t &size, Genode::Ram_dataspace_capability& ram_cap) override
+	{
+		call<Rpc_ask_shm>(id, size, ram_cap);
+	}
+
 };

 }
--- a/repos/dde_uos-intel-gpgpu/include/gpgpu_virt/session.h
+++ b/repos/dde_uos-intel-gpgpu/include/gpgpu_virt/session.h
@@ -14,6 +14,8 @@ struct Session : Genode::Session

 	virtual void register_vm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap) = 0;
 	virtual void start_task(unsigned long kconf) = 0;
+	virtual void register_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap) = 0;
+	virtual void ask_shm(int id, Genode::size_t &size, Genode::Ram_dataspace_capability& ram_cap) = 0;

 	/*******************
 	 ** RPC interface **
@@ -21,9 +23,11 @@ struct Session : Genode::Session

 	GENODE_RPC(Rpc_register_vm, void, register_vm, Genode::size_t, Genode::Ram_dataspace_capability&);
 	GENODE_RPC(Rpc_start_task, void, start_task, unsigned long);
+	GENODE_RPC(Rpc_register_shm, void, register_shm, Genode::size_t, Genode::Ram_dataspace_capability&);
+	GENODE_RPC(Rpc_ask_shm, void, ask_shm, int, Genode::size_t&, Genode::Ram_dataspace_capability&);


-	GENODE_RPC_INTERFACE(Rpc_register_vm, Rpc_start_task);
+	GENODE_RPC_INTERFACE(Rpc_register_vm, Rpc_start_task, Rpc_register_shm, Rpc_ask_shm);
 };

 }
--- a/repos/dde_uos-intel-gpgpu/src/config.h
+++ b/repos/dde_uos-intel-gpgpu/src/config.h
@@ -1,11 +1,13 @@
 #ifndef CONFIG_H
 #define CONFIG_H

-#define QEMU_TEST
+// #define QEMU_TEST

-//#define VERBOSE
+#define VERBOSE

 #define SCHED_CFS
 //#define SCHED_RR // default

+#define MAX_SHM_REGIONS 32
+
 #endif // CONFIG_H
--- a/repos/dde_uos-intel-gpgpu/src/uos-intel-gpgpu
+++ b/repos/dde_uos-intel-gpgpu/src/uos-intel-gpgpu
--- a/repos/dde_uos-intel-gpgpu/src/virt/rpc.cc
+++ b/repos/dde_uos-intel-gpgpu/src/virt/rpc.cc
@@ -31,6 +31,49 @@ void Session_component::register_vm(Genode::size_t size, Genode::Ram_dataspace_c
 	_global_sched->add_vgpu(&vgpu);
 }

+int SHM_manager::alloc_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap_vm)
+{
+	// get shared memory id
+	const int s = __atomic_fetch_add(&shid, 1, __ATOMIC_SEQ_CST);
+
+	if(s >= MAX_SHM_REGIONS)
+		return -1;
+
+	// alloc shared memory
+	Genode::addr_t mapped_base;
+	ram_cap[s] = _global_gpgpu_genode->allocRamCap(size, mapped_base, base[s]);
+	sizes[s] = size;
+	ram_cap_vm = ram_cap[s];
+
+	return s;
+}
+
+void SHM_manager::free_shm(int id)
+{
+	_global_gpgpu_genode->freeRamCap(ram_cap[id]);
+}
+
+void Session_component::register_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap_vm)
+{
+	// create shared mem
+	int shid = SHM_manager::getInstance().alloc_shm(size, ram_cap_vm);
+	vgpu.assignSHM(shid);
+}
+
+void Session_component::ask_shm(int id, Genode::size_t &size, Genode::Ram_dataspace_capability& ram_cap_vm)
+{
+	// get size
+	size = SHM_manager::getInstance().getSize(id);
+	if(size == 0) // invalid
+		return;
+
+	// get ram cap
+	ram_cap_vm = SHM_manager::getInstance().getCap(id);
+
+	// assign id to vgpu
+	vgpu.assignSHM(id);
+}
+
 void Session_component::start_task(unsigned long kconf)
 {
 	// convert offset to driver virt addr
@@ -44,7 +87,8 @@ void Session_component::start_task(unsigned long kconf)
        }
 		else // for pointer set phys addr
 		{
-			kc->buffConfigs[i].buffer = (void*)((Genode::addr_t)kc->buffConfigs[i].buffer + base);
+			const Genode::addr_t addrBase = kc->buffConfigs[i].shmid == -1 ? base : SHM_manager::getInstance().getBase(kc->buffConfigs[i].shmid);
+			kc->buffConfigs[i].buffer = (void*)((Genode::addr_t)kc->buffConfigs[i].buffer + addrBase);
 		}
    }
 	kc->kernelName = (char*)((Genode::addr_t)kc->kernelName + mapped_base);
@@ -63,6 +107,7 @@ void Session_component::start_task(unsigned long kconf)
 	for(int i = 0; i < kc->buffCount; i++)
 	{
 		Genode::log("\tBuffer ", i);
+		Genode::log("\t\tshmid: ", (int)kc->buffConfigs[i].shmid);
 		if(kc->buffConfigs[i].non_pointer_type)
 		{
 			Genode::log("\t\tvaddr: ", (void*)kc->buffConfigs[i].buffer);
@@ -72,7 +117,8 @@ void Session_component::start_task(unsigned long kconf)
 		}
 		else
 		{
-			Genode::log("\t\tvaddr: ", (void*)((Genode::addr_t)kc->buffConfigs[i].buffer - base + mapped_base));
+			const Genode::addr_t addrBase = kc->buffConfigs[i].shmid == -1 ? base : SHM_manager::getInstance().getBase(kc->buffConfigs[i].shmid);
+			Genode::log("\t\tvaddr: ", (void*)((Genode::addr_t)kc->buffConfigs[i].buffer - addrBase));
 			Genode::log("\t\tpaddr: ", (void*)kc->buffConfigs[i].buffer);
 			//Genode::log("\t\tgpuaddr: ", (void*)((addr_t)kc->buffConfigs[i].ga));  // to print this, temporary make the var public
 			//Genode::log("\t\tpos: ", (uint32_t)kc->buffConfigs[i].pos);  // to print this, temporary make the var public
--- a/repos/dde_uos-intel-gpgpu/src/virt/rpc.h
+++ b/repos/dde_uos-intel-gpgpu/src/virt/rpc.h
@@ -10,6 +10,28 @@
 namespace gpgpu_virt
 {

+class SHM_manager
+{
+	private:
+	Genode::Ram_dataspace_capability ram_cap[MAX_SHM_REGIONS];
+		Genode::addr_t base[MAX_SHM_REGIONS];
+		Genode::size_t sizes[MAX_SHM_REGIONS];
+		unsigned int shid;
+		SHM_manager() : ram_cap{}, base {0, }, sizes {0, }, shid(0) {};
+
+	public:
+		static SHM_manager &getInstance() {
+			static SHM_manager inst;
+			return inst;
+		}
+
+		int alloc_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap);
+		void free_shm(int id);
+		Genode::Ram_dataspace_capability getCap(int id) const { return ram_cap[id]; }
+		Genode::addr_t getBase(int id) const { return base[id]; }
+		Genode::addr_t getSize(int id) const { return sizes[id]; }
+};
+
 struct Session_component : Genode::Rpc_object<Session>
 {
 	VGpu vgpu;
@@ -25,6 +47,9 @@ struct Session_component : Genode::Rpc_object<Session>

 	void start_task(unsigned long kconf) override;

+	void register_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap) override;
+
+	void ask_shm(int id, Genode::size_t& size, Genode::Ram_dataspace_capability& ram_cap) override;
 };

 class Root_component
--- a/repos/dde_uos-intel-gpgpu/src/virt/vgpu.h
+++ b/repos/dde_uos-intel-gpgpu/src/virt/vgpu.h
@@ -33,11 +33,32 @@ namespace gpgpu_virt {
            /// priority of vgpu
            int prio;

+            /// assigned shared memory regions
+            int shm_ids[MAX_SHM_REGIONS];
+
+            /// local shm id counter
+            int curr_shm_id;
+
        public:
            /**
             * @brief Construct a new VGpu object
             */
-            VGpu() : ctx(nullptr), ready_list(), prio(-1) {}
+            VGpu() : ctx(nullptr), ready_list(), prio(-1), curr_shm_id(0) {}
+
+            void assignSHM(int id)
+            {
+                shm_ids[curr_shm_id++] = id;
+            }
+
+            void removeSHM(int id)
+            {
+                for(int i = 0; i < MAX_SHM_REGIONS; ++i)
+                {
+                    if (shm_ids[i] == id){
+                        shm_ids[i] = -1;
+                    }
+                }
+            }

            /**
             * @brief Set the Priority
--- a/repos/hello_gpgpu/run/hello_gpgpu.run
+++ b/repos/hello_gpgpu/run/hello_gpgpu.run
@@ -62,7 +62,7 @@ append config {
 	<start name="hello_gpgpu">
 		<binary name="hello_gpgpu"/>
 		<resource name="RAM" quantum="1024M"/>
-		<config bench="1">
+		<config bench="2097152">
 			<vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
 			<libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
 		</config>
--- a/repos/hello_gpgpu/run/shm_gpgpu.run
+++ b/repos/hello_gpgpu/run/shm_gpgpu.run
@@ -0,0 +1,103 @@
+# build config
+build { core init gpgpu timer producer consumer1 consumer2 }
+
+# platform config
+set use_acpica_as_acpi_drv 0
+source ${genode_dir}/repos/base/run/platform_drv.inc
+proc platform_drv_policy {} {
+	global use_acpica_as_acpi_drv
+	set policy ""
+
+	append_if $use_acpica_as_acpi_drv policy {
+			<policy label="acpi_drv -> ">    <pci class="ALL"/> </policy>}
+
+	append policy {
+			<policy label_prefix="gpgpu"> <pci class="ALL"/> </policy>}
+	append policy {
+			<policy label_prefix="producer"> <pci class="ALL"/> </policy>}
+	append policy {
+			<policy label_prefix="consumer1"> <pci class="ALL"/> </policy>}
+	append policy {
+			<policy label_prefix="consumer2"> <pci class="ALL"/> </policy>}
+
+	return $policy
+}
+append_platform_drv_build_components
+build $build_components
+
+# boot dir
+create_boot_directory
+
+# other config
+append config {
+	<config>
+		<parent-provides>
+			<service name="ROM"/>
+            <service name="IRQ"/>
+            <service name="IO_MEM"/>
+			<service name="PD"/>
+			<service name="RM"/>
+			<service name="CPU"/>
+			<service name="LOG"/>
+			<service name="RAM"/>
+			<service name="CAP"/>
+            <service name="TOPO"/>
+		</parent-provides>
+		<default-route>
+			<any-service> <parent/> <any-child/> </any-service>
+		</default-route>
+		<default caps="200"/>
+		<start name="timer">
+			<resource name="RAM" quantum="1M"/>
+			<provides><service name="Timer"/></provides>
+			<route>
+				<any-service><parent/><any-child/></any-service>
+			</route>
+		</start>
+}
+
+append_platform_drv_config
+
+append config {
+    <start name="gpgpu" priority="0">
+		<provides> <service name="gpgpu"/> </provides>
+        <resource name="RAM" quantum="12G"/>
+    </start>
+	<start name="producer">
+		<binary name="producer"/>
+		<resource name="RAM" quantum="512M"/>
+		<config>
+			<vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
+			<libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
+		</config>
+	</start>
+	<start name="consumer (high)">
+		<binary name="consumer1"/>
+		<resource name="RAM" quantum="512M"/>
+		<config>
+			<vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
+			<libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
+		</config>
+	</start>
+	<start name="consumer (low)">
+		<binary name="consumer2"/>
+		<resource name="RAM" quantum="512M"/>
+		<config >
+			<vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
+			<libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
+		</config>
+	</start>
+</config>}
+
+install_config $config
+
+# boot modules
+set boot_modules {
+	core ld.lib.so libc.lib.so vfs.lib.so libm.lib.so init gpgpu timer producer consumer1 consumer2
+}
+append_platform_drv_boot_modules
+build_boot_image $boot_modules
+
+# qemu stuff
+append qemu_args " -nographic -m 24G"
+run_genode_until forever
--- a/repos/hello_gpgpu/src/consumer1/OpenSurf.cpp
+++ b/repos/hello_gpgpu/src/consumer1/OpenSurf.cpp
--- a/repos/hello_gpgpu/src/consumer1/SURF_noSLM_fixed_kernel.h
+++ b/repos/hello_gpgpu/src/consumer1/SURF_noSLM_fixed_kernel.h
--- a/repos/hello_gpgpu/src/consumer1/frac_320_240.h
+++ b/repos/hello_gpgpu/src/consumer1/frac_320_240.h
--- a/repos/hello_gpgpu/src/consumer1/main.cc
+++ b/repos/hello_gpgpu/src/consumer1/main.cc
@@ -0,0 +1,99 @@
+#include <base/log.h>
+#include <base/heap.h>
+#include <base/allocator_avl.h>
+#include <base/attached_rom_dataspace.h>
+
+#include <libc/component.h>
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+
+// OpenCL
+#define CL_TARGET_OPENCL_VERSION 100
+#include "../hello_gpgpu/CL/cl.h"
+
+// rpc
+#include <gpgpu_virt/connection.h>
+
+// stupid alloc
+#include "../hello_gpgpu/allocator_stupid.h"
+
+namespace ns_OpenSurf{int main(int argc, char *argv[]);};
+
+struct consumer_conv
+{
+    Genode::Env &env;
+    gpgpu_virt::Connection backend_driver;
+    Genode::Allocator_stupid allocator;
+
+    const unsigned long size = 0x40000000;
+    cl_genode clg;
+    Genode::Ram_dataspace_capability vgpu_mem_ram_cap;
+    Genode::Ram_dataspace_capability vgpu_shm_ram_cap;
+
+    volatile uint8_t *ready;
+    const unsigned long img_size = 320 * 240 * sizeof(float);
+    volatile float *data;
+
+    void init()
+    {
+        Genode::log("===Init Consumer Surf===");
+		clInitGenode(clg);
+
+        // register vgpu (optional?)
+        const unsigned long size_vgpu_mem = 0x1000;
+        backend_driver.register_vm(size_vgpu_mem, vgpu_mem_ram_cap);
+
+        // create shm for gpu
+        const unsigned long id = 0;
+        Genode::size_t total_size = 0;
+        while (total_size == 0)
+        {
+            backend_driver.ask_shm(id, total_size, vgpu_shm_ram_cap);
+        }
+
+        // attach shm to vm
+        Genode::addr_t mapped_base = env.rm().attach(vgpu_shm_ram_cap);
+        clg.add_shm_mapped_base(id, mapped_base);
+
+        // use it in allocator
+        allocator.add_range(mapped_base, total_size);
+
+        // alloc whole data
+        ready = (uint8_t *)allocator.alloc(1);
+        data = (float *)allocator.alloc_aligned(0x10000, img_size);
+    }
+
+    void run()
+    {
+        Genode::log("===Run Consumer Surf===");
+
+        Libc::with_libc([&]
+                        {
+                            for(;;)
+                            {
+                                while (*ready != 0x42);
+
+                                ns_OpenSurf::main(2, (char**)data);
+
+                                //Genode::log(data[0]);
+                                sleep(3);
+                            } });
+
+        Genode::log("===End===");
+        Genode::log("Consumer Surf completed");
+    }
+
+    consumer_conv(Genode::Env &e) : env(e), backend_driver(env), allocator(), clg(env, size), ready(nullptr), data(nullptr)
+    {
+    }
+};
+
+void Libc::Component::construct(Libc::Env &env)
+{
+    static consumer_conv p(env);
+    p.init();
+    p.run();
+}
--- a/repos/hello_gpgpu/src/consumer1/target.mk
+++ b/repos/hello_gpgpu/src/consumer1/target.mk
@@ -0,0 +1,9 @@
+TARGET = consumer1
+SRC_CC = main.cc \
+		OpenSurf.cpp \
+		../hello_gpgpu/CL/cl.cc ../hello_gpgpu/CL/cl_genode.cc \
+		../hello_gpgpu/allocator_stupid.cc
+
+LIBS   = base libc libm
+
+CC_CXX_WARN_STRICT =
--- a/repos/hello_gpgpu/src/consumer2/2DConvolution.cc
+++ b/repos/hello_gpgpu/src/consumer2/2DConvolution.cc
@@ -0,0 +1,326 @@
+/**
+ * 2DConvolution.c: This file is part of the PolyBench/GPU 1.0 test suite.
+ *
+ *
+ * Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
+ * Will Killian <killian@udel.edu>
+ * Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
+ * Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <math.h>
+
+#define CL_TARGET_OPENCL_VERSION 100
+#ifdef __APPLE__
+#include <OpenCL/opencl.h>
+#else
+#include "../hello_gpgpu/CL/cl.h"
+#endif
+
+#define POLYBENCH_TIME 1
+
+//select the OpenCL device to use (can be GPU, CPU, or Accelerator such as Intel Xeon Phi)
+#define OPENCL_DEVICE_SELECTION CL_DEVICE_TYPE_GPU
+
+#include "../hello_gpgpu/benchmark/convolution-2d/2DConvolution.h"
+#include "../hello_gpgpu/polybench.h"
+
+//define the error threshold for the results "not matching"
+#define PERCENT_DIFF_ERROR_THRESHOLD 1.05
+
+#define MAX_SOURCE_SIZE (0x100000)
+
+#if defined(cl_khr_fp64)  // Khronos extension available?
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#elif defined(cl_amd_fp64)  // AMD extension available?
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#endif
+
+namespace ns_convolution_2d {
+
+#include "../hello_gpgpu/benchmark/convolution-2d/2DConvolution_kernel.h"
+#include "../hello_gpgpu/polybenchUtilFuncts.h"
+
+
+
+char str_temp[1024];
+
+cl_platform_id platform_id;
+cl_device_id device_id;   
+cl_uint num_devices;
+cl_uint num_platforms;
+cl_int errcode;
+cl_context clGPUContext;
+cl_kernel clKernel;
+cl_command_queue clCommandQue;
+cl_program clProgram;
+cl_mem a_mem_obj;
+cl_mem b_mem_obj;
+cl_mem c_mem_obj;
+FILE *fp;
+char *source_str;
+size_t source_size;
+
+// patch config for consumer2
+#undef NI
+#undef NJ
+#define NI 320
+#define NJ 240
+
+void compareResults(int ni, int nj, DATA_TYPE POLYBENCH_2D(B, NI, NJ, ni, nj), DATA_TYPE POLYBENCH_2D(B_outputFromGpu, NI, NJ, ni, nj))
+{
+	int i, j, fail;
+	fail = 0;
+	
+	// Compare outputs from CPU and GPU
+	for (i=1; i < (ni-1); i++) 
+	{
+		for (j=1; j < (nj-1); j++) 
+		{
+			if (percentDiff(B[i][j], B_outputFromGpu[i][j]) > PERCENT_DIFF_ERROR_THRESHOLD) 
+			{
+				fail++;
+			}
+		}
+	}
+	
+	// Print results
+	printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %d\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
+}
+
+
+void read_cl_file()
+{
+	// Load the kernel source code into the array source_str
+	// fp = fopen("2DConvolution.cl", "r");
+	// if (!fp) {
+	// 	fprintf(stdout, "Failed to load kernel.\n");
+	// 	exit(1);
+	// }
+	// source_str = (char*)malloc(MAX_SOURCE_SIZE);
+	// source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
+	// fclose( fp );
+}
+
+
+void init(int ni, int nj, DATA_TYPE POLYBENCH_2D(A, NI, NJ, ni, nj))
+{
+	int i, j;
+
+	for (i = 0; i < ni; ++i)
+    	{
+		for (j = 0; j < nj; ++j)
+		{
+			A[i][j] = (float)rand()/RAND_MAX;
+        	}
+    	}
+}
+
+
+void cl_initialization()
+{
+	
+	// Get platform and device information
+	errcode = clGetPlatformIDs(1, &platform_id, &num_platforms);
+	if(errcode == CL_SUCCESS) printf("number of platforms is %d\n",num_platforms);
+	else printf("Error getting platform IDs\n");
+
+	errcode = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME, sizeof(str_temp), str_temp,NULL);
+	if(errcode == CL_SUCCESS) printf("platform name is %s\n",str_temp);
+	else printf("Error getting platform name\n");
+
+	errcode = clGetPlatformInfo(platform_id, CL_PLATFORM_VERSION, sizeof(str_temp), str_temp,NULL);
+	if(errcode == CL_SUCCESS) printf("platform version is %s\n",str_temp);
+	else printf("Error getting platform version\n");
+
+	errcode = clGetDeviceIDs( platform_id, OPENCL_DEVICE_SELECTION, 1, &device_id, &num_devices);
+	if(errcode == CL_SUCCESS) printf("number of devices is %d\n", num_devices);
+	else printf("Error getting device IDs\n");
+
+	errcode = clGetDeviceInfo(device_id,CL_DEVICE_NAME, sizeof(str_temp), str_temp,NULL);
+	if(errcode == CL_SUCCESS) printf("device name is %s\n",str_temp);
+	else printf("Error getting device name\n");
+	
+	// Create an OpenCL context
+	clGPUContext = clCreateContext( NULL, 1, &device_id, NULL, NULL, &errcode);
+	if(errcode != CL_SUCCESS) printf("Error in creating context\n");
+ 
+	//Create a command-queue
+	clCommandQue = clCreateCommandQueue(clGPUContext, device_id, 0, &errcode);
+	if(errcode != CL_SUCCESS) printf("Error in creating command queue\n");
+}
+
+
+void cl_mem_init(DATA_TYPE POLYBENCH_2D(A, NI, NJ, ni, nj))
+{
+	a_mem_obj = clCreateBufferSHM(clGPUContext, CL_MEM_READ_ONLY, sizeof(DATA_TYPE) * NI * NJ, A, &errcode, 0);
+	b_mem_obj = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, sizeof(DATA_TYPE) * NI * NJ, NULL, &errcode);
+	
+	if(errcode != CL_SUCCESS) printf("Error in creating buffers\n");
+
+	//errcode = clEnqueueWriteBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NJ, A, 0, NULL, NULL);
+	//if(errcode != CL_SUCCESS)printf("Error in writing buffers\n");
+}
+
+void cl_load_prog()
+{
+	// Create a program from the kernel source
+	const size_t kernel_size = __2DConvolution_Gen9core_gen_len;
+    const unsigned char* kernel_bin = __2DConvolution_Gen9core_gen;
+	clProgram = clCreateProgramWithBinary(clGPUContext, 1, &device_id, &kernel_size, &kernel_bin, NULL, &errcode);
+	// clProgram = clCreateProgramWithSource(clGPUContext, 1, (const char **)&source_str, (const size_t *)&source_size, &errcode);
+
+	if(errcode != CL_SUCCESS) printf("Error in creating program\n");
+
+	// Build the program
+	errcode = clBuildProgram(clProgram, 1, &device_id, NULL, NULL, NULL);
+	if(errcode != CL_SUCCESS) printf("Error in building program\n");
+		
+	// Create the OpenCL kernel
+	clKernel = clCreateKernel(clProgram, "Convolution2D_kernel", &errcode);
+	if(errcode != CL_SUCCESS) printf("Error in creating kernel\n");
+
+	clFinish(clCommandQue);
+}
+
+
+void cl_launch_kernel(int ni, int nj)
+{
+	size_t localWorkSize[2], globalWorkSize[2];
+	localWorkSize[0] = DIM_LOCAL_WORK_GROUP_X;
+	localWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y;
+	globalWorkSize[0] = (size_t)ceil(((float)NI) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
+	globalWorkSize[1] = (size_t)ceil(((float)NJ) / ((float)DIM_LOCAL_WORK_GROUP_Y)) * DIM_LOCAL_WORK_GROUP_Y;
+
+	/* Start timer. */
+  	polybench_start_instruments;
+	
+	// Set the arguments of the kernel
+	errcode =  clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
+	errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
+	errcode =  clSetKernelArg(clKernel, 2, sizeof(int), &ni);
+	errcode |= clSetKernelArg(clKernel, 3, sizeof(int), &nj);
+
+	if(errcode != CL_SUCCESS) printf("Error in seting arguments\n");
+	// Execute the OpenCL kernel
+	errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
+
+	if(errcode != CL_SUCCESS) printf("Error in launching kernel\n");
+	clFinish(clCommandQue);
+
+	/* Stop and print timer. */
+	polybench_stop_instruments;
+  	printf("GPU Time in seconds:\n");
+ 	polybench_print_instruments;
+}
+
+void cl_clean_up()
+{
+	// Clean up
+	errcode = clFlush(clCommandQue);
+	errcode = clFinish(clCommandQue);
+	errcode = clReleaseKernel(clKernel);
+	errcode = clReleaseProgram(clProgram);
+	errcode = clReleaseMemObject(a_mem_obj);
+	errcode = clReleaseMemObject(b_mem_obj);
+	errcode = clReleaseCommandQueue(clCommandQue);
+	errcode = clReleaseContext(clGPUContext);
+	if(errcode != CL_SUCCESS) printf("Error in cleanup\n");
+}
+
+void conv2D(int ni, int nj, DATA_TYPE POLYBENCH_2D(A, NI, NJ, ni, nj), DATA_TYPE POLYBENCH_2D(B, NI, NJ, ni, nj))
+{
+	int i, j;
+	DATA_TYPE c11, c12, c13, c21, c22, c23, c31, c32, c33;
+
+	c11 = +0.2;  c21 = +0.5;  c31 = -0.8;
+	c12 = -0.3;  c22 = +0.6;  c32 = -0.9;
+	c13 = +0.4;  c23 = +0.7;  c33 = +0.10;
+
+
+	for (i = 1; i < _PB_NI - 1; ++i) // 0
+	{
+		for (j = 1; j < _PB_NJ - 1; ++j) // 1
+		{
+			B[i][j] = c11 * A[(i - 1)][(j - 1)]  +  c12 * A[(i + 0)][(j - 1)]  +  c13 * A[(i + 1)][(j - 1)]
+				+ c21 * A[(i - 1)][(j + 0)]  +  c22 * A[(i + 0)][(j + 0)]  +  c23 * A[(i + 1)][(j + 0)] 
+				+ c31 * A[(i - 1)][(j + 1)]  +  c32 * A[(i + 0)][(j + 1)]  +  c33 * A[(i + 1)][(j + 1)];
+		}
+	}
+}
+
+
+/* DCE code. Must scan the entire live-out data.
+   Can be used also to check the correctness of the output. */
+static
+void print_array(int ni, int nj,
+		 DATA_TYPE POLYBENCH_2D(B,NI,NJ,ni,nj))
+{
+  int i, j;
+
+  for (i = 0; i < ni; i++)
+    for (j = 0; j < nj; j++) {
+	fprintf (stderr, DATA_PRINTF_MODIFIER, B[i][j]);
+	if ((i * ni + j) % 20 == 0) fprintf (stderr, "\n");
+    }
+  fprintf (stderr, "\n");
+}
+
+
+int main(int argc, char *argv[])
+{
+	/* Retrieve problem size */
+	int ni = NI;
+	int nj = NJ;
+
+	POLYBENCH_2D_ARRAY_DECL(A,DATA_TYPE,NI,NJ,ni,nj);
+  	POLYBENCH_2D_ARRAY_DECL(B,DATA_TYPE,NI,NJ,ni,nj);
+  	POLYBENCH_2D_ARRAY_DECL(B_outputFromGpu,DATA_TYPE,NI,NJ,ni,nj);
+
+	//init(ni, nj, POLYBENCH_ARRAY(A));
+    A = (float(*)[NI][NJ]) argv;
+
+	read_cl_file();
+	cl_initialization();
+	cl_mem_init(POLYBENCH_ARRAY(A));
+	cl_load_prog();
+
+	cl_launch_kernel(ni, nj);
+
+	errcode = clEnqueueReadBuffer(clCommandQue, b_mem_obj, CL_TRUE, 0, NI*NJ*sizeof(DATA_TYPE), POLYBENCH_ARRAY(B_outputFromGpu), 0, NULL, NULL);
+	if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");
+
+	#ifdef RUN_ON_CPU
+
+		/* Start timer. */
+  		polybench_start_instruments;
+
+		conv2D(ni, nj, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B));
+	
+		/* Stop and print timer. */
+		printf("CPU Time in seconds:\n");
+  		polybench_stop_instruments;
+ 		polybench_print_instruments;
+
+		compareResults(ni, nj, POLYBENCH_ARRAY(B), POLYBENCH_ARRAY(B_outputFromGpu));
+
+	#else //prevent dead code elimination
+
+		polybench_prevent_dce(print_array(ni, nj, POLYBENCH_ARRAY(B_outputFromGpu)));
+
+	#endif //RUN_ON_CPU
+
+	cl_clean_up();
+
+	//POLYBENCH_FREE_ARRAY(A);
+	POLYBENCH_FREE_ARRAY(B);
+	POLYBENCH_FREE_ARRAY(B_outputFromGpu);
+
+    	return 0;
+}
+
+}
--- a/repos/hello_gpgpu/src/consumer2/main.cc
+++ b/repos/hello_gpgpu/src/consumer2/main.cc
@@ -0,0 +1,99 @@
+#include <base/log.h>
+#include <base/heap.h>
+#include <base/allocator_avl.h>
+#include <base/attached_rom_dataspace.h>
+
+#include <libc/component.h>
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+
+// OpenCL
+#define CL_TARGET_OPENCL_VERSION 100
+#include "../hello_gpgpu/CL/cl.h"
+
+// rpc
+#include <gpgpu_virt/connection.h>
+
+// stupid alloc
+#include "../hello_gpgpu/allocator_stupid.h"
+
+namespace ns_convolution_2d{int main(int argc, char *argv[]);};
+
+struct consumer_conv
+{
+    Genode::Env &env;
+    gpgpu_virt::Connection backend_driver;
+    Genode::Allocator_stupid allocator;
+
+    const unsigned long size = 0x40000000;
+    cl_genode clg;
+    Genode::Ram_dataspace_capability vgpu_mem_ram_cap;
+    Genode::Ram_dataspace_capability vgpu_shm_ram_cap;
+
+    volatile uint8_t *ready;
+    const unsigned long img_size = 320 * 240 * sizeof(float);
+    volatile float *data;
+
+    void init()
+    {
+        Genode::log("===Init Consumer Conv===");
+		clInitGenode(clg);
+
+        // register vgpu (optional?)
+        const unsigned long size_vgpu_mem = 0x1000;
+        backend_driver.register_vm(size_vgpu_mem, vgpu_mem_ram_cap);
+
+        // create shm for gpu
+        const unsigned long id = 0;
+        Genode::size_t total_size = 0;
+        while (total_size == 0)
+        {
+            backend_driver.ask_shm(id, total_size, vgpu_shm_ram_cap);
+        }
+
+        // attach shm to vm
+        Genode::addr_t mapped_base = env.rm().attach(vgpu_shm_ram_cap);
+        clg.add_shm_mapped_base(id, mapped_base);
+
+        // use it in allocator
+        allocator.add_range(mapped_base, total_size);
+
+        // alloc whole data
+        ready = (uint8_t *)allocator.alloc(1);
+        data = (float *)allocator.alloc_aligned(0x10000, img_size);
+    }
+
+    void run()
+    {
+        Genode::log("===Run Consumer Conv===");
+
+        Libc::with_libc([&]
+                        {
+                            for(;;)
+                            {
+                                while (*ready != 0x42);
+                                
+                                ns_convolution_2d::main(2, (char**)data);
+
+                                //Genode::log(data[0]);
+                                sleep(3);
+                            } });
+
+        Genode::log("===End===");
+        Genode::log("Consumer Conv completed");
+    }
+
+    consumer_conv(Genode::Env &e) : env(e), backend_driver(env), allocator(), clg(env, size), ready(nullptr), data(nullptr)
+    {
+    }
+};
+
+void Libc::Component::construct(Libc::Env &env)
+{
+    static consumer_conv p(env);
+    p.init();
+    p.run();
+}
--- a/repos/hello_gpgpu/src/consumer2/target.mk
+++ b/repos/hello_gpgpu/src/consumer2/target.mk
@@ -0,0 +1,10 @@
+TARGET = consumer2
+SRC_CC = main.cc \
+		2DConvolution.cc \
+		../hello_gpgpu/polybench.cc \
+		../hello_gpgpu/CL/cl.cc ../hello_gpgpu/CL/cl_genode.cc \
+		../hello_gpgpu/allocator_stupid.cc
+
+LIBS   = base libc libm
+
+CC_CXX_WARN_STRICT =
--- a/repos/hello_gpgpu/src/hello_gpgpu/CL/cl.cc
+++ b/repos/hello_gpgpu/src/hello_gpgpu/CL/cl.cc
@@ -357,6 +357,26 @@ clCreateBuffer(cl_context   context,
    clmem->bc.buffer = host_ptr;
    clmem->bc.buffer_size = (uint32_t)size;
    clmem->bc.non_pointer_type = false;
+    clmem->bc.shmid = -1;
+
+    *errcode_ret |= CL_SUCCESS;
+    return clmem;
+}
+
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBufferSHM(cl_context   context,
+               cl_mem_flags flags,
+               size_t       size,
+               void *       host_ptr,
+               cl_int *     errcode_ret,
+                int shid)
+{
+    cl_mem clmem = (cl_mem)g_cl_genode->alloc(sizeof(struct _cl_mem));
+    clmem->virt_vm = host_ptr;
+    clmem->bc.buffer = host_ptr;
+    clmem->bc.buffer_size = (uint32_t)size;
+    clmem->bc.non_pointer_type = false;
+    clmem->bc.shmid = shid;

    *errcode_ret |= CL_SUCCESS;
    return clmem;
--- a/repos/hello_gpgpu/src/hello_gpgpu/CL/cl.h
+++ b/repos/hello_gpgpu/src/hello_gpgpu/CL/cl.h
@@ -1088,6 +1088,14 @@ clCreateBuffer(cl_context   context,
               void *       host_ptr,
               cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0;

+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBufferSHM(cl_context   context,
+               cl_mem_flags flags,
+               size_t       size,
+               void *       host_ptr,
+               cl_int *     errcode_ret,
+                int shid) CL_API_SUFFIX__VERSION_1_0;
+
 #ifdef CL_VERSION_1_1

 extern CL_API_ENTRY cl_mem CL_API_CALL
--- a/repos/hello_gpgpu/src/hello_gpgpu/CL/cl_genode.cc
+++ b/repos/hello_gpgpu/src/hello_gpgpu/CL/cl_genode.cc
@@ -1,6 +1,6 @@
 #include "cl_genode.h"

-cl_genode::cl_genode(Genode::Env& env, unsigned long size) : env(env), allocator(), mapped_base(0), backend_driver(env)
+cl_genode::cl_genode(Genode::Env& env, unsigned long size) : env(env), allocator(), mapped_base(0), backend_driver(env), shm_mapped_base{0, }
 {
    // get shared memory with driver
    Genode::Ram_dataspace_capability ram_cap;
@@ -38,7 +38,8 @@ void cl_genode::enqueue_task(struct kernel_config* kconf)
    // convert virt vm addr to offset
    for(int i = 0; i < kconf->buffCount; i++)
    {
-        kconf->buffConfigs[i].buffer = (void*)((Genode::addr_t)kconf->buffConfigs[i].buffer - mapped_base);
+        const Genode::addr_t mbase = kconf->buffConfigs[i].shmid == -1 ? mapped_base : shm_mapped_base[kconf->buffConfigs[i].shmid];
+        kconf->buffConfigs[i].buffer = (void*)((Genode::addr_t)kconf->buffConfigs[i].buffer - mbase);
    }
    kconf->buffConfigs = (struct buffer_config*)((Genode::addr_t)kconf->buffConfigs - mapped_base);
    kconf->kernelName = (char*)((Genode::addr_t)kconf->kernelName - mapped_base);
@@ -55,3 +56,8 @@ void cl_genode::wait(struct kernel_config* kconf)
        asm("nop");
    }
 }
+
+void cl_genode::add_shm_mapped_base(int shmid, Genode::addr_t mbase)
+{
+    shm_mapped_base[shmid] = mbase;
+}
--- a/repos/hello_gpgpu/src/hello_gpgpu/CL/cl_genode.h
+++ b/repos/hello_gpgpu/src/hello_gpgpu/CL/cl_genode.h
@@ -17,6 +17,9 @@
 // driver
 #include <gpgpu/gpgpu.h>

+// config
+#include "../../../../dde_uos-intel-gpgpu/src/config.h"
+
 class cl_genode
 {
 private:
@@ -30,6 +33,9 @@ private:
    // rpc
    gpgpu_virt::Connection backend_driver;

+    // shm mapped_bases
+    Genode::addr_t shm_mapped_base[MAX_SHM_REGIONS];
+
    // do not allow copies
    cl_genode(const cl_genode& copy) = delete;
    cl_genode& operator=(const cl_genode& src) = delete;
@@ -100,6 +106,14 @@ public:
     * 
     */
    void reset() { allocator.reset(); }
+
+    /**
+     * @brief 
+     * 
+     * @param shmid 
+     * @param mbase 
+     */
+    void add_shm_mapped_base(int shmid, Genode::addr_t mbase);
 };

 #endif // CL_GENODE_H
--- a/repos/hello_gpgpu/src/producer/frac2_320_240.h
+++ b/repos/hello_gpgpu/src/producer/frac2_320_240.h
--- a/repos/hello_gpgpu/src/producer/frac_320_240.h
+++ b/repos/hello_gpgpu/src/producer/frac_320_240.h
--- a/repos/hello_gpgpu/src/producer/main.cc
+++ b/repos/hello_gpgpu/src/producer/main.cc
@@ -0,0 +1,155 @@
+#include <base/log.h>
+#include <base/heap.h>
+#include <base/allocator_avl.h>
+#include <base/attached_rom_dataspace.h>
+#include <util/misc_math.h>
+
+#include <libc/component.h>
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+
+// rpc
+#include <gpgpu_virt/connection.h>
+
+// stupid alloc
+#include "../hello_gpgpu/allocator_stupid.h"
+
+// imgs
+#include "frac_320_240.h"
+#include "frac2_320_240.h"
+
+namespace fake_cv
+{
+    class Mat
+    {
+    public:
+        int rows;
+        int cols;
+        int step;
+        float *data;
+        template <typename T>
+        T *ptr(int off)
+        {
+            return (T *)&data[off * sizeof(T)];
+        }
+    };
+
+    Mat imread(const char *img)
+    {
+        Mat m;
+        m.rows = height;
+        m.cols = width;
+        m.step = width * sizeof(float);
+
+        const size_t size = m.rows * m.cols;
+        m.data = (float *)malloc(size * sizeof(float));
+
+        for (size_t i = 0; i < size; ++i)
+        {
+            unsigned int px[3];
+            HEADER_PIXEL(img, px);
+            const unsigned int g = 0.298936021293775 * px[0] + 0.587043074451121 * px[1] + 0.114020904255103 * px[2];
+            m.data[i] = g / 251.; // 255.
+        }
+
+        return m;
+    }
+}
+
+typedef fake_cv::Mat Image;
+
+fake_cv::Mat getGray(const fake_cv::Mat &img)
+{
+    return img;
+}
+
+struct producer
+{
+    Genode::Env &env;
+    gpgpu_virt::Connection backend_driver;
+    Genode::Allocator_stupid allocator;
+
+    float *img1;
+    float *img2;
+    unsigned long img_size;
+    volatile uint8_t *ready;
+    volatile float *data;
+
+    void init()
+    {
+        Genode::log("===Init Producer===");
+
+        // register vgpu (optional?)
+        const unsigned long size_vgpu_mem = 0x1000;
+        Genode::Ram_dataspace_capability vgpu_mem_ram_cap;
+        backend_driver.register_vm(size_vgpu_mem, vgpu_mem_ram_cap);
+
+        // create shm for gpu
+        const unsigned long size_vgpu_shm = 0x100000;
+        Genode::Ram_dataspace_capability vgpu_shm_ram_cap;
+        backend_driver.register_shm(size_vgpu_shm, vgpu_shm_ram_cap);
+
+        // attach shm to vm
+        Genode::addr_t mapped_base = env.rm().attach(vgpu_shm_ram_cap);
+
+        // use it in allocator
+        allocator.add_range(mapped_base, size_vgpu_shm);
+
+        // set not ready
+        ready = (uint8_t *)allocator.alloc(1);
+
+        // load img1 and img2
+        const Image s1 = fake_cv::imread(header_data);
+        Image i1 = getGray(s1);
+        img1 = (float *)i1.ptr<float>(0);
+        const Image s2 = fake_cv::imread(header_data2);
+        Image i2 = getGray(s2);
+        img2 = (float *)i2.ptr<float>(0);
+
+        img_size = Genode::max(i1.rows * i2.cols * sizeof(float), i2.rows * i2.cols * sizeof(float));
+
+        // alloc whole data
+        data = (float *)allocator.alloc_aligned(0x10000, img_size);
+    }
+
+    void run()
+    {
+        Genode::log("===Run Producer===");
+
+        Libc::with_libc([&]
+                        {
+            srand(time(NULL));
+
+            int flip = 0;
+
+            for (;;)
+            {
+                // fetch new img
+                *ready = 0x43;
+                memcpy((void*)data, flip ? img1 : img2, img_size);
+                *ready = 0x42;
+                flip = !flip;
+                Genode::log("===New Image ready: ", flip, " ===");
+                
+                // sleep for 5s
+                sleep(5);
+            } });
+
+        Genode::log("===End===");
+        Genode::log("Producer completed");
+    }
+
+    producer(Genode::Env &e) : env(e), backend_driver(env), allocator(), img1(nullptr), img2(nullptr), img_size(0), ready(nullptr), data(nullptr)
+    {
+    }
+};
+
+void Libc::Component::construct(Libc::Env &env)
+{
+    static producer p(env);
+    p.init();
+    p.run();
+}
--- a/repos/hello_gpgpu/src/producer/target.mk
+++ b/repos/hello_gpgpu/src/producer/target.mk
@@ -0,0 +1,7 @@
+TARGET = producer
+SRC_CC = main.cc \
+		../hello_gpgpu/allocator_stupid.cc
+
+LIBS   = base libc libm
+
+CC_CXX_WARN_STRICT =