shared gpu memory across cells

This commit is contained in:
Marcel Lütke Dreimann
2025-08-18 10:36:05 +02:00
parent 9d22ca68ab
commit c028d5d838
25 changed files with 37213 additions and 10 deletions

View File

@@ -23,6 +23,16 @@ struct Session_client : Genode::Rpc_client<Session>
call<Rpc_register_vm>(size, ram_cap);
}
void register_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap) override
{
call<Rpc_register_shm>(size, ram_cap);
}
void ask_shm(int id, Genode::size_t &size, Genode::Ram_dataspace_capability& ram_cap) override
{
call<Rpc_ask_shm>(id, size, ram_cap);
}
};
}

View File

@@ -14,6 +14,8 @@ struct Session : Genode::Session
virtual void register_vm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap) = 0;
virtual void start_task(unsigned long kconf) = 0;
virtual void register_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap) = 0;
virtual void ask_shm(int id, Genode::size_t &size, Genode::Ram_dataspace_capability& ram_cap) = 0;
/*******************
** RPC interface **
@@ -21,9 +23,11 @@ struct Session : Genode::Session
GENODE_RPC(Rpc_register_vm, void, register_vm, Genode::size_t, Genode::Ram_dataspace_capability&);
GENODE_RPC(Rpc_start_task, void, start_task, unsigned long);
GENODE_RPC(Rpc_register_shm, void, register_shm, Genode::size_t, Genode::Ram_dataspace_capability&);
GENODE_RPC(Rpc_ask_shm, void, ask_shm, int, Genode::size_t&, Genode::Ram_dataspace_capability&);
GENODE_RPC_INTERFACE(Rpc_register_vm, Rpc_start_task);
GENODE_RPC_INTERFACE(Rpc_register_vm, Rpc_start_task, Rpc_register_shm, Rpc_ask_shm);
};
}

View File

@@ -1,11 +1,13 @@
#ifndef CONFIG_H
#define CONFIG_H
#define QEMU_TEST
// #define QEMU_TEST
//#define VERBOSE
#define VERBOSE
#define SCHED_CFS
//#define SCHED_RR // default
#define MAX_SHM_REGIONS 32
#endif // CONFIG_H

View File

@@ -31,6 +31,49 @@ void Session_component::register_vm(Genode::size_t size, Genode::Ram_dataspace_c
_global_sched->add_vgpu(&vgpu);
}
int SHM_manager::alloc_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap_vm)
{
// get shared memory id
const int s = __atomic_fetch_add(&shid, 1, __ATOMIC_SEQ_CST);
if(s >= MAX_SHM_REGIONS)
return -1;
// alloc shared memory
Genode::addr_t mapped_base;
ram_cap[s] = _global_gpgpu_genode->allocRamCap(size, mapped_base, base[s]);
sizes[s] = size;
ram_cap_vm = ram_cap[s];
return s;
}
void SHM_manager::free_shm(int id)
{
_global_gpgpu_genode->freeRamCap(ram_cap[id]);
}
void Session_component::register_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap_vm)
{
// create shared mem
int shid = SHM_manager::getInstance().alloc_shm(size, ram_cap_vm);
vgpu.assignSHM(shid);
}
void Session_component::ask_shm(int id, Genode::size_t &size, Genode::Ram_dataspace_capability& ram_cap_vm)
{
// get size
size = SHM_manager::getInstance().getSize(id);
if(size == 0) // invalid
return;
// get ram cap
ram_cap_vm = SHM_manager::getInstance().getCap(id);
// assign id to vgpu
vgpu.assignSHM(id);
}
void Session_component::start_task(unsigned long kconf)
{
// convert offset to driver virt addr
@@ -44,7 +87,8 @@ void Session_component::start_task(unsigned long kconf)
}
else // for pointer set phys addr
{
kc->buffConfigs[i].buffer = (void*)((Genode::addr_t)kc->buffConfigs[i].buffer + base);
const Genode::addr_t addrBase = kc->buffConfigs[i].shmid == -1 ? base : SHM_manager::getInstance().getBase(kc->buffConfigs[i].shmid);
kc->buffConfigs[i].buffer = (void*)((Genode::addr_t)kc->buffConfigs[i].buffer + addrBase);
}
}
kc->kernelName = (char*)((Genode::addr_t)kc->kernelName + mapped_base);
@@ -63,6 +107,7 @@ void Session_component::start_task(unsigned long kconf)
for(int i = 0; i < kc->buffCount; i++)
{
Genode::log("\tBuffer ", i);
Genode::log("\t\tshmid: ", (int)kc->buffConfigs[i].shmid);
if(kc->buffConfigs[i].non_pointer_type)
{
Genode::log("\t\tvaddr: ", (void*)kc->buffConfigs[i].buffer);
@@ -72,7 +117,8 @@ void Session_component::start_task(unsigned long kconf)
}
else
{
Genode::log("\t\tvaddr: ", (void*)((Genode::addr_t)kc->buffConfigs[i].buffer - base + mapped_base));
const Genode::addr_t addrBase = kc->buffConfigs[i].shmid == -1 ? base : SHM_manager::getInstance().getBase(kc->buffConfigs[i].shmid);
Genode::log("\t\tvaddr: ", (void*)((Genode::addr_t)kc->buffConfigs[i].buffer - addrBase));
Genode::log("\t\tpaddr: ", (void*)kc->buffConfigs[i].buffer);
//Genode::log("\t\tgpuaddr: ", (void*)((addr_t)kc->buffConfigs[i].ga)); // to print this, temporary make the var public
//Genode::log("\t\tpos: ", (uint32_t)kc->buffConfigs[i].pos); // to print this, temporary make the var public

View File

@@ -10,6 +10,28 @@
namespace gpgpu_virt
{
class SHM_manager
{
private:
Genode::Ram_dataspace_capability ram_cap[MAX_SHM_REGIONS];
Genode::addr_t base[MAX_SHM_REGIONS];
Genode::size_t sizes[MAX_SHM_REGIONS];
unsigned int shid;
SHM_manager() : ram_cap{}, base {0, }, sizes {0, }, shid(0) {};
public:
static SHM_manager &getInstance() {
static SHM_manager inst;
return inst;
}
int alloc_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap);
void free_shm(int id);
Genode::Ram_dataspace_capability getCap(int id) const { return ram_cap[id]; }
Genode::addr_t getBase(int id) const { return base[id]; }
Genode::addr_t getSize(int id) const { return sizes[id]; }
};
struct Session_component : Genode::Rpc_object<Session>
{
VGpu vgpu;
@@ -25,6 +47,9 @@ struct Session_component : Genode::Rpc_object<Session>
void start_task(unsigned long kconf) override;
void register_shm(Genode::size_t size, Genode::Ram_dataspace_capability& ram_cap) override;
void ask_shm(int id, Genode::size_t& size, Genode::Ram_dataspace_capability& ram_cap) override;
};
class Root_component

View File

@@ -33,11 +33,32 @@ namespace gpgpu_virt {
/// priority of vgpu
int prio;
/// assigned shared memory regions
int shm_ids[MAX_SHM_REGIONS];
/// local shm id counter
int curr_shm_id;
public:
/**
* @brief Construct a new VGpu object
*/
VGpu() : ctx(nullptr), ready_list(), prio(-1) {}
VGpu() : ctx(nullptr), ready_list(), prio(-1), curr_shm_id(0) {}
void assignSHM(int id)
{
shm_ids[curr_shm_id++] = id;
}
void removeSHM(int id)
{
for(int i = 0; i < MAX_SHM_REGIONS; ++i)
{
if (shm_ids[i] == id){
shm_ids[i] = -1;
}
}
}
/**
* @brief Set the Priority

View File

@@ -62,7 +62,7 @@ append config {
<start name="hello_gpgpu">
<binary name="hello_gpgpu"/>
<resource name="RAM" quantum="1024M"/>
<config bench="1">
<config bench="2097152">
<vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
<libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
</config>

View File

@@ -0,0 +1,103 @@
# build config
build { core init gpgpu timer producer consumer1 consumer2 }
# platform config
set use_acpica_as_acpi_drv 0
source ${genode_dir}/repos/base/run/platform_drv.inc
proc platform_drv_policy {} {
global use_acpica_as_acpi_drv
set policy ""
append_if $use_acpica_as_acpi_drv policy {
<policy label="acpi_drv -> "> <pci class="ALL"/> </policy>}
append policy {
<policy label_prefix="gpgpu"> <pci class="ALL"/> </policy>}
append policy {
<policy label_prefix="producer"> <pci class="ALL"/> </policy>}
append policy {
<policy label_prefix="consumer1"> <pci class="ALL"/> </policy>}
append policy {
<policy label_prefix="consumer2"> <pci class="ALL"/> </policy>}
return $policy
}
append_platform_drv_build_components
build $build_components
# boot dir
create_boot_directory
# other config
append config {
<config>
<parent-provides>
<service name="ROM"/>
<service name="IRQ"/>
<service name="IO_MEM"/>
<service name="PD"/>
<service name="RM"/>
<service name="CPU"/>
<service name="LOG"/>
<service name="RAM"/>
<service name="CAP"/>
<service name="TOPO"/>
</parent-provides>
<default-route>
<any-service> <parent/> <any-child/> </any-service>
</default-route>
<default caps="200"/>
<start name="timer">
<resource name="RAM" quantum="1M"/>
<provides><service name="Timer"/></provides>
<route>
<any-service><parent/><any-child/></any-service>
</route>
</start>
}
append_platform_drv_config
append config {
<start name="gpgpu" priority="0">
<provides> <service name="gpgpu"/> </provides>
<resource name="RAM" quantum="12G"/>
</start>
<start name="producer">
<binary name="producer"/>
<resource name="RAM" quantum="512M"/>
<config>
<vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
<libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
</config>
</start>
<start name="consumer (high)">
<binary name="consumer1"/>
<resource name="RAM" quantum="512M"/>
<config>
<vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
<libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
</config>
</start>
<start name="consumer (low)">
<binary name="consumer2"/>
<resource name="RAM" quantum="512M"/>
<config >
<vfs> <dir name="dev"> <log/> <inline name="rtc">2022-07-20 14:30</inline> </dir> </vfs>
<libc stdout="/dev/log" stderr="/dev/log" rtc="/dev/rtc"/>
</config>
</start>
</config>}
install_config $config
# boot modules
set boot_modules {
core ld.lib.so libc.lib.so vfs.lib.so libm.lib.so init gpgpu timer producer consumer1 consumer2
}
append_platform_drv_boot_modules
build_boot_image $boot_modules
# qemu stuff
append qemu_args " -nographic -m 24G"
run_genode_until forever

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,99 @@
#include <base/log.h>
#include <base/heap.h>
#include <base/allocator_avl.h>
#include <base/attached_rom_dataspace.h>
#include <libc/component.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
// OpenCL
#define CL_TARGET_OPENCL_VERSION 100
#include "../hello_gpgpu/CL/cl.h"
// rpc
#include <gpgpu_virt/connection.h>
// stupid alloc
#include "../hello_gpgpu/allocator_stupid.h"
namespace ns_OpenSurf{int main(int argc, char *argv[]);};
struct consumer_conv
{
Genode::Env &env;
gpgpu_virt::Connection backend_driver;
Genode::Allocator_stupid allocator;
const unsigned long size = 0x40000000;
cl_genode clg;
Genode::Ram_dataspace_capability vgpu_mem_ram_cap;
Genode::Ram_dataspace_capability vgpu_shm_ram_cap;
volatile uint8_t *ready;
const unsigned long img_size = 320 * 240 * sizeof(float);
volatile float *data;
void init()
{
Genode::log("===Init Consumer Surf===");
clInitGenode(clg);
// register vgpu (optional?)
const unsigned long size_vgpu_mem = 0x1000;
backend_driver.register_vm(size_vgpu_mem, vgpu_mem_ram_cap);
// create shm for gpu
const unsigned long id = 0;
Genode::size_t total_size = 0;
while (total_size == 0)
{
backend_driver.ask_shm(id, total_size, vgpu_shm_ram_cap);
}
// attach shm to vm
Genode::addr_t mapped_base = env.rm().attach(vgpu_shm_ram_cap);
clg.add_shm_mapped_base(id, mapped_base);
// use it in allocator
allocator.add_range(mapped_base, total_size);
// alloc whole data
ready = (uint8_t *)allocator.alloc(1);
data = (float *)allocator.alloc_aligned(0x10000, img_size);
}
void run()
{
Genode::log("===Run Consumer Surf===");
Libc::with_libc([&]
{
for(;;)
{
while (*ready != 0x42);
ns_OpenSurf::main(2, (char**)data);
//Genode::log(data[0]);
sleep(3);
} });
Genode::log("===End===");
Genode::log("Consumer Surf completed");
}
consumer_conv(Genode::Env &e) : env(e), backend_driver(env), allocator(), clg(env, size), ready(nullptr), data(nullptr)
{
}
};
void Libc::Component::construct(Libc::Env &env)
{
static consumer_conv p(env);
p.init();
p.run();
}

View File

@@ -0,0 +1,9 @@
TARGET = consumer1
SRC_CC = main.cc \
OpenSurf.cpp \
../hello_gpgpu/CL/cl.cc ../hello_gpgpu/CL/cl_genode.cc \
../hello_gpgpu/allocator_stupid.cc
LIBS = base libc libm
CC_CXX_WARN_STRICT =

View File

@@ -0,0 +1,326 @@
/**
* 2DConvolution.c: This file is part of the PolyBench/GPU 1.0 test suite.
*
*
* Contact: Scott Grauer-Gray <sgrauerg@gmail.com>
* Will Killian <killian@udel.edu>
* Louis-Noel Pouchet <pouchet@cse.ohio-state.edu>
* Web address: http://www.cse.ohio-state.edu/~pouchet/software/polybench/GPU
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <math.h>
#define CL_TARGET_OPENCL_VERSION 100
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include "../hello_gpgpu/CL/cl.h"
#endif
#define POLYBENCH_TIME 1
//select the OpenCL device to use (can be GPU, CPU, or Accelerator such as Intel Xeon Phi)
#define OPENCL_DEVICE_SELECTION CL_DEVICE_TYPE_GPU
#include "../hello_gpgpu/benchmark/convolution-2d/2DConvolution.h"
#include "../hello_gpgpu/polybench.h"
//define the error threshold for the results "not matching"
#define PERCENT_DIFF_ERROR_THRESHOLD 1.05
#define MAX_SOURCE_SIZE (0x100000)
#if defined(cl_khr_fp64) // Khronos extension available?
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#elif defined(cl_amd_fp64) // AMD extension available?
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif
namespace ns_convolution_2d {
#include "../hello_gpgpu/benchmark/convolution-2d/2DConvolution_kernel.h"
#include "../hello_gpgpu/polybenchUtilFuncts.h"
char str_temp[1024];
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_devices;
cl_uint num_platforms;
cl_int errcode;
cl_context clGPUContext;
cl_kernel clKernel;
cl_command_queue clCommandQue;
cl_program clProgram;
cl_mem a_mem_obj;
cl_mem b_mem_obj;
cl_mem c_mem_obj;
FILE *fp;
char *source_str;
size_t source_size;
// patch config for consumer2
#undef NI
#undef NJ
#define NI 320
#define NJ 240
void compareResults(int ni, int nj, DATA_TYPE POLYBENCH_2D(B, NI, NJ, ni, nj), DATA_TYPE POLYBENCH_2D(B_outputFromGpu, NI, NJ, ni, nj))
{
int i, j, fail;
fail = 0;
// Compare outputs from CPU and GPU
for (i=1; i < (ni-1); i++)
{
for (j=1; j < (nj-1); j++)
{
if (percentDiff(B[i][j], B_outputFromGpu[i][j]) > PERCENT_DIFF_ERROR_THRESHOLD)
{
fail++;
}
}
}
// Print results
printf("Non-Matching CPU-GPU Outputs Beyond Error Threshold of %4.2f Percent: %d\n", PERCENT_DIFF_ERROR_THRESHOLD, fail);
}
void read_cl_file()
{
// Load the kernel source code into the array source_str
// fp = fopen("2DConvolution.cl", "r");
// if (!fp) {
// fprintf(stdout, "Failed to load kernel.\n");
// exit(1);
// }
// source_str = (char*)malloc(MAX_SOURCE_SIZE);
// source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
// fclose( fp );
}
void init(int ni, int nj, DATA_TYPE POLYBENCH_2D(A, NI, NJ, ni, nj))
{
int i, j;
for (i = 0; i < ni; ++i)
{
for (j = 0; j < nj; ++j)
{
A[i][j] = (float)rand()/RAND_MAX;
}
}
}
void cl_initialization()
{
// Get platform and device information
errcode = clGetPlatformIDs(1, &platform_id, &num_platforms);
if(errcode == CL_SUCCESS) printf("number of platforms is %d\n",num_platforms);
else printf("Error getting platform IDs\n");
errcode = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME, sizeof(str_temp), str_temp,NULL);
if(errcode == CL_SUCCESS) printf("platform name is %s\n",str_temp);
else printf("Error getting platform name\n");
errcode = clGetPlatformInfo(platform_id, CL_PLATFORM_VERSION, sizeof(str_temp), str_temp,NULL);
if(errcode == CL_SUCCESS) printf("platform version is %s\n",str_temp);
else printf("Error getting platform version\n");
errcode = clGetDeviceIDs( platform_id, OPENCL_DEVICE_SELECTION, 1, &device_id, &num_devices);
if(errcode == CL_SUCCESS) printf("number of devices is %d\n", num_devices);
else printf("Error getting device IDs\n");
errcode = clGetDeviceInfo(device_id,CL_DEVICE_NAME, sizeof(str_temp), str_temp,NULL);
if(errcode == CL_SUCCESS) printf("device name is %s\n",str_temp);
else printf("Error getting device name\n");
// Create an OpenCL context
clGPUContext = clCreateContext( NULL, 1, &device_id, NULL, NULL, &errcode);
if(errcode != CL_SUCCESS) printf("Error in creating context\n");
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext, device_id, 0, &errcode);
if(errcode != CL_SUCCESS) printf("Error in creating command queue\n");
}
void cl_mem_init(DATA_TYPE POLYBENCH_2D(A, NI, NJ, ni, nj))
{
a_mem_obj = clCreateBufferSHM(clGPUContext, CL_MEM_READ_ONLY, sizeof(DATA_TYPE) * NI * NJ, A, &errcode, 0);
b_mem_obj = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, sizeof(DATA_TYPE) * NI * NJ, NULL, &errcode);
if(errcode != CL_SUCCESS) printf("Error in creating buffers\n");
//errcode = clEnqueueWriteBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0, sizeof(DATA_TYPE) * NI * NJ, A, 0, NULL, NULL);
//if(errcode != CL_SUCCESS)printf("Error in writing buffers\n");
}
void cl_load_prog()
{
// Create a program from the kernel source
const size_t kernel_size = __2DConvolution_Gen9core_gen_len;
const unsigned char* kernel_bin = __2DConvolution_Gen9core_gen;
clProgram = clCreateProgramWithBinary(clGPUContext, 1, &device_id, &kernel_size, &kernel_bin, NULL, &errcode);
// clProgram = clCreateProgramWithSource(clGPUContext, 1, (const char **)&source_str, (const size_t *)&source_size, &errcode);
if(errcode != CL_SUCCESS) printf("Error in creating program\n");
// Build the program
errcode = clBuildProgram(clProgram, 1, &device_id, NULL, NULL, NULL);
if(errcode != CL_SUCCESS) printf("Error in building program\n");
// Create the OpenCL kernel
clKernel = clCreateKernel(clProgram, "Convolution2D_kernel", &errcode);
if(errcode != CL_SUCCESS) printf("Error in creating kernel\n");
clFinish(clCommandQue);
}
void cl_launch_kernel(int ni, int nj)
{
size_t localWorkSize[2], globalWorkSize[2];
localWorkSize[0] = DIM_LOCAL_WORK_GROUP_X;
localWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y;
globalWorkSize[0] = (size_t)ceil(((float)NI) / ((float)DIM_LOCAL_WORK_GROUP_X)) * DIM_LOCAL_WORK_GROUP_X;
globalWorkSize[1] = (size_t)ceil(((float)NJ) / ((float)DIM_LOCAL_WORK_GROUP_Y)) * DIM_LOCAL_WORK_GROUP_Y;
/* Start timer. */
polybench_start_instruments;
// Set the arguments of the kernel
errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
errcode = clSetKernelArg(clKernel, 2, sizeof(int), &ni);
errcode |= clSetKernelArg(clKernel, 3, sizeof(int), &nj);
if(errcode != CL_SUCCESS) printf("Error in seting arguments\n");
// Execute the OpenCL kernel
errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
if(errcode != CL_SUCCESS) printf("Error in launching kernel\n");
clFinish(clCommandQue);
/* Stop and print timer. */
polybench_stop_instruments;
printf("GPU Time in seconds:\n");
polybench_print_instruments;
}
void cl_clean_up()
{
// Clean up
errcode = clFlush(clCommandQue);
errcode = clFinish(clCommandQue);
errcode = clReleaseKernel(clKernel);
errcode = clReleaseProgram(clProgram);
errcode = clReleaseMemObject(a_mem_obj);
errcode = clReleaseMemObject(b_mem_obj);
errcode = clReleaseCommandQueue(clCommandQue);
errcode = clReleaseContext(clGPUContext);
if(errcode != CL_SUCCESS) printf("Error in cleanup\n");
}
void conv2D(int ni, int nj, DATA_TYPE POLYBENCH_2D(A, NI, NJ, ni, nj), DATA_TYPE POLYBENCH_2D(B, NI, NJ, ni, nj))
{
int i, j;
DATA_TYPE c11, c12, c13, c21, c22, c23, c31, c32, c33;
c11 = +0.2; c21 = +0.5; c31 = -0.8;
c12 = -0.3; c22 = +0.6; c32 = -0.9;
c13 = +0.4; c23 = +0.7; c33 = +0.10;
for (i = 1; i < _PB_NI - 1; ++i) // 0
{
for (j = 1; j < _PB_NJ - 1; ++j) // 1
{
B[i][j] = c11 * A[(i - 1)][(j - 1)] + c12 * A[(i + 0)][(j - 1)] + c13 * A[(i + 1)][(j - 1)]
+ c21 * A[(i - 1)][(j + 0)] + c22 * A[(i + 0)][(j + 0)] + c23 * A[(i + 1)][(j + 0)]
+ c31 * A[(i - 1)][(j + 1)] + c32 * A[(i + 0)][(j + 1)] + c33 * A[(i + 1)][(j + 1)];
}
}
}
/* DCE code. Must scan the entire live-out data.
Can be used also to check the correctness of the output. */
static
void print_array(int ni, int nj,
DATA_TYPE POLYBENCH_2D(B,NI,NJ,ni,nj))
{
int i, j;
for (i = 0; i < ni; i++)
for (j = 0; j < nj; j++) {
fprintf (stderr, DATA_PRINTF_MODIFIER, B[i][j]);
if ((i * ni + j) % 20 == 0) fprintf (stderr, "\n");
}
fprintf (stderr, "\n");
}
int main(int argc, char *argv[])
{
/* Retrieve problem size */
int ni = NI;
int nj = NJ;
POLYBENCH_2D_ARRAY_DECL(A,DATA_TYPE,NI,NJ,ni,nj);
POLYBENCH_2D_ARRAY_DECL(B,DATA_TYPE,NI,NJ,ni,nj);
POLYBENCH_2D_ARRAY_DECL(B_outputFromGpu,DATA_TYPE,NI,NJ,ni,nj);
//init(ni, nj, POLYBENCH_ARRAY(A));
A = (float(*)[NI][NJ]) argv;
read_cl_file();
cl_initialization();
cl_mem_init(POLYBENCH_ARRAY(A));
cl_load_prog();
cl_launch_kernel(ni, nj);
errcode = clEnqueueReadBuffer(clCommandQue, b_mem_obj, CL_TRUE, 0, NI*NJ*sizeof(DATA_TYPE), POLYBENCH_ARRAY(B_outputFromGpu), 0, NULL, NULL);
if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n");
#ifdef RUN_ON_CPU
/* Start timer. */
polybench_start_instruments;
conv2D(ni, nj, POLYBENCH_ARRAY(A), POLYBENCH_ARRAY(B));
/* Stop and print timer. */
printf("CPU Time in seconds:\n");
polybench_stop_instruments;
polybench_print_instruments;
compareResults(ni, nj, POLYBENCH_ARRAY(B), POLYBENCH_ARRAY(B_outputFromGpu));
#else //prevent dead code elimination
polybench_prevent_dce(print_array(ni, nj, POLYBENCH_ARRAY(B_outputFromGpu)));
#endif //RUN_ON_CPU
cl_clean_up();
//POLYBENCH_FREE_ARRAY(A);
POLYBENCH_FREE_ARRAY(B);
POLYBENCH_FREE_ARRAY(B_outputFromGpu);
return 0;
}
}

View File

@@ -0,0 +1,99 @@
#include <base/log.h>
#include <base/heap.h>
#include <base/allocator_avl.h>
#include <base/attached_rom_dataspace.h>
#include <libc/component.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
// OpenCL
#define CL_TARGET_OPENCL_VERSION 100
#include "../hello_gpgpu/CL/cl.h"
// rpc
#include <gpgpu_virt/connection.h>
// stupid alloc
#include "../hello_gpgpu/allocator_stupid.h"
namespace ns_convolution_2d{int main(int argc, char *argv[]);};
struct consumer_conv
{
Genode::Env &env;
gpgpu_virt::Connection backend_driver;
Genode::Allocator_stupid allocator;
const unsigned long size = 0x40000000;
cl_genode clg;
Genode::Ram_dataspace_capability vgpu_mem_ram_cap;
Genode::Ram_dataspace_capability vgpu_shm_ram_cap;
volatile uint8_t *ready;
const unsigned long img_size = 320 * 240 * sizeof(float);
volatile float *data;
void init()
{
Genode::log("===Init Consumer Conv===");
clInitGenode(clg);
// register vgpu (optional?)
const unsigned long size_vgpu_mem = 0x1000;
backend_driver.register_vm(size_vgpu_mem, vgpu_mem_ram_cap);
// create shm for gpu
const unsigned long id = 0;
Genode::size_t total_size = 0;
while (total_size == 0)
{
backend_driver.ask_shm(id, total_size, vgpu_shm_ram_cap);
}
// attach shm to vm
Genode::addr_t mapped_base = env.rm().attach(vgpu_shm_ram_cap);
clg.add_shm_mapped_base(id, mapped_base);
// use it in allocator
allocator.add_range(mapped_base, total_size);
// alloc whole data
ready = (uint8_t *)allocator.alloc(1);
data = (float *)allocator.alloc_aligned(0x10000, img_size);
}
void run()
{
Genode::log("===Run Consumer Conv===");
Libc::with_libc([&]
{
for(;;)
{
while (*ready != 0x42);
ns_convolution_2d::main(2, (char**)data);
//Genode::log(data[0]);
sleep(3);
} });
Genode::log("===End===");
Genode::log("Consumer Conv completed");
}
consumer_conv(Genode::Env &e) : env(e), backend_driver(env), allocator(), clg(env, size), ready(nullptr), data(nullptr)
{
}
};
void Libc::Component::construct(Libc::Env &env)
{
static consumer_conv p(env);
p.init();
p.run();
}

View File

@@ -0,0 +1,10 @@
TARGET = consumer2
SRC_CC = main.cc \
2DConvolution.cc \
../hello_gpgpu/polybench.cc \
../hello_gpgpu/CL/cl.cc ../hello_gpgpu/CL/cl_genode.cc \
../hello_gpgpu/allocator_stupid.cc
LIBS = base libc libm
CC_CXX_WARN_STRICT =

View File

@@ -357,6 +357,26 @@ clCreateBuffer(cl_context context,
clmem->bc.buffer = host_ptr;
clmem->bc.buffer_size = (uint32_t)size;
clmem->bc.non_pointer_type = false;
clmem->bc.shmid = -1;
*errcode_ret |= CL_SUCCESS;
return clmem;
}
CL_API_ENTRY cl_mem CL_API_CALL
clCreateBufferSHM(cl_context context,
cl_mem_flags flags,
size_t size,
void * host_ptr,
cl_int * errcode_ret,
int shid)
{
cl_mem clmem = (cl_mem)g_cl_genode->alloc(sizeof(struct _cl_mem));
clmem->virt_vm = host_ptr;
clmem->bc.buffer = host_ptr;
clmem->bc.buffer_size = (uint32_t)size;
clmem->bc.non_pointer_type = false;
clmem->bc.shmid = shid;
*errcode_ret |= CL_SUCCESS;
return clmem;

View File

@@ -1088,6 +1088,14 @@ clCreateBuffer(cl_context context,
void * host_ptr,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateBufferSHM(cl_context context,
cl_mem_flags flags,
size_t size,
void * host_ptr,
cl_int * errcode_ret,
int shid) CL_API_SUFFIX__VERSION_1_0;
#ifdef CL_VERSION_1_1
extern CL_API_ENTRY cl_mem CL_API_CALL

View File

@@ -1,6 +1,6 @@
#include "cl_genode.h"
cl_genode::cl_genode(Genode::Env& env, unsigned long size) : env(env), allocator(), mapped_base(0), backend_driver(env)
cl_genode::cl_genode(Genode::Env& env, unsigned long size) : env(env), allocator(), mapped_base(0), backend_driver(env), shm_mapped_base{0, }
{
// get shared memory with driver
Genode::Ram_dataspace_capability ram_cap;
@@ -38,7 +38,8 @@ void cl_genode::enqueue_task(struct kernel_config* kconf)
// convert virt vm addr to offset
for(int i = 0; i < kconf->buffCount; i++)
{
kconf->buffConfigs[i].buffer = (void*)((Genode::addr_t)kconf->buffConfigs[i].buffer - mapped_base);
const Genode::addr_t mbase = kconf->buffConfigs[i].shmid == -1 ? mapped_base : shm_mapped_base[kconf->buffConfigs[i].shmid];
kconf->buffConfigs[i].buffer = (void*)((Genode::addr_t)kconf->buffConfigs[i].buffer - mbase);
}
kconf->buffConfigs = (struct buffer_config*)((Genode::addr_t)kconf->buffConfigs - mapped_base);
kconf->kernelName = (char*)((Genode::addr_t)kconf->kernelName - mapped_base);
@@ -55,3 +56,8 @@ void cl_genode::wait(struct kernel_config* kconf)
asm("nop");
}
}
void cl_genode::add_shm_mapped_base(int shmid, Genode::addr_t mbase)
{
shm_mapped_base[shmid] = mbase;
}

View File

@@ -17,6 +17,9 @@
// driver
#include <gpgpu/gpgpu.h>
// config
#include "../../../../dde_uos-intel-gpgpu/src/config.h"
class cl_genode
{
private:
@@ -30,6 +33,9 @@ private:
// rpc
gpgpu_virt::Connection backend_driver;
// shm mapped_bases
Genode::addr_t shm_mapped_base[MAX_SHM_REGIONS];
// do not allow copies
cl_genode(const cl_genode& copy) = delete;
cl_genode& operator=(const cl_genode& src) = delete;
@@ -100,6 +106,14 @@ public:
*
*/
void reset() { allocator.reset(); }
/**
* @brief
*
* @param shmid
* @param mbase
*/
void add_shm_mapped_base(int shmid, Genode::addr_t mbase);
};
#endif // CL_GENODE_H

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,155 @@
#include <base/log.h>
#include <base/heap.h>
#include <base/allocator_avl.h>
#include <base/attached_rom_dataspace.h>
#include <util/misc_math.h>
#include <libc/component.h>
#include <unistd.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
// rpc
#include <gpgpu_virt/connection.h>
// stupid alloc
#include "../hello_gpgpu/allocator_stupid.h"
// imgs
#include "frac_320_240.h"
#include "frac2_320_240.h"
namespace fake_cv
{
class Mat
{
public:
int rows;
int cols;
int step;
float *data;
template <typename T>
T *ptr(int off)
{
return (T *)&data[off * sizeof(T)];
}
};
Mat imread(const char *img)
{
Mat m;
m.rows = height;
m.cols = width;
m.step = width * sizeof(float);
const size_t size = m.rows * m.cols;
m.data = (float *)malloc(size * sizeof(float));
for (size_t i = 0; i < size; ++i)
{
unsigned int px[3];
HEADER_PIXEL(img, px);
const unsigned int g = 0.298936021293775 * px[0] + 0.587043074451121 * px[1] + 0.114020904255103 * px[2];
m.data[i] = g / 251.; // 255.
}
return m;
}
}
typedef fake_cv::Mat Image;
fake_cv::Mat getGray(const fake_cv::Mat &img)
{
return img;
}
struct producer
{
Genode::Env &env;
gpgpu_virt::Connection backend_driver;
Genode::Allocator_stupid allocator;
float *img1;
float *img2;
unsigned long img_size;
volatile uint8_t *ready;
volatile float *data;
void init()
{
Genode::log("===Init Producer===");
// register vgpu (optional?)
const unsigned long size_vgpu_mem = 0x1000;
Genode::Ram_dataspace_capability vgpu_mem_ram_cap;
backend_driver.register_vm(size_vgpu_mem, vgpu_mem_ram_cap);
// create shm for gpu
const unsigned long size_vgpu_shm = 0x100000;
Genode::Ram_dataspace_capability vgpu_shm_ram_cap;
backend_driver.register_shm(size_vgpu_shm, vgpu_shm_ram_cap);
// attach shm to vm
Genode::addr_t mapped_base = env.rm().attach(vgpu_shm_ram_cap);
// use it in allocator
allocator.add_range(mapped_base, size_vgpu_shm);
// set not ready
ready = (uint8_t *)allocator.alloc(1);
// load img1 and img2
const Image s1 = fake_cv::imread(header_data);
Image i1 = getGray(s1);
img1 = (float *)i1.ptr<float>(0);
const Image s2 = fake_cv::imread(header_data2);
Image i2 = getGray(s2);
img2 = (float *)i2.ptr<float>(0);
img_size = Genode::max(i1.rows * i2.cols * sizeof(float), i2.rows * i2.cols * sizeof(float));
// alloc whole data
data = (float *)allocator.alloc_aligned(0x10000, img_size);
}
void run()
{
Genode::log("===Run Producer===");
Libc::with_libc([&]
{
srand(time(NULL));
int flip = 0;
for (;;)
{
// fetch new img
*ready = 0x43;
memcpy((void*)data, flip ? img1 : img2, img_size);
*ready = 0x42;
flip = !flip;
Genode::log("===New Image ready: ", flip, " ===");
// sleep for 5s
sleep(5);
} });
Genode::log("===End===");
Genode::log("Producer completed");
}
producer(Genode::Env &e) : env(e), backend_driver(env), allocator(), img1(nullptr), img2(nullptr), img_size(0), ready(nullptr), data(nullptr)
{
}
};
void Libc::Component::construct(Libc::Env &env)
{
static producer p(env);
p.init();
p.run();
}

View File

@@ -0,0 +1,7 @@
TARGET = producer
SRC_CC = main.cc \
../hello_gpgpu/allocator_stupid.cc
LIBS = base libc libm
CC_CXX_WARN_STRICT =