11 cl_program
load_program(
const char* cl_kernel_directory,
const char* file_name, cl_context ctx)
14 sprintf(file_path,
"%s/%s.cl",cl_kernel_directory,file_name);
15 FILE* f = fopen(file_path,
"r");
17 fprintf(stderr,
"File %s/%s.cl not found.",cl_kernel_directory,file_name);
22 size_t file_size = (size_t)ftell(f);
25 char* buffer = (
char*)malloc(file_size+1);
26 buffer[file_size] =
'\0';
27 fread(buffer,1,file_size,f);
30 cl_program result = clCreateProgramWithSource(ctx, 1, (
const char**)&buffer, &file_size, &
clm4rm_error);
36 cl_build_status
build_program(cl_program program, cl_device_id device,
int tile_m)
40 sprintf(options,
"-D %s=%i -D %s=%i -D %s=%i",
44 clm4rm_error = clBuildProgram(program, 1,&device, options, NULL,NULL);
46 cl_build_status build_status;
47 clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS,
sizeof(build_status), &build_status,NULL);
49 if (build_status != CL_SUCCESS) {
51 clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
52 char *log = (
char *) malloc(log_size + 1);
53 clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size + 1, log, NULL);
55 fprintf(stderr,
"%i %s\n", build_status, log);
85 cl_context ctx, cl_device_id device)
97 clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_SIZES, 3*
sizeof(
size_t), &
max_items, NULL);
98 clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(
size_t),&
max_group_size, NULL);
100 clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE,
sizeof(
size_t), &
shared_mem_bytes, NULL);
101 clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE,
sizeof(
size_t), &
heap_size, NULL);
102 clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(
size_t), &
max_object_size, NULL);
126 for(
int tile_m=
MAX_TILE_M; tile_m >= 1; --tile_m)
133 cl_build_status status =
build_program(program, device, tile_m);
134 if (status != CL_BUILD_SUCCESS)
159 for(
int tile_m=1; tile_m <=
MAX_TILE_M; ++tile_m) {
164 if (device) clReleaseDevice(device);
165 if (ctx) clReleaseContext(ctx);
168 printf(
"WARNING: %li bytes of device memory have not been released.\n",
allocated_size);
177 Q_ASSERT(host_matrix->nrows==gpu_matrix->
nrows);
178 Q_ASSERT(host_matrix->ncols==gpu_matrix->
ncols);
180 Q_ASSERT(host_matrix->row_offset==0);
181 Q_ASSERT(mzd_is_windowed(host_matrix)==0);
186 if ((padding > 0) && (nrows % padding))
187 return nrows + (padding - nrows % padding);
221 printf(
"WARNING object size %li exceeds max. %li\n", sz,
max_object_size);
240 m->
data = clCreateImage2D(ctx,
241 read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE,
245 m->
data = clCreateBuffer(ctx,
246 read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE,
263 m->
data = clCreateImage2D(ctx,
264 (read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR,
268 m->
data = clCreateBuffer(ctx,
269 (read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR,
291 while(list->
count > 0) {
306 for(
int i=0; i < b->
count; ++i) {
307 cl_event evt = b->
events[i];
308 Q_ASSERT(evt != NULL);
327 for (
int i = 0; i < cond->
pre->
count; ++i)
340 for (
int i = 0; i < cond->
pre->
count; ++i)
369 queue, gpu_matrix->
data, &zero,
sizeof(zero),
389 size_t origin[3] = { 0,0,0 };
410 host_matrix = mzd_init(gpu_matrix->
nrows,gpu_matrix->
ncols);
416 size_t origin[3] = { 0,0,0 };
442 for (
int row = 0; row < M->nrows; ++row)
444 word* Mrow = M->rows[row];
445 for (
int col = 0; col < width; col += 2)
447 word Mword = Mrow[col>>1];
454 for (
int col = 0; col < width; col++)
464 for (
int row = 0; row < M->nrows; ++row)
466 word* Mrow = M->rows[row];
467 for (
int col = 0; col < width; col += 2)
471 Mrow[col>>1] |= ((word)G[(col+1)*
padded_rows + row]) << 32;
void init_events(clm4rm_event_list *list)
reset events list
void clm4rm_zero_fill(clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
Fill a matrix with zero data. The operation is scheduled for asynchronous execution on the GPU....
size_t heap_size
size of allocated memory in bytes
OpenCL boolean matrix data structure. Data is arranged in 32 bit words.
clm4rm_event_list event_lists[2]
pre-conditions and post-conditions
size_t shared_mem_bytes
size of shared memory in bytes
cl_kernel clm4rm_and_kernel
rci_t padded_rows
Number of rows padded to a multiple of 32.
void assertMatrixLayout(const clmatrix_t *gpu_matrix, const mzd_t *host_matrix)
cl_int clm4rm_error
latest OpenCL result code. CL_SUCCESS indicates no error.
cl_program load_program(const char *cl_kernel_directory, const char *file_name, cl_context ctx)
unsigned int gpuword
a GPU word has 32 bits
cl_kernel clm4rm_copy_kernel
cl_event * pre_events(clm4rm_conditions *cond)
cl_build_status build_program(cl_program program, cl_device_id device, int tile_m)
clm4rm_event_list * post
post-conditions: conditions after an operation finishes. post-conditions may act as pre-conditions f...
void release_events(clm4rm_event_list *list)
release events
Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be est...
rci_t ncols
Number of columns.
cl_int clm4rm_setup(const char *cl_kernel_directory, cl_context ctx, cl_device_id device)
load OpenCL kernels and set up parameters
void clm4rm_tear_down(cl_context ctx, cl_device_id device)
release OpenCL resources
size_t shared_mem_words
size of shared memory in (32bit) words
gpuword * copy_matrix_data(gpuword *G, const mzd_t *M, int padded_rows)
create a column-major copy from an mzd_t matrix
bool printed_heap_warning
cl_kernel clm4rm_or_kernel
void clm4rm_write(clmatrix_t *gpu_matrix, const mzd_t *host_matrix, cl_command_queue queue, clm4rm_conditions *cond)
Copy matrix data from host memory to GPU. The operation is scheduled for asynchronous execution of th...
cl_kernel clm4rm_mul_kernel
OpenCL kernel for Four-Russians matrix multiplication.
cl_kernel clutri_mul_kernel[MAX_TILE_M+1]
OpenCL kernels for cubic upper-triangle matrix multiplication. Each kernel for a tile size....
a list of cl_events; used by clm4rm_conditions to keep track of scheduled jobs in the OpenCL queue.
void merge_events(clm4rm_event_list *a, clm4rm_event_list *b)
append two lists
void join_conditions(clm4rm_conditions *cond)
called when the pre-conditions are met. The post-conditions become the new pre-conditions.
cl_uint pre_count(clm4rm_conditions *cond)
void merge_conditions(clm4rm_conditions *a, clm4rm_conditions *b)
merge pre-conditions into one list
cl_kernel clcubic_mul_kernel[MAX_TILE_M+1]
OpenCL kernels for cubic matrix multiplication. Each kernel for a tile size. Actual tile sizes are in...
clmatrix_t * clm4rm_create(rci_t rows, rci_t cols, int rowpadding, int read_only, cl_context ctx)
create an empty matrix
#define clm4rm_radix
word size. for compatibility with GPU memory layout, we operate on 32 bit words.
cl_event * push_event(clm4rm_conditions *cond)
reserve one post-condition event
cl_program programs[MAX_TILE_M+1]
rci_t width
Number of words with valid bits: width = ceil(ncols / m4ri_radix).
cl_kernel clm4rm_query_diagonal_kernel
cl_mem data
handle to GPU data (32-bit unsigned integers)
int padded_rows(int nrows, int padding)
calculate the number of padded rows
size_t max_group_size
max. size of a work group
clmatrix_t * clm4rm_copy(const mzd_t *host_matrix, int rowpadding, int read_only, cl_context ctx)
create a copy from a matrix in M4RI format
clm4rm_event_list * pre
pre-conditions: an operation is scheduled when all pre-conditions are met
void init_conditions(clm4rm_conditions *cond)
reset conditions list
rci_t nrows
Number of rows.
cl_event * pushed_event(clm4rm_conditions *cond)
clmatrix_t * clm4rm_allocate(int rows, int cols, int rowpadding)
cl_image_format IMAGE_FORMAT
size_t max_object_size
max. object allocation size
void copy_back_matrix_data(mzd_t *M, const gpuword *G, int padded_rows)
copy back a column-major matrix
cl_uint count
current number of events
gpuword * local_data
matrix data in CPU memory
rci_t padded_cols
Number of columns padded to a multiple of 64.
mzd_t * clm4rm_read(mzd_t *host_matrix, clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
copy matrix from gpu memory to host
void track_heap_size(size_t sz)
void release_conditions(clm4rm_conditions *cond)
release conditions list
size_t max_items[3]
max. number of items in each dimension
cl_event events[MAX_EVENTS]
array of OpenCL events