fv/html/clm4rm_8cpp_source.html

 //
 // Created by nightrider on 21.09.18.
 //

 #include <clm4rm.h>
 #include <stdio.h>
 #include <qdebug.h>

 /// Latest OpenCL result code recorded by the functions in this file;
 /// CL_SUCCESS indicates no error.
 cl_int clm4rm_error;

 /**
  * Read the OpenCL source file "<cl_kernel_directory>/<file_name>.cl" and
  * create (but do not build) a program object from it.
  *
  * @param cl_kernel_directory directory that contains the .cl files
  * @param file_name           base name of the kernel file, without ".cl"
  * @param ctx                 OpenCL context the program is created in
  * @return the new program, or NULL if the path is too long, the file cannot
  *         be opened or read, or memory runs out. When clCreateProgramWithSource
  *         is reached, its result code is stored in clm4rm_error.
  */
 cl_program load_program(const char* cl_kernel_directory, const char* file_name, cl_context ctx)
 {
     char file_path[2048];
     //  bounded formatting: an over-long directory must not overflow file_path
     int path_length = snprintf(file_path, sizeof(file_path), "%s/%s.cl",cl_kernel_directory,file_name);
     if (path_length < 0 || (size_t)path_length >= sizeof(file_path)) {
         fprintf(stderr,"Path %s/%s.cl is too long.\n",cl_kernel_directory,file_name);
         return NULL;
     }

     FILE* f = fopen(file_path,"r");
     if (f==NULL) {
         fprintf(stderr,"File %s/%s.cl not found.",cl_kernel_directory,file_name);
         return NULL;
     }

     //  determine an upper bound for the source size
     long file_end = -1;
     if (fseek(f,0,SEEK_END)==0)
         file_end = ftell(f);
     if (file_end < 0) {
         fprintf(stderr,"Cannot determine size of %s.\n",file_path);
         fclose(f);
         return NULL;
     }
     rewind(f);

     char* buffer = (char*)malloc((size_t)file_end+1);
     if (buffer==NULL) {
         fprintf(stderr,"Out of memory while reading %s.\n",file_path);
         fclose(f);
         return NULL;
     }

     //  in text mode fewer bytes than file_end may arrive (line-end translation);
     //  the number actually read is the true length of the source
     size_t source_size = fread(buffer,1,(size_t)file_end,f);
     int read_failed = ferror(f);
     fclose(f);
     if (read_failed) {
         fprintf(stderr,"Error reading %s.\n",file_path);
         free(buffer);
         return NULL;
     }
     buffer[source_size] = '\0';

     const char* source = buffer;
     cl_program result = clCreateProgramWithSource(ctx, 1, &source, &source_size, &clm4rm_error);

     //  OpenCL copies the source string; the buffer is no longer needed
     free(buffer);
     return result;
 }

 /**
  * Compile and link a program for one device.
  *
  * The values of IMAGE2D and BUFFERED and the requested tile size are passed
  * to the OpenCL compiler as -D definitions. When the build does not succeed,
  * the build status and the build log are printed to stderr.
  *
  * @param program program created by load_program()
  * @param device  device to build for
  * @param tile_m  value of the TILE_M definition
  * @return the build status reported for the device; CL_BUILD_NONE if it
  *         could not be queried. The result of clBuildProgram is stored
  *         in clm4rm_error.
  */
 cl_build_status build_program(cl_program program, cl_device_id device, int tile_m)
 {
 #define str(S) #S
     char options[1024];
     snprintf(options, sizeof(options), "-D %s=%i -D %s=%i -D %s=%i",
             str(IMAGE2D), IMAGE2D,
             str(BUFFERED), BUFFERED,
             "TILE_M", tile_m);
     clm4rm_error = clBuildProgram(program, 1,&device, options, NULL,NULL);

     //  initialised, so that a failing query is never mistaken for success
     cl_build_status build_status = CL_BUILD_NONE;
     clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(build_status), &build_status,NULL);

     if (build_status != CL_BUILD_SUCCESS) {
         size_t log_size = 0;
         clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
         char *log = (char *) malloc(log_size + 1);
         if (log != NULL) {
             log[0] = '\0';
             clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size + 1, log, NULL);
             //  guarantee termination even if the query wrote nothing useful
             log[log_size] = '\0';
             fprintf(stderr, "%i %s\n", build_status, log);
             free(log);
         }
         else {
             fprintf(stderr, "%i\n", build_status);
         }
     }

     Q_ASSERT(clm4rm_error == CL_SUCCESS);
     return build_status;
 }

 //  Kernels of "clm4rm_bitwise.cl"; created by clm4rm_setup() from the
 //  kernel functions "clm4rm_and", "clm4rm_or", "clm4rm_copy" and
 //  "clm4rm_query_diagonal".
 cl_kernel clm4rm_and_kernel;
 cl_kernel clm4rm_or_kernel;
 cl_kernel clm4rm_copy_kernel;
 cl_kernel clm4rm_query_diagonal_kernel;

 //  OpenCL kernel for Four-Russians matrix multiplication ("clm4rm_mul.cl").
 cl_kernel clm4rm_mul_kernel;
 //cl_kernel clm4rm_addmul_kernel;

 //  Kernels for cubic and cubic upper-triangle multiplication, one per tile
 //  size. clm4rm_setup() fills the entries 1..MAX_TILE_M; entry 0 is not set.
 cl_kernel clcubic_mul_kernel[MAX_TILE_M+1];
 cl_kernel clutri_mul_kernel[MAX_TILE_M+1];

 //  Device limits queried by clm4rm_setup():
 //  max. size of a work group (CL_DEVICE_MAX_WORK_GROUP_SIZE)
 size_t max_group_size;
 //  max. number of work items in each dimension (CL_DEVICE_MAX_WORK_ITEM_SIZES)
 size_t max_items[3];

 //  local memory of the device (CL_DEVICE_LOCAL_MEM_SIZE), in bytes and in gpuwords
 size_t shared_mem_bytes, shared_mem_words;
 //  heap_size: global memory of the device (CL_DEVICE_GLOBAL_MEM_SIZE);
 //  allocated_size: running total of bytes added by track_heap_size()
 size_t heap_size, allocated_size=0;
 //  max. size of a single memory object (CL_DEVICE_MAX_MEM_ALLOC_SIZE)
 size_t max_object_size;

 cl_program programs[MAX_TILE_M+1];


 cl_int clm4rm_setup(const char* cl_kernel_directory,
                         cl_context ctx, cl_device_id device)
 {
     //  load program from disk

     //  "clm4rm_bitwise.cl"
     cl_program program = programs[0] = load_program(cl_kernel_directory,"clm4rm_bitwise",ctx);
     if (program==NULL)
         return -1;
     //  compile for device
     if (build_program(program,device,0) != CL_BUILD_SUCCESS)
         return -1;

     clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_SIZES, 3*sizeof(size_t), &max_items, NULL);
     clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),&max_group_size, NULL);
     Q_ASSERT(max_group_size > 0);
     clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(size_t), &shared_mem_bytes, NULL);
     clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &heap_size, NULL);
     clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_object_size, NULL);

     shared_mem_words = shared_mem_bytes / sizeof(gpuword);

     //  get the kernels
     clm4rm_and_kernel = clCreateKernel(program,"clm4rm_and",&clm4rm_error); Q_ASSERT(clm4rm_error == CL_SUCCESS);
     clm4rm_or_kernel = clCreateKernel(program,"clm4rm_or",&clm4rm_error); Q_ASSERT(clm4rm_error == CL_SUCCESS);
     clm4rm_copy_kernel = clCreateKernel(program,"clm4rm_copy",&clm4rm_error); Q_ASSERT(clm4rm_error == CL_SUCCESS);
     clm4rm_query_diagonal_kernel = clCreateKernel(program, "clm4rm_query_diagonal", &clm4rm_error); Q_ASSERT(clm4rm_error == CL_SUCCESS);

     if (clm4rm_and_kernel==NULL || clm4rm_or_kernel==NULL)
         return -1;

     //  "clm4rm_mul.cl"
     program = programs[1] = load_program(cl_kernel_directory,"clm4rm_mul",ctx);
     if (program==NULL)
         return -1;
     //  compile for device
     if (build_program(program,device,0) != CL_BUILD_SUCCESS)
         return -1;

     clm4rm_mul_kernel = clCreateKernel(program,"clm4rm_mul",&clm4rm_error); Q_ASSERT(clm4rm_error==CL_SUCCESS);
     //clm4rm_addmul_kernel = clCreateKernel(program,"clm4rm_addmul",&clm4rm_error); Q_ASSERT(SUCCESS);

     for(int tile_m=MAX_TILE_M; tile_m >= 1; --tile_m)
     {
         //  "clcubic_mul.cl"
         program = programs[1+tile_m] = load_program(cl_kernel_directory, "clcubic_mul", ctx);
         if (program == NULL)
             return -1;

         cl_build_status status = build_program(program, device, tile_m);
         if (status != CL_BUILD_SUCCESS)
             return -1;

         clcubic_mul_kernel[tile_m] = clCreateKernel(program, "clcubic_mul", &clm4rm_error);
         Q_ASSERT(clm4rm_error == CL_SUCCESS);
         clutri_mul_kernel[tile_m] = clCreateKernel(program, "clutri_mul", &clm4rm_error);
         Q_ASSERT(clm4rm_error == CL_SUCCESS);
     }

     return CL_SUCCESS;
 }

 void clm4rm_tear_down(cl_context ctx, cl_device_id device)
 {
     if (clm4rm_and_kernel) clReleaseKernel(clm4rm_and_kernel);
     if (clm4rm_or_kernel) clReleaseKernel(clm4rm_or_kernel);
     if (clm4rm_copy_kernel) clReleaseKernel(clm4rm_copy_kernel);
     if (clm4rm_query_diagonal_kernel) clReleaseKernel(clm4rm_query_diagonal_kernel);

     if (clm4rm_mul_kernel) clReleaseKernel(clm4rm_mul_kernel);
     //if (clm4rm_addmul_kernel) clReleaseKernel(clm4rm_addmul_kernel);

     for(int i=0; i <= MAX_TILE_M; ++i)
         if (programs[i]) clReleaseProgram(programs[i]);

     for(int tile_m=1; tile_m <= MAX_TILE_M; ++tile_m) {
         if (clcubic_mul_kernel[tile_m]) clReleaseKernel(clcubic_mul_kernel[tile_m]);
         if (clutri_mul_kernel[tile_m]) clReleaseKernel(clutri_mul_kernel[tile_m]);
     }

     if (device) clReleaseDevice(device);
     if (ctx) clReleaseContext(ctx);

     if (allocated_size > 0) {
         printf("WARNING: %li bytes of device memory have not been released.\n", allocated_size);
     }
 }


 /// Image format passed to clCreateImage2D when IMAGE2D is enabled:
 /// channel order CL_R, channel data type CL_UNSIGNED_INT32.
 cl_image_format IMAGE_FORMAT = { CL_R, CL_UNSIGNED_INT32 };

 /**
  * Check with Q_ASSERT that a host matrix can be exchanged with a GPU matrix:
  * both must have the same number of rows and columns, and the host matrix
  * must have no row offset and must not be a window.
  */
 void assertMatrixLayout(const clmatrix_t* gpu_matrix, const mzd_t* host_matrix)
 {
     //  dimensions agree on both sides
     Q_ASSERT(gpu_matrix->nrows == host_matrix->nrows);
     Q_ASSERT(gpu_matrix->ncols == host_matrix->ncols);
     //  plain host matrix only
     Q_ASSERT(0 == host_matrix->row_offset);
     Q_ASSERT(!mzd_is_windowed(host_matrix));
     //  TODO: further layout conditions?
 }

 /**
  * Calculate the number of padded rows.
  *
  * @param nrows   number of rows
  * @param padding rows are rounded up to a multiple of this value;
  *                values <= 0 disable padding
  * @return nrows rounded up to the next multiple of padding
  */
 int padded_rows(int nrows, int padding) {
     if (padding <= 0)
         return nrows;

     const int excess = nrows % padding;
     return excess ? nrows + (padding - excess) : nrows;
 }

 /**
  * Allocate and initialise a matrix descriptor in host memory.
  * No device memory and no host copy of the data are allocated;
  * `data` and `local_data` are set to NULL.
  *
  * @param rows       number of rows
  * @param cols       number of columns
  * @param rowpadding rows are padded to a multiple of this value (see padded_rows())
  * @return the new descriptor, or NULL if host memory is exhausted
  */
 clmatrix_t* clm4rm_allocate(int rows, int cols, int rowpadding)
 {
     clmatrix_t* m = (clmatrix_t*)malloc(sizeof(clmatrix_t));
     if (m==NULL)
         return NULL;

     m->nrows = rows;
     m->ncols = cols;

     m->padded_rows = padded_rows(rows, rowpadding);
     //  width counts words per row; padded_cols is the same in bits
     m->width = CEILCOLS(cols);
     m->padded_cols = m->width*clm4rm_radix;

     m->data = NULL;
     m->local_data=NULL;
     return m;
 }

 /// Set once a heap warning has been printed; suppresses all further warnings.
 bool printed_heap_warning = false;

 /**
  * Account for a newly created device memory object.
  *
  * Adds sz to allocated_size. Prints a warning (at most once, for either
  * condition) if sz exceeds max_object_size or the running total exceeds
  * heap_size.
  *
  * @param sz size of the new object in bytes
  */
 void track_heap_size(size_t sz)
 {
     //  %zu is the portable conversion for size_t; %li is wrong wherever
     //  long and size_t differ in width
     if (sz > max_object_size && !printed_heap_warning) {
         printf("WARNING object size %zu exceeds max. %zu\n", sz, max_object_size);
         printed_heap_warning = true;
     }

     allocated_size += sz;

     if (allocated_size > heap_size && !printed_heap_warning) {
         printf("WARNING heap size %zu exceeds max. %zu\n", allocated_size, heap_size);
         printed_heap_warning = true;
     }
 }

 /**
  * Create an empty matrix on the device.
  *
  * @param rows       number of rows
  * @param cols       number of columns
  * @param rowpadding rows are padded to a multiple of this value
  * @param read_only  non-zero: create the memory object read-only
  * @param ctx        OpenCL context
  * @return the new matrix; the result code of the creation call is stored
  *         in clm4rm_error
  */
 clmatrix_t* clm4rm_create(rci_t rows, rci_t cols, int rowpadding, int read_only, cl_context ctx)
 {
     clmatrix_t* matrix = clm4rm_allocate(rows,cols,rowpadding);

     cl_mem_flags access = CL_MEM_READ_WRITE;
     if (read_only)
         access = CL_MEM_READ_ONLY;

 #if IMAGE2D
     //  Column-major layout: each matrix column is stored as one image row.
     matrix->data = clCreateImage2D(ctx, access,
         &IMAGE_FORMAT, matrix->padded_rows, matrix->width, 0,
         NULL, &clm4rm_error);
 #else
     matrix->data = clCreateBuffer(ctx, access,
         DATA_BYTES(matrix), NULL, &clm4rm_error);
 #endif
     track_heap_size(DATA_BYTES(matrix));
     Q_ASSERT(clm4rm_error == CL_SUCCESS);
     return matrix;
 }

 /**
  * Create a device matrix as a copy of a matrix in M4RI format.
  *
  * The host matrix is first converted to a column-major staging copy
  * (kept in local_data), which is handed to OpenCL with CL_MEM_COPY_HOST_PTR.
  *
  * @param host_matrix matrix to copy
  * @param rowpadding  rows are padded to a multiple of this value
  * @param read_only   non-zero: create the memory object read-only
  * @param ctx         OpenCL context
  * @return the new matrix; the result code of the creation call is stored
  *         in clm4rm_error
  */
 clmatrix_t* clm4rm_copy(const mzd_t* host_matrix, int rowpadding, int read_only, cl_context ctx)
 {
     clmatrix_t* matrix = clm4rm_allocate(host_matrix->nrows,host_matrix->ncols, rowpadding);
     matrix->local_data = copy_matrix_data(matrix->local_data,host_matrix,matrix->padded_rows);

     cl_mem_flags access = CL_MEM_READ_WRITE;
     if (read_only)
         access = CL_MEM_READ_ONLY;
     access |= CL_MEM_COPY_HOST_PTR;

 #if IMAGE2D
     //  Column-major layout: each matrix column is stored as one image row.
     matrix->data = clCreateImage2D(ctx, access,
         &IMAGE_FORMAT, matrix->padded_rows, matrix->width, matrix->padded_rows * sizeof(gpuword),
         matrix->local_data, &clm4rm_error);
 #else
     matrix->data = clCreateBuffer(ctx, access,
         DATA_BYTES(matrix), matrix->local_data, &clm4rm_error);
 #endif
     track_heap_size(DATA_BYTES(matrix));
     Q_ASSERT(clm4rm_error == CL_SUCCESS);
     return matrix;
 }

 /** Reset an event list: every slot becomes NULL and the count becomes zero. */
 void init_events(clm4rm_event_list* list)
 {
     cl_event* const end = list->events + MAX_EVENTS;
     for (cl_event* slot = list->events; slot != end; ++slot)
         *slot = NULL;
     list->count = 0;
 }

 /**
  * Reset a conditions object: `pre` points to the first and `post` to the
  * second of its two event lists, and both lists are emptied.
  */
 void init_conditions(clm4rm_conditions* cond) {
     Q_ASSERT(cond);

     cond->pre = &cond->event_lists[0];
     cond->post = &cond->event_lists[1];

     init_events(cond->pre);
     init_events(cond->post);
 }

 /** Release all events of a list, last to first, leaving the list empty. */
 void release_events(clm4rm_event_list* list) {
     while (list->count > 0) {
         list->count -= 1;
         cl_event* slot = &list->events[list->count];
         clReleaseEvent(*slot);
         *slot = NULL;
     }
 }

 /** Release the events of both the pre- and the post-condition list. */
 void release_conditions(clm4rm_conditions* cond) {
     Q_ASSERT(cond);

     clm4rm_event_list* const before = cond->pre;
     clm4rm_event_list* const after = cond->post;

     release_events(before);
     release_events(after);
 }

 /**
  * Append the events of list b to list a. Each appended event is retained,
  * so both lists hold their own reference. The combined count must not
  * exceed MAX_EVENTS (checked with Q_ASSERT only).
  */
 void merge_events(clm4rm_event_list* a, clm4rm_event_list* b)
 {
     Q_ASSERT((a->count+b->count) <= MAX_EVENTS);

     int index = 0;
     while (index < b->count) {
         cl_event shared = b->events[index];
         Q_ASSERT(shared != NULL);

         a->events[a->count] = shared;
         a->count += 1;
         clRetainEvent(shared);

         ++index;
     }
 }

 /** Merge the pre-conditions of b into the pre-conditions of a. */
 void merge_conditions(clm4rm_conditions* a, clm4rm_conditions* b)
 {
     clm4rm_event_list* const target = a->pre;
     clm4rm_event_list* const source = b->pre;
     merge_events(target, source);
 }

 /**
  * Called when the pre-conditions are met: the old pre-conditions are
  * released and the post-conditions become the new pre-conditions.
  * The emptied list is reused for the new post-conditions.
  * Does nothing if cond is NULL.
  */
 void join_conditions(clm4rm_conditions* cond) {
     if (cond == NULL)
         return;

     //  drop the old pre-conditions, then swap the two lists
     clm4rm_event_list* const emptied = cond->pre;
     release_events(emptied);
     cond->pre = cond->post;
     cond->post = emptied;

     //  every event that is now a pre-condition must be set
     for (int k = 0; k < cond->pre->count; ++k)
         Q_ASSERT(cond->pre->events[k]);
 }

 /** Number of pre-condition events; 0 if cond is NULL. */
 cl_uint pre_count(clm4rm_conditions* cond) {
     return cond ? cond->pre->count : 0;
 }

 /**
  * Array of pre-condition events, suitable as an OpenCL event wait list.
  * Returns NULL if cond is NULL or there are no pre-conditions.
  */
 cl_event* pre_events(clm4rm_conditions* cond) {
     if (!cond || cond->pre->count == 0)
         return NULL;

     clm4rm_event_list* const waiting = cond->pre;
     for (int k = 0; k < waiting->count; ++k)
         Q_ASSERT(waiting->events[k] != NULL);
     return waiting->events;
 }

 /**
  * Reserve one post-condition event.
  *
  * @return pointer to the reserved slot, to be filled by an OpenCL enqueue
  *         call; NULL if cond is NULL
  */
 cl_event* push_event(clm4rm_conditions* cond) {
     if (!cond)
         return NULL;

     clm4rm_event_list* const pending = cond->post;
     Q_ASSERT(pending->count+1 < MAX_EVENTS);

     cl_event* const slot = &pending->events[pending->count];
     pending->count += 1;
     return slot;
 }

 /**
  * Slot of the post-condition event that was reserved last by push_event().
  *
  * @return pointer to that slot; NULL if cond is NULL. The NULL case
  *         mirrors pre_count(), pre_events() and push_event(), which all
  *         accept a NULL conditions object.
  */
 cl_event* pushed_event(clm4rm_conditions* cond) {
     if (!cond)
         return NULL;

     Q_ASSERT(cond->post->count >= 1 && cond->post->count < MAX_EVENTS);
     return cond->post->events + (cond->post->count - 1);
 }


 /**
  * Fill a matrix with zero data. The operation is enqueued; it waits for
  * the pre-conditions of cond and adds one post-condition event.
  * The result code of the enqueue call is stored in clm4rm_error.
  *
  * @param gpu_matrix matrix on the device
  * @param queue      OpenCL command queue
  * @param cond       event dependencies; may be NULL
  */
 void clm4rm_zero_fill(clmatrix_t* gpu_matrix,
                       cl_command_queue  queue, clm4rm_conditions* cond)
 {
 #if IMAGE2D
     //  the data is an image in this configuration; a buffer fill is not valid for it
     const cl_uint zero[4] = { 0,0,0,0 };
     size_t origin[3] = { 0,0,0 };
     size_t region[3] = { (size_t)gpu_matrix->padded_rows, (size_t)gpu_matrix->width, 1 };
     clm4rm_error = clEnqueueFillImage(
             queue, gpu_matrix->data, zero,
             origin, region,
             pre_count(cond),pre_events(cond),push_event(cond));
 #else
     gpuword zero=0;
     clm4rm_error = clEnqueueFillBuffer(
             queue, gpu_matrix->data, &zero, sizeof(zero),
             0, DATA_BYTES(gpu_matrix),
             pre_count(cond),pre_events(cond),push_event(cond));
 #endif
     //  check the result code; the former check of the event slot address
     //  could never fail and dereferenced a NULL cond
     Q_ASSERT(clm4rm_error == CL_SUCCESS);
 }


 /**
  * Copy matrix data from host memory to the device.
  *
  * The host matrix is converted to a column-major staging copy
  * (gpu_matrix->local_data), which is then written with a non-blocking
  * enqueue call. The call waits for the pre-conditions of cond and adds one
  * post-condition event. The result code is stored in clm4rm_error.
  *
  * @param gpu_matrix  destination matrix on the device
  * @param host_matrix source matrix in M4RI format
  * @param queue       OpenCL command queue
  * @param cond        event dependencies; may be NULL
  */
 void clm4rm_write(clmatrix_t* gpu_matrix, const mzd_t* host_matrix,
                       cl_command_queue queue, clm4rm_conditions* cond)
 {
     assertMatrixLayout(gpu_matrix,host_matrix);
     gpu_matrix->local_data = copy_matrix_data(gpu_matrix->local_data, host_matrix, gpu_matrix->padded_rows);

 #if IMAGE2D
     size_t origin[3] = { 0,0,0 };
     size_t region[3] = { (size_t)gpu_matrix->padded_rows, (size_t)gpu_matrix->width, 1 };
     //  Column-major layout: each matrix column is stored as one image row.
     //  push_event() reserves a fresh slot, like the buffer branch does;
     //  pushed_event() would overwrite the event that was recorded last.
     clm4rm_error = clEnqueueWriteImage(queue, gpu_matrix->data, CL_FALSE,
         origin, region, gpu_matrix->padded_rows * sizeof(gpuword), 0,
         gpu_matrix->local_data,
         pre_count(cond), pre_events(cond), push_event(cond));
 #else
     clm4rm_error = clEnqueueWriteBuffer(queue, gpu_matrix->data, CL_FALSE,
         0, DATA_BYTES(gpu_matrix), gpu_matrix->local_data,
         pre_count(cond), pre_events(cond), push_event(cond));
 #endif
     //  check the result code; the former check of the event slot address
     //  could never fail and dereferenced a NULL cond
     Q_ASSERT(clm4rm_error == CL_SUCCESS);
 }


 /**
  * Copy a matrix from device memory to the host.
  *
  * The data is read with a blocking enqueue call into the column-major
  * staging copy (gpu_matrix->local_data, allocated on demand) and then
  * converted to M4RI format. The result code is stored in clm4rm_error.
  *
  * @param host_matrix destination; if NULL, a new matrix is created with mzd_init
  * @param gpu_matrix  source matrix on the device
  * @param queue       OpenCL command queue
  * @param cond        event dependencies; may be NULL
  * @return the host matrix. Its contents are left untouched if the staging
  *         copy could not be allocated or the read failed.
  */
 mzd_t* clm4rm_read(mzd_t* host_matrix, clmatrix_t* gpu_matrix,
                  cl_command_queue queue, clm4rm_conditions* cond)
 {
     if (!host_matrix)
         host_matrix = mzd_init(gpu_matrix->nrows,gpu_matrix->ncols);

     if (gpu_matrix->local_data==NULL)
         gpu_matrix->local_data = (gpuword*)malloc(DATA_BYTES(gpu_matrix));
     if (gpu_matrix->local_data==NULL) {
         clm4rm_error = CL_OUT_OF_HOST_MEMORY;
         return host_matrix;
     }

 #if IMAGE2D
     size_t origin[3] = { 0,0,0 };
     size_t region[3] = { (size_t)gpu_matrix->padded_rows, (size_t)gpu_matrix->width, 1 };
     //  Column-major layout: each matrix column is stored as one image row.
     clm4rm_error = clEnqueueReadImage(queue, gpu_matrix->data, CL_TRUE,
         origin, region, gpu_matrix->padded_rows * sizeof(gpuword), 0,
         gpu_matrix->local_data,
         pre_count(cond), pre_events(cond), push_event(cond));
 #else
     clm4rm_error = clEnqueueReadBuffer(queue, gpu_matrix->data, CL_TRUE,
         0, DATA_BYTES(gpu_matrix), gpu_matrix->local_data,
         pre_count(cond), pre_events(cond), push_event(cond));
 #endif
     //  check the result code; the former check of the event slot address
     //  could never fail and dereferenced a NULL cond
     Q_ASSERT(clm4rm_error == CL_SUCCESS);

     //  after a failed read the staging copy holds no valid data
     if (clm4rm_error == CL_SUCCESS)
         copy_back_matrix_data(host_matrix,gpu_matrix->local_data, gpu_matrix->padded_rows);
     return host_matrix;
 }


 /**
  * Create a column-major copy of an mzd_t matrix.
  *
  * Each host word is split into two consecutive gpuwords (low half first).
  * Word column `col` of row `row` is stored at G[col*padded_rows + row];
  * the rows from M->nrows up to padded_rows are filled with zero.
  *
  * @param G           destination; if NULL, a buffer of
  *                    CEILCOLS(M->ncols)*padded_rows gpuwords is allocated.
  *                    A buffer passed in must be at least that large.
  * @param M           source matrix
  * @param padded_rows number of rows including padding
  * @return G, or NULL if the buffer could not be allocated
  */
 gpuword* copy_matrix_data(gpuword* G, const mzd_t* M, int padded_rows)
 {
     const int width = CEILCOLS(M->ncols);
     //  index arithmetic in size_t: col*padded_rows may exceed the range of int
     const size_t stride = (size_t)padded_rows;

     if (G==NULL) {
         G = (gpuword*)malloc(sizeof(gpuword)*width*stride);
         if (G==NULL)
             return NULL;
     }

     for (int row = 0; row < M->nrows; ++row)
     {
         const word* Mrow = M->rows[row];
         for (int col = 0; col < width; ++col)
         {
             const word Mword = Mrow[col>>1];
             //  even columns take the low half, odd columns the high half
             G[col*stride + row] = (gpuword)((col & 1) ? (Mword >> 32) : Mword);
         }
     }

     //  clear the padding rows
     for (int row=M->nrows; row < padded_rows; ++row)
         for (int col = 0; col < width; col++)
             G[col*stride + row] = 0;
     return G;
 }


 /**
  * Copy a column-major matrix back into an mzd_t matrix.
  *
  * Two consecutive gpuword columns are combined into one host word:
  * the even column supplies the low half, the odd column (if it exists)
  * the high half.
  *
  * @param M           destination matrix
  * @param G           column-major source data, indexed G[col*padded_rows + row]
  * @param padded_rows number of rows including padding
  */
 void copy_back_matrix_data(mzd_t* M, const gpuword* G, int padded_rows)
 {
     const int width = CEILCOLS(M->ncols);

     for (int row = 0; row < M->nrows; ++row)
     {
         word* target = M->rows[row];
         for (int low = 0; low < width; low += 2)
         {
             const int high = low+1;
             word combined = G[low*padded_rows + row];
             if (high < width)
                 combined |= ((word)G[high*padded_rows + row]) << 32;
             target[low>>1] = combined;
         }
     }
 }

init_events
void init_events(clm4rm_event_list *list)
reset events list
Definition: clm4rm.cpp:277

clm4rm_zero_fill
void clm4rm_zero_fill(clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
Fill a matrix with zero data. The operation is scheduled for asynchronous execution of the GPU....
Definition: clm4rm.cpp:364

heap_size
size_t heap_size
size of allocated memory in bytes
Definition: clm4rm.cpp:78

clmatrix_t
OpenCL boolean matrix data structure. Data is arranged in 32 bit words.
Definition: clm4rm.h:98

clm4rm_conditions::event_lists
clm4rm_event_list event_lists[2]
pre-conditions and post-conditions
Definition: clm4rm.h:229

shared_mem_bytes
size_t shared_mem_bytes
size of shared memory in bytes
Definition: clm4rm.cpp:77

clm4rm_and_kernel
cl_kernel clm4rm_and_kernel
Definition: clm4rm.cpp:63

clmatrix_t::padded_rows
rci_t padded_rows
Number of rows padded to a multiple of 32.
Definition: clm4rm.h:100

assertMatrixLayout
void assertMatrixLayout(const clmatrix_t *gpu_matrix, const mzd_t *host_matrix)
Definition: clm4rm.cpp:175

clm4rm_error
cl_int clm4rm_error
latest OpenCL result code. CL_SUCCESS indicates no error.
Definition: clm4rm.cpp:9

load_program
cl_program load_program(const char *cl_kernel_directory, const char *file_name, cl_context ctx)
Definition: clm4rm.cpp:11

gpuword
unsigned int gpuword
a GPU word has 32 bits
Definition: clcubic_mul.cl:74

clm4rm_copy_kernel
cl_kernel clm4rm_copy_kernel
Definition: clm4rm.cpp:65

pre_events
cl_event * pre_events(clm4rm_conditions *cond)
Definition: clm4rm.cpp:338

build_program
cl_build_status build_program(cl_program program, cl_device_id device, int tile_m)
Definition: clm4rm.cpp:36

allocated_size
size_t allocated_size
Definition: clm4rm.cpp:78

clm4rm_conditions::post
clm4rm_event_list * post
post-conditions: conditions after an operation finishes. post-conditions may act as pre-conditions f...
Definition: clm4rm.h:234

release_events
void release_events(clm4rm_event_list *list)
release events
Definition: clm4rm.cpp:290

clm4rm_conditions
Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be est...
Definition: clm4rm.h:227

clmatrix_t::ncols
rci_t ncols
Number of columns.
Definition: clm4rm.h:101

clm4rm_setup
cl_int clm4rm_setup(const char *cl_kernel_directory, cl_context ctx, cl_device_id device)
load OpenCL kernels and set up parameters
Definition: clm4rm.cpp:84

clm4rm_tear_down
void clm4rm_tear_down(cl_context ctx, cl_device_id device)
release OpenCL resources
Definition: clm4rm.cpp:146

MAX_EVENTS
#define MAX_EVENTS
Definition: clm4rm.h:194

shared_mem_words
size_t shared_mem_words
size of shared memory in (32bit) words
Definition: clm4rm.cpp:77

copy_matrix_data
gpuword * copy_matrix_data(gpuword *G, const mzd_t *M, int padded_rows)
create a column-major copy from an mzd_t matrix
Definition: clm4rm.cpp:436

printed_heap_warning
bool printed_heap_warning
Definition: clm4rm.cpp:216

MAX_TILE_M
#define MAX_TILE_M
Definition: clm4rm.h:62

clm4rm_or_kernel
cl_kernel clm4rm_or_kernel
Definition: clm4rm.cpp:64

IMAGE2D
#define IMAGE2D
Definition: clm4rm.h:53

clm4rm_write
void clm4rm_write(clmatrix_t *gpu_matrix, const mzd_t *host_matrix, cl_command_queue queue, clm4rm_conditions *cond)
Copy matrix data from host memory to GPU. The operation is scheduled for asynchronous execution of th...
Definition: clm4rm.cpp:382

clm4rm_mul_kernel
cl_kernel clm4rm_mul_kernel
OpenCL kernel for Four-Russians matrix multiplication.
Definition: clm4rm.cpp:68

clutri_mul_kernel
cl_kernel clutri_mul_kernel[MAX_TILE_M+1]
OpenCL kernels for cubic upper-triangle matrix multiplication. Each kernel for a tile size....
Definition: clm4rm.cpp:72

clm4rm_event_list
a list of cl_events; used by clm4rm_conditions to keep track of scheduled jobs in the OpenCL queue.
Definition: clm4rm.h:200

merge_events
void merge_events(clm4rm_event_list *a, clm4rm_event_list *b)
append two lists
Definition: clm4rm.cpp:303

join_conditions
void join_conditions(clm4rm_conditions *cond)
called when the pre-conditions are met. The post-conditions become new pre-conditions.
Definition: clm4rm.cpp:319

pre_count
cl_uint pre_count(clm4rm_conditions *cond)
Definition: clm4rm.cpp:331

merge_conditions
void merge_conditions(clm4rm_conditions *a, clm4rm_conditions *b)
merge pre-conditions into one list
Definition: clm4rm.cpp:314

clcubic_mul_kernel
cl_kernel clcubic_mul_kernel[MAX_TILE_M+1]
OpenCL kernels for cubic matrix multiplication. Each kernel for a tile size. Actual tile sizes are in...
Definition: clm4rm.cpp:71

clm4rm_create
clmatrix_t * clm4rm_create(rci_t rows, rci_t cols, int rowpadding, int read_only, cl_context ctx)
create an empty matrix
Definition: clm4rm.cpp:233

clm4rm_radix
#define clm4rm_radix
word size. for compatibility with GPU memory layout, we operate on 32 bit words.
Definition: clm4rm.h:41

push_event
cl_event * push_event(clm4rm_conditions *cond)
reserve one post-condition event
Definition: clm4rm.cpp:348

programs
cl_program programs[MAX_TILE_M+1]
Definition: clm4rm.cpp:81

clmatrix_t::width
rci_t width
Number of words with valid bits: width = ceil(ncols / m4ri_radix).
Definition: clm4rm.h:103

clm4rm_query_diagonal_kernel
cl_kernel clm4rm_query_diagonal_kernel
Definition: clm4rm.cpp:66

clm4rm.h

clmatrix_t::data
cl_mem data
handle to GPU data (32-bit unsigned integers)
Definition: clm4rm.h:114

padded_rows
int padded_rows(int nrows, int padding)
calculate the number of padded rows
Definition: clm4rm.cpp:185

max_group_size
size_t max_group_size
max. size of a work group
Definition: clm4rm.cpp:74

str
#define str(S)

clm4rm_copy
clmatrix_t * clm4rm_copy(const mzd_t *host_matrix, int rowpadding, int read_only, cl_context ctx)
create a copy from a matrix in M4RI format
Definition: clm4rm.cpp:254

clm4rm_conditions::pre
clm4rm_event_list * pre
pre-conditions: an operation is scheduled when all pre-conditions are met
Definition: clm4rm.h:231

init_conditions
void init_conditions(clm4rm_conditions *cond)
reset conditions list
Definition: clm4rm.cpp:284

CEILCOLS
#define CEILCOLS(i)
Definition: clcubic_mul.cl:76

BUFFERED
#define BUFFERED
Definition: clcubic_mul.cl:28

clmatrix_t::nrows
rci_t nrows
Number of rows.
Definition: clm4rm.h:99

pushed_event
cl_event * pushed_event(clm4rm_conditions *cond)
Definition: clm4rm.cpp:357

clm4rm_allocate
clmatrix_t * clm4rm_allocate(int rows, int cols, int rowpadding)
Definition: clm4rm.cpp:192

IMAGE_FORMAT
cl_image_format IMAGE_FORMAT
Definition: clm4rm.cpp:173

max_object_size
size_t max_object_size
max. object allocation size
Definition: clm4rm.cpp:79

copy_back_matrix_data
void copy_back_matrix_data(mzd_t *M, const gpuword *G, int padded_rows)
copy back a column-major matrix
Definition: clm4rm.cpp:460

clm4rm_event_list::count
cl_uint count
current number of events
Definition: clm4rm.h:201

clmatrix_t::local_data
gpuword * local_data
matrix data in CPU memory
Definition: clm4rm.h:113

clmatrix_t::padded_cols
rci_t padded_cols
Number of columns padded to a multiple of 64.
Definition: clm4rm.h:102

clm4rm_read
mzd_t * clm4rm_read(mzd_t *host_matrix, clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
copy matrix from gpu memory to host
Definition: clm4rm.cpp:406

track_heap_size
void track_heap_size(size_t sz)
Definition: clm4rm.cpp:218

DATA_BYTES
#define DATA_BYTES(m)
Definition: clm4rm.h:119

release_conditions
void release_conditions(clm4rm_conditions *cond)
release conditions list
Definition: clm4rm.cpp:297

max_items
size_t max_items[3]
max. number of items in each dimension
Definition: clm4rm.cpp:75

clm4rm_event_list::events
cl_event events[MAX_EVENTS]
array of OpenCL events
Definition: clm4rm.h:202