Fréchet View  1.6.0
A Tool for Exploring Fréchet Distance Algorithms
clm4rm.cpp
Go to the documentation of this file.
1 //
2 // Created by nightrider on 21.09.18.
3 //
4 
5 #include <clm4rm.h>
6 #include <stdio.h>
7 #include <qdebug.h>
8 
9 cl_int clm4rm_error;
10 
11 cl_program load_program(const char* cl_kernel_directory, const char* file_name, cl_context ctx)
12 {
13  char file_path[2048];
14  sprintf(file_path, "%s/%s.cl",cl_kernel_directory,file_name);
15  FILE* f = fopen(file_path,"r");
16  if (f==NULL) {
17  fprintf(stderr,"File %s/%s.cl not found.",cl_kernel_directory,file_name);
18  return NULL;
19  }
20 
21  fseek(f,0,SEEK_END);
22  size_t file_size = (size_t)ftell(f);
23  rewind(f);
24 
25  char* buffer = (char*)malloc(file_size+1);
26  buffer[file_size] = '\0';
27  fread(buffer,1,file_size,f);
28  fclose(f);
29 
30  cl_program result = clCreateProgramWithSource(ctx, 1, (const char**)&buffer, &file_size, &clm4rm_error);
31 
32  free(buffer);
33  return result;
34 }
35 
// Compile an already-loaded cl_program for the given device.
// tile_m is passed to the kernel source as a -D macro (tile size for the
// cubic multiplication kernels). On build failure the build log is dumped
// to stderr. Returns the CL_PROGRAM_BUILD_STATUS for the device.
36 cl_build_status build_program(cl_program program, cl_device_id device, int tile_m)
37 {
38 #define str(S) #S
39  char options[1024];
// NOTE(review): the extraction dropped source lines 41-42 here. The format
// string expects three "-D name=value" pairs but only the "TILE_M" pair is
// visible — the first two macro name/value arguments are missing; verify
// against the original source before editing.
40  sprintf(options, "-D %s=%i -D %s=%i -D %s=%i",
43  "TILE_M", tile_m);
44  clm4rm_error = clBuildProgram(program, 1,&device, options, NULL,NULL);
45 
46  cl_build_status build_status;
47  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_STATUS, sizeof(build_status), &build_status,NULL);
48 
// on failure, fetch and print the compiler log
49  if (build_status != CL_SUCCESS) {
50  size_t log_size;
51  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
52  char *log = (char *) malloc(log_size + 1);
53  clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size + 1, log, NULL);
54 
55  fprintf(stderr, "%i %s\n", build_status, log);
56  free(log);
57  }
58 
59  Q_ASSERT(clm4rm_error == CL_SUCCESS);
60  return build_status;
61 }
62 
// Global kernel handles and device limits.
// NOTE(review): the extraction dropped several sibling declarations here
// (per the index: clm4rm_and_kernel:63, clm4rm_copy_kernel:65,
// clm4rm_query_diagonal_kernel:66, clm4rm_mul_kernel:68,
// clcubic_mul_kernel:71, clutri_mul_kernel:72, max_group_size:74,
// shared_mem_bytes/words:77, heap_size/allocated_size:78,
// max_object_size:79).
64 cl_kernel clm4rm_or_kernel;
67 
69 //cl_kernel clm4rm_addmul_kernel;
70 
73 
// max. number of work items per dimension (CL_DEVICE_MAX_WORK_ITEM_SIZES)
75 size_t max_items[3];
76 
80 
// Compiled programs: [0]=clm4rm_bitwise, [1]=clm4rm_mul,
// [1+tile_m]=clcubic_mul for tile_m=1..MAX_TILE_M.
// NOTE(review): clm4rm_setup() writes programs[1+tile_m] up to index
// MAX_TILE_M+1, one past the end of this array — the size should be
// MAX_TILE_M+2 (and the matching extern declaration in clm4rm.h must be
// changed in the same step). Verify against the header.
81 cl_program programs[MAX_TILE_M+1];
82 
83 
// Load and build all OpenCL programs and create their kernels; query the
// device limits (work-item sizes, work-group size, local/global memory,
// max allocation). Returns CL_SUCCESS, or -1 when a program cannot be
// loaded or built.
84 cl_int clm4rm_setup(const char* cl_kernel_directory,
85  cl_context ctx, cl_device_id device)
86 {
87  // load program from disk
88 
89  // "clm4rm_bitwise.cl"
90  cl_program program = programs[0] = load_program(cl_kernel_directory,"clm4rm_bitwise",ctx);
91  if (program==NULL)
92  return -1;
93  // compile for device
94  if (build_program(program,device,0) != CL_BUILD_SUCCESS)
95  return -1;
96 
// query device capabilities into the global limit variables
97  clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_ITEM_SIZES, 3*sizeof(size_t), &max_items, NULL);
98  clGetDeviceInfo(device,CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t),&max_group_size, NULL);
99  Q_ASSERT(max_group_size > 0);
100  clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(size_t), &shared_mem_bytes, NULL);
101  clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &heap_size, NULL);
102  clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_object_size, NULL);
103 
// NOTE(review): source line 104 was lost in extraction — presumably it
// derived shared_mem_words from shared_mem_bytes; confirm in the original.
105 
106  // get the kernels
107  clm4rm_and_kernel = clCreateKernel(program,"clm4rm_and",&clm4rm_error); Q_ASSERT(clm4rm_error == CL_SUCCESS);
108  clm4rm_or_kernel = clCreateKernel(program,"clm4rm_or",&clm4rm_error); Q_ASSERT(clm4rm_error == CL_SUCCESS);
109  clm4rm_copy_kernel = clCreateKernel(program,"clm4rm_copy",&clm4rm_error); Q_ASSERT(clm4rm_error == CL_SUCCESS);
110  clm4rm_query_diagonal_kernel = clCreateKernel(program, "clm4rm_query_diagonal", &clm4rm_error); Q_ASSERT(clm4rm_error == CL_SUCCESS);
111 
112  if (clm4rm_and_kernel==NULL || clm4rm_or_kernel==NULL)
113  return -1;
114 
115  // "clm4rm_mul.cl"
116  program = programs[1] = load_program(cl_kernel_directory,"clm4rm_mul",ctx);
117  if (program==NULL)
118  return -1;
119  // compile for device
120  if (build_program(program,device,0) != CL_BUILD_SUCCESS)
121  return -1;
122 
123  clm4rm_mul_kernel = clCreateKernel(program,"clm4rm_mul",&clm4rm_error); Q_ASSERT(clm4rm_error==CL_SUCCESS);
124  //clm4rm_addmul_kernel = clCreateKernel(program,"clm4rm_addmul",&clm4rm_error); Q_ASSERT(SUCCESS);
125 
// build one clcubic_mul program per tile size (the tile size is baked in
// via a -D macro by build_program)
126  for(int tile_m=MAX_TILE_M; tile_m >= 1; --tile_m)
127  {
128  // "clcubic_mul.cl"
// NOTE(review): programs[1+tile_m] reaches index MAX_TILE_M+1, one past the
// end of programs[MAX_TILE_M+1] — buffer overflow; the array (and its extern
// declaration in clm4rm.h) needs MAX_TILE_M+2 slots.
129  program = programs[1+tile_m] = load_program(cl_kernel_directory, "clcubic_mul", ctx);
130  if (program == NULL)
131  return -1;
132 
133  cl_build_status status = build_program(program, device, tile_m);
134  if (status != CL_BUILD_SUCCESS)
135  return -1;
136 
137  clcubic_mul_kernel[tile_m] = clCreateKernel(program, "clcubic_mul", &clm4rm_error);
138  Q_ASSERT(clm4rm_error == CL_SUCCESS);
139  clutri_mul_kernel[tile_m] = clCreateKernel(program, "clutri_mul", &clm4rm_error);
140  Q_ASSERT(clm4rm_error == CL_SUCCESS);
141  }
142 
143  return CL_SUCCESS;
144 }
145 
// Release all kernels, programs, the device and the context.
// Prints a warning when tracked device memory was not freed.
146 void clm4rm_tear_down(cl_context ctx, cl_device_id device)
147 {
148  if (clm4rm_and_kernel) clReleaseKernel(clm4rm_and_kernel);
149  if (clm4rm_or_kernel) clReleaseKernel(clm4rm_or_kernel);
150  if (clm4rm_copy_kernel) clReleaseKernel(clm4rm_copy_kernel);
// NOTE(review): source line 151 was lost in extraction — presumably it
// released clm4rm_query_diagonal_kernel; confirm in the original.
152 
153  if (clm4rm_mul_kernel) clReleaseKernel(clm4rm_mul_kernel);
154  //if (clm4rm_addmul_kernel) clReleaseKernel(clm4rm_addmul_kernel);
155 
// NOTE(review): clm4rm_setup() also stores a program at index MAX_TILE_M+1
// (see the programs[] overflow note); once the array is enlarged this loop
// bound must become i <= MAX_TILE_M+1, otherwise the last program leaks.
156  for(int i=0; i <= MAX_TILE_M; ++i)
157  if (programs[i]) clReleaseProgram(programs[i]);
158 
159  for(int tile_m=1; tile_m <= MAX_TILE_M; ++tile_m) {
160  if (clcubic_mul_kernel[tile_m]) clReleaseKernel(clcubic_mul_kernel[tile_m]);
161  if (clutri_mul_kernel[tile_m]) clReleaseKernel(clutri_mul_kernel[tile_m]);
162  }
163 
164  if (device) clReleaseDevice(device);
165  if (ctx) clReleaseContext(ctx);
166 
167  if (allocated_size > 0) {
168  printf("WARNING: %li bytes of device memory have not been released.\n", allocated_size);
169  }
170 }
171 
172 
// Image format for IMAGE2D matrix storage: single channel (CL_R) of
// 32-bit unsigned integers — one gpuword per pixel.
173 cl_image_format IMAGE_FORMAT = { CL_R, CL_UNSIGNED_INT32 };
174 
// Debug helper: assert that a GPU matrix and a host (m4ri) matrix agree in
// layout — same dimensions, and the host matrix is a plain matrix
// (zero row offset, not a window into another matrix).
175 void assertMatrixLayout(const clmatrix_t* gpu_matrix, const mzd_t* host_matrix)
176 {
177  Q_ASSERT(host_matrix->nrows==gpu_matrix->nrows);
178  Q_ASSERT(host_matrix->ncols==gpu_matrix->ncols);
179 // Q_ASSERT(host_matrix->rowstride*m4ri_radix==gpu_matrix->rowstride*clm4rm_radix);
180  Q_ASSERT(host_matrix->row_offset==0);
181  Q_ASSERT(mzd_is_windowed(host_matrix)==0);
182  // what else?
183 }
184 
// Round nrows up to the next multiple of padding.
// A padding of 0 (or negative) disables rounding and returns nrows as-is.
int padded_rows(int nrows, int padding)
{
    if (padding <= 0)
        return nrows;
    int remainder = nrows % padding;
    return (remainder == 0) ? nrows : nrows + (padding - remainder);
}
191 
// Allocate the host-side descriptor for a GPU matrix and fill in its
// dimensions; no device or host data buffers are allocated yet
// (data and local_data start as NULL). Caller owns the returned struct.
192 clmatrix_t* clm4rm_allocate(int rows, int cols, int rowpadding)
193 {
194  clmatrix_t* m = (clmatrix_t*)malloc(sizeof(clmatrix_t));
195 
196  m->nrows = rows;
197  m->ncols = cols;
198 
// rows rounded up to a multiple of rowpadding; width = 32-bit words per row
199  m->padded_rows = padded_rows(rows, rowpadding);
200  m->width = CEILCOLS(cols);
// NOTE(review): source line 201 was lost in extraction — presumably it set
// m->padded_cols (declared in clm4rm.h); confirm in the original.
202 
203 // m->rowstride = m->width = M4RI_WIDTH(cols);
204 // if (m->rowstride & 1) m->rowstride++;
205 
206 // m->rowstride *= (m4ri_radix/clm4rm_radix);
207 // m->width *= (m4ri_radix/clm4rm_radix);
208 
209  // note: rowstride must resemble m4ri.
210  // wastes some bytes, but keeps compatibility with M4RI
211  m->data = NULL;
212  m->local_data=NULL;
213  return m;
214 }
215 
// one-shot flag: heap/object size warnings are printed only once
216 bool printed_heap_warning = false;
217 
// Account for a device allocation of sz bytes and warn (once) when limits
// are exceeded: per-object limit (max_object_size) and total heap usage.
218 void track_heap_size(size_t sz)
219 {
220  if (sz > max_object_size && !printed_heap_warning) {
221  printf("WARNING object size %li exceeds max. %li\n", sz, max_object_size);
222  printed_heap_warning = true;
223  }
224 
225  allocated_size += sz;
226 
// NOTE(review): source line 227 (the opening `if` of this warning) was lost
// in extraction — judging by the message it presumably tested
// allocated_size > heap_size together with !printed_heap_warning; confirm
// in the original.
228  printf("WARNING heap size %li exceeds max. %li\n", allocated_size, heap_size);
229  printed_heap_warning = true;
230  }
231 }
232 
// Create an empty matrix on the GPU: allocate the descriptor and the device
// buffer (Image2D in column-major layout when IMAGE2D is set, otherwise a
// plain buffer). No host-side buffer is allocated.
233 clmatrix_t* clm4rm_create(rci_t rows, rci_t cols, int rowpadding, int read_only, cl_context ctx)
234 {
235  clmatrix_t* m = clm4rm_allocate(rows,cols,rowpadding);
236 
237 #if IMAGE2D
238  // Note: column-major format !!
239  // a matrix column is actually a row in Image2d
240  m->data = clCreateImage2D(ctx,
241  read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE,
242  &IMAGE_FORMAT, m->padded_rows, m->width, 0,
243  NULL, &clm4rm_error);
244 #else
245  m->data = clCreateBuffer(ctx,
246  read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE,
247  DATA_BYTES(m), NULL, &clm4rm_error);
248 #endif
// NOTE(review): source line 249 was lost in extraction — presumably a
// track_heap_size(DATA_BYTES(m)) call; confirm in the original.
250  Q_ASSERT(clm4rm_error == CL_SUCCESS);
251  return m;
252 }
253 
// Create a GPU matrix initialized from an m4ri host matrix: build a
// column-major 32-bit staging copy in CPU memory, then create the device
// object with CL_MEM_COPY_HOST_PTR so the data is uploaded at creation.
254 clmatrix_t* clm4rm_copy(const mzd_t* host_matrix, int rowpadding, int read_only, cl_context ctx)
255 {
256  clmatrix_t* m = clm4rm_allocate(host_matrix->nrows,host_matrix->ncols, rowpadding);
257  m->local_data = copy_matrix_data(m->local_data,host_matrix,m->padded_rows);
258 
259  //assertMatrixLayout(m,host_matrix);
260 #if IMAGE2D
261  // Note: column-major format !!
262  // a matrix column is actually a row in Image2d
263  m->data = clCreateImage2D(ctx,
264  (read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR,
265  &IMAGE_FORMAT, m->padded_rows, m->width, m->padded_rows * sizeof(gpuword),
266  m->local_data, &clm4rm_error);
267 #else
268  m->data = clCreateBuffer(ctx,
269  (read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE) | CL_MEM_COPY_HOST_PTR,
// NOTE(review): source line 270 was lost in extraction — presumably
// "DATA_BYTES(m), m->local_data, &clm4rm_error);" completing this call;
// confirm in the original.
271 #endif
273  Q_ASSERT(clm4rm_error == CL_SUCCESS);
274  return m;
275 }
276 
// init_events(clm4rm_event_list* list): reset an event list to empty.
// NOTE(review): the function signature (source line 277) was lost in
// extraction; see the index entry "void init_events(clm4rm_event_list *list)".
278 {
279  list->count = 0;
280  for (int i = 0; i < MAX_EVENTS; ++i)
281  list->events[i] = NULL;
282 }
283 
// init_conditions(clm4rm_conditions* cond): point pre/post at the two
// embedded event lists and reset both to empty.
// NOTE(review): the function signature (source line 284) was lost in
// extraction; see the index entry for init_conditions.
285  Q_ASSERT(cond);
286  init_events(cond->pre = &cond->event_lists[0]);
287  init_events(cond->post = &cond->event_lists[1]);
288 }
289 
// release_events(clm4rm_event_list* list): release every held cl_event
// (dropping this list's reference) and empty the list.
// NOTE(review): the function signature (source line 290) was lost in
// extraction; see the index entry for release_events.
291  while(list->count > 0) {
292  clReleaseEvent(list->events[--(list->count)]);
293  list->events[list->count]=NULL;
294  }
295 }
296 
// release_conditions(clm4rm_conditions* cond): release both the pre- and
// post-condition event lists.
// NOTE(review): the function signature (source line 297) was lost in
// extraction; see the index entry for release_conditions.
298  Q_ASSERT(cond);
299  release_events(cond->pre);
300  release_events(cond->post);
301 }
302 
// merge_events(clm4rm_event_list* a, clm4rm_event_list* b): append b's
// events to a, retaining each event for a's additional reference.
// NOTE(review): the function signature (source line 303) was lost in
// extraction; see the index entry for merge_events.
304 {
305  Q_ASSERT((a->count+b->count) <= MAX_EVENTS);
306  for(int i=0; i < b->count; ++i) {
307  cl_event evt = b->events[i];
308  Q_ASSERT(evt != NULL);
309  a->events[(a->count)++] = evt;
310  clRetainEvent(evt);
311  }
312 }
313 
// merge_conditions(clm4rm_conditions* a, clm4rm_conditions* b): merge b's
// pre-conditions into a's pre-condition list.
// NOTE(review): the function signature (source line 314) was lost in
// extraction; see the index entry for merge_conditions.
315 {
316  merge_events(a->pre, b->pre);
317 }
318 
// join_conditions(clm4rm_conditions* cond): called when the pre-conditions
// are met — discard them and swap the lists so the post-conditions become
// the new pre-conditions.
// NOTE(review): the function signature (source line 319) was lost in
// extraction; see the index entry for join_conditions.
320  if (!cond) return;
321  // clear pre-conditions
322  release_events(cond->pre);
323  // move post to pre
324  clm4rm_event_list* temp = cond->pre;
325  cond->pre = cond->post;
326  cond->post = temp;
// sanity check: no NULL handles among the new pre-conditions
327  for (int i = 0; i < cond->pre->count; ++i)
328  Q_ASSERT(cond->pre->events[i]);
329 }
330 
331 cl_uint pre_count(clm4rm_conditions* cond) {
332  if (cond)
333  return cond->pre->count;
334  else
335  return 0;
336 }
337 
338 cl_event* pre_events(clm4rm_conditions* cond) {
339  if (cond && cond->pre->count > 0) {
340  for (int i = 0; i < cond->pre->count; ++i)
341  Q_ASSERT(cond->pre->events[i] != NULL);
342  return cond->pre->events;
343  }
344  else
345  return NULL;
346 }
347 
348 cl_event* push_event(clm4rm_conditions* cond) {
349  if (cond) {
350  Q_ASSERT(cond->post->count+1 < MAX_EVENTS);
351  return cond->post->events + (cond->post->count)++;
352  }
353  else
354  return NULL;
355 }
356 
// Address of the most recently reserved post-condition slot, i.e. the slot
// handed out by the last push_event() call. Asserts that at least one
// event has been pushed.
357 cl_event* pushed_event(clm4rm_conditions* cond) {
358  Q_ASSERT(cond->post->count >= 1 && cond->post->count < MAX_EVENTS);
359  return cond->post->events + (cond->post->count - 1);
360 }
361 
362 
363 
// Fill a GPU matrix with zeros. The fill is enqueued asynchronously;
// cond supplies the events to wait for and receives this operation's
// completion event (via push_event).
364 void clm4rm_zero_fill(clmatrix_t* gpu_matrix,
365  cl_command_queue queue, clm4rm_conditions* cond)
366 {
367  gpuword zero=0;
368  clEnqueueFillBuffer(
369  queue, gpu_matrix->data, &zero, sizeof(zero),
370  0, DATA_BYTES(gpu_matrix),
371  pre_count(cond),pre_events(cond),push_event(cond));
372  Q_ASSERT(pushed_event(cond) != NULL);
373 }
374 
375 
382 void clm4rm_write(clmatrix_t* gpu_matrix, const mzd_t* host_matrix,
383  cl_command_queue queue, clm4rm_conditions* cond)
384 {
385  assertMatrixLayout(gpu_matrix,host_matrix);
386  gpu_matrix->local_data = copy_matrix_data(gpu_matrix->local_data, host_matrix, gpu_matrix->padded_rows);
387 
388 #if IMAGE2D
389  size_t origin[3] = { 0,0,0 };
390  size_t region[3] = { gpu_matrix->padded_rows, gpu_matrix->width, 1 };
391  // Note column-major format !!
392  // a matrix column is actually a row in Image2d
393  clm4rm_error = clEnqueueWriteImage(queue, gpu_matrix->data, CL_FALSE,
394  origin, region, gpu_matrix->padded_rows * sizeof(gpuword), 0,
395  gpu_matrix->local_data,
396  pre_count(cond), pre_events(cond), pushed_event(cond));
397 #else
398  clm4rm_error = clEnqueueWriteBuffer(queue, gpu_matrix->data, CL_FALSE,
399  0, DATA_BYTES(gpu_matrix), gpu_matrix->local_data,
400  pre_count(cond), pre_events(cond), push_event(cond));
401 #endif
402  Q_ASSERT(pushed_event(cond) != NULL);
403 }
404 
405 
// Copy a matrix from GPU memory back to the host. The read is blocking
// (CL_TRUE), so on return the data has arrived and is unpacked into the
// m4ri matrix. When host_matrix is NULL a new mzd_t of matching size is
// allocated (caller owns it); a host-side staging buffer is allocated on
// first use and kept in gpu_matrix->local_data.
406 mzd_t* clm4rm_read(mzd_t* host_matrix, clmatrix_t* gpu_matrix,
407  cl_command_queue queue, clm4rm_conditions* cond)
408 {
409  if (!host_matrix)
410  host_matrix = mzd_init(gpu_matrix->nrows,gpu_matrix->ncols);
411 
412  if (gpu_matrix->local_data==NULL)
413  gpu_matrix->local_data = (gpuword*)malloc(DATA_BYTES(gpu_matrix));
414 
415 #if IMAGE2D
416  size_t origin[3] = { 0,0,0 };
417  size_t region[3] = { gpu_matrix->padded_rows, gpu_matrix->width, 1 };
418  // Note: column-major format !!
419  // a matrix column is actually a row in Image2d
420  clm4rm_error = clEnqueueReadImage(queue, gpu_matrix->data, CL_TRUE,
421  origin, region, gpu_matrix->padded_rows * sizeof(gpuword), 0,
422  gpu_matrix->local_data,
423  pre_count(cond), pre_events(cond), push_event(cond));
424 #else
425  clm4rm_error = clEnqueueReadBuffer(queue, gpu_matrix->data, CL_TRUE,
426  0, DATA_BYTES(gpu_matrix), gpu_matrix->local_data,
427  pre_count(cond), pre_events(cond), push_event(cond));
428 #endif
429  Q_ASSERT(pushed_event(cond) != NULL);
// unpack the column-major 32-bit staging buffer into the m4ri matrix
430  copy_back_matrix_data(host_matrix,gpu_matrix->local_data, gpu_matrix->padded_rows);
431  return host_matrix;
432 }
433 
434 
435 
436 gpuword* copy_matrix_data(gpuword* G, const mzd_t* M, int padded_rows)
437 {
438  int width = CEILCOLS(M->ncols);
439 
440  if (G==NULL)
441  G = (gpuword*)malloc(sizeof(gpuword)*width*padded_rows);
442  for (int row = 0; row < M->nrows; ++row)
443  {
444  word* Mrow = M->rows[row];
445  for (int col = 0; col < width; col += 2)
446  {
447  word Mword = Mrow[col>>1];
448  G[col*padded_rows + row] = Mword;
449  if (col+1 < width)
450  G[(col+1)*padded_rows + row] = Mword >> 32;
451  }
452  }
453  for (int row=M->nrows; row < padded_rows; ++row)
454  for (int col = 0; col < width; col++)
455  G[col*padded_rows + row] = 0;
456  return G;
457 }
458 
459 
460 void copy_back_matrix_data(mzd_t* M, const gpuword* G, int padded_rows)
461 {
462  int width = CEILCOLS(M->ncols);
463 
464  for (int row = 0; row < M->nrows; ++row)
465  {
466  word* Mrow = M->rows[row];
467  for (int col = 0; col < width; col += 2)
468  {
469  Mrow[col>>1] = G[col*padded_rows + row];
470  if (col+1 < width)
471  Mrow[col>>1] |= ((word)G[(col+1)*padded_rows + row]) << 32;
472  }
473  }
474 }
475 
void init_events(clm4rm_event_list *list)
reset events list
Definition: clm4rm.cpp:277
void clm4rm_zero_fill(clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
Fill a matrix with zero data. The operation is scheduled for asynchronous execution of the GPU....
Definition: clm4rm.cpp:364
size_t heap_size
size of allocated memory in bytes
Definition: clm4rm.cpp:78
OpenCL boolean matrix data structure. Data is arranged in 32 bit words.
Definition: clm4rm.h:98
clm4rm_event_list event_lists[2]
< pre-conditions and post-conditions
Definition: clm4rm.h:229
size_t shared_mem_bytes
size of shared memory in bytes
Definition: clm4rm.cpp:77
cl_kernel clm4rm_and_kernel
Definition: clm4rm.cpp:63
rci_t padded_rows
Number of rows padded to a multiple of 32.
Definition: clm4rm.h:100
void assertMatrixLayout(const clmatrix_t *gpu_matrix, const mzd_t *host_matrix)
Definition: clm4rm.cpp:175
cl_int clm4rm_error
latest OpenCL result code. CL_SUCCESS indicates no error.
Definition: clm4rm.cpp:9
cl_program load_program(const char *cl_kernel_directory, const char *file_name, cl_context ctx)
Definition: clm4rm.cpp:11
unsigned int gpuword
a GPU word has 32 bits
Definition: clcubic_mul.cl:74
cl_kernel clm4rm_copy_kernel
Definition: clm4rm.cpp:65
cl_event * pre_events(clm4rm_conditions *cond)
Definition: clm4rm.cpp:338
cl_build_status build_program(cl_program program, cl_device_id device, int tile_m)
Definition: clm4rm.cpp:36
size_t allocated_size
Definition: clm4rm.cpp:78
clm4rm_event_list * post
post-conditions: conditions after an operation finishes. post-conditions may act as pre-conditioins f...
Definition: clm4rm.h:234
void release_events(clm4rm_event_list *list)
release events
Definition: clm4rm.cpp:290
Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be est...
Definition: clm4rm.h:227
rci_t ncols
Number of columns.
Definition: clm4rm.h:101
cl_int clm4rm_setup(const char *cl_kernel_directory, cl_context ctx, cl_device_id device)
load OpenCL kernels and set up parameters
Definition: clm4rm.cpp:84
void clm4rm_tear_down(cl_context ctx, cl_device_id device)
release OpenCL resources
Definition: clm4rm.cpp:146
#define MAX_EVENTS
Definition: clm4rm.h:194
size_t shared_mem_words
size of shared memory in (32bit) words
Definition: clm4rm.cpp:77
gpuword * copy_matrix_data(gpuword *G, const mzd_t *M, int padded_rows)
create a column-major copy from an mzd_t matrix
Definition: clm4rm.cpp:436
bool printed_heap_warning
Definition: clm4rm.cpp:216
#define MAX_TILE_M
Definition: clm4rm.h:62
cl_kernel clm4rm_or_kernel
Definition: clm4rm.cpp:64
#define IMAGE2D
Definition: clm4rm.h:53
void clm4rm_write(clmatrix_t *gpu_matrix, const mzd_t *host_matrix, cl_command_queue queue, clm4rm_conditions *cond)
Copy matrix data from host memory to GPU. The operation is scheduled for asynchronous execution of th...
Definition: clm4rm.cpp:382
cl_kernel clm4rm_mul_kernel
OpenCL kernel for Four-Russians matrix multiplication.
Definition: clm4rm.cpp:68
cl_kernel clutri_mul_kernel[MAX_TILE_M+1]
OpenCL kernels for cubic upper-triangle matrix multiplication. Each kernel for a tile size....
Definition: clm4rm.cpp:72
a list of cl_events; used by clm4rm_conditions to keep track of scheduled jobs in the OpenCL queue.
Definition: clm4rm.h:200
void merge_events(clm4rm_event_list *a, clm4rm_event_list *b)
append two lists
Definition: clm4rm.cpp:303
void join_conditions(clm4rm_conditions *cond)
called when the pre-conditions are met. The post-conditions become the new pre-conditions.
Definition: clm4rm.cpp:319
cl_uint pre_count(clm4rm_conditions *cond)
Definition: clm4rm.cpp:331
void merge_conditions(clm4rm_conditions *a, clm4rm_conditions *b)
merge pre-conditions into one list
Definition: clm4rm.cpp:314
cl_kernel clcubic_mul_kernel[MAX_TILE_M+1]
OpenCL kernels for cubic matrix multiplication. Each kernel for a tile size. Actual tile sizes are in...
Definition: clm4rm.cpp:71
clmatrix_t * clm4rm_create(rci_t rows, rci_t cols, int rowpadding, int read_only, cl_context ctx)
create an empty matrix
Definition: clm4rm.cpp:233
#define clm4rm_radix
word size. for compatibility with GPU memory layout, we operate on 32 bit words.
Definition: clm4rm.h:41
cl_event * push_event(clm4rm_conditions *cond)
reserve one post-condition event
Definition: clm4rm.cpp:348
cl_program programs[MAX_TILE_M+1]
Definition: clm4rm.cpp:81
rci_t width
Number of words with valid bits: width = ceil(ncols / m4ri_radix) */.
Definition: clm4rm.h:103
cl_kernel clm4rm_query_diagonal_kernel
Definition: clm4rm.cpp:66
cl_mem data
handle to GPU data (32-bit unsigned integers)
Definition: clm4rm.h:114
int padded_rows(int nrows, int padding)
calculate the number of padded rows
Definition: clm4rm.cpp:185
size_t max_group_size
max. size of a work group
Definition: clm4rm.cpp:74
#define str(S)
clmatrix_t * clm4rm_copy(const mzd_t *host_matrix, int rowpadding, int read_only, cl_context ctx)
create a copy from a matrix in M4RI format
Definition: clm4rm.cpp:254
clm4rm_event_list * pre
pre-conditions: an operation is scheduled when all pre-conditions are met
Definition: clm4rm.h:231
void init_conditions(clm4rm_conditions *cond)
reset conditions list
Definition: clm4rm.cpp:284
#define CEILCOLS(i)
Definition: clcubic_mul.cl:76
#define BUFFERED
Definition: clcubic_mul.cl:28
rci_t nrows
Number of rows.
Definition: clm4rm.h:99
cl_event * pushed_event(clm4rm_conditions *cond)
Definition: clm4rm.cpp:357
clmatrix_t * clm4rm_allocate(int rows, int cols, int rowpadding)
Definition: clm4rm.cpp:192
cl_image_format IMAGE_FORMAT
Definition: clm4rm.cpp:173
size_t max_object_size
max. object allocation size
Definition: clm4rm.cpp:79
void copy_back_matrix_data(mzd_t *M, const gpuword *G, int padded_rows)
copy back a column-major matrix
Definition: clm4rm.cpp:460
cl_uint count
current number of events
Definition: clm4rm.h:201
gpuword * local_data
matrix data in CPU memory
Definition: clm4rm.h:113
rci_t padded_cols
Number of columns padded to a multiple of 64.
Definition: clm4rm.h:102
mzd_t * clm4rm_read(mzd_t *host_matrix, clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
copy matrix from gpu memory to host
Definition: clm4rm.cpp:406
void track_heap_size(size_t sz)
Definition: clm4rm.cpp:218
#define DATA_BYTES(m)
Definition: clm4rm.h:119
void release_conditions(clm4rm_conditions *cond)
release conditions list
Definition: clm4rm.cpp:297
size_t max_items[3]
max. number of items in each dimension
Definition: clm4rm.cpp:75
cl_event events[MAX_EVENTS]
array of OpenCL events
Definition: clm4rm.h:202