Fréchet View  1.6.0
A Tool for Exploring Fréchet Distance Algorithms
clm4rm.h
Go to the documentation of this file.
1 
5 #ifndef CLM4RM_H
6 #define CLM4RM_H
7 
18 // Includes the normal OpenCL C header
19 #if defined(__APPLE__) || defined(__MACOSX)
20 # include <OpenCL/opencl.h>
21 #else
22 # include <CL/opencl.h>
23 #endif
24 
25 // host data structures from M4RI
26 // row-major 64-bit unsigned integers
27 #if defined(__cplusplus) && !defined(_MSC_VER)
28 extern "C" {
29 #endif
30 #include <m4ri/mzd.h>
31 #if defined(__cplusplus) && !defined(_MSC_VER)
32 }
33 #endif
34 
40 #define clm4rm_radix 32
42 
53 #define IMAGE2D 0
54 
57 #define BUFFERED 1
58 
62 #define MAX_TILE_M 6
63 
65 typedef uint32_t gpuword;  /* word size of GPU data: one 32-bit unsigned integer */
67 typedef size_t size2_t[2];  /* two-dimensional size (array of two size_t); used for various OpenCL parameters */
68 
70 #define CEILDIV(x,y) (((x)+(y)-1)/(y))  /* integer division x/y rounded up (assumes x >= 0, y > 0); y is evaluated twice */
71 
73 #define FLOOR(x,y) ((y)*((x)/(y)))  /* x rounded down to a multiple of y using integer division; y is evaluated twice */
74 
76 #define CEILCOLS(i) CEILDIV(i,clm4rm_radix)  /* i divided by clm4rm_radix, rounded up; presumably the number of words needed for i columns */
77 
78 #define POW2(i) (((gpuword)1)<<(i))  /* gpuword with only bit i set; i must be smaller than the bit width of gpuword */
79 
98 struct clmatrix_t {  /* OpenCL boolean matrix data structure. Data is arranged in 32 bit words. */
99  rci_t nrows;  /* Number of rows. */
100  rci_t padded_rows;  /* Number of rows padded to a multiple of 32. */
101  rci_t ncols;  /* Number of columns. */
102  rci_t padded_cols;  /* Number of padded columns. NOTE(review): generated docs say "multiple of 64" although clm4rm_radix is 32 - confirm. */
103  rci_t width;  /* Number of words with valid bits. NOTE(review): generated docs say ceil(ncols / m4ri_radix); with 32-bit words this is presumably clm4rm_radix - confirm. */
104  /*
105  * Offset in words between rows.
106  *
107  * rowstride = ((width & 1) == 0) ? width : width + 1;
108  * where width is the width of the underlying non-windowed matrix.
109  * @deprecated not used anymore
110  */
111  //wi_t rowstride;
112 
114  cl_mem data;  /* handle to GPU data (32-bit unsigned integers) */
115 
116  // TODO indicators for Triangular matrices ?
117 };
118 typedef struct clmatrix_t clmatrix_t;  /* allows plain `clmatrix_t` without the struct keyword in C */
119 #define DATA_BYTES(m) ( (size_t)(m)->padded_rows * (size_t)(m)->width * sizeof(gpuword) )  /* size in bytes of the matrix data (padded_rows * width words); operands are widened to size_t first so the product is not computed in int and cannot overflow for large matrices */
120 
127 int padded_rows(int nrows, int padding);  /* calculate the number of padded rows */
128 
136 gpuword* copy_matrix_data(gpuword* dest, const mzd_t* src, int padded_rows);  /* create a column-major copy from an mzd_t matrix */
137 
144 void copy_back_matrix_data(mzd_t* dest, const gpuword* src, int padded_rows);  /* copy a column-major matrix back into an mzd_t matrix */
145 
153 extern cl_int clm4rm_error;  /* latest OpenCL result code. CL_SUCCESS indicates no error. */
155 
157 extern size_t max_group_size;  /* max. size of a work group */
159 extern size_t max_items[3];  /* max. number of items in each dimension */
160 
162 extern size_t shared_mem_bytes;  /* size of shared memory in bytes */
164 extern size_t shared_mem_words;  /* size of shared memory in (32bit) words */
166 extern size_t heap_size, allocated_size;  /* heap_size: size of allocated memory in bytes; allocated_size: undocumented - TODO confirm in clm4rm.cpp */
168 extern size_t max_object_size;  /* max. object allocation size */
169 
177 cl_int clm4rm_setup(const char* cl_kernel_directory,
178  cl_context ctx, cl_device_id device);  /* load OpenCL kernels and set up parameters */
184 void clm4rm_tear_down(cl_context ctx, cl_device_id device);  /* release OpenCL resources */
185 
194 #define MAX_EVENTS 6
195 
201  cl_uint count;
202  cl_event events[MAX_EVENTS];
203 };
209 void init_events(clm4rm_event_list* list);  /* reset events list */
221 
235 };
259 
264 cl_uint pre_count(clm4rm_conditions* cond);  /* presumably the number of pre-condition events in cond - confirm in clm4rm.cpp */
269 cl_event* pre_events(clm4rm_conditions* cond);  /* presumably the array of pre-condition events in cond - confirm in clm4rm.cpp */
275 cl_event* push_event(clm4rm_conditions* cond);  /* reserve one post-condition event */
280 cl_event* pushed_event(clm4rm_conditions* cond);  /* presumably the event most recently reserved by push_event - confirm in clm4rm.cpp */
281 
299 clmatrix_t* clm4rm_create(rci_t rows, rci_t cols, int rowpadding,
300  int read_only, cl_context ctx);  /* create an empty matrix */
310 clmatrix_t* clm4rm_copy(const mzd_t* host_matrix, int rowpadding,
311  int read_only, cl_context ctx);  /* create a copy from a matrix in M4RI format */
312 
313 //? clmatrix_t* clm4rm_copy(clmatrix_t* gpu_matrix);
314 
324 void clm4rm_zero_fill(clmatrix_t* gpu_matrix,
325  cl_command_queue queue, clm4rm_conditions* cond);  /* fill a matrix with zero data; the operation is scheduled for asynchronous execution */
326 
337 void clm4rm_write(clmatrix_t* gpu_matrix, const mzd_t* host_matrix,
338  cl_command_queue queue, clm4rm_conditions* cond);  /* copy matrix data from host memory to GPU; the operation is scheduled for asynchronous execution */
339 
340 
341 
350 mzd_t* clm4rm_read(mzd_t* host_matrix, clmatrix_t* gpu_matrix,
351  cl_command_queue queue, clm4rm_conditions* cond);  /* copy matrix from GPU memory to host */
352 
357 void clm4rm_free(clmatrix_t* gpu_matrix);  /* release memory (CPU and GPU) */
358 
374  cl_command_queue queue, clm4rm_conditions* cond);
391  size2_t max_tile,
392  cl_command_queue queue, clm4rm_conditions* cond);
393 
411  size2_t max_tile,
412  cl_command_queue queue, clm4rm_conditions* cond);
413 
414 /*
415  * @brief C := C + A*B
416  * @deprecated not used anymore
417  */
418 /*cl_event clm4rm_addmul(clmatrix_t* C, clmatrix_t* A, clmatrix_t* B,
419  cl_command_queue queue);*/
420 
426  cl_command_queue queue, clm4rm_conditions* cond);
427 
433  cl_command_queue queue, clm4rm_conditions* cond);
434 
439 void clm4rm_or(clmatrix_t* C, clmatrix_t* A, clmatrix_t* B,
440  cl_command_queue queue, clm4rm_conditions* cond);  /* perform element-wise logical disjunction (OR) */
441 
454  cl_command_queue queue, clm4rm_conditions* cond);
455 // TODO
456 //cl_event clutri_and(clmatrix_t* C, clmatrix_t* A, clmatrix_t* B,
457 // cl_command_queue queue, int wait_for_it);
458 
459 
460 // @returns
474  cl_context ctx, cl_command_queue queue,
475  clm4rm_conditions* cond);
484 int clm4rm_query_result(cl_mem result_buffer,
485  cl_command_queue queue,
486  clm4rm_conditions* cond);  /* examine the result of a previous call to clm4rm_query_diagonal */
487 
488 /* @} */
489 
490 
491 #endif //CLM4RM_H
OpenCL boolean matrix data structure. Data is arranged in 32 bit words.
Definition: clm4rm.h:98
clm4rm_event_list event_lists[2]
pre-conditions and post-conditions
Definition: clm4rm.h:229
size_t heap_size
size of allocated memory in bytes
Definition: clm4rm.cpp:78
void clm4rm_stack(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
concatenate two matrices
rci_t padded_rows
Number of rows padded to a multiple of 32.
Definition: clm4rm.h:100
size_t size2_t[2]
two-dimensional size; used for various OpenCL parameters
Definition: clm4rm.h:67
cl_event * push_event(clm4rm_conditions *cond)
reserve one post-condition event
Definition: clm4rm.cpp:348
void join_conditions(clm4rm_conditions *cond)
called when the pre-conditions are met. The post-conditions become the new pre-conditions.
Definition: clm4rm.cpp:319
unsigned int gpuword
a GPU word has 32 bits
Definition: clcubic_mul.cl:74
clm4rm_event_list * post
post-conditions: conditions after an operation finishes. post-conditions may act as pre-conditions f...
Definition: clm4rm.h:234
size_t allocated_size
Definition: clm4rm.cpp:78
Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be est...
Definition: clm4rm.h:227
void clm4rm_concat(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
concatenate two matrices
size_t shared_mem_words
size of shared memory in (32bit) words
Definition: clm4rm.cpp:77
rci_t ncols
Number of columns.
Definition: clm4rm.h:101
void clm4rm_tear_down(cl_context ctx, cl_device_id device)
release OpenCL resources
Definition: clm4rm.cpp:146
void clcubic_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using nested loops. C := A*B.
void clm4rm_free(clmatrix_t *gpu_matrix)
release memory (CPU and GPU)
size_t max_group_size
max. size of a work group
Definition: clm4rm.cpp:74
#define MAX_EVENTS
Definition: clm4rm.h:194
cl_int clm4rm_setup(const char *cl_kernel_directory, cl_context ctx, cl_device_id device)
load OpenCL kernels and set up parameters
Definition: clm4rm.cpp:84
clmatrix_t * clm4rm_copy(const mzd_t *host_matrix, int rowpadding, int read_only, cl_context ctx)
create a copy from a matrix in M4RI format
Definition: clm4rm.cpp:254
void init_conditions(clm4rm_conditions *cond)
reset conditions list
Definition: clm4rm.cpp:284
cl_event * pre_events(clm4rm_conditions *cond)
Definition: clm4rm.cpp:338
void merge_events(clm4rm_event_list *a, clm4rm_event_list *b)
append tow lists
Definition: clm4rm.cpp:303
uint32_t gpuword
word size of GPU data (32 bits)
Definition: clm4rm.h:65
size_t shared_mem_bytes
size of shared memory in bytes
Definition: clm4rm.cpp:77
void release_conditions(clm4rm_conditions *cond)
release conditions list
Definition: clm4rm.cpp:297
a list of cl_events; used by clm4rm_conditions to keep track of scheduled jobs in the OpenCL queue.
Definition: clm4rm.h:200
cl_int clm4rm_error
latest OpenCL result code. CL_SUCCESS indicates no error.
Definition: clm4rm.cpp:9
void clm4rm_zero_fill(clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
Fill a matrix with zero data. The operation is scheduled for asynchronous execution of the GPU....
Definition: clm4rm.cpp:364
cl_uint pre_count(clm4rm_conditions *cond)
Definition: clm4rm.cpp:331
void copy_back_matrix_data(mzd_t *dest, const gpuword *src, int padded_rows)
copy back a column-major matrix
Definition: clm4rm.cpp:460
rci_t width
Number of words with valid bits: width = ceil(ncols / m4ri_radix).
Definition: clm4rm.h:103
size_t max_items[3]
max. number of items in each dimension
Definition: clm4rm.cpp:75
void clm4rm_and(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
perform element-wise logical conjunction (AND). For each entry, compute C_ij := A_ij & B_ij....
cl_mem data
handle to GPU data (32-bit unsigned integers)
Definition: clm4rm.h:114
void clm4rm_write(clmatrix_t *gpu_matrix, const mzd_t *host_matrix, cl_command_queue queue, clm4rm_conditions *cond)
Copy matrix data from host memory to GPU. The operation is scheduled for asynchronous execution of th...
Definition: clm4rm.cpp:382
void merge_conditions(clm4rm_conditions *a, clm4rm_conditions *b)
merge pre-conditions into one list
Definition: clm4rm.cpp:314
int padded_rows(int nrows, int padding)
calculate the number of padded rows
Definition: clm4rm.cpp:185
clm4rm_event_list * pre
pre-conditions: an operation is scheduled when all pre-conditions are met
Definition: clm4rm.h:231
void clm4rm_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using the method of the Four Russians. C := A * B.
clmatrix_t * clm4rm_create(rci_t rows, rci_t cols, int rowpadding, int read_only, cl_context ctx)
create an empty matrix
Definition: clm4rm.cpp:233
rci_t nrows
Number of rows.
Definition: clm4rm.h:99
mzd_t * clm4rm_read(mzd_t *host_matrix, clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
copy matrix from gpu memory to host
Definition: clm4rm.cpp:406
cl_uint count
current number of events
Definition: clm4rm.h:201
void init_events(clm4rm_event_list *list)
reset events list
Definition: clm4rm.cpp:277
gpuword * local_data
matrix data in CPU memory
Definition: clm4rm.h:113
cl_event * pushed_event(clm4rm_conditions *cond)
Definition: clm4rm.cpp:357
rci_t padded_cols
Number of columns padded to a multiple of 64.
Definition: clm4rm.h:102
void release_events(clm4rm_event_list *list)
release events
Definition: clm4rm.cpp:290
size_t max_object_size
max. object allocation size
Definition: clm4rm.cpp:79
cl_mem clm4rm_query_diagonal(clmatrix_t *M, cl_context ctx, cl_command_queue queue, clm4rm_conditions *cond)
find a non-zero entry on the diagonal of a matrix. Return the column/row of the first non-zero entry,...
void clm4rm_or(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
perform element-wise logical disjunction (OR)
void clutri_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using nested loops. C := A*B. Assumes matrices to be upper tr...
gpuword * copy_matrix_data(gpuword *dest, const mzd_t *src, int padded_rows)
create a column-major copy from an mzd_t matrix
Definition: clm4rm.cpp:436
int clm4rm_query_result(cl_mem result_buffer, cl_command_queue queue, clm4rm_conditions *cond)
examine the result of a previous call to clm4rm_query_diagonal
cl_event events[MAX_EVENTS]
array of OpenCL events
Definition: clm4rm.h:202