19 #if defined(__APPLE__) || defined(__MACOSX) 20 # include <OpenCL/opencl.h> 22 # include <CL/opencl.h> 27 #if defined(__cplusplus) && !defined(_MSC_VER) 31 #if defined(__cplusplus) && !defined(_MSC_VER) 40 #define clm4rm_radix 32 70 #define CEILDIV(x,y) (((x)+(y)-1)/(y)) 73 #define FLOOR(x,y) ((y)*((x)/(y))) 76 #define CEILCOLS(i) CEILDIV(i,clm4rm_radix) 78 #define POW2(i) (((gpuword)1)<<(i)) 119 #define DATA_BYTES(m) ( (m)->padded_rows * (m)->width * sizeof(gpuword) ) 178 cl_context ctx, cl_device_id device);
300 int read_only, cl_context ctx);
311 int read_only, cl_context ctx);
474 cl_context ctx, cl_command_queue queue,
485 cl_command_queue queue,
OpenCL boolean matrix data structure. Data is arranged in 32 bit words.
clm4rm_event_list event_lists[2]
pre-conditions and post-conditions
size_t heap_size
size of allocated memory in bytes
void clm4rm_stack(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
concatenate two matrices
rci_t padded_rows
Number of rows padded to a multiple of 32.
size_t size2_t[2]
two-dimensional size; used for various OpenCL parameters
cl_event * push_event(clm4rm_conditions *cond)
reserve one post-condition event
void join_conditions(clm4rm_conditions *cond)
called when the pre-conditions are met. The post-conditions become the new pre-conditions.
unsigned int gpuword
a GPU word has 32 bits
clm4rm_event_list * post
post-conditions: conditions after an operation finishes. post-conditions may act as pre-conditions f...
Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be est...
void clm4rm_concat(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
concatenate two matrices
size_t shared_mem_words
size of shared memory in (32bit) words
rci_t ncols
Number of columns.
void clm4rm_tear_down(cl_context ctx, cl_device_id device)
release OpenCL resources
void clcubic_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using nested loops. C := A*B.
void clm4rm_free(clmatrix_t *gpu_matrix)
release memory (CPU and GPU)
size_t max_group_size
max. size of a work group
cl_int clm4rm_setup(const char *cl_kernel_directory, cl_context ctx, cl_device_id device)
load OpenCL kernels and set up parameters
clmatrix_t * clm4rm_copy(const mzd_t *host_matrix, int rowpadding, int read_only, cl_context ctx)
create a copy from a matrix in M4RI format
void init_conditions(clm4rm_conditions *cond)
reset conditions list
cl_event * pre_events(clm4rm_conditions *cond)
void merge_events(clm4rm_event_list *a, clm4rm_event_list *b)
append two lists
uint32_t gpuword
word size of GPU data (32 bits)
size_t shared_mem_bytes
size of shared memory in bytes
void release_conditions(clm4rm_conditions *cond)
release conditions list
a list of cl_events; used by clm4rm_conditions to keep track of scheduled jobs in the OpenCL queue.
cl_int clm4rm_error
latest OpenCL result code. CL_SUCCESS indicates no error.
void clm4rm_zero_fill(clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
Fill a matrix with zero data. The operation is scheduled for asynchronous execution on the GPU....
cl_uint pre_count(clm4rm_conditions *cond)
void copy_back_matrix_data(mzd_t *dest, const gpuword *src, int padded_rows)
copy back a column-major matrix
rci_t width
Number of words with valid bits: width = ceil(ncols / m4ri_radix).
size_t max_items[3]
max. number of items in each dimension
void clm4rm_and(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
perform element-wise logical conjunction (AND). For each entry, compute C_ij := A_ij & B_ij....
cl_mem data
handle to GPU data (32-bit unsigned integers)
void clm4rm_write(clmatrix_t *gpu_matrix, const mzd_t *host_matrix, cl_command_queue queue, clm4rm_conditions *cond)
Copy matrix data from host memory to GPU. The operation is scheduled for asynchronous execution of th...
void merge_conditions(clm4rm_conditions *a, clm4rm_conditions *b)
merge pre-conditions into one list
int padded_rows(int nrows, int padding)
calculate the number of padded rows
clm4rm_event_list * pre
pre-conditions: an operation is scheduled when all pre-conditions are met
void clm4rm_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using the method of the Four Russians. C := A * B.
clmatrix_t * clm4rm_create(rci_t rows, rci_t cols, int rowpadding, int read_only, cl_context ctx)
create an empty matrix
rci_t nrows
Number of rows.
mzd_t * clm4rm_read(mzd_t *host_matrix, clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
copy matrix from gpu memory to host
cl_uint count
current number of events
void init_events(clm4rm_event_list *list)
reset events list
gpuword * local_data
matrix data in CPU memory
cl_event * pushed_event(clm4rm_conditions *cond)
rci_t padded_cols
Number of columns padded to a multiple of 64.
void release_events(clm4rm_event_list *list)
release events
size_t max_object_size
max. object allocation size
cl_mem clm4rm_query_diagonal(clmatrix_t *M, cl_context ctx, cl_command_queue queue, clm4rm_conditions *cond)
find a non-zero entry on the diagonal of a matrix. Return the column/row of the first non-zero entry,...
void clm4rm_or(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
perform element-wise logical disjunction (OR)
void clutri_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using nested loops. C := A*B. Assumes matrices to be upper tr...
gpuword * copy_matrix_data(gpuword *dest, const mzd_t *src, int padded_rows)
create a column-major copy from an mzd_t matrix
int clm4rm_query_result(cl_mem result_buffer, cl_command_queue queue, clm4rm_conditions *cond)
examine the result of a previous call to clm4rm_query_diagonal
cl_event events[MAX_EVENTS]
array of OpenCL events