37 size_t work_size[2] = { (size_t)A->
nrows, (
size_t)A->
width };
39 work_dim, NULL, work_size, NULL,
59 size_t work_size[2] = { (size_t)A->
nrows, (
size_t)A->
width };
61 work_dim, NULL, work_size, NULL,
68 cl_context ctx, cl_command_queue queue,
74 cl_mem result_buffer = clCreateBuffer(ctx, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
86 work_dim, NULL, &work_size, NULL,
99 clm4rm_error = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE,
100 0,
sizeof(
int), &result,
104 clReleaseMemObject(result_buffer);
115 clReleaseMemObject(gpu_matrix->
data);
134 size_t aorigin[3] = { 0,0,0 };
141 aorigin, aorigin, aregion,
142 0, NULL, &events[0]);
145 aorigin, borigin, bregion,
146 0, NULL, &events[1]);
167 size_t Arows = A->
nrows;
168 size_t Acols = A->
ncols;
169 size_t Brows = B->
nrows;
170 size_t Bcols = B->
ncols;
171 size_t origin[3] = { 0,0,0 };
172 size_t aregion[3] = { 0, Arows, 1 };
178 origin, origin, aregion,
179 0, NULL, &events[0]);
183 size_t boffset[3] = { A->
ncols, 0, 1 };
188 origin, boffset, bregion,
199 origin, origin, aregion,
205 if ((A->
ncols % 8)==0) {
207 size_t boffset[3] = { Acols/8, 0, 0 };
208 size_t bregion[3] = { 0, Brows, 1 };
212 origin, boffset, bregion,
232 cl_uint work_dim = 2;
233 size_t work_size[2] = { Brows,
CEILCOLS(Bcols) };
235 work_dim, NULL, work_size, NULL,
OpenCL boolean matrix data structure. Data is arranged in 32-bit words.
rci_t padded_rows
Number of rows padded to a multiple of 32.
void clm4rm_free(clmatrix_t *gpu_matrix)
release memory (CPU and GPU)
cl_int clm4rm_error
latest OpenCL result code. CL_SUCCESS indicates no error.
cl_event * pre_events(clm4rm_conditions *cond)
cl_kernel clm4rm_or_kernel
void clm4rm_and(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
perform element-wise logical conjunction (AND). For each entry, compute C_ij := A_ij & B_ij....
Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be est...
void clm4rm_stack(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
concatenate two matrices
void assertMatrixSize(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B)
rci_t ncols
Number of columns.
cl_kernel clm4rm_and_kernel
cl_kernel clm4rm_query_diagonal_kernel
void clm4rm_or(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
perform element-wise logical disjunction (OR)
cl_uint pre_count(clm4rm_conditions *cond)
#define clm4rm_radix
Word size. For compatibility with the GPU memory layout, we operate on 32-bit words.
cl_event * push_event(clm4rm_conditions *cond)
reserve one post-condition event
rci_t width
Number of words with valid bits: width = ceil(ncols / m4ri_radix).
cl_mem data
handle to GPU data (32-bit unsigned integers)
int clm4rm_query_result(cl_mem result_buffer, cl_command_queue queue, clm4rm_conditions *cond)
examine the result of a previous call to clm4rm_query_diagonal
void clm4rm_concat(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
concatenate two matrices
rci_t nrows
Number of rows.
cl_event * pushed_event(clm4rm_conditions *cond)
cl_kernel clm4rm_copy_kernel
gpuword * local_data
matrix data in CPU memory
void release_conditions(clm4rm_conditions *cond)
release conditions list
cl_mem clm4rm_query_diagonal(clmatrix_t *M, cl_context ctx, cl_command_queue queue, clm4rm_conditions *cond)
find a non-zero entry on the diagonal of a matrix. Return the column/row of the first non-zero entry,...