39 if (row_offset < A->nrows) {
57 size_t work_size[2] = { (size_t)r1-r0, (
size_t)C->
width };
61 Q_ASSERT((work_size[0] % group_size[0]) == 0);
62 Q_ASSERT((work_size[1] % group_size[1]) == 0);
90 work_dim, NULL, work_size, group_size,
97 int tile_n,
int tile_m,
98 size_t work_offset[2],
147 tile_n = MAX(1,
MIN(max_tile[0], tile_n));
149 tile_m = MAX(1,
MIN(max_tile[1], tile_m));
156 size2_t work_size_w, work_size_1, work_size_m, work_size_n, work_size, main_size_1;
160 work_size_w[1] = C->
width;
162 work_size_1[0] = work_size_w[0]/32;
163 work_size_1[1] = work_size_w[1];
167 work_size_m[0] = work_size_1[0] / tile_m;
168 work_size_m[1] = work_size_1[1] / tile_m;
170 work_size_n[0] = work_size_m[0] / tile_n;
171 work_size_n[1] = work_size_m[1] / tile_n;
173 if ((work_size_n[0] > 0) && (work_size_n[1] > 0))
180 Q_ASSERT(tile_n > 1);
186 work_size[0] = work_size_n[0]*32*tile_n;
187 work_size[1] = work_size_n[1]*tile_n;
193 main_size_1[0] = work_size_n[0]*tile_n*tile_m;
194 main_size_1[1] = work_size_n[1]*tile_n*tile_m;
196 if (work_size_1[0] > main_size_1[0]) {
199 work_offset[0] = main_size_1[0]*32;
203 rest_work_size_1[0] = work_size_1[0]-main_size_1[0];
204 rest_work_size_1[1] = work_size_1[1];
206 work_size[0] = rest_work_size_1[0]*32;
207 work_size[1] = rest_work_size_1[1];
211 if (work_size_1[1] > main_size_1[1]) {
215 work_offset[1] = main_size_1[1];
218 rest_work_size_1[0] = main_size_1[0];
219 rest_work_size_1[1] = work_size_1[1]-main_size_1[1];
221 work_size[0] = rest_work_size_1[0]*32;
222 work_size[1] = rest_work_size_1[1];
232 clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
233 sizeof(cl_ulong), &end, NULL);
234 clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
235 sizeof(cl_ulong), &start, NULL);
237 float executionTimeInMilliseconds = (end - start) * 1.0e-6f;
238 printf(
" -- GPU clock = %lf ms\n", executionTimeInMilliseconds);
242 int tile_n,
int tile_m,
243 size_t work_offset[2],
253 cl_uint work_dim = 2;
254 size2_t group_size = { (size_t)tile_n*32, (
size_t)tile_n };
278 clm4rm_error = clSetKernelArg(kernel, p++, buf_bytes, NULL);
279 clm4rm_error = clSetKernelArg(kernel, p++, buf_bytes, NULL);
292 work_dim, work_offset, work_size, group_size,
295 case CL_MEM_OBJECT_ALLOCATION_FAILURE:
296 printf(
"OpenCL: CL_MEM_OBJECT_ALLOCATION_FAILURE. " 361 queue, C->
data, &zero,
sizeof(zero),
365 tile_n = MAX(1,
MIN(max_tile[0], tile_n));
368 tile_m = MAX(1,
MIN(max_tile[1], tile_m));
373 size2_t work_size_w, work_size_1, work_size_m, work_size_n, work_size, main_size_1;
377 work_size_w[1] = C->
width;
379 work_size_1[0] = work_size_w[0]/32;
380 work_size_1[1] = work_size_w[1];
384 work_size_m[0] = work_size_1[0] / tile_m;
385 work_size_m[1] = work_size_1[1] / tile_m;
387 work_size_n[0] = work_size_m[0] / tile_n;
388 work_size_n[1] = work_size_m[1] / tile_n;
390 if ((work_size_n[0] > 0) && (work_size_n[1] > 0))
397 Q_ASSERT(tile_n > 1);
403 work_size[0] = (work_size_n[0] - (work_size_n[0]-1)/2)*32*tile_n;
406 work_size[1] = work_size_n[1]*tile_n;
412 main_size_1[0] = work_size_n[0]*tile_n*tile_m;
413 main_size_1[1] = work_size_n[1]*tile_n*tile_m;
415 if (work_size_1[1] > main_size_1[1]) {
419 work_offset[1] = main_size_1[1];
422 rest_work_size_1[0] = work_size_1[0];
423 rest_work_size_1[1] = work_size_1[1]-main_size_1[1];
425 work_size[0] = rest_work_size_1[0]*32;
426 work_size[1] = rest_work_size_1[1];
434 uint32_t mask = (1<<k);
435 for( ; mask; mask = mask>>1)
436 printf(
"%i", (x & mask)?1:0);
444 uint32_t lsb = x & -x;
456 for(
int i=1; i < k; ++i) {
458 uint32_t val = (1 << i) - 1;
460 uint32_t stop = val << (k-i-1);
464 uint32_t c = val & -val;
465 uint32_t r = val + c;
466 val = (((r ^ val) >> 2) / c) | r;
void clm4rm_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using the method of the Four Russians. C := A * B.
OpenCL boolean matrix data structure. Data is arranged in 32 bit words.
void clcubic_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using nested loops. C := A*B.
void clm4rm_mul_block(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, int r0, int r1, cl_command_queue queue, clm4rm_conditions *cond)
size_t shared_mem_bytes
size of shared memory in bytes
rci_t padded_rows
Number of rows padded to a multiple of 32.
size_t size2_t[2]
two-dimensional size; used for various OpenCL parameters
cl_int clm4rm_error
latest OpenCL result code. CL_SUCCESS indicates no error.
unsigned int gpuword
a GPU word has 32 bits
void clcubic_mul_enqeue(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, int tile_n, int tile_m, size_t work_offset[2], size_t work_size[2], int uptri, cl_command_queue queue, clm4rm_conditions *cond)
cl_event * pre_events(clm4rm_conditions *cond)
cl_kernel clcubic_mul_kernel[MAX_TILE_M+1]
OpenCL kernels for cubic matrix multiplication. Each kernel for a tile size. Actual tile sizes are in...
Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be est...
rci_t ncols
Number of columns.
Float sqrt(const Float &x)
square-root function template for floating point types
size_t shared_mem_words
size of shared memory in (32bit) words
void printb(uint32_t x, int k)
cl_kernel clm4rm_mul_kernel
OpenCL kernel for Four-Russians matrix multiplication.
cl_uint pre_count(clm4rm_conditions *cond)
cl_event * push_event(clm4rm_conditions *cond)
reserve one post-condition event
rci_t width
Number of words with valid bits: width = ceil(ncols / m4ri_radix).
cl_mem data
handle to GPU data (32-bit unsigned integers)
cl_kernel clutri_mul_kernel[MAX_TILE_M+1]
OpenCL kernels for cubic upper-triangle matrix multiplication. Each kernel for a tile size....
size_t max_group_size
max. size of a work group
void clutri_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using nested loops. C := A*B. Assumes matrices to be upper tr...
rci_t nrows
Number of rows.
cl_event * pushed_event(clm4rm_conditions *cond)
void print3(uint32_t x, int k)
void print_event_info(cl_event event)
void create_index_tables(uint32_t k)