Fréchet View  1.6.0
A Tool for Exploring Fréchet Distance Algorithms
clm4rm_multiplication.cpp
Go to the documentation of this file.
1 //
2 // Created by nightrider on 21.09.18.
3 //
4 
5 #include <clm4rm.h>
6 #include <stdlib.h>
7 #include <qdebug.h>
11 extern cl_kernel clm4rm_mul_kernel;
12 
17 extern cl_kernel clcubic_mul_kernel[MAX_TILE_M+1];
22 extern cl_kernel clutri_mul_kernel[MAX_TILE_M+1];
23 
24 void print_event_info(cl_event event);
25 
27  int r0, int r1,
28  cl_command_queue queue, clm4rm_conditions* cond);
29 
31  cl_command_queue queue, clm4rm_conditions* cond)
32 {
33  int row_offset = A->nrows - (A->nrows % max_group_size);
34  cl_event events[2];
35  int e=0;
36  if (row_offset > 0) {
37  clm4rm_mul_block(C, A, B, 0, row_offset, queue, cond);
38  }
39  if (row_offset < A->nrows) {
40  clm4rm_mul_block(C, A, B, row_offset, A->nrows, queue, cond);
41  }
42 }
43 
45  clmatrix_t* C, clmatrix_t* A, clmatrix_t* B,
46  int r0, int r1,
47  cl_command_queue queue, clm4rm_conditions* cond)
48 {
49  Q_ASSERT(A->ncols==B->nrows);
50  Q_ASSERT(A->nrows==C->nrows);
51  Q_ASSERT(B->ncols==C->ncols);
52  Q_ASSERT(A->width*32 == B->padded_rows);
53  Q_ASSERT(A->padded_rows==C->padded_rows);
54 
55  // 1 Work-Item = 1 Word in C
56  cl_uint work_dim = 2;
57  size_t work_size[2] = { (size_t)r1-r0, (size_t)C->width };
58  size_t group_size[2] = { MIN(work_size[0],max_group_size), 1 };
59 
60  // group size must divide work size
61  Q_ASSERT((work_size[0] % group_size[0]) == 0);
62  Q_ASSERT((work_size[1] % group_size[1]) == 0);
63 
64  // Work-Group = 1 Word-Column.
65  // Each Work-Group manages their Look-Up-Tables.
66 
67  int k=8; //TODO auto-choose
68 
69  /*
70  __kernel void clm4rm_mul(
71  __global gpuword* C,
72  __global gpuword* A,
73  __global gpuword* B,
74  __local gpuword T [2^k],
75  int k, int r0,
76  int A_nrows, int A_ncols, int B_ncols)
77  */
78 
79  clm4rm_error = clSetKernelArg(clm4rm_mul_kernel, 0, sizeof(cl_mem), &C->data);
80  clm4rm_error = clSetKernelArg(clm4rm_mul_kernel, 1, sizeof(cl_mem), &A->data);
81  clm4rm_error = clSetKernelArg(clm4rm_mul_kernel, 2, sizeof(cl_mem), &B->data);
82  clm4rm_error = clSetKernelArg(clm4rm_mul_kernel, 3, sizeof(gpuword)*POW2(k), NULL);
83  clm4rm_error = clSetKernelArg(clm4rm_mul_kernel, 4, sizeof(int), &k);
84  clm4rm_error = clSetKernelArg(clm4rm_mul_kernel, 5, sizeof(int), &r0);
85  clm4rm_error = clSetKernelArg(clm4rm_mul_kernel, 6, sizeof(int), &A->padded_rows);
86  clm4rm_error = clSetKernelArg(clm4rm_mul_kernel, 7, sizeof(int), &A->ncols);
87  clm4rm_error = clSetKernelArg(clm4rm_mul_kernel, 8, sizeof(int), &B->ncols);
88 
89  clm4rm_error = clEnqueueNDRangeKernel(queue, clm4rm_mul_kernel,
90  work_dim, NULL, work_size, group_size,
91  pre_count(cond),pre_events(cond),push_event(cond));
92  Q_ASSERT(clm4rm_error==CL_SUCCESS);
93  Q_ASSERT(pushed_event(cond) != NULL);
94 }
95 
97  int tile_n, int tile_m,
98  size_t work_offset[2],
99  size_t work_size[2],
100  int uptri,
101  cl_command_queue queue, clm4rm_conditions* cond);
102 
133  cl_command_queue queue, clm4rm_conditions* cond)
134 {
135  // A and B and C are expected to be padded to multiple of 32 rows
136  Q_ASSERT(A->padded_rows % 32 == 0);
137  Q_ASSERT(B->padded_rows % 32 == 0);
138  Q_ASSERT(C->padded_rows % 32 == 0);
139 
140  Q_ASSERT(A->ncols == B->nrows);
141  Q_ASSERT(A->nrows == C->nrows);
142  Q_ASSERT(B->ncols == C->ncols);
143  Q_ASSERT(A->padded_rows==C->padded_rows);
144  Q_ASSERT(A->nrows==C->nrows);
145 
146  int tile_n = sqrt(max_group_size/32);
147  tile_n = MAX(1, MIN(max_tile[0], tile_n));
148  int tile_m = (sqrt(1 + 17*shared_mem_words)-1)/(68*tile_n);
149  tile_m = MAX(1, MIN(max_tile[1], tile_m));
150  Q_ASSERT(tile_m <= MAX_TILE_M);
151  // limtied by shared memory size
152  // On Intel HD Graphis, n=2,m=2 seems to be OK, but m=3 is a disaster.
153  // Why is that so? Is the register file overbooked? Is shared memory overbooked?
154 
155  size2_t work_offset = { 0,0 };
156  size2_t work_size_w, work_size_1, work_size_m, work_size_n, work_size, main_size_1;
157 
158  // work-size in words
159  work_size_w[0] = C->padded_rows;
160  work_size_w[1] = C->width;
161  // work-size in 1x1 tiles (exact)
162  work_size_1[0] = work_size_w[0]/32;
163  work_size_1[1] = work_size_w[1];
164 
165  for(;;) {
166  // work-size in mxm tiles (small tiles)
167  work_size_m[0] = work_size_1[0] / tile_m; // rounded down
168  work_size_m[1] = work_size_1[1] / tile_m; // rounded down
169  // work-size in nxn tiles (big tiles)
170  work_size_n[0] = work_size_m[0] / tile_n; // rounded down
171  work_size_n[1] = work_size_m[1] / tile_n; // rounded down
172 
173  if ((work_size_n[0] > 0) && (work_size_n[1] > 0))
174  break;
175 
176  // else: reduce tile size
177  if (tile_m > 1)
178  tile_m--;
179  else {
180  Q_ASSERT(tile_n > 1);
181  tile_n--;
182  }
183  }
184 
185  // work-size in items
186  work_size[0] = work_size_n[0]*32*tile_n;
187  work_size[1] = work_size_n[1]*tile_n;
188  // group size is always { 32*tile_n,tile_n }
189 
190 // printf(" -- tile n=%i m=%i\n",tile_n, tile_m);
191  clcubic_mul_enqeue(C, A, B, tile_n,tile_m, work_offset, work_size, 0, queue, cond);
192 
193  main_size_1[0] = work_size_n[0]*tile_n*tile_m;
194  main_size_1[1] = work_size_n[1]*tile_n*tile_m;
195 
196  if (work_size_1[0] > main_size_1[0]) {
197  // rest bottom with 1x1 tiles
198  // work offset in rows
199  work_offset[0] = main_size_1[0]*32;
200  work_offset[1] = 0;
201 
202  size2_t rest_work_size_1;
203  rest_work_size_1[0] = work_size_1[0]-main_size_1[0];
204  rest_work_size_1[1] = work_size_1[1];
205  // tile_n = tile_m = 1
206  work_size[0] = rest_work_size_1[0]*32;
207  work_size[1] = rest_work_size_1[1];
208 
209  clcubic_mul_enqeue(C, A, B, 1,1, work_offset, work_size, 0, queue, cond);
210  }
211  if (work_size_1[1] > main_size_1[1]) {
212  // rest right with 1x1 tiles
213  // work offset in rows
214  work_offset[0] = 0;
215  work_offset[1] = main_size_1[1];
216 
217  size2_t rest_work_size_1;
218  rest_work_size_1[0] = main_size_1[0];
219  rest_work_size_1[1] = work_size_1[1]-main_size_1[1];
220 
221  work_size[0] = rest_work_size_1[0]*32;
222  work_size[1] = rest_work_size_1[1];
223 
224  clcubic_mul_enqeue(C, A, B, 1,1, work_offset, work_size, 0, queue, cond);
225  }
226 }
227 
228 void print_event_info(cl_event event)
229 {
230  cl_ulong start, end;
231 
232  clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
233  sizeof(cl_ulong), &end, NULL);
234  clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
235  sizeof(cl_ulong), &start, NULL);
236 
237  float executionTimeInMilliseconds = (end - start) * 1.0e-6f;
238  printf(" -- GPU clock = %lf ms\n", executionTimeInMilliseconds);
239 }
240 
242  int tile_n, int tile_m,
243  size_t work_offset[2],
244  size_t work_size[2],
245  int utri,
246  cl_command_queue queue, clm4rm_conditions* cond)
247 {
248  int tile_width = tile_n*tile_m;
249  int tile_ncols = tile_width*32;
250  int tile_nrows = tile_n*tile_m*32;
251  int col_stride = 34*tile_n*tile_m+1;
252 
253  cl_uint work_dim = 2;
254  size2_t group_size = { (size_t)tile_n*32, (size_t)tile_n };
255 
256 // B buffer is column-major
257 // A buffer is row-major
258  size_t buf_bytes = sizeof(gpuword)*tile_width*col_stride;
259  Q_ASSERT(2*buf_bytes <= shared_mem_bytes);
260  // Note: buffered tiles have odd (prime) colstride to minimize bank conflicts (right?)
261 
262  /*
263  __kernel void clcubic_mul(
264  write_only_global C,
265  read_only_global A,
266  read_only_global B,
267  __local gpuword* A_buf,
268  __local gpuword* B_buf,
269  int A_nrows, int A_ncols)
270  */
271  cl_kernel kernel = utri ? clutri_mul_kernel[tile_m] : clcubic_mul_kernel[tile_m];
272 
273  int p = 0;
274  clm4rm_error = clSetKernelArg(kernel, p++, sizeof(cl_mem), &C->data);
275  clm4rm_error = clSetKernelArg(kernel, p++, sizeof(cl_mem), &A->data);
276  clm4rm_error = clSetKernelArg(kernel, p++, sizeof(cl_mem), &B->data);
277 #if BUFFERED
278  clm4rm_error = clSetKernelArg(kernel, p++, buf_bytes, NULL);
279  clm4rm_error = clSetKernelArg(kernel, p++, buf_bytes, NULL);
280 #endif
281  clm4rm_error = clSetKernelArg(kernel, p++, sizeof(int), &A->padded_rows);
282 
283  if (!utri) {
284  clm4rm_error = clSetKernelArg(kernel, p++, sizeof(int), &A->ncols);
285  }
286 
287  // TODO how about clEnqueueNDRangeKernel(...global_work_offset ? )
288  // and get_global_offset(0,1)
289  // Note that the cubic_mul kernel does not query get_global_id() at all.
290  // It applies an offset to get_group_id()*X instead.
291  clm4rm_error = clEnqueueNDRangeKernel(queue, kernel,
292  work_dim, work_offset, work_size, group_size,
293  pre_count(cond),pre_events(cond),push_event(cond));
294  switch (clm4rm_error) {
295  case CL_MEM_OBJECT_ALLOCATION_FAILURE:
296  printf( "OpenCL: CL_MEM_OBJECT_ALLOCATION_FAILURE. "
297  " Allocated %s memory = %li \n", (IMAGE2D ? "texture":"global"), allocated_size);
298  break;
299  default:
300  printf("OpenCL ERROR: %i \n", clm4rm_error);
301  break;
302  case CL_SUCCESS:
303  break;
304  }
305 
306  Q_ASSERT(clm4rm_error == CL_SUCCESS);
307  Q_ASSERT(pushed_event(cond) != NULL);
308 }
309 
310 
340  size2_t max_tile,
341  cl_command_queue queue, clm4rm_conditions* cond)
342 {
343  // A and B and C are expected to be padded to multiple of 32 rows
344  Q_ASSERT(A->padded_rows % 32 == 0);
345  Q_ASSERT(B->padded_rows % 32 == 0);
346  Q_ASSERT(C->padded_rows % 32 == 0);
347 
348  Q_ASSERT(A->nrows==A->ncols);
349  Q_ASSERT(B->nrows==B->ncols);
350  Q_ASSERT(C->nrows==C->ncols);
351 
352  Q_ASSERT(A->ncols == B->nrows);
353  Q_ASSERT(A->nrows == C->nrows);
354  Q_ASSERT(B->ncols == C->ncols);
355  Q_ASSERT(A->padded_rows==C->padded_rows);
356  Q_ASSERT(A->nrows==C->nrows);
357 
358  // fill C with Zero (not strictly necesary, but...
359  gpuword zero=0;
360  clEnqueueFillBuffer(
361  queue, C->data, &zero, sizeof(zero),
362  0, DATA_BYTES(C), 0, NULL, NULL);
363 
364  int tile_n = sqrt(max_group_size/32);
365  tile_n = MAX(1, MIN(max_tile[0], tile_n));
366 
367  int tile_m = (sqrt(1 + 17*shared_mem_words)-1)/(68*tile_n);
368  tile_m = MAX(1, MIN(max_tile[1], tile_m));
369  Q_ASSERT(tile_m <= MAX_TILE_M);
370  // limtied by shared memory size
371 
372  size2_t work_offset = { 0,0 };
373  size2_t work_size_w, work_size_1, work_size_m, work_size_n, work_size, main_size_1;
374 
375  // work-size in words
376  work_size_w[0] = C->padded_rows;
377  work_size_w[1] = C->width;
378  // work-size in 1x1 tiles (exact)
379  work_size_1[0] = work_size_w[0]/32;
380  work_size_1[1] = work_size_w[1];
381 
382  for(;;) {
383  // work-size in mxm tiles (small tiles)
384  work_size_m[0] = work_size_1[0] / tile_m; // rounded down
385  work_size_m[1] = work_size_1[1] / tile_m; // rounded down
386  // work-size in nxn tiles (big tiles)
387  work_size_n[0] = work_size_m[0] / tile_n; // rounded down
388  work_size_n[1] = work_size_m[1] / tile_n; // rounded down
389 
390  if ((work_size_n[0] > 0) && (work_size_n[1] > 0))
391  break;
392 
393  // else: reduce tile size
394  if (tile_m > 1)
395  tile_m--;
396  else {
397  Q_ASSERT(tile_n > 1);
398  tile_n--;
399  }
400  }
401 
402  // work-size in items
403  work_size[0] = (work_size_n[0] - (work_size_n[0]-1)/2)*32*tile_n;
404 // work_size[0] = work_size_n[0]*32*tile_n;
405  // fold lower part
406  work_size[1] = work_size_n[1]*tile_n;
407  // group size is always { 32*tile_n,tile_n }
408 
409 // printf(" -- tile n=%i m=%i\n",tile_n, tile_m);
410  clcubic_mul_enqeue(C, A, B, tile_n,tile_m, work_offset, work_size, 1, queue, cond);
411 
412  main_size_1[0] = work_size_n[0]*tile_n*tile_m;
413  main_size_1[1] = work_size_n[1]*tile_n*tile_m;
414 
415  if (work_size_1[1] > main_size_1[1]) {
416  // rest right with 1x1 tiles
417  // work offset in rows
418  work_offset[0] = 0;
419  work_offset[1] = main_size_1[1];
420 
421  size2_t rest_work_size_1;
422  rest_work_size_1[0] = work_size_1[0];
423  rest_work_size_1[1] = work_size_1[1]-main_size_1[1];
424 
425  work_size[0] = rest_work_size_1[0]*32;
426  work_size[1] = rest_work_size_1[1];
427 
428  clcubic_mul_enqeue(C, A, B, 1,1, work_offset, work_size, 0, queue, cond);
429  }
430 }
431 
432 
433 void printb(uint32_t x, int k) {
434  uint32_t mask = (1<<k);
435  for( ; mask; mask = mask>>1)
436  printf("%i", (x & mask)?1:0);
437 }
438 
439 void print3(uint32_t x, int k)
440 {
441  printb(x,k);
442  printf(" = ");
443  // LSB
444  uint32_t lsb = x & -x;
445  printb(lsb,k);
446  printf(" + ");
447  // rest
448  printb(x-lsb,k);
449  printf("\n");
450 }
451 
452 
453  void create_index_tables(uint32_t k)
454 {
455  k++;
456  for(int i=1; i < k; ++i) {
457  // generate all entries with Hamming weight i
458  uint32_t val = (1 << i) - 1;
459  print3(val,k+1);
460  uint32_t stop = val << (k-i-1);
461  //print3(stop,k+1);
462 
463  while(val != stop) {
464  uint32_t c = val & -val; // lsb
465  uint32_t r = val + c;
466  val = (((r ^ val) >> 2) / c) | r;
467  print3(val,k+1);
468  }
469  }
470 }
471 
472 
473 
474 
void clm4rm_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using the method of the Four Russians. C := A * B.
OpenCL boolean matrix data structure. Data is arranged in 32 bit words.
Definition: clm4rm.h:98
void clcubic_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using nested loops. C := A*B.
#define tile_ncols
Definition: clcubic_mul.cl:35
void clm4rm_mul_block(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, int r0, int r1, cl_command_queue queue, clm4rm_conditions *cond)
size_t shared_mem_bytes
size of shared memory in bytes
Definition: clm4rm.cpp:77
rci_t padded_rows
Number of rows padded to a multiple of 32.
Definition: clm4rm.h:100
size_t size2_t[2]
tow-dimensional size; used for various OpenCL parameters
Definition: clm4rm.h:67
cl_int clm4rm_error
latest OpenCL result code. CL_SUCCESS indicates no error.
Definition: clm4rm.cpp:9
unsigned int gpuword
a GPU word has 32 bits
Definition: clcubic_mul.cl:74
void clcubic_mul_enqeue(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, int tile_n, int tile_m, size_t work_offset[2], size_t work_size[2], int uptri, cl_command_queue queue, clm4rm_conditions *cond)
cl_event * pre_events(clm4rm_conditions *cond)
Definition: clm4rm.cpp:338
#define tile_width
Definition: clcubic_mul.cl:34
cl_kernel clcubic_mul_kernel[MAX_TILE_M+1]
OpenCL kernels for cubic matrix multiplication. Each kernel for a tile size. Actual tile sizes are in...
Definition: clm4rm.cpp:71
size_t allocated_size
Definition: clm4rm.cpp:78
Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be est...
Definition: clm4rm.h:227
#define tile_nrows
Definition: clcubic_mul.cl:36
rci_t ncols
Number of columns.
Definition: clm4rm.h:101
Float sqrt(const Float &x)
square-root function template for floating point types
#define POW2(x)
Definition: clcubic_mul.cl:78
size_t shared_mem_words
size of shared memory in (32bit) words
Definition: clm4rm.cpp:77
#define MAX_TILE_M
Definition: clm4rm.h:62
void printb(uint32_t x, int k)
cl_kernel clm4rm_mul_kernel
OpenCL kernel for Four-Russians matrix multiplication.
Definition: clm4rm.cpp:68
#define IMAGE2D
Definition: clm4rm.h:53
#define MIN(x, y)
Definition: clcubic_mul.cl:77
cl_uint pre_count(clm4rm_conditions *cond)
Definition: clm4rm.cpp:331
cl_event * push_event(clm4rm_conditions *cond)
reserve one post-condition event
Definition: clm4rm.cpp:348
rci_t width
Number of words with valid bits: width = ceil(ncols / m4ri_radix) */.
Definition: clm4rm.h:103
cl_mem data
handle to GPU data (32-bit unsigned integers)
Definition: clm4rm.h:114
cl_kernel clutri_mul_kernel[MAX_TILE_M+1]
OpenCL kernels for cubic upper-triangle matrix multiplication. Each kernel for a tile size....
Definition: clm4rm.cpp:72
size_t max_group_size
max. size of a work group
Definition: clm4rm.cpp:74
void clutri_mul(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond)
Boolean matrix multiplication on the GPU using nested loops. C := A*B Assumes matrixes to be upper tr...
rci_t nrows
Number of rows.
Definition: clm4rm.h:99
cl_event * pushed_event(clm4rm_conditions *cond)
Definition: clm4rm.cpp:357
#define DATA_BYTES(m)
Definition: clm4rm.h:119
void print3(uint32_t x, int k)
void print_event_info(cl_event event)
#define col_stride
Definition: clcubic_mul.cl:37
void create_index_tables(uint32_t k)