Fréchet View  1.6.0
A Tool for Exploring Fréchet Distance Algorithms
ocl_prototype.cpp

#if defined(__cplusplus) && !defined(_MSC_VER)
extern "C" {
#endif
#include <clm4rm/ocl_prototype.h>
#include <m4ri/graycode.h>
#if defined(__cplusplus) && !defined(_MSC_VER)
}
#endif

#include <assert.h>
#include <gtest/gtest.h>
#include <omp.h>
#include <atomic>

//#define CEILCOLS(i) ((i+clm4rm_radix-1)/clm4rm_radix)
//#define row(M,i) (M + (i)*M ## _rowstride)
//#define at(M,row,col) *(M + (col)*M ## _nrows + row)

int nblocks(mzd_t const* A) {
    int blockrows = __M4RI_MAX_MZD_BLOCKSIZE / A->rowstride;
    return (A->nrows + blockrows - 1) / blockrows;
}
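// m4ri stores large matrices in several blocks of at most __M4RI_MAX_MZD_BLOCKSIZE
// words; nblocks() therefore tells us whether a matrix fits into a single
// contiguous block, which is what the EXPECT_LE(nblocks(..), 1) checks below rely on.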

int adjust_k(int k, rci_t A_nrows) {
    size_t shared_mem_size = 32 * 1024;
    size_t shared_mem_words = shared_mem_size / sizeof(gpuword);
    // TODO read actual size from OpenCL settings
    if (shared_mem_words < POW2(2) + 3 * A_nrows) {
        // TODO when A_nrows becomes too large, we must split the matrix !!
        EXPECT_FALSE(true) << "too many rows " << A_nrows;
        return 2;
    }

    if (k == 0) {
        // find the optimal k for the given cache size:
        // we need  shared_mem_words >= 2^k + 3 * A_nrows
        k = (int) log2(shared_mem_words - 3 * A_nrows);
    }
    if (k < 2)
        k = 2;
//  else if (k > 8)
//      k = 8;

    EXPECT_GE(shared_mem_words, POW2(k) + 3 * A_nrows);
    return k;
}
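// Worked example (hypothetical numbers; gpuword is a 32-bit word): with the
// 32 KiB default above there are 8192 shared-memory words. For A_nrows = 512,
// 8192 - 3*512 = 6656 and floor(log2(6656)) = 12, so adjust_k(0, 512) returns
// k = 12, which satisfies 2^12 + 3*512 = 5632 <= 8192.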


bool assertEquals(const mzd_t* M, const gpuword* G, int padded_rows)
{
    int width = CEILCOLS(M->ncols);

    for (int row = 0; row < M->nrows; ++row)
    {
        word* Mrow = M->rows[row];
        for (int col = 0; col < width; col += 2)
        {
            if (col+1 < width)
                EXPECT_EQ(Mrow[col>>1], (word)G[col*padded_rows + row] | ((word)G[(col+1)*padded_rows + row]) << 32);
            else
                EXPECT_EQ(Mrow[col>>1], (word)G[col*padded_rows + row]);
        }
    }
    return true;
}
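// Layout illustration (values hypothetical): mzd_t packs 64 columns per machine
// word, the GPU copy packs 32 columns per gpuword in column-major order. An mzd
// row word 0xAAAAAAAABBBBBBBB therefore corresponds to G[col*padded_rows + row]
// == 0xBBBBBBBB (low half) and G[(col+1)*padded_rows + row] == 0xAAAAAAAA (high
// half), which is exactly what the EXPECT_EQ above reconstructs.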


mzd_t* proto_bool_mul_m4rm(mzd_t* Cm, mzd_t const* Am, mzd_t const* Bm, int k)
{
    EXPECT_EQ(Am->ncols, Bm->nrows);
    EXPECT_TRUE(Cm==NULL || Am->nrows==Cm->nrows);
    EXPECT_TRUE(Cm==NULL || Bm->ncols==Cm->ncols);

    if (Cm == NULL)
        Cm = mzd_init(Am->nrows, Bm->ncols);
    else
        mzd_set_ui(Cm, 0);
    // TODO add_mul

    // We don't want blocking (i.e. we would have to copy all blocks to GPU memory)
    EXPECT_LE(nblocks(Am), 1);
    EXPECT_LE(nblocks(Bm), 1);
    EXPECT_LE(nblocks(Cm), 1);

    k = adjust_k(k, Am->nrows);

    gpuword* A = copy_matrix_data(NULL, Am, Am->nrows);
    gpuword* B = copy_matrix_data(NULL, Bm, Bm->nrows);
    gpuword* C = copy_matrix_data(NULL, Cm, Cm->nrows);

    EXPECT_TRUE(assertEquals(Am, A, Am->nrows));
    EXPECT_TRUE(assertEquals(Bm, B, Bm->nrows));
    EXPECT_TRUE(assertEquals(Cm, C, Cm->nrows));

    proto_mul_m4rm( C, A, B, k,
                    Am->nrows, Am->ncols, Bm->ncols);

    copy_back_matrix_data(Cm, C, Cm->nrows);
    EXPECT_TRUE(assertEquals(Cm, C, Cm->nrows));

    delete[] A;   // assuming copy_matrix_data allocates the buffers with new[]
    delete[] B;
    delete[] C;

    return Cm;
}
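// Hypothetical usage sketch (matrix names chosen for illustration only): both
// prototypes compute the boolean (OR/AND) product, so their results can be
// cross-checked against each other.
//
//   mzd_t* A = mzd_init(64, 64);
//   mzd_t* B = mzd_init(64, 64);
//   mzd_randomize(A);
//   mzd_randomize(B);
//   mzd_t* C1 = proto_bool_mul_m4rm(NULL, A, B, /*k=*/0);   // k picked by adjust_k
//   mzd_t* C2 = proto_bool_mul_cubic(NULL, A, B, 0);
//   EXPECT_EQ(0, mzd_cmp(C1, C2));
//   mzd_free(A); mzd_free(B); mzd_free(C1); mzd_free(C2);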

/*
// return the least significant bit
#define lsb(i) ((i) & -(i))
*/

gpuword proto_read_bits(gpuword a0, gpuword a1, int spot, int n)
{
    int spill = spot + n - 32;
    gpuword temp;
    if (spill <= 0)
        temp = a0 << -spill;
    else
        temp = (a1 << (32 - spill)) | (a0 >> spill);
    return temp >> (32 - n);
}
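// Worked example (hypothetical values): proto_read_bits() extracts n bits starting
// at bit position 'spot' of a0, pulling any overflow from the next word a1:
//   proto_read_bits(0x000000F0u, 0u,          4, 8)  == 0x0F   (entirely inside a0)
//   proto_read_bits(0xF0000000u, 0x0000000Au, 28, 8) == 0xAF   (4 bits from a0, 4 from a1)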

void swap(gpuword** A, gpuword** B) {
    gpuword* aux = *A;
    *A = *B;
    *B = aux;
}

gpuword combinate(gpuword x, gpuword* T)
{
    gpuword result = 0;
    for (gpuword y = 1; x; y <<= 1, x >>= 1)
        result |= (x & 1) * T[y];
        // Note: avoid if()
    return result;
}
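// Worked example (hypothetical table contents): combinate() ORs together the rows
// of T selected by the set bits of x; with T[1], T[2] and T[4] filled in,
// combinate(0b101, T) == T[1] | T[4]. The multiplication by (x & 1) replaces the
// branch mentioned in the note above.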
/*
class Barrier {
private:
    int count;
    pthread_barrier_t barrier;
public:
    Barrier(int acount) : count(acount) {
        pthread_barrier_init(&barrier, NULL, count);
    }
    ~Barrier() {
        pthread_barrier_destroy(&barrier);
    }

    void reset() {
        pthread_barrier_destroy(&barrier);
        pthread_barrier_init(&barrier, NULL, count);
    }

    void operator() (void) {
        pthread_barrier_wait(&barrier);
    }
};
*/

#define A_width  CEILCOLS(A_ncols)
#define C_ncols  B_ncols
#define C_width  CEILCOLS(C_ncols)
#define B_nrows  A_ncols
#define C_nrows  A_nrows

#define read(M,row,col)     M[col*M##_nrows+row]
#define write(M,row,col,x)  M[col*M##_nrows+row]=x
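// For example, read(A, row, col) expands to A[col*A_nrows + row]: the GPU copies
// are stored column-major with one gpuword per entry, which is what makes the
// global memory accesses in the kernels below coalesced.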

void proto_mul_m4rm_block(gpuword *C, const gpuword *A, const gpuword *B,
                          int k,
                          int A_nrows, int A_ncols, int B_ncols,
                          int r0, int r1);
void proto_mul_m4rm(gpuword *C, const gpuword *A, const gpuword *B,
                    int k,
                    int A_nrows, int A_ncols, int B_ncols)
{
    // prototype settings: force k=8 and a work group size of 256
    k = 8;
    max_group_size = 256;

    int row_offset = A_nrows - (A_nrows % max_group_size);
    if (row_offset > 0)
        proto_mul_m4rm_block(C, A, B, k,
                             A_nrows, A_ncols, B_ncols,
                             0, row_offset);
    if (row_offset < A_nrows)
        proto_mul_m4rm_block(C, A, B, k,
                             A_nrows, A_ncols, B_ncols,
                             row_offset, A_nrows);
}

void proto_mul_m4rm_block(gpuword *C, const gpuword *A, const gpuword *B,
                          int k,
                          int A_nrows, int A_ncols, int B_ncols,
                          int r0, int r1)
{
    size_t work_size[2]  = { (size_t)r1-r0, (size_t)C_width };
    size_t group_size[2] = { MIN(work_size[0], max_group_size), 1 };

    // group size must divide work size
    ASSERT_TRUE((work_size[0] % group_size[0]) == 0);
    ASSERT_TRUE((work_size[1] % group_size[1]) == 0);

    size_t group_id_0, group_id_1;
    size_t group_offset_0, group_offset_1;
//  size_t global_id_0, global_id_1;
    size_t local_size_0, local_size_1;
//  size_t local_id_0, local_id_1;
    size_t local_count;
#define get_group_id(i)   group_id_##i
#define get_global_id(i)  global_id_##i
#define get_local_size(i) local_size_##i
#define get_local_id(i)   local_id_##i
    group_offset_0 = 0;
    get_group_id(0) = 0;
    while (group_offset_0 < work_size[0])
    {
        get_local_size(0) = MIN(group_size[0], work_size[0]-group_offset_0);
        group_offset_1 = 0;
        get_group_id(1) = 0;

        while (group_offset_1 < work_size[1])
        {
            get_local_size(1) = MIN(group_size[1], work_size[1]-group_offset_1);
            local_count = get_local_size(0) * get_local_size(1);

            //
            // 1 GROUP
            //
            // emulate shared memory
            gpuword* T = new gpuword[POW2(k)];

//          for(int get_local_id(0)=0; get_local_id(0) < get_local_size(0); ++get_local_id(0))
//          for(int get_local_id(1)=0; get_local_id(1) < get_local_size(1); ++get_local_id(1))
#pragma omp parallel num_threads(local_count)
            {
                int thread_num = omp_get_thread_num();

                int get_local_id(0) = thread_num % get_local_size(0);
                int get_local_id(1) = thread_num / get_local_size(0);

                int get_global_id(0) = group_offset_0 + get_local_id(0);
                int get_global_id(1) = group_offset_1 + get_local_id(1);

                EXPECT_EQ(get_local_size(1), 1);
                EXPECT_EQ(get_local_id(1), 0);
                EXPECT_EQ(get_group_id(1), get_global_id(1));

                //
                // 1 ITEM
                //
                int group_size = get_local_size(0);
                int ci = get_group_id(1);
                int cj = r0 + get_global_id(0);

                gpuword Csum = 0;
                gpuword A0 = read(A, cj, 0);
                gpuword A1 = read(A, cj, 1);

                // iterate one row of A and one block column of B
                int ablock = 0;
                int aspot = 0;

                for (int ai = 0; ai < A_ncols; ai += k)
                {
                    int k1 = MIN(k, A_ncols - ai);
                    // Make one column of T
                    // distribute the 1-bit loop over the first k items
                    T[0] = 0;
                    for (int sj = 0; sj < k1; sj += group_size) {
                        int tj = sj + cj; //get_local_id(0);
                        if (tj < k1)
                            T[POW2(tj)] = read(B, ai + tj, ci);
                    }

#pragma omp barrier

                    for (int sj = 0; sj < POW2(k1); sj += group_size) {
                        int tj = sj + cj; //get_local_id(0);
                        if (tj < POW2(k1))
                            T[tj] = combinate(tj, T);
                    }

#pragma omp barrier

                    gpuword a = proto_read_bits(A0, A1, aspot, k1);
                    Csum |= T[a];

                    aspot += k;
                    if (aspot >= 32) {
                        // cross over to the next A column
                        aspot -= 32;
                        ablock++;
                        A0 = A1;

                        if ((ablock + 1) < A_width)
                            A1 = read(A, cj, ablock + 1);
                    }

#pragma omp barrier
                }
                // write results back to global memory
                write(C, cj, ci, Csum);
                // Note: thanks to the column-major storage of A, B, C
                // global memory access is coalesced, i.e. consecutive
                // for all items in the group (= column).
            }

            delete[] T;   // T was allocated with new[] above

            group_offset_1 += get_local_size(1);
            get_group_id(1)++;
        }

        group_offset_0 += get_local_size(0);
        get_group_id(0)++;
    }

    /* Memory access pattern:
       C is written once            (for ci, for cj)
       B is read once               (for ci, for ai)
       A is read once in each group

       = n^2 * (2 + n/32)

       (where is the Four-Russians speed-up? it's gone!)

       TODO forget about the Four Russians and calculate all combinations from B
       TODO improve the A reads by making the groups wider (fit a chunk of B into shared memory),
            but we need enough groups to keep the GPU busy
    */
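    // Worked example (hypothetical size): for n = 1024 the formula above gives
    // 1024^2 * (2 + 1024/32) = 1024^2 * 34 ≈ 35.7 million word accesses.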
}

mzd_t* proto_bool_mul_cubic(mzd_t* Cm, mzd_t const* Am, mzd_t const* Bm, int)
{
    EXPECT_EQ(Am->ncols, Bm->nrows);
    EXPECT_TRUE(Cm == NULL || Am->nrows == Cm->nrows);
    EXPECT_TRUE(Cm == NULL || Bm->ncols == Cm->ncols);

    EXPECT_GE(Am->nrows, 1);
    EXPECT_GE(Am->ncols, 1);
    EXPECT_GE(Bm->ncols, 1);

    if (Cm == NULL)
        Cm = mzd_init(Am->nrows, Bm->ncols);
    else
        mzd_set_ui(Cm, 0);
    // TODO add_mul

    // We don't want blocking (i.e. we would have to copy all blocks to GPU memory)
    EXPECT_LE(nblocks(Am), 1);
    EXPECT_LE(nblocks(Bm), 1);
    EXPECT_LE(nblocks(Cm), 1);

    // A, B and C are padded to a multiple of 32 rows
    int rowpadding = 32;
    gpuword* A = copy_matrix_data(NULL, Am, padded_rows(Am->nrows, rowpadding));
    gpuword* B = copy_matrix_data(NULL, Bm, padded_rows(Bm->nrows, rowpadding));
    gpuword* C = copy_matrix_data(NULL, Cm, padded_rows(Cm->nrows, rowpadding));

    EXPECT_TRUE(assertEquals(Am, A, padded_rows(Am->nrows, rowpadding)));
    EXPECT_TRUE(assertEquals(Bm, B, padded_rows(Bm->nrows, rowpadding)));
    EXPECT_TRUE(assertEquals(Cm, C, padded_rows(Cm->nrows, rowpadding)));

    proto_mul_cubic(C, A, B,
                    padded_rows(Am->nrows, rowpadding),
                    Am->ncols, Bm->ncols);

    copy_back_matrix_data(Cm, C, padded_rows(Cm->nrows, rowpadding));
    EXPECT_TRUE(assertEquals(Cm, C, padded_rows(Cm->nrows, rowpadding)));

    delete[] A;   // assuming copy_matrix_data allocates the buffers with new[]
    delete[] B;
    delete[] C;

    return Cm;
}

void proto_mul_cubic(gpuword *C, const gpuword *A, const gpuword *B,
                     int A_nrows, int A_ncols, int B_ncols)
{
    max_group_size = 256;
    size_t work_size[2]  = { (size_t)A_nrows, (size_t)C_width };
    size_t group_size[2] = { 32, 1 };

    // group size must divide work size
    ASSERT_TRUE((work_size[0] % group_size[0]) == 0);
    ASSERT_TRUE((work_size[1] % group_size[1]) == 0);

    size_t group_id_0, group_id_1;
    size_t group_offset_0, group_offset_1;
//  size_t global_id[2];
    size_t local_size_0, local_size_1;
//  size_t local_id[2];
    size_t local_count;
//  global_id[0]=global_id[1]=0;
    group_offset_0 = 0;
    get_group_id(0) = 0;
    while (group_offset_0 < work_size[0])
    {
        get_local_size(0) = MIN(group_size[0], work_size[0]-group_offset_0);
        group_offset_1 = 0;
        get_group_id(1) = 0;

        while (group_offset_1 < work_size[1])
        {
            get_local_size(1) = MIN(group_size[1], work_size[1]-group_offset_1);
            local_count = get_local_size(0) * get_local_size(1);

            EXPECT_EQ(get_local_size(0), 32);
            EXPECT_EQ(get_local_size(1), 1);
            EXPECT_EQ(local_count, 32);
            //
            // 1 GROUP
            //
            // emulate shared memory
            gpuword* T = new gpuword[32];

#pragma omp parallel num_threads(local_count)
            {
                int thread_num = omp_get_thread_num();

                int get_local_id(0) = thread_num % get_local_size(0);
                int get_local_id(1) = thread_num / get_local_size(0);

                EXPECT_EQ(get_local_size(0), 32);
                EXPECT_EQ(get_local_size(1), 1);
                EXPECT_EQ(get_local_id(1), 0);

                int get_global_id(0) = group_offset_0 + get_local_id(0);
                int get_global_id(1) = group_offset_1 + get_local_id(1);

                //
                // 1 ITEM
                //
                int ci = get_global_id(1);
                int cj = get_global_id(0);
                int lcj = get_local_id(0);

                gpuword Csum = 0;
                for (int ai = 0; ai < A_width; ++ai)
                {
                    // Buffer 32 words of B in shared memory
                    T[lcj] = read(B, 32*ai + lcj, ci);
#pragma omp barrier
                    gpuword a = read(A, cj, ai);
                    for (int y = 0; y < 32; ++y, a >>= 1)
                        Csum |= (a & 1) * T[y];
#pragma omp barrier
                }
                // write results back to global memory
                write(C, cj, ci, Csum);
                // Note: thanks to the column-major storage of A, B, C
                // global memory access is coalesced, i.e. consecutive
                // for all items in the group (= column).
            }

            delete[] T;   // T was allocated with new[] above

            group_offset_1 += get_local_size(1);
            get_group_id(1)++;
        }

        group_offset_0 += get_local_size(0);
        get_group_id(0)++;
    }

    /* Memory access pattern:
       C is written once            (for ci, for cj)
       B is read once               (for ci, for ai)
       A is read once in each group

       = n^2 * (2 + n/32)

       (where is the Four-Russians speed-up? it's gone!)

       TODO forget about the Four Russians and calculate all combinations from B
       TODO improve the A reads by making the groups wider (fit a chunk of B into shared memory),
            but we need enough groups to keep the GPU busy
    */
}