Fréchet View  1.6.0
A Tool for Exploring Fréchet Distance Algorithms
clm4rm_mul.cl
Go to the documentation of this file.
1 
/** A GPU word: the 32-bit unit in which matrix bits are packed. */
typedef unsigned int gpuword;

/** Number of 32-bit words needed to hold i bits. */
#define CEILCOLS(i) (((i)+31)/32)
#if IMAGE2D
//
// Matrix stored in texture memory
//
# define read_only_global __read_only image2d_t
# define write_only_global __write_only image2d_t
// Note: column-major format
// a matrix column is actually a row (y-coordinate) in Image2D
// a matrix row is actually a column (x-coordinate) in Image2D
// Pixel contains only one (red) component
# define read(M,row,col) read_imageui(M,(int2)((row),(col))).x
# define write(M,row,col,x) write_imageui(M,(int2)((row),(col)),(uint4)((x),0,0,0))
#else
//
// Matrix stored in __global memory
//
# define read_only_global __global gpuword*
# define write_only_global __global gpuword*
// Column-major addressing: word (row,col) lives at col*M_nrows + row.
// M must be a plain identifier, because M##_nrows names its row count.
// Arguments are parenthesized so that expressions such as `c ? a : b`
// expand with the intended precedence.
# define read(M,row,col) (M[(col)*M ## _nrows + (row)])
# define write(M,row,col,x) (M[(col)*M ## _nrows + (row)] = (x))
#endif

/** Smaller of two values; note that each argument is evaluated twice. */
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
/** 2 to the power of x, as a gpuword. */
#define POW2(x) (((gpuword)1) << (x))
32 
41 gpuword read_bits(gpuword a0, gpuword a1, int spot, int n)
42 {
43  int spill = spot + n - 32;
44  gpuword temp;
45  if (spill <= 0)
46  temp = a0 << -spill;
47  else
48  temp = (a1 << (32 - spill)) | (a0 >> spill);
49  return temp >> (32 - n);
50 }
51 
52 gpuword combinate(gpuword x, int k, __local gpuword* T)
53 {
54  gpuword result = 0;
55 #pragma unroll
56  for (int y = 0; y < k; ++y, x >>= 1)
57  result |= (x & 1) * T[POW2(y)];
58  return result;
59 }
60 
74  __kernel void clm4rm_mul(
78  __local gpuword* T,
79  int k, int r0,
80  int A_nrows, int A_ncols, int B_ncols)
81  {
82 #define A_width CEILCOLS(A_ncols)
83 #define C_ncols B_ncols
84 #define C_width CEILCOLS(C_ncols)
85 #define B_nrows A_ncols
86 #define C_nrows A_nrows
87 
88  // work-group = column of C
89  int group_size = get_local_size(0);
90  // assert(group_size==A_nrows)
91  int ci = get_group_id(1);
92 
93  // work-item = one word of C
94  int cj = r0 + get_global_id(0);
95  int lcj = get_local_id(0);
96 
97  gpuword Csum = 0;
98  gpuword A0 = read(A,cj,0);
99  gpuword A1 = read(A,cj,1);
100 
101  int ablock = 0;
102  int aspot = 0;
103  for (int ai = 0; ai < A_ncols; ai += k)
104  {
105  int k1 = MIN(k, A_ncols - ai);
106  // Make one column of T
107  // distribute the 1 bit loop over the first k items
108  T[0] = 0;
109  for (int sj=0; sj < k1; sj += group_size) {
110  int tj = sj+lcj;
111  if (tj < k1)
112  T[POW2(tj)] = read(B, ai+tj,ci);
113  }
114 
115  // read/write access to T must be guarded by barriers
116  barrier(CLK_LOCAL_MEM_FENCE);
117 
118  // Then calcluate the remaining (2^k-k) combinations; distribute over all items
119  for (int sj=0; sj < POW2(k1); sj += group_size) {
120  int tj = sj+lcj;
121  if (tj < POW2(k1))
122  T[tj] = combinate(tj, k1, T);
123  }
124 
125  barrier(CLK_LOCAL_MEM_FENCE);
126 
127  // apply table
128  gpuword a = read_bits(A0,A1, aspot, k1);
129  Csum |= T[a];
130 
131  aspot += k;
132  if (aspot >= 32) {
133  // cross over to next A column
134  aspot -= 32;
135  ablock++;
136  A0 = A1;
137  if ((ablock + 1) < A_width)
138  A1 = read(A,cj, ablock + 1);
139  }
140 
141  barrier(CLK_LOCAL_MEM_FENCE);
142  }
143 
144  // write result back to global memory
145  write(C,cj,ci, Csum);
146  }
147 
#define A_width
#define read_only_global
Definition: clm4rm_mul.cl:24
unsigned int gpuword
a GPU word has 32 bits
Definition: clcubic_mul.cl:74
gpuword read_bits(gpuword a0, gpuword a1, int spot, int n)
read n bits (up to 32) from memory, not necessarily aligned to word boundaries
Definition: clm4rm_mul.cl:41
gpuword combinate(gpuword x, int k, __local gpuword *T)
Definition: clm4rm_mul.cl:52
#define A_ncols
__kernel void clm4rm_mul(write_only_global C, read_only_global A, read_only_global B, __local gpuword *T, int k, int r0, int A_nrows, int A_ncols, int B_ncols)
OpenCL kernel for M4R matrix Multiplication C := A*B.
Definition: clm4rm_mul.cl:74
unsigned int gpuword
Definition: clm4rm_mul.cl:5
#define read(M, row, col)
Definition: clm4rm_mul.cl:26
#define write_only_global
Definition: clm4rm_mul.cl:25
#define POW2(x)
Definition: clm4rm_mul.cl:31
#define MIN(x, y)
Definition: clm4rm_mul.cl:30
#define write(M, row, col, x)
Definition: clm4rm_mul.cl:27