Fréchet View  1.6.0
A Tool for Exploring Fréchet Distance Algorithms
clcubic_mul.cl
Go to the documentation of this file.
1 /*
2  * Cubic Matrix Multiplication
3  */
4 
5 #if IMAGE2D
6 //
7 // Matrix stored in texture memory
8 //
9 # define read_only_global __read_only image2d_t
10 # define write_only_global __write_only image2d_t
11 // Note: column-major format
12 // a matrix colum is actually a row (y-coordinate) in Image2D
13 // a matrix row is actually a column (x-coordinate) in Image2D
14 // Pixel contains only one (red) component
15 # define read(M,row,col) read_imageui(M,(int2)(row,col)).x
16 # define write(M,row,col,x) write_imageui(M,(int2)(row,col),(uint4)(x,0,0,0))
17 #else
18 //
19 // Matrix stored in __global memory
20 //
21 # define read_only_global __global gpuword*
22 # define write_only_global __global gpuword*
23 # define read(M,row,col) M[(col)*M ## _nrows + row]
24 # define write(M,row,col,x) M[(col)*M ## _nrows + row]=x
25 #endif
26 
27 #ifndef BUFFERED
28 # define BUFFERED 1
29 #endif
30 
31 //
32 // tile sizes; TILE_M is given by -D, tile_n is the group size(1)
33 //
34 #define tile_width (tile_n*TILE_M)
35 #define tile_ncols (tile_width*32)
36 #define tile_nrows (tile_n*TILE_M*32)
37 #define col_stride (34*tile_n*TILE_M+1)
38 
55 inline int buffer_address(int row, int col, int tile_n)
56 {
57  // col * col_stride + row/16*17 + row%16
58  int rowd16 = row>>4;
59  return col*col_stride + (rowd16<<4)+rowd16 + (row & 0x0f);
60 }
61 
62 #define buf(M,row,col) M##_buf[buffer_address(row,col,tile_n)]
63 
64 // loop over small tile TILE_M x TILE_M
65 #define for_tile \
66  for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \
67  for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n)
68 
69 #define unrolled_for_tile \
70  _Pragma("unroll") for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \
71  _Pragma("unroll") for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n)
72 
74 typedef unsigned int gpuword;
75 
76 #define CEILCOLS(i) ((i+31)/32)
77 #define MIN(x,y) (((x) < (y)) ? (x) : (y))
78 #define POW2(x) (((gpuword)1) << x)
79 
80 #define A_width CEILCOLS(A_ncols)
81 #define B_nrows 32*A_width
82 #define C_nrows A_nrows
83 
112 __kernel void clcubic_mul(
116 # if BUFFERED
117  __local gpuword* A_buf,
118  __local gpuword* B_buf,
119 # endif
120  int A_nrows, int A_ncols)
121  {
122  int tile_n = get_local_size(1);
123  int lrow = get_local_id(0); // <= 32*tile_n
124  int lcol = get_local_id(1); // <= tile_n
125 
126  int row0 = get_global_offset(0) + get_group_id(0)*tile_nrows; // = first row in A,C
127  int col0 = get_global_offset(1) + get_group_id(1)*tile_width; // = first column in B,C
128 
129  gpuword Csum[TILE_M][TILE_M];
130  gpuword a,b;
131  int a0,a1,ai;
132  int ti,tj;
133  int trow,tcol;
134 
136  Csum[tj][ti] = 0;
137  }
138 
139  for (a0=0; a0 < A_width; a0 += tile_n*TILE_M)
140  {
141 
142 # if BUFFERED
143  // Buffer a tile of A and B in shared memory
145  buf(A,trow,tcol) = read(A, row0+trow, a0+tcol);
146  buf(B,trow,tcol) = read(B, 32*a0+trow, col0+tcol);
147  }
148 
149  barrier(CLK_LOCAL_MEM_FENCE);
150 # endif
151 
152  // process a row of A against a column of B
153  for(a1=0; a1 < tile_n*TILE_M; ++a1)
154  for_tile {
155  ai = a0+a1;
156 # if BUFFERED
157  a = buf(A,trow, a1);
158 # else
159  a = read(A, row0+trow, ai);
160 # endif
161  a &= -(ai < A_width); // if (ai >= A_width) a = 0;
162 
163 # pragma unroll
164  for (int y=0; y < 32; ++y, a >>= 1) {
165 # if BUFFERED
166  b = buf(B,32*a1+y, tcol);
167 # else
168  b = read(B, 32*ai+y, col0+tcol);
169 # endif
170  Csum[tj][ti] |= -(a & 1) & b; // if (a & 1) Csum |= b
171  }
172  }
173 # if BUFFERED
174  barrier(CLK_LOCAL_MEM_FENCE);
175 # endif
176  }
177  // write results back to global memory
179  write(C, row0+trow, col0+tcol, Csum[tj][ti]);
180  }
181 }
182 
200  __kernel void clutri_mul(
204 # if BUFFERED
205  __local gpuword* A_buf,
206  __local gpuword* B_buf,
207 # endif
208  int A_nrows)
209  {
210  #define A_ncols A_nrows
211 
212  int tile_n = get_local_size(1);
213  int lrow = get_local_id(0); // <= 32*tile_n
214  int lcol = get_local_id(1); // <= tile_n
215 
216  int row0, col0;
217  if (get_group_id(0) <= get_group_id(1)) {
218  // upper triangle
219  row0 = get_group_id(0); // = first row in A,C
220  col0 = get_group_id(1); // = first column in B,C
221  }
222  else {
223  // lower triangle is empty; work on a mirrored tile, instead
224  // note: get_num_groups(1) indicates the logical number of groups, get_num_groups(0) does not
225  // TODO
226  row0 = get_num_groups(1)-get_group_id(0);
227  col0 = get_num_groups(1)-1-get_group_id(1);
228  if (row0==get_group_id(0))
229  return;
230  }
231 
232  row0 *= tile_nrows;
233  col0 *= tile_width;
234 
235  gpuword Csum[TILE_M][TILE_M];
236  gpuword a,b;
237  int a0,a1,ai;
238  int ti,tj;
239  int trow,tcol;
240 
242  Csum[tj][ti] = 0;
243  }
244 
245  // For upper triangular matrixes we need only iterate a subsection of A and B
246  for (a0=row0/32; a0 < (col0+tile_width); a0 += tile_width)
247  {
248 
249 # if BUFFERED
250  // Buffer a tile of A and B in shared memory
252  buf(A,trow,tcol) = read(A, row0+trow, a0+tcol);
253  buf(B,trow,tcol) = read(B, 32*a0+trow, col0+tcol);
254  }
255 
256  barrier(CLK_LOCAL_MEM_FENCE);
257 # endif
258 
259  // process a row of A against a column of B
260  for(a1=0; a1 < tile_n*TILE_M; ++a1)
261  for_tile {
262  ai = a0+a1;
263 # if BUFFERED
264  a = buf(A,trow, a1);
265 # else
266  a = read(A, row0+trow, ai);
267 # endif
268  a &= -(ai < A_width); // if (ai >= A_width) a = 0;
269 
270 # pragma unroll
271  for (int y=0; y < 32; ++y, a >>= 1) {
272 # if BUFFERED
273  b = buf(B,32*a1+y, tcol);
274 # else
275  b = read(B, 32*ai+y, col0+tcol);
276 # endif
277  Csum[tj][ti] |= -(a & 1) & b; // if (a & 1) Csum |= b
278  }
279  }
280 # if BUFFERED
281  barrier(CLK_LOCAL_MEM_FENCE);
282 # endif
283  }
284  // write results back to global memory
286  write(C, row0+trow, col0+tcol, Csum[tj][ti]);
287  }
288 }
289 
#define write(M, row, col, x)
Definition: clcubic_mul.cl:24
unsigned int gpuword
a GPU word has 32 bits
Definition: clcubic_mul.cl:74
__kernel void clcubic_mul(write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows, int A_ncols)
OpenCL kernel for cubic matrix multiplication.
Definition: clcubic_mul.cl:112
#define read(M, row, col)
Definition: clcubic_mul.cl:23
#define tile_width
Definition: clcubic_mul.cl:34
#define A_width
Definition: clcubic_mul.cl:80
#define A_ncols
#define tile_nrows
Definition: clcubic_mul.cl:36
int buffer_address(int row, int col, int tile_n)
offset into shared memory buffers
Definition: clcubic_mul.cl:55
#define buf(M, row, col)
Definition: clcubic_mul.cl:62
#define write_only_global
Definition: clcubic_mul.cl:22
#define BUFFERED
Definition: clcubic_mul.cl:28
__kernel void clutri_mul(write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows)
OpenCL kernel for cubic upper triangular matrix multiplication.
Definition: clcubic_mul.cl:200
#define for_tile
Definition: clcubic_mul.cl:65
#define read_only_global
Definition: clcubic_mul.cl:21
#define col_stride
Definition: clcubic_mul.cl:37
#define unrolled_for_tile
Definition: clcubic_mul.cl:69