Fréchet View  1.6.0
A Tool for Exploring Fréchet Distance Algorithms
cluptri_mul.cl
Go to the documentation of this file.
1 
6 #if IMAGE2D
7 //
8 // Matrix stored in texture memory
9 //
10 # define read_only_global __read_only image2d_t
11 # define write_only_global __write_only image2d_t
12 // Note: column-major format
13 // a matrix colum is actually a row (y-coordinate) in Image2D
14 // a matrix row is actually a column (x-coordinate) in Image2D
15 // Pixel contains only one (red) component
16 # define read(M,row,col) read_imageui(M,(int2)(row,col)).x
17 # define write(M,row,col,x) write_imageui(M,(int2)(row,col),(uint4)(x,0,0,0))
18 #else
19 //
20 // Matrix stored in __global memory
21 //
22 # define read_only_global __global gpuword*
23 # define write_only_global __global gpuword*
24 # define read(M,row,col) M[(col)*M ## _nrows + row]
25 # define write(M,row,col,x) M[(col)*M ## _nrows + row]=x
26 #endif
27 
28 #ifndef BUFFERED
29 # define BUFFERED 1
30 #endif
31 
32 // tile sizes; TILE_M is given by -D, tile_n is the group size
33 #define tile_width (tile_n*TILE_M)
34 #define tile_ncols (tile_width*32)
35 #define tile_nrows (tile_n*TILE_M*32)
36 #define col_stride (34*tile_n*TILE_M+1)
37 
48 inline int buffer_address(int row, int col, int tile_n)
49 {
50  // col * col_stride + row/16*17 + row%16
51  int rowd16 = row>>4;
52  return col*col_stride + (rowd16<<4)+rowd16 + (row & 0x0f);
53 }
54 
55 #define buf(M,row,col) M##_buf[buffer_address(row,col,tile_n)]
56 
57 // loop over small tile TILE_M x TILE_M
58 #define for_tile \
59  for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \
60  for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n)
61 
62 #define unrolled_for_tile \
63  _Pragma("unroll") for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \
64  _Pragma("unroll") for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n)
65 
66 typedef unsigned int gpuword;
67 
68 #define CEILCOLS(i) ((i+31)/32)
69 #define MIN(x,y) (((x) < (y)) ? (x) : (y))
70 #define POW2(x) (((gpuword)1) << x)
71 
103  __kernel void clcubic_mul(
107 # if BUFFERED
108  __local gpuword* A_buf,
109  __local gpuword* B_buf,
110 # endif
111  int A_nrows, int A_ncols, int B_ncols,
112  int row_offset, int col_offset)
113  {
114 #define A_width CEILCOLS(A_ncols)
115 #define B_nrows 32*A_width
116 #define C_nrows A_nrows
117 
118  int tile_n = get_local_size(1);
119  int lrow = get_local_id(0); // <= 32*tile_n
120  int lcol = get_local_id(1); // <= tile_n
121 
122  int row0, col0;
123  if (get_group_id(0) <= get_group_id(1)) {
124  // upper triangle
125  row0 = get_group_id(0); // = first row in A,C
126  col0 = get_group_id(1); // = first column in B,C
127  }
128  else if (?) {
129  // lower triangle; work on mirrored tile, instead
130  // note: get_num_groups(1) indicates the logical number of groups, get_num_groups(0) does not
131  row0 = get_num_groups(1)-get_group_id(0);
132  col0 = get_num_groups(1)-get_group_id(1);
133  }
134  else {
135  return;
136  }
137 
138  row0 = row_offset + row0*tile_nrows;
139  col0 = col_offset + col0*tile_width;
140 
141  gpuword Csum[TILE_M][TILE_M];
142  gpuword a,b;
143  int a0,a1,ai;
144  int ti,tj;
145  int trow,tcol;
146 
148  Csum[tj][ti] = 0;
149  }
150 
151  for (a0=row0 ?; a0 < col0 ?; a0 += tile_n*TILE_M)
152  {
153 
154 # if BUFFERED
155  // Buffer a tile of A and B in shared memory
157  buf(A,trow,tcol) = read(A, row0+trow, a0+tcol);
158  buf(B,trow,tcol) = read(B, 32*a0+trow, col0+tcol);
159  }
160 
161  barrier(CLK_LOCAL_MEM_FENCE);
162 # endif
163 
164  // process a row of A against a column of B
165  for(a1=0; a1 < tile_n*TILE_M; ++a1)
166  for_tile {
167  ai = a0+a1;
168 # if BUFFERED
169  a = buf(A,trow, a1);
170 # else
171  a = read(A, row0+trow, ai);
172 # endif
173  a &= -(ai < A_width); // if (ai >= A_width) a = 0;
174 
175 # pragma unroll
176  for (int y=0; y < 32; ++y, a >>= 1) {
177 # if BUFFERED
178  b = buf(B,32*a1+y, tcol);
179 # else
180  b = read(B, 32*ai+y, col0+tcol);
181 # endif
182  Csum[tj][ti] |= -(a & 1) & b; // if (a & 1) Csum |= b
183  }
184  }
185 # if BUFFERED
186  barrier(CLK_LOCAL_MEM_FENCE);
187 # endif
188  }
189  // write results back to global memory
191  write(C, row0+trow, col0+tcol, Csum[tj][ti]);
192  }
193 }
194 
__kernel void clcubic_mul(write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows, int A_ncols, int B_ncols, int row_offset, int col_offset)
OpenCL kernel for cubic matrix multiplication.
Definition: cluptri_mul.cl:103
#define A_width
#define read(M, row, col)
Definition: cluptri_mul.cl:24
unsigned int gpuword
Definition: cluptri_mul.cl:66
#define write_only_global
Definition: cluptri_mul.cl:23
unsigned int gpuword
a GPU word has 32 bits
Definition: clcubic_mul.cl:74
int buffer_address(int row, int col, int tile_n)
Definition: cluptri_mul.cl:48
#define A_ncols
#define col_stride
Definition: cluptri_mul.cl:36
#define unrolled_for_tile
Definition: cluptri_mul.cl:62
#define BUFFERED
Definition: cluptri_mul.cl:29
#define write(M, row, col, x)
Definition: cluptri_mul.cl:25
#define tile_nrows
Definition: cluptri_mul.cl:35
#define for_tile
Definition: cluptri_mul.cl:58
#define read_only_global
Definition: cluptri_mul.cl:22
#define tile_width
Definition: cluptri_mul.cl:33
#define buf(M, row, col)
Definition: cluptri_mul.cl:55