9 # define read_only_global __read_only image2d_t 10 # define write_only_global __write_only image2d_t 15 # define read(M,row,col) read_imageui(M,(int2)(row,col)).x 16 # define write(M,row,col,x) write_imageui(M,(int2)(row,col),(uint4)(x,0,0,0)) 21 # define read_only_global __global gpuword* 22 # define write_only_global __global gpuword* 23 # define read(M,row,col) M[(col)*M ## _nrows + row] 24 # define write(M,row,col,x) M[(col)*M ## _nrows + row]=x 34 #define tile_width (tile_n*TILE_M) 35 #define tile_ncols (tile_width*32) 36 #define tile_nrows (tile_n*TILE_M*32) 37 #define col_stride (34*tile_n*TILE_M+1) 59 return col*
col_stride + (rowd16<<4)+rowd16 + (row & 0x0f);
62 #define buf(M,row,col) M##_buf[buffer_address(row,col,tile_n)] 66 for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \ 67 for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n) 69 #define unrolled_for_tile \ 70 _Pragma("unroll") for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \ 71 _Pragma("unroll") for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n) 76 #define CEILCOLS(i) ((i+31)/32) 77 #define MIN(x,y) (((x) < (y)) ? (x) : (y)) 78 #define POW2(x) (((gpuword)1) << x) 80 #define A_width CEILCOLS(A_ncols) 81 #define B_nrows 32*A_width 82 #define C_nrows A_nrows 122 int tile_n = get_local_size(1);
123 int lrow = get_local_id(0);
124 int lcol = get_local_id(1);
126 int row0 = get_global_offset(0) + get_group_id(0)*
tile_nrows;
127 int col0 = get_global_offset(1) + get_group_id(1)*
tile_width;
139 for (a0=0; a0 <
A_width; a0 += tile_n*TILE_M)
145 buf(A,trow,tcol) =
read(A, row0+trow, a0+tcol);
146 buf(B,trow,tcol) =
read(B, 32*a0+trow, col0+tcol);
149 barrier(CLK_LOCAL_MEM_FENCE);
153 for(a1=0; a1 < tile_n*TILE_M; ++a1)
159 a =
read(A, row0+trow, ai);
164 for (
int y=0; y < 32; ++y, a >>= 1) {
166 b =
buf(B,32*a1+y, tcol);
168 b =
read(B, 32*ai+y, col0+tcol);
170 Csum[tj][ti] |= -(a & 1) & b;
174 barrier(CLK_LOCAL_MEM_FENCE);
179 write(C, row0+trow, col0+tcol, Csum[tj][ti]);
210 #define A_ncols A_nrows 212 int tile_n = get_local_size(1);
213 int lrow = get_local_id(0);
214 int lcol = get_local_id(1);
217 if (get_group_id(0) <= get_group_id(1)) {
219 row0 = get_group_id(0);
220 col0 = get_group_id(1);
226 row0 = get_num_groups(1)-get_group_id(0);
227 col0 = get_num_groups(1)-1-get_group_id(1);
228 if (row0==get_group_id(0))
252 buf(A,trow,tcol) =
read(A, row0+trow, a0+tcol);
253 buf(B,trow,tcol) =
read(B, 32*a0+trow, col0+tcol);
256 barrier(CLK_LOCAL_MEM_FENCE);
260 for(a1=0; a1 < tile_n*TILE_M; ++a1)
266 a =
read(A, row0+trow, ai);
271 for (
int y=0; y < 32; ++y, a >>= 1) {
273 b =
buf(B,32*a1+y, tcol);
275 b =
read(B, 32*ai+y, col0+tcol);
277 Csum[tj][ti] |= -(a & 1) & b;
281 barrier(CLK_LOCAL_MEM_FENCE);
286 write(C, row0+trow, col0+tcol, Csum[tj][ti]);
#define write(M, row, col, x)
unsigned int gpuword
a GPU word has 32 bits
__kernel void clcubic_mul(write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows, int A_ncols)
OpenCL kernel for cubic matrix multiplication.
#define read(M, row, col)
int buffer_address(int row, int col, int tile_n)
offset into shared memory buffers
#define write_only_global
__kernel void clutri_mul(write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows)
OpenCL kernel for cubic upper triangular matrix multiplication.
#define unrolled_for_tile