10 # define read_only_global __read_only image2d_t 11 # define write_only_global __write_only image2d_t 16 # define read(M,row,col) read_imageui(M,(int2)(row,col)).x 17 # define write(M,row,col,x) write_imageui(M,(int2)(row,col),(uint4)(x,0,0,0)) 22 # define read_only_global __global gpuword* 23 # define write_only_global __global gpuword* 24 # define read(M,row,col) M[(col)*M ## _nrows + row] 25 # define write(M,row,col,x) M[(col)*M ## _nrows + row]=x 33 #define tile_width (tile_n*TILE_M) 34 #define tile_ncols (tile_width*32) 35 #define tile_nrows (tile_n*TILE_M*32) 36 #define col_stride (34*tile_n*TILE_M+1) 52 return col*
col_stride + (rowd16<<4)+rowd16 + (row & 0x0f);
55 #define buf(M,row,col) M##_buf[buffer_address(row,col,tile_n)] 59 for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \ 60 for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n) 62 #define unrolled_for_tile \ 63 _Pragma("unroll") for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \ 64 _Pragma("unroll") for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n) 68 #define CEILCOLS(i) ((i+31)/32) 69 #define MIN(x,y) (((x) < (y)) ? (x) : (y)) 70 #define POW2(x) (((gpuword)1) << x) 111 int A_nrows,
int A_ncols,
int B_ncols,
112 int row_offset,
int col_offset)
114 #define A_width CEILCOLS(A_ncols) 115 #define B_nrows 32*A_width 116 #define C_nrows A_nrows 118 int tile_n = get_local_size(1);
119 int lrow = get_local_id(0);
120 int lcol = get_local_id(1);
123 if (get_group_id(0) <= get_group_id(1)) {
125 row0 = get_group_id(0);
126 col0 = get_group_id(1);
131 row0 = get_num_groups(1)-get_group_id(0);
132 col0 = get_num_groups(1)-get_group_id(1);
151 for (a0=row0 ?; a0 < col0 ?; a0 += tile_n*TILE_M)
157 buf(A,trow,tcol) =
read(A, row0+trow, a0+tcol);
158 buf(B,trow,tcol) =
read(B, 32*a0+trow, col0+tcol);
161 barrier(CLK_LOCAL_MEM_FENCE);
165 for(a1=0; a1 < tile_n*TILE_M; ++a1)
171 a =
read(A, row0+trow, ai);
176 for (
int y=0; y < 32; ++y, a >>= 1) {
178 b =
buf(B,32*a1+y, tcol);
180 b =
read(B, 32*ai+y, col0+tcol);
182 Csum[tj][ti] |= -(a & 1) & b;
186 barrier(CLK_LOCAL_MEM_FENCE);
191 write(C, row0+trow, col0+tcol, Csum[tj][ti]);
__kernel void clcubic_mul(write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows, int A_ncols, int B_ncols, int row_offset, int col_offset)
OpenCL kernel for cubic matrix multiplication.
#define read(M, row, col)
#define write_only_global
unsigned int gpuword
a GPU word has 32 bits
int buffer_address(int row, int col, int tile_n)
#define unrolled_for_tile
#define write(M, row, col, x)