7 #define CEILCOLS(i) ((i+31)/32) 12 # define read_only_global __read_only image2d_t 13 # define write_only_global __write_only image2d_t 18 # define read(M,row,col) read_imageui(M,(int2)(row,col)).x 19 # define write(M,row,col,x) write_imageui(M,(int2)(row,col),(uint4)(x,0,0,0)) 24 # define read_only_global __global gpuword* 25 # define write_only_global __global gpuword* 26 # define read(M,row,col) M[(col)*M ## _nrows + row] 27 # define write(M,row,col,x) M[(col)*M ## _nrows + row]=x 30 #define MIN(x,y) (((x) < (y)) ? (x) : (y)) 31 #define POW2(x) (((gpuword)1) << x) 43 int spill = spot + n - 32;
48 temp = (a1 << (32 - spill)) | (a0 >> spill);
49 return temp >> (32 - n);
56 for (
int y = 0; y < k; ++y, x >>= 1)
57 result |= (x & 1) * T[
POW2(y)];
80 int A_nrows,
int A_ncols,
int B_ncols)
82 #define A_width CEILCOLS(A_ncols) 83 #define C_ncols B_ncols 84 #define C_width CEILCOLS(C_ncols) 85 #define B_nrows A_ncols 86 #define C_nrows A_nrows 89 int group_size = get_local_size(0);
91 int ci = get_group_id(1);
94 int cj = r0 + get_global_id(0);
95 int lcj = get_local_id(0);
103 for (
int ai = 0; ai <
A_ncols; ai += k)
109 for (
int sj=0; sj < k1; sj += group_size) {
116 barrier(CLK_LOCAL_MEM_FENCE);
119 for (
int sj=0; sj <
POW2(k1); sj += group_size) {
125 barrier(CLK_LOCAL_MEM_FENCE);
138 A1 =
read(A,cj, ablock + 1);
141 barrier(CLK_LOCAL_MEM_FENCE);
145 write(C,cj,ci, Csum);
unsigned int gpuword
A GPU word has 32 bits.
gpuword read_bits(gpuword a0, gpuword a1, int spot, int n)
Read up to 32 bits (n bits starting at bit position spot of a0, spilling into a1) from memory, not necessarily aligned to word boundaries.
gpuword combinate(gpuword x, int k, __local gpuword *T)
__kernel void clm4rm_mul(write_only_global C, read_only_global A, read_only_global B, __local gpuword *T, int k, int r0, int A_nrows, int A_ncols, int B_ncols)
OpenCL kernel for M4R matrix multiplication C := A*B.
#define read(M, row, col)
#define write_only_global
#define write(M, row, col, x)