fv/html/cluptri__mul_8cl_source.html

 #if IMAGE2D
 //
 // Matrix stored in texture memory
 //
 # define read_only_global     __read_only image2d_t
 # define write_only_global    __write_only image2d_t
 // Note: column-major format
 // a matrix column is actually a row (y-coordinate) in Image2D
 // a matrix row is actually a column (x-coordinate) in Image2D
 // Pixel contains only one (red) component
 //
 // read(M,row,col)    - fetch the word at (row,col) of image M
 // write(M,row,col,x) - store word x at (row,col) of image M
 // All macro arguments are parenthesized so that arbitrary expressions
 // can be passed safely.
 # define read(M,row,col)      read_imageui(M,(int2)((row),(col))).x
 # define write(M,row,col,x)   write_imageui(M,(int2)((row),(col)),(uint4)((x),0,0,0))
 #else
 //
 // Matrix stored in __global memory
 //
 # define read_only_global     __global gpuword*
 # define write_only_global    __global gpuword*
 // Column-major addressing: element (row,col) lives at col*M_nrows + row.
 // The macros paste "_nrows" onto the matrix name, so an identifier (or
 // macro) named <M>_nrows must be visible wherever they are used.
 // Arguments and the pasted <M>_nrows are parenthesized so that expressions
 // such as "i | 1" or an unparenthesized <M>_nrows macro index correctly.
 # define read(M,row,col)      M[(col)*(M ## _nrows) + (row)]
 # define write(M,row,col,x)   (M[(col)*(M ## _nrows) + (row)] = (x))
 #endif

 // Local-memory buffering is switched on unless the build says otherwise.
 #if !defined(BUFFERED)
 # define BUFFERED 1
 #endif

 //  Tile geometry. TILE_M comes from the compiler command line (-D);
 //  tile_n must be a variable in scope at the point of use (the group size).
 //    tile_width : tile_n*TILE_M
 //    tile_ncols : 32 * tile_width
 //    tile_nrows : 32 * tile_width
 //    col_stride : 34 * tile_width + 1
 #define tile_width  (tile_n*TILE_M)
 #define tile_ncols  (32*tile_width)
 #define tile_nrows  (32*tile_width)
 #define col_stride  (34*tile_width+1)

 //  Map a (row, col) position inside a tile to an index into a local buffer.
 //
 //  Every run of 16 rows occupies 17 consecutive slots (one extra slot per
 //  run), and consecutive columns are col_stride slots apart:
 //      index = col*col_stride + (row/16)*17 + row%16
 //  which is the same as col*col_stride + row + row/16.
 //  NOTE(review): the extra slot is presumably padding against local-memory
 //  bank conflicts - confirm.
 //
 //  tile_n is consumed indirectly through the col_stride macro.
 inline int buffer_address(int row, int col, int tile_n)
 {
     const int extra_slots = row >> 4;           // one per completed run of 16 rows
     const int column_base = col * col_stride;
     return column_base + row + extra_slots;
 }

 //  buf(M,row,col): element (row,col) of the buffer named <M>_buf, addressed
 //  through buffer_address(). Expects <M>_buf and tile_n to be in scope at
 //  the point of use.
 #define buf(M,row,col)  M##_buf[buffer_address(row,col,tile_n)]

 //  Loop over the small TILE_M x TILE_M tile.
 //  These macros expand to two nested for-headers; the statement or block
 //  that follows the macro becomes the body of the inner loop.
 //    outer loop: ti = 0..TILE_M-1, tcol starts at lcol and advances by tile_n
 //    inner loop: tj = 0..TILE_M-1, trow starts at lrow and advances by 32*tile_n
 //  The caller must have declared ti, tj, trow, tcol, lrow, lcol and tile_n.
 #define for_tile \
     for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \
     for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n)

 //  Same iteration as for_tile, with an unroll pragma placed in front of each
 //  of the two loops.
 #define unrolled_for_tile \
     _Pragma("unroll")   for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \
     _Pragma("unroll")   for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n)

 //  Basic word type used for matrix storage and bit operations.
 typedef unsigned int gpuword;

 //  CEILCOLS(i): number of 32-bit words needed to hold i bits, i.e. ceil(i/32).
 //  MIN(x,y):    smaller of x and y (each argument may be evaluated twice).
 //  POW2(x):     gpuword with only bit x set; x must be smaller than the bit
 //               width of gpuword.
 //  All arguments are parenthesized so that expressions built from
 //  lower-precedence operators (e.g. "a | b") expand correctly.
 #define CEILCOLS(i)     (((i)+31)/32)
 #define MIN(x,y)        (((x) < (y)) ? (x) : (y))
 #define POW2(x)         (((gpuword)1) << (x))

  //
  //  clcubic_mul: tiled product of bit-packed matrices, C = A x B.
  //
  //  Each gpuword of A is scanned bit by bit (32 bits); for every set bit the
  //  matching word of B is OR-ed into the accumulator, so the product is
  //  formed with AND/OR rather than with multiply/add.
  //
  //  Parameters:
  //    C           output matrix, written once per tile element at the end
  //    A, B        input matrices, accessed through read()
  //    A_buf,B_buf __local staging buffers, present only when BUFFERED is
  //                non-zero; indexed through buf()/buffer_address().
  //                NOTE(review): their required size is set by the host and
  //                is not visible here.
  //    A_nrows     row count of A; also used as C_nrows. In the __global
  //                variant read()/write() use it as the column stride.
  //    A_ncols     column count of A in bits; A_width is its size in words
  //    B_ncols     not referenced anywhere in this kernel body
  //    row_offset, col_offset
  //                added to the tile origin (row0 / col0) computed below
  //
  //  NOTE(review): this listing contains literal '?' placeholders (in the
  //  "else if" condition and in the bounds of the a0 loop). They are not
  //  valid OpenCL C; the intended expressions are not recoverable from this
  //  file and must be filled in before the kernel can compile.
  //
  __kernel void clcubic_mul(
             write_only_global C,
             read_only_global A,
             read_only_global B,
 #       if BUFFERED
             __local  gpuword* A_buf,
             __local  gpuword* B_buf,
 #       endif
             int A_nrows, int A_ncols, int B_ncols,
             int row_offset, int col_offset)
  {
 //  Derived dimensions. These names are picked up by the token pasting in
 //  read()/write() (M ## _nrows) in the __global variant.
 //  A_width: width of A in 32-bit words
 //  B_nrows: rows of B = 32 rows per word column of A
 //  C_nrows: C has as many rows as A
 #define A_width CEILCOLS(A_ncols)
 #define B_nrows 32*A_width
 #define C_nrows A_nrows

     //  tile_n is the work-group size in dimension 1; it is also consumed
     //  implicitly by the tile_* / col_stride / buf / for_tile macros.
     int tile_n = get_local_size(1);
     //  NOTE(review): the original bounds read "<= 32*tile_n" and "<= tile_n";
     //  a local id is strictly smaller than the local size, and the size of
     //  dimension 0 is chosen by the host (presumably 32*tile_n) - confirm.
     int lrow = get_local_id(0); // <= 32*tile_n
     int lcol = get_local_id(1); // <= tile_n

     //  Select the tile (in units of tiles) this work-group is responsible for.
     int row0, col0;
     if (get_group_id(0) <= get_group_id(1)) {
         //  upper triangle
         row0 = get_group_id(0); // = first row in A,C
         col0 = get_group_id(1); // = first column in B,C
     }
     //  NOTE(review): the condition below is an unresolved '?' placeholder.
     else if (?) {
         //  lower triangle; work on mirrored tile, instead
         //  note: get_num_groups(1) indicates the logical number of groups, get_num_groups(0) does not
         row0 = get_num_groups(1)-get_group_id(0);
         col0 = get_num_groups(1)-get_group_id(1);
     }
     else {
         //  Nothing to do for this group. The decision depends only on group
         //  ids, so all work-items of a group take the same path.
         return;
     }

     //  Convert tile indices to element coordinates: rows are counted in
     //  matrix rows, columns in words.
     row0 = row_offset + row0*tile_nrows;
     col0 = col_offset + col0*tile_width;

     //  Per-work-item accumulators, one per element of the small tile.
     gpuword Csum[TILE_M][TILE_M];
     gpuword a,b;
     int a0,a1,ai;
     int ti,tj;
     int trow,tcol;

     unrolled_for_tile {
         Csum[tj][ti] = 0;
     }

     //  Walk over word columns of A (= blocks of 32 rows of B) in steps of one
     //  tile width.
     //  NOTE(review): start value and end bound contain unresolved '?'
     //  placeholders.
     for (a0=row0 ?; a0 < col0 ?; a0 += tile_n*TILE_M)
     {

 #   if BUFFERED
         //  Buffer a tile of A and B in shared memory
         unrolled_for_tile {
             buf(A,trow,tcol) = read(A, row0+trow,   a0+tcol);
             buf(B,trow,tcol) = read(B, 32*a0+trow,  col0+tcol);
         }

         //  All loads into the local buffers must be complete before any
         //  work-item starts reading them.
         barrier(CLK_LOCAL_MEM_FENCE);
 #   endif

         //  process a row of A against a column of B
         for(a1=0; a1 < tile_n*TILE_M; ++a1)
             for_tile {
                 //  ai: absolute word column of A handled in this step
                 ai = a0+a1;
 #           if BUFFERED
                 a = buf(A,trow, a1);
 #           else
                 a = read(A, row0+trow, ai);
 #           endif
                 //  Branch-free masking of words that lie past the last word
                 //  column of A.
                 a &= -(ai < A_width); // if (ai >= A_width) a = 0;

                 //  Test the bits of a from least to most significant; bit y
                 //  selects row 32*ai+y of B.
 #               pragma unroll
                 for (int y=0; y < 32; ++y, a >>= 1) {
 #               if BUFFERED
                     b = buf(B,32*a1+y, tcol);
 #               else
                     b = read(B, 32*ai+y,  col0+tcol);
 #               endif
                     //  -(a & 1) is all ones when the bit is set, zero otherwise
                     Csum[tj][ti] |= -(a & 1) & b;   // if (a & 1) Csum |= b
                 }
             }
 #   if BUFFERED
         //  Wait until every work-item is done with the buffers before the
         //  next iteration overwrites them.
         barrier(CLK_LOCAL_MEM_FENCE);
 #   endif
     }
     //  write results back to global memory
     unrolled_for_tile {
         write(C, row0+trow, col0+tcol, Csum[tj][ti]);
     }
 }

clcubic_mul
__kernel void clcubic_mul(write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows, int A_ncols, int B_ncols, int row_offset, int col_offset)
OpenCL kernel for cubic matrix multiplication.
Definition: cluptri_mul.cl:103

A_width
#define A_width

read
#define read(M, row, col)
Definition: cluptri_mul.cl:24

gpuword
unsigned int gpuword
Definition: cluptri_mul.cl:66

write_only_global
#define write_only_global
Definition: cluptri_mul.cl:23

gpuword
unsigned int gpuword
a GPU word has 32 bits
Definition: clcubic_mul.cl:74

buffer_address
int buffer_address(int row, int col, int tile_n)
Definition: cluptri_mul.cl:48

A_ncols
#define A_ncols

col_stride
#define col_stride
Definition: cluptri_mul.cl:36

unrolled_for_tile
#define unrolled_for_tile
Definition: cluptri_mul.cl:62

BUFFERED
#define BUFFERED
Definition: cluptri_mul.cl:29

write
#define write(M, row, col, x)
Definition: cluptri_mul.cl:25

tile_nrows
#define tile_nrows
Definition: cluptri_mul.cl:35

for_tile
#define for_tile
Definition: cluptri_mul.cl:58

read_only_global
#define read_only_global
Definition: cluptri_mul.cl:22

tile_width
#define tile_width
Definition: cluptri_mul.cl:33

buf
#define buf(M, row, col)
Definition: cluptri_mul.cl:55