Macros
#define	read_only_global __global gpuword*

#define	write_only_global __global gpuword*

#define	read(M, row, col) M[(col)*M ## _nrows + row]

#define	write(M, row, col, x) M[(col)*M ## _nrows + row]=x

#define	BUFFERED 1

#define	tile_width (tile_n*TILE_M)

#define	tile_ncols (tile_width*32)

#define	tile_nrows (tile_n*TILE_M*32)

#define	col_stride (34*tile_n*TILE_M+1)

#define	buf(M, row, col) M##_buf[buffer_address(row,col,tile_n)]

#define	for_tile

#define	unrolled_for_tile

#define	CEILCOLS(i) ((i+31)/32)

#define	MIN(x, y) (((x) < (y)) ? (x) : (y))

#define	POW2(x) (((gpuword)1) << x)

#define	A_width CEILCOLS(A_ncols)

#define	B_nrows 32*A_width

#define	C_nrows A_nrows

Typedefs
typedef unsigned int	gpuword

Functions
int	buffer_address (int row, int col, int tile_n)

__kernel void	clcubic_mul (write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows, int A_ncols, int B_ncols, int row_offset, int col_offset)
	OpenCL kernel for cubic matrix multiplication. More...

Macro Definition Documentation

◆ A_width

#define A_width CEILCOLS(A_ncols)

◆ B_nrows

#define B_nrows 32*A_width

◆ buf

#define buf	(	M,
		row,
		col
	)	M##_buf[buffer_address(row,col,tile_n)]

Definition at line 55 of file cluptri_mul.cl.

◆ BUFFERED

#define BUFFERED 1

Definition at line 29 of file cluptri_mul.cl.

◆ C_nrows

#define C_nrows A_nrows

◆ CEILCOLS

#define CEILCOLS ( i ) ((i+31)/32)

Definition at line 68 of file cluptri_mul.cl.

◆ col_stride

#define col_stride (34*tile_n*TILE_M+1)

Definition at line 36 of file cluptri_mul.cl.

◆ for_tile

#define for_tile

Value:

for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \

for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n)

Definition at line 58 of file cluptri_mul.cl.

◆ MIN

#define MIN	(	x,
		y
	)	(((x) < (y)) ? (x) : (y))

Definition at line 69 of file cluptri_mul.cl.

◆ POW2

#define POW2 ( x ) (((gpuword)1) << x)

Definition at line 70 of file cluptri_mul.cl.

◆ read

#define read	(	M,
		row,
		col
	)	M[(col)*M ## _nrows + row]

Definition at line 24 of file cluptri_mul.cl.

◆ read_only_global

#define read_only_global __global gpuword*

Cubic Matrix Multiplication Upper Triangular Matrix

Definition at line 22 of file cluptri_mul.cl.

◆ tile_ncols

#define tile_ncols (tile_width*32)

Definition at line 34 of file cluptri_mul.cl.

◆ tile_nrows

#define tile_nrows (tile_n*TILE_M*32)

Definition at line 35 of file cluptri_mul.cl.

◆ tile_width

#define tile_width (tile_n*TILE_M)

Definition at line 33 of file cluptri_mul.cl.

◆ unrolled_for_tile

#define unrolled_for_tile

Value:

_Pragma("unroll") for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \

_Pragma("unroll") for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n)

Definition at line 62 of file cluptri_mul.cl.

◆ write

#define write	(	M,
		row,
		col,
		x
	)	M[(col)*M ## _nrows + row]=x

Definition at line 25 of file cluptri_mul.cl.

◆ write_only_global

#define write_only_global __global gpuword*

Definition at line 23 of file cluptri_mul.cl.

Typedef Documentation

◆ gpuword

typedef unsigned int gpuword

Definition at line 66 of file cluptri_mul.cl.

Function Documentation

◆ buffer_address()

int buffer_address	(	int	row,
		int	col,
		int	tile_n
	)

inline

Shared Memory buffers are aligned to avoid bank conflict

column major
after every 16 rows, there is an additional 17th empty block
after each column there is an additional empty block

column stride is 32*tile_n*tile_m + 32*tile_n*tile_m/16 + 1 = 34*tile_n*tile_m+1

Definition at line 48 of file cluptri_mul.cl.

◆ clcubic_mul()

__kernel void clcubic_mul	(	write_only_global	C,
		read_only_global	A,
		read_only_global	B,
		__local gpuword *	A_buf,
		__local gpuword *	B_buf,
		int	A_nrows,
		int	A_ncols,
		int	B_ncols,
		int	row_offset,
		int	col_offset
	)

OpenCL kernel for cubic matrix multiplication.

Perform Boolean matrix multiplication on upper triangular matrices C += A * B

Uses a tiled cubic approach. Tiles of A and B are buffered in shared memory.

Parameters

C	destination matrix
A	source matrix
B	source matrix
A_buf	shared memory for buffering tiles
B_buf	shared memory for buffering tiles
A_nrows	number of rows in A == number of rows in C
A_ncols	number of columns in A == number of rows in B
B_ncols	number of columns in B
row_offset	start row
col_offset	start column

Important: global memory access must be coalesced. Each half warp (= set of 16 threads) must access consecutive addresses. Matrix data is stored in column-major order, so it is imperative that consecutive rows are accessed. row0+trow is the relevant variable. Group height (get_local_size(0)) is guaranteed to be a multiple of 32. The "for_tile" loop is arranged to read consecutive words (32*tile_n words per iteration).

For shared memory it is imperative to avoid bank conflicts. Use odd aligned access patterns whenever possible.

Definition at line 103 of file cluptri_mul.cl.

Macros

Typedefs

Functions