Macros
#define	read_only_global __global gpuword*

#define	write_only_global __global gpuword*

#define	read(M, row, col) M[(col)*M ## _nrows + row]

#define	write(M, row, col, x) M[(col)*M ## _nrows + row]=x

#define	BUFFERED 1

#define	tile_width (tile_n*TILE_M)

#define	tile_ncols (tile_width*32)

#define	tile_nrows (tile_n*TILE_M*32)

#define	col_stride (34*tile_n*TILE_M+1)

#define	buf(M, row, col) M##_buf[buffer_address(row,col,tile_n)]

#define	for_tile

#define	unrolled_for_tile

#define	CEILCOLS(i) ((i+31)/32)

#define	MIN(x, y) (((x) < (y)) ? (x) : (y))

#define	POW2(x) (((gpuword)1) << x)

#define	A_width CEILCOLS(A_ncols)

#define	B_nrows 32*A_width

#define	C_nrows A_nrows

#define	A_ncols A_nrows

Typedefs
typedef unsigned int	gpuword
	a GPU word has 32 bits More...

Functions
int	buffer_address (int row, int col, int tile_n)
	offset into shared memory buffers More...

__kernel void	clcubic_mul (write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows, int A_ncols)
	OpenCL kernel for cubic matrix multiplication. More...

__kernel void	clutri_mul (write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows)
	OpenCL kernel for cubic upper triangular matrix multiplication. More...

Macro Definition Documentation

◆ A_ncols

#define A_ncols A_nrows

◆ A_width

#define A_width CEILCOLS(A_ncols)

Definition at line 80 of file clcubic_mul.cl.

◆ B_nrows

#define B_nrows 32*A_width

Definition at line 81 of file clcubic_mul.cl.

◆ buf

#define buf	(	M,
		row,
		col
	)	M##_buf[buffer_address(row,col,tile_n)]

Definition at line 62 of file clcubic_mul.cl.

◆ BUFFERED

#define BUFFERED 1

Definition at line 28 of file clcubic_mul.cl.

◆ C_nrows

#define C_nrows A_nrows

Definition at line 82 of file clcubic_mul.cl.

◆ CEILCOLS

#define CEILCOLS ( i ) ((i+31)/32)

Definition at line 76 of file clcubic_mul.cl.

◆ col_stride

#define col_stride (34*tile_n*TILE_M+1)

Definition at line 37 of file clcubic_mul.cl.

◆ for_tile

#define for_tile

Value:

for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \

for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n)

Definition at line 65 of file clcubic_mul.cl.

◆ MIN

#define MIN	(	x,
		y
	)	(((x) < (y)) ? (x) : (y))

Definition at line 77 of file clcubic_mul.cl.

◆ POW2

#define POW2 ( x ) (((gpuword)1) << x)

Definition at line 78 of file clcubic_mul.cl.

◆ read

#define read	(	M,
		row,
		col
	)	M[(col)*M ## _nrows + row]

Definition at line 23 of file clcubic_mul.cl.

◆ read_only_global

#define read_only_global __global gpuword*

Definition at line 21 of file clcubic_mul.cl.

◆ tile_ncols

#define tile_ncols (tile_width*32)

Definition at line 35 of file clcubic_mul.cl.

◆ tile_nrows

#define tile_nrows (tile_n*TILE_M*32)

Definition at line 36 of file clcubic_mul.cl.

◆ tile_width

#define tile_width (tile_n*TILE_M)

Definition at line 34 of file clcubic_mul.cl.

◆ unrolled_for_tile

#define unrolled_for_tile

Value:

_Pragma("unroll") for(ti=0,tcol=lcol; ti<TILE_M; ++ti,tcol+=tile_n) \

_Pragma("unroll") for(tj=0,trow=lrow; tj<TILE_M; ++tj,trow+=32*tile_n)

Definition at line 69 of file clcubic_mul.cl.

◆ write

#define write	(	M,
		row,
		col,
		x
	)	M[(col)*M ## _nrows + row]=x

Definition at line 24 of file clcubic_mul.cl.

◆ write_only_global

#define write_only_global __global gpuword*

Definition at line 22 of file clcubic_mul.cl.

Typedef Documentation

◆ gpuword

typedef unsigned int gpuword

a GPU word has 32 bits

Definition at line 74 of file clcubic_mul.cl.

Function Documentation

◆ buffer_address()

int buffer_address	(	int	row,
		int	col,
		int	tile_n
	)

inline

offset into shared memory buffers

Shared Memory buffers are aligned to avoid bank conflicts

column-major storage
after every 16 rows, there is an additional 17th empty block
after each column there is an additional empty block

column stride is 32*tile_n*tile_m + 32*tile_n*tile_m/16 + 1 = 34*tile_n*tile_m+1

Parameters

row	tile row
col	tile column
tile_n	number of rows in tile

Returns: offset into buffer

Definition at line 55 of file clcubic_mul.cl.

◆ clcubic_mul()

__kernel void clcubic_mul	(	write_only_global	C,
		read_only_global	A,
		read_only_global	B,
		__local gpuword *	A_buf,
		__local gpuword *	B_buf,
		int	A_nrows,
		int	A_ncols
	)

OpenCL kernel for cubic matrix multiplication.

Perform Boolean matrix multiplication C += A * B

Uses a tiled cubic approach. Tiles of A and B are buffered in shared memory.

Parameters

C	destination matrix
A	source matrix
B	source matrix
A_buf	shared memory for buffering tiles
B_buf	shared memory for buffering tiles
A_nrows	number of rows in A == number of rows in C
A_ncols	number of columns in A == number of rows in B

Important: global memory access must be coalesced. Each half warp (= set of 16 threads) must access consecutive addresses. Matrix data is stored in column-major order, so it is imperative that consecutive rows are accessed. row0+trow is the relevant variable. Group height (get_local_size(0)) is guaranteed to be a multiple of 32. The "for_tile" loop is arranged to read consecutive words (32*tile_n words per iteration).

For shared memory it is imperative to avoid bank conflicts. Use odd aligned access patterns whenever possible.

Definition at line 112 of file clcubic_mul.cl.

◆ clutri_mul()

__kernel void clutri_mul	(	write_only_global	C,
		read_only_global	A,
		read_only_global	B,
		__local gpuword *	A_buf,
		__local gpuword *	B_buf,
		int	A_nrows
	)

OpenCL kernel for cubic upper triangular matrix multiplication.

Perform Boolean matrix multiplication C += A * B. A, B and C are assumed to be upper triangular, square matrices.

Uses a tiled cubic approach. Tiles of A and B are buffered in shared memory.

Parameters

C	destination matrix
A	source matrix
B	source matrix
A_buf	shared memory for buffering tiles
B_buf	shared memory for buffering tiles
A_nrows	number of rows in A,B,C == number of cols in A,B,C

Definition at line 200 of file clcubic_mul.cl.

Macros

Typedefs

Functions

Macro Definition Documentation

◆ A_ncols

◆ A_width

◆ B_nrows

◆ buf

◆ BUFFERED

◆ C_nrows

◆ CEILCOLS

◆ col_stride

◆ for_tile

◆ MIN

◆ POW2

◆ read

◆ read_only_global

◆ tile_ncols

◆ tile_nrows

◆ tile_width

◆ unrolled_for_tile

◆ write

◆ write_only_global

Typedef Documentation

◆ gpuword

Function Documentation

◆ buffer_address()

◆ clcubic_mul()

◆ clutri_mul()