Detailed Description

M4RM on the GPU: perform Boolean matrix arithmetic on the GPU. Based on OpenCL.

Kernels are provided for arithmetic, like bitwise operation, multiplication, etc.

Methods are provided for allocating and copying data from CPU memory to GPU memory and back.

Definition in file clm4rm.h.

#include <CL/opencl.h>
#include <m4ri/mzd.h>

Classes
struct	clmatrix_t
	OpenCL boolean matrix data structure. Data is arranged in 32 bit words. More...

struct	clm4rm_event_list
	a list of cl_events; used by clm4rm_conditions to keep track of scheduled jobs in the OpenCL queue. More...

struct	clm4rm_conditions
	Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be established through cl_event. More...

Functions
matrix operations
clmatrix_t *	clm4rm_create (rci_t rows, rci_t cols, int rowpadding, int read_only, cl_context ctx)
	create an empty matrix More...

clmatrix_t *	clm4rm_copy (const mzd_t *host_matrix, int rowpadding, int read_only, cl_context ctx)
	create a copy from a matrix in M4RI format More...

void	clm4rm_zero_fill (clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
	Fill a matrix with zero data. The operation is scheduled for asynchronous execution on the GPU. The function returns immediately. Use post-condition events to wait for the execution of the operation. More...

void	clm4rm_write (clmatrix_t *gpu_matrix, const mzd_t *host_matrix, cl_command_queue queue, clm4rm_conditions *cond)
	Copy matrix data from host memory to GPU. The operation is scheduled for asynchronous execution on the GPU. The function returns immediately. Use post-condition events to wait for the execution of the operation. More...

mzd_t *	clm4rm_read (mzd_t *host_matrix, clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond)
	copy matrix from gpu memory to host More...

void	clm4rm_free (clmatrix_t *gpu_matrix)
	release memory (CPU and GPU) More...

void	clm4rm_mul (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
	Boolean matrix multiplication on the GPU using the method of the Four Russians. C := A * B. More...

void	clcubic_mul (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond)
	Boolean matrix multiplication on the GPU using nested loops. C := A*B. More...

void	clutri_mul (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond)
	Boolean matrix multiplication on the GPU using nested loops. C := A*B. Assumes matrices to be upper triangular. More...

void	clm4rm_stack (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
	concatenate two matrices More...

void	clm4rm_concat (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
	concatenate two matrices More...

void	clm4rm_or (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
	perform element-wise logical disjunction (OR) More...

void	clm4rm_and (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
	perform element-wise logical conjunction (AND). For each entry, compute C_ij := A_ij & B_ij. All input matrices must have the same size. More...

cl_mem	clm4rm_query_diagonal (clmatrix_t *M, cl_context ctx, cl_command_queue queue, clm4rm_conditions *cond)
	find a non-zero entry on the diagonal of a matrix. Return the column/row of the first non-zero entry, or -1 if all entries are zero. The operation does not immediately return a result. It performs asynchronously. Use the post-conditions variables to wait for the execution of the operation, then use clm4rm_query_result to retrieve the actual result. More...

int	clm4rm_query_result (cl_mem result_buffer, cl_command_queue queue, clm4rm_conditions *cond)
	examine the result of a previous call to clm4rm_query_diagonal More...

basic definitions
#define	clm4rm_radix 32
	word size. for compatibility with GPU memory layout, we operate on 32 bit words. More...

#define	IMAGE2D 0

#define	BUFFERED 1

#define	MAX_TILE_M 6

#define	CEILDIV(x, y) (((x)+(y)-1)/(y))
	integer division with the quotient rounded up (ceiling of x/y for non-negative x and positive y) More...

#define	FLOOR(x, y) ((y)*((x)/(y)))
	round x down to a multiple of y (for non-negative x and positive y) More...

#define	CEILCOLS(i) CEILDIV(i,clm4rm_radix)
	integer division by number of bits per word More...

#define	POW2(i) (((gpuword)1)<<(i))

typedef uint32_t	gpuword
	word size of GPU data (32 bits) More...

typedef size_t	size2_t[2]
	two-dimensional size; used for various OpenCL parameters More...

matrix storage
#define	DATA_BYTES(m) ( (m)->padded_rows * (m)->width * sizeof(gpuword) )

typedef struct clmatrix_t	clmatrix_t

int	padded_rows (int nrows, int padding)
	calculate the number of padded rows More...

gpuword *	copy_matrix_data (gpuword *dest, const mzd_t *src, int padded_rows)
	create a column-major copy from an mzd_t matrix More...

void	copy_back_matrix_data (mzd_t *dest, const gpuword *src, int padded_rows)
	copy back a column-major matrix More...

global variables
cl_int	clm4rm_error
	latest OpenCL result code. CL_SUCCESS indicates no error. More...

size_t	max_group_size
	max. size of a work group More...

size_t	max_items [3]
	max. number of items in each dimension More...

size_t	shared_mem_bytes
	size of shared memory in bytes More...

size_t	shared_mem_words
	size of shared memory in (32bit) words More...

size_t	heap_size
	size of allocated memory in bytes More...

size_t	allocated_size

size_t	max_object_size
	max. object allocation size More...

cl_int	clm4rm_setup (const char *cl_kernel_directory, cl_context ctx, cl_device_id device)
	load OpenCL kernels and set up parameters More...

void	clm4rm_tear_down (cl_context ctx, cl_device_id device)
	release OpenCL resources More...

event handling
#define	MAX_EVENTS 6

typedef struct clm4rm_event_list	clm4rm_event_list

typedef struct clm4rm_conditions	clm4rm_conditions

void	init_events (clm4rm_event_list *list)
	reset events list More...

void	release_events (clm4rm_event_list *list)
	release events More...

void	merge_events (clm4rm_event_list *a, clm4rm_event_list *b)
	append two lists More...

void	init_conditions (clm4rm_conditions *cond)
	reset conditions list More...

void	release_conditions (clm4rm_conditions *cond)
	release conditions list More...

void	join_conditions (clm4rm_conditions *cond)
	called when the pre-conditions are met. The post-conditions become new pre-conditions. More...

void	merge_conditions (clm4rm_conditions *a, clm4rm_conditions *b)
	merge pre-conditions into one list More...

cl_uint	pre_count (clm4rm_conditions *cond)

cl_event *	pre_events (clm4rm_conditions *cond)

cl_event *	push_event (clm4rm_conditions *cond)
	reserve one post-condition event More...

cl_event *	pushed_event (clm4rm_conditions *cond)

Macro Definition Documentation

◆ BUFFERED

#define BUFFERED 1

Use shared GPU memory to buffer tiles of the matrix.

Definition at line 57 of file clm4rm.h.

◆ CEILCOLS

#define CEILCOLS ( i ) CEILDIV(i,clm4rm_radix)

integer division by number of bits per word

Definition at line 76 of file clm4rm.h.

◆ CEILDIV

#define CEILDIV	(	x,
		y
	)	(((x)+(y)-1)/(y))

integer division with the quotient rounded up (ceiling of x/y for non-negative x and positive y)

Definition at line 70 of file clm4rm.h.

◆ clm4rm_radix

#define clm4rm_radix 32

word size. for compatibility with GPU memory layout, we operate on 32 bit words.

Definition at line 41 of file clm4rm.h.

◆ DATA_BYTES

#define DATA_BYTES ( m ) ( (m)->padded_rows * (m)->width * sizeof(gpuword) )

Definition at line 119 of file clm4rm.h.

◆ FLOOR

#define FLOOR	(	x,
		y
	)	((y)*((x)/(y)))

round x down to a multiple of y (for non-negative x and positive y)

Definition at line 73 of file clm4rm.h.

◆ IMAGE2D

#define IMAGE2D 0

OpenCL data can be stored as Buffer objects or Image objects, the latter being supposedly faster (really?). Within a kernel, an image buffer can be only read-only or write-only, which is alright with us.

Turns out that the difference is actually marginal. Note: texture memory is limited (about 2G on a Tesla V100). Global buffer memory is not.

Definition at line 53 of file clm4rm.h.

◆ MAX_EVENTS

#define MAX_EVENTS 6

Definition at line 194 of file clm4rm.h.

◆ MAX_TILE_M

#define MAX_TILE_M 6

max. tile parameter (the actual size of a tile is 32*m*n)

Definition at line 62 of file clm4rm.h.

◆ POW2

#define POW2 ( i ) (((gpuword)1)<<(i))

Definition at line 78 of file clm4rm.h.

Typedef Documentation

◆ clm4rm_conditions

typedef struct clm4rm_conditions clm4rm_conditions

Definition at line 236 of file clm4rm.h.

◆ clm4rm_event_list

typedef struct clm4rm_event_list clm4rm_event_list

Definition at line 204 of file clm4rm.h.

◆ clmatrix_t

typedef struct clmatrix_t clmatrix_t

Definition at line 118 of file clm4rm.h.

◆ gpuword

typedef uint32_t gpuword

word size of GPU data (32 bits)

Definition at line 65 of file clm4rm.h.

◆ size2_t

typedef size_t size2_t[2]

two-dimensional size; used for various OpenCL parameters

Definition at line 67 of file clm4rm.h.

Function Documentation

◆ clcubic_mul()

void clcubic_mul	(	clmatrix_t *	C,
		clmatrix_t *	A,
		clmatrix_t *	B,
		size2_t	max_tile,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

Boolean matrix multiplication on the GPU using nested loops. C := A*B.

The function returns immediately. The operation is scheduled for asynchronous execution on the GPU. Use post-condition events to wait for the execution of the operation.

Parameters

C	a matrix structure; receives the result
A	a matrix structure
B	a matrix structure
max_tile	max. size of tiles
queue	OpenCL command queue
cond	keeps track of pre-conditions and newly created post-conditions

Matrix is partitioned into three parts:

       +----------------------------------+----+
       |                                  |    |
       |      |      |      |      |      |    |
       |                                  |    |
       | - - -+ - - -+ - - -+ - - -+ - - -|    |
       |                                  |    |
       |      |      |      |      |      |    |
       |                                  |    |
       | - - -+ - - -+ - - -+ - - -+ - - -|    |
       |                                  |REST|
       |      |      |      |      |      |RIGHT
       |                                  |    |
       | - - -+ - - -+ - - -+ - - -+ - - -|    |
       |                                  |    |
       |      |      |      |      |      |    |
       |                                  |    |
       | - - -+ - - -+ - - -+ - - -+ - - -|    |
       |                                  |    |
       |      |      |      |      |      |    |
       |                                  |    |
       |------+------+------+------+------+----|
       |   REST BOTTOM                         |
       | - - -+ - - -+ - - -+ - - -+ - - -+ - -+

Definition at line 132 of file clm4rm_multiplication.cpp.

◆ clm4rm_and()

void clm4rm_and	(	clmatrix_t *	C,
		clmatrix_t *	A,
		clmatrix_t *	B,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

perform element-wise logical conjunction (AND). For each entry, compute C_ij := A_ij & B_ij. All input matrices must have the same size.

Parameters

C	a Boolean matrix; holds the result on return
A	an input Boolean matrix
B	an input Boolean matrix
queue	OpenCL command queue
cond	keeps track of pre-conditions and newly created post-conditions

Definition at line 45 of file clm4rm_bitwise.cpp.

◆ clm4rm_concat()

void clm4rm_concat	(	clmatrix_t *	C,
		clmatrix_t *	A,
		clmatrix_t *	B,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

concatenate two matrices

Deprecated: not used anymore

Definition at line 160 of file clm4rm_bitwise.cpp.

◆ clm4rm_copy()

clmatrix_t* clm4rm_copy	(	const mzd_t *	host_matrix,
		int	rowpadding,
		int	read_only,
		cl_context	ctx
	)

create a copy from a matrix in M4RI format

Parameters

host_matrix	matrix data in M4RI format
rowpadding	desired padding
read_only	if 1, create a read-only buffer in GPU memory
ctx	OpenCL context

Returns: a newly allocated matrix structure. Both, CPU memory and GPU memory are allocated and filled with data.

Definition at line 254 of file clm4rm.cpp.

◆ clm4rm_create()

clmatrix_t* clm4rm_create	(	rci_t	rows,
		rci_t	cols,
		int	rowpadding,
		int	read_only,
		cl_context	ctx
	)

create an empty matrix

Parameters

rows	number of rows
cols	number of columns
rowpadding	pad rows to multiples of 32, or 64
read_only	1 if the GPU memory buffer should be read only
ctx	OpenCL context

Returns: a newly allocated matrix structure. Both, CPU memory and GPU memory are allocated.

Definition at line 233 of file clm4rm.cpp.

◆ clm4rm_free()

void clm4rm_free ( clmatrix_t * gpu_matrix )

release memory (CPU and GPU)

Parameters

gpu_matrix a matrix structure

release gpu memory

Definition at line 112 of file clm4rm_bitwise.cpp.

◆ clm4rm_mul()

void clm4rm_mul	(	clmatrix_t *	C,
		clmatrix_t *	A,
		clmatrix_t *	B,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

Boolean matrix multiplication on the GPU using the method of the Four Russians. C := A * B.

The function returns immediately. The operation is scheduled for asynchronous execution on the GPU. Use post-condition events to wait for the execution of the operation.

Parameters

C	a matrix structure; receives the result
A	a matrix structure
B	a matrix structure
queue	OpenCL command queue
cond	keeps track of pre-conditions and newly created post-conditions

Definition at line 30 of file clm4rm_multiplication.cpp.

◆ clm4rm_or()

void clm4rm_or	(	clmatrix_t *	C,
		clmatrix_t *	A,
		clmatrix_t *	B,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

perform element-wise logical disjunction (OR)

Deprecated: not used anymore

Definition at line 23 of file clm4rm_bitwise.cpp.

◆ clm4rm_query_diagonal()

cl_mem clm4rm_query_diagonal	(	clmatrix_t *	M,
		cl_context	ctx,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

find a non-zero entry on the diagonal of a matrix. Return the column/row of the first non-zero entry, or -1 if all entries are zero. The operation does not immediately return a result. It performs asynchronously. Use the post-conditions variables to wait for the execution of the operation, then use clm4rm_query_result to retrieve the actual result.

Parameters

M	a matrix
ctx	OpenCL context
queue	OpenCL command queue
cond	keeps track of pre-conditions and newly created post-conditions

Returns: a memory buffer that holds exactly one integer. It will eventually hold the result.

Definition at line 67 of file clm4rm_bitwise.cpp.

◆ clm4rm_query_result()

int clm4rm_query_result	(	cl_mem	result_buffer,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

examine the result of a previous call to clm4rm_query_diagonal

Parameters

result_buffer	buffer that holds one integer. Was returned by clm4rm_query_diagonal. Will be released.
queue	OpenCL command queue
cond	keeps track of pre-conditions and newly created post-conditions

Returns: the column/row of the first non-zero entry, or -1 if all entries are zero.

Definition at line 94 of file clm4rm_bitwise.cpp.

◆ clm4rm_read()

mzd_t* clm4rm_read	(	mzd_t *	host_matrix,
		clmatrix_t *	gpu_matrix,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

copy matrix from gpu memory to host

Parameters

host_matrix	matrix data in M4RI format; if nullptr, allocate a new one
gpu_matrix	a matrix structure
queue	OpenCL command queue
cond	keeps track of pre-conditions and newly created post-conditions

Returns: pointer to a matrix structure in M4RI format

Definition at line 406 of file clm4rm.cpp.

◆ clm4rm_setup()

cl_int clm4rm_setup	(	const char *	cl_kernel_directory,
		cl_context	ctx,
		cl_device_id	device
	)

load OpenCL kernels and set up parameters

Parameters

cl_kernel_directory	location on disk where the kernel source code files (*.cl) are stored
ctx	OpenCL context
device	OpenCL device

Returns: OpenCL error code. 0 means no error.

Definition at line 84 of file clm4rm.cpp.

◆ clm4rm_stack()

void clm4rm_stack	(	clmatrix_t *	C,
		clmatrix_t *	A,
		clmatrix_t *	B,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

concatenate two matrices

Deprecated: not used anymore

Definition at line 123 of file clm4rm_bitwise.cpp.

◆ clm4rm_tear_down()

void clm4rm_tear_down	(	cl_context	ctx,
		cl_device_id	device
	)

release OpenCL resources

Parameters

ctx	OpenCL context
device	OpenCL device

Definition at line 146 of file clm4rm.cpp.

◆ clm4rm_write()

void clm4rm_write	(	clmatrix_t *	gpu_matrix,
		const mzd_t *	host_matrix,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

Copy matrix data from host memory to GPU. The operation is scheduled for asynchronous execution on the GPU. The function returns immediately. Use post-condition events to wait for the execution of the operation.

Parameters

gpu_matrix	a matrix structure
host_matrix	matrix data in M4RI format
queue	OpenCL command queue
cond	keeps track of pre-conditions and newly created post-conditions

M4RI data are 64-bit unsigned int; M4RM data are supposed to be 32-bit unsigned int.

Casting and copying is sane if both platforms are LITTLE-ENDIAN.

Definition at line 382 of file clm4rm.cpp.

◆ clm4rm_zero_fill()

void clm4rm_zero_fill	(	clmatrix_t *	gpu_matrix,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

Fill a matrix with zero data. The operation is scheduled for asynchronous execution on the GPU. The function returns immediately. Use post-condition events to wait for the execution of the operation.

Parameters

gpu_matrix	a matrix structure
queue	OpenCL command queue
cond	keeps track of pre-conditions and newly created post-conditions

Definition at line 364 of file clm4rm.cpp.

◆ clutri_mul()

void clutri_mul	(	clmatrix_t *	C,
		clmatrix_t *	A,
		clmatrix_t *	B,
		size2_t	max_tile,
		cl_command_queue	queue,
		clm4rm_conditions *	cond
	)

Boolean matrix multiplication on the GPU using nested loops. C := A*B. Assumes matrices to be upper triangular.

The function returns immediately. The operation is scheduled for asynchronous execution on the GPU. Use post-condition events to wait for the execution of the operation.

Parameters

C	a matrix structure; receives the result
A	an upper triangular matrix structure
B	an upper triangular matrix structure
max_tile	max. size of tiles
queue	OpenCL command queue
cond	keeps track of pre-conditions and newly created post-conditions

Matrix is partitioned into three parts

       +----------------------------------+----+
       |                                  |    |
       |      |      |      |      |      |    |
       |                                  |    |
       | - - -+ - - -+ - - -+ - - -+ - - -|    |
       |                                  |    |
       |      |      |      |      |      |    |
       |                                  |    |
       |      + - - -+ - - -+ - - -+ - - -|    |
       |                                  |REST|
       |             |      |      |      |RIGHT
       |                                  |    |
       |             + - - -+ - - -+ - - -|    |
       |                                  |    |
       |                    |      |      |    |
       |   (empty)                        |    |
       |                    + - - -+ - - -|    |
       |                                  |    |
       |                           |      |    |
       |                                  |    |
       | - - - - - - + - - - - - - + - - -+    |
       |   (empty)                        |    |
       | - - -+ - - -+ - - -+ - - -+ - - -+ - -+

Definition at line 339 of file clm4rm_multiplication.cpp.

◆ copy_back_matrix_data()

void copy_back_matrix_data	(	mzd_t *	dest,
		const gpuword *	src,
		int	padded_rows
	)

copy back a column-major matrix

Parameters

dest	destination data in M4RI format
src	input data
padded_rows	number of rows (padded)

Definition at line 460 of file clm4rm.cpp.

◆ copy_matrix_data()

gpuword* copy_matrix_data	(	gpuword *	dest,
		const mzd_t *	src,
		int	padded_rows
	)

create a column-major copy from an mzd_t matrix

Parameters

dest	destination data in clmatrix format
src	input data in M4RI format
padded_rows	number of words (padded)

Returns: pointer to CPU matrix data

Definition at line 436 of file clm4rm.cpp.

◆ init_conditions()

void init_conditions ( clm4rm_conditions * cond )

reset conditions list

Parameters

cond	a list of pre- and post-conditions

Definition at line 284 of file clm4rm.cpp.

◆ init_events()

void init_events ( clm4rm_event_list * list )

reset events list

Parameters

list	a list of OpenCL events

Definition at line 277 of file clm4rm.cpp.

◆ join_conditions()

void join_conditions ( clm4rm_conditions * cond )

called when the pre-conditions are met. The post-conditions become new pre-conditions.

Parameters

cond	a list of pre- and post-conditions

Definition at line 319 of file clm4rm.cpp.

◆ merge_conditions()

void merge_conditions	(	clm4rm_conditions *	a,
		clm4rm_conditions *	b
	)

merge pre-conditions into one list

Parameters

a	a list of pre- and post-conditions
b	another list of pre- and post-conditions

Definition at line 314 of file clm4rm.cpp.

◆ merge_events()

void merge_events	(	clm4rm_event_list *	a,
		clm4rm_event_list *	b
	)

append two lists

Parameters

a	a list of OpenCL events
b	another list of OpenCL events

Definition at line 303 of file clm4rm.cpp.

◆ padded_rows()

int padded_rows	(	int	nrows,
		int	padding
	)

calculate the number of padded rows

Parameters

nrows	actual matrix rows
padding	desired padding (32, or 64)

Returns: number of padded rows

Definition at line 185 of file clm4rm.cpp.

◆ pre_count()

cl_uint pre_count ( clm4rm_conditions * cond )

Parameters

cond	a list of pre- and post-conditions

Returns: number of pre-conditions

Definition at line 331 of file clm4rm.cpp.

◆ pre_events()

cl_event* pre_events ( clm4rm_conditions * cond )

Parameters

cond	a list of pre- and post-conditions

Returns: pointer to list of pre-conditions

Definition at line 338 of file clm4rm.cpp.

◆ push_event()

cl_event* push_event ( clm4rm_conditions * cond )

reserve one post-condition event

Parameters

cond	a list of pre- and post-conditions

Returns: pointer to reserved event

Definition at line 348 of file clm4rm.cpp.

◆ pushed_event()

cl_event* pushed_event ( clm4rm_conditions * cond )

Parameters

cond	a list of pre- and post-conditions

Returns: pointer to last reserved event

Definition at line 357 of file clm4rm.cpp.

◆ release_conditions()

void release_conditions ( clm4rm_conditions * cond )

release conditions list

Parameters

cond	a list of pre- and post-conditions

Definition at line 297 of file clm4rm.cpp.

◆ release_events()

void release_events ( clm4rm_event_list * list )

release events

Parameters

list	a list of OpenCL events

Definition at line 290 of file clm4rm.cpp.

Variable Documentation

◆ allocated_size

size_t allocated_size

Definition at line 78 of file clm4rm.cpp.

◆ clm4rm_error

cl_int clm4rm_error

latest OpenCL result code. CL_SUCCESS indicates no error.

Definition at line 9 of file clm4rm.cpp.

◆ heap_size

size_t heap_size

size of allocated memory in bytes

Definition at line 78 of file clm4rm.cpp.

◆ max_group_size

size_t max_group_size

max. size of a work group

Definition at line 74 of file clm4rm.cpp.

◆ max_items

size_t max_items[3]

max. number of items in each dimension

Definition at line 75 of file clm4rm.cpp.

◆ max_object_size

size_t max_object_size

max. object allocation size

Definition at line 79 of file clm4rm.cpp.

◆ shared_mem_bytes

size_t shared_mem_bytes

size of shared memory in bytes

Definition at line 77 of file clm4rm.cpp.

◆ shared_mem_words

size_t shared_mem_words

size of shared memory in (32bit) words

Definition at line 77 of file clm4rm.cpp.

Detailed Description

Classes

Functions

basic definitions

matrix storage

global variables

event handling

Macro Definition Documentation

◆ BUFFERED

◆ CEILCOLS

◆ CEILDIV

◆ clm4rm_radix

◆ DATA_BYTES

◆ FLOOR

◆ IMAGE2D

◆ MAX_EVENTS

◆ MAX_TILE_M

◆ POW2

Typedef Documentation

◆ clm4rm_conditions

◆ clm4rm_event_list

◆ clmatrix_t

◆ gpuword

◆ size2_t

Function Documentation

◆ clcubic_mul()

◆ clm4rm_and()

◆ clm4rm_concat()

◆ clm4rm_copy()

◆ clm4rm_create()

◆ clm4rm_free()

◆ clm4rm_mul()

◆ clm4rm_or()

◆ clm4rm_query_diagonal()

◆ clm4rm_query_result()

◆ clm4rm_read()

◆ clm4rm_setup()

◆ clm4rm_stack()

◆ clm4rm_tear_down()

◆ clm4rm_write()

◆ clm4rm_zero_fill()

◆ clutri_mul()

◆ copy_back_matrix_data()

◆ copy_matrix_data()

◆ init_conditions()

◆ init_events()

◆ join_conditions()

◆ merge_conditions()

◆ merge_events()

◆ padded_rows()

◆ pre_count()

◆ pre_events()

◆ push_event()

◆ pushed_event()

◆ release_conditions()

◆ release_events()

Variable Documentation

◆ allocated_size

◆ clm4rm_error

◆ heap_size

◆ max_group_size

◆ max_items

◆ max_object_size

◆ shared_mem_bytes

◆ shared_mem_words