![]() |
Fréchet View
1.6.0
A Tool for Exploring Fréchet Distance Algorithms
|
M4RM on the GPU. Performs Boolean matrix arithmetic on the GPU. Based on OpenCL.
Kernels are provided for arithmetic, such as bitwise operations, multiplication, etc.
Methods are provided for allocating and copying data from CPU memory to GPU memory and back.
Definition in file clm4rm.h.
#include <CL/opencl.h>
#include <m4ri/mzd.h>
Go to the source code of this file.
Classes | |
struct | clmatrix_t |
OpenCL boolean matrix data structure. Data is arranged in 32 bit words. More... | |
struct | clm4rm_event_list |
a list of cl_events; used by clm4rm_conditions to keep track of scheduled jobs in the OpenCL queue. More... | |
struct | clm4rm_conditions |
Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be established through cl_event. More... | |
Functions | |
matrix operations | |
clmatrix_t * | clm4rm_create (rci_t rows, rci_t cols, int rowpadding, int read_only, cl_context ctx) |
create an empty matrix More... | |
clmatrix_t * | clm4rm_copy (const mzd_t *host_matrix, int rowpadding, int read_only, cl_context ctx) |
create a copy from a matrix in M4RI format More... | |
void | clm4rm_zero_fill (clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond) |
Fill a matrix with zero data. The operation is scheduled for asynchronous execution on the GPU. The function returns immediately. Use post-condition events to wait for the execution of the operation. More... | |
void | clm4rm_write (clmatrix_t *gpu_matrix, const mzd_t *host_matrix, cl_command_queue queue, clm4rm_conditions *cond) |
Copy matrix data from host memory to GPU. The operation is scheduled for asynchronous execution on the GPU. The function returns immediately. Use post-condition events to wait for the execution of the operation. More... | |
mzd_t * | clm4rm_read (mzd_t *host_matrix, clmatrix_t *gpu_matrix, cl_command_queue queue, clm4rm_conditions *cond) |
copy matrix from gpu memory to host More... | |
void | clm4rm_free (clmatrix_t *gpu_matrix) |
release memory (CPU and GPU) More... | |
void | clm4rm_mul (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond) |
Boolean matrix multiplication on the GPU using the method of the Four Russians. C := A * B. More... | |
void | clcubic_mul (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond) |
Boolean matrix multiplication on the GPU using nested loops. C := A*B. More... | |
void | clutri_mul (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, size2_t max_tile, cl_command_queue queue, clm4rm_conditions *cond) |
Boolean matrix multiplication on the GPU using nested loops. C := A*B. Assumes matrices to be upper triangular. More... | |
void | clm4rm_stack (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond) |
concatenate two matrices More... | |
void | clm4rm_concat (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond) |
concatenate two matrices More... | |
void | clm4rm_or (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond) |
perform element-wise logical disjunction (OR) More... | |
void | clm4rm_and (clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond) |
perform element-wise logical conjunction (AND). For each entry, compute C_ij := A_ij & B_ij. All input matrices must have the same size. More... | |
cl_mem | clm4rm_query_diagonal (clmatrix_t *M, cl_context ctx, cl_command_queue queue, clm4rm_conditions *cond) |
find a non-zero entry on the diagonal of a matrix. Return the column/row of the first non-zero entry, or -1 if all entries are zero. The operation does not immediately return a result. It performs asynchronously. Use the post-conditions variables to wait for the execution of the operation, then use clm4rm_query_result to retrieve the actual result. More... | |
int | clm4rm_query_result (cl_mem result_buffer, cl_command_queue queue, clm4rm_conditions *cond) |
examine the result of a previous call to clm4rm_query_diagonal More... | |
basic definitions | |
#define | clm4rm_radix 32 |
word size. for compatibility with GPU memory layout, we operate on 32 bit words. More... | |
#define | IMAGE2D 0 |
#define | BUFFERED 1 |
#define | MAX_TILE_M 6 |
#define | CEILDIV(x, y) (((x)+(y)-1)/(y)) |
integer division, rounding up (ceiling of x/y) More... | |
#define | FLOOR(x, y) ((y)*((x)/(y))) |
round x down to a multiple of y More... | |
#define | CEILCOLS(i) CEILDIV(i,clm4rm_radix) |
integer division by number of bits per word More... | |
#define | POW2(i) (((gpuword)1)<<(i)) |
typedef uint32_t | gpuword |
word size of GPU data (32 bits) More... | |
typedef size_t | size2_t[2] |
two-dimensional size; used for various OpenCL parameters More... | |
matrix storage | |
#define | DATA_BYTES(m) ( (m)->padded_rows * (m)->width * sizeof(gpuword) ) |
typedef struct clmatrix_t | clmatrix_t |
int | padded_rows (int nrows, int padding) |
calculate the number of padded rows More... | |
gpuword * | copy_matrix_data (gpuword *dest, const mzd_t *src, int padded_rows) |
create a column-major copy from an mzd_t matrix More... | |
void | copy_back_matrix_data (mzd_t *dest, const gpuword *src, int padded_rows) |
copy back a column-major matrix More... | |
global variables | |
cl_int | clm4rm_error |
latest OpenCL result code. CL_SUCCESS indicates no error. More... | |
size_t | max_group_size |
max. size of a work group More... | |
size_t | max_items [3] |
max. number of items in each dimension More... | |
size_t | shared_mem_bytes |
size of shared memory in bytes More... | |
size_t | shared_mem_words |
size of shared memory in (32bit) words More... | |
size_t | heap_size |
size of allocated memory in bytes More... | |
size_t | allocated_size |
size_t | max_object_size |
max. object allocation size More... | |
cl_int | clm4rm_setup (const char *cl_kernel_directory, cl_context ctx, cl_device_id device) |
load OpenCL kernels and set up parameters More... | |
void | clm4rm_tear_down (cl_context ctx, cl_device_id device) |
release OpenCL resources More... | |
event handling | |
#define | MAX_EVENTS 6 |
typedef struct clm4rm_event_list | clm4rm_event_list |
typedef struct clm4rm_conditions | clm4rm_conditions |
void | init_events (clm4rm_event_list *list) |
reset events list More... | |
void | release_events (clm4rm_event_list *list) |
release events More... | |
void | merge_events (clm4rm_event_list *a, clm4rm_event_list *b) |
append two lists More... | |
void | init_conditions (clm4rm_conditions *cond) |
reset conditions list More... | |
void | release_conditions (clm4rm_conditions *cond) |
release conditions list More... | |
void | join_conditions (clm4rm_conditions *cond) |
called when the pre-conditions are met. The post-conditions become new pre-conditions. More... | |
void | merge_conditions (clm4rm_conditions *a, clm4rm_conditions *b) |
merge pre-conditions into one list More... | |
cl_uint | pre_count (clm4rm_conditions *cond) |
cl_event * | pre_events (clm4rm_conditions *cond) |
cl_event * | push_event (clm4rm_conditions *cond) |
reserve one post-condition event More... | |
cl_event * | pushed_event (clm4rm_conditions *cond) |
#define BUFFERED 1 |
#define CEILCOLS | ( | i | ) | CEILDIV(i,clm4rm_radix) |
#define CEILDIV | ( | x, | |
y | |||
) | (((x)+(y)-1)/(y)) |
#define clm4rm_radix 32 |
#define DATA_BYTES | ( | m | ) | ( (m)->padded_rows * (m)->width * sizeof(gpuword) ) |
#define FLOOR | ( | x, | |
y | |||
) | ((y)*((x)/(y))) |
#define IMAGE2D 0 |
OpenCL data can be stored as Buffer objects or Image objects, the latter being supposedly faster (really?). Within a kernel, an image buffer can be only read-only or write-only, which is alright with us.
Turns out that the difference is actually marginal. Note: texture memory is limited (about 2G on a Tesla V100). Global buffer memory is not.
#define MAX_TILE_M 6 |
typedef struct clm4rm_conditions clm4rm_conditions |
typedef struct clm4rm_event_list clm4rm_event_list |
typedef struct clmatrix_t clmatrix_t |
typedef size_t size2_t[2] |
void clcubic_mul | ( | clmatrix_t * | C, |
clmatrix_t * | A, | ||
clmatrix_t * | B, | ||
size2_t | max_tile, | ||
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
Boolean matrix multiplication on the GPU using nested loops. C := A*B.
The function returns immediately. The operation is scheduled for asynchronous execution on the GPU. Use post-condition events to wait for the execution of the operation.
C | a matrix structure; receives the result |
A | a matrix structure |
B | a matrix structure |
max_tile | max. size of tiles |
queue | OpenCL command queue |
cond | keeps track of pre-conditions and newly created post-conditions |
Matrix is partitioned into three parts:
+----------------------------------+----+ | | | | | | | | | | | | | | - - -+ - - -+ - - -+ - - -+ - - -| | | | | | | | | | | | | | | | - - -+ - - -+ - - -+ - - -+ - - -| | | |REST| | | | | | |RIGHT | | | | - - -+ - - -+ - - -+ - - -+ - - -| | | | | | | | | | | | | | | | - - -+ - - -+ - - -+ - - -+ - - -| | | | | | | | | | | | | | | |------+------+------+------+------+----| | REST BOTTOM | | - - -+ - - -+ - - -+ - - -+ - - -+ - -+
Definition at line 132 of file clm4rm_multiplication.cpp.
void clm4rm_and | ( | clmatrix_t * | C, |
clmatrix_t * | A, | ||
clmatrix_t * | B, | ||
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
perform element-wise logical conjunction (AND). For each entry, compute C_ij := A_ij & B_ij. All input matrices must have the same size.
C | a Boolean matrix; holds the result on return |
A | an input Boolean matrix |
B | an input Boolean matrix |
queue | OpenCL command queue |
cond | keeps track of pre-conditions and newly created post-conditions |
Definition at line 45 of file clm4rm_bitwise.cpp.
void clm4rm_concat | ( | clmatrix_t * | C, |
clmatrix_t * | A, | ||
clmatrix_t * | B, | ||
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
concatenate two matrices
Definition at line 160 of file clm4rm_bitwise.cpp.
clmatrix_t* clm4rm_copy | ( | const mzd_t * | host_matrix, |
int | rowpadding, | ||
int | read_only, | ||
cl_context | ctx | ||
) |
create a copy from a matrix in M4RI format
host_matrix | matrix data in M4RI format |
rowpadding | desired padding |
read_only | if 1, create a read-only buffer in GPU memory |
ctx | OpenCL context |
Definition at line 254 of file clm4rm.cpp.
clmatrix_t* clm4rm_create | ( | rci_t | rows, |
rci_t | cols, | ||
int | rowpadding, | ||
int | read_only, | ||
cl_context | ctx | ||
) |
create an empty matrix
rows | number of rows |
cols | number of columns |
rowpadding | pad rows to multiples of 32, or 64 |
read_only | 1 if the GPU memory buffer should be read only |
ctx | OpenCL context |
Definition at line 233 of file clm4rm.cpp.
void clm4rm_free | ( | clmatrix_t * | gpu_matrix | ) |
release memory (CPU and GPU)
gpu_matrix | a matrix structure |
release gpu memory
Definition at line 112 of file clm4rm_bitwise.cpp.
void clm4rm_mul | ( | clmatrix_t * | C, |
clmatrix_t * | A, | ||
clmatrix_t * | B, | ||
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
Boolean matrix multiplication on the GPU using the method of the Four Russians. C := A * B.
The function returns immediately. The operation is scheduled for asynchronous execution on the GPU. Use post-condition events to wait for the execution of the operation.
C | a matrix structure; receives the result |
A | a matrix structure |
B | a matrix structure |
queue | OpenCL command queue |
cond | keeps track of pre-conditions and newly created post-conditions |
Definition at line 30 of file clm4rm_multiplication.cpp.
void clm4rm_or | ( | clmatrix_t * | C, |
clmatrix_t * | A, | ||
clmatrix_t * | B, | ||
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
perform element-wise logical disjunction (OR)
Definition at line 23 of file clm4rm_bitwise.cpp.
cl_mem clm4rm_query_diagonal | ( | clmatrix_t * | M, |
cl_context | ctx, | ||
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
find a non-zero entry on the diagonal of a matrix. Return the column/row of the first non-zero entry, or -1 if all entries are zero. The operation does not immediately return a result. It performs asynchronously. Use the post-conditions variables to wait for the execution of the operation, then use clm4rm_query_result to retrieve the actual result.
M | a matrix |
ctx | OpenCL context |
queue | OpenCL command queue |
cond | keeps track of pre-conditions and newly created post-conditions |
Definition at line 67 of file clm4rm_bitwise.cpp.
int clm4rm_query_result | ( | cl_mem | result_buffer, |
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
examine the result of a previous call to clm4rm_query_diagonal
result_buffer | buffer that holds one integer. Was returned by clm4rm_query_diagonal. Will be released. |
queue | OpenCL command queue |
cond | keeps track of pre-conditions and newly created post-conditions |
Definition at line 94 of file clm4rm_bitwise.cpp.
mzd_t* clm4rm_read | ( | mzd_t * | host_matrix, |
clmatrix_t * | gpu_matrix, | ||
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
copy matrix from gpu memory to host
host_matrix | matrix data in M4RI format; if nullptr, allocate a new one |
gpu_matrix | a matrix structure |
queue | OpenCL command queue |
cond | keeps track of pre-conditions and newly created post-conditions |
Definition at line 406 of file clm4rm.cpp.
cl_int clm4rm_setup | ( | const char * | cl_kernel_directory, |
cl_context | ctx, | ||
cl_device_id | device | ||
) |
load OpenCL kernels and set up parameters
cl_kernel_directory | location on disk where the kernel source code files (*.cl) are stored |
ctx | OpenCL context |
device | OpenCL device |
Definition at line 84 of file clm4rm.cpp.
void clm4rm_stack | ( | clmatrix_t * | C, |
clmatrix_t * | A, | ||
clmatrix_t * | B, | ||
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
concatenate two matrices
Definition at line 123 of file clm4rm_bitwise.cpp.
void clm4rm_tear_down | ( | cl_context | ctx, |
cl_device_id | device | ||
) |
release OpenCL resources
ctx | OpenCL context |
device | OpenCL device |
Definition at line 146 of file clm4rm.cpp.
void clm4rm_write | ( | clmatrix_t * | gpu_matrix, |
const mzd_t * | host_matrix, | ||
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
Copy matrix data from host memory to GPU. The operation is scheduled for asynchronous execution on the GPU. The function returns immediately. Use post-condition events to wait for the execution of the operation.
gpu_matrix | a matrix structure |
host_matrix | matrix data in M4RI format |
queue | OpenCL command queue |
cond | keeps track of pre-conditions and newly created post-conditions |
M4RI data are 64-bit unsigned int; M4RM data are supposed to be 32-bit unsigned int.
Casting and copying is sane if both platforms are LITTLE-ENDIAN.
Definition at line 382 of file clm4rm.cpp.
void clm4rm_zero_fill | ( | clmatrix_t * | gpu_matrix, |
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
Fill a matrix with zero data. The operation is scheduled for asynchronous execution on the GPU. The function returns immediately. Use post-condition events to wait for the execution of the operation.
gpu_matrix | a matrix structure |
queue | OpenCL command queue |
cond | keeps track of pre-conditions and newly created post-conditions |
Definition at line 364 of file clm4rm.cpp.
void clutri_mul | ( | clmatrix_t * | C, |
clmatrix_t * | A, | ||
clmatrix_t * | B, | ||
size2_t | max_tile, | ||
cl_command_queue | queue, | ||
clm4rm_conditions * | cond | ||
) |
Boolean matrix multiplication on the GPU using nested loops. C := A*B. Assumes matrices to be upper triangular.
The function returns immediately. The operation is scheduled for asynchronous execution on the GPU. Use post-condition events to wait for the execution of the operation.
C | a matrix structure; receives the result |
A | an upper triangular matrix structure |
B | an upper triangular matrix structure |
max_tile | max. size of tiles |
queue | OpenCL command queue |
cond | keeps track of pre-conditions and newly created post-conditions |
Matrix is partitioned into three parts
+----------------------------------+----+ | | | | | | | | | | | | | | - - -+ - - -+ - - -+ - - -+ - - -| | | | | | | | | | | | | | | | + - - -+ - - -+ - - -+ - - -| | | |REST| | | | | |RIGHT | | | | + - - -+ - - -+ - - -| | | | | | | | | | | (empty) | | | + - - -+ - - -| | | | | | | | | | | | | - - - - - - + - - - - - - + - - -+ | | (empty) | | | - - -+ - - -+ - - -+ - - -+ - - -+ - -+
Definition at line 339 of file clm4rm_multiplication.cpp.
void copy_back_matrix_data | ( | mzd_t * | dest, |
const gpuword * | src, | ||
int | padded_rows | ||
) |
copy back a column-major matrix
dest | destination data in M4RI format |
src | input data |
padded_rows | number of rows (padded) |
Definition at line 460 of file clm4rm.cpp.
create a column-major copy from an mzd_t matrix
dest | destination data in clmatrix format |
src | input data in M4RI format |
padded_rows | number of rows (padded) |
Definition at line 436 of file clm4rm.cpp.
void init_conditions | ( | clm4rm_conditions * | cond | ) |
reset conditions list
cond | a list of pre- and post-conditions |
Definition at line 284 of file clm4rm.cpp.
void init_events | ( | clm4rm_event_list * | list | ) |
void join_conditions | ( | clm4rm_conditions * | cond | ) |
called when the pre-conditions are met. The post-conditions become new pre-conditions.
cond | a list of pre- and post-conditions |
Definition at line 319 of file clm4rm.cpp.
void merge_conditions | ( | clm4rm_conditions * | a, |
clm4rm_conditions * | b | ||
) |
merge pre-conditions into one list
a | a list of pre- and post-conditions |
b | another list of pre- and post-conditions |
Definition at line 314 of file clm4rm.cpp.
void merge_events | ( | clm4rm_event_list * | a, |
clm4rm_event_list * | b | ||
) |
append two lists
a | a list of OpenCL events |
b | another list of OpenCL events |
Definition at line 303 of file clm4rm.cpp.
int padded_rows | ( | int | nrows, |
int | padding | ||
) |
calculate the number of padded rows
nrows | actual matrix rows |
padding | desired padding (32, or 64) |
Definition at line 185 of file clm4rm.cpp.
cl_uint pre_count | ( | clm4rm_conditions * | cond | ) |
cond | a list of pre- and post-conditions |
Definition at line 331 of file clm4rm.cpp.
cl_event* pre_events | ( | clm4rm_conditions * | cond | ) |
cond | a list of pre- and post-conditions |
Definition at line 338 of file clm4rm.cpp.
cl_event* push_event | ( | clm4rm_conditions * | cond | ) |
reserve one post-condition event
cond | a list of pre- and post-conditions |
Definition at line 348 of file clm4rm.cpp.
cl_event* pushed_event | ( | clm4rm_conditions * | cond | ) |
cond | a list of pre- and post-conditions |
Definition at line 357 of file clm4rm.cpp.
void release_conditions | ( | clm4rm_conditions * | cond | ) |
release conditions list
cond | a list of pre- and post-conditions |
Definition at line 297 of file clm4rm.cpp.
void release_events | ( | clm4rm_event_list * | list | ) |
size_t allocated_size |
Definition at line 78 of file clm4rm.cpp.
cl_int clm4rm_error |
latest OpenCL result code. CL_SUCCESS indicates no error.
Definition at line 9 of file clm4rm.cpp.
size_t heap_size |
size of allocated memory in bytes
Definition at line 78 of file clm4rm.cpp.
size_t max_group_size |
max. size of a work group
Definition at line 74 of file clm4rm.cpp.
size_t max_items[3] |
max. number of items in each dimension
Definition at line 75 of file clm4rm.cpp.
size_t max_object_size |
max. object allocation size
Definition at line 79 of file clm4rm.cpp.
size_t shared_mem_bytes |
size of shared memory in bytes
Definition at line 77 of file clm4rm.cpp.
size_t shared_mem_words |
size of shared memory in (32bit) words
Definition at line 77 of file clm4rm.cpp.