![]() |
Fréchet View
1.6.0
A Tool for Exploring Fréchet Distance Algorithms
|
Go to the source code of this file.
Macros | |
| #define | read_only_global __global gpuword* |
| #define | write_only_global __global gpuword* |
| #define | read(M, row, col) M[(col)*M ## _nrows + row] |
| #define | write(M, row, col, x) M[(col)*M ## _nrows + row]=x |
| #define | BUFFERED 1 |
| #define | tile_width (tile_n*TILE_M) |
| #define | tile_ncols (tile_width*32) |
| #define | tile_nrows (tile_n*TILE_M*32) |
| #define | col_stride (34*tile_n*TILE_M+1) |
| #define | buf(M, row, col) M##_buf[buffer_address(row,col,tile_n)] |
| #define | for_tile |
| #define | unrolled_for_tile |
| #define | CEILCOLS(i) ((i+31)/32) |
| #define | MIN(x, y) (((x) < (y)) ? (x) : (y)) |
| #define | POW2(x) (((gpuword)1) << x) |
| #define | A_width CEILCOLS(A_ncols) |
| #define | B_nrows 32*A_width |
| #define | C_nrows A_nrows |
| #define | A_ncols A_nrows |
Typedefs | |
| typedef unsigned int | gpuword |
| a GPU word has 32 bits More... | |
Functions | |
| int | buffer_address (int row, int col, int tile_n) |
| offset into shared memory buffers More... | |
| __kernel void | clcubic_mul (write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows, int A_ncols) |
| OpenCL kernel for cubic matrix multiplication. More... | |
| __kernel void | clutri_mul (write_only_global C, read_only_global A, read_only_global B, __local gpuword *A_buf, __local gpuword *B_buf, int A_nrows) |
| OpenCL kernel for cubic upper triangular matrix multiplication. More... | |
| #define A_ncols A_nrows |
Definition at line 80 of file clcubic_mul.cl.
| #define B_nrows 32*A_width |
Definition at line 81 of file clcubic_mul.cl.
| #define buf | ( | M, | |
| row, | |||
| col | |||
| ) | M##_buf[buffer_address(row,col,tile_n)] |
Definition at line 62 of file clcubic_mul.cl.
| #define BUFFERED 1 |
Definition at line 28 of file clcubic_mul.cl.
| #define C_nrows A_nrows |
Definition at line 82 of file clcubic_mul.cl.
| #define CEILCOLS | ( | i | ) | ((i+31)/32) |
Definition at line 76 of file clcubic_mul.cl.
| #define col_stride (34*tile_n*TILE_M+1) |
Definition at line 37 of file clcubic_mul.cl.
| #define for_tile |
Definition at line 65 of file clcubic_mul.cl.
| #define MIN | ( | x, | |
| y | |||
| ) | (((x) < (y)) ? (x) : (y)) |
Definition at line 77 of file clcubic_mul.cl.
| #define POW2 | ( | x | ) | (((gpuword)1) << x) |
Definition at line 78 of file clcubic_mul.cl.
| #define read | ( | M, | |
| row, | |||
| col | |||
| ) | M[(col)*M ## _nrows + row] |
Definition at line 23 of file clcubic_mul.cl.
| #define read_only_global __global gpuword* |
Definition at line 21 of file clcubic_mul.cl.
| #define tile_ncols (tile_width*32) |
Definition at line 35 of file clcubic_mul.cl.
| #define tile_nrows (tile_n*TILE_M*32) |
Definition at line 36 of file clcubic_mul.cl.
| #define tile_width (tile_n*TILE_M) |
Definition at line 34 of file clcubic_mul.cl.
| #define unrolled_for_tile |
Definition at line 69 of file clcubic_mul.cl.
| #define write | ( | M, | |
| row, | |||
| col, | |||
| x | |||
| ) | M[(col)*M ## _nrows + row]=x |
Definition at line 24 of file clcubic_mul.cl.
| #define write_only_global __global gpuword* |
Definition at line 22 of file clcubic_mul.cl.
| typedef unsigned int gpuword |
a GPU word has 32 bits
Definition at line 74 of file clcubic_mul.cl.
| int buffer_address (int row, int col, int tile_n) | inline |
offset into shared memory buffers
Shared Memory buffers are aligned to avoid bank conflicts
The column stride is 32*tile_n*tile_m + 32*tile_n*tile_m/16 + 1 = 34*tile_n*tile_m + 1.
| row | tile row |
| col | tile column |
| tile_n | number of rows in tile |
Definition at line 55 of file clcubic_mul.cl.
| __kernel void clcubic_mul | ( | write_only_global | C, |
| read_only_global | A, | ||
| read_only_global | B, | ||
| __local gpuword * | A_buf, | ||
| __local gpuword * | B_buf, | ||
| int | A_nrows, | ||
| int | A_ncols | ||
| ) |
OpenCL kernel for cubic matrix multiplication.
Perform Boolean matrix multiplication C += A * B
Uses a tiled cubic approach. Tiles of A and B are buffered in shared memory.
| C | destination matrix |
| A | source matrix |
| B | source matrix |
| A_buf | shared memory for buffering tiles |
| B_buf | shared memory for buffering tiles |
| A_nrows | number of rows in A == number of rows in C |
| A_ncols | number of columns in A == number of rows in B |
Important: global memory access must be coalesced. Each half warp (= set of 16 threads) must access consecutive addresses. Matrix data is stored in column-major order, so it is imperative that consecutive rows are accessed. row0+trow is the relevant variable. The work-group height (get_local_size(0)) is guaranteed to be a multiple of 32. The "for_tile" loop is arranged to read consecutive words (32*tile_n words per iteration).
For shared memory it is imperative to avoid bank conflicts. Use odd-aligned access patterns whenever possible.
Definition at line 112 of file clcubic_mul.cl.
| __kernel void clutri_mul | ( | write_only_global | C, |
| read_only_global | A, | ||
| read_only_global | B, | ||
| __local gpuword * | A_buf, | ||
| __local gpuword * | B_buf, | ||
| int | A_nrows | ||
| ) |
OpenCL kernel for cubic upper triangular matrix multiplication.
Perform Boolean matrix multiplication C += A * B. A, B and C are assumed to be upper triangular, square matrices.
Uses a tiled cubic approach. Tiles of A and B are buffered in shared memory.
| C | destination matrix |
| A | source matrix |
| B | source matrix |
| A_buf | shared memory for buffering tiles |
| B_buf | shared memory for buffering tiles |
| A_nrows | number of rows in A,B,C == number of cols in A,B,C |
Definition at line 200 of file clcubic_mul.cl.