#if defined(__cplusplus) && !defined(_MSC_VER)
extern "C" {   // reconstructed: the paired guards around the C headers imply an extern "C" block
#endif

#include <m4ri/m4ri.h>       // added: mzd_t and friends (assumed; elided in the excerpt)
#include <m4ri/graycode.h>

#if defined(__cplusplus) && !defined(_MSC_VER)
}  // extern "C"
#endif

#include <gtest/gtest.h>

int nblocks(mzd_t const *A)
{
  int blockrows = __M4RI_MAX_MZD_BLOCKSIZE / A->rowstride;
  return (A->nrows + blockrows - 1) / blockrows;
}
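// nblocks() counts how many row blocks of at most `blockrows` rows cover A,
// rounding up. Illustration with hypothetical numbers: blockrows = 1024 and
// A->nrows = 2500 give (2500 + 1023) / 1024 = 3 blocks.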
size_t shared_mem_size = 32 * 1024;
  EXPECT_FALSE(true) << "too many rows " << A_nrows;
  for (int row = 0; row < M->nrows; ++row)
  {
    word *Mrow = M->rows[row];
    for (int col = 0; col < width; col += 2)
    {
      // Two 32-bit gpuwords make up one 64-bit m4ri word (reconstructed
      // here on the assumption that the gpuword at col holds the low half).
      word expected = (word)G[col * padded_rows + row]
                    | ((word)G[(col + 1) * padded_rows + row] << 32);
      EXPECT_EQ(Mrow[col >> 1], expected);
    }
  }
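      // Example with hypothetical values: if the low gpuword is 0x89ABCDEF
      // and the high gpuword is 0x01234567, the packed 64-bit word is
      // 0x0123456789ABCDEF.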
  EXPECT_EQ(Am->ncols, Bm->nrows);
  EXPECT_TRUE(Cm == NULL || Am->nrows == Cm->nrows);
  EXPECT_TRUE(Cm == NULL || Bm->ncols == Cm->ncols);

  if (Cm == NULL)   // reconstructed guard: the checks above allow a caller-supplied Cm
    Cm = mzd_init(Am->nrows, Bm->ncols);

                    Am->nrows, Am->ncols, Bm->ncols);
#define A_width CEILCOLS(A_ncols)
#define C_ncols B_ncols
#define C_width CEILCOLS(C_ncols)
#define B_nrows A_ncols
#define C_nrows A_nrows

#define read(M, row, col)     M[col * M##_nrows + row]
#define write(M, row, col, x) M[col * M##_nrows + row] = x

                     int A_nrows, int A_ncols, int B_ncols,
                     int A_nrows, int A_ncols, int B_ncols)
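  // The read/write macros above implement column-major addressing via token
  // pasting: read(A, r, c) expands to A[c * A_nrows + r], and
  // write(C, r, c, x) expands to C[c * C_nrows + r] = x, with C_nrows itself
  // a macro alias for A_nrows.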
  if (row_offset < A_nrows)
                     int A_nrows, int A_ncols, int B_ncols,
  size_t work_size[2] = { (size_t)(r1 - r0), (size_t)C_width };
  ASSERT_TRUE((work_size[0] % group_size[0]) == 0);
  ASSERT_TRUE((work_size[1] % group_size[1]) == 0);
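  // As in OpenCL 1.x, the global work size must be an exact multiple of the
  // work-group size in each dimension.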
  size_t group_id_0, group_id_1;
  size_t group_offset_0, group_offset_1;
  size_t local_size_0, local_size_1;
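  // Host-side emulation of OpenCL work-item addressing: the while loops
  // below step group offsets across the 2-D work size, and the get_*()
  // macros map OpenCL's built-in query functions onto these variables.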
#define get_group_id(i)   group_id_##i
#define get_global_id(i)  global_id_##i
#define get_local_size(i) local_size_##i
#define get_local_id(i)   local_id_##i

  while (group_offset_0 < work_size[0])
  {
    get_local_size(0) = MIN(group_size[0], work_size[0] - group_offset_0);

    while (group_offset_1 < work_size[1])
    {
      get_local_size(1) = MIN(group_size[1], work_size[1] - group_offset_1);
      local_count = get_local_size(0) * get_local_size(1);
      #pragma omp parallel num_threads(local_count)
      {
        int thread_num = omp_get_thread_num();
        int get_local_id(0) = thread_num % get_local_size(0);
        int get_local_id(1) = thread_num / get_local_size(0);

        int get_global_id(0) = group_offset_0 + get_local_id(0);
        int get_global_id(1) = group_offset_1 + get_local_id(1);
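        // As in OpenCL: a work-item's global id is its group's offset plus
        // its local id within the group.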
        EXPECT_EQ(get_local_size(1), 1);
        EXPECT_EQ(get_local_id(1), 0);
        EXPECT_EQ(get_group_id(1), get_global_id(1));
        int group_size = get_local_size(0);   // note: shadows the group_size[] array above
        int ci = get_group_id(1);
        int cj = r0 + get_global_id(0);
        for (int ai = 0; ai < A_ncols; ai += k)
          for (int sj = 0; sj < k1; sj += group_size) {
          for (int sj = 0; sj < POW2(k1); sj += group_size) {
            A1 = read(A, cj, ablock + 1);
        write(C, cj, ci, Csum);
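        // Method of the Four Russians (M4RM): the POW2(k1) loop tabulates
        // all 2^k1 combinations of k1 staged rows of B once per chunk, and
        // each output word of C is then accumulated by table lookup on
        // k1-bit pieces of A.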
      group_offset_1 += get_local_size(1);

    group_offset_0 += get_local_size(0);
  EXPECT_EQ(Am->ncols, Bm->nrows);
  EXPECT_TRUE(Cm == NULL || Am->nrows == Cm->nrows);
  EXPECT_TRUE(Cm == NULL || Bm->ncols == Cm->ncols);

  EXPECT_GE(Am->nrows, 1);
  EXPECT_GE(Am->ncols, 1);
  EXPECT_GE(Bm->ncols, 1);

  if (Cm == NULL)   // reconstructed guard, as in proto_bool_mul_m4rm above
    Cm = mzd_init(Am->nrows, Bm->ncols);
                  Am->ncols, Bm->ncols);
                     int A_nrows, int A_ncols, int B_ncols)
  size_t work_size[2]  = { (size_t)A_nrows, (size_t)C_width };
  size_t group_size[2] = { 32, 1 };
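  // A group width of 32 matches the 32-bit gpuword: each of the 32 threads
  // in a group stages one word of B in the table T, and each thread then
  // accumulates one 32-bit word of a row of C.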
  ASSERT_TRUE((work_size[0] % group_size[0]) == 0);
  ASSERT_TRUE((work_size[1] % group_size[1]) == 0);

  size_t group_id_0, group_id_1;
  size_t group_offset_0, group_offset_1;
  size_t local_size_0, local_size_1;
  while (group_offset_0 < work_size[0])
  {
    get_local_size(0) = MIN(group_size[0], work_size[0] - group_offset_0);

    while (group_offset_1 < work_size[1])
    {
      get_local_size(1) = MIN(group_size[1], work_size[1] - group_offset_1);
      local_count = get_local_size(0) * get_local_size(1);

      EXPECT_EQ(get_local_size(0), 32);
      EXPECT_EQ(get_local_size(1), 1);
      EXPECT_EQ(local_count, 32);
      #pragma omp parallel num_threads(local_count)
      {
        int thread_num = omp_get_thread_num();
        int get_local_id(0) = thread_num % get_local_size(0);
        int get_local_id(1) = thread_num / get_local_size(0);

        EXPECT_EQ(get_local_size(0), 32);
        EXPECT_EQ(get_local_size(1), 1);
        EXPECT_EQ(get_local_id(1), 0);

        int get_global_id(0) = group_offset_0 + get_local_id(0);
        int get_global_id(1) = group_offset_1 + get_local_id(1);
        int ci = get_global_id(1);
        int cj = get_global_id(0);
        int lcj = get_local_id(0);
        for (int ai = 0; ai < A_width; ++ai)
          T[lcj] = read(B, 32*ai + lcj, ci);
          for (int y = 0; y < 32; ++y, a >>= 1)
            Csum |= (a & 1) * T[y];
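          // Branch-free select: (a & 1) is 0 or 1, so the product either
          // contributes T[y] or nothing. OR-accumulation yields the boolean
          // (AND/OR) matrix product that proto_bool_mul_cubic provides.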
        write(C, cj, ci, Csum);
      group_offset_1 += get_local_size(1);

    group_offset_0 += get_local_size(0);
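// A minimal usage sketch, not part of the original file: the test name, the
// dimensions, and k = 5 are illustrative assumptions. It cross-checks the two
// prototype entry points against each other, since both compute the same
// boolean matrix product.
TEST(ProtoMulSketch, M4rmMatchesCubic)
{
  mzd_t *A = mzd_init(64, 96);    // row counts chosen as multiples of 32
  mzd_t *B = mzd_init(96, 128);
  mzd_randomize(A);
  mzd_randomize(B);

  mzd_t *C1 = proto_bool_mul_m4rm(NULL, A, B, 5);
  mzd_t *C2 = proto_bool_mul_cubic(NULL, A, B, 0);  // last argument is unused

  EXPECT_TRUE(mzd_equal(C1, C2));

  mzd_free(A);
  mzd_free(B);
  mzd_free(C1);
  mzd_free(C2);
}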
unsigned int gpuword
    a GPU word has 32 bits
size_t shared_mem_words
    size of shared memory in (32-bit) words
size_t max_group_size
    max. size of a work group
#define read(M, row, col)
#define write(M, row, col, x)
int nblocks(mzd_t const *A)
int padded_rows(int nrows, int padding)
    calculate the number of padded rows
int adjust_k(int k, rci_t A_nrows)
gpuword *copy_matrix_data(gpuword *G, const mzd_t *M, int padded_rows)
    create a column-major copy from an mzd_t matrix
void copy_back_matrix_data(mzd_t *M, const gpuword *G, int padded_rows)
    copy back a column-major matrix
bool assertEquals(const mzd_t *M, const gpuword *G, int padded_rows)
gpuword proto_read_bits(gpuword a0, gpuword a1, int spot, int n)
gpuword combinate(gpuword x, gpuword *T)
void proto_mul_m4rm(gpuword *C, const gpuword *A, const gpuword *B, int k, int A_nrows, int A_ncols, int B_ncols)
void proto_mul_m4rm_block(gpuword *C, const gpuword *A, const gpuword *B, int k, int A_nrows, int A_ncols, int B_ncols, int r0, int r1)
void proto_mul_cubic(gpuword *C, const gpuword *A, const gpuword *B, int A_nrows, int A_ncols, int B_ncols)
mzd_t *proto_bool_mul_m4rm(mzd_t *Cm, mzd_t const *Am, mzd_t const *Bm, int k)
mzd_t *proto_bool_mul_cubic(mzd_t *Cm, mzd_t const *Am, mzd_t const *Bm, int)