Fréchet View  1.6.0
A Tool for Exploring Fréchet Distance Algorithms
clm4rm_bitwise.cpp
Go to the documentation of this file.
1 
2 #include <clm4rm.h>
3 #include <qdebug.h>
4 
6 {
    // Sanity check (via Q_ASSERT) that A has the same number of rows and
    // columns as both B and C. B and C are only compared against A, never
    // directly against each other.
7  Q_ASSERT(A->nrows==B->nrows);
8  Q_ASSERT(A->nrows==C->nrows);
9  Q_ASSERT(A->ncols==B->ncols);
10  Q_ASSERT(A->ncols==C->ncols);
    // The width / rowstride comparisons below are disabled in the original;
    // the reason is not visible from this listing.
11 // Q_ASSERT(A->width==B->width);
12 // Q_ASSERT(A->width==C->width);
13 // Q_ASSERT(A->rowstride==B->rowstride);
14 // Q_ASSERT(A->rowstride==C->rowstride);
15 }
16 
// OpenCL kernel handles used by the functions in this file. They are only
// declared here; the generated documentation index locates the definitions
// of the and/or/copy/query_diagonal handles in clm4rm.cpp.
17 extern cl_kernel clm4rm_and_kernel;
18 extern cl_kernel clm4rm_or_kernel;
19 extern cl_kernel clm4rm_copy_kernel;
20 extern cl_kernel clm4rm_query_diagonal_kernel;
21 
22 
24  cl_command_queue queue, clm4rm_conditions* cond)
25 {
    // Enqueue clm4rm_or_kernel with kernel arguments 0..2 = (C, A, B).
    // The generated documentation describes this function as the
    // element-wise logical disjunction (OR).
    //
    // The kernel waits on the pre-events of `cond`, and the event it produces
    // is stored in the slot handed out by push_event(cond).
    //
    // NOTE(review): every clSetKernelArg result is written to clm4rm_error and
    // immediately overwritten by the next call; the value left by
    // clEnqueueNDRangeKernel is not inspected here either.
26  assertMatrixSize(C,A,B);
27  Q_ASSERT(clm4rm_or_kernel!=NULL);
28 
29  clm4rm_error = clSetKernelArg(clm4rm_or_kernel, 0, sizeof(cl_mem), &C->data);
30  clm4rm_error = clSetKernelArg(clm4rm_or_kernel, 1, sizeof(cl_mem), &A->data);
31  clm4rm_error = clSetKernelArg(clm4rm_or_kernel, 2, sizeof(cl_mem), &B->data);
32 
33  // can't do in-place operation on Image2D
    // (&& binds tighter than ||, so this reads: !IMAGE2D, or C differs from
    // both A and B)
34  Q_ASSERT(! IMAGE2D || (C->data!=A->data) && (C->data!=B->data));
35 
    // 2-D global range: A->nrows by A->width (width = number of words with
    // valid bits per row); the local work size is left to the implementation.
36  cl_uint work_dim = 2;
37  size_t work_size[2] = { (size_t)A->nrows, (size_t)A->width };
38  clm4rm_error = clEnqueueNDRangeKernel(queue, clm4rm_or_kernel,
39  work_dim, NULL, work_size, NULL,
40  pre_count(cond), pre_events(cond), push_event(cond));
41  Q_ASSERT(pushed_event(cond) != NULL);
42 }
43 
44 
46  cl_command_queue queue, clm4rm_conditions* cond)
47 {
    // Enqueue clm4rm_and_kernel with kernel arguments 0..2 = (C, A, B).
    // The generated documentation describes this function as the
    // element-wise logical conjunction (AND): C_ij := A_ij & B_ij.
    //
    // The kernel waits on the pre-events of `cond`, and the event it produces
    // is stored in the slot handed out by push_event(cond).
    //
    // NOTE(review): every clSetKernelArg result is written to clm4rm_error and
    // immediately overwritten by the next call; the value left by
    // clEnqueueNDRangeKernel is not inspected here either.
48  assertMatrixSize(C,A,B);
49  Q_ASSERT(clm4rm_and_kernel!=NULL);
50 
51  clm4rm_error = clSetKernelArg(clm4rm_and_kernel, 0, sizeof(cl_mem), &C->data);
52  clm4rm_error = clSetKernelArg(clm4rm_and_kernel, 1, sizeof(cl_mem), &A->data);
53  clm4rm_error = clSetKernelArg(clm4rm_and_kernel, 2, sizeof(cl_mem), &B->data);
54 
55  // can't do in-place operation on Image2D
    // (&& binds tighter than ||, so this reads: !IMAGE2D, or C differs from
    // both A and B)
56  Q_ASSERT(! IMAGE2D || (C->data!=A->data) && (C->data!=B->data));
57 
    // 2-D global range: A->nrows by A->width (width = number of words with
    // valid bits per row); the local work size is left to the implementation.
58  cl_uint work_dim = 2;
59  size_t work_size[2] = { (size_t)A->nrows, (size_t)A->width };
60  clm4rm_error = clEnqueueNDRangeKernel(queue, clm4rm_and_kernel,
61  work_dim, NULL, work_size, NULL,
62  pre_count(cond), pre_events(cond), push_event(cond));
63  Q_ASSERT(pushed_event(cond) != NULL);
64 }
65 
66 
68  cl_context ctx, cl_command_queue queue,
69  clm4rm_conditions* cond)
70 {
    // Enqueue clm4rm_query_diagonal_kernel on matrix M and return the
    // one-int device buffer the kernel is given as its result argument.
    // The call does not wait for the kernel; clm4rm_query_result() performs
    // the blocking read-back and also releases the returned buffer.
71  Q_ASSERT(clm4rm_query_diagonal_kernel != NULL);
72 
    // The result buffer is created from the host value -1
    // (CL_MEM_COPY_HOST_PTR), so -1 is what will be read back unless the
    // kernel writes something else.
    // NOTE(review): the status clCreateBuffer stores in clm4rm_error is
    // overwritten by the clSetKernelArg calls below without being checked.
73  int result = -1;
74  cl_mem result_buffer = clCreateBuffer(ctx, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
75  sizeof(int), &result,
76  &clm4rm_error);
77 
    // Kernel arguments: 0 = matrix data, 1 = padded row count, 2 = result.
    // NOTE(review): argument 1 is passed with sizeof(int) but points at an
    // rci_t field; assumes rci_t is int-sized - TODO confirm.
78  clm4rm_error = clSetKernelArg(clm4rm_query_diagonal_kernel, 0, sizeof(cl_mem), &M->data);
79  clm4rm_error = clSetKernelArg(clm4rm_query_diagonal_kernel, 1, sizeof(int), &M->padded_rows);
80  clm4rm_error = clSetKernelArg(clm4rm_query_diagonal_kernel, 2, sizeof(cl_mem), &result_buffer);
81 
    // 1-D global range with MIN(ncols, nrows) work items.
82  cl_uint work_dim = 1;
83  size_t work_size = MIN(M->ncols, M->nrows);
84 
85  clm4rm_error = clEnqueueNDRangeKernel(queue, clm4rm_query_diagonal_kernel,
86  work_dim, NULL, &work_size, NULL,
87  pre_count(cond), pre_events(cond), push_event(cond));
88  Q_ASSERT(clm4rm_error == CL_SUCCESS);
89  Q_ASSERT(pushed_event(cond) != NULL);
90  return result_buffer;
91 }
92 
93 
94 int clm4rm_query_result(cl_mem result_buffer,
95  cl_command_queue queue, clm4rm_conditions* cond)
96 {
97  int result;
98  // call is blocking
99  clm4rm_error = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE,
100  0, sizeof(int), &result,
101  pre_count(cond), pre_events(cond), NULL);
102  // was blocking: all conditions are met
103  release_conditions(cond);
104  clReleaseMemObject(result_buffer);
105  return result;
106 }
107 
108 
112 void clm4rm_free(clmatrix_t* gpu_matrix)
113 {
114  if (gpu_matrix) {
115  clReleaseMemObject(gpu_matrix->data);
116  allocated_size -= DATA_BYTES(gpu_matrix);
117  free(gpu_matrix->local_data);
118  free(gpu_matrix);
119  }
120 }
121 
122 
124  cl_command_queue queue, clm4rm_conditions* cond)
125 {
    // Copy A followed by B into C (A first, B directly after it), i.e. C gets
    // A->nrows + B->nrows rows; all three matrices must have the same number
    // of columns. The copies are only enqueued, not waited for.
126  Q_ASSERT(C->ncols==A->ncols);
127  Q_ASSERT(C->ncols==B->ncols);
128 // Q_ASSERT(C->rowstride==A->rowstride);
129 // Q_ASSERT(C->rowstride==B->rowstride);
130  Q_ASSERT(A->nrows+B->nrows == C->nrows);
131  Q_ASSERT(DATA_BYTES(C) >= DATA_BYTES(A)+DATA_BYTES(B));
132 
133 #if IMAGE2D
    // Image variant: B is placed at row offset A->padded_rows inside C.
    // NOTE(review): `events` is not declared anywhere in the visible code and
    // this branch ignores `cond` entirely - looks stale; confirm it still
    // compiles when IMAGE2D is non-zero.
134  size_t aorigin[3] = { 0,0,0 };
135  size_t borigin[3] = { 0,A->padded_rows,0 };
136  size_t aregion[3] = { A->width, A->padded_rows, 1 };
137  size_t bregion[3] = { B->width, B->padded_rows, 1 };
138 
139  clm4rm_error = clEnqueueCopyImage(queue,
140  A->data, C->data,
141  aorigin, aorigin, aregion,
142  0, NULL, &events[0]);
143  clm4rm_error = clEnqueueCopyImage(queue,
144  B->data, C->data,
145  aorigin, borigin, bregion,
146  0, NULL, &events[1]);
147 #else
    // Buffer variant: A goes to byte offset 0 of C, B to byte offset
    // DATA_BYTES(A). Both copies wait on the same pre-events and each one
    // pushes its own event onto `cond`.
    // NOTE(review): the status of the first copy is overwritten by the second
    // and neither is checked.
148  clm4rm_error = clEnqueueCopyBuffer(queue,
149  A->data, C->data, 0, 0, DATA_BYTES(A),
150  pre_count(cond),pre_events(cond),push_event(cond));
151  Q_ASSERT(pushed_event(cond) != NULL);
152  clm4rm_error = clEnqueueCopyBuffer(queue,
153  B->data, C->data, 0, DATA_BYTES(A), DATA_BYTES(B),
154  pre_count(cond),pre_events(cond),push_event(cond));
155  Q_ASSERT(pushed_event(cond) != NULL);
156 #endif
157 }
158 
159 
161  cl_command_queue queue, clm4rm_conditions* cond)
162 {
    // Place A and B side by side in C: all three have the same number of
    // rows and C has A->ncols + B->ncols columns. A is copied to the origin
    // of C; B is copied (or, when A's width is not suitably aligned, written
    // by clm4rm_copy_kernel) at a column offset of A->ncols.
    //
    // NOTE(review): several parts of this function look unfinished - see the
    // notes on the zero-width regions, the unset kernel arguments and the
    // IMAGE2D branch below.
163  Q_ASSERT(C->nrows==A->nrows);
164  Q_ASSERT(C->nrows==B->nrows);
165  Q_ASSERT(A->ncols + B->ncols == C->ncols);
166 
167  size_t Arows = A->nrows;
168  size_t Acols = A->ncols;
169  size_t Brows = B->nrows;
170  size_t Bcols = B->ncols;
171  size_t origin[3] = { 0,0,0 };
    // NOTE(review): the region width is the literal 0 because BYTE_WIDTH(A)
    // is commented out. Presumably OpenCL rejects a rect/image copy whose
    // region has a zero element - confirm against the specification.
172  size_t aregion[3] = { /*BYTE_WIDTH(A)*/0, Arows, 1 };
173 
174  // Copy A -> C
175 #if IMAGE2D
    // NOTE(review): `events` and `post_event` do not appear anywhere else in
    // the visible code (the other functions use push_event) - this branch
    // looks stale; confirm it still compiles when IMAGE2D is non-zero.
176  clm4rm_error = clEnqueueCopyImage(queue,
177  A->data, C->data,
178  origin, origin, aregion,
179  0, NULL, &events[0]);
180 
181  if ((A->ncols % clm4rm_radix) == 0) {
182  // A is Word aligned. Let's use clEnqueueCopyImage
183  size_t boffset[3] = { A->ncols, 0, 1 };
184  size_t bregion[3] = { B->width, B->padded_rows, 1 };
185  // Copy B -> C
186  clm4rm_error = clEnqueueCopyImage(queue,
187  B->data, C->data,
188  origin, boffset, bregion,
189  pre_count(cond),pre_events(cond),post_event(cond));
190  }
191  else {
192  // nasty: B needs to be shifted left by .. bits
    // Not implemented: the shift amount is computed but nothing is enqueued,
    // so B is never copied in this case.
193  int shleft = clm4rm_radix - (A->ncols % clm4rm_radix);
194  // TODO use a kernel ?
195  }
196 #else
    // Buffer variant. Row and slice pitches are all passed as 0 (the
    // BYTE_WIDTH pitches are commented out).
197  clm4rm_error = clEnqueueCopyBufferRect (queue,
198  A->data, C->data,
199  origin, origin, aregion,
200  /*BYTE_WIDTH(A)*/0, 0,
201  /*BYTE_WIDTH(C)*/0, 0,
202  pre_count(cond),pre_events(cond),push_event(cond));
203  Q_ASSERT(pushed_event(cond) != NULL);
204 
205  if ((A->ncols % 8)==0) {
206  // A is Byte aligned. Let's use clEnqueueCopyBufferRect
    // Destination origin is Acols/8 in the first coordinate, 0 in the others.
    // NOTE(review): bregion also has a literal 0 width (BYTE_WIDTH(B) is
    // commented out).
207  size_t boffset[3] = { Acols/8, 0, 0 };
208  size_t bregion[3] = { /*BYTE_WIDTH(B)*/0, Brows, 1 };
209  // Copy B -> C
210  clm4rm_error = clEnqueueCopyBufferRect (queue,
211  B->data, C->data,
212  origin, boffset, bregion,
213  /*BYTE_WIDTH(B)*/0, 0,
214  /*BYTE_WIDTH(C)*/0, 0,
215  pre_count(cond),pre_events(cond),push_event(cond));
216  Q_ASSERT(pushed_event(cond) != NULL);
217  }
218  else {
219  /*
220  * __global unsigned int* C, unsigned int C_rowstride,
221  __global unsigned int* B, unsigned int B_rowstride,
222  unsigned int offset
223  */
224  Q_ASSERT(clm4rm_copy_kernel!=NULL);
225 
    // NOTE(review): only arguments 0, 2 and 4 are set; the rowstride
    // arguments 1 and 3 listed in the parameter comment above are commented
    // out. If the kernel still declares them, the enqueue below presumably
    // fails for missing arguments - confirm against the kernel source.
226  clm4rm_error = clSetKernelArg(clm4rm_copy_kernel, 0, sizeof(cl_mem), &C->data);
227 // clm4rm_error = clSetKernelArg(clm4rm_copy_kernel, 1, sizeof(int), &C->rowstride);
228  clm4rm_error = clSetKernelArg(clm4rm_copy_kernel, 2, sizeof(cl_mem), &B->data);
229 // clm4rm_error = clSetKernelArg(clm4rm_copy_kernel, 3, sizeof(int), &B->rowstride);
230  clm4rm_error = clSetKernelArg(clm4rm_copy_kernel, 4, sizeof(int), &A->ncols);
231 
    // 2-D global range: Brows by CEILCOLS(Bcols).
    // Unlike the other enqueues in this file, pushed_event(cond) is not
    // asserted after this call.
232  cl_uint work_dim = 2;
233  size_t work_size[2] = { Brows, CEILCOLS(Bcols) };
234  clm4rm_error = clEnqueueNDRangeKernel(queue, clm4rm_copy_kernel,
235  work_dim, NULL, work_size, NULL,
236  pre_count(cond),pre_events(cond),push_event(cond));
237 
238  }
239 #endif
240 }
241 
OpenCL boolean matrix data structure. Data is arranged in 32 bit words.
Definition: clm4rm.h:98
rci_t padded_rows
Number of rows padded to a multiple of 32.
Definition: clm4rm.h:100
void clm4rm_free(clmatrix_t *gpu_matrix)
release memory (CPU and GPU)
cl_int clm4rm_error
latest OpenCL result code. CL_SUCCESS indicates no error.
Definition: clm4rm.cpp:9
cl_event * pre_events(clm4rm_conditions *cond)
Definition: clm4rm.cpp:338
size_t allocated_size
Definition: clm4rm.cpp:78
cl_kernel clm4rm_or_kernel
Definition: clm4rm.cpp:64
void clm4rm_and(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
perform element-wise logical conjunction (AND). For each entry, compute C_ij := A_ij & B_ij....
Manages OpenCL event dependencies; necessary when the queue is out-of-order; dependencies must be est...
Definition: clm4rm.h:227
void clm4rm_stack(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
concatenate two matrices
void assertMatrixSize(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B)
rci_t ncols
Number of columns.
Definition: clm4rm.h:101
cl_kernel clm4rm_and_kernel
Definition: clm4rm.cpp:63
cl_kernel clm4rm_query_diagonal_kernel
Definition: clm4rm.cpp:66
#define IMAGE2D
Definition: clm4rm.h:53
#define MIN(x, y)
Definition: clcubic_mul.cl:77
void clm4rm_or(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
perform element-wise logical disjunction (OR)
cl_uint pre_count(clm4rm_conditions *cond)
Definition: clm4rm.cpp:331
#define clm4rm_radix
word size. for compatibility with GPU memory layout, we operate on 32 bit words.
Definition: clm4rm.h:41
cl_event * push_event(clm4rm_conditions *cond)
reserve one post-condition event
Definition: clm4rm.cpp:348
rci_t width
Number of words with valid bits: width = ceil(ncols / m4ri_radix).
Definition: clm4rm.h:103
cl_mem data
handle to GPU data (32-bit unsigned integers)
Definition: clm4rm.h:114
int clm4rm_query_result(cl_mem result_buffer, cl_command_queue queue, clm4rm_conditions *cond)
examine the result of a previous call to clm4rm_query_diagonal
void clm4rm_concat(clmatrix_t *C, clmatrix_t *A, clmatrix_t *B, cl_command_queue queue, clm4rm_conditions *cond)
concatenate two matrices
#define CEILCOLS(i)
Definition: clcubic_mul.cl:76
rci_t nrows
Number of rows.
Definition: clm4rm.h:99
cl_event * pushed_event(clm4rm_conditions *cond)
Definition: clm4rm.cpp:357
cl_kernel clm4rm_copy_kernel
Definition: clm4rm.cpp:65
gpuword * local_data
matrix data in CPU memory
Definition: clm4rm.h:113
#define DATA_BYTES(m)
Definition: clm4rm.h:119
void release_conditions(clm4rm_conditions *cond)
release conditions list
Definition: clm4rm.cpp:297
cl_mem clm4rm_query_diagonal(clmatrix_t *M, cl_context ctx, cl_command_queue queue, clm4rm_conditions *cond)
find a non-zero entry on the diagonal of a matrix. Return the column/row of the first non-zero entry,...