Fréchet View  1.6.0
A Tool for Exploring Fréchet Distance Algorithms
concurrency.cpp
Go to the documentation of this file.
1 
3 //#include <poly/parallel.h>
4 #include <stack>
5 #include <iostream>
6 #include <iomanip>
7 #include <algorithm>
8 #include <assert.h>
9 
10 #ifndef UNIT_TEST
11 #include <QCoreApplication>
12 #include <QDir>
13 #include <qdebug.h>
14 #endif
15 
16 #include <atomic>
17 #ifdef __GNUC__
18 #include <cpuid.h>
19 //#include <boost/thread.hpp>
20 #endif
21 
22 using namespace frechet;
23 using namespace app;
24 
26 : tbb_context(nullptr), num_threads(0),num_cores(0)
27 {
28  num_cores = tbb::task_scheduler_init::default_num_threads();
29 }
30 
32 {
33  close();
34 }
35 
36 
38 
39 void ConcurrencyContext::setup(int max_threads)
40 {
41  assert(tbb_context == nullptr);
42  num_threads = (max_threads >= 1) ? max_threads : (num_cores/*/2?*/);
43  // TODO better num_cores / 2 ?
44  tbb_context = new tbb::task_scheduler_init(num_threads);
45 }
46 
48  return instance.num_threads;
49 }
50 
52  return instance.num_cores;
53 }
54 
55 tbb::tbb_thread::id ConcurrencyContext::currentThread() {
56  return tbb::this_tbb_thread::get_id();
57 }
58 
60  return instance.queue != nullptr;
61 }
62 
64  return instance.device_name;
65 }
66 
68  return instance.gpu_units;
69 }
70 
72  assert(instance.ctx != nullptr);
73  return instance.ctx;
74 }
75 
76 cl_command_queue ConcurrencyContext::clQueue() {
77  assert(instance.queue != nullptr);
78  return instance.queue;
79 }
80 
82  result[0] = instance.max_tile[0];
83  result[1] = instance.max_tile[1];
84 }
85 
86 
88 {
89  /* Setup OpenCL environment. */
90  platform = nullptr;
91 
92  cl_platform_id platforms[4];
93  cl_uint num_platforms;
94  clerr = clGetPlatformIDs(4, platforms, &num_platforms);
95  for (int plf = 0; plf < num_platforms; ++plf)
96  {
97  clerr = clGetDeviceIDs(platforms[plf], CL_DEVICE_TYPE_GPU, 1, &device, NULL);
98  if (clerr != CL_SUCCESS) continue;
99 
100  char name[1024];
101  char version[1024];
102  char profile[1024];
103  char extensions[1024];
104  clerr = clGetPlatformInfo(platforms[plf], CL_PLATFORM_NAME, 1024, name, NULL);
105  clerr = clGetPlatformInfo(platforms[plf], CL_PLATFORM_VERSION, 1024, version, NULL);
106  clerr = clGetPlatformInfo(platforms[plf], CL_PLATFORM_PROFILE, 1024, profile, NULL);
107  clerr = clGetPlatformInfo(platforms[plf], CL_PLATFORM_EXTENSIONS, 1024, extensions, NULL);
108 
109 /* std::cout << "Platform " << plf << std::endl
110  << platforms[plf] << " = " << name << " " << version << ", "
111  << profile << ","
112  << extensions << std::endl << std::endl;
113 */
114  platform = platforms[plf];
115  cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
116  props[1] = (cl_context_properties)platform;
117 
118  ctx = clCreateContext(props, 1, &device, NULL, NULL, &clerr);
119  queue = clCreateCommandQueue(ctx, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &clerr);
120  // Important: Out-of-Order queue
121  // to impose a specific order on scheduled kernels, use cl_events and barriers
122  clGetDeviceInfo(device, CL_DEVICE_NAME, 1024, &name, NULL);
123  clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &gpu_units, NULL);
124  device_name = name;
125 
126  // Load Kernels
127 #ifndef UNIT_TEST
128  const char* kernelDir = findKernelDirectory("clm4rm");
129 #else
130  // Unit Test environment w/out QT
131  const char* kernelDir = "src/clm4rm";
132 #endif
133  clerr = clm4rm_setup(kernelDir, ctx, device);
134  assert(clerr == CL_SUCCESS);
135 
136  // max. tile size for cubic multiplication
137  // tile_n = number of compute units; limited by max. units per group
138  // tile_m = number of row/cols per compute unit; limited by available shared memory
139  // for best performance, we should allow >= 2 groups per shared memory area
140  // may be overridden by command line
141  // @see clcubic_mul()
142  if (amax_tile[0]<=0)
143  max_tile[0] = sqrt(max_group_size/32);
144  else
145  max_tile[0] = amax_tile[0];
146  if (amax_tile[1]<=0)
147  max_tile[1] = (sqrt(1 + 17*shared_mem_words)-1)/(68*max_tile[0]);
148  else
149  max_tile[1] = amax_tile[1];
150  max_tile[1] = std::min((size_t)MAX_TILE_M,max_tile[1]);
151  }
152  return hasGpuSupport();
153 }
154 
155 #ifndef UNIT_TEST
156 QByteArray kernelDirectory;
157 
158 const char* ConcurrencyContext::findKernelDirectory(const char* dirname)
159 {
160  QStringList prefixes = { "Resources","src","rsrc" };
161  QString dirstring(dirname);
162  // Look for folder "Resources/clm4rm" or "src/clm4rm" or "rsrc/clm4rm"
163  QDir dir = QCoreApplication::applicationDirPath();
164  for(;; dir.cdUp()) {
165  for(QString prefix : prefixes)
166  {
167  QString subdir = prefix+"/"+dirstring;
168  if (dir.exists(subdir)) {
169  kernelDirectory = dir.absoluteFilePath(subdir).toLocal8Bit();
170  return kernelDirectory.constData();
171  }
172  }
173  }
174 }
175 #endif // QT
176 
178 {
179  delete tbb_context;
180  tbb_context=nullptr;
181  // release OpenCL stuff
182  if (queue)
183  clReleaseCommandQueue(queue);
184  if (hasGpuSupport())
186 }
187 
188 void ConcurrencyContext::cpuid(int leaf, int level, unsigned int regs[4])
189 {
190 #ifdef _MSC_VER
191  __cpuidex((int*)regs, leaf, level);
192 #endif
193 #ifdef __GNUC__
194  __cpuid_count(leaf, level, regs[0], regs[1], regs[2], regs[3]);
195 #endif
196 }
197 
198 void ConcurrencyContext::cpuid(int leaf, unsigned int regs[4])
199 {
200 #ifdef _MSC_VER
201  __cpuid((int*)regs, leaf);
202 #endif
203 #ifdef __GNUC__
204  __cpuid(leaf, regs[0], regs[1], regs[2], regs[3]);
205 #endif
206 }
207 
209 {
210  unsigned int E[4];
211  cpuid(4,level,E);
212 
213  // = (Ways + 1) * (Partitions + 1) * (Line_Size + 1) * (Sets + 1)
214  // = (EBX[31:22] + 1) * (EBX[21:12] + 1) * (EBX[11:0] + 1) * (ECX + 1)
215 
216  unsigned int EBX = E[1];
217  unsigned int ECX = E[2];
218  unsigned int Ways = (EBX >> 22) & 0x3ff;
219  unsigned int Partitions = (EBX >> 12) & 0x3ff;
220  unsigned int Line_Size = EBX & 0x0fff;
221  unsigned int Sets = ECX;
222  return (Ways + 1) * (Partitions + 1) * (Line_Size + 1) * (Sets + 1);
223 }
224 
225 std::stack<time_point> time_stack;
226 
227 
229 {
230  time_stack.push(Clock::now());
231 }
232 
233 time_point frechet::app::printTimer(std::string label, bool do_print)
234 {
235  long microsecs;
236  time_point t1;
237  time_point t2 = Clock::now();
238 
239  if (time_stack.empty()) {
240  pushTimer();
241  }
242  else {
243  t1 = time_stack.top();
244  }
245 
246  microsecs = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
247 
248  if (do_print) {
249  std::string spaces(std::max<int>(0, 12 - label.length()), ' ');
250  std::cout << std::fixed << std::setprecision(3)
251  << " " << label << ": "<< spaces
252  //<< microsecs << " µs" << " = "
253  << (((double)microsecs) / 1e3) << " ms" << " = "
254  << (((double)microsecs) / 1e6) << " s"
255  << std::endl;
256  }
257  return t2;
258 }
259 
260 #ifdef Q_DEBUG
261 # define DO_DEBUG 1
262 #else
263 # define DO_DEBUG 0
264 #endif
265 
267 {
268  return printTimer(label, DO_DEBUG);
269 }
270 
271 time_point frechet::app::popTimer(std::string label, bool do_print)
272 {
273  time_point t = printTimer(label,do_print);
274  time_stack.pop();
275  return t;
276 }
277 
279 {
280  return popTimer(label, DO_DEBUG);
281 }
a singleton class managing concurrency settings for the application.
Definition: concurrency.h:22
cl_uint gpu_units
number of GPU units (if known)
Definition: concurrency.h:51
time_point printTimer(std::string label, bool do_print=true)
clock benchmark
size_t size2_t[2]
two-dimensional size; used for various OpenCL parameters
Definition: clm4rm.h:67
cl_command_queue queue
OpenCL command queue.
Definition: concurrency.h:42
static void cpuid(int leaf, unsigned int regs[4])
retrieve CPU factory info
static tbb::tbb_thread::id currentThread()
Definition: concurrency.cpp:55
global definitions for all algorithms.
int num_threads
number of logical threads (always known)
Definition: concurrency.h:33
static const char * findKernelDirectory(const char *subdir)
look up directory containing OpenCL source files ("kernels")
static void maxMaxtrixTile(size2_t &)
set tile size for cubic matrix multiplication. Tiles adapt to local memory and/or compute units....
Definition: concurrency.cpp:81
Clock::time_point time_point
timestamp with high resolution
Definition: concurrency.h:136
std::string device_name
name of OpenCL device
Definition: concurrency.h:49
int num_cores
number of physical cores (if known)
Definition: concurrency.h:31
cl_device_id device
OpenCL device.
Definition: concurrency.h:40
tbb::task_scheduler_init * tbb_context
controls the number of parallel threads that are used by TBB functions.
Definition: concurrency.h:29
Float sqrt(const Float &x)
square-root function template for floating point types
cl_int clm4rm_setup(const char *cl_kernel_directory, cl_context ctx, cl_device_id device)
load OpenCL kernels and set up parameters
Definition: clm4rm.cpp:84
void clm4rm_tear_down(cl_context ctx, cl_device_id device)
release OpenCL resources
Definition: clm4rm.cpp:146
cl_platform_id platform
OpenCL platform identifier.
Definition: concurrency.h:36
size_t shared_mem_words
size of shared memory in (32bit) words
Definition: clm4rm.cpp:77
time_point printDebugTimer(std::string label)
clock benchmark and print elapsed time
static ConcurrencyContext instance
singleton instance
Definition: concurrency.h:61
~ConcurrencyContext()
destructor; releases all OpenCL resources
Definition: concurrency.cpp:31
#define DO_DEBUG
#define MAX_TILE_M
Definition: clm4rm.h:62
time_point popTimer(std::string label, bool do_print=true)
clock benchmark and remove from stack
bool setupGpu(size2_t max_tile)
set up OpenCL context
Definition: concurrency.cpp:87
static std::string gpuName()
Definition: concurrency.cpp:63
time_point popDebugTimer(std::string label)
clock benchmark and remove from stack
size2_t max_tile
maximum tile size for Boolean matrix multiplication
Definition: concurrency.h:53
double min(double a, double b)
minimum function with checks for NAN
Definition: numeric.h:222
ConcurrencyContext()
default constructor; does no initialisation
Definition: concurrency.cpp:25
void close()
release all OpenCL resources.
size_t max_group_size
max. size of a work group
Definition: clm4rm.cpp:74
cl_context ctx
OpenCL context.
Definition: concurrency.h:38
static int cacheSize(int level)
size of CPU cache memory
static cl_context clContext()
Definition: concurrency.cpp:71
static cl_command_queue clQueue()
Definition: concurrency.cpp:76
QByteArray kernelDirectory
void pushTimer()
start a new benchmark timer and push it to stack
void setup(int max_threads)
set up TBB thread count
Definition: concurrency.cpp:39
cl_int clerr
last OpenCL return code
Definition: concurrency.h:46
std::stack< time_point > time_stack