11 #include <QCoreApplication> 26 : tbb_context(nullptr), num_threads(0),num_cores(0)
28 num_cores = tbb::task_scheduler_init::default_num_threads();
56 return tbb::this_tbb_thread::get_id();
92 cl_platform_id platforms[4];
93 cl_uint num_platforms;
94 clerr = clGetPlatformIDs(4, platforms, &num_platforms);
95 for (
int plf = 0; plf < num_platforms; ++plf)
97 clerr = clGetDeviceIDs(platforms[plf], CL_DEVICE_TYPE_GPU, 1, &
device, NULL);
98 if (
clerr != CL_SUCCESS)
continue;
103 char extensions[1024];
104 clerr = clGetPlatformInfo(platforms[plf], CL_PLATFORM_NAME, 1024, name, NULL);
105 clerr = clGetPlatformInfo(platforms[plf], CL_PLATFORM_VERSION, 1024, version, NULL);
106 clerr = clGetPlatformInfo(platforms[plf], CL_PLATFORM_PROFILE, 1024, profile, NULL);
107 clerr = clGetPlatformInfo(platforms[plf], CL_PLATFORM_EXTENSIONS, 1024, extensions, NULL);
115 cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
116 props[1] = (cl_context_properties)
platform;
122 clGetDeviceInfo(
device, CL_DEVICE_NAME, 1024, &name, NULL);
123 clGetDeviceInfo(
device, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(cl_uint), &
gpu_units, NULL);
131 const char* kernelDir =
"src/clm4rm";
134 assert(
clerr == CL_SUCCESS);
160 QStringList prefixes = {
"Resources",
"src",
"rsrc" };
161 QString dirstring(dirname);
163 QDir dir = QCoreApplication::applicationDirPath();
165 for(QString prefix : prefixes)
167 QString subdir = prefix+
"/"+dirstring;
168 if (dir.exists(subdir)) {
183 clReleaseCommandQueue(
queue);
191 __cpuidex((
int*)regs, leaf, level);
194 __cpuid_count(leaf, level, regs[0], regs[1], regs[2], regs[3]);
201 __cpuid((
int*)regs, leaf);
204 __cpuid(leaf, regs[0], regs[1], regs[2], regs[3]);
216 unsigned int EBX = E[1];
217 unsigned int ECX = E[2];
218 unsigned int Ways = (EBX >> 22) & 0x3ff;
219 unsigned int Partitions = (EBX >> 12) & 0x3ff;
220 unsigned int Line_Size = EBX & 0x0fff;
221 unsigned int Sets = ECX;
222 return (Ways + 1) * (Partitions + 1) * (Line_Size + 1) * (Sets + 1);
246 microsecs = std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count();
249 std::string spaces(std::max<int>(0, 12 - label.length()),
' ');
250 std::cout << std::fixed << std::setprecision(3)
251 <<
" " << label <<
": "<< spaces
253 << (((double)microsecs) / 1e3) <<
" ms" <<
" = " 254 << (((
double)microsecs) / 1e6) <<
" s" a singleton class managing concurrency settings for the application.
cl_uint gpu_units
number of GPU units (if known)
time_point printTimer(std::string label, bool do_print=true)
clock benchmark
size_t size2_t[2]
two-dimensional size; used for various OpenCL parameters
cl_command_queue queue
OpenCL command queue.
static void cpuid(int leaf, unsigned int regs[4])
retrieve CPU factory info
static tbb::tbb_thread::id currentThread()
global definitions for all algorithms.
int num_threads
number of logical threads (always known)
static int countThreads()
static const char * findKernelDirectory(const char *subdir)
look up directory containing OpenCL source files ("kernels")
static void maxMaxtrixTile(size2_t &)
set tile size for cubic matrix multiplication. Tiles adapt to local memory and/or compute units....
Clock::time_point time_point
timestamp with high resolution
std::string device_name
name of OpenCL device
int num_cores
number of physical cores (if known)
cl_device_id device
OpenCL device.
tbb::task_scheduler_init * tbb_context
controls the number of parallel threads that are used by TBB functions.
Float sqrt(const Float &x)
square-root function template for floating point types
cl_int clm4rm_setup(const char *cl_kernel_directory, cl_context ctx, cl_device_id device)
load OpenCL kernels and set up parameters
void clm4rm_tear_down(cl_context ctx, cl_device_id device)
release OpenCL resources
cl_platform_id platform
OpenCL platform identifier.
size_t shared_mem_words
size of shared memory in (32bit) words
time_point printDebugTimer(std::string label)
clock benchmark and print elapsed time
static bool hasGpuSupport()
static ConcurrencyContext instance
singleton instance
~ConcurrencyContext()
destructor; releases all OpenCL resources
time_point popTimer(std::string label, bool do_print=true)
clock benchmark and remove from stack
bool setupGpu(size2_t max_tile)
set up OpenCL context
static std::string gpuName()
time_point popDebugTimer(std::string label)
clock benchmark and remove from stack
size2_t max_tile
maximum tile size for Boolean matrix multiplication
double min(double a, double b)
minimum function with checks for NAN
ConcurrencyContext()
default constructor; does no initialisation
void close()
release all OpenCL resources.
size_t max_group_size
max. size of a work group
cl_context ctx
OpenCL context.
static int cacheSize(int level)
size of CPU cache memory
static cl_context clContext()
static cl_uint countGpuUnits()
static cl_command_queue clQueue()
QByteArray kernelDirectory
void pushTimer()
start a new benchmark timer and push it to stack
void setup(int max_threads)
set up TBB thread count
cl_int clerr
last OpenCL return code
std::stack< time_point > time_stack