1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved. 14 // Third party copyrights are property of their respective owners. 15 // 16 // Redistribution and use in source and binary forms, with or without modification, 17 // are permitted provided that the following conditions are met: 18 // 19 // * Redistribution's of source code must retain the above copyright notice, 20 // this list of conditions and the following disclaimer. 21 // 22 // * Redistribution's in binary form must reproduce the above copyright notice, 23 // this list of conditions and the following disclaimer in the documentation 24 // and/or other materials provided with the distribution. 25 // 26 // * The name of the copyright holders may not be used to endorse or promote products 27 // derived from this software without specific prior written permission. 28 // 29 // This software is provided by the copyright holders and contributors "as is" and 30 // any express or implied warranties, including, but not limited to, the implied 31 // warranties of merchantability and fitness for a particular purpose are disclaimed. 
32 // In no event shall the OpenCV Foundation or contributors be liable for any direct, 33 // indirect, incidental, special, exemplary, or consequential damages 34 // (including, but not limited to, procurement of substitute goods or services; 35 // loss of use, data, or profits; or business interruption) however caused 36 // and on any theory of liability, whether in contract, strict liability, 37 // or tort (including negligence or otherwise) arising in any way out of 38 // the use of this software, even if advised of the possibility of such damage. 39 // 40 //M*/ 41 42 #include "precomp.hpp" 43 #include <list> 44 #include <map> 45 #include <string> 46 #include <sstream> 47 #include <iostream> // std::cerr 48 49 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0 50 #define CV_OPENCL_SHOW_RUN_ERRORS 0 51 #define CV_OPENCL_SHOW_SVM_ERROR_LOG 1 52 #define CV_OPENCL_SHOW_SVM_LOG 0 53 54 #include "opencv2/core/bufferpool.hpp" 55 #ifndef LOG_BUFFER_POOL 56 # if 0 57 # define LOG_BUFFER_POOL printf 58 # else 59 # define LOG_BUFFER_POOL(...) 60 # endif 61 #endif 62 63 64 // TODO Move to some common place 65 static bool getBoolParameter(const char* name, bool defaultValue) 66 { 67 /* 68 * If your system doesn't support getenv(), define NO_GETENV to disable 69 * this feature. 
70 */ 71 #ifdef NO_GETENV 72 const char* envValue = NULL; 73 #else 74 const char* envValue = getenv(name); 75 #endif 76 if (envValue == NULL) 77 { 78 return defaultValue; 79 } 80 cv::String value = envValue; 81 if (value == "1" || value == "True" || value == "true" || value == "TRUE") 82 { 83 return true; 84 } 85 if (value == "0" || value == "False" || value == "false" || value == "FALSE") 86 { 87 return false; 88 } 89 CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str())); 90 } 91 92 93 // TODO Move to some common place 94 static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue) 95 { 96 #ifdef NO_GETENV 97 const char* envValue = NULL; 98 #else 99 const char* envValue = getenv(name); 100 #endif 101 if (envValue == NULL) 102 { 103 return defaultValue; 104 } 105 cv::String value = envValue; 106 size_t pos = 0; 107 for (; pos < value.size(); pos++) 108 { 109 if (!isdigit(value[pos])) 110 break; 111 } 112 cv::String valueStr = value.substr(0, pos); 113 cv::String suffixStr = value.substr(pos, value.length() - pos); 114 int v = atoi(valueStr.c_str()); 115 if (suffixStr.length() == 0) 116 return v; 117 else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb") 118 return v * 1024 * 1024; 119 else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb") 120 return v * 1024; 121 CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str())); 122 } 123 124 #if CV_OPENCL_SHOW_SVM_LOG 125 // TODO add timestamp logging 126 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf 127 #else 128 #define CV_OPENCL_SVM_TRACE_P(...) 129 #endif 130 131 #if CV_OPENCL_SHOW_SVM_ERROR_LOG 132 // TODO add timestamp logging 133 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf 134 #else 135 #define CV_OPENCL_SVM_TRACE_ERROR_P(...) 
136 #endif 137 138 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp" 139 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp" 140 141 #ifdef HAVE_OPENCL 142 #include "opencv2/core/opencl/runtime/opencl_core.hpp" 143 #else 144 // TODO FIXIT: This file can't be build without OPENCL 145 146 /* 147 Part of the file is an extract from the standard OpenCL headers from Khronos site. 148 Below is the original copyright. 149 */ 150 151 /******************************************************************************* 152 * Copyright (c) 2008 - 2012 The Khronos Group Inc. 153 * 154 * Permission is hereby granted, free of charge, to any person obtaining a 155 * copy of this software and/or associated documentation files (the 156 * "Materials"), to deal in the Materials without restriction, including 157 * without limitation the rights to use, copy, modify, merge, publish, 158 * distribute, sublicense, and/or sell copies of the Materials, and to 159 * permit persons to whom the Materials are furnished to do so, subject to 160 * the following conditions: 161 * 162 * The above copyright notice and this permission notice shall be included 163 * in all copies or substantial portions of the Materials. 164 * 165 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 166 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 167 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 168 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 169 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 170 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 171 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
172 ******************************************************************************/ 173 174 #if 0 //defined __APPLE__ 175 #define HAVE_OPENCL 1 176 #else 177 #undef HAVE_OPENCL 178 #endif 179 180 #define OPENCV_CL_NOT_IMPLEMENTED -1000 181 182 #ifdef HAVE_OPENCL 183 184 #if defined __APPLE__ 185 #include <OpenCL/opencl.h> 186 #else 187 #include <CL/opencl.h> 188 #endif 189 190 static const bool g_haveOpenCL = true; 191 192 #else 193 194 extern "C" { 195 196 struct _cl_platform_id { int dummy; }; 197 struct _cl_device_id { int dummy; }; 198 struct _cl_context { int dummy; }; 199 struct _cl_command_queue { int dummy; }; 200 struct _cl_mem { int dummy; }; 201 struct _cl_program { int dummy; }; 202 struct _cl_kernel { int dummy; }; 203 struct _cl_event { int dummy; }; 204 struct _cl_sampler { int dummy; }; 205 206 typedef struct _cl_platform_id * cl_platform_id; 207 typedef struct _cl_device_id * cl_device_id; 208 typedef struct _cl_context * cl_context; 209 typedef struct _cl_command_queue * cl_command_queue; 210 typedef struct _cl_mem * cl_mem; 211 typedef struct _cl_program * cl_program; 212 typedef struct _cl_kernel * cl_kernel; 213 typedef struct _cl_event * cl_event; 214 typedef struct _cl_sampler * cl_sampler; 215 216 typedef int cl_int; 217 typedef unsigned cl_uint; 218 #if defined (_WIN32) && defined(_MSC_VER) 219 typedef __int64 cl_long; 220 typedef unsigned __int64 cl_ulong; 221 #else 222 typedef long cl_long; 223 typedef unsigned long cl_ulong; 224 #endif 225 226 typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ 227 typedef cl_ulong cl_bitfield; 228 typedef cl_bitfield cl_device_type; 229 typedef cl_uint cl_platform_info; 230 typedef cl_uint cl_device_info; 231 typedef cl_bitfield cl_device_fp_config; 232 typedef cl_uint cl_device_mem_cache_type; 233 typedef cl_uint cl_device_local_mem_type; 234 typedef cl_bitfield cl_device_exec_capabilities; 235 typedef cl_bitfield cl_command_queue_properties; 236 typedef intptr_t cl_device_partition_property; 237 typedef cl_bitfield cl_device_affinity_domain; 238 239 typedef intptr_t cl_context_properties; 240 typedef cl_uint cl_context_info; 241 typedef cl_uint cl_command_queue_info; 242 typedef cl_uint cl_channel_order; 243 typedef cl_uint cl_channel_type; 244 typedef cl_bitfield cl_mem_flags; 245 typedef cl_uint cl_mem_object_type; 246 typedef cl_uint cl_mem_info; 247 typedef cl_bitfield cl_mem_migration_flags; 248 typedef cl_uint cl_image_info; 249 typedef cl_uint cl_buffer_create_type; 250 typedef cl_uint cl_addressing_mode; 251 typedef cl_uint cl_filter_mode; 252 typedef cl_uint cl_sampler_info; 253 typedef cl_bitfield cl_map_flags; 254 typedef cl_uint cl_program_info; 255 typedef cl_uint cl_program_build_info; 256 typedef cl_uint cl_program_binary_type; 257 typedef cl_int cl_build_status; 258 typedef cl_uint cl_kernel_info; 259 typedef cl_uint cl_kernel_arg_info; 260 typedef cl_uint cl_kernel_arg_address_qualifier; 261 typedef cl_uint cl_kernel_arg_access_qualifier; 262 typedef cl_bitfield cl_kernel_arg_type_qualifier; 263 typedef cl_uint cl_kernel_work_group_info; 264 typedef cl_uint cl_event_info; 265 typedef cl_uint cl_command_type; 266 typedef cl_uint cl_profiling_info; 267 268 269 typedef struct _cl_image_format { 270 cl_channel_order image_channel_order; 271 cl_channel_type image_channel_data_type; 272 } cl_image_format; 273 274 typedef struct _cl_image_desc { 275 cl_mem_object_type image_type; 276 size_t image_width; 277 size_t image_height; 278 size_t image_depth; 279 size_t image_array_size; 280 size_t 
image_row_pitch; 281 size_t image_slice_pitch; 282 cl_uint num_mip_levels; 283 cl_uint num_samples; 284 cl_mem buffer; 285 } cl_image_desc; 286 287 typedef struct _cl_buffer_region { 288 size_t origin; 289 size_t size; 290 } cl_buffer_region; 291 292 293 ////////////////////////////////////////////////////////// 294 295 #define CL_SUCCESS 0 296 #define CL_DEVICE_NOT_FOUND -1 297 #define CL_DEVICE_NOT_AVAILABLE -2 298 #define CL_COMPILER_NOT_AVAILABLE -3 299 #define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 300 #define CL_OUT_OF_RESOURCES -5 301 #define CL_OUT_OF_HOST_MEMORY -6 302 #define CL_PROFILING_INFO_NOT_AVAILABLE -7 303 #define CL_MEM_COPY_OVERLAP -8 304 #define CL_IMAGE_FORMAT_MISMATCH -9 305 #define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 306 #define CL_BUILD_PROGRAM_FAILURE -11 307 #define CL_MAP_FAILURE -12 308 #define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 309 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 310 #define CL_COMPILE_PROGRAM_FAILURE -15 311 #define CL_LINKER_NOT_AVAILABLE -16 312 #define CL_LINK_PROGRAM_FAILURE -17 313 #define CL_DEVICE_PARTITION_FAILED -18 314 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 315 316 #define CL_INVALID_VALUE -30 317 #define CL_INVALID_DEVICE_TYPE -31 318 #define CL_INVALID_PLATFORM -32 319 #define CL_INVALID_DEVICE -33 320 #define CL_INVALID_CONTEXT -34 321 #define CL_INVALID_QUEUE_PROPERTIES -35 322 #define CL_INVALID_COMMAND_QUEUE -36 323 #define CL_INVALID_HOST_PTR -37 324 #define CL_INVALID_MEM_OBJECT -38 325 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 326 #define CL_INVALID_IMAGE_SIZE -40 327 #define CL_INVALID_SAMPLER -41 328 #define CL_INVALID_BINARY -42 329 #define CL_INVALID_BUILD_OPTIONS -43 330 #define CL_INVALID_PROGRAM -44 331 #define CL_INVALID_PROGRAM_EXECUTABLE -45 332 #define CL_INVALID_KERNEL_NAME -46 333 #define CL_INVALID_KERNEL_DEFINITION -47 334 #define CL_INVALID_KERNEL -48 335 #define CL_INVALID_ARG_INDEX -49 336 #define CL_INVALID_ARG_VALUE -50 337 #define CL_INVALID_ARG_SIZE -51 338 
#define CL_INVALID_KERNEL_ARGS -52 339 #define CL_INVALID_WORK_DIMENSION -53 340 #define CL_INVALID_WORK_GROUP_SIZE -54 341 #define CL_INVALID_WORK_ITEM_SIZE -55 342 #define CL_INVALID_GLOBAL_OFFSET -56 343 #define CL_INVALID_EVENT_WAIT_LIST -57 344 #define CL_INVALID_EVENT -58 345 #define CL_INVALID_OPERATION -59 346 #define CL_INVALID_GL_OBJECT -60 347 #define CL_INVALID_BUFFER_SIZE -61 348 #define CL_INVALID_MIP_LEVEL -62 349 #define CL_INVALID_GLOBAL_WORK_SIZE -63 350 #define CL_INVALID_PROPERTY -64 351 #define CL_INVALID_IMAGE_DESCRIPTOR -65 352 #define CL_INVALID_COMPILER_OPTIONS -66 353 #define CL_INVALID_LINKER_OPTIONS -67 354 #define CL_INVALID_DEVICE_PARTITION_COUNT -68 355 356 /*#define CL_VERSION_1_0 1 357 #define CL_VERSION_1_1 1 358 #define CL_VERSION_1_2 1*/ 359 360 #define CL_FALSE 0 361 #define CL_TRUE 1 362 #define CL_BLOCKING CL_TRUE 363 #define CL_NON_BLOCKING CL_FALSE 364 365 #define CL_PLATFORM_PROFILE 0x0900 366 #define CL_PLATFORM_VERSION 0x0901 367 #define CL_PLATFORM_NAME 0x0902 368 #define CL_PLATFORM_VENDOR 0x0903 369 #define CL_PLATFORM_EXTENSIONS 0x0904 370 371 #define CL_DEVICE_TYPE_DEFAULT (1 << 0) 372 #define CL_DEVICE_TYPE_CPU (1 << 1) 373 #define CL_DEVICE_TYPE_GPU (1 << 2) 374 #define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) 375 #define CL_DEVICE_TYPE_CUSTOM (1 << 4) 376 #define CL_DEVICE_TYPE_ALL 0xFFFFFFFF 377 #define CL_DEVICE_TYPE 0x1000 378 #define CL_DEVICE_VENDOR_ID 0x1001 379 #define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 380 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 381 #define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 382 #define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 383 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 384 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 385 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 386 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 387 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A 388 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B 389 #define 
CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C 390 #define CL_DEVICE_ADDRESS_BITS 0x100D 391 #define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E 392 #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F 393 #define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 394 #define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 395 #define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 396 #define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 397 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 398 #define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 399 #define CL_DEVICE_IMAGE_SUPPORT 0x1016 400 #define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 401 #define CL_DEVICE_MAX_SAMPLERS 0x1018 402 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 403 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A 404 #define CL_DEVICE_SINGLE_FP_CONFIG 0x101B 405 #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C 406 #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D 407 #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E 408 #define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F 409 #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 410 #define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 411 #define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 412 #define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 413 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 414 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 415 #define CL_DEVICE_ENDIAN_LITTLE 0x1026 416 #define CL_DEVICE_AVAILABLE 0x1027 417 #define CL_DEVICE_COMPILER_AVAILABLE 0x1028 418 #define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 419 #define CL_DEVICE_QUEUE_PROPERTIES 0x102A 420 #define CL_DEVICE_NAME 0x102B 421 #define CL_DEVICE_VENDOR 0x102C 422 #define CL_DRIVER_VERSION 0x102D 423 #define CL_DEVICE_PROFILE 0x102E 424 #define CL_DEVICE_VERSION 0x102F 425 #define CL_DEVICE_EXTENSIONS 0x1030 426 #define CL_DEVICE_PLATFORM 0x1031 427 #define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 428 #define CL_DEVICE_HALF_FP_CONFIG 0x1033 429 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 430 #define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 431 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 432 #define 
CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 433 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 434 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 435 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A 436 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B 437 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C 438 #define CL_DEVICE_OPENCL_C_VERSION 0x103D 439 #define CL_DEVICE_LINKER_AVAILABLE 0x103E 440 #define CL_DEVICE_BUILT_IN_KERNELS 0x103F 441 #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 442 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 443 #define CL_DEVICE_PARENT_DEVICE 0x1042 444 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 445 #define CL_DEVICE_PARTITION_PROPERTIES 0x1044 446 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 447 #define CL_DEVICE_PARTITION_TYPE 0x1046 448 #define CL_DEVICE_REFERENCE_COUNT 0x1047 449 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 450 #define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 451 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A 452 #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B 453 454 #define CL_FP_DENORM (1 << 0) 455 #define CL_FP_INF_NAN (1 << 1) 456 #define CL_FP_ROUND_TO_NEAREST (1 << 2) 457 #define CL_FP_ROUND_TO_ZERO (1 << 3) 458 #define CL_FP_ROUND_TO_INF (1 << 4) 459 #define CL_FP_FMA (1 << 5) 460 #define CL_FP_SOFT_FLOAT (1 << 6) 461 #define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) 462 463 #define CL_NONE 0x0 464 #define CL_READ_ONLY_CACHE 0x1 465 #define CL_READ_WRITE_CACHE 0x2 466 #define CL_LOCAL 0x1 467 #define CL_GLOBAL 0x2 468 #define CL_EXEC_KERNEL (1 << 0) 469 #define CL_EXEC_NATIVE_KERNEL (1 << 1) 470 #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) 471 #define CL_QUEUE_PROFILING_ENABLE (1 << 1) 472 473 #define CL_CONTEXT_REFERENCE_COUNT 0x1080 474 #define CL_CONTEXT_DEVICES 0x1081 475 #define CL_CONTEXT_PROPERTIES 0x1082 476 #define CL_CONTEXT_NUM_DEVICES 0x1083 477 #define CL_CONTEXT_PLATFORM 0x1084 478 #define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 479 480 #define 
CL_DEVICE_PARTITION_EQUALLY 0x1086 481 #define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 482 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 483 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 484 #define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) 485 #define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) 486 #define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) 487 #define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) 488 #define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) 489 #define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) 490 #define CL_QUEUE_CONTEXT 0x1090 491 #define CL_QUEUE_DEVICE 0x1091 492 #define CL_QUEUE_REFERENCE_COUNT 0x1092 493 #define CL_QUEUE_PROPERTIES 0x1093 494 #define CL_MEM_READ_WRITE (1 << 0) 495 #define CL_MEM_WRITE_ONLY (1 << 1) 496 #define CL_MEM_READ_ONLY (1 << 2) 497 #define CL_MEM_USE_HOST_PTR (1 << 3) 498 #define CL_MEM_ALLOC_HOST_PTR (1 << 4) 499 #define CL_MEM_COPY_HOST_PTR (1 << 5) 500 // reserved (1 << 6) 501 #define CL_MEM_HOST_WRITE_ONLY (1 << 7) 502 #define CL_MEM_HOST_READ_ONLY (1 << 8) 503 #define CL_MEM_HOST_NO_ACCESS (1 << 9) 504 #define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) 505 #define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) 506 507 #define CL_R 0x10B0 508 #define CL_A 0x10B1 509 #define CL_RG 0x10B2 510 #define CL_RA 0x10B3 511 #define CL_RGB 0x10B4 512 #define CL_RGBA 0x10B5 513 #define CL_BGRA 0x10B6 514 #define CL_ARGB 0x10B7 515 #define CL_INTENSITY 0x10B8 516 #define CL_LUMINANCE 0x10B9 517 #define CL_Rx 0x10BA 518 #define CL_RGx 0x10BB 519 #define CL_RGBx 0x10BC 520 #define CL_DEPTH 0x10BD 521 #define CL_DEPTH_STENCIL 0x10BE 522 523 #define CL_SNORM_INT8 0x10D0 524 #define CL_SNORM_INT16 0x10D1 525 #define CL_UNORM_INT8 0x10D2 526 #define CL_UNORM_INT16 0x10D3 527 #define CL_UNORM_SHORT_565 0x10D4 528 #define CL_UNORM_SHORT_555 0x10D5 529 #define CL_UNORM_INT_101010 0x10D6 530 #define CL_SIGNED_INT8 0x10D7 531 #define CL_SIGNED_INT16 0x10D8 532 #define CL_SIGNED_INT32 0x10D9 533 #define CL_UNSIGNED_INT8 
0x10DA 534 #define CL_UNSIGNED_INT16 0x10DB 535 #define CL_UNSIGNED_INT32 0x10DC 536 #define CL_HALF_FLOAT 0x10DD 537 #define CL_FLOAT 0x10DE 538 #define CL_UNORM_INT24 0x10DF 539 540 #define CL_MEM_OBJECT_BUFFER 0x10F0 541 #define CL_MEM_OBJECT_IMAGE2D 0x10F1 542 #define CL_MEM_OBJECT_IMAGE3D 0x10F2 543 #define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 544 #define CL_MEM_OBJECT_IMAGE1D 0x10F4 545 #define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 546 #define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 547 548 #define CL_MEM_TYPE 0x1100 549 #define CL_MEM_FLAGS 0x1101 550 #define CL_MEM_SIZE 0x1102 551 #define CL_MEM_HOST_PTR 0x1103 552 #define CL_MEM_MAP_COUNT 0x1104 553 #define CL_MEM_REFERENCE_COUNT 0x1105 554 #define CL_MEM_CONTEXT 0x1106 555 #define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 556 #define CL_MEM_OFFSET 0x1108 557 558 #define CL_IMAGE_FORMAT 0x1110 559 #define CL_IMAGE_ELEMENT_SIZE 0x1111 560 #define CL_IMAGE_ROW_PITCH 0x1112 561 #define CL_IMAGE_SLICE_PITCH 0x1113 562 #define CL_IMAGE_WIDTH 0x1114 563 #define CL_IMAGE_HEIGHT 0x1115 564 #define CL_IMAGE_DEPTH 0x1116 565 #define CL_IMAGE_ARRAY_SIZE 0x1117 566 #define CL_IMAGE_BUFFER 0x1118 567 #define CL_IMAGE_NUM_MIP_LEVELS 0x1119 568 #define CL_IMAGE_NUM_SAMPLES 0x111A 569 570 #define CL_ADDRESS_NONE 0x1130 571 #define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 572 #define CL_ADDRESS_CLAMP 0x1132 573 #define CL_ADDRESS_REPEAT 0x1133 574 #define CL_ADDRESS_MIRRORED_REPEAT 0x1134 575 576 #define CL_FILTER_NEAREST 0x1140 577 #define CL_FILTER_LINEAR 0x1141 578 579 #define CL_SAMPLER_REFERENCE_COUNT 0x1150 580 #define CL_SAMPLER_CONTEXT 0x1151 581 #define CL_SAMPLER_NORMALIZED_COORDS 0x1152 582 #define CL_SAMPLER_ADDRESSING_MODE 0x1153 583 #define CL_SAMPLER_FILTER_MODE 0x1154 584 585 #define CL_MAP_READ (1 << 0) 586 #define CL_MAP_WRITE (1 << 1) 587 #define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) 588 589 #define CL_PROGRAM_REFERENCE_COUNT 0x1160 590 #define CL_PROGRAM_CONTEXT 0x1161 591 #define CL_PROGRAM_NUM_DEVICES 0x1162 592 #define 
CL_PROGRAM_DEVICES 0x1163 593 #define CL_PROGRAM_SOURCE 0x1164 594 #define CL_PROGRAM_BINARY_SIZES 0x1165 595 #define CL_PROGRAM_BINARIES 0x1166 596 #define CL_PROGRAM_NUM_KERNELS 0x1167 597 #define CL_PROGRAM_KERNEL_NAMES 0x1168 598 #define CL_PROGRAM_BUILD_STATUS 0x1181 599 #define CL_PROGRAM_BUILD_OPTIONS 0x1182 600 #define CL_PROGRAM_BUILD_LOG 0x1183 601 #define CL_PROGRAM_BINARY_TYPE 0x1184 602 #define CL_PROGRAM_BINARY_TYPE_NONE 0x0 603 #define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 604 #define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 605 #define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 606 607 #define CL_BUILD_SUCCESS 0 608 #define CL_BUILD_NONE -1 609 #define CL_BUILD_ERROR -2 610 #define CL_BUILD_IN_PROGRESS -3 611 612 #define CL_KERNEL_FUNCTION_NAME 0x1190 613 #define CL_KERNEL_NUM_ARGS 0x1191 614 #define CL_KERNEL_REFERENCE_COUNT 0x1192 615 #define CL_KERNEL_CONTEXT 0x1193 616 #define CL_KERNEL_PROGRAM 0x1194 617 #define CL_KERNEL_ATTRIBUTES 0x1195 618 #define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 619 #define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 620 #define CL_KERNEL_ARG_TYPE_NAME 0x1198 621 #define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 622 #define CL_KERNEL_ARG_NAME 0x119A 623 #define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B 624 #define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C 625 #define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D 626 #define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E 627 #define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 628 #define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 629 #define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 630 #define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 631 #define CL_KERNEL_ARG_TYPE_NONE 0 632 #define CL_KERNEL_ARG_TYPE_CONST (1 << 0) 633 #define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) 634 #define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) 635 #define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 636 #define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 637 #define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 638 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 639 #define 
CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 640 #define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 641 642 #define CL_EVENT_COMMAND_QUEUE 0x11D0 643 #define CL_EVENT_COMMAND_TYPE 0x11D1 644 #define CL_EVENT_REFERENCE_COUNT 0x11D2 645 #define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 646 #define CL_EVENT_CONTEXT 0x11D4 647 648 #define CL_COMMAND_NDRANGE_KERNEL 0x11F0 649 #define CL_COMMAND_TASK 0x11F1 650 #define CL_COMMAND_NATIVE_KERNEL 0x11F2 651 #define CL_COMMAND_READ_BUFFER 0x11F3 652 #define CL_COMMAND_WRITE_BUFFER 0x11F4 653 #define CL_COMMAND_COPY_BUFFER 0x11F5 654 #define CL_COMMAND_READ_IMAGE 0x11F6 655 #define CL_COMMAND_WRITE_IMAGE 0x11F7 656 #define CL_COMMAND_COPY_IMAGE 0x11F8 657 #define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 658 #define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA 659 #define CL_COMMAND_MAP_BUFFER 0x11FB 660 #define CL_COMMAND_MAP_IMAGE 0x11FC 661 #define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD 662 #define CL_COMMAND_MARKER 0x11FE 663 #define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF 664 #define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 665 #define CL_COMMAND_READ_BUFFER_RECT 0x1201 666 #define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 667 #define CL_COMMAND_COPY_BUFFER_RECT 0x1203 668 #define CL_COMMAND_USER 0x1204 669 #define CL_COMMAND_BARRIER 0x1205 670 #define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 671 #define CL_COMMAND_FILL_BUFFER 0x1207 672 #define CL_COMMAND_FILL_IMAGE 0x1208 673 674 #define CL_COMPLETE 0x0 675 #define CL_RUNNING 0x1 676 #define CL_SUBMITTED 0x2 677 #define CL_QUEUED 0x3 678 #define CL_BUFFER_CREATE_TYPE_REGION 0x1220 679 680 #define CL_PROFILING_COMMAND_QUEUED 0x1280 681 #define CL_PROFILING_COMMAND_SUBMIT 0x1281 682 #define CL_PROFILING_COMMAND_START 0x1282 683 #define CL_PROFILING_COMMAND_END 0x1283 684 685 #define CL_CALLBACK CV_STDCALL 686 687 static volatile bool g_haveOpenCL = false; 688 static const char* oclFuncToCheck = "clEnqueueReadBufferRect"; 689 690 #if defined(__APPLE__) 691 #include <dlfcn.h> 692 693 static void* initOpenCLAndLoad(const 
char* funcname) 694 { 695 static bool initialized = false; 696 static void* handle = 0; 697 if (!handle) 698 { 699 if(!initialized) 700 { 701 const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME"); 702 oclpath = oclpath && strlen(oclpath) > 0 ? oclpath : 703 "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL"; 704 handle = dlopen(oclpath, RTLD_LAZY); 705 initialized = true; 706 g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0; 707 if( g_haveOpenCL ) 708 fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath); 709 else 710 fprintf(stderr, "Failed to load OpenCL runtime\n"); 711 } 712 if(!handle) 713 return 0; 714 } 715 716 return funcname && handle ? dlsym(handle, funcname) : 0; 717 } 718 719 #elif defined WIN32 || defined _WIN32 720 721 #ifndef _WIN32_WINNT // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?) 722 #define _WIN32_WINNT 0x0400 // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx 723 #endif 724 #include <windows.h> 725 #if (_WIN32_WINNT >= 0x0602) 726 #include <synchapi.h> 727 #endif 728 #undef small 729 #undef min 730 #undef max 731 #undef abs 732 733 static void* initOpenCLAndLoad(const char* funcname) 734 { 735 static bool initialized = false; 736 static HMODULE handle = 0; 737 if (!handle) 738 { 739 #ifndef WINRT 740 if(!initialized) 741 { 742 handle = LoadLibraryA("OpenCL.dll"); 743 initialized = true; 744 g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0; 745 } 746 #endif 747 if(!handle) 748 return 0; 749 } 750 751 return funcname ? 
(void*)GetProcAddress(handle, funcname) : 0; 752 } 753 754 #elif defined(__linux) 755 756 #include <dlfcn.h> 757 #include <stdio.h> 758 759 static void* initOpenCLAndLoad(const char* funcname) 760 { 761 static bool initialized = false; 762 static void* handle = 0; 763 if (!handle) 764 { 765 if(!initialized) 766 { 767 handle = dlopen("libOpenCL.so", RTLD_LAZY); 768 if(!handle) 769 handle = dlopen("libCL.so", RTLD_LAZY); 770 initialized = true; 771 g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0; 772 } 773 if(!handle) 774 return 0; 775 } 776 777 return funcname ? (void*)dlsym(handle, funcname) : 0; 778 } 779 780 #else 781 782 static void* initOpenCLAndLoad(const char*) 783 { 784 return 0; 785 } 786 787 #endif 788 789 790 #define OCL_FUNC(rettype, funcname, argsdecl, args) \ 791 typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \ 792 static rettype funcname argsdecl \ 793 { \ 794 static funcname##_t funcname##_p = 0; \ 795 if( !funcname##_p ) \ 796 { \ 797 funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \ 798 if( !funcname##_p ) \ 799 return OPENCV_CL_NOT_IMPLEMENTED; \ 800 } \ 801 return funcname##_p args; \ 802 } 803 804 805 #define OCL_FUNC_P(rettype, funcname, argsdecl, args) \ 806 typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \ 807 static rettype funcname argsdecl \ 808 { \ 809 static funcname##_t funcname##_p = 0; \ 810 if( !funcname##_p ) \ 811 { \ 812 funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \ 813 if( !funcname##_p ) \ 814 { \ 815 if( errcode_ret ) \ 816 *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \ 817 return 0; \ 818 } \ 819 } \ 820 return funcname##_p args; \ 821 } 822 823 OCL_FUNC(cl_int, clGetPlatformIDs, 824 (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms), 825 (num_entries, platforms, num_platforms)) 826 827 OCL_FUNC(cl_int, clGetPlatformInfo, 828 (cl_platform_id platform, cl_platform_info param_name, 829 size_t param_value_size, void * param_value, 830 size_t * 
param_value_size_ret), 831 (platform, param_name, param_value_size, param_value, param_value_size_ret)) 832 833 OCL_FUNC(cl_int, clGetDeviceInfo, 834 (cl_device_id device, 835 cl_device_info param_name, 836 size_t param_value_size, 837 void * param_value, 838 size_t * param_value_size_ret), 839 (device, param_name, param_value_size, param_value, param_value_size_ret)) 840 841 842 OCL_FUNC(cl_int, clGetDeviceIDs, 843 (cl_platform_id platform, 844 cl_device_type device_type, 845 cl_uint num_entries, 846 cl_device_id * devices, 847 cl_uint * num_devices), 848 (platform, device_type, num_entries, devices, num_devices)) 849 850 OCL_FUNC_P(cl_context, clCreateContext, 851 (const cl_context_properties * properties, 852 cl_uint num_devices, 853 const cl_device_id * devices, 854 void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *), 855 void * user_data, 856 cl_int * errcode_ret), 857 (properties, num_devices, devices, pfn_notify, user_data, errcode_ret)) 858 859 OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context)) 860 861 /* 862 OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context)) 863 864 OCL_FUNC_P(cl_context, clCreateContextFromType, 865 (const cl_context_properties * properties, 866 cl_device_type device_type, 867 void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *), 868 void * user_data, 869 cl_int * errcode_ret), 870 (properties, device_type, pfn_notify, user_data, errcode_ret)) 871 872 OCL_FUNC(cl_int, clGetContextInfo, 873 (cl_context context, 874 cl_context_info param_name, 875 size_t param_value_size, 876 void * param_value, 877 size_t * param_value_size_ret), 878 (context, param_name, param_value_size, 879 param_value, param_value_size_ret)) 880 */ 881 OCL_FUNC_P(cl_command_queue, clCreateCommandQueue, 882 (cl_context context, 883 cl_device_id device, 884 cl_command_queue_properties properties, 885 cl_int * errcode_ret), 886 (context, device, properties, errcode_ret)) 887 888 
OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue)) 889 890 OCL_FUNC_P(cl_mem, clCreateBuffer, 891 (cl_context context, 892 cl_mem_flags flags, 893 size_t size, 894 void * host_ptr, 895 cl_int * errcode_ret), 896 (context, flags, size, host_ptr, errcode_ret)) 897 898 /* 899 OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue)) 900 901 OCL_FUNC(cl_int, clGetCommandQueueInfo, 902 (cl_command_queue command_queue, 903 cl_command_queue_info param_name, 904 size_t param_value_size, 905 void * param_value, 906 size_t * param_value_size_ret), 907 (command_queue, param_name, param_value_size, param_value, param_value_size_ret)) 908 909 OCL_FUNC_P(cl_mem, clCreateSubBuffer, 910 (cl_mem buffer, 911 cl_mem_flags flags, 912 cl_buffer_create_type buffer_create_type, 913 const void * buffer_create_info, 914 cl_int * errcode_ret), 915 (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret)) 916 */ 917 918 OCL_FUNC_P(cl_mem, clCreateImage, 919 (cl_context context, 920 cl_mem_flags flags, 921 const cl_image_format * image_format, 922 const cl_image_desc * image_desc, 923 void * host_ptr, 924 cl_int * errcode_ret), 925 (context, flags, image_format, image_desc, host_ptr, errcode_ret)) 926 927 OCL_FUNC_P(cl_mem, clCreateImage2D, 928 (cl_context context, 929 cl_mem_flags flags, 930 const cl_image_format * image_format, 931 size_t image_width, 932 size_t image_height, 933 size_t image_row_pitch, 934 void * host_ptr, 935 cl_int *errcode_ret), 936 (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret)) 937 938 OCL_FUNC(cl_int, clGetSupportedImageFormats, 939 (cl_context context, 940 cl_mem_flags flags, 941 cl_mem_object_type image_type, 942 cl_uint num_entries, 943 cl_image_format * image_formats, 944 cl_uint * num_image_formats), 945 (context, flags, image_type, num_entries, image_formats, num_image_formats)) 946 947 948 /* 949 OCL_FUNC(cl_int, 
clGetMemObjectInfo, 950 (cl_mem memobj, 951 cl_mem_info param_name, 952 size_t param_value_size, 953 void * param_value, 954 size_t * param_value_size_ret), 955 (memobj, param_name, param_value_size, param_value, param_value_size_ret)) 956 957 OCL_FUNC(cl_int, clGetImageInfo, 958 (cl_mem image, 959 cl_image_info param_name, 960 size_t param_value_size, 961 void * param_value, 962 size_t * param_value_size_ret), 963 (image, param_name, param_value_size, param_value, param_value_size_ret)) 964 965 OCL_FUNC(cl_int, clCreateKernelsInProgram, 966 (cl_program program, 967 cl_uint num_kernels, 968 cl_kernel * kernels, 969 cl_uint * num_kernels_ret), 970 (program, num_kernels, kernels, num_kernels_ret)) 971 972 OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel)) 973 974 OCL_FUNC(cl_int, clGetKernelArgInfo, 975 (cl_kernel kernel, 976 cl_uint arg_indx, 977 cl_kernel_arg_info param_name, 978 size_t param_value_size, 979 void * param_value, 980 size_t * param_value_size_ret), 981 (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret)) 982 983 OCL_FUNC(cl_int, clEnqueueReadImage, 984 (cl_command_queue command_queue, 985 cl_mem image, 986 cl_bool blocking_read, 987 const size_t * origin[3], 988 const size_t * region[3], 989 size_t row_pitch, 990 size_t slice_pitch, 991 void * ptr, 992 cl_uint num_events_in_wait_list, 993 const cl_event * event_wait_list, 994 cl_event * event), 995 (command_queue, image, blocking_read, origin, region, 996 row_pitch, slice_pitch, 997 ptr, 998 num_events_in_wait_list, 999 event_wait_list, 1000 event)) 1001 1002 OCL_FUNC(cl_int, clEnqueueWriteImage, 1003 (cl_command_queue command_queue, 1004 cl_mem image, 1005 cl_bool blocking_write, 1006 const size_t * origin[3], 1007 const size_t * region[3], 1008 size_t input_row_pitch, 1009 size_t input_slice_pitch, 1010 const void * ptr, 1011 cl_uint num_events_in_wait_list, 1012 const cl_event * event_wait_list, 1013 cl_event * event), 1014 (command_queue, image, 
blocking_write, origin, region, input_row_pitch, 1015 input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event)) 1016 1017 OCL_FUNC(cl_int, clEnqueueFillImage, 1018 (cl_command_queue command_queue, 1019 cl_mem image, 1020 const void * fill_color, 1021 const size_t * origin[3], 1022 const size_t * region[3], 1023 cl_uint num_events_in_wait_list, 1024 const cl_event * event_wait_list, 1025 cl_event * event), 1026 (command_queue, image, fill_color, origin, region, 1027 num_events_in_wait_list, event_wait_list, event)) 1028 1029 OCL_FUNC(cl_int, clEnqueueCopyImage, 1030 (cl_command_queue command_queue, 1031 cl_mem src_image, 1032 cl_mem dst_image, 1033 const size_t * src_origin[3], 1034 const size_t * dst_origin[3], 1035 const size_t * region[3], 1036 cl_uint num_events_in_wait_list, 1037 const cl_event * event_wait_list, 1038 cl_event * event), 1039 (command_queue, src_image, dst_image, src_origin, dst_origin, 1040 region, num_events_in_wait_list, event_wait_list, event)) 1041 1042 OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer, 1043 (cl_command_queue command_queue, 1044 cl_mem src_image, 1045 cl_mem dst_buffer, 1046 const size_t * src_origin[3], 1047 const size_t * region[3], 1048 size_t dst_offset, 1049 cl_uint num_events_in_wait_list, 1050 const cl_event * event_wait_list, 1051 cl_event * event), 1052 (command_queue, src_image, dst_buffer, src_origin, region, dst_offset, 1053 num_events_in_wait_list, event_wait_list, event)) 1054 */ 1055 1056 OCL_FUNC(cl_int, clEnqueueCopyBufferToImage, 1057 (cl_command_queue command_queue, 1058 cl_mem src_buffer, 1059 cl_mem dst_image, 1060 size_t src_offset, 1061 const size_t dst_origin[3], 1062 const size_t region[3], 1063 cl_uint num_events_in_wait_list, 1064 const cl_event * event_wait_list, 1065 cl_event * event), 1066 (command_queue, src_buffer, dst_image, src_offset, dst_origin, 1067 region, num_events_in_wait_list, event_wait_list, event)) 1068 1069 OCL_FUNC(cl_int, clFlush, 1070 (cl_command_queue 
command_queue), 1071 (command_queue)) 1072 1073 /* 1074 OCL_FUNC_P(void*, clEnqueueMapImage, 1075 (cl_command_queue command_queue, 1076 cl_mem image, 1077 cl_bool blocking_map, 1078 cl_map_flags map_flags, 1079 const size_t * origin[3], 1080 const size_t * region[3], 1081 size_t * image_row_pitch, 1082 size_t * image_slice_pitch, 1083 cl_uint num_events_in_wait_list, 1084 const cl_event * event_wait_list, 1085 cl_event * event, 1086 cl_int * errcode_ret), 1087 (command_queue, image, blocking_map, map_flags, origin, region, 1088 image_row_pitch, image_slice_pitch, num_events_in_wait_list, 1089 event_wait_list, event, errcode_ret)) 1090 */ 1091 1092 /* 1093 OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program)) 1094 1095 OCL_FUNC(cl_int, clGetKernelInfo, 1096 (cl_kernel kernel, 1097 cl_kernel_info param_name, 1098 size_t param_value_size, 1099 void * param_value, 1100 size_t * param_value_size_ret), 1101 (kernel, param_name, param_value_size, param_value, param_value_size_ret)) 1102 1103 OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj)) 1104 1105 */ 1106 1107 OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj)) 1108 1109 1110 OCL_FUNC_P(cl_program, clCreateProgramWithSource, 1111 (cl_context context, 1112 cl_uint count, 1113 const char ** strings, 1114 const size_t * lengths, 1115 cl_int * errcode_ret), 1116 (context, count, strings, lengths, errcode_ret)) 1117 1118 OCL_FUNC_P(cl_program, clCreateProgramWithBinary, 1119 (cl_context context, 1120 cl_uint num_devices, 1121 const cl_device_id * device_list, 1122 const size_t * lengths, 1123 const unsigned char ** binaries, 1124 cl_int * binary_status, 1125 cl_int * errcode_ret), 1126 (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret)) 1127 1128 OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program)) 1129 1130 OCL_FUNC(cl_int, clBuildProgram, 1131 (cl_program program, 1132 cl_uint num_devices, 1133 const cl_device_id * device_list, 1134 const 
char * options, 1135 void (CL_CALLBACK * pfn_notify)(cl_program, void *), 1136 void * user_data), 1137 (program, num_devices, device_list, options, pfn_notify, user_data)) 1138 1139 OCL_FUNC(cl_int, clGetProgramInfo, 1140 (cl_program program, 1141 cl_program_info param_name, 1142 size_t param_value_size, 1143 void * param_value, 1144 size_t * param_value_size_ret), 1145 (program, param_name, param_value_size, param_value, param_value_size_ret)) 1146 1147 OCL_FUNC(cl_int, clGetProgramBuildInfo, 1148 (cl_program program, 1149 cl_device_id device, 1150 cl_program_build_info param_name, 1151 size_t param_value_size, 1152 void * param_value, 1153 size_t * param_value_size_ret), 1154 (program, device, param_name, param_value_size, param_value, param_value_size_ret)) 1155 1156 OCL_FUNC_P(cl_kernel, clCreateKernel, 1157 (cl_program program, 1158 const char * kernel_name, 1159 cl_int * errcode_ret), 1160 (program, kernel_name, errcode_ret)) 1161 1162 OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel)) 1163 1164 OCL_FUNC(cl_int, clSetKernelArg, 1165 (cl_kernel kernel, 1166 cl_uint arg_index, 1167 size_t arg_size, 1168 const void * arg_value), 1169 (kernel, arg_index, arg_size, arg_value)) 1170 1171 OCL_FUNC(cl_int, clGetKernelWorkGroupInfo, 1172 (cl_kernel kernel, 1173 cl_device_id device, 1174 cl_kernel_work_group_info param_name, 1175 size_t param_value_size, 1176 void * param_value, 1177 size_t * param_value_size_ret), 1178 (kernel, device, param_name, param_value_size, param_value, param_value_size_ret)) 1179 1180 OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue)) 1181 1182 OCL_FUNC(cl_int, clEnqueueReadBuffer, 1183 (cl_command_queue command_queue, 1184 cl_mem buffer, 1185 cl_bool blocking_read, 1186 size_t offset, 1187 size_t size, 1188 void * ptr, 1189 cl_uint num_events_in_wait_list, 1190 const cl_event * event_wait_list, 1191 cl_event * event), 1192 (command_queue, buffer, blocking_read, offset, size, ptr, 1193 
num_events_in_wait_list, event_wait_list, event)) 1194 1195 OCL_FUNC(cl_int, clEnqueueReadBufferRect, 1196 (cl_command_queue command_queue, 1197 cl_mem buffer, 1198 cl_bool blocking_read, 1199 const size_t * buffer_offset, 1200 const size_t * host_offset, 1201 const size_t * region, 1202 size_t buffer_row_pitch, 1203 size_t buffer_slice_pitch, 1204 size_t host_row_pitch, 1205 size_t host_slice_pitch, 1206 void * ptr, 1207 cl_uint num_events_in_wait_list, 1208 const cl_event * event_wait_list, 1209 cl_event * event), 1210 (command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch, 1211 buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list, 1212 event_wait_list, event)) 1213 1214 OCL_FUNC(cl_int, clEnqueueWriteBuffer, 1215 (cl_command_queue command_queue, 1216 cl_mem buffer, 1217 cl_bool blocking_write, 1218 size_t offset, 1219 size_t size, 1220 const void * ptr, 1221 cl_uint num_events_in_wait_list, 1222 const cl_event * event_wait_list, 1223 cl_event * event), 1224 (command_queue, buffer, blocking_write, offset, size, ptr, 1225 num_events_in_wait_list, event_wait_list, event)) 1226 1227 OCL_FUNC(cl_int, clEnqueueWriteBufferRect, 1228 (cl_command_queue command_queue, 1229 cl_mem buffer, 1230 cl_bool blocking_write, 1231 const size_t * buffer_offset, 1232 const size_t * host_offset, 1233 const size_t * region, 1234 size_t buffer_row_pitch, 1235 size_t buffer_slice_pitch, 1236 size_t host_row_pitch, 1237 size_t host_slice_pitch, 1238 const void * ptr, 1239 cl_uint num_events_in_wait_list, 1240 const cl_event * event_wait_list, 1241 cl_event * event), 1242 (command_queue, buffer, blocking_write, buffer_offset, host_offset, 1243 region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch, 1244 host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event)) 1245 1246 /*OCL_FUNC(cl_int, clEnqueueFillBuffer, 1247 (cl_command_queue command_queue, 1248 cl_mem buffer, 1249 const void * pattern, 1250 
size_t pattern_size, 1251 size_t offset, 1252 size_t size, 1253 cl_uint num_events_in_wait_list, 1254 const cl_event * event_wait_list, 1255 cl_event * event), 1256 (command_queue, buffer, pattern, pattern_size, offset, size, 1257 num_events_in_wait_list, event_wait_list, event))*/ 1258 1259 OCL_FUNC(cl_int, clEnqueueCopyBuffer, 1260 (cl_command_queue command_queue, 1261 cl_mem src_buffer, 1262 cl_mem dst_buffer, 1263 size_t src_offset, 1264 size_t dst_offset, 1265 size_t size, 1266 cl_uint num_events_in_wait_list, 1267 const cl_event * event_wait_list, 1268 cl_event * event), 1269 (command_queue, src_buffer, dst_buffer, src_offset, dst_offset, 1270 size, num_events_in_wait_list, event_wait_list, event)) 1271 1272 OCL_FUNC(cl_int, clEnqueueCopyBufferRect, 1273 (cl_command_queue command_queue, 1274 cl_mem src_buffer, 1275 cl_mem dst_buffer, 1276 const size_t * src_origin, 1277 const size_t * dst_origin, 1278 const size_t * region, 1279 size_t src_row_pitch, 1280 size_t src_slice_pitch, 1281 size_t dst_row_pitch, 1282 size_t dst_slice_pitch, 1283 cl_uint num_events_in_wait_list, 1284 const cl_event * event_wait_list, 1285 cl_event * event), 1286 (command_queue, src_buffer, dst_buffer, src_origin, dst_origin, 1287 region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch, 1288 num_events_in_wait_list, event_wait_list, event)) 1289 1290 OCL_FUNC_P(void*, clEnqueueMapBuffer, 1291 (cl_command_queue command_queue, 1292 cl_mem buffer, 1293 cl_bool blocking_map, 1294 cl_map_flags map_flags, 1295 size_t offset, 1296 size_t size, 1297 cl_uint num_events_in_wait_list, 1298 const cl_event * event_wait_list, 1299 cl_event * event, 1300 cl_int * errcode_ret), 1301 (command_queue, buffer, blocking_map, map_flags, offset, size, 1302 num_events_in_wait_list, event_wait_list, event, errcode_ret)) 1303 1304 OCL_FUNC(cl_int, clEnqueueUnmapMemObject, 1305 (cl_command_queue command_queue, 1306 cl_mem memobj, 1307 void * mapped_ptr, 1308 cl_uint num_events_in_wait_list, 1309 
const cl_event * event_wait_list, 1310 cl_event * event), 1311 (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event)) 1312 1313 OCL_FUNC(cl_int, clEnqueueNDRangeKernel, 1314 (cl_command_queue command_queue, 1315 cl_kernel kernel, 1316 cl_uint work_dim, 1317 const size_t * global_work_offset, 1318 const size_t * global_work_size, 1319 const size_t * local_work_size, 1320 cl_uint num_events_in_wait_list, 1321 const cl_event * event_wait_list, 1322 cl_event * event), 1323 (command_queue, kernel, work_dim, global_work_offset, global_work_size, 1324 local_work_size, num_events_in_wait_list, event_wait_list, event)) 1325 1326 OCL_FUNC(cl_int, clEnqueueTask, 1327 (cl_command_queue command_queue, 1328 cl_kernel kernel, 1329 cl_uint num_events_in_wait_list, 1330 const cl_event * event_wait_list, 1331 cl_event * event), 1332 (command_queue, kernel, num_events_in_wait_list, event_wait_list, event)) 1333 1334 OCL_FUNC(cl_int, clSetEventCallback, 1335 (cl_event event, 1336 cl_int command_exec_callback_type , 1337 void (CL_CALLBACK *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data), 1338 void *user_data), 1339 (event, command_exec_callback_type, pfn_event_notify, user_data)) 1340 1341 OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event)) 1342 1343 } 1344 1345 #endif 1346 1347 #ifndef CL_VERSION_1_2 1348 #define CL_VERSION_1_2 1349 #endif 1350 1351 #endif 1352 1353 #ifdef _DEBUG 1354 #define CV_OclDbgAssert CV_DbgAssert 1355 #else 1356 static bool isRaiseError() 1357 { 1358 static bool initialized = false; 1359 static bool value = false; 1360 if (!initialized) 1361 { 1362 value = getBoolParameter("OPENCV_OPENCL_RAISE_ERROR", false); 1363 initialized = true; 1364 } 1365 return value; 1366 } 1367 #define CV_OclDbgAssert(expr) do { if (isRaiseError()) { CV_Assert(expr); } else { (void)(expr); } } while ((void)0, 0) 1368 #endif 1369 1370 #ifdef HAVE_OPENCL_SVM 1371 #include 
"opencv2/core/opencl/runtime/opencl_svm_20.hpp" 1372 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp" 1373 #include "opencv2/core/opencl/opencl_svm.hpp" 1374 #endif 1375 1376 namespace cv { namespace ocl { 1377 1378 struct UMat2D 1379 { 1380 UMat2D(const UMat& m) 1381 { 1382 offset = (int)m.offset; 1383 step = (int)m.step; 1384 rows = m.rows; 1385 cols = m.cols; 1386 } 1387 int offset; 1388 int step; 1389 int rows; 1390 int cols; 1391 }; 1392 1393 struct UMat3D 1394 { 1395 UMat3D(const UMat& m) 1396 { 1397 offset = (int)m.offset; 1398 step = (int)m.step.p[1]; 1399 slicestep = (int)m.step.p[0]; 1400 slices = (int)m.size.p[0]; 1401 rows = m.size.p[1]; 1402 cols = m.size.p[2]; 1403 } 1404 int offset; 1405 int slicestep; 1406 int step; 1407 int slices; 1408 int rows; 1409 int cols; 1410 }; 1411 1412 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182 1413 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 ) 1414 { 1415 static uint64 table[256]; 1416 static bool initialized = false; 1417 1418 if( !initialized ) 1419 { 1420 for( int i = 0; i < 256; i++ ) 1421 { 1422 uint64 c = i; 1423 for( int j = 0; j < 8; j++ ) 1424 c = ((c & 1) ? 
CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1); 1425 table[i] = c; 1426 } 1427 initialized = true; 1428 } 1429 1430 uint64 crc = ~crc0; 1431 for( size_t idx = 0; idx < size; idx++ ) 1432 crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8); 1433 1434 return ~crc; 1435 } 1436 1437 struct HashKey 1438 { 1439 typedef uint64 part; 1440 HashKey(part _a, part _b) : a(_a), b(_b) {} 1441 part a, b; 1442 }; 1443 1444 inline bool operator == (const HashKey& h1, const HashKey& h2) 1445 { 1446 return h1.a == h2.a && h1.b == h2.b; 1447 } 1448 1449 inline bool operator < (const HashKey& h1, const HashKey& h2) 1450 { 1451 return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b); 1452 } 1453 1454 1455 bool haveOpenCL() 1456 { 1457 #ifdef HAVE_OPENCL 1458 static bool g_isOpenCLInitialized = false; 1459 static bool g_isOpenCLAvailable = false; 1460 1461 if (!g_isOpenCLInitialized) 1462 { 1463 try 1464 { 1465 cl_uint n = 0; 1466 g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS; 1467 } 1468 catch (...) 1469 { 1470 g_isOpenCLAvailable = false; 1471 } 1472 g_isOpenCLInitialized = true; 1473 } 1474 return g_isOpenCLAvailable; 1475 #else 1476 return false; 1477 #endif 1478 } 1479 1480 bool useOpenCL() 1481 { 1482 CoreTLSData* data = getCoreTlsData().get(); 1483 if( data->useOpenCL < 0 ) 1484 { 1485 try 1486 { 1487 data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available(); 1488 } 1489 catch (...) 1490 { 1491 data->useOpenCL = 0; 1492 } 1493 } 1494 return data->useOpenCL > 0; 1495 } 1496 1497 void setUseOpenCL(bool flag) 1498 { 1499 if( haveOpenCL() ) 1500 { 1501 CoreTLSData* data = getCoreTlsData().get(); 1502 data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 
1 : 0; 1503 } 1504 } 1505 1506 #ifdef HAVE_CLAMDBLAS 1507 1508 class AmdBlasHelper 1509 { 1510 public: 1511 static AmdBlasHelper & getInstance() 1512 { 1513 static AmdBlasHelper amdBlas; 1514 return amdBlas; 1515 } 1516 1517 bool isAvailable() const 1518 { 1519 return g_isAmdBlasAvailable; 1520 } 1521 1522 ~AmdBlasHelper() 1523 { 1524 try 1525 { 1526 clAmdBlasTeardown(); 1527 } 1528 catch (...) { } 1529 } 1530 1531 protected: 1532 AmdBlasHelper() 1533 { 1534 if (!g_isAmdBlasInitialized) 1535 { 1536 AutoLock lock(m); 1537 1538 if (!g_isAmdBlasInitialized && haveOpenCL()) 1539 { 1540 try 1541 { 1542 g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess; 1543 } 1544 catch (...) 1545 { 1546 g_isAmdBlasAvailable = false; 1547 } 1548 } 1549 else 1550 g_isAmdBlasAvailable = false; 1551 1552 g_isAmdBlasInitialized = true; 1553 } 1554 } 1555 1556 private: 1557 static Mutex m; 1558 static bool g_isAmdBlasInitialized; 1559 static bool g_isAmdBlasAvailable; 1560 }; 1561 1562 bool AmdBlasHelper::g_isAmdBlasAvailable = false; 1563 bool AmdBlasHelper::g_isAmdBlasInitialized = false; 1564 Mutex AmdBlasHelper::m; 1565 1566 bool haveAmdBlas() 1567 { 1568 return AmdBlasHelper::getInstance().isAvailable(); 1569 } 1570 1571 #else 1572 1573 bool haveAmdBlas() 1574 { 1575 return false; 1576 } 1577 1578 #endif 1579 1580 #ifdef HAVE_CLAMDFFT 1581 1582 class AmdFftHelper 1583 { 1584 public: 1585 static AmdFftHelper & getInstance() 1586 { 1587 static AmdFftHelper amdFft; 1588 return amdFft; 1589 } 1590 1591 bool isAvailable() const 1592 { 1593 return g_isAmdFftAvailable; 1594 } 1595 1596 ~AmdFftHelper() 1597 { 1598 try 1599 { 1600 // clAmdFftTeardown(); 1601 } 1602 catch (...) 
{ } 1603 } 1604 1605 protected: 1606 AmdFftHelper() 1607 { 1608 if (!g_isAmdFftInitialized) 1609 { 1610 AutoLock lock(m); 1611 1612 if (!g_isAmdFftInitialized && haveOpenCL()) 1613 { 1614 try 1615 { 1616 cl_uint major, minor, patch; 1617 CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS); 1618 1619 // it throws exception in case AmdFft binaries are not found 1620 CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS); 1621 g_isAmdFftAvailable = true; 1622 } 1623 catch (const Exception &) 1624 { 1625 g_isAmdFftAvailable = false; 1626 } 1627 } 1628 else 1629 g_isAmdFftAvailable = false; 1630 1631 g_isAmdFftInitialized = true; 1632 } 1633 } 1634 1635 private: 1636 static clAmdFftSetupData setupData; 1637 static Mutex m; 1638 static bool g_isAmdFftInitialized; 1639 static bool g_isAmdFftAvailable; 1640 }; 1641 1642 clAmdFftSetupData AmdFftHelper::setupData; 1643 bool AmdFftHelper::g_isAmdFftAvailable = false; 1644 bool AmdFftHelper::g_isAmdFftInitialized = false; 1645 Mutex AmdFftHelper::m; 1646 1647 bool haveAmdFft() 1648 { 1649 return AmdFftHelper::getInstance().isAvailable(); 1650 } 1651 1652 #else 1653 1654 bool haveAmdFft() 1655 { 1656 return false; 1657 } 1658 1659 #endif 1660 1661 bool haveSVM() 1662 { 1663 #ifdef HAVE_OPENCL_SVM 1664 return true; 1665 #else 1666 return false; 1667 #endif 1668 } 1669 1670 void finish() 1671 { 1672 Queue::getDefault().finish(); 1673 } 1674 1675 #define IMPLEMENT_REFCOUNTABLE() \ 1676 void addref() { CV_XADD(&refcount, 1); } \ 1677 void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \ 1678 int refcount 1679 1680 /////////////////////////////////////////// Platform ///////////////////////////////////////////// 1681 1682 struct Platform::Impl 1683 { 1684 Impl() 1685 { 1686 refcount = 1; 1687 handle = 0; 1688 initialized = false; 1689 } 1690 1691 ~Impl() {} 1692 1693 void init() 1694 { 1695 if( !initialized ) 1696 { 1697 //cl_uint num_entries 1698 cl_uint n = 0; 1699 if( 
clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 ) 1700 handle = 0; 1701 if( handle != 0 ) 1702 { 1703 char buf[1000]; 1704 size_t len = 0; 1705 CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS); 1706 buf[len] = '\0'; 1707 vendor = String(buf); 1708 } 1709 1710 initialized = true; 1711 } 1712 } 1713 1714 IMPLEMENT_REFCOUNTABLE(); 1715 1716 cl_platform_id handle; 1717 String vendor; 1718 bool initialized; 1719 }; 1720 1721 Platform::Platform() 1722 { 1723 p = 0; 1724 } 1725 1726 Platform::~Platform() 1727 { 1728 if(p) 1729 p->release(); 1730 } 1731 1732 Platform::Platform(const Platform& pl) 1733 { 1734 p = (Impl*)pl.p; 1735 if(p) 1736 p->addref(); 1737 } 1738 1739 Platform& Platform::operator = (const Platform& pl) 1740 { 1741 Impl* newp = (Impl*)pl.p; 1742 if(newp) 1743 newp->addref(); 1744 if(p) 1745 p->release(); 1746 p = newp; 1747 return *this; 1748 } 1749 1750 void* Platform::ptr() const 1751 { 1752 return p ? p->handle : 0; 1753 } 1754 1755 Platform& Platform::getDefault() 1756 { 1757 static Platform p; 1758 if( !p.p ) 1759 { 1760 p.p = new Impl; 1761 p.p->init(); 1762 } 1763 return p; 1764 } 1765 1766 /////////////////////////////////////// Device //////////////////////////////////////////// 1767 1768 // deviceVersion has format 1769 // OpenCL<space><major_version.minor_version><space><vendor-specific information> 1770 // by specification 1771 // http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html 1772 // http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html 1773 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor) 1774 { 1775 major = minor = 0; 1776 if (10 >= deviceVersion.length()) 1777 return; 1778 const char *pstr = deviceVersion.c_str(); 1779 if (0 != strncmp(pstr, "OpenCL ", 7)) 1780 return; 1781 size_t ppos = deviceVersion.find('.', 7); 1782 if (String::npos == ppos) 1783 return; 1784 String temp = 
deviceVersion.substr(7, ppos - 7); 1785 major = atoi(temp.c_str()); 1786 temp = deviceVersion.substr(ppos + 1); 1787 minor = atoi(temp.c_str()); 1788 } 1789 1790 struct Device::Impl 1791 { 1792 Impl(void* d) 1793 { 1794 handle = (cl_device_id)d; 1795 refcount = 1; 1796 1797 name_ = getStrProp(CL_DEVICE_NAME); 1798 version_ = getStrProp(CL_DEVICE_VERSION); 1799 doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG); 1800 hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY); 1801 maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS); 1802 maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE); 1803 type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE); 1804 driverVersion_ = getStrProp(CL_DRIVER_VERSION); 1805 1806 String deviceVersion_ = getStrProp(CL_DEVICE_VERSION); 1807 parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_); 1808 1809 vendorName_ = getStrProp(CL_DEVICE_VENDOR); 1810 if (vendorName_ == "Advanced Micro Devices, Inc." || 1811 vendorName_ == "AMD") 1812 vendorID_ = VENDOR_AMD; 1813 else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0) 1814 vendorID_ = VENDOR_INTEL; 1815 else if (vendorName_ == "NVIDIA Corporation") 1816 vendorID_ = VENDOR_NVIDIA; 1817 else 1818 vendorID_ = UNKNOWN_VENDOR; 1819 } 1820 1821 template<typename _TpCL, typename _TpOut> 1822 _TpOut getProp(cl_device_info prop) const 1823 { 1824 _TpCL temp=_TpCL(); 1825 size_t sz = 0; 1826 1827 return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS && 1828 sz == sizeof(temp) ? _TpOut(temp) : _TpOut(); 1829 } 1830 1831 bool getBoolProp(cl_device_info prop) const 1832 { 1833 cl_bool temp = CL_FALSE; 1834 size_t sz = 0; 1835 1836 return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS && 1837 sz == sizeof(temp) ? 
temp != 0 : false; 1838 } 1839 1840 String getStrProp(cl_device_info prop) const 1841 { 1842 char buf[1024]; 1843 size_t sz=0; 1844 return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS && 1845 sz < sizeof(buf) ? String(buf) : String(); 1846 } 1847 1848 IMPLEMENT_REFCOUNTABLE(); 1849 cl_device_id handle; 1850 1851 String name_; 1852 String version_; 1853 int doubleFPConfig_; 1854 bool hostUnifiedMemory_; 1855 int maxComputeUnits_; 1856 size_t maxWorkGroupSize_; 1857 int type_; 1858 int deviceVersionMajor_; 1859 int deviceVersionMinor_; 1860 String driverVersion_; 1861 String vendorName_; 1862 int vendorID_; 1863 }; 1864 1865 1866 Device::Device() 1867 { 1868 p = 0; 1869 } 1870 1871 Device::Device(void* d) 1872 { 1873 p = 0; 1874 set(d); 1875 } 1876 1877 Device::Device(const Device& d) 1878 { 1879 p = d.p; 1880 if(p) 1881 p->addref(); 1882 } 1883 1884 Device& Device::operator = (const Device& d) 1885 { 1886 Impl* newp = (Impl*)d.p; 1887 if(newp) 1888 newp->addref(); 1889 if(p) 1890 p->release(); 1891 p = newp; 1892 return *this; 1893 } 1894 1895 Device::~Device() 1896 { 1897 if(p) 1898 p->release(); 1899 } 1900 1901 void Device::set(void* d) 1902 { 1903 if(p) 1904 p->release(); 1905 p = new Impl(d); 1906 } 1907 1908 void* Device::ptr() const 1909 { 1910 return p ? p->handle : 0; 1911 } 1912 1913 String Device::name() const 1914 { return p ? p->name_ : String(); } 1915 1916 String Device::extensions() const 1917 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); } 1918 1919 String Device::version() const 1920 { return p ? p->version_ : String(); } 1921 1922 String Device::vendorName() const 1923 { return p ? p->vendorName_ : String(); } 1924 1925 int Device::vendorID() const 1926 { return p ? p->vendorID_ : 0; } 1927 1928 String Device::OpenCL_C_Version() const 1929 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); } 1930 1931 String Device::OpenCLVersion() const 1932 { return p ? 
p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); } 1933 1934 int Device::deviceVersionMajor() const 1935 { return p ? p->deviceVersionMajor_ : 0; } 1936 1937 int Device::deviceVersionMinor() const 1938 { return p ? p->deviceVersionMinor_ : 0; } 1939 1940 String Device::driverVersion() const 1941 { return p ? p->driverVersion_ : String(); } 1942 1943 int Device::type() const 1944 { return p ? p->type_ : 0; } 1945 1946 int Device::addressBits() const 1947 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; } 1948 1949 bool Device::available() const 1950 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; } 1951 1952 bool Device::compilerAvailable() const 1953 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; } 1954 1955 bool Device::linkerAvailable() const 1956 #ifdef CL_VERSION_1_2 1957 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; } 1958 #else 1959 { CV_REQUIRE_OPENCL_1_2_ERROR; } 1960 #endif 1961 1962 int Device::doubleFPConfig() const 1963 { return p ? p->doubleFPConfig_ : 0; } 1964 1965 int Device::singleFPConfig() const 1966 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; } 1967 1968 int Device::halfFPConfig() const 1969 #ifdef CL_VERSION_1_2 1970 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; } 1971 #else 1972 { CV_REQUIRE_OPENCL_1_2_ERROR; } 1973 #endif 1974 1975 bool Device::endianLittle() const 1976 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; } 1977 1978 bool Device::errorCorrectionSupport() const 1979 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; } 1980 1981 int Device::executionCapabilities() const 1982 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; } 1983 1984 size_t Device::globalMemCacheSize() const 1985 { return p ? 
p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; } 1986 1987 int Device::globalMemCacheType() const 1988 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; } 1989 1990 int Device::globalMemCacheLineSize() const 1991 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; } 1992 1993 size_t Device::globalMemSize() const 1994 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; } 1995 1996 size_t Device::localMemSize() const 1997 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; } 1998 1999 int Device::localMemType() const 2000 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; } 2001 2002 bool Device::hostUnifiedMemory() const 2003 { return p ? p->hostUnifiedMemory_ : false; } 2004 2005 bool Device::imageSupport() const 2006 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; } 2007 2008 bool Device::imageFromBufferSupport() const 2009 { 2010 bool ret = false; 2011 if (p) 2012 { 2013 size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer"); 2014 if (pos != String::npos) 2015 { 2016 ret = true; 2017 } 2018 } 2019 return ret; 2020 } 2021 2022 uint Device::imagePitchAlignment() const 2023 { 2024 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT 2025 return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0; 2026 #else 2027 return 0; 2028 #endif 2029 } 2030 2031 uint Device::imageBaseAddressAlignment() const 2032 { 2033 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 2034 return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0; 2035 #else 2036 return 0; 2037 #endif 2038 } 2039 2040 size_t Device::image2DMaxWidth() const 2041 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; } 2042 2043 size_t Device::image2DMaxHeight() const 2044 { return p ? 
p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; } 2045 2046 size_t Device::image3DMaxWidth() const 2047 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; } 2048 2049 size_t Device::image3DMaxHeight() const 2050 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; } 2051 2052 size_t Device::image3DMaxDepth() const 2053 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; } 2054 2055 size_t Device::imageMaxBufferSize() const 2056 #ifdef CL_VERSION_1_2 2057 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; } 2058 #else 2059 { CV_REQUIRE_OPENCL_1_2_ERROR; } 2060 #endif 2061 2062 size_t Device::imageMaxArraySize() const 2063 #ifdef CL_VERSION_1_2 2064 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; } 2065 #else 2066 { CV_REQUIRE_OPENCL_1_2_ERROR; } 2067 #endif 2068 2069 int Device::maxClockFrequency() const 2070 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; } 2071 2072 int Device::maxComputeUnits() const 2073 { return p ? p->maxComputeUnits_ : 0; } 2074 2075 int Device::maxConstantArgs() const 2076 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; } 2077 2078 size_t Device::maxConstantBufferSize() const 2079 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; } 2080 2081 size_t Device::maxMemAllocSize() const 2082 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; } 2083 2084 size_t Device::maxParameterSize() const 2085 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; } 2086 2087 int Device::maxReadImageArgs() const 2088 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; } 2089 2090 int Device::maxWriteImageArgs() const 2091 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; } 2092 2093 int Device::maxSamplers() const 2094 { return p ? 
p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; } 2095 2096 size_t Device::maxWorkGroupSize() const 2097 { return p ? p->maxWorkGroupSize_ : 0; } 2098 2099 int Device::maxWorkItemDims() const 2100 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; } 2101 2102 void Device::maxWorkItemSizes(size_t* sizes) const 2103 { 2104 if(p) 2105 { 2106 const int MAX_DIMS = 32; 2107 size_t retsz = 0; 2108 CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES, 2109 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS); 2110 } 2111 } 2112 2113 int Device::memBaseAddrAlign() const 2114 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; } 2115 2116 int Device::nativeVectorWidthChar() const 2117 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; } 2118 2119 int Device::nativeVectorWidthShort() const 2120 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; } 2121 2122 int Device::nativeVectorWidthInt() const 2123 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; } 2124 2125 int Device::nativeVectorWidthLong() const 2126 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; } 2127 2128 int Device::nativeVectorWidthFloat() const 2129 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; } 2130 2131 int Device::nativeVectorWidthDouble() const 2132 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; } 2133 2134 int Device::nativeVectorWidthHalf() const 2135 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; } 2136 2137 int Device::preferredVectorWidthChar() const 2138 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; } 2139 2140 int Device::preferredVectorWidthShort() const 2141 { return p ? 
p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; } 2142 2143 int Device::preferredVectorWidthInt() const 2144 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; } 2145 2146 int Device::preferredVectorWidthLong() const 2147 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; } 2148 2149 int Device::preferredVectorWidthFloat() const 2150 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; } 2151 2152 int Device::preferredVectorWidthDouble() const 2153 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; } 2154 2155 int Device::preferredVectorWidthHalf() const 2156 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; } 2157 2158 size_t Device::printfBufferSize() const 2159 #ifdef CL_VERSION_1_2 2160 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; } 2161 #else 2162 { CV_REQUIRE_OPENCL_1_2_ERROR; } 2163 #endif 2164 2165 2166 size_t Device::profilingTimerResolution() const 2167 { return p ? 
p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }

// Returns the device selected for the current thread: the TLS 'device' index
// applied to the default context's device list.
const Device& Device::getDefault()
{
    const Context& ctx = Context::getDefault();
    int idx = getCoreTlsData().get()->device;
    const Device& device = ctx.device(idx);
    return device;
}

////////////////////////////////////// Context ///////////////////////////////////////////////////

// Queries a string property through an OpenCL clGet*Info-style getter:
// first asks for the required size, then fetches the data into 'param'.
// Returns the OpenCL status code (CL_SUCCESS on success).
template <typename Functor, typename ObjectType>
inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
{
    ::size_t required;
    cl_int err = f(obj, name, 0, NULL, &required);
    if (err != CL_SUCCESS)
        return err;

    param.clear();
    if (required > 0)
    {
        AutoBuffer<char> buf(required + 1);
        char* ptr = (char*)buf; // cleanup is not needed
        err = f(obj, name, required, ptr, NULL);
        if (err != CL_SUCCESS)
            return err;
        param = ptr;
    }

    return CL_SUCCESS;
}

// Splits 's' by 'delim' into 'elems' (cleared first). Empty input yields no
// elements. Because the loop tests eof() before each getline, a trailing
// delimiter produces a final empty element.
static void split(const std::string &s, char delim, std::vector<std::string> &elems)
{
    elems.clear();
    if (s.size() == 0)
        return;
    std::istringstream ss(s);
    std::string item;
    while (!ss.eof())
    {
        std::getline(ss, item, delim);
        elems.push_back(item);
    }
}

// Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
// Sample: AMD:GPU:
// Sample: AMD:GPU:Tahiti
// Sample: :GPU|CPU: = '' = ':' = '::'
// Returns false (with a message on stderr) when the string has more than
// three ':'-separated parts; absent parts leave the output args untouched.
static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
    std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
{
    std::vector<std::string> parts;
    split(configurationStr, ':', parts);
    if (parts.size() > 3)
    {
        std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
        return false;
    }
    if (parts.size() > 2)
        deviceNameOrID = parts[2];
    if (parts.size() > 1)
    {
2233 split(parts[1], '|', deviceTypes); 2234 } 2235 if (parts.size() > 0) 2236 { 2237 platform = parts[0]; 2238 } 2239 return true; 2240 } 2241 2242 #ifdef WINRT 2243 static cl_device_id selectOpenCLDevice() 2244 { 2245 return NULL; 2246 } 2247 #else 2248 static cl_device_id selectOpenCLDevice() 2249 { 2250 std::string platform, deviceName; 2251 std::vector<std::string> deviceTypes; 2252 2253 const char* configuration = getenv("OPENCV_OPENCL_DEVICE"); 2254 if (configuration && 2255 (strcmp(configuration, "disabled") == 0 || 2256 !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName) 2257 )) 2258 return NULL; 2259 2260 bool isID = false; 2261 int deviceID = -1; 2262 if (deviceName.length() == 1) 2263 // We limit ID range to 0..9, because we want to write: 2264 // - '2500' to mean i5-2500 2265 // - '8350' to mean AMD FX-8350 2266 // - '650' to mean GeForce 650 2267 // To extend ID range change condition to '> 0' 2268 { 2269 isID = true; 2270 for (size_t i = 0; i < deviceName.length(); i++) 2271 { 2272 if (!isdigit(deviceName[i])) 2273 { 2274 isID = false; 2275 break; 2276 } 2277 } 2278 if (isID) 2279 { 2280 deviceID = atoi(deviceName.c_str()); 2281 if (deviceID < 0) 2282 return NULL; 2283 } 2284 } 2285 2286 std::vector<cl_platform_id> platforms; 2287 { 2288 cl_uint numPlatforms = 0; 2289 CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS); 2290 2291 if (numPlatforms == 0) 2292 return NULL; 2293 platforms.resize((size_t)numPlatforms); 2294 CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS); 2295 platforms.resize(numPlatforms); 2296 } 2297 2298 int selectedPlatform = -1; 2299 if (platform.length() > 0) 2300 { 2301 for (size_t i = 0; i < platforms.size(); i++) 2302 { 2303 std::string name; 2304 CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS); 2305 if (name.find(platform) != std::string::npos) 2306 { 2307 
selectedPlatform = (int)i; 2308 break; 2309 } 2310 } 2311 if (selectedPlatform == -1) 2312 { 2313 std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl; 2314 goto not_found; 2315 } 2316 } 2317 if (deviceTypes.size() == 0) 2318 { 2319 if (!isID) 2320 { 2321 deviceTypes.push_back("GPU"); 2322 if (configuration) 2323 deviceTypes.push_back("CPU"); 2324 } 2325 else 2326 deviceTypes.push_back("ALL"); 2327 } 2328 for (size_t t = 0; t < deviceTypes.size(); t++) 2329 { 2330 int deviceType = 0; 2331 std::string tempStrDeviceType = deviceTypes[t]; 2332 std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower ); 2333 2334 if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu") 2335 deviceType = Device::TYPE_GPU; 2336 else if (tempStrDeviceType == "cpu") 2337 deviceType = Device::TYPE_CPU; 2338 else if (tempStrDeviceType == "accelerator") 2339 deviceType = Device::TYPE_ACCELERATOR; 2340 else if (tempStrDeviceType == "all") 2341 deviceType = Device::TYPE_ALL; 2342 else 2343 { 2344 std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl; 2345 goto not_found; 2346 } 2347 2348 std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup 2349 for (int i = selectedPlatform >= 0 ? selectedPlatform : 0; 2350 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size()); 2351 i++) 2352 { 2353 cl_uint count = 0; 2354 cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count); 2355 CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND); 2356 if (count == 0) 2357 continue; 2358 size_t base = devices.size(); 2359 devices.resize(base + count); 2360 status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count); 2361 CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND); 2362 } 2363 2364 for (size_t i = (isID ? 
deviceID : 0); 2365 (isID ? (i == (size_t)deviceID) : true) && (i < devices.size()); 2366 i++) 2367 { 2368 std::string name; 2369 CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS); 2370 cl_bool useGPU = true; 2371 if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu") 2372 { 2373 cl_bool isIGPU = CL_FALSE; 2374 clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL); 2375 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU; 2376 } 2377 if ( (isID || name.find(deviceName) != std::string::npos) && useGPU) 2378 { 2379 // TODO check for OpenCL 1.1 2380 return devices[i]; 2381 } 2382 } 2383 } 2384 2385 not_found: 2386 if (!configuration) 2387 return NULL; // suppress messages on stderr 2388 2389 std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl 2390 << " Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl 2391 << " Device types: "; 2392 for (size_t t = 0; t < deviceTypes.size(); t++) 2393 std::cerr << deviceTypes[t] << " "; 2394 2395 std::cerr << std::endl << " Device name: " << (deviceName.length() == 0 ? 
"any" : deviceName) << std::endl; 2396 return NULL; 2397 } 2398 #endif 2399 2400 #ifdef HAVE_OPENCL_SVM 2401 namespace svm { 2402 2403 enum AllocatorFlags { // don't use first 16 bits 2404 OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap 2405 OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc 2406 OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access 2407 OPENCL_SVM_BUFFER_MASK = 3 << 16, 2408 OPENCL_SVM_BUFFER_MAP = 4 << 16 2409 }; 2410 2411 static bool checkForceSVMUmatUsage() 2412 { 2413 static bool initialized = false; 2414 static bool force = false; 2415 if (!initialized) 2416 { 2417 force = getBoolParameter("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false); 2418 initialized = true; 2419 } 2420 return force; 2421 } 2422 static bool checkDisableSVMUMatUsage() 2423 { 2424 static bool initialized = false; 2425 static bool force = false; 2426 if (!initialized) 2427 { 2428 force = getBoolParameter("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false); 2429 initialized = true; 2430 } 2431 return force; 2432 } 2433 static bool checkDisableSVM() 2434 { 2435 static bool initialized = false; 2436 static bool force = false; 2437 if (!initialized) 2438 { 2439 force = getBoolParameter("OPENCV_OPENCL_SVM_DISABLE", false); 2440 initialized = true; 2441 } 2442 return force; 2443 } 2444 // see SVMCapabilities 2445 static unsigned int getSVMCapabilitiesMask() 2446 { 2447 static bool initialized = false; 2448 static unsigned int mask = 0; 2449 if (!initialized) 2450 { 2451 const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK"); 2452 if (envValue == NULL) 2453 { 2454 return ~0U; // all bits 1 2455 } 2456 mask = atoi(envValue); 2457 initialized = true; 2458 } 2459 return mask; 2460 } 2461 } // namespace 2462 #endif 2463 2464 struct Context::Impl 2465 { 2466 static Context::Impl* get(Context& context) { return context.p; } 2467 2468 void __init() 2469 { 2470 refcount = 1; 2471 handle = 0; 2472 #ifdef HAVE_OPENCL_SVM 2473 svmInitialized = false; 2474 
#endif
    }

    Impl()
    {
        __init();
    }

    // Builds the default context: selects one device via selectOpenCLDevice()
    // (honoring OPENCV_OPENCL_DEVICE) and creates a 1-device OpenCL context.
    // Leaves 'handle' NULL when no device is found or creation fails.
    void setDefault()
    {
        CV_Assert(handle == NULL);

        cl_device_id d = selectOpenCLDevice();

        if (d == NULL)
            return;

        cl_platform_id pl = NULL;
        CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);

        cl_context_properties prop[] =
        {
            CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
            0
        };

        // !!! in the current implementation force the number of devices to 1 !!!
        cl_uint nd = 1;
        cl_int status;

        handle = clCreateContext(prop, nd, &d, 0, 0, &status);

        bool ok = handle != 0 && status == CL_SUCCESS;
        if( ok )
        {
            devices.resize(nd);
            devices[0].set(d);
        }
        else
            handle = NULL;
    }

    // Builds a context from the default platform for the given device type.
    // The low 4 bits of dtype0 are the OpenCL device-type mask; the full
    // value may also be TYPE_DGPU/TYPE_IGPU to filter on host-unified memory.
    Impl(int dtype0)
    {
        __init();

        cl_int retval = 0;
        cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
        cl_context_properties prop[] =
        {
            CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
            0
        };

        cl_uint i, nd0 = 0, nd = 0;
        int dtype = dtype0 & 15;
        CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);

        // One buffer, two halves: raw enumeration result + filtered list.
        AutoBuffer<void*> dlistbuf(nd0*2+1);
        cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
        cl_device_id* dlist_new = dlist + nd0;
        CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
        String name0;

        // Keep only available devices with a compiler; all accepted devices
        // must share the name of the first accepted one.
        for(i = 0; i < nd0; i++)
        {
            Device d(dlist[i]);
            if( !d.available() || !d.compilerAvailable() )
                continue;
            if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
                continue;
            if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
                continue;
            String name = d.name();
            if( nd != 0 && name != name0 )
                continue;
            name0 = name;
            dlist_new[nd++] = dlist[i];
        }

        if(nd == 0)
            return;

        // !!! in the current implementation force the number of devices to 1 !!!
        nd = 1;

        handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
        bool ok = handle != 0 && retval == CL_SUCCESS;
        if( ok )
        {
            devices.resize(nd);
            for( i = 0; i < nd; i++ )
                devices[i].set(dlist_new[i]);
        }
    }

    ~Impl()
    {
        if(handle)
        {
            clReleaseContext(handle);
            handle = NULL;
        }
        devices.clear();
    }

    // Returns a cached Program for (source, build flags); compiles and
    // caches on first request. Failed builds are not cached.
    Program getProg(const ProgramSource& src,
                    const String& buildflags, String& errmsg)
    {
        String prefix = Program::getPrefix(buildflags);
        HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
        phash_t::iterator it = phash.find(k);
        if( it != phash.end() )
            return it->second;
        //String filename = format("%08x%08x_%08x%08x.clb2",
        Program prog(src, buildflags, errmsg);
        if(prog.ptr())
            phash.insert(std::pair<HashKey,Program>(k, prog));
        return prog;
    }

    IMPLEMENT_REFCOUNTABLE();

    cl_context handle;
    std::vector<Device> devices;

    typedef ProgramSource::hash_t hash_t;

    // Program-cache key: (source hash, CRC of the build-flags prefix).
    struct HashKey
    {
        HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
        bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
        bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
        bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
        hash_t a, b;
    };
    typedef std::map<HashKey, Program> phash_t;
    phash_t phash;

#ifdef HAVE_OPENCL_SVM
    bool svmInitialized;
    bool svmAvailable;
    bool svmEnabled;
    svm::SVMCapabilities svmCapabilities;
    svm::SVMFunctions svmFunctions;

    // Probes the first device for Shared Virtual Memory support — OpenCL 2.0
    // first, then the AMD HSA extension — and fills the svm* state above.
    void svmInit()
    {
        CV_Assert(handle != NULL);
        const Device& device = devices[0];
        cl_device_svm_capabilities deviceCaps = 0;
        CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES ==
CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
        cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
        if (status != CL_SUCCESS)
        {
            CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
            goto noSVM;
        }
        CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
        CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
        // Map the OpenCL capability bits onto our SVMCapabilities flags and
        // apply the user-provided mask (OPENCV_OPENCL_SVM_CAPABILITIES_MASK).
        svmCapabilities.value_ =
            ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
            ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
            ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
            ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
        svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
        if (svmCapabilities.value_ == 0)
        {
            CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
            goto noSVM;
        }
        try
        {
            // Try OpenCL 2.0
            // Smoke test: allocate a small SVM buffer, map it, write to it,
            // unmap it. Any failure throws and falls through to the HSA path.
            CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
            void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
            if (!ptr)
            {
                CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
                CV_ErrorNoReturn(Error::StsBadArg, "clSVMAlloc returned NULL");
            }
            try
            {
                bool error = false;
                cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
                if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
                {
                    CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
                    CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMMap FAILED");
                }
                clFinish(q);
                try
                {
                    ((int*)ptr)[0] = 100;
                }
                catch (...)
                {
                    CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
                    error = true;
                }
                if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
                {
                    CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
                    CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
                }
                clFinish(q);
                if (error)
                {
                    CV_ErrorNoReturn(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
                }
            }
            catch (...)
            {
                // Free the test buffer before propagating to the HSA fallback.
                CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
                clSVMFree(handle, ptr);
                throw;
            }
            clSVMFree(handle, ptr);
            // Success: use the core OpenCL 2.0 entry points directly.
            svmFunctions.fn_clSVMAlloc = clSVMAlloc;
            svmFunctions.fn_clSVMFree = clSVMFree;
            svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
            //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
            //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
            svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
            svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
            svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
            svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
        }
        catch (...)
        {
            CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
            try
            {
                // Try HSA extension
                // Fallback: resolve the AMD cl_amd_svm extension entry points.
                String extensions = device.extensions();
                if (extensions.find("cl_amd_svm") == String::npos)
                {
                    CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
                    goto noSVM;
                }
                cl_platform_id p = NULL;
                status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL);
                CV_Assert(status == CL_SUCCESS);
                svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
                svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
                svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
                //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
                //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
                svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
                svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
                svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
                svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
                CV_Assert(svmFunctions.isValid());
            }
            catch (...)
{
                CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
                goto noSVM;
            }
        }

        svmAvailable = true;
        svmEnabled = !svm::checkDisableSVM();
        svmInitialized = true;
        CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
        return;
    noSVM:
        // No usable SVM: record the negative result so svmInit() is not retried.
        CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
        svmAvailable = false;
        svmEnabled = false;
        svmCapabilities.value_ = 0;
        svmInitialized = true;
        svmFunctions.fn_clSVMAlloc = NULL;
        return;
    }
#endif
};


// Creates an empty Context (no OpenCL context attached).
Context::Context()
{
    p = 0;
}

// Creates a Context over devices of the given type (see Device::TYPE_*).
Context::Context(int dtype)
{
    p = 0;
    create(dtype);
}

// Creates the default-device context; returns false when OpenCL is
// unavailable or context creation fails (p stays 0 in that case).
bool Context::create()
{
    if( !haveOpenCL() )
        return false;
    if(p)
        p->release();
    p = new Impl();
    if(!p->handle)
    {
        delete p;
        p = 0;
    }
    return p != 0;
}

// Same as create(), but filtered by device type.
bool Context::create(int dtype0)
{
    if( !haveOpenCL() )
        return false;
    if(p)
        p->release();
    p = new Impl(dtype0);
    if(!p->handle)
    {
        delete p;
        p = 0;
    }
    return p != 0;
}

Context::~Context()
{
    if (p)
    {
        p->release();
        p = NULL;
    }
}

// Shallow, ref-counted copy semantics (shared Impl).
Context::Context(const Context& c)
{
    p = (Impl*)c.p;
    if(p)
        p->addref();
}

Context& Context::operator = (const Context& c)
{
    // addref before release makes self-assignment safe.
    Impl* newp = (Impl*)c.p;
    if(newp)
        newp->addref();
    if(p)
        p->release();
    p = newp;
    return *this;
}

void* Context::ptr() const
{
    return p == NULL ? NULL : p->handle;
}

size_t Context::ndevices() const
{
    return p ? p->devices.size() : 0;
}

const Device& Context::device(size_t idx) const
{
    // Out-of-range access yields a shared empty Device instead of throwing.
    static Device dummy;
    return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
}

// Returns the process-wide default Context, lazily initialized on first use.
Context& Context::getDefault(bool initialize)
{
    static Context* ctx = new Context();
    if(!ctx->p && haveOpenCL())
    {
        if (!ctx->p)
            ctx->p = new Impl();
        if (initialize)
        {
            // do not create new Context right away.
            // First, try to retrieve existing context of the same type.
            // In its turn, Platform::getContext() may call Context::create()
            // if there is no such context.
            if (ctx->p->handle == NULL)
                ctx->p->setDefault();
        }
    }

    return *ctx;
}

// Forwards to Impl::getProg (program cache); empty Program when detached.
Program Context::getProg(const ProgramSource& prog,
                         const String& buildopts, String& errmsg)
{
    return p ? p->getProg(prog, buildopts, errmsg) : Program();
}



#ifdef HAVE_OPENCL_SVM
bool Context::useSVM() const
{
    Context::Impl* i = p;
    CV_Assert(i);
    if (!i->svmInitialized)
        i->svmInit();
    return i->svmEnabled;
}
void Context::setUseSVM(bool enabled)
{
    Context::Impl* i = p;
    CV_Assert(i);
    if (!i->svmInitialized)
        i->svmInit();
    if (enabled && !i->svmAvailable)
    {
        CV_ErrorNoReturn(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
    }
    i->svmEnabled = enabled;
}
#else
bool Context::useSVM() const { return false; }
void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
#endif

#ifdef HAVE_OPENCL_SVM
namespace svm {

const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
{
    Context::Impl* i = context.p;
    CV_Assert(i);
    if (!i->svmInitialized)
        i->svmInit();
    return i->svmCapabilities;
}

CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
{
    Context::Impl* i = context.p;
    CV_Assert(i);
    CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
    return &i->svmFunctions;
}

// Decides whether a UMat with the given usage flags should be backed by SVM;
// the force/disable environment switches take precedence over the flags.
CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
{
    if (checkForceSVMUmatUsage())
        return true;
    if (checkDisableSVMUMatUsage())
        return false;
    if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
        return true;
    return false; // don't use SVM by default
}

} // namespace cv::ocl::svm
#endif // HAVE_OPENCL_SVM



// Rebinds 'ctx' (and the default Platform) to externally created OpenCL
// handles; the previously held context handle, if any, is released.
// NOTE(review): '_context' is adopted without an extra clRetainContext —
// presumably ownership transfers to 'ctx'; confirm against callers.
void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
{
    cl_context context = (cl_context)_context;
    cl_device_id device = (cl_device_id)_device;

    // cleanup old context
    Context::Impl * impl = ctx.p;
    if (impl->handle)
    {
        CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
    }
    impl->devices.clear();

    impl->handle = context;
    impl->devices.resize(1);
    impl->devices[0].set(device);

    Platform& p = Platform::getDefault();
    Platform::Impl* pImpl = p.p;
    pImpl->handle = (cl_platform_id)platform;
}

/////////////////////////////////////////// Queue /////////////////////////////////////////////

struct Queue::Impl
{
    // Creates a command queue on (c, d); falls back to the default context,
    // and to its first device, when either handle is empty.
    Impl(const Context& c, const Device& d)
    {
        refcount = 1;
        const Context* pc = &c;
        cl_context ch = (cl_context)pc->ptr();
        if( !ch )
        {
            pc = &Context::getDefault();
            ch = (cl_context)pc->ptr();
        }
        cl_device_id dh = (cl_device_id)d.ptr();
        if( !dh )
            dh = (cl_device_id)pc->device(0).ptr();
        cl_int retval = 0;
        handle = clCreateCommandQueue(ch, dh, 0, &retval);
        CV_OclDbgAssert(retval == CL_SUCCESS);
    }

    ~Impl()
    {
#ifdef _WIN32
        // Skip OpenCL calls during process termination on Windows.
        if (!cv::__termination)
#endif
        {
            if(handle)
            {
                clFinish(handle);
                clReleaseCommandQueue(handle);
                handle = NULL;
            }
        }
    }

    IMPLEMENT_REFCOUNTABLE();

    cl_command_queue handle;
};

Queue::Queue()
{
    p = 0;
}

Queue::Queue(const Context& c, const Device& d)
{
    p = 0;
    create(c, d);
}

// Shallow, ref-counted copy semantics (shared Impl).
Queue::Queue(const Queue& q)
{
    p = q.p;
    if(p)
        p->addref();
}

Queue& Queue::operator = (const Queue& q)
{
    // addref before release makes self-assignment safe.
    Impl* newp = (Impl*)q.p;
    if(newp)
        newp->addref();
    if(p)
        p->release();
    p = newp;
    return *this;
}

Queue::~Queue()
{
    if(p)
        p->release();
}

bool Queue::create(const Context& c, const Device& d)
{
    if(p)
        p->release();
    p = new Impl(c, d);
    return p->handle != 0;
}

void Queue::finish()
{
    if(p && p->handle)
    {
        CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
    }
}

void* Queue::ptr() const
{
    return p ? p->handle : 0;
}

// Returns the per-thread default queue (TLS), created on first use.
Queue& Queue::getDefault()
{
    Queue& q = getCoreTlsData().get()->oclQueue;
    if( !q.p && haveOpenCL() )
        q.create(Context::getDefault());
    return q;
}

// Unwraps 'q', falling back to the default queue when it is empty.
static cl_command_queue getQueue(const Queue& q)
{
    cl_command_queue qq = (cl_command_queue)q.ptr();
    if(!qq)
        qq = (cl_command_queue)Queue::getDefault().ptr();
    return qq;
}

/////////////////////////////////////////// KernelArg /////////////////////////////////////////////

KernelArg::KernelArg()
    : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
{
}

KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
    : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
{
}

// Wraps a continuous Mat's data as a by-value (CONSTANT) kernel argument.
KernelArg KernelArg::Constant(const Mat& m)
{
    CV_Assert(m.isContinuous());
    return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
}

/////////////////////////////////////////// Kernel
///////////////////////////////////////////// 3084 3085 struct Kernel::Impl 3086 { 3087 Impl(const char* kname, const Program& prog) : 3088 refcount(1), e(0), nu(0) 3089 { 3090 cl_program ph = (cl_program)prog.ptr(); 3091 cl_int retval = 0; 3092 handle = ph != 0 ? 3093 clCreateKernel(ph, kname, &retval) : 0; 3094 CV_OclDbgAssert(retval == CL_SUCCESS); 3095 for( int i = 0; i < MAX_ARRS; i++ ) 3096 u[i] = 0; 3097 haveTempDstUMats = false; 3098 } 3099 3100 void cleanupUMats() 3101 { 3102 for( int i = 0; i < MAX_ARRS; i++ ) 3103 if( u[i] ) 3104 { 3105 if( CV_XADD(&u[i]->urefcount, -1) == 1 ) 3106 u[i]->currAllocator->deallocate(u[i]); 3107 u[i] = 0; 3108 } 3109 nu = 0; 3110 haveTempDstUMats = false; 3111 } 3112 3113 void addUMat(const UMat& m, bool dst) 3114 { 3115 CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0); 3116 u[nu] = m.u; 3117 CV_XADD(&m.u->urefcount, 1); 3118 nu++; 3119 if(dst && m.u->tempUMat()) 3120 haveTempDstUMats = true; 3121 } 3122 3123 void addImage(const Image2D& image) 3124 { 3125 images.push_back(image); 3126 } 3127 3128 void finit() 3129 { 3130 cleanupUMats(); 3131 images.clear(); 3132 if(e) { clReleaseEvent(e); e = 0; } 3133 release(); 3134 } 3135 3136 ~Impl() 3137 { 3138 if(handle) 3139 clReleaseKernel(handle); 3140 } 3141 3142 IMPLEMENT_REFCOUNTABLE(); 3143 3144 cl_kernel handle; 3145 cl_event e; 3146 enum { MAX_ARRS = 16 }; 3147 UMatData* u[MAX_ARRS]; 3148 int nu; 3149 std::list<Image2D> images; 3150 bool haveTempDstUMats; 3151 }; 3152 3153 }} 3154 3155 extern "C" 3156 { 3157 static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p) 3158 { 3159 ((cv::ocl::Kernel::Impl*)p)->finit(); 3160 } 3161 3162 } 3163 3164 namespace cv { namespace ocl { 3165 3166 Kernel::Kernel() 3167 { 3168 p = 0; 3169 } 3170 3171 Kernel::Kernel(const char* kname, const Program& prog) 3172 { 3173 p = 0; 3174 create(kname, prog); 3175 } 3176 3177 Kernel::Kernel(const char* kname, const ProgramSource& src, 3178 const String& buildopts, String* errmsg) 3179 
{ 3180 p = 0; 3181 create(kname, src, buildopts, errmsg); 3182 } 3183 3184 Kernel::Kernel(const Kernel& k) 3185 { 3186 p = k.p; 3187 if(p) 3188 p->addref(); 3189 } 3190 3191 Kernel& Kernel::operator = (const Kernel& k) 3192 { 3193 Impl* newp = (Impl*)k.p; 3194 if(newp) 3195 newp->addref(); 3196 if(p) 3197 p->release(); 3198 p = newp; 3199 return *this; 3200 } 3201 3202 Kernel::~Kernel() 3203 { 3204 if(p) 3205 p->release(); 3206 } 3207 3208 bool Kernel::create(const char* kname, const Program& prog) 3209 { 3210 if(p) 3211 p->release(); 3212 p = new Impl(kname, prog); 3213 if(p->handle == 0) 3214 { 3215 p->release(); 3216 p = 0; 3217 } 3218 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails 3219 CV_Assert(p); 3220 #endif 3221 return p != 0; 3222 } 3223 3224 bool Kernel::create(const char* kname, const ProgramSource& src, 3225 const String& buildopts, String* errmsg) 3226 { 3227 if(p) 3228 { 3229 p->release(); 3230 p = 0; 3231 } 3232 String tempmsg; 3233 if( !errmsg ) errmsg = &tempmsg; 3234 const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg); 3235 return create(kname, prog); 3236 } 3237 3238 void* Kernel::ptr() const 3239 { 3240 return p ? 
p->handle : 0; 3241 } 3242 3243 bool Kernel::empty() const 3244 { 3245 return ptr() == 0; 3246 } 3247 3248 int Kernel::set(int i, const void* value, size_t sz) 3249 { 3250 if (!p || !p->handle) 3251 return -1; 3252 if (i < 0) 3253 return i; 3254 if( i == 0 ) 3255 p->cleanupUMats(); 3256 3257 cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value); 3258 CV_OclDbgAssert(retval == CL_SUCCESS); 3259 if (retval != CL_SUCCESS) 3260 return -1; 3261 return i+1; 3262 } 3263 3264 int Kernel::set(int i, const Image2D& image2D) 3265 { 3266 p->addImage(image2D); 3267 cl_mem h = (cl_mem)image2D.ptr(); 3268 return set(i, &h, sizeof(h)); 3269 } 3270 3271 int Kernel::set(int i, const UMat& m) 3272 { 3273 return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0)); 3274 } 3275 3276 int Kernel::set(int i, const KernelArg& arg) 3277 { 3278 if( !p || !p->handle ) 3279 return -1; 3280 if (i < 0) 3281 return i; 3282 if( i == 0 ) 3283 p->cleanupUMats(); 3284 if( arg.m ) 3285 { 3286 int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) + 3287 ((arg.flags & KernelArg::WRITE_ONLY) ? 
                          ACCESS_WRITE : 0);
        bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
        // materialize a device-side handle for the UMat with the requested access
        cl_mem h = (cl_mem)arg.m->handle(accessFlags);

        if (!h)
        {
            // could not get a device buffer: invalidate the whole kernel so
            // a later run() fails instead of launching with a missing argument
            p->release();
            p = 0;
            return -1;
        }

#ifdef HAVE_OPENCL_SVM
        if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
        {
            // SVM-backed buffer: the handle is a host/device-shared pointer and
            // must be bound via clSetKernelArgSVMPointer instead of clSetKernelArg
            const Context& ctx = Context::getDefault();
            const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
            uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
            CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
#if 1 // TODO
            cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
#else
            cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
#endif
            CV_Assert(status == CL_SUCCESS);
        }
        else
#endif
        {
            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
        }

        if (ptronly)
        {
            // PTR_ONLY: the kernel takes just the buffer pointer, no geometry
            i++;
        }
        else if( arg.m->dims <= 2 )
        {
            // 2D layout: pass step and offset after the buffer itself
            UMat2D u2d(*arg.m);
            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
            i += 3;

            if( !(arg.flags & KernelArg::NO_SIZE) )
            {
                // cols is scaled by wscale/iwscale (e.g. for vectorized kernels)
                int cols = u2d.cols*arg.wscale/arg.iwscale;
                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
                CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
                i += 2;
            }
        }
        else
        {
            // 3D layout: pass slicestep, step and offset after the buffer
            UMat3D u3d(*arg.m);
            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
            CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset),
&u3d.offset) == CL_SUCCESS); 3343 i += 4; 3344 if( !(arg.flags & KernelArg::NO_SIZE) ) 3345 { 3346 int cols = u3d.cols*arg.wscale/arg.iwscale; 3347 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS); 3348 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS); 3349 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS); 3350 i += 3; 3351 } 3352 } 3353 p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0); 3354 return i; 3355 } 3356 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS); 3357 return i+1; 3358 } 3359 3360 3361 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[], 3362 bool sync, const Queue& q) 3363 { 3364 if(!p || !p->handle || p->e != 0) 3365 return false; 3366 3367 cl_command_queue qq = getQueue(q); 3368 size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1}; 3369 size_t total = 1; 3370 CV_Assert(_globalsize != 0); 3371 for (int i = 0; i < dims; i++) 3372 { 3373 size_t val = _localsize ? _localsize[i] : 3374 dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1; 3375 CV_Assert( val > 0 ); 3376 total *= _globalsize[i]; 3377 globalsize[i] = ((_globalsize[i] + val - 1)/val)*val; 3378 } 3379 if( total == 0 ) 3380 return true; 3381 if( p->haveTempDstUMats ) 3382 sync = true; 3383 cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims, 3384 offset, globalsize, _localsize, 0, 0, 3385 sync ? 
0 : &p->e); 3386 #if CV_OPENCL_SHOW_RUN_ERRORS 3387 if (retval != CL_SUCCESS) 3388 { 3389 printf("OpenCL program returns error: %d\n", retval); 3390 fflush(stdout); 3391 } 3392 #endif 3393 if( sync || retval != CL_SUCCESS ) 3394 { 3395 CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS); 3396 p->cleanupUMats(); 3397 } 3398 else 3399 { 3400 p->addref(); 3401 CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS); 3402 } 3403 return retval == CL_SUCCESS; 3404 } 3405 3406 bool Kernel::runTask(bool sync, const Queue& q) 3407 { 3408 if(!p || !p->handle || p->e != 0) 3409 return false; 3410 3411 cl_command_queue qq = getQueue(q); 3412 cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e); 3413 if( sync || retval != CL_SUCCESS ) 3414 { 3415 CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS); 3416 p->cleanupUMats(); 3417 } 3418 else 3419 { 3420 p->addref(); 3421 CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS); 3422 } 3423 return retval == CL_SUCCESS; 3424 } 3425 3426 3427 size_t Kernel::workGroupSize() const 3428 { 3429 if(!p || !p->handle) 3430 return 0; 3431 size_t val = 0, retsz = 0; 3432 cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); 3433 return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE, 3434 sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0; 3435 } 3436 3437 size_t Kernel::preferedWorkGroupSizeMultiple() const 3438 { 3439 if(!p || !p->handle) 3440 return 0; 3441 size_t val = 0, retsz = 0; 3442 cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); 3443 return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, 3444 sizeof(val), &val, &retsz) == CL_SUCCESS ? 
val : 0; 3445 } 3446 3447 bool Kernel::compileWorkGroupSize(size_t wsz[]) const 3448 { 3449 if(!p || !p->handle || !wsz) 3450 return 0; 3451 size_t retsz = 0; 3452 cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); 3453 return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, 3454 sizeof(wsz[0])*3, wsz, &retsz) == CL_SUCCESS; 3455 } 3456 3457 size_t Kernel::localMemSize() const 3458 { 3459 if(!p || !p->handle) 3460 return 0; 3461 size_t retsz = 0; 3462 cl_ulong val = 0; 3463 cl_device_id dev = (cl_device_id)Device::getDefault().ptr(); 3464 return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE, 3465 sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0; 3466 } 3467 3468 /////////////////////////////////////////// Program ///////////////////////////////////////////// 3469 3470 struct Program::Impl 3471 { 3472 Impl(const ProgramSource& _src, 3473 const String& _buildflags, String& errmsg) 3474 { 3475 refcount = 1; 3476 const Context& ctx = Context::getDefault(); 3477 src = _src; 3478 buildflags = _buildflags; 3479 const String& srcstr = src.source(); 3480 const char* srcptr = srcstr.c_str(); 3481 size_t srclen = srcstr.size(); 3482 cl_int retval = 0; 3483 3484 handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval); 3485 if( handle && retval == CL_SUCCESS ) 3486 { 3487 int i, n = (int)ctx.ndevices(); 3488 AutoBuffer<void*> deviceListBuf(n+1); 3489 void** deviceList = deviceListBuf; 3490 for( i = 0; i < n; i++ ) 3491 deviceList[i] = ctx.device(i).ptr(); 3492 3493 Device device = Device::getDefault(); 3494 if (device.isAMD()) 3495 buildflags += " -D AMD_DEVICE"; 3496 else if (device.isIntel()) 3497 buildflags += " -D INTEL_DEVICE"; 3498 3499 retval = clBuildProgram(handle, n, 3500 (const cl_device_id*)deviceList, 3501 buildflags.c_str(), 0, 0); 3502 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 3503 if( retval != CL_SUCCESS ) 3504 #endif 3505 { 3506 size_t retsz = 0; 3507 cl_int 
buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], 3508 CL_PROGRAM_BUILD_LOG, 0, 0, &retsz); 3509 if (buildInfo_retval == CL_SUCCESS && retsz > 1) 3510 { 3511 AutoBuffer<char> bufbuf(retsz + 16); 3512 char* buf = bufbuf; 3513 buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0], 3514 CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz); 3515 if (buildInfo_retval == CL_SUCCESS) 3516 { 3517 // TODO It is useful to see kernel name & program file name also 3518 errmsg = String(buf); 3519 printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str()); 3520 fflush(stdout); 3521 } 3522 } 3523 if (retval != CL_SUCCESS && handle) 3524 { 3525 clReleaseProgram(handle); 3526 handle = NULL; 3527 } 3528 } 3529 } 3530 } 3531 3532 Impl(const String& _buf, const String& _buildflags) 3533 { 3534 refcount = 1; 3535 handle = 0; 3536 buildflags = _buildflags; 3537 if(_buf.empty()) 3538 return; 3539 String prefix0 = Program::getPrefix(buildflags); 3540 const Context& ctx = Context::getDefault(); 3541 const Device& dev = Device::getDefault(); 3542 const char* pos0 = _buf.c_str(); 3543 const char* pos1 = strchr(pos0, '\n'); 3544 if(!pos1) 3545 return; 3546 const char* pos2 = strchr(pos1+1, '\n'); 3547 if(!pos2) 3548 return; 3549 const char* pos3 = strchr(pos2+1, '\n'); 3550 if(!pos3) 3551 return; 3552 size_t prefixlen = (pos3 - pos0)+1; 3553 String prefix(pos0, prefixlen); 3554 if( prefix != prefix0 ) 3555 return; 3556 const uchar* bin = (uchar*)(pos3+1); 3557 void* devid = dev.ptr(); 3558 size_t codelen = _buf.length() - prefixlen; 3559 cl_int binstatus = 0, retval = 0; 3560 handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid, 3561 &codelen, &bin, &binstatus, &retval); 3562 CV_OclDbgAssert(retval == CL_SUCCESS); 3563 } 3564 3565 String store() 3566 { 3567 if(!handle) 3568 return String(); 3569 size_t progsz = 0, retsz = 0; 3570 String prefix = Program::getPrefix(buildflags); 3571 size_t prefixlen = 
prefix.length(); 3572 if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS) 3573 return String(); 3574 AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16); 3575 uchar* buf = bufbuf; 3576 memcpy(buf, prefix.c_str(), prefixlen); 3577 buf += prefixlen; 3578 if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS) 3579 return String(); 3580 buf[progsz] = (uchar)'\0'; 3581 return String((const char*)(uchar*)bufbuf, prefixlen + progsz); 3582 } 3583 3584 ~Impl() 3585 { 3586 if( handle ) 3587 { 3588 #ifdef _WIN32 3589 if (!cv::__termination) 3590 #endif 3591 { 3592 clReleaseProgram(handle); 3593 } 3594 handle = NULL; 3595 } 3596 } 3597 3598 IMPLEMENT_REFCOUNTABLE(); 3599 3600 ProgramSource src; 3601 String buildflags; 3602 cl_program handle; 3603 }; 3604 3605 3606 Program::Program() { p = 0; } 3607 3608 Program::Program(const ProgramSource& src, 3609 const String& buildflags, String& errmsg) 3610 { 3611 p = 0; 3612 create(src, buildflags, errmsg); 3613 } 3614 3615 Program::Program(const Program& prog) 3616 { 3617 p = prog.p; 3618 if(p) 3619 p->addref(); 3620 } 3621 3622 Program& Program::operator = (const Program& prog) 3623 { 3624 Impl* newp = (Impl*)prog.p; 3625 if(newp) 3626 newp->addref(); 3627 if(p) 3628 p->release(); 3629 p = newp; 3630 return *this; 3631 } 3632 3633 Program::~Program() 3634 { 3635 if(p) 3636 p->release(); 3637 } 3638 3639 bool Program::create(const ProgramSource& src, 3640 const String& buildflags, String& errmsg) 3641 { 3642 if(p) 3643 p->release(); 3644 p = new Impl(src, buildflags, errmsg); 3645 if(!p->handle) 3646 { 3647 p->release(); 3648 p = 0; 3649 } 3650 return p != 0; 3651 } 3652 3653 const ProgramSource& Program::source() const 3654 { 3655 static ProgramSource dummy; 3656 return p ? p->src : dummy; 3657 } 3658 3659 void* Program::ptr() const 3660 { 3661 return p ? 
p->handle : 0; 3662 } 3663 3664 bool Program::read(const String& bin, const String& buildflags) 3665 { 3666 if(p) 3667 p->release(); 3668 p = new Impl(bin, buildflags); 3669 return p->handle != 0; 3670 } 3671 3672 bool Program::write(String& bin) const 3673 { 3674 if(!p) 3675 return false; 3676 bin = p->store(); 3677 return !bin.empty(); 3678 } 3679 3680 String Program::getPrefix() const 3681 { 3682 if(!p) 3683 return String(); 3684 return getPrefix(p->buildflags); 3685 } 3686 3687 String Program::getPrefix(const String& buildflags) 3688 { 3689 const Context& ctx = Context::getDefault(); 3690 const Device& dev = ctx.device(0); 3691 return format("name=%s\ndriver=%s\nbuildflags=%s\n", 3692 dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str()); 3693 } 3694 3695 ///////////////////////////////////////// ProgramSource /////////////////////////////////////////////// 3696 3697 struct ProgramSource::Impl 3698 { 3699 Impl(const char* _src) 3700 { 3701 init(String(_src)); 3702 } 3703 Impl(const String& _src) 3704 { 3705 init(_src); 3706 } 3707 void init(const String& _src) 3708 { 3709 refcount = 1; 3710 src = _src; 3711 h = crc64((uchar*)src.c_str(), src.size()); 3712 } 3713 3714 IMPLEMENT_REFCOUNTABLE(); 3715 String src; 3716 ProgramSource::hash_t h; 3717 }; 3718 3719 3720 ProgramSource::ProgramSource() 3721 { 3722 p = 0; 3723 } 3724 3725 ProgramSource::ProgramSource(const char* prog) 3726 { 3727 p = new Impl(prog); 3728 } 3729 3730 ProgramSource::ProgramSource(const String& prog) 3731 { 3732 p = new Impl(prog); 3733 } 3734 3735 ProgramSource::~ProgramSource() 3736 { 3737 if(p) 3738 p->release(); 3739 } 3740 3741 ProgramSource::ProgramSource(const ProgramSource& prog) 3742 { 3743 p = prog.p; 3744 if(p) 3745 p->addref(); 3746 } 3747 3748 ProgramSource& ProgramSource::operator = (const ProgramSource& prog) 3749 { 3750 Impl* newp = (Impl*)prog.p; 3751 if(newp) 3752 newp->addref(); 3753 if(p) 3754 p->release(); 3755 p = newp; 3756 return *this; 3757 } 3758 3759 
const String& ProgramSource::source() const
{
    static String dummy;
    return p ? p->src : dummy;
}

ProgramSource::hash_t ProgramSource::hash() const
{
    return p ? p->h : 0;
}

//////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////

// Minimal pool interface: T is the buffer handle type (cl_mem or void* for SVM).
template<typename T>
class OpenCLBufferPool
{
protected:
    ~OpenCLBufferPool() { }
public:
    virtual T allocate(size_t size) = 0;
    virtual void release(T buffer) = 0;
};

// CRTP base for buffer pools: keeps a list of in-use entries and an LRU list
// of reserved (free-for-reuse) entries, bounded by maxReservedSize.
// Derived must provide _allocateBufferEntry()/_releaseBufferEntry().
template <typename Derived, typename BufferEntry, typename T>
class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
{
private:
    inline Derived& derived() { return *static_cast<Derived*>(this); }
protected:
    Mutex mutex_;

    size_t currentReservedSize;   // total capacity held in reservedEntries_
    size_t maxReservedSize;       // upper bound for currentReservedSize (0 = no pooling)

    std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
    std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries

    // synchronized (caller holds mutex_): remove the entry owning 'buffer'
    // from the in-use list; false if the buffer is unknown to this pool.
    bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
    {
        typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
        for (; i != allocatedEntries_.end(); ++i)
        {
            BufferEntry& e = *i;
            if (e.clBuffer_ == buffer)
            {
                entry = e;
                allocatedEntries_.erase(i);
                return true;
            }
        }
        return false;
    }

    // synchronized (caller holds mutex_): best-fit search in the reserved list
    // for an entry of at least 'size' that doesn't waste more than size/8;
    // on success the entry is moved to the in-use list.
    bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
    {
        if (reservedEntries_.empty())
            return false;
        typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
        typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
        BufferEntry result;
        size_t minDiff = (size_t)(-1);
        for (; i != reservedEntries_.end(); ++i)
        {
            BufferEntry& e = *i;
            if (e.capacity_ >= size)
            {
                size_t diff = e.capacity_ - size;
                if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
                {
                    minDiff = diff;
                    result_pos = i;
                    result = e;
                    if (diff == 0)
                        break;  // exact fit, stop searching
                }
            }
        }
        if (result_pos != reservedEntries_.end())
        {
            //CV_DbgAssert(result == *result_pos);
            reservedEntries_.erase(result_pos);
            entry = result;
            currentReservedSize -= entry.capacity_;
            allocatedEntries_.push_back(entry);
            return true;
        }
        return false;
    }

    // synchronized (caller holds mutex_): evict least-recently-used entries
    // (list tail) until the reserved total fits under maxReservedSize.
    void _checkSizeOfReservedEntries()
    {
        while (currentReservedSize > maxReservedSize)
        {
            CV_DbgAssert(!reservedEntries_.empty());
            const BufferEntry& entry = reservedEntries_.back();
            CV_DbgAssert(currentReservedSize >= entry.capacity_);
            currentReservedSize -= entry.capacity_;
            derived()._releaseBufferEntry(entry);
            reservedEntries_.pop_back();
        }
    }

    // Rounding granularity for requested sizes, growing with the request.
    inline size_t _allocationGranularity(size_t size)
    {
        // heuristic values
        if (size < 1024)
            return 16;
        else if (size < 64*1024)
            return 64;
        else if (size < 1024*1024)
            return 4096;
        else if (size < 16*1024*1024)
            return 64*1024;
        else
            return 1024*1024;
    }

public:
    OpenCLBufferPoolBaseImpl()
        : currentReservedSize(0),
          maxReservedSize(0)
    {
        // nothing
    }
    virtual ~OpenCLBufferPoolBaseImpl()
    {
        freeAllReservedBuffers();
        CV_Assert(reservedEntries_.empty());
    }
public:
    // Get a buffer of at least 'size' bytes, reusing a reserved entry when possible.
    virtual T allocate(size_t size)
    {
        AutoLock locker(mutex_);
        BufferEntry entry;
        if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
        {
            CV_DbgAssert(size <= entry.capacity_);
            LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
        }
        else
        {
            derived()._allocateBufferEntry(entry, size);
        }
        return entry.clBuffer_;
    }
    // Return a buffer to the pool: freed immediately when pooling is off or the
    // entry is too large (> maxReservedSize/8), otherwise kept for reuse (MRU first).
    virtual void release(T buffer)
    {
        AutoLock locker(mutex_);
        BufferEntry entry;
        CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
        if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
        {
            derived()._releaseBufferEntry(entry);
        }
        else
        {
            reservedEntries_.push_front(entry);
            currentReservedSize += entry.capacity_;
            _checkSizeOfReservedEntries();
        }
    }

    virtual size_t getReservedSize() const { return currentReservedSize; }
    virtual size_t getMaxReservedSize() const { return maxReservedSize; }
    // Change the pool limit; shrinking releases now-oversized reserved entries.
    virtual void setMaxReservedSize(size_t size)
    {
        AutoLock locker(mutex_);
        size_t oldMaxReservedSize = maxReservedSize;
        maxReservedSize = size;
        if (maxReservedSize < oldMaxReservedSize)
        {
            typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
            for (; i != reservedEntries_.end();)
            {
                const BufferEntry& entry = *i;
                if (entry.capacity_ > maxReservedSize / 8)
                {
                    CV_DbgAssert(currentReservedSize >= entry.capacity_);
                    currentReservedSize -= entry.capacity_;
                    derived()._releaseBufferEntry(entry);
                    i = reservedEntries_.erase(i);
                    continue;
                }
                ++i;
            }
            _checkSizeOfReservedEntries();
        }
    }
    virtual void freeAllReservedBuffers()
    {
        AutoLock locker(mutex_);
        typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
        for (; i != reservedEntries_.end(); ++i)
        {
            const BufferEntry& entry = *i;
            derived()._releaseBufferEntry(entry);
        }
        reservedEntries_.clear();
        currentReservedSize = 0;
    }
};

// Pool entry for regular OpenCL buffers.
struct CLBufferEntry
{
    cl_mem clBuffer_;
    size_t capacity_;
    CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
};

// Pool of cl_mem buffers; 'createFlags_' is OR-ed into CL_MEM_READ_WRITE
// at creation (e.g. CL_MEM_ALLOC_HOST_PTR for the host-pointer pool).
class OpenCLBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
{
public:
    typedef struct CLBufferEntry BufferEntry;
protected:
    int createFlags_;
public:
    OpenCLBufferPoolImpl(int createFlags = 0)
        : createFlags_(createFlags)
    {
    }

    // Create a fresh cl_mem of granularity-aligned capacity and register it as in-use.
    void _allocateBufferEntry(BufferEntry& entry, size_t size)
    {
        CV_DbgAssert(entry.clBuffer_ == NULL);
        entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
        Context& ctx = Context::getDefault();
        cl_int retval = CL_SUCCESS;
        entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
        CV_Assert(retval == CL_SUCCESS);
        CV_Assert(entry.clBuffer_ != NULL);
        if(retval == CL_SUCCESS)
        {
            CV_IMPL_ADD(CV_IMPL_OCL);
        }
        LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
                        (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
        allocatedEntries_.push_back(entry);
    }

    void _releaseBufferEntry(const BufferEntry& entry)
    {
        CV_Assert(entry.capacity_ != 0);
        CV_Assert(entry.clBuffer_ != NULL);
        LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
                        entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
        clReleaseMemObject(entry.clBuffer_);
    }
};

#ifdef HAVE_OPENCL_SVM
// Pool entry for SVM allocations (host-visible pointers, not cl_mem).
struct CLSVMBufferEntry
{
    void* clBuffer_;
    size_t capacity_;
    CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
};
class OpenCLSVMBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
{
public:
    typedef struct CLSVMBufferEntry BufferEntry;
public:
    OpenCLSVMBufferPoolImpl()
    {
    }

    // Create a fresh SVM allocation of granularity-aligned capacity.
    void _allocateBufferEntry(BufferEntry& entry, size_t size)
    {
        CV_DbgAssert(entry.clBuffer_ == NULL);
        entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));

        Context& ctx = Context::getDefault();
        const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
        bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
        cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
            (isFineGrainBuffer ?
             CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);

        const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
        CV_DbgAssert(svmFns->isValid());

        CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
        void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
        CV_Assert(buf);

        entry.clBuffer_ = buf;
        {
            CV_IMPL_ADD(CV_IMPL_OCL);
        }
        LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
                        (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
        allocatedEntries_.push_back(entry);
    }

    void _releaseBufferEntry(const BufferEntry& entry)
    {
        CV_Assert(entry.capacity_ != 0);
        CV_Assert(entry.clBuffer_ != NULL);
        LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
                        entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
        Context& ctx = Context::getDefault();
        const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
        CV_DbgAssert(svmFns->isValid());
        CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", entry.clBuffer_);
        svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
    }
};
#endif



#if defined _MSC_VER
#pragma warning(disable:4127) // conditional expression is constant
#endif
// Scoped view of 'ptr' guaranteed to satisfy 'alignment': if the original
// pointer is already aligned it is used directly; otherwise a temporary
// aligned buffer is used (filled from the origin when readAccess, and copied
// back on destruction when writeAccess).
template <bool readAccess, bool writeAccess>
class AlignedDataPtr
{
protected:
    const size_t size_;
    uchar* const originPtr_;
    const size_t alignment_;
    uchar* ptr_;            // aligned pointer handed out to the caller
    uchar* allocatedPtr_;   // non-NULL only when a bounce buffer was needed

public:
    AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
        : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
    {
        CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
        if (((size_t)ptr_ & (alignment - 1)) != 0)
        {
            allocatedPtr_ = new uchar[size_ + alignment - 1];
            ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
            if (readAccess)
            {
                memcpy(ptr_, originPtr_, size_);
            }
        }
    }

    uchar* getAlignedPtr() const
    {
        CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
        return ptr_;
    }

    ~AlignedDataPtr()
    {
        if (allocatedPtr_)
        {
            if (writeAccess)
            {
                // flush the bounce buffer back to the original location
                memcpy(originPtr_, ptr_, size_);
            }
            delete[] allocatedPtr_;
            allocatedPtr_ = NULL;
        }
        ptr_ = NULL;
    }
private:
    AlignedDataPtr(const AlignedDataPtr&); // disabled
    AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
};
#if defined _MSC_VER
#pragma warning(default:4127) // conditional expression is constant
#endif

#ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
#define CV_OPENCL_DATA_PTR_ALIGNMENT 16
#endif

// MatAllocator backed by OpenCL buffers, with per-kind buffer pools
// (plain, host-pointer, and optionally SVM).
class OpenCLAllocator : public MatAllocator
{
    mutable OpenCLBufferPoolImpl bufferPool;
    mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
#ifdef HAVE_OPENCL_SVM
    mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
#endif

    // Recorded in UMatData::allocatorFlags_ so deallocate() can return the
    // buffer to the pool it came from.
    enum AllocatorFlags
    {
        ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
        ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1
#ifdef HAVE_OPENCL_SVM
        ,ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2
#endif
    };
public:
    OpenCLAllocator()
        : bufferPool(0),
          bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
    {
        // pooling is enabled by default (128MB) on Intel devices only;
        // each limit can be overridden via its environment variable
        size_t defaultPoolSize, poolSize;
        defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
        poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
        bufferPool.setMaxReservedSize(poolSize);
        poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
        bufferPoolHostPtr.setMaxReservedSize(poolSize);
#ifdef HAVE_OPENCL_SVM
        poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
        bufferPoolSVM.setMaxReservedSize(poolSize);
#endif

        matStdAllocator = Mat::getStdAllocator();
    }

    // Fallback: delegate to the standard (CPU) Mat allocator.
    UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
                              int flags, UMatUsageFlags usageFlags) const
    {
        UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
        return u;
    }

    // Translate usage flags into cl_mem creation flags and UMatData flags;
    // devices without host-unified memory get COPY_ON_MAP.
    void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
    {
        const Device& dev = ctx.device(0);
        createFlags = 0;
        if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
            createFlags |= CL_MEM_ALLOC_HOST_PTR;

        if( dev.hostUnifiedMemory() )
            flags0 = 0;
        else
            flags0 = UMatData::COPY_ON_MAP;
    }

    // Allocate a fresh device buffer for a UMat of the given shape/type.
    // Falls back to the standard allocator when OpenCL is unavailable or the
    // pool allocation fails.
    UMatData* allocate(int dims, const int* sizes, int type,
                       void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
    {
        if(!useOpenCL())
            return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
        CV_Assert(data == 0);
        // compute total byte size and (optionally) the dense step array
        size_t total = CV_ELEM_SIZE(type);
        for( int i = dims-1; i >= 0; i-- )
        {
            if( step )
                step[i] = total;
            total *= sizes[i];
        }

        Context& ctx = Context::getDefault();

        int createFlags = 0, flags0 = 0;
        getBestFlags(ctx, flags, usageFlags, createFlags, flags0);

        void* handle = NULL;
        int allocatorFlags = 0;

#ifdef HAVE_OPENCL_SVM
        const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
        if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
        {
            allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
            handle = bufferPoolSVM.allocate(total);

            // this property is constant, so single buffer pool can be used here
            bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
            allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
        }
        else
#endif
        if (createFlags == 0)
        {
            allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
            handle = bufferPool.allocate(total);
        }
        else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
        {
            allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
            handle = bufferPoolHostPtr.allocate(total);
        }
        else
        {
            CV_Assert(handle != NULL); // Unsupported, throw
        }

        if (!handle)
            return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);

        UMatData* u = new UMatData(this);
        u->data = 0;
        u->size = total;
        u->handle = handle;
        u->flags = flags0;
        u->allocatorFlags_ = allocatorFlags;
        CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
        return u;
    }

    // Attach a device buffer to an existing host-allocated UMatData
    // (temporary UMat): wraps or copies u->origdata depending on what the
    // device supports (SVM fine-grain system / SVM buffer / CL_MEM_USE_HOST_PTR
    // / CL_MEM_COPY_HOST_PTR).
    bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
    {
        if(!u)
            return false;

        UMatDataAutoLock lock(u);

        if(u->handle == 0)
        {
            CV_Assert(u->origdata != 0);
            Context& ctx = Context::getDefault();
            int createFlags = 0, flags0 = 0;
            getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);

            cl_context ctx_handle = (cl_context)ctx.ptr();
            int allocatorFlags = 0;
            int tempUMatFlags = 0;
            void* handle = NULL;
            cl_int retval = CL_SUCCESS;

#ifdef HAVE_OPENCL_SVM
            svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
            bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
            if (useSVM && svmCaps.isSupportFineGrainSystem())
            {
                // fine-grain system SVM: host memory is directly usable by the device
                allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
                tempUMatFlags = UMatData::TEMP_UMAT;
                handle = u->origdata;
                CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
            }
            else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
            {
                if (!(accessFlags & ACCESS_FAST)) // memcpy used
                {
                    bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();

                    cl_svm_mem_flags memFlags = createFlags |
                        (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);

                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                    CV_DbgAssert(svmFns->isValid());

                    CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
                    handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
                    CV_Assert(handle);

                    // coarse-grain SVM buffers must be mapped before host writes
                    cl_command_queue q = NULL;
                    if (!isFineGrainBuffer)
                    {
                        q = (cl_command_queue)Queue::getDefault().ptr();
                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
                        cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
                                                                   handle, u->size,
                                                                   0, NULL, NULL);
                        CV_Assert(status == CL_SUCCESS);

                    }
                    memcpy(handle, u->origdata, u->size);
                    if (!isFineGrainBuffer)
                    {
                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
                        cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
                        CV_Assert(status == CL_SUCCESS);
                    }

                    tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
                    allocatorFlags |= isFineGrainBuffer ?
svm::OPENCL_SVM_FINE_GRAIN_BUFFER 4316 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER; 4317 } 4318 } 4319 else 4320 #endif 4321 { 4322 tempUMatFlags = UMatData::TEMP_UMAT; 4323 handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags, 4324 u->size, u->origdata, &retval); 4325 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST)) 4326 { 4327 handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags, 4328 u->size, u->origdata, &retval); 4329 tempUMatFlags |= UMatData::TEMP_COPIED_UMAT; 4330 } 4331 } 4332 if(!handle || retval != CL_SUCCESS) 4333 return false; 4334 u->handle = handle; 4335 u->prevAllocator = u->currAllocator; 4336 u->currAllocator = this; 4337 u->flags |= tempUMatFlags; 4338 u->allocatorFlags_ = allocatorFlags; 4339 } 4340 if(accessFlags & ACCESS_WRITE) 4341 u->markHostCopyObsolete(true); 4342 return true; 4343 } 4344 4345 /*void sync(UMatData* u) const 4346 { 4347 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); 4348 UMatDataAutoLock lock(u); 4349 4350 if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata) 4351 { 4352 if( u->tempCopiedUMat() ) 4353 { 4354 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, 4355 u->size, u->origdata, 0, 0, 0); 4356 } 4357 else 4358 { 4359 cl_int retval = 0; 4360 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE, 4361 (CL_MAP_READ | CL_MAP_WRITE), 4362 0, u->size, 0, 0, 0, &retval); 4363 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0); 4364 clFinish(q); 4365 } 4366 u->markHostCopyObsolete(false); 4367 } 4368 else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data ) 4369 { 4370 clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0, 4371 u->size, u->data, 0, 0, 0); 4372 } 4373 }*/ 4374 4375 void deallocate(UMatData* u) const 4376 { 4377 if(!u) 4378 return; 4379 4380 CV_Assert(u->urefcount >= 0); 4381 CV_Assert(u->refcount >= 0); 4382 4383 CV_Assert(u->handle != 0 && u->urefcount == 0); 4384 if(u->tempUMat()) 4385 { 
//            UMatDataAutoLock lock(u);

            // sync the stale host copy back from the device before detaching
            if( u->hostCopyObsolete() && u->refcount > 0 )
            {
#ifdef HAVE_OPENCL_SVM
                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
                {
                    Context& ctx = Context::getDefault();
                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                    CV_DbgAssert(svmFns->isValid());

                    if( u->tempCopiedUMat() )
                    {
                        CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
                                (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
                        bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
                        cl_command_queue q = NULL;
                        if (!isFineGrainBuffer)
                        {
                            CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
                            q = (cl_command_queue)Queue::getDefault().ptr();
                            CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
                            cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
                                    u->handle, u->size,
                                    0, NULL, NULL);
                            CV_Assert(status == CL_SUCCESS);
                        }
                        // NOTE(review): when isFineGrainBuffer is true, q stays NULL here and
                        // clFinish() receives an invalid queue -- confirm this path is intended.
                        clFinish(q);
                        memcpy(u->origdata, u->handle, u->size);
                        if (!isFineGrainBuffer)
                        {
                            CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
                            cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
                            CV_Assert(status == CL_SUCCESS);
                        }
                    }
                    else
                    {
                        // fine-grain system SVM: handle IS origdata, nothing to copy back
                        CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
                        // nothing
                    }
                }
                else
#endif
                {
                    cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
                    if( u->tempCopiedUMat() )
                    {
                        // device holds a separate copy: blocking read back into origdata
                        AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
                        CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                                            u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
                    }
                    else
                    {
                        // TODO Is it really needed for clCreateBuffer with CL_MEM_USE_HOST_PTR?
                        cl_int retval = 0;
                        void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                                        (CL_MAP_READ | CL_MAP_WRITE),
                                                        0, u->size, 0, 0, 0, &retval);
                        CV_OclDbgAssert(retval == CL_SUCCESS);
                        CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
                        CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
                    }
                }
                u->markHostCopyObsolete(false);
            }
#ifdef HAVE_OPENCL_SVM
            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
            {
                if( u->tempCopiedUMat() )
                {
                    Context& ctx = Context::getDefault();
                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                    CV_DbgAssert(svmFns->isValid());

                    CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
                    svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
                }
            }
            else
#endif
            {
                clReleaseMemObject((cl_mem)u->handle);
            }
            // detach from this allocator and hand the UMatData back to the
            // allocator it originally came from
            u->handle = 0;
            u->currAllocator = u->prevAllocator;
            if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
                fastFree(u->data);
            u->data = u->origdata;
            if(u->refcount == 0)
                u->currAllocator->deallocate(u);
        }
        else
        {
            // non-temporary UMat: return the buffer to its pool (or release it)
            CV_Assert(u->refcount == 0);
            if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
            {
                fastFree(u->data);
                u->data = 0;
            }
            if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
            {
                bufferPool.release((cl_mem)u->handle);
            }
            else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
            {
                bufferPoolHostPtr.release((cl_mem)u->handle);
            }
#ifdef HAVE_OPENCL_SVM
            else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
            {
                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
                {
                    //nothing
                }
                else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
                        (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
                {
                    Context& ctx = Context::getDefault();
                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                    CV_DbgAssert(svmFns->isValid());
                    cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

                    // drop a still-active map() before returning to the pool
                    if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
                    {
                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
                        cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
                        CV_Assert(status == CL_SUCCESS);
                    }
                }
                bufferPoolSVM.release((void*)u->handle);
            }
#endif
            else
            {
                clReleaseMemObject((cl_mem)u->handle);
            }
            u->handle = 0;
            delete u;
        }
    }

    // Make the device buffer's content accessible through u->data on the host.
    // Either maps the buffer (zero-copy / SVM) or, in copy-on-map mode,
    // downloads it into a separately allocated host block.
    void map(UMatData* u, int accessFlags) const
    {
        if(!u)
            return;

        CV_Assert( u->handle != 0 );

        UMatDataAutoLock autolock(u);

        if(accessFlags & ACCESS_WRITE)
            u->markDeviceCopyObsolete(true);

        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

        // FIXIT Workaround for UMat synchronization issue
        // if( u->refcount == 0 )
        {
            if( !u->copyOnMap() )
            {
                // TODO
                // because there can be other map requests for the same UMat with different access flags,
                // we use the universal (read-write) access mode.
#ifdef HAVE_OPENCL_SVM
                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
                {
                    // coarse-grain SVM requires an explicit map before host access;
                    // fine-grain variants are host-accessible as-is
                    if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
                    {
                        Context& ctx = Context::getDefault();
                        const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                        CV_DbgAssert(svmFns->isValid());

                        if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
                        {
                            CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
                            cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
                                    u->handle, u->size,
                                    0, NULL, NULL);
                            CV_Assert(status == CL_SUCCESS);
                            u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
                        }
                    }
                    clFinish(q);
                    // for SVM the host pointer is the buffer itself
                    u->data = (uchar*)u->handle;
                    u->markHostCopyObsolete(false);
                    u->markDeviceMemMapped(true);
                    return;
                }
#endif
                if (u->data) // FIXIT Workaround for UMat synchronization issue
                {
                    //CV_Assert(u->hostCopyObsolete() == false);
                    return;
                }

                // blocking map of the whole buffer
                cl_int retval = 0;
                u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
                                                     (CL_MAP_READ | CL_MAP_WRITE),
                                                     0, u->size, 0, 0, 0, &retval);
                if(u->data && retval == CL_SUCCESS)
                {
                    u->markHostCopyObsolete(false);
                    u->markDeviceMemMapped(true);
                    return;
                }

                // TODO Is it really a good idea and was it tested well?
                // if map failed, switch to copy-on-map mode for the particular buffer
                u->flags |= UMatData::COPY_ON_MAP;
            }

            if(!u->data)
            {
                u->data = (uchar*)fastMalloc(u->size);
                u->markHostCopyObsolete(true);
            }
        }

        // copy-on-map path: refresh the host block from the device if needed
        if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
        {
            AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
#ifdef HAVE_OPENCL_SVM
            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
#endif
            CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                        u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
            u->markHostCopyObsolete(false);
        }
    }

    // Undo map(): unmap the device buffer (or, in copy-on-map mode, write the
    // host block back to the device) and refresh the obsolete-copy flags.
    void unmap(UMatData* u) const
    {
        if(!u)
            return;


        CV_Assert(u->handle != 0);

        UMatDataAutoLock autolock(u);

        // FIXIT Workaround for UMat synchronization issue
        if(u->refcount > 0)
            return;

        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
        cl_int retval = 0;
        if( !u->copyOnMap() && u->deviceMemMapped() )
        {
            CV_Assert(u->data != NULL);
            u->markDeviceMemMapped(false);
#ifdef HAVE_OPENCL_SVM
            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
            {
                if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
                {
                    Context& ctx = Context::getDefault();
                    const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                    CV_DbgAssert(svmFns->isValid());

                    CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
                    {
                        CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
                        cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
                                0, NULL, NULL);
                        CV_Assert(status == CL_SUCCESS);
                        clFinish(q);
                        u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
                    }
                }
                u->data = 0;
                u->markDeviceCopyObsolete(false);
                u->markHostCopyObsolete(false);
                return;
            }
#endif
            CV_Assert( (retval = clEnqueueUnmapMemObject(q,
                                (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
            if (Device::getDefault().isAMD())
            {
                // required for multithreaded applications (see stitching test)
                CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
            }
            u->data = 0;
        }
        else if( u->copyOnMap() && u->deviceCopyObsolete() )
        {
            AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
#ifdef HAVE_OPENCL_SVM
            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
#endif
            CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
                                u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
        }
        u->markDeviceCopyObsolete(false);
        u->markHostCopyObsolete(false);
    }

    // Decide whether a dims-dimensional region copy is contiguous in both the
    // source and destination layouts. If not, fill the new_* arrays with the
    // {x, y, z}-ordered sizes/offsets/steps expected by the OpenCL *Rect APIs.
    // Raw byte offsets of the region start are always returned in
    // srcrawofs/dstrawofs.
    bool checkContinuous(int dims, const size_t sz[],
                         const size_t srcofs[], const size_t srcstep[],
                         const size_t dstofs[], const size_t dststep[],
                         size_t& total, size_t new_sz[],
                         size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
                         size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
    {
        bool iscontinuous = true;
        srcrawofs = srcofs ? srcofs[dims-1] : 0;
        dstrawofs = dstofs ? dstofs[dims-1] : 0;
        total = sz[dims-1];
        for( int i = dims-2; i >= 0; i-- )
        {
            // note: the `i >= 0` test is redundant here (the loop condition
            // already guarantees it)
            if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
                iscontinuous = false;
            total *= sz[i];
            if( srcofs )
                srcrawofs += srcofs[i]*srcstep[i];
            if( dstofs )
                dstrawofs += dstofs[i]*dststep[i];
        }

        if( !iscontinuous )
        {
            // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
            if( dims == 2 )
            {
                new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
                // we assume that new_
// arrays are initialized by caller
                // with 0's, so there is no else branch
                if( srcofs )
                {
                    new_srcofs[0] = srcofs[1];
                    new_srcofs[1] = srcofs[0];
                    new_srcofs[2] = 0;
                }

                if( dstofs )
                {
                    new_dstofs[0] = dstofs[1];
                    new_dstofs[1] = dstofs[0];
                    new_dstofs[2] = 0;
                }

                new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
                new_dststep[0] = dststep[0]; new_dststep[1] = 0;
            }
            else
            {
                // we could check for dims == 3 here,
                // but from user perspective this one is more informative
                CV_Assert(dims <= 3);
                new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
                if( srcofs )
                {
                    new_srcofs[0] = srcofs[2];
                    new_srcofs[1] = srcofs[1];
                    new_srcofs[2] = srcofs[0];
                }

                if( dstofs )
                {
                    new_dstofs[0] = dstofs[2];
                    new_dstofs[1] = dstofs[1];
                    new_dstofs[2] = dstofs[0];
                }

                new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
                new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
            }
        }
        return iscontinuous;
    }

    // Copy a (possibly strided) region of the device buffer into host memory
    // at dstptr. Uses the cached host copy when it is current; otherwise reads
    // via SVM memcpy or clEnqueueReadBuffer(Rect).
    void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
                  const size_t srcofs[], const size_t srcstep[],
                  const size_t dststep[]) const
    {
        if(!u)
            return;
        UMatDataAutoLock autolock(u);

        // fresh host copy available: plain CPU-side copy suffices
        if( u->data && !u->hostCopyObsolete() )
        {
            Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
            return;
        }
        CV_Assert( u->handle != 0 );

        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

        size_t total = 0, new_sz[] = {0, 0, 0};
        size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
        size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};

        bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
                                            total, new_sz,
                                            srcrawofs, new_srcofs, new_srcstep,
                                            dstrawofs, new_dstofs, new_dststep);

#ifdef HAVE_OPENCL_SVM
        if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
        {
            CV_DbgAssert(u->data == NULL || u->data == u->handle);
            Context& ctx = Context::getDefault();
            const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
            CV_DbgAssert(svmFns->isValid());

            CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
            if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
            {
                CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
                cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
                        u->handle, u->size,
                        0, NULL, NULL);
                CV_Assert(status == CL_SUCCESS);
            }
            clFinish(q);
            if( iscontinuous )
            {
                memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
            }
            else
            {
                // This code is from MatAllocator::download()
                int isz[CV_MAX_DIM];
                uchar* srcptr = (uchar*)u->handle;
                for( int i = 0; i < dims; i++ )
                {
                    CV_Assert( sz[i] <= (size_t)INT_MAX );
                    if( sz[i] == 0 )
                        return;
                    if( srcofs )
                        srcptr += srcofs[i]*(i <= dims-2 ?
srcstep[i] : 1); 4818 isz[i] = (int)sz[i]; 4819 } 4820 4821 Mat src(dims, isz, CV_8U, srcptr, srcstep); 4822 Mat dst(dims, isz, CV_8U, dstptr, dststep); 4823 4824 const Mat* arrays[] = { &src, &dst }; 4825 uchar* ptrs[2]; 4826 NAryMatIterator it(arrays, ptrs, 2); 4827 size_t j, planesz = it.size; 4828 4829 for( j = 0; j < it.nplanes; j++, ++it ) 4830 memcpy(ptrs[1], ptrs[0], planesz); 4831 } 4832 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER) 4833 { 4834 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle); 4835 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 4836 0, NULL, NULL); 4837 CV_Assert(status == CL_SUCCESS); 4838 clFinish(q); 4839 } 4840 } 4841 else 4842 #endif 4843 { 4844 AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT); 4845 if( iscontinuous ) 4846 { 4847 CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 4848 srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 ); 4849 } 4850 else 4851 { 4852 CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE, 4853 new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1], 4854 new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 ); 4855 } 4856 } 4857 } 4858 4859 void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[], 4860 const size_t dstofs[], const size_t dststep[], 4861 const size_t srcstep[]) const 4862 { 4863 if(!u) 4864 return; 4865 4866 // there should be no user-visible CPU copies of the UMat which we are going to copy to 4867 CV_Assert(u->refcount == 0 || u->tempUMat()); 4868 4869 size_t total = 0, new_sz[] = {0, 0, 0}; 4870 size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0}; 4871 size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0}; 4872 4873 bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep, 4874 total, new_sz, 4875 srcrawofs, new_srcofs, 
new_srcstep, 4876 dstrawofs, new_dstofs, new_dststep); 4877 4878 UMatDataAutoLock autolock(u); 4879 4880 // if there is cached CPU copy of the GPU matrix, 4881 // we could use it as a destination. 4882 // we can do it in 2 cases: 4883 // 1. we overwrite the whole content 4884 // 2. we overwrite part of the matrix, but the GPU copy is out-of-date 4885 if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size)) 4886 { 4887 Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep); 4888 u->markHostCopyObsolete(false); 4889 u->markDeviceCopyObsolete(true); 4890 return; 4891 } 4892 4893 CV_Assert( u->handle != 0 ); 4894 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr(); 4895 4896 #ifdef HAVE_OPENCL_SVM 4897 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0) 4898 { 4899 CV_DbgAssert(u->data == NULL || u->data == u->handle); 4900 Context& ctx = Context::getDefault(); 4901 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx); 4902 CV_DbgAssert(svmFns->isValid()); 4903 4904 CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0); 4905 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER) 4906 { 4907 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size); 4908 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE, 4909 u->handle, u->size, 4910 0, NULL, NULL); 4911 CV_Assert(status == CL_SUCCESS); 4912 } 4913 clFinish(q); 4914 if( iscontinuous ) 4915 { 4916 memcpy((uchar*)u->handle + dstrawofs, srcptr, total); 4917 } 4918 else 4919 { 4920 // This code is from MatAllocator::upload() 4921 int isz[CV_MAX_DIM]; 4922 uchar* dstptr = (uchar*)u->handle; 4923 for( int i = 0; i < dims; i++ ) 4924 { 4925 CV_Assert( sz[i] <= (size_t)INT_MAX ); 4926 if( sz[i] == 0 ) 4927 return; 4928 if( dstofs ) 4929 dstptr += dstofs[i]*(i <= dims-2 ? 
dststep[i] : 1); 4930 isz[i] = (int)sz[i]; 4931 } 4932 4933 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep); 4934 Mat dst(dims, isz, CV_8U, dstptr, dststep); 4935 4936 const Mat* arrays[] = { &src, &dst }; 4937 uchar* ptrs[2]; 4938 NAryMatIterator it(arrays, ptrs, 2); 4939 size_t j, planesz = it.size; 4940 4941 for( j = 0; j < it.nplanes; j++, ++it ) 4942 memcpy(ptrs[1], ptrs[0], planesz); 4943 } 4944 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER) 4945 { 4946 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle); 4947 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 4948 0, NULL, NULL); 4949 CV_Assert(status == CL_SUCCESS); 4950 clFinish(q); 4951 } 4952 } 4953 else 4954 #endif 4955 { 4956 AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT); 4957 if( iscontinuous ) 4958 { 4959 CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle, 4960 CL_TRUE, dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 ); 4961 } 4962 else 4963 { 4964 CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE, 4965 new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1], 4966 new_srcstep[0], new_srcstep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 ); 4967 } 4968 } 4969 u->markHostCopyObsolete(true); 4970 #ifdef HAVE_OPENCL_SVM 4971 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER || 4972 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM) 4973 { 4974 // nothing 4975 } 4976 else 4977 #endif 4978 { 4979 u->markHostCopyObsolete(true); 4980 } 4981 u->markDeviceCopyObsolete(false); 4982 } 4983 4984 void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[], 4985 const size_t srcofs[], const size_t srcstep[], 4986 const size_t dstofs[], const size_t dststep[], bool _sync) const 4987 { 4988 if(!src || !dst) 4989 return; 4990 4991 size_t total = 0, new_sz[] = {0, 0, 0}; 
size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
        size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};

        bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
                                            total, new_sz,
                                            srcrawofs, new_srcofs, new_srcstep,
                                            dstrawofs, new_dstofs, new_dststep);

        UMatDataAutoLock src_autolock(src);
        UMatDataAutoLock dst_autolock(dst);

        // source has no device buffer or a fresher host copy: route via upload()
        if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
        {
            upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
            return;
        }
        // destination has no device buffer or a fresher host copy: route via download()
        if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
        {
            download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
            dst->markHostCopyObsolete(false);
#ifdef HAVE_OPENCL_SVM
            if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
                (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
            {
                // nothing
            }
            else
#endif
            {
                dst->markDeviceCopyObsolete(true);
            }
            return;
        }

        // there should be no user-visible CPU copies of the UMat which we are going to copy to
        CV_Assert(dst->refcount == 0);
        cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();

        cl_int retval = CL_SUCCESS;
#ifdef HAVE_OPENCL_SVM
        if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
            (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
        {
            if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
                (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
            {
                // both sides are SVM: copy directly between SVM pointers
                Context& ctx = Context::getDefault();
                const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
                CV_DbgAssert(svmFns->isValid());

                if( iscontinuous )
                {
                    CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
                            (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
                    cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
                            (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
                            total, 0, NULL, NULL);
                    CV_Assert(status == CL_SUCCESS);
                }
                else
                {
                    clFinish(q);
                    // This code is from MatAllocator::download()/upload()
                    int isz[CV_MAX_DIM];
                    uchar* srcptr = (uchar*)src->handle;
                    for( int i = 0; i < dims; i++ )
                    {
                        CV_Assert( sz[i] <= (size_t)INT_MAX );
                        if( sz[i] == 0 )
                            return;
                        if( srcofs )
                            srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
                        isz[i] = (int)sz[i];
                    }
                    Mat m_src(dims, isz, CV_8U, srcptr, srcstep);

                    uchar* dstptr = (uchar*)dst->handle;
                    for( int i = 0; i < dims; i++ )
                    {
                        if( dstofs )
                            dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
                    }
                    Mat m_dst(dims, isz, CV_8U, dstptr, dststep);

                    const Mat* arrays[] = { &m_src, &m_dst };
                    uchar* ptrs[2];
                    NAryMatIterator it(arrays, ptrs, 2);
                    size_t j, planesz = it.size;

                    for( j = 0; j < it.nplanes; j++, ++it )
                        memcpy(ptrs[1], ptrs[0], planesz);
                }
            }
            else
            {
                // mixed SVM/non-SVM: map the SVM side and go through
                // upload()/download() for the other side
                if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
                {
                    map(src, ACCESS_READ);
                    upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
                    unmap(src);
                }
                else
                {
                    map(dst, ACCESS_WRITE);
                    download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
                    unmap(dst);
                }
            }
        }
        else
#endif
        {
            if( iscontinuous )
            {
                CV_Assert( (retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
                                               srcrawofs, dstrawofs, total, 0, 0, 0)) == CL_SUCCESS );
            }
            else
            {
                CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
                                                   new_srcofs, new_dstofs, new_sz,
                                                   new_srcstep[0], new_srcstep[1],
                                                   new_dststep[0], new_dststep[1],
                                                   0, 0, 0)) == CL_SUCCESS );
            }
        }
        if (retval == CL_SUCCESS)
        {
            CV_IMPL_ADD(CV_IMPL_OCL)
        }

#ifdef HAVE_OPENCL_SVM
        if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
            (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
        {
            // nothing
        }
        else
#endif
        {
            dst->markHostCopyObsolete(true);
        }
        dst->markDeviceCopyObsolete(false);

        if( _sync )
        {
            CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
        }
    }

    // Select a buffer pool by string ID: "OCL" (or NULL) -> default pool,
    // "HOST_ALLOC" -> host-pointer pool, "SVM" -> SVM pool; anything else throws.
    BufferPoolController* getBufferPoolController(const char* id) const {
#ifdef HAVE_OPENCL_SVM
        if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
        {
            return &bufferPoolSVM;
        }
#endif
        if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
        {
            return &bufferPoolHostPtr;
        }
        if (id != NULL && strcmp(id, "OCL") != 0)
        {
            CV_ErrorNoReturn(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
        }
        return &bufferPool;
    }

    // CPU-side allocator used for fallback allocations (set in the constructor).
    MatAllocator* matStdAllocator;
};

// Process-wide singleton accessor for the OpenCL UMat allocator.
// Note: the instance is intentionally leaked (never deleted) -- it must
// outlive all UMat objects.
MatAllocator* getOpenCLAllocator()
{
    static MatAllocator * allocator = new OpenCLAllocator();
    return allocator;
}

///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////

// Enumerate all devices (of any type) on the given platform into `devices`.
static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
{
    cl_uint numDevices = 0;
    CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
                                0, NULL, &numDevices) == CL_SUCCESS);

    if (numDevices == 0)
    {
        devices.clear();
        return;
    }

    devices.resize((size_t)numDevices);
    CV_OclDbgAssert(clGetDeviceIDs(platform,
(cl_device_type)Device::TYPE_ALL,
                                numDevices, &devices[0], &numDevices) == CL_SUCCESS);
}

// Refcounted implementation behind PlatformInfo: wraps a cl_platform_id and
// caches the platform's device list.
struct PlatformInfo::Impl
{
    Impl(void* id)
    {
        refcount = 1;
        handle = *(cl_platform_id*)id;
        getDevices(devices, handle);
    }

    // Query a string platform property; returns an empty String on failure
    // or if the value does not fit in the local buffer.
    // NOTE(review): the parameter is declared cl_device_info but is passed to
    // clGetPlatformInfo (which expects cl_platform_info) -- presumably both
    // are plain integer typedefs so this compiles; confirm and fix the type.
    String getStrProp(cl_device_info prop) const
    {
        char buf[1024];
        size_t sz=0;
        return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
            sz < sizeof(buf) ? String(buf) : String();
    }

    IMPLEMENT_REFCOUNTABLE();
    std::vector<cl_device_id> devices;  // all devices of this platform
    cl_platform_id handle;              // the wrapped OpenCL platform
};

// Empty PlatformInfo (no underlying platform).
PlatformInfo::PlatformInfo()
{
    p = 0;
}

// Wrap a platform; platform_id must point to a cl_platform_id.
PlatformInfo::PlatformInfo(void* platform_id)
{
    p = new Impl(platform_id);
}

PlatformInfo::~PlatformInfo()
{
    if(p)
        p->release();
}

// Copy: share the refcounted Impl.
PlatformInfo::PlatformInfo(const PlatformInfo& i)
{
    if (i.p)
        i.p->addref();
    p = i.p;
}

// Assignment: addref the source before releasing the old Impl (self-safe).
PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
{
    if (i.p != p)
    {
        if (i.p)
            i.p->addref();
        if (p)
            p->release();
        p = i.p;
    }
    return *this;
}

// Number of devices on this platform (0 for an empty PlatformInfo).
int PlatformInfo::deviceNumber() const
{
    return p ? (int)p->devices.size() : 0;
}

// Bind `device` to the d-th device of this platform.
void PlatformInfo::getDevice(Device& device, int d) const
{
    CV_Assert(p && d < (int)p->devices.size() );
    if(p)
        device.set(p->devices[d]);
}

String PlatformInfo::name() const
{
    return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
}

String PlatformInfo::vendor() const
{
    return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
}

String PlatformInfo::version() const
{
    return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
}

// Enumerate all available OpenCL platforms into `platforms`.
static void getPlatforms(std::vector<cl_platform_id>& platforms)
{
    cl_uint numPlatforms = 0;
    CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);

    if (numPlatforms == 0)
    {
        platforms.clear();
        return;
    }

    platforms.resize((size_t)numPlatforms);
    CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
}

// Public API: append a PlatformInfo for every available OpenCL platform.
// (Function name keeps its historical spelling "getPlatfomsInfo".)
void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
{
    std::vector<cl_platform_id> platforms;
    getPlatforms(platforms);

    for (size_t i = 0; i < platforms.size(); i++)
        platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
}

// Map an OpenCV type (depth + channel count) to the matching OpenCL C type
// name, e.g. CV_32FC4 -> "float4". Returns "?" for unsupported combinations
// (table rows are 16 entries wide, indexed by cn-1).
const char* typeToStr(int type)
{
    static const char* tab[]=
    {
        "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
        "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
        "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
        "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
        "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
        "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
    };
    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
    return cn > 16 ? "?"
: tab[depth*16 + cn-1];
}

// Like typeToStr(), but intended for raw memory-move kernels: per this table,
// the CV_32F row uses "int" and the CV_64F row uses "ulong" — integer types of
// the same size, presumably so data is copied by bit pattern. TODO confirm.
const char* memopTypeToStr(int type)
{
    static const char* tab[] =
    {
        "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
        "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
        "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
        "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
        "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
    };
    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
    return cn > 16 ? "?" : tab[depth*16 + cn-1];
}

// Vector-op variant: maps (depth, cn) to a single integer type wide enough to
// hold the whole element, e.g. 2 x uchar -> "short", 4 x uchar -> "int"
// (cn == 3 keeps the native 3-element type).
const char* vecopTypeToStr(int type)
{
    static const char* tab[] =
    {
        "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
        "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
        "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
        "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
        "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
        "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
    };
    int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
    return cn > 16 ? "?"
        : tab[depth*16 + cn-1];
}

// Build the OpenCL conversion function name for sdepth -> ddepth with cn
// channels into caller-provided 'buf' (must be large enough; sprintf is used).
// Returns "noconvert" when no conversion is needed, otherwise e.g.
// "convert_uchar4_sat_rte". Saturation is added when narrowing, _rte rounding
// when converting from a floating-point source.
const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
{
    if( sdepth == ddepth )
        return "noconvert";
    const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
    if( ddepth >= CV_32F ||
        (ddepth == CV_32S && sdepth < CV_32S) ||
        (ddepth == CV_16S && sdepth <= CV_8S) ||
        (ddepth == CV_16U && sdepth == CV_8U))
    {
        // widening (or to float): plain convert, no saturation needed
        sprintf(buf, "convert_%s", typestr);
    }
    else if( sdepth >= CV_32F )
        // float source: round-to-even; saturate only for integer destinations below 32 bits
        sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
    else
        // narrowing integer conversion: saturate
        sprintf(buf, "convert_%s_sat", typestr);

    return buf;
}

// Serialize a single-row kernel matrix as a "DIG(v)DIG(v)..." token string
// (the DIG macro is expected to be defined by the consuming OpenCL program).
// Small integer depths are printed as int; CV_32F values get an 'f' suffix.
template <typename T>
static std::string kerToStr(const Mat & k)
{
    int width = k.cols - 1, depth = k.depth();
    const T * const data = k.ptr<T>();

    std::ostringstream stream;
    stream.precision(10);

    if (depth <= CV_8S)
    {
        // promote char types so they print as numbers, not characters
        for (int i = 0; i < width; ++i)
            stream << "DIG(" << (int)data[i] << ")";
        stream << "DIG(" << (int)data[width] << ")";
    }
    else if (depth == CV_32F)
    {
        stream.setf(std::ios_base::showpoint);  // force decimal point so "1f" becomes "1.0f"
        for (int i = 0; i < width; ++i)
            stream << "DIG(" << data[i] << "f)";
        stream << "DIG(" << data[width] << "f)";
    }
    else
    {
        for (int i = 0; i < width; ++i)
            stream << "DIG(" << data[i] << ")";
        stream << "DIG(" << data[width] << ")";
    }

    return stream.str();
}

// Render a kernel as an OpenCL build option " -D <name>=<coeffs>"; the kernel
// is flattened to one row and converted to ddepth (defaults to its own depth).
String kernelToStr(InputArray _kernel, int ddepth, const char * name)
{
    Mat kernel = _kernel.getMat().reshape(1, 1);

    int depth = kernel.depth();
    if (ddepth < 0)
        ddepth = depth;

    if (ddepth != depth)
        kernel.convertTo(kernel, ddepth);

    // one serializer per CV depth; index 7 (CV_16F/user) is unsupported -> 0
    typedef std::string (* func_t)(const Mat &);
    static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
                                    kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
    const func_t func
= funcs[ddepth];
    CV_Assert(func != 0);

    return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
}

// Collect geometry of one non-empty source array for vector-width prediction.
// Evaluates to 'return 1' (scalar fallback) when the row is narrower than the
// preferred kernel vector width, or when OCL_VECTOR_OWN requires all sources
// to share src1's type and this one differs.
#define PROCESS_SRC(src) \
    do \
    { \
        if (!src.empty()) \
        { \
            CV_Assert(src.isMat() || src.isUMat()); \
            Size csize = src.size(); \
            int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
                ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
            if (cwidth < ckercn || ckercn <= 0) \
                return 1; \
            cols.push_back(cwidth); \
            if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
                return 1; \
            offsets.push_back(src.offset()); \
            steps.push_back(src.step()); \
            dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
            kercns.push_back(ckercn); \
        } \
    } \
    while ((void)0, 0)

// Predict the optimal OpenCL kernel vector width for up to 9 source arrays,
// seeded with the default device's preferred per-depth vector widths.
int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
                              InputArray src4, InputArray src5, InputArray src6,
                              InputArray src7, InputArray src8, InputArray src9,
                              OclVectorStrategy strat)
{
    const ocl::Device & d = ocl::Device::getDefault();

    // indexed by CV depth: 8U, 8S, 16U, 16S, 32S, 32F, 64F, <unused>
    int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
                           d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
                           d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
                           d.preferredVectorWidthDouble(), -1 };

    // if the device says don't use vectors
    if (vectorWidths[0] == 1)
    {
        // it's heuristic
        vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
        vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
        vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
    }

    return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
}

// Return the largest kernel vector width (elements per work-item) that keeps
// every source's offset, step and row width aligned; 1 means scalar access.
int checkOptimalVectorWidth(const int *vectorWidths,
                            InputArray src1, InputArray src2, InputArray src3,
                            InputArray src4, InputArray src5, InputArray src6,
                            InputArray src7, InputArray src8, InputArray src9,
                            OclVectorStrategy strat)
{
    CV_Assert(vectorWidths);

    int ref_type = src1.type();

    std::vector<size_t> offsets, steps, cols;
    std::vector<int> dividers, kercns;
    PROCESS_SRC(src1);
    PROCESS_SRC(src2);
    PROCESS_SRC(src3);
    PROCESS_SRC(src4);
    PROCESS_SRC(src5);
    PROCESS_SRC(src6);
    PROCESS_SRC(src7);
    PROCESS_SRC(src8);
    PROCESS_SRC(src9);

    size_t size = offsets.size();

    // halve the vector width until offset/step/cols are all divisible;
    // terminates because kercn == 1 always satisfies the condition
    for (size_t i = 0; i < size; ++i)
        while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
            dividers[i] >>= 1, kercns[i] >>= 1;

    // default strategy
    // NOTE(review): if every source is empty, kercns is empty and
    // min_element dereferences end() — callers apparently always pass a
    // non-empty src1; confirm before relying on this.
    int kercn = *std::min_element(kercns.begin(), kercns.end());

    return kercn;
}

// Convenience wrapper: predict with the OCL_VECTOR_MAX strategy.
int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
                                 InputArray src4, InputArray src5, InputArray src6,
                                 InputArray src7, InputArray src8, InputArray src9)
{
    return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
}

#undef PROCESS_SRC


// TODO Make this as a method of OpenCL "BuildOptions" class
// Append "-D <name>_T=... -D <name>_CN=..." etc. describing matrix _m's type
// to an OpenCL build-options string.
void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
{
    if (!buildOptions.empty())
        buildOptions += " ";
    int type = _m.type(), depth = CV_MAT_DEPTH(type);
    buildOptions += format(
            "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
            name.c_str(), ocl::typeToStr(type),
            name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
            name.c_str(), (int)CV_MAT_CN(type),
            name.c_str(), (int)CV_ELEM_SIZE(type),
            name.c_str(), (int)CV_ELEM_SIZE1(type),
            name.c_str(), (int)depth
    );
}


// Implementation behind Image2D: owns a cl_mem image created from (or
// aliasing) a UMat, with an intrusive refcount.
struct Image2D::Impl
{
    Impl(const UMat &src, bool norm, bool alias)
    {
        handle = 0;
refcount = 1;
        init(src, norm, alias);
    }

    ~Impl()
    {
        if (handle)
            clReleaseMemObject(handle);
    }

    // Translate (CV depth, channel count, normalized flag) into a
    // cl_image_format. Unsupported combinations yield -1 entries, which the
    // caller is expected to reject via isFormatSupported().
    static cl_image_format getImageFormat(int depth, int cn, bool norm)
    {
        cl_image_format format;
        static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
                                            CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
        static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
                                                CL_SNORM_INT16, -1, -1, -1, -1 };
        // indexed by cn; cn == 3 has no OpenCL channel order here
        static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };

        int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
        int channelOrder = channelOrders[cn];
        format.image_channel_data_type = (cl_channel_type)channelType;
        format.image_channel_order = (cl_channel_order)channelOrder;
        return format;
    }

    // True if the default context supports 'format' for read-write 2D images.
    static bool isFormatSupported(cl_image_format format)
    {
        if (!haveOpenCL())
            CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");

        cl_context context = (cl_context)Context::getDefault().ptr();
        // Figure out how many formats are supported by this context.
        cl_uint numFormats = 0;
        // NOTE(review): the error code of this first (counting) call is
        // overwritten before being checked; only the second call is asserted.
        cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
                                                CL_MEM_OBJECT_IMAGE2D, numFormats,
                                                NULL, &numFormats);
        AutoBuffer<cl_image_format> formats(numFormats);
        err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
                                         CL_MEM_OBJECT_IMAGE2D, numFormats,
                                         formats, NULL);
        CV_OclDbgAssert(err == CL_SUCCESS);
        for (cl_uint i = 0; i < numFormats; ++i)
        {
            if (!memcmp(&formats[i], &format, sizeof(format)))
            {
                return true;
            }
        }
        return false;
    }

    // Create the OpenCL image from 'src'. With alias=true the image shares
    // src's buffer (OpenCL 1.2+ only); otherwise the data is copied in.
    void init(const UMat &src, bool norm, bool alias)
    {
        if (!haveOpenCL())
            CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");

        CV_Assert(!src.empty());
        CV_Assert(ocl::Device::getDefault().imageSupport());

        int err, depth = src.depth(), cn = src.channels();
        CV_Assert(cn <= 4);  // channelOrders table only covers cn 1..4
        cl_image_format format = getImageFormat(depth, cn, norm);

        if (!isFormatSupported(format))
            CV_Error(Error::OpenCLApiCallError, "Image format is not supported");

        if (alias && !src.handle(ACCESS_RW))
            CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");

        cl_context context = (cl_context)Context::getDefault().ptr();
        cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();

#ifdef CL_VERSION_1_2
        // this enables backwards portability to
        // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
        const Device & d = ocl::Device::getDefault();
        int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
        CV_Assert(!alias || canCreateAlias(src));
        if (1 < major || (1 == major && 2 <= minor))
        {
            cl_image_desc desc;
            desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
            desc.image_width      = src.cols;
            desc.image_height     = src.rows;
            desc.image_depth      = 0;
            desc.image_array_size = 1;
            // aliased images must report the buffer's real row pitch
            desc.image_row_pitch  = alias ?
src.step[0] : 0;
            desc.image_slice_pitch = 0;
            // non-zero buffer makes clCreateImage alias the UMat's cl_mem
            desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
            desc.num_mip_levels   = 0;
            desc.num_samples      = 0;
            handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
        }
        else
#endif
        {
            CV_SUPPRESS_DEPRECATED_START
            CV_Assert(!alias); // This is an OpenCL 1.2 extension
            handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
            CV_SUPPRESS_DEPRECATED_END
        }
        CV_OclDbgAssert(err == CL_SUCCESS);

        size_t origin[] = { 0, 0, 0 };
        size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };

        cl_mem devData;
        if (!alias && !src.isContinuous())
        {
            // non-continuous source: compact the rows into a temporary buffer first
            devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
            CV_OclDbgAssert(err == CL_SUCCESS);

            const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
            CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
                roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
            CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
        }
        else
        {
            devData = (cl_mem)src.handle(ACCESS_READ);
        }
        CV_Assert(devData != NULL);

        if (!alias)
        {
            CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
            if (!src.isContinuous())
            {
                // release the temporary compacted buffer
                CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
                CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
            }
        }
    }

    IMPLEMENT_REFCOUNTABLE();

    cl_mem handle;  // the owned OpenCL image (released in the dtor)
};

// Default-constructed Image2D has no implementation attached.
Image2D::Image2D()
{
    p = NULL;
}

Image2D::Image2D(const UMat &src, bool norm, bool alias)
{
    p = new Impl(src, norm, alias);
}

// True if 'm' can back an image alias: the device supports image-from-buffer,
// the UMat's step satisfies the device's pitch alignment, and the buffer was
// not created with CL_MEM_USE_HOST_PTR.
bool
Image2D::canCreateAlias(const UMat &m)
{
    bool ret = false;
    const Device & d = ocl::Device::getDefault();
    if (d.imageFromBufferSupport() && !m.empty())
    {
        // This is the required pitch alignment in pixels
        uint pitchAlign = d.imagePitchAlignment();
        if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
        {
            // We don't currently handle the case where the buffer was created
            // with CL_MEM_USE_HOST_PTR
            if (!m.u->tempUMat())
            {
                ret = true;
            }
        }
    }
    return ret;
}

// Public format check: forwards to the Impl helpers.
bool Image2D::isFormatSupported(int depth, int cn, bool norm)
{
    cl_image_format format = Impl::getImageFormat(depth, cn, norm);

    return Impl::isFormatSupported(format);
}

Image2D::Image2D(const Image2D & i)
{
    p = i.p;
    if (p)
        p->addref();
}

Image2D & Image2D::operator = (const Image2D & i)
{
    if (i.p != p)
    {
        // addref the source before releasing ours, so aliased impls stay alive
        if (i.p)
            i.p->addref();
        if (p)
            p->release();
        p = i.p;
    }
    return *this;
}

Image2D::~Image2D()
{
    if (p)
        p->release();
}

// Raw cl_mem of the underlying image, or 0 when default-constructed.
void* Image2D::ptr() const
{
    return p ?
p->handle : 0;
}

// True when the OPENCV_OPENCL_PERF_CHECK_BYPASS parameter is set; the value is
// read once and cached (plain statics — not guarded against concurrent first use).
bool internal::isPerformanceCheckBypassed()
{
    static bool initialized = false;
    static bool value = false;
    if (!initialized)
    {
        value = getBoolParameter("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
        initialized = true;
    }
    return value;
}

// True if 'u' is backed by a plain OpenCL buffer (or has no device handle at
// all); false when SVM is in use. Must only be called for UMats owned by the
// OpenCL allocator (debug-asserted below).
bool internal::isCLBuffer(UMat& u)
{
    void* h = u.handle(ACCESS_RW);
    if (!h)
        return true;
    CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
#if 1
    // fast path: inspect allocator flags instead of asking the OpenCL runtime
    if ((u.u->allocatorFlags_ & 0xffff0000) != 0)  // OpenCL SVM flags are stored here
        return false;
#else
    // alternative: query the runtime for the mem object type
    cl_mem_object_type type = 0;
    cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
    if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
        return false;
#endif
    return true;
}

}} // namespaces (opened before this chunk)