Home | History | Annotate | Download | only in src
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
     14 // Third party copyrights are property of their respective owners.
     15 //
     16 // Redistribution and use in source and binary forms, with or without modification,
     17 // are permitted provided that the following conditions are met:
     18 //
     19 //   * Redistribution's of source code must retain the above copyright notice,
     20 //     this list of conditions and the following disclaimer.
     21 //
     22 //   * Redistribution's in binary form must reproduce the above copyright notice,
     23 //     this list of conditions and the following disclaimer in the documentation
     24 //     and/or other materials provided with the distribution.
     25 //
     26 //   * The name of the copyright holders may not be used to endorse or promote products
     27 //     derived from this software without specific prior written permission.
     28 //
     29 // This software is provided by the copyright holders and contributors "as is" and
     30 // any express or implied warranties, including, but not limited to, the implied
     31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     32 // In no event shall the OpenCV Foundation or contributors be liable for any direct,
     33 // indirect, incidental, special, exemplary, or consequential damages
     34 // (including, but not limited to, procurement of substitute goods or services;
     35 // loss of use, data, or profits; or business interruption) however caused
     36 // and on any theory of liability, whether in contract, strict liability,
     37 // or tort (including negligence or otherwise) arising in any way out of
     38 // the use of this software, even if advised of the possibility of such damage.
     39 //
     40 //M*/
     41 
     42 #include "precomp.hpp"
     43 #include <list>
     44 #include <map>
     45 #include <string>
     46 #include <sstream>
     47 #include <iostream> // std::cerr
     48 
     49 #define CV_OPENCL_ALWAYS_SHOW_BUILD_LOG 0
     50 #define CV_OPENCL_SHOW_RUN_ERRORS       0
     51 #define CV_OPENCL_SHOW_SVM_ERROR_LOG    1
     52 #define CV_OPENCL_SHOW_SVM_LOG          0
     53 
     54 #include "opencv2/core/bufferpool.hpp"
     55 #ifndef LOG_BUFFER_POOL
     56 # if 0
     57 #   define LOG_BUFFER_POOL printf
     58 # else
     59 #   define LOG_BUFFER_POOL(...)
     60 # endif
     61 #endif
     62 
     63 
     64 // TODO Move to some common place
     65 static bool getBoolParameter(const char* name, bool defaultValue)
     66 {
     67 /*
     68  * If your system doesn't support getenv(), define NO_GETENV to disable
     69  * this feature.
     70  */
     71 #ifdef NO_GETENV
     72     const char* envValue = NULL;
     73 #else
     74     const char* envValue = getenv(name);
     75 #endif
     76     if (envValue == NULL)
     77     {
     78         return defaultValue;
     79     }
     80     cv::String value = envValue;
     81     if (value == "1" || value == "True" || value == "true" || value == "TRUE")
     82     {
     83         return true;
     84     }
     85     if (value == "0" || value == "False" || value == "false" || value == "FALSE")
     86     {
     87         return false;
     88     }
     89     CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
     90 }
     91 
     92 
     93 // TODO Move to some common place
     94 static size_t getConfigurationParameterForSize(const char* name, size_t defaultValue)
     95 {
     96 #ifdef NO_GETENV
     97     const char* envValue = NULL;
     98 #else
     99     const char* envValue = getenv(name);
    100 #endif
    101     if (envValue == NULL)
    102     {
    103         return defaultValue;
    104     }
    105     cv::String value = envValue;
    106     size_t pos = 0;
    107     for (; pos < value.size(); pos++)
    108     {
    109         if (!isdigit(value[pos]))
    110             break;
    111     }
    112     cv::String valueStr = value.substr(0, pos);
    113     cv::String suffixStr = value.substr(pos, value.length() - pos);
    114     int v = atoi(valueStr.c_str());
    115     if (suffixStr.length() == 0)
    116         return v;
    117     else if (suffixStr == "MB" || suffixStr == "Mb" || suffixStr == "mb")
    118         return v * 1024 * 1024;
    119     else if (suffixStr == "KB" || suffixStr == "Kb" || suffixStr == "kb")
    120         return v * 1024;
    121     CV_ErrorNoReturn(cv::Error::StsBadArg, cv::format("Invalid value for %s parameter: %s", name, value.c_str()));
    122 }
    123 
    124 #if CV_OPENCL_SHOW_SVM_LOG
    125 // TODO add timestamp logging
    126 #define CV_OPENCL_SVM_TRACE_P printf("line %d (ocl.cpp): ", __LINE__); printf
    127 #else
    128 #define CV_OPENCL_SVM_TRACE_P(...)
    129 #endif
    130 
    131 #if CV_OPENCL_SHOW_SVM_ERROR_LOG
    132 // TODO add timestamp logging
    133 #define CV_OPENCL_SVM_TRACE_ERROR_P printf("Error on line %d (ocl.cpp): ", __LINE__); printf
    134 #else
    135 #define CV_OPENCL_SVM_TRACE_ERROR_P(...)
    136 #endif
    137 
    138 #include "opencv2/core/opencl/runtime/opencl_clamdblas.hpp"
    139 #include "opencv2/core/opencl/runtime/opencl_clamdfft.hpp"
    140 
    141 #ifdef HAVE_OPENCL
    142 #include "opencv2/core/opencl/runtime/opencl_core.hpp"
    143 #else
    144 // TODO FIXIT: This file can't be build without OPENCL
    145 
    146 /*
    147   Part of the file is an extract from the standard OpenCL headers from Khronos site.
    148   Below is the original copyright.
    149 */
    150 
    151 /*******************************************************************************
    152  * Copyright (c) 2008 - 2012 The Khronos Group Inc.
    153  *
    154  * Permission is hereby granted, free of charge, to any person obtaining a
    155  * copy of this software and/or associated documentation files (the
    156  * "Materials"), to deal in the Materials without restriction, including
    157  * without limitation the rights to use, copy, modify, merge, publish,
    158  * distribute, sublicense, and/or sell copies of the Materials, and to
    159  * permit persons to whom the Materials are furnished to do so, subject to
    160  * the following conditions:
    161  *
    162  * The above copyright notice and this permission notice shall be included
    163  * in all copies or substantial portions of the Materials.
    164  *
    165  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    166  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    167  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
    168  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
    169  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
    170  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
    171  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
    172  ******************************************************************************/
    173 
    174 #if 0 //defined __APPLE__
    175 #define HAVE_OPENCL 1
    176 #else
    177 #undef HAVE_OPENCL
    178 #endif
    179 
    180 #define OPENCV_CL_NOT_IMPLEMENTED -1000
    181 
    182 #ifdef HAVE_OPENCL
    183 
    184 #if defined __APPLE__
    185 #include <OpenCL/opencl.h>
    186 #else
    187 #include <CL/opencl.h>
    188 #endif
    189 
    190 static const bool g_haveOpenCL = true;
    191 
    192 #else
    193 
    194 extern "C" {
    195 
    196 struct _cl_platform_id { int dummy; };
    197 struct _cl_device_id { int dummy; };
    198 struct _cl_context { int dummy; };
    199 struct _cl_command_queue { int dummy; };
    200 struct _cl_mem { int dummy; };
    201 struct _cl_program { int dummy; };
    202 struct _cl_kernel { int dummy; };
    203 struct _cl_event { int dummy; };
    204 struct _cl_sampler { int dummy; };
    205 
    206 typedef struct _cl_platform_id *    cl_platform_id;
    207 typedef struct _cl_device_id *      cl_device_id;
    208 typedef struct _cl_context *        cl_context;
    209 typedef struct _cl_command_queue *  cl_command_queue;
    210 typedef struct _cl_mem *            cl_mem;
    211 typedef struct _cl_program *        cl_program;
    212 typedef struct _cl_kernel *         cl_kernel;
    213 typedef struct _cl_event *          cl_event;
    214 typedef struct _cl_sampler *        cl_sampler;
    215 
    216 typedef int cl_int;
    217 typedef unsigned cl_uint;
    218 #if defined (_WIN32) && defined(_MSC_VER)
    219     typedef __int64 cl_long;
    220     typedef unsigned __int64 cl_ulong;
    221 #else
    222     typedef long cl_long;
    223     typedef unsigned long cl_ulong;
    224 #endif
    225 
    226 typedef cl_uint             cl_bool; /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
    227 typedef cl_ulong            cl_bitfield;
    228 typedef cl_bitfield         cl_device_type;
    229 typedef cl_uint             cl_platform_info;
    230 typedef cl_uint             cl_device_info;
    231 typedef cl_bitfield         cl_device_fp_config;
    232 typedef cl_uint             cl_device_mem_cache_type;
    233 typedef cl_uint             cl_device_local_mem_type;
    234 typedef cl_bitfield         cl_device_exec_capabilities;
    235 typedef cl_bitfield         cl_command_queue_properties;
    236 typedef intptr_t            cl_device_partition_property;
    237 typedef cl_bitfield         cl_device_affinity_domain;
    238 
    239 typedef intptr_t            cl_context_properties;
    240 typedef cl_uint             cl_context_info;
    241 typedef cl_uint             cl_command_queue_info;
    242 typedef cl_uint             cl_channel_order;
    243 typedef cl_uint             cl_channel_type;
    244 typedef cl_bitfield         cl_mem_flags;
    245 typedef cl_uint             cl_mem_object_type;
    246 typedef cl_uint             cl_mem_info;
    247 typedef cl_bitfield         cl_mem_migration_flags;
    248 typedef cl_uint             cl_image_info;
    249 typedef cl_uint             cl_buffer_create_type;
    250 typedef cl_uint             cl_addressing_mode;
    251 typedef cl_uint             cl_filter_mode;
    252 typedef cl_uint             cl_sampler_info;
    253 typedef cl_bitfield         cl_map_flags;
    254 typedef cl_uint             cl_program_info;
    255 typedef cl_uint             cl_program_build_info;
    256 typedef cl_uint             cl_program_binary_type;
    257 typedef cl_int              cl_build_status;
    258 typedef cl_uint             cl_kernel_info;
    259 typedef cl_uint             cl_kernel_arg_info;
    260 typedef cl_uint             cl_kernel_arg_address_qualifier;
    261 typedef cl_uint             cl_kernel_arg_access_qualifier;
    262 typedef cl_bitfield         cl_kernel_arg_type_qualifier;
    263 typedef cl_uint             cl_kernel_work_group_info;
    264 typedef cl_uint             cl_event_info;
    265 typedef cl_uint             cl_command_type;
    266 typedef cl_uint             cl_profiling_info;
    267 
    268 
    269 typedef struct _cl_image_format {
    270     cl_channel_order        image_channel_order;
    271     cl_channel_type         image_channel_data_type;
    272 } cl_image_format;
    273 
    274 typedef struct _cl_image_desc {
    275     cl_mem_object_type      image_type;
    276     size_t                  image_width;
    277     size_t                  image_height;
    278     size_t                  image_depth;
    279     size_t                  image_array_size;
    280     size_t                  image_row_pitch;
    281     size_t                  image_slice_pitch;
    282     cl_uint                 num_mip_levels;
    283     cl_uint                 num_samples;
    284     cl_mem                  buffer;
    285 } cl_image_desc;
    286 
    287 typedef struct _cl_buffer_region {
    288     size_t                  origin;
    289     size_t                  size;
    290 } cl_buffer_region;
    291 
    292 
    293 //////////////////////////////////////////////////////////
    294 
    295 #define CL_SUCCESS                                  0
    296 #define CL_DEVICE_NOT_FOUND                         -1
    297 #define CL_DEVICE_NOT_AVAILABLE                     -2
    298 #define CL_COMPILER_NOT_AVAILABLE                   -3
    299 #define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
    300 #define CL_OUT_OF_RESOURCES                         -5
    301 #define CL_OUT_OF_HOST_MEMORY                       -6
    302 #define CL_PROFILING_INFO_NOT_AVAILABLE             -7
    303 #define CL_MEM_COPY_OVERLAP                         -8
    304 #define CL_IMAGE_FORMAT_MISMATCH                    -9
    305 #define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
    306 #define CL_BUILD_PROGRAM_FAILURE                    -11
    307 #define CL_MAP_FAILURE                              -12
    308 #define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
    309 #define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
    310 #define CL_COMPILE_PROGRAM_FAILURE                  -15
    311 #define CL_LINKER_NOT_AVAILABLE                     -16
    312 #define CL_LINK_PROGRAM_FAILURE                     -17
    313 #define CL_DEVICE_PARTITION_FAILED                  -18
    314 #define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
    315 
    316 #define CL_INVALID_VALUE                            -30
    317 #define CL_INVALID_DEVICE_TYPE                      -31
    318 #define CL_INVALID_PLATFORM                         -32
    319 #define CL_INVALID_DEVICE                           -33
    320 #define CL_INVALID_CONTEXT                          -34
    321 #define CL_INVALID_QUEUE_PROPERTIES                 -35
    322 #define CL_INVALID_COMMAND_QUEUE                    -36
    323 #define CL_INVALID_HOST_PTR                         -37
    324 #define CL_INVALID_MEM_OBJECT                       -38
    325 #define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
    326 #define CL_INVALID_IMAGE_SIZE                       -40
    327 #define CL_INVALID_SAMPLER                          -41
    328 #define CL_INVALID_BINARY                           -42
    329 #define CL_INVALID_BUILD_OPTIONS                    -43
    330 #define CL_INVALID_PROGRAM                          -44
    331 #define CL_INVALID_PROGRAM_EXECUTABLE               -45
    332 #define CL_INVALID_KERNEL_NAME                      -46
    333 #define CL_INVALID_KERNEL_DEFINITION                -47
    334 #define CL_INVALID_KERNEL                           -48
    335 #define CL_INVALID_ARG_INDEX                        -49
    336 #define CL_INVALID_ARG_VALUE                        -50
    337 #define CL_INVALID_ARG_SIZE                         -51
    338 #define CL_INVALID_KERNEL_ARGS                      -52
    339 #define CL_INVALID_WORK_DIMENSION                   -53
    340 #define CL_INVALID_WORK_GROUP_SIZE                  -54
    341 #define CL_INVALID_WORK_ITEM_SIZE                   -55
    342 #define CL_INVALID_GLOBAL_OFFSET                    -56
    343 #define CL_INVALID_EVENT_WAIT_LIST                  -57
    344 #define CL_INVALID_EVENT                            -58
    345 #define CL_INVALID_OPERATION                        -59
    346 #define CL_INVALID_GL_OBJECT                        -60
    347 #define CL_INVALID_BUFFER_SIZE                      -61
    348 #define CL_INVALID_MIP_LEVEL                        -62
    349 #define CL_INVALID_GLOBAL_WORK_SIZE                 -63
    350 #define CL_INVALID_PROPERTY                         -64
    351 #define CL_INVALID_IMAGE_DESCRIPTOR                 -65
    352 #define CL_INVALID_COMPILER_OPTIONS                 -66
    353 #define CL_INVALID_LINKER_OPTIONS                   -67
    354 #define CL_INVALID_DEVICE_PARTITION_COUNT           -68
    355 
    356 /*#define CL_VERSION_1_0                              1
    357 #define CL_VERSION_1_1                              1
    358 #define CL_VERSION_1_2                              1*/
    359 
    360 #define CL_FALSE                                    0
    361 #define CL_TRUE                                     1
    362 #define CL_BLOCKING                                 CL_TRUE
    363 #define CL_NON_BLOCKING                             CL_FALSE
    364 
    365 #define CL_PLATFORM_PROFILE                         0x0900
    366 #define CL_PLATFORM_VERSION                         0x0901
    367 #define CL_PLATFORM_NAME                            0x0902
    368 #define CL_PLATFORM_VENDOR                          0x0903
    369 #define CL_PLATFORM_EXTENSIONS                      0x0904
    370 
    371 #define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
    372 #define CL_DEVICE_TYPE_CPU                          (1 << 1)
    373 #define CL_DEVICE_TYPE_GPU                          (1 << 2)
    374 #define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
    375 #define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
    376 #define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
    377 #define CL_DEVICE_TYPE                              0x1000
    378 #define CL_DEVICE_VENDOR_ID                         0x1001
    379 #define CL_DEVICE_MAX_COMPUTE_UNITS                 0x1002
    380 #define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS          0x1003
    381 #define CL_DEVICE_MAX_WORK_GROUP_SIZE               0x1004
    382 #define CL_DEVICE_MAX_WORK_ITEM_SIZES               0x1005
    383 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR       0x1006
    384 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT      0x1007
    385 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT        0x1008
    386 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG       0x1009
    387 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT      0x100A
    388 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE     0x100B
    389 #define CL_DEVICE_MAX_CLOCK_FREQUENCY               0x100C
    390 #define CL_DEVICE_ADDRESS_BITS                      0x100D
    391 #define CL_DEVICE_MAX_READ_IMAGE_ARGS               0x100E
    392 #define CL_DEVICE_MAX_WRITE_IMAGE_ARGS              0x100F
    393 #define CL_DEVICE_MAX_MEM_ALLOC_SIZE                0x1010
    394 #define CL_DEVICE_IMAGE2D_MAX_WIDTH                 0x1011
    395 #define CL_DEVICE_IMAGE2D_MAX_HEIGHT                0x1012
    396 #define CL_DEVICE_IMAGE3D_MAX_WIDTH                 0x1013
    397 #define CL_DEVICE_IMAGE3D_MAX_HEIGHT                0x1014
    398 #define CL_DEVICE_IMAGE3D_MAX_DEPTH                 0x1015
    399 #define CL_DEVICE_IMAGE_SUPPORT                     0x1016
    400 #define CL_DEVICE_MAX_PARAMETER_SIZE                0x1017
    401 #define CL_DEVICE_MAX_SAMPLERS                      0x1018
    402 #define CL_DEVICE_MEM_BASE_ADDR_ALIGN               0x1019
    403 #define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          0x101A
    404 #define CL_DEVICE_SINGLE_FP_CONFIG                  0x101B
    405 #define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             0x101C
    406 #define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         0x101D
    407 #define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE             0x101E
    408 #define CL_DEVICE_GLOBAL_MEM_SIZE                   0x101F
    409 #define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE          0x1020
    410 #define CL_DEVICE_MAX_CONSTANT_ARGS                 0x1021
    411 #define CL_DEVICE_LOCAL_MEM_TYPE                    0x1022
    412 #define CL_DEVICE_LOCAL_MEM_SIZE                    0x1023
    413 #define CL_DEVICE_ERROR_CORRECTION_SUPPORT          0x1024
    414 #define CL_DEVICE_PROFILING_TIMER_RESOLUTION        0x1025
    415 #define CL_DEVICE_ENDIAN_LITTLE                     0x1026
    416 #define CL_DEVICE_AVAILABLE                         0x1027
    417 #define CL_DEVICE_COMPILER_AVAILABLE                0x1028
    418 #define CL_DEVICE_EXECUTION_CAPABILITIES            0x1029
    419 #define CL_DEVICE_QUEUE_PROPERTIES                  0x102A
    420 #define CL_DEVICE_NAME                              0x102B
    421 #define CL_DEVICE_VENDOR                            0x102C
    422 #define CL_DRIVER_VERSION                           0x102D
    423 #define CL_DEVICE_PROFILE                           0x102E
    424 #define CL_DEVICE_VERSION                           0x102F
    425 #define CL_DEVICE_EXTENSIONS                        0x1030
    426 #define CL_DEVICE_PLATFORM                          0x1031
    427 #define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
    428 #define CL_DEVICE_HALF_FP_CONFIG                    0x1033
    429 #define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       0x1034
    430 #define CL_DEVICE_HOST_UNIFIED_MEMORY               0x1035
    431 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR          0x1036
    432 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT         0x1037
    433 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT           0x1038
    434 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG          0x1039
    435 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT         0x103A
    436 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE        0x103B
    437 #define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          0x103C
    438 #define CL_DEVICE_OPENCL_C_VERSION                  0x103D
    439 #define CL_DEVICE_LINKER_AVAILABLE                  0x103E
    440 #define CL_DEVICE_BUILT_IN_KERNELS                  0x103F
    441 #define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             0x1040
    442 #define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              0x1041
    443 #define CL_DEVICE_PARENT_DEVICE                     0x1042
    444 #define CL_DEVICE_PARTITION_MAX_SUB_DEVICES         0x1043
    445 #define CL_DEVICE_PARTITION_PROPERTIES              0x1044
    446 #define CL_DEVICE_PARTITION_AFFINITY_DOMAIN         0x1045
    447 #define CL_DEVICE_PARTITION_TYPE                    0x1046
    448 #define CL_DEVICE_REFERENCE_COUNT                   0x1047
    449 #define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       0x1048
    450 #define CL_DEVICE_PRINTF_BUFFER_SIZE                0x1049
    451 #define CL_DEVICE_IMAGE_PITCH_ALIGNMENT             0x104A
    452 #define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT      0x104B
    453 
    454 #define CL_FP_DENORM                                (1 << 0)
    455 #define CL_FP_INF_NAN                               (1 << 1)
    456 #define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
    457 #define CL_FP_ROUND_TO_ZERO                         (1 << 3)
    458 #define CL_FP_ROUND_TO_INF                          (1 << 4)
    459 #define CL_FP_FMA                                   (1 << 5)
    460 #define CL_FP_SOFT_FLOAT                            (1 << 6)
    461 #define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
    462 
    463 #define CL_NONE                                     0x0
    464 #define CL_READ_ONLY_CACHE                          0x1
    465 #define CL_READ_WRITE_CACHE                         0x2
    466 #define CL_LOCAL                                    0x1
    467 #define CL_GLOBAL                                   0x2
    468 #define CL_EXEC_KERNEL                              (1 << 0)
    469 #define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
    470 #define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
    471 #define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
    472 
    473 #define CL_CONTEXT_REFERENCE_COUNT                  0x1080
    474 #define CL_CONTEXT_DEVICES                          0x1081
    475 #define CL_CONTEXT_PROPERTIES                       0x1082
    476 #define CL_CONTEXT_NUM_DEVICES                      0x1083
    477 #define CL_CONTEXT_PLATFORM                         0x1084
    478 #define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
    479 
    480 #define CL_DEVICE_PARTITION_EQUALLY                 0x1086
    481 #define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
    482 #define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
    483 #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
    484 #define CL_DEVICE_AFFINITY_DOMAIN_NUMA                     (1 << 0)
    485 #define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE                 (1 << 1)
    486 #define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE                 (1 << 2)
    487 #define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE                 (1 << 3)
    488 #define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE                 (1 << 4)
    489 #define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE       (1 << 5)
    490 #define CL_QUEUE_CONTEXT                            0x1090
    491 #define CL_QUEUE_DEVICE                             0x1091
    492 #define CL_QUEUE_REFERENCE_COUNT                    0x1092
    493 #define CL_QUEUE_PROPERTIES                         0x1093
    494 #define CL_MEM_READ_WRITE                           (1 << 0)
    495 #define CL_MEM_WRITE_ONLY                           (1 << 1)
    496 #define CL_MEM_READ_ONLY                            (1 << 2)
    497 #define CL_MEM_USE_HOST_PTR                         (1 << 3)
    498 #define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
    499 #define CL_MEM_COPY_HOST_PTR                        (1 << 5)
    500 // reserved                                         (1 << 6)
    501 #define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
    502 #define CL_MEM_HOST_READ_ONLY                       (1 << 8)
    503 #define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
    504 #define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
    505 #define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
    506 
    507 #define CL_R                                        0x10B0
    508 #define CL_A                                        0x10B1
    509 #define CL_RG                                       0x10B2
    510 #define CL_RA                                       0x10B3
    511 #define CL_RGB                                      0x10B4
    512 #define CL_RGBA                                     0x10B5
    513 #define CL_BGRA                                     0x10B6
    514 #define CL_ARGB                                     0x10B7
    515 #define CL_INTENSITY                                0x10B8
    516 #define CL_LUMINANCE                                0x10B9
    517 #define CL_Rx                                       0x10BA
    518 #define CL_RGx                                      0x10BB
    519 #define CL_RGBx                                     0x10BC
    520 #define CL_DEPTH                                    0x10BD
    521 #define CL_DEPTH_STENCIL                            0x10BE
    522 
    523 #define CL_SNORM_INT8                               0x10D0
    524 #define CL_SNORM_INT16                              0x10D1
    525 #define CL_UNORM_INT8                               0x10D2
    526 #define CL_UNORM_INT16                              0x10D3
    527 #define CL_UNORM_SHORT_565                          0x10D4
    528 #define CL_UNORM_SHORT_555                          0x10D5
    529 #define CL_UNORM_INT_101010                         0x10D6
    530 #define CL_SIGNED_INT8                              0x10D7
    531 #define CL_SIGNED_INT16                             0x10D8
    532 #define CL_SIGNED_INT32                             0x10D9
    533 #define CL_UNSIGNED_INT8                            0x10DA
    534 #define CL_UNSIGNED_INT16                           0x10DB
    535 #define CL_UNSIGNED_INT32                           0x10DC
    536 #define CL_HALF_FLOAT                               0x10DD
    537 #define CL_FLOAT                                    0x10DE
    538 #define CL_UNORM_INT24                              0x10DF
    539 
    540 #define CL_MEM_OBJECT_BUFFER                        0x10F0
    541 #define CL_MEM_OBJECT_IMAGE2D                       0x10F1
    542 #define CL_MEM_OBJECT_IMAGE3D                       0x10F2
    543 #define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
    544 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
    545 #define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
    546 #define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
    547 
    548 #define CL_MEM_TYPE                                 0x1100
    549 #define CL_MEM_FLAGS                                0x1101
    550 #define CL_MEM_SIZE                                 0x1102
    551 #define CL_MEM_HOST_PTR                             0x1103
    552 #define CL_MEM_MAP_COUNT                            0x1104
    553 #define CL_MEM_REFERENCE_COUNT                      0x1105
    554 #define CL_MEM_CONTEXT                              0x1106
    555 #define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
    556 #define CL_MEM_OFFSET                               0x1108
    557 
    558 #define CL_IMAGE_FORMAT                             0x1110
    559 #define CL_IMAGE_ELEMENT_SIZE                       0x1111
    560 #define CL_IMAGE_ROW_PITCH                          0x1112
    561 #define CL_IMAGE_SLICE_PITCH                        0x1113
    562 #define CL_IMAGE_WIDTH                              0x1114
    563 #define CL_IMAGE_HEIGHT                             0x1115
    564 #define CL_IMAGE_DEPTH                              0x1116
    565 #define CL_IMAGE_ARRAY_SIZE                         0x1117
    566 #define CL_IMAGE_BUFFER                             0x1118
    567 #define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
    568 #define CL_IMAGE_NUM_SAMPLES                        0x111A
    569 
    570 #define CL_ADDRESS_NONE                             0x1130
    571 #define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
    572 #define CL_ADDRESS_CLAMP                            0x1132
    573 #define CL_ADDRESS_REPEAT                           0x1133
    574 #define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
    575 
    576 #define CL_FILTER_NEAREST                           0x1140
    577 #define CL_FILTER_LINEAR                            0x1141
    578 
    579 #define CL_SAMPLER_REFERENCE_COUNT                  0x1150
    580 #define CL_SAMPLER_CONTEXT                          0x1151
    581 #define CL_SAMPLER_NORMALIZED_COORDS                0x1152
    582 #define CL_SAMPLER_ADDRESSING_MODE                  0x1153
    583 #define CL_SAMPLER_FILTER_MODE                      0x1154
    584 
    585 #define CL_MAP_READ                                 (1 << 0)
    586 #define CL_MAP_WRITE                                (1 << 1)
    587 #define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
    588 
    589 #define CL_PROGRAM_REFERENCE_COUNT                  0x1160
    590 #define CL_PROGRAM_CONTEXT                          0x1161
    591 #define CL_PROGRAM_NUM_DEVICES                      0x1162
    592 #define CL_PROGRAM_DEVICES                          0x1163
    593 #define CL_PROGRAM_SOURCE                           0x1164
    594 #define CL_PROGRAM_BINARY_SIZES                     0x1165
    595 #define CL_PROGRAM_BINARIES                         0x1166
    596 #define CL_PROGRAM_NUM_KERNELS                      0x1167
    597 #define CL_PROGRAM_KERNEL_NAMES                     0x1168
    598 #define CL_PROGRAM_BUILD_STATUS                     0x1181
    599 #define CL_PROGRAM_BUILD_OPTIONS                    0x1182
    600 #define CL_PROGRAM_BUILD_LOG                        0x1183
    601 #define CL_PROGRAM_BINARY_TYPE                      0x1184
    602 #define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
    603 #define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
    604 #define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
    605 #define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
    606 
    607 #define CL_BUILD_SUCCESS                            0
    608 #define CL_BUILD_NONE                               -1
    609 #define CL_BUILD_ERROR                              -2
    610 #define CL_BUILD_IN_PROGRESS                        -3
    611 
    612 #define CL_KERNEL_FUNCTION_NAME                     0x1190
    613 #define CL_KERNEL_NUM_ARGS                          0x1191
    614 #define CL_KERNEL_REFERENCE_COUNT                   0x1192
    615 #define CL_KERNEL_CONTEXT                           0x1193
    616 #define CL_KERNEL_PROGRAM                           0x1194
    617 #define CL_KERNEL_ATTRIBUTES                        0x1195
    618 #define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
    619 #define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
    620 #define CL_KERNEL_ARG_TYPE_NAME                     0x1198
    621 #define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
    622 #define CL_KERNEL_ARG_NAME                          0x119A
    623 #define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
    624 #define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
    625 #define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
    626 #define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
    627 #define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
    628 #define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
    629 #define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
    630 #define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
    631 #define CL_KERNEL_ARG_TYPE_NONE                     0
    632 #define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
    633 #define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
    634 #define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
    635 #define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
    636 #define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
    637 #define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
    638 #define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
    639 #define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
    640 #define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
    641 
    642 #define CL_EVENT_COMMAND_QUEUE                      0x11D0
    643 #define CL_EVENT_COMMAND_TYPE                       0x11D1
    644 #define CL_EVENT_REFERENCE_COUNT                    0x11D2
    645 #define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
    646 #define CL_EVENT_CONTEXT                            0x11D4
    647 
    648 #define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
    649 #define CL_COMMAND_TASK                             0x11F1
    650 #define CL_COMMAND_NATIVE_KERNEL                    0x11F2
    651 #define CL_COMMAND_READ_BUFFER                      0x11F3
    652 #define CL_COMMAND_WRITE_BUFFER                     0x11F4
    653 #define CL_COMMAND_COPY_BUFFER                      0x11F5
    654 #define CL_COMMAND_READ_IMAGE                       0x11F6
    655 #define CL_COMMAND_WRITE_IMAGE                      0x11F7
    656 #define CL_COMMAND_COPY_IMAGE                       0x11F8
    657 #define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
    658 #define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
    659 #define CL_COMMAND_MAP_BUFFER                       0x11FB
    660 #define CL_COMMAND_MAP_IMAGE                        0x11FC
    661 #define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
    662 #define CL_COMMAND_MARKER                           0x11FE
    663 #define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
    664 #define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
    665 #define CL_COMMAND_READ_BUFFER_RECT                 0x1201
    666 #define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
    667 #define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
    668 #define CL_COMMAND_USER                             0x1204
    669 #define CL_COMMAND_BARRIER                          0x1205
    670 #define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
    671 #define CL_COMMAND_FILL_BUFFER                      0x1207
    672 #define CL_COMMAND_FILL_IMAGE                       0x1208
    673 
    674 #define CL_COMPLETE                                 0x0
    675 #define CL_RUNNING                                  0x1
    676 #define CL_SUBMITTED                                0x2
    677 #define CL_QUEUED                                   0x3
    678 #define CL_BUFFER_CREATE_TYPE_REGION                0x1220
    679 
    680 #define CL_PROFILING_COMMAND_QUEUED                 0x1280
    681 #define CL_PROFILING_COMMAND_SUBMIT                 0x1281
    682 #define CL_PROFILING_COMMAND_START                  0x1282
    683 #define CL_PROFILING_COMMAND_END                    0x1283
    684 
    685 #define CL_CALLBACK CV_STDCALL
    686 
    687 static volatile bool g_haveOpenCL = false;
    688 static const char* oclFuncToCheck = "clEnqueueReadBufferRect";
    689 
    690 #if defined(__APPLE__)
    691 #include <dlfcn.h>
    692 
    693 static void* initOpenCLAndLoad(const char* funcname)
    694 {
    695     static bool initialized = false;
    696     static void* handle = 0;
    697     if (!handle)
    698     {
    699         if(!initialized)
    700         {
    701             const char* oclpath = getenv("OPENCV_OPENCL_RUNTIME");
    702             oclpath = oclpath && strlen(oclpath) > 0 ? oclpath :
    703                 "/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL";
    704             handle = dlopen(oclpath, RTLD_LAZY);
    705             initialized = true;
    706             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
    707             if( g_haveOpenCL )
    708                 fprintf(stderr, "Successfully loaded OpenCL v1.1+ runtime from %s\n", oclpath);
    709             else
    710                 fprintf(stderr, "Failed to load OpenCL runtime\n");
    711         }
    712         if(!handle)
    713             return 0;
    714     }
    715 
    716     return funcname && handle ? dlsym(handle, funcname) : 0;
    717 }
    718 
    719 #elif defined WIN32 || defined _WIN32
    720 
    721 #ifndef _WIN32_WINNT           // This is needed for the declaration of TryEnterCriticalSection in winbase.h with Visual Studio 2005 (and older?)
    722   #define _WIN32_WINNT 0x0400  // http://msdn.microsoft.com/en-us/library/ms686857(VS.85).aspx
    723 #endif
    724 #include <windows.h>
    725 #if (_WIN32_WINNT >= 0x0602)
    726   #include <synchapi.h>
    727 #endif
    728 #undef small
    729 #undef min
    730 #undef max
    731 #undef abs
    732 
    733 static void* initOpenCLAndLoad(const char* funcname)
    734 {
    735     static bool initialized = false;
    736     static HMODULE handle = 0;
    737     if (!handle)
    738     {
    739 #ifndef WINRT
    740         if(!initialized)
    741         {
    742             handle = LoadLibraryA("OpenCL.dll");
    743             initialized = true;
    744             g_haveOpenCL = handle != 0 && GetProcAddress(handle, oclFuncToCheck) != 0;
    745         }
    746 #endif
    747         if(!handle)
    748             return 0;
    749     }
    750 
    751     return funcname ? (void*)GetProcAddress(handle, funcname) : 0;
    752 }
    753 
    754 #elif defined(__linux)
    755 
    756 #include <dlfcn.h>
    757 #include <stdio.h>
    758 
    759 static void* initOpenCLAndLoad(const char* funcname)
    760 {
    761     static bool initialized = false;
    762     static void* handle = 0;
    763     if (!handle)
    764     {
    765         if(!initialized)
    766         {
    767             handle = dlopen("libOpenCL.so", RTLD_LAZY);
    768             if(!handle)
    769                 handle = dlopen("libCL.so", RTLD_LAZY);
    770             initialized = true;
    771             g_haveOpenCL = handle != 0 && dlsym(handle, oclFuncToCheck) != 0;
    772         }
    773         if(!handle)
    774             return 0;
    775     }
    776 
    777     return funcname ? (void*)dlsym(handle, funcname) : 0;
    778 }
    779 
    780 #else
    781 
    782 static void* initOpenCLAndLoad(const char*)
    783 {
    784     return 0;
    785 }
    786 
    787 #endif
    788 
    789 
    790 #define OCL_FUNC(rettype, funcname, argsdecl, args) \
    791     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
    792     static rettype funcname argsdecl \
    793     { \
    794         static funcname##_t funcname##_p = 0; \
    795         if( !funcname##_p ) \
    796         { \
    797             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
    798             if( !funcname##_p ) \
    799                 return OPENCV_CL_NOT_IMPLEMENTED; \
    800         } \
    801         return funcname##_p args; \
    802     }
    803 
    804 
    805 #define OCL_FUNC_P(rettype, funcname, argsdecl, args) \
    806     typedef rettype (CV_STDCALL * funcname##_t) argsdecl; \
    807     static rettype funcname argsdecl \
    808     { \
    809         static funcname##_t funcname##_p = 0; \
    810         if( !funcname##_p ) \
    811         { \
    812             funcname##_p = (funcname##_t)initOpenCLAndLoad(#funcname); \
    813             if( !funcname##_p ) \
    814             { \
    815                 if( errcode_ret ) \
    816                     *errcode_ret = OPENCV_CL_NOT_IMPLEMENTED; \
    817                 return 0; \
    818             } \
    819         } \
    820         return funcname##_p args; \
    821     }
    822 
    823 OCL_FUNC(cl_int, clGetPlatformIDs,
    824     (cl_uint num_entries, cl_platform_id* platforms, cl_uint* num_platforms),
    825     (num_entries, platforms, num_platforms))
    826 
    827 OCL_FUNC(cl_int, clGetPlatformInfo,
    828     (cl_platform_id platform, cl_platform_info param_name,
    829     size_t param_value_size, void * param_value,
    830     size_t * param_value_size_ret),
    831     (platform, param_name, param_value_size, param_value, param_value_size_ret))
    832 
    833 OCL_FUNC(cl_int, clGetDeviceInfo,
    834          (cl_device_id device,
    835           cl_device_info param_name,
    836           size_t param_value_size,
    837           void * param_value,
    838           size_t * param_value_size_ret),
    839          (device, param_name, param_value_size, param_value, param_value_size_ret))
    840 
    841 
    842 OCL_FUNC(cl_int, clGetDeviceIDs,
    843     (cl_platform_id platform,
    844     cl_device_type device_type,
    845     cl_uint num_entries,
    846     cl_device_id * devices,
    847     cl_uint * num_devices),
    848     (platform, device_type, num_entries, devices, num_devices))
    849 
    850 OCL_FUNC_P(cl_context, clCreateContext,
    851     (const cl_context_properties * properties,
    852     cl_uint num_devices,
    853     const cl_device_id * devices,
    854     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
    855     void * user_data,
    856     cl_int * errcode_ret),
    857     (properties, num_devices, devices, pfn_notify, user_data, errcode_ret))
    858 
    859 OCL_FUNC(cl_int, clReleaseContext, (cl_context context), (context))
    860 
    861 /*
    862 OCL_FUNC(cl_int, clRetainContext, (cl_context context), (context))
    863 
    864 OCL_FUNC_P(cl_context, clCreateContextFromType,
    865     (const cl_context_properties * properties,
    866     cl_device_type device_type,
    867     void (CL_CALLBACK * pfn_notify)(const char *, const void *, size_t, void *),
    868     void * user_data,
    869     cl_int * errcode_ret),
    870     (properties, device_type, pfn_notify, user_data, errcode_ret))
    871 
    872 OCL_FUNC(cl_int, clGetContextInfo,
    873     (cl_context context,
    874     cl_context_info param_name,
    875     size_t param_value_size,
    876     void * param_value,
    877     size_t * param_value_size_ret),
    878     (context, param_name, param_value_size,
    879     param_value, param_value_size_ret))
    880 */
    881 OCL_FUNC_P(cl_command_queue, clCreateCommandQueue,
    882     (cl_context context,
    883     cl_device_id device,
    884     cl_command_queue_properties properties,
    885     cl_int * errcode_ret),
    886     (context, device, properties, errcode_ret))
    887 
    888 OCL_FUNC(cl_int, clReleaseCommandQueue, (cl_command_queue command_queue), (command_queue))
    889 
    890 OCL_FUNC_P(cl_mem, clCreateBuffer,
    891     (cl_context context,
    892     cl_mem_flags flags,
    893     size_t size,
    894     void * host_ptr,
    895     cl_int * errcode_ret),
    896     (context, flags, size, host_ptr, errcode_ret))
    897 
    898 /*
    899 OCL_FUNC(cl_int, clRetainCommandQueue, (cl_command_queue command_queue), (command_queue))
    900 
    901 OCL_FUNC(cl_int, clGetCommandQueueInfo,
    902  (cl_command_queue command_queue,
    903  cl_command_queue_info param_name,
    904  size_t param_value_size,
    905  void * param_value,
    906  size_t * param_value_size_ret),
    907  (command_queue, param_name, param_value_size, param_value, param_value_size_ret))
    908 
    909 OCL_FUNC_P(cl_mem, clCreateSubBuffer,
    910     (cl_mem buffer,
    911     cl_mem_flags flags,
    912     cl_buffer_create_type buffer_create_type,
    913     const void * buffer_create_info,
    914     cl_int * errcode_ret),
    915     (buffer, flags, buffer_create_type, buffer_create_info, errcode_ret))
    916 */
    917 
    918 OCL_FUNC_P(cl_mem, clCreateImage,
    919     (cl_context context,
    920     cl_mem_flags flags,
    921     const cl_image_format * image_format,
    922     const cl_image_desc * image_desc,
    923     void * host_ptr,
    924     cl_int * errcode_ret),
    925     (context, flags, image_format, image_desc, host_ptr, errcode_ret))
    926 
    927 OCL_FUNC_P(cl_mem, clCreateImage2D,
    928     (cl_context context,
    929     cl_mem_flags flags,
    930     const cl_image_format * image_format,
    931     size_t image_width,
    932     size_t image_height,
    933     size_t image_row_pitch,
    934     void * host_ptr,
    935     cl_int *errcode_ret),
    936     (context, flags, image_format, image_width, image_height, image_row_pitch, host_ptr, errcode_ret))
    937 
    938 OCL_FUNC(cl_int, clGetSupportedImageFormats,
    939  (cl_context context,
    940  cl_mem_flags flags,
    941  cl_mem_object_type image_type,
    942  cl_uint num_entries,
    943  cl_image_format * image_formats,
    944  cl_uint * num_image_formats),
    945  (context, flags, image_type, num_entries, image_formats, num_image_formats))
    946 
    947 
    948 /*
    949 OCL_FUNC(cl_int, clGetMemObjectInfo,
    950  (cl_mem memobj,
    951  cl_mem_info param_name,
    952  size_t param_value_size,
    953  void * param_value,
    954  size_t * param_value_size_ret),
    955  (memobj, param_name, param_value_size, param_value, param_value_size_ret))
    956 
    957 OCL_FUNC(cl_int, clGetImageInfo,
    958  (cl_mem image,
    959  cl_image_info param_name,
    960  size_t param_value_size,
    961  void * param_value,
    962  size_t * param_value_size_ret),
    963  (image, param_name, param_value_size, param_value, param_value_size_ret))
    964 
    965 OCL_FUNC(cl_int, clCreateKernelsInProgram,
    966  (cl_program program,
    967  cl_uint num_kernels,
    968  cl_kernel * kernels,
    969  cl_uint * num_kernels_ret),
    970  (program, num_kernels, kernels, num_kernels_ret))
    971 
    972 OCL_FUNC(cl_int, clRetainKernel, (cl_kernel kernel), (kernel))
    973 
    974 OCL_FUNC(cl_int, clGetKernelArgInfo,
    975  (cl_kernel kernel,
    976  cl_uint arg_indx,
    977  cl_kernel_arg_info param_name,
    978  size_t param_value_size,
    979  void * param_value,
    980  size_t * param_value_size_ret),
    981  (kernel, arg_indx, param_name, param_value_size, param_value, param_value_size_ret))
    982 
    983 OCL_FUNC(cl_int, clEnqueueReadImage,
    984  (cl_command_queue command_queue,
    985  cl_mem image,
    986  cl_bool blocking_read,
    987  const size_t * origin[3],
    988  const size_t * region[3],
    989  size_t row_pitch,
    990  size_t slice_pitch,
    991  void * ptr,
    992  cl_uint num_events_in_wait_list,
    993  const cl_event * event_wait_list,
    994  cl_event * event),
    995  (command_queue, image, blocking_read, origin, region,
    996  row_pitch, slice_pitch,
    997  ptr,
    998  num_events_in_wait_list,
    999  event_wait_list,
   1000  event))
   1001 
   1002 OCL_FUNC(cl_int, clEnqueueWriteImage,
   1003  (cl_command_queue command_queue,
   1004  cl_mem image,
   1005  cl_bool blocking_write,
   1006  const size_t * origin[3],
   1007  const size_t * region[3],
   1008  size_t input_row_pitch,
   1009  size_t input_slice_pitch,
   1010  const void * ptr,
   1011  cl_uint num_events_in_wait_list,
   1012  const cl_event * event_wait_list,
   1013  cl_event * event),
   1014  (command_queue, image, blocking_write, origin, region, input_row_pitch,
   1015  input_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
   1016 
   1017 OCL_FUNC(cl_int, clEnqueueFillImage,
   1018  (cl_command_queue command_queue,
   1019  cl_mem image,
   1020  const void * fill_color,
   1021  const size_t * origin[3],
   1022  const size_t * region[3],
   1023  cl_uint num_events_in_wait_list,
   1024  const cl_event * event_wait_list,
   1025  cl_event * event),
   1026  (command_queue, image, fill_color, origin, region,
   1027  num_events_in_wait_list, event_wait_list, event))
   1028 
   1029 OCL_FUNC(cl_int, clEnqueueCopyImage,
   1030  (cl_command_queue command_queue,
   1031  cl_mem src_image,
   1032  cl_mem dst_image,
   1033  const size_t * src_origin[3],
   1034  const size_t * dst_origin[3],
   1035  const size_t * region[3],
   1036  cl_uint num_events_in_wait_list,
   1037  const cl_event * event_wait_list,
   1038  cl_event * event),
   1039  (command_queue, src_image, dst_image, src_origin, dst_origin,
   1040  region, num_events_in_wait_list, event_wait_list, event))
   1041 
   1042 OCL_FUNC(cl_int, clEnqueueCopyImageToBuffer,
   1043  (cl_command_queue command_queue,
   1044  cl_mem src_image,
   1045  cl_mem dst_buffer,
   1046  const size_t * src_origin[3],
   1047  const size_t * region[3],
   1048  size_t dst_offset,
   1049  cl_uint num_events_in_wait_list,
   1050  const cl_event * event_wait_list,
   1051  cl_event * event),
   1052  (command_queue, src_image, dst_buffer, src_origin, region, dst_offset,
   1053  num_events_in_wait_list, event_wait_list, event))
   1054 */
   1055 
   1056 OCL_FUNC(cl_int, clEnqueueCopyBufferToImage,
   1057  (cl_command_queue command_queue,
   1058  cl_mem src_buffer,
   1059  cl_mem dst_image,
   1060  size_t src_offset,
   1061  const size_t dst_origin[3],
   1062  const size_t region[3],
   1063  cl_uint num_events_in_wait_list,
   1064  const cl_event * event_wait_list,
   1065  cl_event * event),
   1066  (command_queue, src_buffer, dst_image, src_offset, dst_origin,
   1067  region, num_events_in_wait_list, event_wait_list, event))
   1068 
   1069  OCL_FUNC(cl_int, clFlush,
   1070  (cl_command_queue command_queue),
   1071  (command_queue))
   1072 
   1073 /*
   1074 OCL_FUNC_P(void*, clEnqueueMapImage,
   1075  (cl_command_queue command_queue,
   1076  cl_mem image,
   1077  cl_bool blocking_map,
   1078  cl_map_flags map_flags,
   1079  const size_t * origin[3],
   1080  const size_t * region[3],
   1081  size_t * image_row_pitch,
   1082  size_t * image_slice_pitch,
   1083  cl_uint num_events_in_wait_list,
   1084  const cl_event * event_wait_list,
   1085  cl_event * event,
   1086  cl_int * errcode_ret),
   1087  (command_queue, image, blocking_map, map_flags, origin, region,
   1088  image_row_pitch, image_slice_pitch, num_events_in_wait_list,
   1089  event_wait_list, event, errcode_ret))
   1090 */
   1091 
   1092 /*
   1093 OCL_FUNC(cl_int, clRetainProgram, (cl_program program), (program))
   1094 
   1095 OCL_FUNC(cl_int, clGetKernelInfo,
   1096  (cl_kernel kernel,
   1097  cl_kernel_info param_name,
   1098  size_t param_value_size,
   1099  void * param_value,
   1100  size_t * param_value_size_ret),
   1101  (kernel, param_name, param_value_size, param_value, param_value_size_ret))
   1102 
   1103 OCL_FUNC(cl_int, clRetainMemObject, (cl_mem memobj), (memobj))
   1104 
   1105 */
   1106 
   1107 OCL_FUNC(cl_int, clReleaseMemObject, (cl_mem memobj), (memobj))
   1108 
   1109 
   1110 OCL_FUNC_P(cl_program, clCreateProgramWithSource,
   1111     (cl_context context,
   1112     cl_uint count,
   1113     const char ** strings,
   1114     const size_t * lengths,
   1115     cl_int * errcode_ret),
   1116     (context, count, strings, lengths, errcode_ret))
   1117 
   1118 OCL_FUNC_P(cl_program, clCreateProgramWithBinary,
   1119     (cl_context context,
   1120     cl_uint num_devices,
   1121     const cl_device_id * device_list,
   1122     const size_t * lengths,
   1123     const unsigned char ** binaries,
   1124     cl_int * binary_status,
   1125     cl_int * errcode_ret),
   1126     (context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret))
   1127 
   1128 OCL_FUNC(cl_int, clReleaseProgram, (cl_program program), (program))
   1129 
   1130 OCL_FUNC(cl_int, clBuildProgram,
   1131     (cl_program program,
   1132     cl_uint num_devices,
   1133     const cl_device_id * device_list,
   1134     const char * options,
   1135     void (CL_CALLBACK * pfn_notify)(cl_program, void *),
   1136     void * user_data),
   1137     (program, num_devices, device_list, options, pfn_notify, user_data))
   1138 
   1139 OCL_FUNC(cl_int, clGetProgramInfo,
   1140     (cl_program program,
   1141     cl_program_info param_name,
   1142     size_t param_value_size,
   1143     void * param_value,
   1144     size_t * param_value_size_ret),
   1145     (program, param_name, param_value_size, param_value, param_value_size_ret))
   1146 
   1147 OCL_FUNC(cl_int, clGetProgramBuildInfo,
   1148     (cl_program program,
   1149     cl_device_id device,
   1150     cl_program_build_info param_name,
   1151     size_t param_value_size,
   1152     void * param_value,
   1153     size_t * param_value_size_ret),
   1154     (program, device, param_name, param_value_size, param_value, param_value_size_ret))
   1155 
   1156 OCL_FUNC_P(cl_kernel, clCreateKernel,
   1157     (cl_program program,
   1158     const char * kernel_name,
   1159     cl_int * errcode_ret),
   1160     (program, kernel_name, errcode_ret))
   1161 
   1162 OCL_FUNC(cl_int, clReleaseKernel, (cl_kernel kernel), (kernel))
   1163 
   1164 OCL_FUNC(cl_int, clSetKernelArg,
   1165     (cl_kernel kernel,
   1166     cl_uint arg_index,
   1167     size_t arg_size,
   1168     const void * arg_value),
   1169     (kernel, arg_index, arg_size, arg_value))
   1170 
   1171 OCL_FUNC(cl_int, clGetKernelWorkGroupInfo,
   1172     (cl_kernel kernel,
   1173     cl_device_id device,
   1174     cl_kernel_work_group_info param_name,
   1175     size_t param_value_size,
   1176     void * param_value,
   1177     size_t * param_value_size_ret),
   1178     (kernel, device, param_name, param_value_size, param_value, param_value_size_ret))
   1179 
   1180 OCL_FUNC(cl_int, clFinish, (cl_command_queue command_queue), (command_queue))
   1181 
   1182 OCL_FUNC(cl_int, clEnqueueReadBuffer,
   1183     (cl_command_queue command_queue,
   1184     cl_mem buffer,
   1185     cl_bool blocking_read,
   1186     size_t offset,
   1187     size_t size,
   1188     void * ptr,
   1189     cl_uint num_events_in_wait_list,
   1190     const cl_event * event_wait_list,
   1191     cl_event * event),
   1192     (command_queue, buffer, blocking_read, offset, size, ptr,
   1193     num_events_in_wait_list, event_wait_list, event))
   1194 
   1195 OCL_FUNC(cl_int, clEnqueueReadBufferRect,
   1196     (cl_command_queue command_queue,
   1197     cl_mem buffer,
   1198     cl_bool blocking_read,
   1199     const size_t * buffer_offset,
   1200     const size_t * host_offset,
   1201     const size_t * region,
   1202     size_t buffer_row_pitch,
   1203     size_t buffer_slice_pitch,
   1204     size_t host_row_pitch,
   1205     size_t host_slice_pitch,
   1206     void * ptr,
   1207     cl_uint num_events_in_wait_list,
   1208     const cl_event * event_wait_list,
   1209     cl_event * event),
   1210     (command_queue, buffer, blocking_read, buffer_offset, host_offset, region, buffer_row_pitch,
   1211     buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, num_events_in_wait_list,
   1212     event_wait_list, event))
   1213 
   1214 OCL_FUNC(cl_int, clEnqueueWriteBuffer,
   1215     (cl_command_queue command_queue,
   1216     cl_mem buffer,
   1217     cl_bool blocking_write,
   1218     size_t offset,
   1219     size_t size,
   1220     const void * ptr,
   1221     cl_uint num_events_in_wait_list,
   1222     const cl_event * event_wait_list,
   1223     cl_event * event),
   1224     (command_queue, buffer, blocking_write, offset, size, ptr,
   1225     num_events_in_wait_list, event_wait_list, event))
   1226 
   1227 OCL_FUNC(cl_int, clEnqueueWriteBufferRect,
   1228     (cl_command_queue command_queue,
   1229     cl_mem buffer,
   1230     cl_bool blocking_write,
   1231     const size_t * buffer_offset,
   1232     const size_t * host_offset,
   1233     const size_t * region,
   1234     size_t buffer_row_pitch,
   1235     size_t buffer_slice_pitch,
   1236     size_t host_row_pitch,
   1237     size_t host_slice_pitch,
   1238     const void * ptr,
   1239     cl_uint num_events_in_wait_list,
   1240     const cl_event * event_wait_list,
   1241     cl_event * event),
   1242     (command_queue, buffer, blocking_write, buffer_offset, host_offset,
   1243     region, buffer_row_pitch, buffer_slice_pitch, host_row_pitch,
   1244     host_slice_pitch, ptr, num_events_in_wait_list, event_wait_list, event))
   1245 
   1246 /*OCL_FUNC(cl_int, clEnqueueFillBuffer,
   1247     (cl_command_queue command_queue,
   1248     cl_mem buffer,
   1249     const void * pattern,
   1250     size_t pattern_size,
   1251     size_t offset,
   1252     size_t size,
   1253     cl_uint num_events_in_wait_list,
   1254     const cl_event * event_wait_list,
   1255     cl_event * event),
   1256     (command_queue, buffer, pattern, pattern_size, offset, size,
   1257     num_events_in_wait_list, event_wait_list, event))*/
   1258 
   1259 OCL_FUNC(cl_int, clEnqueueCopyBuffer,
   1260     (cl_command_queue command_queue,
   1261     cl_mem src_buffer,
   1262     cl_mem dst_buffer,
   1263     size_t src_offset,
   1264     size_t dst_offset,
   1265     size_t size,
   1266     cl_uint num_events_in_wait_list,
   1267     const cl_event * event_wait_list,
   1268     cl_event * event),
   1269     (command_queue, src_buffer, dst_buffer, src_offset, dst_offset,
   1270     size, num_events_in_wait_list, event_wait_list, event))
   1271 
   1272 OCL_FUNC(cl_int, clEnqueueCopyBufferRect,
   1273     (cl_command_queue command_queue,
   1274     cl_mem src_buffer,
   1275     cl_mem dst_buffer,
   1276     const size_t * src_origin,
   1277     const size_t * dst_origin,
   1278     const size_t * region,
   1279     size_t src_row_pitch,
   1280     size_t src_slice_pitch,
   1281     size_t dst_row_pitch,
   1282     size_t dst_slice_pitch,
   1283     cl_uint num_events_in_wait_list,
   1284     const cl_event * event_wait_list,
   1285     cl_event * event),
   1286     (command_queue, src_buffer, dst_buffer, src_origin, dst_origin,
   1287     region, src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch,
   1288     num_events_in_wait_list, event_wait_list, event))
   1289 
   1290 OCL_FUNC_P(void*, clEnqueueMapBuffer,
   1291     (cl_command_queue command_queue,
   1292     cl_mem buffer,
   1293     cl_bool blocking_map,
   1294     cl_map_flags map_flags,
   1295     size_t offset,
   1296     size_t size,
   1297     cl_uint num_events_in_wait_list,
   1298     const cl_event * event_wait_list,
   1299     cl_event * event,
   1300     cl_int * errcode_ret),
   1301     (command_queue, buffer, blocking_map, map_flags, offset, size,
   1302     num_events_in_wait_list, event_wait_list, event, errcode_ret))
   1303 
   1304 OCL_FUNC(cl_int, clEnqueueUnmapMemObject,
   1305     (cl_command_queue command_queue,
   1306     cl_mem memobj,
   1307     void * mapped_ptr,
   1308     cl_uint num_events_in_wait_list,
   1309     const cl_event * event_wait_list,
   1310     cl_event * event),
   1311     (command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event))
   1312 
   1313 OCL_FUNC(cl_int, clEnqueueNDRangeKernel,
   1314     (cl_command_queue command_queue,
   1315     cl_kernel kernel,
   1316     cl_uint work_dim,
   1317     const size_t * global_work_offset,
   1318     const size_t * global_work_size,
   1319     const size_t * local_work_size,
   1320     cl_uint num_events_in_wait_list,
   1321     const cl_event * event_wait_list,
   1322     cl_event * event),
   1323     (command_queue, kernel, work_dim, global_work_offset, global_work_size,
   1324     local_work_size, num_events_in_wait_list, event_wait_list, event))
   1325 
   1326 OCL_FUNC(cl_int, clEnqueueTask,
   1327     (cl_command_queue command_queue,
   1328     cl_kernel kernel,
   1329     cl_uint num_events_in_wait_list,
   1330     const cl_event * event_wait_list,
   1331     cl_event * event),
   1332     (command_queue, kernel, num_events_in_wait_list, event_wait_list, event))
   1333 
   1334 OCL_FUNC(cl_int, clSetEventCallback,
   1335     (cl_event event,
   1336     cl_int command_exec_callback_type ,
   1337     void (CL_CALLBACK  *pfn_event_notify) (cl_event event, cl_int event_command_exec_status, void *user_data),
   1338     void *user_data),
   1339     (event, command_exec_callback_type, pfn_event_notify, user_data))
   1340 
   1341 OCL_FUNC(cl_int, clReleaseEvent, (cl_event event), (event))
   1342 
   1343 }
   1344 
   1345 #endif
   1346 
   1347 #ifndef CL_VERSION_1_2
   1348 #define CL_VERSION_1_2
   1349 #endif
   1350 
   1351 #endif
   1352 
   1353 #ifdef _DEBUG
   1354 #define CV_OclDbgAssert CV_DbgAssert
   1355 #else
   1356 static bool isRaiseError()
   1357 {
   1358     static bool initialized = false;
   1359     static bool value = false;
   1360     if (!initialized)
   1361     {
   1362         value = getBoolParameter("OPENCV_OPENCL_RAISE_ERROR", false);
   1363         initialized = true;
   1364     }
   1365     return value;
   1366 }
   1367 #define CV_OclDbgAssert(expr) do { if (isRaiseError()) { CV_Assert(expr); } else { (void)(expr); } } while ((void)0, 0)
   1368 #endif
   1369 
   1370 #ifdef HAVE_OPENCL_SVM
   1371 #include "opencv2/core/opencl/runtime/opencl_svm_20.hpp"
   1372 #include "opencv2/core/opencl/runtime/opencl_svm_hsa_extension.hpp"
   1373 #include "opencv2/core/opencl/opencl_svm.hpp"
   1374 #endif
   1375 
   1376 namespace cv { namespace ocl {
   1377 
   1378 struct UMat2D
   1379 {
   1380     UMat2D(const UMat& m)
   1381     {
   1382         offset = (int)m.offset;
   1383         step = (int)m.step;
   1384         rows = m.rows;
   1385         cols = m.cols;
   1386     }
   1387     int offset;
   1388     int step;
   1389     int rows;
   1390     int cols;
   1391 };
   1392 
   1393 struct UMat3D
   1394 {
   1395     UMat3D(const UMat& m)
   1396     {
   1397         offset = (int)m.offset;
   1398         step = (int)m.step.p[1];
   1399         slicestep = (int)m.step.p[0];
   1400         slices = (int)m.size.p[0];
   1401         rows = m.size.p[1];
   1402         cols = m.size.p[2];
   1403     }
   1404     int offset;
   1405     int slicestep;
   1406     int step;
   1407     int slices;
   1408     int rows;
   1409     int cols;
   1410 };
   1411 
   1412 // Computes 64-bit "cyclic redundancy check" sum, as specified in ECMA-182
   1413 static uint64 crc64( const uchar* data, size_t size, uint64 crc0=0 )
   1414 {
   1415     static uint64 table[256];
   1416     static bool initialized = false;
   1417 
   1418     if( !initialized )
   1419     {
   1420         for( int i = 0; i < 256; i++ )
   1421         {
   1422             uint64 c = i;
   1423             for( int j = 0; j < 8; j++ )
   1424                 c = ((c & 1) ? CV_BIG_UINT(0xc96c5795d7870f42) : 0) ^ (c >> 1);
   1425             table[i] = c;
   1426         }
   1427         initialized = true;
   1428     }
   1429 
   1430     uint64 crc = ~crc0;
   1431     for( size_t idx = 0; idx < size; idx++ )
   1432         crc = table[(uchar)crc ^ data[idx]] ^ (crc >> 8);
   1433 
   1434     return ~crc;
   1435 }
   1436 
   1437 struct HashKey
   1438 {
   1439     typedef uint64 part;
   1440     HashKey(part _a, part _b) : a(_a), b(_b) {}
   1441     part a, b;
   1442 };
   1443 
   1444 inline bool operator == (const HashKey& h1, const HashKey& h2)
   1445 {
   1446     return h1.a == h2.a && h1.b == h2.b;
   1447 }
   1448 
   1449 inline bool operator < (const HashKey& h1, const HashKey& h2)
   1450 {
   1451     return h1.a < h2.a || (h1.a == h2.a && h1.b < h2.b);
   1452 }
   1453 
   1454 
   1455 bool haveOpenCL()
   1456 {
   1457 #ifdef HAVE_OPENCL
   1458     static bool g_isOpenCLInitialized = false;
   1459     static bool g_isOpenCLAvailable = false;
   1460 
   1461     if (!g_isOpenCLInitialized)
   1462     {
   1463         try
   1464         {
   1465             cl_uint n = 0;
   1466             g_isOpenCLAvailable = ::clGetPlatformIDs(0, NULL, &n) == CL_SUCCESS;
   1467         }
   1468         catch (...)
   1469         {
   1470             g_isOpenCLAvailable = false;
   1471         }
   1472         g_isOpenCLInitialized = true;
   1473     }
   1474     return g_isOpenCLAvailable;
   1475 #else
   1476     return false;
   1477 #endif
   1478 }
   1479 
   1480 bool useOpenCL()
   1481 {
   1482     CoreTLSData* data = getCoreTlsData().get();
   1483     if( data->useOpenCL < 0 )
   1484     {
   1485         try
   1486         {
   1487             data->useOpenCL = (int)haveOpenCL() && Device::getDefault().ptr() && Device::getDefault().available();
   1488         }
   1489         catch (...)
   1490         {
   1491             data->useOpenCL = 0;
   1492         }
   1493     }
   1494     return data->useOpenCL > 0;
   1495 }
   1496 
   1497 void setUseOpenCL(bool flag)
   1498 {
   1499     if( haveOpenCL() )
   1500     {
   1501         CoreTLSData* data = getCoreTlsData().get();
   1502         data->useOpenCL = (flag && Device::getDefault().ptr() != NULL) ? 1 : 0;
   1503     }
   1504 }
   1505 
   1506 #ifdef HAVE_CLAMDBLAS
   1507 
   1508 class AmdBlasHelper
   1509 {
   1510 public:
   1511     static AmdBlasHelper & getInstance()
   1512     {
   1513         static AmdBlasHelper amdBlas;
   1514         return amdBlas;
   1515     }
   1516 
   1517     bool isAvailable() const
   1518     {
   1519         return g_isAmdBlasAvailable;
   1520     }
   1521 
   1522     ~AmdBlasHelper()
   1523     {
   1524         try
   1525         {
   1526             clAmdBlasTeardown();
   1527         }
   1528         catch (...) { }
   1529     }
   1530 
   1531 protected:
   1532     AmdBlasHelper()
   1533     {
   1534         if (!g_isAmdBlasInitialized)
   1535         {
   1536             AutoLock lock(m);
   1537 
   1538             if (!g_isAmdBlasInitialized && haveOpenCL())
   1539             {
   1540                 try
   1541                 {
   1542                     g_isAmdBlasAvailable = clAmdBlasSetup() == clAmdBlasSuccess;
   1543                 }
   1544                 catch (...)
   1545                 {
   1546                     g_isAmdBlasAvailable = false;
   1547                 }
   1548             }
   1549             else
   1550                 g_isAmdBlasAvailable = false;
   1551 
   1552             g_isAmdBlasInitialized = true;
   1553         }
   1554     }
   1555 
   1556 private:
   1557     static Mutex m;
   1558     static bool g_isAmdBlasInitialized;
   1559     static bool g_isAmdBlasAvailable;
   1560 };
   1561 
   1562 bool AmdBlasHelper::g_isAmdBlasAvailable = false;
   1563 bool AmdBlasHelper::g_isAmdBlasInitialized = false;
   1564 Mutex AmdBlasHelper::m;
   1565 
   1566 bool haveAmdBlas()
   1567 {
   1568     return AmdBlasHelper::getInstance().isAvailable();
   1569 }
   1570 
   1571 #else
   1572 
   1573 bool haveAmdBlas()
   1574 {
   1575     return false;
   1576 }
   1577 
   1578 #endif
   1579 
   1580 #ifdef HAVE_CLAMDFFT
   1581 
   1582 class AmdFftHelper
   1583 {
   1584 public:
   1585     static AmdFftHelper & getInstance()
   1586     {
   1587         static AmdFftHelper amdFft;
   1588         return amdFft;
   1589     }
   1590 
   1591     bool isAvailable() const
   1592     {
   1593         return g_isAmdFftAvailable;
   1594     }
   1595 
   1596     ~AmdFftHelper()
   1597     {
   1598         try
   1599         {
   1600 //            clAmdFftTeardown();
   1601         }
   1602         catch (...) { }
   1603     }
   1604 
   1605 protected:
   1606     AmdFftHelper()
   1607     {
   1608         if (!g_isAmdFftInitialized)
   1609         {
   1610             AutoLock lock(m);
   1611 
   1612             if (!g_isAmdFftInitialized && haveOpenCL())
   1613             {
   1614                 try
   1615                 {
   1616                     cl_uint major, minor, patch;
   1617                     CV_Assert(clAmdFftInitSetupData(&setupData) == CLFFT_SUCCESS);
   1618 
   1619                     // it throws exception in case AmdFft binaries are not found
   1620                     CV_Assert(clAmdFftGetVersion(&major, &minor, &patch) == CLFFT_SUCCESS);
   1621                     g_isAmdFftAvailable = true;
   1622                 }
   1623                 catch (const Exception &)
   1624                 {
   1625                     g_isAmdFftAvailable = false;
   1626                 }
   1627             }
   1628             else
   1629                 g_isAmdFftAvailable = false;
   1630 
   1631             g_isAmdFftInitialized = true;
   1632         }
   1633     }
   1634 
   1635 private:
   1636     static clAmdFftSetupData setupData;
   1637     static Mutex m;
   1638     static bool g_isAmdFftInitialized;
   1639     static bool g_isAmdFftAvailable;
   1640 };
   1641 
   1642 clAmdFftSetupData AmdFftHelper::setupData;
   1643 bool AmdFftHelper::g_isAmdFftAvailable = false;
   1644 bool AmdFftHelper::g_isAmdFftInitialized = false;
   1645 Mutex AmdFftHelper::m;
   1646 
   1647 bool haveAmdFft()
   1648 {
   1649     return AmdFftHelper::getInstance().isAvailable();
   1650 }
   1651 
   1652 #else
   1653 
   1654 bool haveAmdFft()
   1655 {
   1656     return false;
   1657 }
   1658 
   1659 #endif
   1660 
   1661 bool haveSVM()
   1662 {
   1663 #ifdef HAVE_OPENCL_SVM
   1664     return true;
   1665 #else
   1666     return false;
   1667 #endif
   1668 }
   1669 
   1670 void finish()
   1671 {
   1672     Queue::getDefault().finish();
   1673 }
   1674 
   1675 #define IMPLEMENT_REFCOUNTABLE() \
   1676     void addref() { CV_XADD(&refcount, 1); } \
   1677     void release() { if( CV_XADD(&refcount, -1) == 1 && !cv::__termination) delete this; } \
   1678     int refcount
   1679 
   1680 /////////////////////////////////////////// Platform /////////////////////////////////////////////
   1681 
   1682 struct Platform::Impl
   1683 {
   1684     Impl()
   1685     {
   1686         refcount = 1;
   1687         handle = 0;
   1688         initialized = false;
   1689     }
   1690 
   1691     ~Impl() {}
   1692 
   1693     void init()
   1694     {
   1695         if( !initialized )
   1696         {
   1697             //cl_uint num_entries
   1698             cl_uint n = 0;
   1699             if( clGetPlatformIDs(1, &handle, &n) != CL_SUCCESS || n == 0 )
   1700                 handle = 0;
   1701             if( handle != 0 )
   1702             {
   1703                 char buf[1000];
   1704                 size_t len = 0;
   1705                 CV_OclDbgAssert(clGetPlatformInfo(handle, CL_PLATFORM_VENDOR, sizeof(buf), buf, &len) == CL_SUCCESS);
   1706                 buf[len] = '\0';
   1707                 vendor = String(buf);
   1708             }
   1709 
   1710             initialized = true;
   1711         }
   1712     }
   1713 
   1714     IMPLEMENT_REFCOUNTABLE();
   1715 
   1716     cl_platform_id handle;
   1717     String vendor;
   1718     bool initialized;
   1719 };
   1720 
   1721 Platform::Platform()
   1722 {
   1723     p = 0;
   1724 }
   1725 
   1726 Platform::~Platform()
   1727 {
   1728     if(p)
   1729         p->release();
   1730 }
   1731 
   1732 Platform::Platform(const Platform& pl)
   1733 {
   1734     p = (Impl*)pl.p;
   1735     if(p)
   1736         p->addref();
   1737 }
   1738 
   1739 Platform& Platform::operator = (const Platform& pl)
   1740 {
   1741     Impl* newp = (Impl*)pl.p;
   1742     if(newp)
   1743         newp->addref();
   1744     if(p)
   1745         p->release();
   1746     p = newp;
   1747     return *this;
   1748 }
   1749 
   1750 void* Platform::ptr() const
   1751 {
   1752     return p ? p->handle : 0;
   1753 }
   1754 
   1755 Platform& Platform::getDefault()
   1756 {
   1757     static Platform p;
   1758     if( !p.p )
   1759     {
   1760         p.p = new Impl;
   1761         p.p->init();
   1762     }
   1763     return p;
   1764 }
   1765 
   1766 /////////////////////////////////////// Device ////////////////////////////////////////////
   1767 
   1768 // deviceVersion has format
   1769 //   OpenCL<space><major_version.minor_version><space><vendor-specific information>
   1770 // by specification
   1771 //   http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/clGetDeviceInfo.html
   1772 //   http://www.khronos.org/registry/cl/sdk/1.2/docs/man/xhtml/clGetDeviceInfo.html
   1773 static void parseDeviceVersion(const String &deviceVersion, int &major, int &minor)
   1774 {
   1775     major = minor = 0;
   1776     if (10 >= deviceVersion.length())
   1777         return;
   1778     const char *pstr = deviceVersion.c_str();
   1779     if (0 != strncmp(pstr, "OpenCL ", 7))
   1780         return;
   1781     size_t ppos = deviceVersion.find('.', 7);
   1782     if (String::npos == ppos)
   1783         return;
   1784     String temp = deviceVersion.substr(7, ppos - 7);
   1785     major = atoi(temp.c_str());
   1786     temp = deviceVersion.substr(ppos + 1);
   1787     minor = atoi(temp.c_str());
   1788 }
   1789 
   1790 struct Device::Impl
   1791 {
   1792     Impl(void* d)
   1793     {
   1794         handle = (cl_device_id)d;
   1795         refcount = 1;
   1796 
   1797         name_ = getStrProp(CL_DEVICE_NAME);
   1798         version_ = getStrProp(CL_DEVICE_VERSION);
   1799         doubleFPConfig_ = getProp<cl_device_fp_config, int>(CL_DEVICE_DOUBLE_FP_CONFIG);
   1800         hostUnifiedMemory_ = getBoolProp(CL_DEVICE_HOST_UNIFIED_MEMORY);
   1801         maxComputeUnits_ = getProp<cl_uint, int>(CL_DEVICE_MAX_COMPUTE_UNITS);
   1802         maxWorkGroupSize_ = getProp<size_t, size_t>(CL_DEVICE_MAX_WORK_GROUP_SIZE);
   1803         type_ = getProp<cl_device_type, int>(CL_DEVICE_TYPE);
   1804         driverVersion_ = getStrProp(CL_DRIVER_VERSION);
   1805 
   1806         String deviceVersion_ = getStrProp(CL_DEVICE_VERSION);
   1807         parseDeviceVersion(deviceVersion_, deviceVersionMajor_, deviceVersionMinor_);
   1808 
   1809         vendorName_ = getStrProp(CL_DEVICE_VENDOR);
   1810         if (vendorName_ == "Advanced Micro Devices, Inc." ||
   1811             vendorName_ == "AMD")
   1812             vendorID_ = VENDOR_AMD;
   1813         else if (vendorName_ == "Intel(R) Corporation" || vendorName_ == "Intel" || strstr(name_.c_str(), "Iris") != 0)
   1814             vendorID_ = VENDOR_INTEL;
   1815         else if (vendorName_ == "NVIDIA Corporation")
   1816             vendorID_ = VENDOR_NVIDIA;
   1817         else
   1818             vendorID_ = UNKNOWN_VENDOR;
   1819     }
   1820 
   1821     template<typename _TpCL, typename _TpOut>
   1822     _TpOut getProp(cl_device_info prop) const
   1823     {
   1824         _TpCL temp=_TpCL();
   1825         size_t sz = 0;
   1826 
   1827         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
   1828             sz == sizeof(temp) ? _TpOut(temp) : _TpOut();
   1829     }
   1830 
   1831     bool getBoolProp(cl_device_info prop) const
   1832     {
   1833         cl_bool temp = CL_FALSE;
   1834         size_t sz = 0;
   1835 
   1836         return clGetDeviceInfo(handle, prop, sizeof(temp), &temp, &sz) == CL_SUCCESS &&
   1837             sz == sizeof(temp) ? temp != 0 : false;
   1838     }
   1839 
   1840     String getStrProp(cl_device_info prop) const
   1841     {
   1842         char buf[1024];
   1843         size_t sz=0;
   1844         return clGetDeviceInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
   1845             sz < sizeof(buf) ? String(buf) : String();
   1846     }
   1847 
   1848     IMPLEMENT_REFCOUNTABLE();
   1849     cl_device_id handle;
   1850 
   1851     String name_;
   1852     String version_;
   1853     int doubleFPConfig_;
   1854     bool hostUnifiedMemory_;
   1855     int maxComputeUnits_;
   1856     size_t maxWorkGroupSize_;
   1857     int type_;
   1858     int deviceVersionMajor_;
   1859     int deviceVersionMinor_;
   1860     String driverVersion_;
   1861     String vendorName_;
   1862     int vendorID_;
   1863 };
   1864 
   1865 
   1866 Device::Device()
   1867 {
   1868     p = 0;
   1869 }
   1870 
   1871 Device::Device(void* d)
   1872 {
   1873     p = 0;
   1874     set(d);
   1875 }
   1876 
   1877 Device::Device(const Device& d)
   1878 {
   1879     p = d.p;
   1880     if(p)
   1881         p->addref();
   1882 }
   1883 
   1884 Device& Device::operator = (const Device& d)
   1885 {
   1886     Impl* newp = (Impl*)d.p;
   1887     if(newp)
   1888         newp->addref();
   1889     if(p)
   1890         p->release();
   1891     p = newp;
   1892     return *this;
   1893 }
   1894 
   1895 Device::~Device()
   1896 {
   1897     if(p)
   1898         p->release();
   1899 }
   1900 
   1901 void Device::set(void* d)
   1902 {
   1903     if(p)
   1904         p->release();
   1905     p = new Impl(d);
   1906 }
   1907 
   1908 void* Device::ptr() const
   1909 {
   1910     return p ? p->handle : 0;
   1911 }
   1912 
   1913 String Device::name() const
   1914 { return p ? p->name_ : String(); }
   1915 
   1916 String Device::extensions() const
   1917 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
   1918 
   1919 String Device::version() const
   1920 { return p ? p->version_ : String(); }
   1921 
   1922 String Device::vendorName() const
   1923 { return p ? p->vendorName_ : String(); }
   1924 
   1925 int Device::vendorID() const
   1926 { return p ? p->vendorID_ : 0; }
   1927 
   1928 String Device::OpenCL_C_Version() const
   1929 { return p ? p->getStrProp(CL_DEVICE_OPENCL_C_VERSION) : String(); }
   1930 
   1931 String Device::OpenCLVersion() const
   1932 { return p ? p->getStrProp(CL_DEVICE_EXTENSIONS) : String(); }
   1933 
   1934 int Device::deviceVersionMajor() const
   1935 { return p ? p->deviceVersionMajor_ : 0; }
   1936 
   1937 int Device::deviceVersionMinor() const
   1938 { return p ? p->deviceVersionMinor_ : 0; }
   1939 
   1940 String Device::driverVersion() const
   1941 { return p ? p->driverVersion_ : String(); }
   1942 
   1943 int Device::type() const
   1944 { return p ? p->type_ : 0; }
   1945 
   1946 int Device::addressBits() const
   1947 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_ADDRESS_BITS) : 0; }
   1948 
   1949 bool Device::available() const
   1950 { return p ? p->getBoolProp(CL_DEVICE_AVAILABLE) : false; }
   1951 
   1952 bool Device::compilerAvailable() const
   1953 { return p ? p->getBoolProp(CL_DEVICE_COMPILER_AVAILABLE) : false; }
   1954 
   1955 bool Device::linkerAvailable() const
   1956 #ifdef CL_VERSION_1_2
   1957 { return p ? p->getBoolProp(CL_DEVICE_LINKER_AVAILABLE) : false; }
   1958 #else
   1959 { CV_REQUIRE_OPENCL_1_2_ERROR; }
   1960 #endif
   1961 
   1962 int Device::doubleFPConfig() const
   1963 { return p ? p->doubleFPConfig_ : 0; }
   1964 
   1965 int Device::singleFPConfig() const
   1966 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_SINGLE_FP_CONFIG) : 0; }
   1967 
   1968 int Device::halfFPConfig() const
   1969 #ifdef CL_VERSION_1_2
   1970 { return p ? p->getProp<cl_device_fp_config, int>(CL_DEVICE_HALF_FP_CONFIG) : 0; }
   1971 #else
   1972 { CV_REQUIRE_OPENCL_1_2_ERROR; }
   1973 #endif
   1974 
   1975 bool Device::endianLittle() const
   1976 { return p ? p->getBoolProp(CL_DEVICE_ENDIAN_LITTLE) : false; }
   1977 
   1978 bool Device::errorCorrectionSupport() const
   1979 { return p ? p->getBoolProp(CL_DEVICE_ERROR_CORRECTION_SUPPORT) : false; }
   1980 
   1981 int Device::executionCapabilities() const
   1982 { return p ? p->getProp<cl_device_exec_capabilities, int>(CL_DEVICE_EXECUTION_CAPABILITIES) : 0; }
   1983 
   1984 size_t Device::globalMemCacheSize() const
   1985 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE) : 0; }
   1986 
   1987 int Device::globalMemCacheType() const
   1988 { return p ? p->getProp<cl_device_mem_cache_type, int>(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE) : 0; }
   1989 
   1990 int Device::globalMemCacheLineSize() const
   1991 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE) : 0; }
   1992 
   1993 size_t Device::globalMemSize() const
   1994 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_GLOBAL_MEM_SIZE) : 0; }
   1995 
   1996 size_t Device::localMemSize() const
   1997 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_LOCAL_MEM_SIZE) : 0; }
   1998 
   1999 int Device::localMemType() const
   2000 { return p ? p->getProp<cl_device_local_mem_type, int>(CL_DEVICE_LOCAL_MEM_TYPE) : 0; }
   2001 
   2002 bool Device::hostUnifiedMemory() const
   2003 { return p ? p->hostUnifiedMemory_ : false; }
   2004 
   2005 bool Device::imageSupport() const
   2006 { return p ? p->getBoolProp(CL_DEVICE_IMAGE_SUPPORT) : false; }
   2007 
   2008 bool Device::imageFromBufferSupport() const
   2009 {
   2010     bool ret = false;
   2011     if (p)
   2012     {
   2013         size_t pos = p->getStrProp(CL_DEVICE_EXTENSIONS).find("cl_khr_image2d_from_buffer");
   2014         if (pos != String::npos)
   2015         {
   2016             ret = true;
   2017         }
   2018     }
   2019     return ret;
   2020 }
   2021 
   2022 uint Device::imagePitchAlignment() const
   2023 {
   2024 #ifdef CL_DEVICE_IMAGE_PITCH_ALIGNMENT
   2025     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_PITCH_ALIGNMENT) : 0;
   2026 #else
   2027     return 0;
   2028 #endif
   2029 }
   2030 
   2031 uint Device::imageBaseAddressAlignment() const
   2032 {
   2033 #ifdef CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT
   2034     return p ? p->getProp<cl_uint, uint>(CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT) : 0;
   2035 #else
   2036     return 0;
   2037 #endif
   2038 }
   2039 
   2040 size_t Device::image2DMaxWidth() const
   2041 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_WIDTH) : 0; }
   2042 
   2043 size_t Device::image2DMaxHeight() const
   2044 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE2D_MAX_HEIGHT) : 0; }
   2045 
   2046 size_t Device::image3DMaxWidth() const
   2047 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_WIDTH) : 0; }
   2048 
   2049 size_t Device::image3DMaxHeight() const
   2050 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_HEIGHT) : 0; }
   2051 
   2052 size_t Device::image3DMaxDepth() const
   2053 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE3D_MAX_DEPTH) : 0; }
   2054 
   2055 size_t Device::imageMaxBufferSize() const
   2056 #ifdef CL_VERSION_1_2
   2057 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE) : 0; }
   2058 #else
   2059 { CV_REQUIRE_OPENCL_1_2_ERROR; }
   2060 #endif
   2061 
   2062 size_t Device::imageMaxArraySize() const
   2063 #ifdef CL_VERSION_1_2
   2064 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE) : 0; }
   2065 #else
   2066 { CV_REQUIRE_OPENCL_1_2_ERROR; }
   2067 #endif
   2068 
   2069 int Device::maxClockFrequency() const
   2070 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CLOCK_FREQUENCY) : 0; }
   2071 
   2072 int Device::maxComputeUnits() const
   2073 { return p ? p->maxComputeUnits_ : 0; }
   2074 
   2075 int Device::maxConstantArgs() const
   2076 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_CONSTANT_ARGS) : 0; }
   2077 
   2078 size_t Device::maxConstantBufferSize() const
   2079 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE) : 0; }
   2080 
   2081 size_t Device::maxMemAllocSize() const
   2082 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_MEM_ALLOC_SIZE) : 0; }
   2083 
   2084 size_t Device::maxParameterSize() const
   2085 { return p ? p->getProp<cl_ulong, size_t>(CL_DEVICE_MAX_PARAMETER_SIZE) : 0; }
   2086 
   2087 int Device::maxReadImageArgs() const
   2088 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_READ_IMAGE_ARGS) : 0; }
   2089 
   2090 int Device::maxWriteImageArgs() const
   2091 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WRITE_IMAGE_ARGS) : 0; }
   2092 
   2093 int Device::maxSamplers() const
   2094 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_SAMPLERS) : 0; }
   2095 
   2096 size_t Device::maxWorkGroupSize() const
   2097 { return p ? p->maxWorkGroupSize_ : 0; }
   2098 
   2099 int Device::maxWorkItemDims() const
   2100 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS) : 0; }
   2101 
   2102 void Device::maxWorkItemSizes(size_t* sizes) const
   2103 {
   2104     if(p)
   2105     {
   2106         const int MAX_DIMS = 32;
   2107         size_t retsz = 0;
   2108         CV_OclDbgAssert(clGetDeviceInfo(p->handle, CL_DEVICE_MAX_WORK_ITEM_SIZES,
   2109                 MAX_DIMS*sizeof(sizes[0]), &sizes[0], &retsz) == CL_SUCCESS);
   2110     }
   2111 }
   2112 
   2113 int Device::memBaseAddrAlign() const
   2114 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_MEM_BASE_ADDR_ALIGN) : 0; }
   2115 
   2116 int Device::nativeVectorWidthChar() const
   2117 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR) : 0; }
   2118 
   2119 int Device::nativeVectorWidthShort() const
   2120 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT) : 0; }
   2121 
   2122 int Device::nativeVectorWidthInt() const
   2123 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT) : 0; }
   2124 
   2125 int Device::nativeVectorWidthLong() const
   2126 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG) : 0; }
   2127 
   2128 int Device::nativeVectorWidthFloat() const
   2129 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT) : 0; }
   2130 
   2131 int Device::nativeVectorWidthDouble() const
   2132 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE) : 0; }
   2133 
   2134 int Device::nativeVectorWidthHalf() const
   2135 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF) : 0; }
   2136 
   2137 int Device::preferredVectorWidthChar() const
   2138 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR) : 0; }
   2139 
   2140 int Device::preferredVectorWidthShort() const
   2141 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT) : 0; }
   2142 
   2143 int Device::preferredVectorWidthInt() const
   2144 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT) : 0; }
   2145 
   2146 int Device::preferredVectorWidthLong() const
   2147 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG) : 0; }
   2148 
   2149 int Device::preferredVectorWidthFloat() const
   2150 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT) : 0; }
   2151 
   2152 int Device::preferredVectorWidthDouble() const
   2153 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE) : 0; }
   2154 
   2155 int Device::preferredVectorWidthHalf() const
   2156 { return p ? p->getProp<cl_uint, int>(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF) : 0; }
   2157 
   2158 size_t Device::printfBufferSize() const
   2159 #ifdef CL_VERSION_1_2
   2160 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PRINTF_BUFFER_SIZE) : 0; }
   2161 #else
   2162 { CV_REQUIRE_OPENCL_1_2_ERROR; }
   2163 #endif
   2164 
   2165 
   2166 size_t Device::profilingTimerResolution() const
   2167 { return p ? p->getProp<size_t, size_t>(CL_DEVICE_PROFILING_TIMER_RESOLUTION) : 0; }
   2168 
   2169 const Device& Device::getDefault()
   2170 {
   2171     const Context& ctx = Context::getDefault();
   2172     int idx = getCoreTlsData().get()->device;
   2173     const Device& device = ctx.device(idx);
   2174     return device;
   2175 }
   2176 
   2177 ////////////////////////////////////// Context ///////////////////////////////////////////////////
   2178 
   2179 template <typename Functor, typename ObjectType>
   2180 inline cl_int getStringInfo(Functor f, ObjectType obj, cl_uint name, std::string& param)
   2181 {
   2182     ::size_t required;
   2183     cl_int err = f(obj, name, 0, NULL, &required);
   2184     if (err != CL_SUCCESS)
   2185         return err;
   2186 
   2187     param.clear();
   2188     if (required > 0)
   2189     {
   2190         AutoBuffer<char> buf(required + 1);
   2191         char* ptr = (char*)buf; // cleanup is not needed
   2192         err = f(obj, name, required, ptr, NULL);
   2193         if (err != CL_SUCCESS)
   2194             return err;
   2195         param = ptr;
   2196     }
   2197 
   2198     return CL_SUCCESS;
   2199 }
   2200 
   2201 static void split(const std::string &s, char delim, std::vector<std::string> &elems)
   2202 {
   2203     elems.clear();
   2204     if (s.size() == 0)
   2205         return;
   2206     std::istringstream ss(s);
   2207     std::string item;
   2208     while (!ss.eof())
   2209     {
   2210         std::getline(ss, item, delim);
   2211         elems.push_back(item);
   2212     }
   2213 }
   2214 
   2215 // Layout: <Platform>:<CPU|GPU|ACCELERATOR|nothing=GPU/CPU>:<deviceName>
   2216 // Sample: AMD:GPU:
   2217 // Sample: AMD:GPU:Tahiti
   2218 // Sample: :GPU|CPU: = '' = ':' = '::'
   2219 static bool parseOpenCLDeviceConfiguration(const std::string& configurationStr,
   2220         std::string& platform, std::vector<std::string>& deviceTypes, std::string& deviceNameOrID)
   2221 {
   2222     std::vector<std::string> parts;
   2223     split(configurationStr, ':', parts);
   2224     if (parts.size() > 3)
   2225     {
   2226         std::cerr << "ERROR: Invalid configuration string for OpenCL device" << std::endl;
   2227         return false;
   2228     }
   2229     if (parts.size() > 2)
   2230         deviceNameOrID = parts[2];
   2231     if (parts.size() > 1)
   2232     {
   2233         split(parts[1], '|', deviceTypes);
   2234     }
   2235     if (parts.size() > 0)
   2236     {
   2237         platform = parts[0];
   2238     }
   2239     return true;
   2240 }
   2241 
   2242 #ifdef WINRT
   2243 static cl_device_id selectOpenCLDevice()
   2244 {
   2245     return NULL;
   2246 }
   2247 #else
   2248 static cl_device_id selectOpenCLDevice()
   2249 {
   2250     std::string platform, deviceName;
   2251     std::vector<std::string> deviceTypes;
   2252 
   2253     const char* configuration = getenv("OPENCV_OPENCL_DEVICE");
   2254     if (configuration &&
   2255             (strcmp(configuration, "disabled") == 0 ||
   2256              !parseOpenCLDeviceConfiguration(std::string(configuration), platform, deviceTypes, deviceName)
   2257             ))
   2258         return NULL;
   2259 
   2260     bool isID = false;
   2261     int deviceID = -1;
   2262     if (deviceName.length() == 1)
   2263     // We limit ID range to 0..9, because we want to write:
   2264     // - '2500' to mean i5-2500
   2265     // - '8350' to mean AMD FX-8350
   2266     // - '650' to mean GeForce 650
   2267     // To extend ID range change condition to '> 0'
   2268     {
   2269         isID = true;
   2270         for (size_t i = 0; i < deviceName.length(); i++)
   2271         {
   2272             if (!isdigit(deviceName[i]))
   2273             {
   2274                 isID = false;
   2275                 break;
   2276             }
   2277         }
   2278         if (isID)
   2279         {
   2280             deviceID = atoi(deviceName.c_str());
   2281             if (deviceID < 0)
   2282                 return NULL;
   2283         }
   2284     }
   2285 
   2286     std::vector<cl_platform_id> platforms;
   2287     {
   2288         cl_uint numPlatforms = 0;
   2289         CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
   2290 
   2291         if (numPlatforms == 0)
   2292             return NULL;
   2293         platforms.resize((size_t)numPlatforms);
   2294         CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
   2295         platforms.resize(numPlatforms);
   2296     }
   2297 
   2298     int selectedPlatform = -1;
   2299     if (platform.length() > 0)
   2300     {
   2301         for (size_t i = 0; i < platforms.size(); i++)
   2302         {
   2303             std::string name;
   2304             CV_OclDbgAssert(getStringInfo(clGetPlatformInfo, platforms[i], CL_PLATFORM_NAME, name) == CL_SUCCESS);
   2305             if (name.find(platform) != std::string::npos)
   2306             {
   2307                 selectedPlatform = (int)i;
   2308                 break;
   2309             }
   2310         }
   2311         if (selectedPlatform == -1)
   2312         {
   2313             std::cerr << "ERROR: Can't find OpenCL platform by name: " << platform << std::endl;
   2314             goto not_found;
   2315         }
   2316     }
   2317     if (deviceTypes.size() == 0)
   2318     {
   2319         if (!isID)
   2320         {
   2321             deviceTypes.push_back("GPU");
   2322             if (configuration)
   2323                 deviceTypes.push_back("CPU");
   2324         }
   2325         else
   2326             deviceTypes.push_back("ALL");
   2327     }
   2328     for (size_t t = 0; t < deviceTypes.size(); t++)
   2329     {
   2330         int deviceType = 0;
   2331         std::string tempStrDeviceType = deviceTypes[t];
   2332         std::transform( tempStrDeviceType.begin(), tempStrDeviceType.end(), tempStrDeviceType.begin(), tolower );
   2333 
   2334         if (tempStrDeviceType == "gpu" || tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
   2335             deviceType = Device::TYPE_GPU;
   2336         else if (tempStrDeviceType == "cpu")
   2337             deviceType = Device::TYPE_CPU;
   2338         else if (tempStrDeviceType == "accelerator")
   2339             deviceType = Device::TYPE_ACCELERATOR;
   2340         else if (tempStrDeviceType == "all")
   2341             deviceType = Device::TYPE_ALL;
   2342         else
   2343         {
   2344             std::cerr << "ERROR: Unsupported device type for OpenCL device (GPU, CPU, ACCELERATOR): " << deviceTypes[t] << std::endl;
   2345             goto not_found;
   2346         }
   2347 
   2348         std::vector<cl_device_id> devices; // TODO Use clReleaseDevice to cleanup
   2349         for (int i = selectedPlatform >= 0 ? selectedPlatform : 0;
   2350                 (selectedPlatform >= 0 ? i == selectedPlatform : true) && (i < (int)platforms.size());
   2351                 i++)
   2352         {
   2353             cl_uint count = 0;
   2354             cl_int status = clGetDeviceIDs(platforms[i], deviceType, 0, NULL, &count);
   2355             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
   2356             if (count == 0)
   2357                 continue;
   2358             size_t base = devices.size();
   2359             devices.resize(base + count);
   2360             status = clGetDeviceIDs(platforms[i], deviceType, count, &devices[base], &count);
   2361             CV_OclDbgAssert(status == CL_SUCCESS || status == CL_DEVICE_NOT_FOUND);
   2362         }
   2363 
   2364         for (size_t i = (isID ? deviceID : 0);
   2365              (isID ? (i == (size_t)deviceID) : true) && (i < devices.size());
   2366              i++)
   2367         {
   2368             std::string name;
   2369             CV_OclDbgAssert(getStringInfo(clGetDeviceInfo, devices[i], CL_DEVICE_NAME, name) == CL_SUCCESS);
   2370             cl_bool useGPU = true;
   2371             if(tempStrDeviceType == "dgpu" || tempStrDeviceType == "igpu")
   2372             {
   2373                 cl_bool isIGPU = CL_FALSE;
   2374                 clGetDeviceInfo(devices[i], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(isIGPU), &isIGPU, NULL);
   2375                 useGPU = tempStrDeviceType == "dgpu" ? !isIGPU : isIGPU;
   2376             }
   2377             if ( (isID || name.find(deviceName) != std::string::npos) && useGPU)
   2378             {
   2379                 // TODO check for OpenCL 1.1
   2380                 return devices[i];
   2381             }
   2382         }
   2383     }
   2384 
   2385 not_found:
   2386     if (!configuration)
   2387         return NULL; // suppress messages on stderr
   2388 
   2389     std::cerr << "ERROR: Requested OpenCL device not found, check configuration: " << (configuration == NULL ? "" : configuration) << std::endl
   2390             << "    Platform: " << (platform.length() == 0 ? "any" : platform) << std::endl
   2391             << "    Device types: ";
   2392     for (size_t t = 0; t < deviceTypes.size(); t++)
   2393         std::cerr << deviceTypes[t] << " ";
   2394 
   2395     std::cerr << std::endl << "    Device name: " << (deviceName.length() == 0 ? "any" : deviceName) << std::endl;
   2396     return NULL;
   2397 }
   2398 #endif
   2399 
   2400 #ifdef HAVE_OPENCL_SVM
   2401 namespace svm {
   2402 
   2403 enum AllocatorFlags { // don't use first 16 bits
   2404         OPENCL_SVM_COARSE_GRAIN_BUFFER = 1 << 16, // clSVMAlloc + SVM map/unmap
   2405         OPENCL_SVM_FINE_GRAIN_BUFFER = 2 << 16, // clSVMAlloc
   2406         OPENCL_SVM_FINE_GRAIN_SYSTEM = 3 << 16, // direct access
   2407         OPENCL_SVM_BUFFER_MASK = 3 << 16,
   2408         OPENCL_SVM_BUFFER_MAP = 4 << 16
   2409 };
   2410 
   2411 static bool checkForceSVMUmatUsage()
   2412 {
   2413     static bool initialized = false;
   2414     static bool force = false;
   2415     if (!initialized)
   2416     {
   2417         force = getBoolParameter("OPENCV_OPENCL_SVM_FORCE_UMAT_USAGE", false);
   2418         initialized = true;
   2419     }
   2420     return force;
   2421 }
   2422 static bool checkDisableSVMUMatUsage()
   2423 {
   2424     static bool initialized = false;
   2425     static bool force = false;
   2426     if (!initialized)
   2427     {
   2428         force = getBoolParameter("OPENCV_OPENCL_SVM_DISABLE_UMAT_USAGE", false);
   2429         initialized = true;
   2430     }
   2431     return force;
   2432 }
   2433 static bool checkDisableSVM()
   2434 {
   2435     static bool initialized = false;
   2436     static bool force = false;
   2437     if (!initialized)
   2438     {
   2439         force = getBoolParameter("OPENCV_OPENCL_SVM_DISABLE", false);
   2440         initialized = true;
   2441     }
   2442     return force;
   2443 }
   2444 // see SVMCapabilities
   2445 static unsigned int getSVMCapabilitiesMask()
   2446 {
   2447     static bool initialized = false;
   2448     static unsigned int mask = 0;
   2449     if (!initialized)
   2450     {
   2451         const char* envValue = getenv("OPENCV_OPENCL_SVM_CAPABILITIES_MASK");
   2452         if (envValue == NULL)
   2453         {
   2454             return ~0U; // all bits 1
   2455         }
   2456         mask = atoi(envValue);
   2457         initialized = true;
   2458     }
   2459     return mask;
   2460 }
   2461 } // namespace
   2462 #endif
   2463 
   2464 struct Context::Impl
   2465 {
   2466     static Context::Impl* get(Context& context) { return context.p; }
   2467 
   2468     void __init()
   2469     {
   2470         refcount = 1;
   2471         handle = 0;
   2472 #ifdef HAVE_OPENCL_SVM
   2473         svmInitialized = false;
   2474 #endif
   2475     }
   2476 
   2477     Impl()
   2478     {
   2479         __init();
   2480     }
   2481 
   2482     void setDefault()
   2483     {
   2484         CV_Assert(handle == NULL);
   2485 
   2486         cl_device_id d = selectOpenCLDevice();
   2487 
   2488         if (d == NULL)
   2489             return;
   2490 
   2491         cl_platform_id pl = NULL;
   2492         CV_OclDbgAssert(clGetDeviceInfo(d, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &pl, NULL) == CL_SUCCESS);
   2493 
   2494         cl_context_properties prop[] =
   2495         {
   2496             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
   2497             0
   2498         };
   2499 
   2500         // !!! in the current implementation force the number of devices to 1 !!!
   2501         cl_uint nd = 1;
   2502         cl_int status;
   2503 
   2504         handle = clCreateContext(prop, nd, &d, 0, 0, &status);
   2505 
   2506         bool ok = handle != 0 && status == CL_SUCCESS;
   2507         if( ok )
   2508         {
   2509             devices.resize(nd);
   2510             devices[0].set(d);
   2511         }
   2512         else
   2513             handle = NULL;
   2514     }
   2515 
   2516     Impl(int dtype0)
   2517     {
   2518         __init();
   2519 
   2520         cl_int retval = 0;
   2521         cl_platform_id pl = (cl_platform_id)Platform::getDefault().ptr();
   2522         cl_context_properties prop[] =
   2523         {
   2524             CL_CONTEXT_PLATFORM, (cl_context_properties)pl,
   2525             0
   2526         };
   2527 
   2528         cl_uint i, nd0 = 0, nd = 0;
   2529         int dtype = dtype0 & 15;
   2530         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, 0, 0, &nd0 ) == CL_SUCCESS);
   2531 
   2532         AutoBuffer<void*> dlistbuf(nd0*2+1);
   2533         cl_device_id* dlist = (cl_device_id*)(void**)dlistbuf;
   2534         cl_device_id* dlist_new = dlist + nd0;
   2535         CV_OclDbgAssert(clGetDeviceIDs( pl, dtype, nd0, dlist, &nd0 ) == CL_SUCCESS);
   2536         String name0;
   2537 
   2538         for(i = 0; i < nd0; i++)
   2539         {
   2540             Device d(dlist[i]);
   2541             if( !d.available() || !d.compilerAvailable() )
   2542                 continue;
   2543             if( dtype0 == Device::TYPE_DGPU && d.hostUnifiedMemory() )
   2544                 continue;
   2545             if( dtype0 == Device::TYPE_IGPU && !d.hostUnifiedMemory() )
   2546                 continue;
   2547             String name = d.name();
   2548             if( nd != 0 && name != name0 )
   2549                 continue;
   2550             name0 = name;
   2551             dlist_new[nd++] = dlist[i];
   2552         }
   2553 
   2554         if(nd == 0)
   2555             return;
   2556 
   2557         // !!! in the current implementation force the number of devices to 1 !!!
   2558         nd = 1;
   2559 
   2560         handle = clCreateContext(prop, nd, dlist_new, 0, 0, &retval);
   2561         bool ok = handle != 0 && retval == CL_SUCCESS;
   2562         if( ok )
   2563         {
   2564             devices.resize(nd);
   2565             for( i = 0; i < nd; i++ )
   2566                 devices[i].set(dlist_new[i]);
   2567         }
   2568     }
   2569 
   2570     ~Impl()
   2571     {
   2572         if(handle)
   2573         {
   2574             clReleaseContext(handle);
   2575             handle = NULL;
   2576         }
   2577         devices.clear();
   2578     }
   2579 
   2580     Program getProg(const ProgramSource& src,
   2581                     const String& buildflags, String& errmsg)
   2582     {
   2583         String prefix = Program::getPrefix(buildflags);
   2584         HashKey k(src.hash(), crc64((const uchar*)prefix.c_str(), prefix.size()));
   2585         phash_t::iterator it = phash.find(k);
   2586         if( it != phash.end() )
   2587             return it->second;
   2588         //String filename = format("%08x%08x_%08x%08x.clb2",
   2589         Program prog(src, buildflags, errmsg);
   2590         if(prog.ptr())
   2591             phash.insert(std::pair<HashKey,Program>(k, prog));
   2592         return prog;
   2593     }
   2594 
   2595     IMPLEMENT_REFCOUNTABLE();
   2596 
   2597     cl_context handle;
   2598     std::vector<Device> devices;
   2599 
   2600     typedef ProgramSource::hash_t hash_t;
   2601 
   2602     struct HashKey
   2603     {
   2604         HashKey(hash_t _a, hash_t _b) : a(_a), b(_b) {}
   2605         bool operator < (const HashKey& k) const { return a < k.a || (a == k.a && b < k.b); }
   2606         bool operator == (const HashKey& k) const { return a == k.a && b == k.b; }
   2607         bool operator != (const HashKey& k) const { return a != k.a || b != k.b; }
   2608         hash_t a, b;
   2609     };
   2610     typedef std::map<HashKey, Program> phash_t;
   2611     phash_t phash;
   2612 
   2613 #ifdef HAVE_OPENCL_SVM
   2614     bool svmInitialized;
   2615     bool svmAvailable;
   2616     bool svmEnabled;
   2617     svm::SVMCapabilities svmCapabilities;
   2618     svm::SVMFunctions svmFunctions;
   2619 
   2620     void svmInit()
   2621     {
   2622         CV_Assert(handle != NULL);
   2623         const Device& device = devices[0];
   2624         cl_device_svm_capabilities deviceCaps = 0;
   2625         CV_Assert(((void)0, CL_DEVICE_SVM_CAPABILITIES == CL_DEVICE_SVM_CAPABILITIES_AMD)); // Check assumption
   2626         cl_int status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_SVM_CAPABILITIES, sizeof(deviceCaps), &deviceCaps, NULL);
   2627         if (status != CL_SUCCESS)
   2628         {
   2629             CV_OPENCL_SVM_TRACE_ERROR_P("CL_DEVICE_SVM_CAPABILITIES via clGetDeviceInfo failed: %d\n", status);
   2630             goto noSVM;
   2631         }
   2632         CV_OPENCL_SVM_TRACE_P("CL_DEVICE_SVM_CAPABILITIES returned: 0x%x\n", (int)deviceCaps);
   2633         CV_Assert(((void)0, CL_DEVICE_SVM_COARSE_GRAIN_BUFFER == CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_AMD)); // Check assumption
   2634         svmCapabilities.value_ =
   2635                 ((deviceCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_COARSE_GRAIN_BUFFER : 0) |
   2636                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) ? svm::SVMCapabilities::SVM_FINE_GRAIN_BUFFER : 0) |
   2637                 ((deviceCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) ? svm::SVMCapabilities::SVM_FINE_GRAIN_SYSTEM : 0) |
   2638                 ((deviceCaps & CL_DEVICE_SVM_ATOMICS) ? svm::SVMCapabilities::SVM_ATOMICS : 0);
   2639         svmCapabilities.value_ &= svm::getSVMCapabilitiesMask();
   2640         if (svmCapabilities.value_ == 0)
   2641         {
   2642             CV_OPENCL_SVM_TRACE_ERROR_P("svmCapabilities is empty\n");
   2643             goto noSVM;
   2644         }
   2645         try
   2646         {
   2647             // Try OpenCL 2.0
   2648             CV_OPENCL_SVM_TRACE_P("Try SVM from OpenCL 2.0 ...\n");
   2649             void* ptr = clSVMAlloc(handle, CL_MEM_READ_WRITE, 100, 0);
   2650             if (!ptr)
   2651             {
   2652                 CV_OPENCL_SVM_TRACE_ERROR_P("clSVMAlloc returned NULL...\n");
   2653                 CV_ErrorNoReturn(Error::StsBadArg, "clSVMAlloc returned NULL");
   2654             }
   2655             try
   2656             {
   2657                 bool error = false;
   2658                 cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
   2659                 if (CL_SUCCESS != clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE, ptr, 100, 0, NULL, NULL))
   2660                 {
   2661                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMMap failed...\n");
   2662                     CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMMap FAILED");
   2663                 }
   2664                 clFinish(q);
   2665                 try
   2666                 {
   2667                     ((int*)ptr)[0] = 100;
   2668                 }
   2669                 catch (...)
   2670                 {
   2671                     CV_OPENCL_SVM_TRACE_ERROR_P("SVM buffer access test FAILED\n");
   2672                     error = true;
   2673                 }
   2674                 if (CL_SUCCESS != clEnqueueSVMUnmap(q, ptr, 0, NULL, NULL))
   2675                 {
   2676                     CV_OPENCL_SVM_TRACE_ERROR_P("clEnqueueSVMUnmap failed...\n");
   2677                     CV_ErrorNoReturn(Error::StsBadArg, "clEnqueueSVMUnmap FAILED");
   2678                 }
   2679                 clFinish(q);
   2680                 if (error)
   2681                 {
   2682                     CV_ErrorNoReturn(Error::StsBadArg, "OpenCL SVM buffer access test was FAILED");
   2683                 }
   2684             }
   2685             catch (...)
   2686             {
   2687                 CV_OPENCL_SVM_TRACE_ERROR_P("OpenCL SVM buffer access test was FAILED\n");
   2688                 clSVMFree(handle, ptr);
   2689                 throw;
   2690             }
   2691             clSVMFree(handle, ptr);
   2692             svmFunctions.fn_clSVMAlloc = clSVMAlloc;
   2693             svmFunctions.fn_clSVMFree = clSVMFree;
   2694             svmFunctions.fn_clSetKernelArgSVMPointer = clSetKernelArgSVMPointer;
   2695             //svmFunctions.fn_clSetKernelExecInfo = clSetKernelExecInfo;
   2696             //svmFunctions.fn_clEnqueueSVMFree = clEnqueueSVMFree;
   2697             svmFunctions.fn_clEnqueueSVMMemcpy = clEnqueueSVMMemcpy;
   2698             svmFunctions.fn_clEnqueueSVMMemFill = clEnqueueSVMMemFill;
   2699             svmFunctions.fn_clEnqueueSVMMap = clEnqueueSVMMap;
   2700             svmFunctions.fn_clEnqueueSVMUnmap = clEnqueueSVMUnmap;
   2701         }
   2702         catch (...)
   2703         {
   2704             CV_OPENCL_SVM_TRACE_P("clSVMAlloc failed, trying HSA extension...\n");
   2705             try
   2706             {
   2707                 // Try HSA extension
   2708                 String extensions = device.extensions();
   2709                 if (extensions.find("cl_amd_svm") == String::npos)
   2710                 {
   2711                     CV_OPENCL_SVM_TRACE_P("Device extension doesn't have cl_amd_svm: %s\n", extensions.c_str());
   2712                     goto noSVM;
   2713                 }
   2714                 cl_platform_id p = NULL;
   2715                 status = clGetDeviceInfo((cl_device_id)device.ptr(), CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &p, NULL);
   2716                 CV_Assert(status == CL_SUCCESS);
   2717                 svmFunctions.fn_clSVMAlloc = (clSVMAllocAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMAllocAMD");
   2718                 svmFunctions.fn_clSVMFree = (clSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSVMFreeAMD");
   2719                 svmFunctions.fn_clSetKernelArgSVMPointer = (clSetKernelArgSVMPointerAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelArgSVMPointerAMD");
   2720                 //svmFunctions.fn_clSetKernelExecInfo = (clSetKernelExecInfoAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clSetKernelExecInfoAMD");
   2721                 //svmFunctions.fn_clEnqueueSVMFree = (clEnqueueSVMFreeAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMFreeAMD");
   2722                 svmFunctions.fn_clEnqueueSVMMemcpy = (clEnqueueSVMMemcpyAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemcpyAMD");
   2723                 svmFunctions.fn_clEnqueueSVMMemFill = (clEnqueueSVMMemFillAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMemFillAMD");
   2724                 svmFunctions.fn_clEnqueueSVMMap = (clEnqueueSVMMapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMMapAMD");
   2725                 svmFunctions.fn_clEnqueueSVMUnmap = (clEnqueueSVMUnmapAMD_fn)clGetExtensionFunctionAddressForPlatform(p, "clEnqueueSVMUnmapAMD");
   2726                 CV_Assert(svmFunctions.isValid());
   2727             }
   2728             catch (...)
   2729             {
   2730                 CV_OPENCL_SVM_TRACE_P("Something is totally wrong\n");
   2731                 goto noSVM;
   2732             }
   2733         }
   2734 
   2735         svmAvailable = true;
   2736         svmEnabled = !svm::checkDisableSVM();
   2737         svmInitialized = true;
   2738         CV_OPENCL_SVM_TRACE_P("OpenCV OpenCL SVM support initialized\n");
   2739         return;
   2740     noSVM:
   2741         CV_OPENCL_SVM_TRACE_P("OpenCL SVM is not detected\n");
   2742         svmAvailable = false;
   2743         svmEnabled = false;
   2744         svmCapabilities.value_ = 0;
   2745         svmInitialized = true;
   2746         svmFunctions.fn_clSVMAlloc = NULL;
   2747         return;
   2748     }
   2749 #endif
   2750 };
   2751 
   2752 
   2753 Context::Context()
   2754 {
   2755     p = 0;
   2756 }
   2757 
   2758 Context::Context(int dtype)
   2759 {
   2760     p = 0;
   2761     create(dtype);
   2762 }
   2763 
   2764 bool Context::create()
   2765 {
   2766     if( !haveOpenCL() )
   2767         return false;
   2768     if(p)
   2769         p->release();
   2770     p = new Impl();
   2771     if(!p->handle)
   2772     {
   2773         delete p;
   2774         p = 0;
   2775     }
   2776     return p != 0;
   2777 }
   2778 
   2779 bool Context::create(int dtype0)
   2780 {
   2781     if( !haveOpenCL() )
   2782         return false;
   2783     if(p)
   2784         p->release();
   2785     p = new Impl(dtype0);
   2786     if(!p->handle)
   2787     {
   2788         delete p;
   2789         p = 0;
   2790     }
   2791     return p != 0;
   2792 }
   2793 
   2794 Context::~Context()
   2795 {
   2796     if (p)
   2797     {
   2798         p->release();
   2799         p = NULL;
   2800     }
   2801 }
   2802 
   2803 Context::Context(const Context& c)
   2804 {
   2805     p = (Impl*)c.p;
   2806     if(p)
   2807         p->addref();
   2808 }
   2809 
   2810 Context& Context::operator = (const Context& c)
   2811 {
   2812     Impl* newp = (Impl*)c.p;
   2813     if(newp)
   2814         newp->addref();
   2815     if(p)
   2816         p->release();
   2817     p = newp;
   2818     return *this;
   2819 }
   2820 
   2821 void* Context::ptr() const
   2822 {
   2823     return p == NULL ? NULL : p->handle;
   2824 }
   2825 
   2826 size_t Context::ndevices() const
   2827 {
   2828     return p ? p->devices.size() : 0;
   2829 }
   2830 
   2831 const Device& Context::device(size_t idx) const
   2832 {
   2833     static Device dummy;
   2834     return !p || idx >= p->devices.size() ? dummy : p->devices[idx];
   2835 }
   2836 
   2837 Context& Context::getDefault(bool initialize)
   2838 {
   2839     static Context* ctx = new Context();
   2840     if(!ctx->p && haveOpenCL())
   2841     {
   2842         if (!ctx->p)
   2843             ctx->p = new Impl();
   2844         if (initialize)
   2845         {
   2846             // do not create new Context right away.
   2847             // First, try to retrieve existing context of the same type.
   2848             // In its turn, Platform::getContext() may call Context::create()
   2849             // if there is no such context.
   2850             if (ctx->p->handle == NULL)
   2851                 ctx->p->setDefault();
   2852         }
   2853     }
   2854 
   2855     return *ctx;
   2856 }
   2857 
   2858 Program Context::getProg(const ProgramSource& prog,
   2859                          const String& buildopts, String& errmsg)
   2860 {
   2861     return p ? p->getProg(prog, buildopts, errmsg) : Program();
   2862 }
   2863 
   2864 
   2865 
   2866 #ifdef HAVE_OPENCL_SVM
   2867 bool Context::useSVM() const
   2868 {
   2869     Context::Impl* i = p;
   2870     CV_Assert(i);
   2871     if (!i->svmInitialized)
   2872         i->svmInit();
   2873     return i->svmEnabled;
   2874 }
   2875 void Context::setUseSVM(bool enabled)
   2876 {
   2877     Context::Impl* i = p;
   2878     CV_Assert(i);
   2879     if (!i->svmInitialized)
   2880         i->svmInit();
   2881     if (enabled && !i->svmAvailable)
   2882     {
   2883         CV_ErrorNoReturn(Error::StsError, "OpenCL Shared Virtual Memory (SVM) is not supported by OpenCL device");
   2884     }
   2885     i->svmEnabled = enabled;
   2886 }
   2887 #else
   2888 bool Context::useSVM() const { return false; }
   2889 void Context::setUseSVM(bool enabled) { CV_Assert(!enabled); }
   2890 #endif
   2891 
   2892 #ifdef HAVE_OPENCL_SVM
   2893 namespace svm {
   2894 
   2895 const SVMCapabilities getSVMCapabilitites(const ocl::Context& context)
   2896 {
   2897     Context::Impl* i = context.p;
   2898     CV_Assert(i);
   2899     if (!i->svmInitialized)
   2900         i->svmInit();
   2901     return i->svmCapabilities;
   2902 }
   2903 
   2904 CV_EXPORTS const SVMFunctions* getSVMFunctions(const ocl::Context& context)
   2905 {
   2906     Context::Impl* i = context.p;
   2907     CV_Assert(i);
   2908     CV_Assert(i->svmInitialized); // getSVMCapabilitites() must be called first
   2909     CV_Assert(i->svmFunctions.fn_clSVMAlloc != NULL);
   2910     return &i->svmFunctions;
   2911 }
   2912 
   2913 CV_EXPORTS bool useSVM(UMatUsageFlags usageFlags)
   2914 {
   2915     if (checkForceSVMUmatUsage())
   2916         return true;
   2917     if (checkDisableSVMUMatUsage())
   2918         return false;
   2919     if ((usageFlags & USAGE_ALLOCATE_SHARED_MEMORY) != 0)
   2920         return true;
   2921     return false; // don't use SVM by default
   2922 }
   2923 
   2924 } // namespace cv::ocl::svm
   2925 #endif // HAVE_OPENCL_SVM
   2926 
   2927 
   2928 
   2929 void initializeContextFromHandle(Context& ctx, void* platform, void* _context, void* _device)
   2930 {
   2931     cl_context context = (cl_context)_context;
   2932     cl_device_id device = (cl_device_id)_device;
   2933 
   2934     // cleanup old context
   2935     Context::Impl * impl = ctx.p;
   2936     if (impl->handle)
   2937     {
   2938         CV_OclDbgAssert(clReleaseContext(impl->handle) == CL_SUCCESS);
   2939     }
   2940     impl->devices.clear();
   2941 
   2942     impl->handle = context;
   2943     impl->devices.resize(1);
   2944     impl->devices[0].set(device);
   2945 
   2946     Platform& p = Platform::getDefault();
   2947     Platform::Impl* pImpl = p.p;
   2948     pImpl->handle = (cl_platform_id)platform;
   2949 }
   2950 
   2951 /////////////////////////////////////////// Queue /////////////////////////////////////////////
   2952 
   2953 struct Queue::Impl
   2954 {
   2955     Impl(const Context& c, const Device& d)
   2956     {
   2957         refcount = 1;
   2958         const Context* pc = &c;
   2959         cl_context ch = (cl_context)pc->ptr();
   2960         if( !ch )
   2961         {
   2962             pc = &Context::getDefault();
   2963             ch = (cl_context)pc->ptr();
   2964         }
   2965         cl_device_id dh = (cl_device_id)d.ptr();
   2966         if( !dh )
   2967             dh = (cl_device_id)pc->device(0).ptr();
   2968         cl_int retval = 0;
   2969         handle = clCreateCommandQueue(ch, dh, 0, &retval);
   2970         CV_OclDbgAssert(retval == CL_SUCCESS);
   2971     }
   2972 
   2973     ~Impl()
   2974     {
   2975 #ifdef _WIN32
   2976         if (!cv::__termination)
   2977 #endif
   2978         {
   2979             if(handle)
   2980             {
   2981                 clFinish(handle);
   2982                 clReleaseCommandQueue(handle);
   2983                 handle = NULL;
   2984             }
   2985         }
   2986     }
   2987 
   2988     IMPLEMENT_REFCOUNTABLE();
   2989 
   2990     cl_command_queue handle;
   2991 };
   2992 
   2993 Queue::Queue()
   2994 {
   2995     p = 0;
   2996 }
   2997 
   2998 Queue::Queue(const Context& c, const Device& d)
   2999 {
   3000     p = 0;
   3001     create(c, d);
   3002 }
   3003 
   3004 Queue::Queue(const Queue& q)
   3005 {
   3006     p = q.p;
   3007     if(p)
   3008         p->addref();
   3009 }
   3010 
   3011 Queue& Queue::operator = (const Queue& q)
   3012 {
   3013     Impl* newp = (Impl*)q.p;
   3014     if(newp)
   3015         newp->addref();
   3016     if(p)
   3017         p->release();
   3018     p = newp;
   3019     return *this;
   3020 }
   3021 
   3022 Queue::~Queue()
   3023 {
   3024     if(p)
   3025         p->release();
   3026 }
   3027 
   3028 bool Queue::create(const Context& c, const Device& d)
   3029 {
   3030     if(p)
   3031         p->release();
   3032     p = new Impl(c, d);
   3033     return p->handle != 0;
   3034 }
   3035 
   3036 void Queue::finish()
   3037 {
   3038     if(p && p->handle)
   3039     {
   3040         CV_OclDbgAssert(clFinish(p->handle) == CL_SUCCESS);
   3041     }
   3042 }
   3043 
   3044 void* Queue::ptr() const
   3045 {
   3046     return p ? p->handle : 0;
   3047 }
   3048 
   3049 Queue& Queue::getDefault()
   3050 {
   3051     Queue& q = getCoreTlsData().get()->oclQueue;
   3052     if( !q.p && haveOpenCL() )
   3053         q.create(Context::getDefault());
   3054     return q;
   3055 }
   3056 
   3057 static cl_command_queue getQueue(const Queue& q)
   3058 {
   3059     cl_command_queue qq = (cl_command_queue)q.ptr();
   3060     if(!qq)
   3061         qq = (cl_command_queue)Queue::getDefault().ptr();
   3062     return qq;
   3063 }
   3064 
   3065 /////////////////////////////////////////// KernelArg /////////////////////////////////////////////
   3066 
   3067 KernelArg::KernelArg()
   3068     : flags(0), m(0), obj(0), sz(0), wscale(1), iwscale(1)
   3069 {
   3070 }
   3071 
   3072 KernelArg::KernelArg(int _flags, UMat* _m, int _wscale, int _iwscale, const void* _obj, size_t _sz)
   3073     : flags(_flags), m(_m), obj(_obj), sz(_sz), wscale(_wscale), iwscale(_iwscale)
   3074 {
   3075 }
   3076 
   3077 KernelArg KernelArg::Constant(const Mat& m)
   3078 {
   3079     CV_Assert(m.isContinuous());
   3080     return KernelArg(CONSTANT, 0, 0, 0, m.ptr(), m.total()*m.elemSize());
   3081 }
   3082 
   3083 /////////////////////////////////////////// Kernel /////////////////////////////////////////////
   3084 
   3085 struct Kernel::Impl
   3086 {
   3087     Impl(const char* kname, const Program& prog) :
   3088         refcount(1), e(0), nu(0)
   3089     {
   3090         cl_program ph = (cl_program)prog.ptr();
   3091         cl_int retval = 0;
   3092         handle = ph != 0 ?
   3093             clCreateKernel(ph, kname, &retval) : 0;
   3094         CV_OclDbgAssert(retval == CL_SUCCESS);
   3095         for( int i = 0; i < MAX_ARRS; i++ )
   3096             u[i] = 0;
   3097         haveTempDstUMats = false;
   3098     }
   3099 
   3100     void cleanupUMats()
   3101     {
   3102         for( int i = 0; i < MAX_ARRS; i++ )
   3103             if( u[i] )
   3104             {
   3105                 if( CV_XADD(&u[i]->urefcount, -1) == 1 )
   3106                     u[i]->currAllocator->deallocate(u[i]);
   3107                 u[i] = 0;
   3108             }
   3109         nu = 0;
   3110         haveTempDstUMats = false;
   3111     }
   3112 
   3113     void addUMat(const UMat& m, bool dst)
   3114     {
   3115         CV_Assert(nu < MAX_ARRS && m.u && m.u->urefcount > 0);
   3116         u[nu] = m.u;
   3117         CV_XADD(&m.u->urefcount, 1);
   3118         nu++;
   3119         if(dst && m.u->tempUMat())
   3120             haveTempDstUMats = true;
   3121     }
   3122 
   3123     void addImage(const Image2D& image)
   3124     {
   3125         images.push_back(image);
   3126     }
   3127 
   3128     void finit()
   3129     {
   3130         cleanupUMats();
   3131         images.clear();
   3132         if(e) { clReleaseEvent(e); e = 0; }
   3133         release();
   3134     }
   3135 
   3136     ~Impl()
   3137     {
   3138         if(handle)
   3139             clReleaseKernel(handle);
   3140     }
   3141 
   3142     IMPLEMENT_REFCOUNTABLE();
   3143 
   3144     cl_kernel handle;
   3145     cl_event e;
   3146     enum { MAX_ARRS = 16 };
   3147     UMatData* u[MAX_ARRS];
   3148     int nu;
   3149     std::list<Image2D> images;
   3150     bool haveTempDstUMats;
   3151 };
   3152 
   3153 }}
   3154 
   3155 extern "C"
   3156 {
   3157 static void CL_CALLBACK oclCleanupCallback(cl_event, cl_int, void *p)
   3158 {
   3159     ((cv::ocl::Kernel::Impl*)p)->finit();
   3160 }
   3161 
   3162 }
   3163 
   3164 namespace cv { namespace ocl {
   3165 
   3166 Kernel::Kernel()
   3167 {
   3168     p = 0;
   3169 }
   3170 
   3171 Kernel::Kernel(const char* kname, const Program& prog)
   3172 {
   3173     p = 0;
   3174     create(kname, prog);
   3175 }
   3176 
   3177 Kernel::Kernel(const char* kname, const ProgramSource& src,
   3178                const String& buildopts, String* errmsg)
   3179 {
   3180     p = 0;
   3181     create(kname, src, buildopts, errmsg);
   3182 }
   3183 
   3184 Kernel::Kernel(const Kernel& k)
   3185 {
   3186     p = k.p;
   3187     if(p)
   3188         p->addref();
   3189 }
   3190 
   3191 Kernel& Kernel::operator = (const Kernel& k)
   3192 {
   3193     Impl* newp = (Impl*)k.p;
   3194     if(newp)
   3195         newp->addref();
   3196     if(p)
   3197         p->release();
   3198     p = newp;
   3199     return *this;
   3200 }
   3201 
   3202 Kernel::~Kernel()
   3203 {
   3204     if(p)
   3205         p->release();
   3206 }
   3207 
   3208 bool Kernel::create(const char* kname, const Program& prog)
   3209 {
   3210     if(p)
   3211         p->release();
   3212     p = new Impl(kname, prog);
   3213     if(p->handle == 0)
   3214     {
   3215         p->release();
   3216         p = 0;
   3217     }
   3218 #ifdef CV_OPENCL_RUN_ASSERT // check kernel compilation fails
   3219     CV_Assert(p);
   3220 #endif
   3221     return p != 0;
   3222 }
   3223 
   3224 bool Kernel::create(const char* kname, const ProgramSource& src,
   3225                     const String& buildopts, String* errmsg)
   3226 {
   3227     if(p)
   3228     {
   3229         p->release();
   3230         p = 0;
   3231     }
   3232     String tempmsg;
   3233     if( !errmsg ) errmsg = &tempmsg;
   3234     const Program& prog = Context::getDefault().getProg(src, buildopts, *errmsg);
   3235     return create(kname, prog);
   3236 }
   3237 
   3238 void* Kernel::ptr() const
   3239 {
   3240     return p ? p->handle : 0;
   3241 }
   3242 
   3243 bool Kernel::empty() const
   3244 {
   3245     return ptr() == 0;
   3246 }
   3247 
   3248 int Kernel::set(int i, const void* value, size_t sz)
   3249 {
   3250     if (!p || !p->handle)
   3251         return -1;
   3252     if (i < 0)
   3253         return i;
   3254     if( i == 0 )
   3255         p->cleanupUMats();
   3256 
   3257     cl_int retval = clSetKernelArg(p->handle, (cl_uint)i, sz, value);
   3258     CV_OclDbgAssert(retval == CL_SUCCESS);
   3259     if (retval != CL_SUCCESS)
   3260         return -1;
   3261     return i+1;
   3262 }
   3263 
   3264 int Kernel::set(int i, const Image2D& image2D)
   3265 {
   3266     p->addImage(image2D);
   3267     cl_mem h = (cl_mem)image2D.ptr();
   3268     return set(i, &h, sizeof(h));
   3269 }
   3270 
   3271 int Kernel::set(int i, const UMat& m)
   3272 {
   3273     return set(i, KernelArg(KernelArg::READ_WRITE, (UMat*)&m, 0, 0));
   3274 }
   3275 
   3276 int Kernel::set(int i, const KernelArg& arg)
   3277 {
   3278     if( !p || !p->handle )
   3279         return -1;
   3280     if (i < 0)
   3281         return i;
   3282     if( i == 0 )
   3283         p->cleanupUMats();
   3284     if( arg.m )
   3285     {
   3286         int accessFlags = ((arg.flags & KernelArg::READ_ONLY) ? ACCESS_READ : 0) +
   3287                           ((arg.flags & KernelArg::WRITE_ONLY) ? ACCESS_WRITE : 0);
   3288         bool ptronly = (arg.flags & KernelArg::PTR_ONLY) != 0;
   3289         cl_mem h = (cl_mem)arg.m->handle(accessFlags);
   3290 
   3291         if (!h)
   3292         {
   3293             p->release();
   3294             p = 0;
   3295             return -1;
   3296         }
   3297 
   3298 #ifdef HAVE_OPENCL_SVM
   3299         if ((arg.m->u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
   3300         {
   3301             const Context& ctx = Context::getDefault();
   3302             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   3303             uchar*& svmDataPtr = (uchar*&)arg.m->u->handle;
   3304             CV_OPENCL_SVM_TRACE_P("clSetKernelArgSVMPointer: %p\n", svmDataPtr);
   3305 #if 1 // TODO
   3306             cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, svmDataPtr);
   3307 #else
   3308             cl_int status = svmFns->fn_clSetKernelArgSVMPointer(p->handle, (cl_uint)i, &svmDataPtr);
   3309 #endif
   3310             CV_Assert(status == CL_SUCCESS);
   3311         }
   3312         else
   3313 #endif
   3314         {
   3315             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(h), &h) == CL_SUCCESS);
   3316         }
   3317 
   3318         if (ptronly)
   3319         {
   3320             i++;
   3321         }
   3322         else if( arg.m->dims <= 2 )
   3323         {
   3324             UMat2D u2d(*arg.m);
   3325             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u2d.step), &u2d.step) == CL_SUCCESS);
   3326             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u2d.offset), &u2d.offset) == CL_SUCCESS);
   3327             i += 3;
   3328 
   3329             if( !(arg.flags & KernelArg::NO_SIZE) )
   3330             {
   3331                 int cols = u2d.cols*arg.wscale/arg.iwscale;
   3332                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u2d.rows), &u2d.rows) == CL_SUCCESS);
   3333                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(cols), &cols) == CL_SUCCESS);
   3334                 i += 2;
   3335             }
   3336         }
   3337         else
   3338         {
   3339             UMat3D u3d(*arg.m);
   3340             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.slicestep), &u3d.slicestep) == CL_SUCCESS);
   3341             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.step), &u3d.step) == CL_SUCCESS);
   3342             CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+3), sizeof(u3d.offset), &u3d.offset) == CL_SUCCESS);
   3343             i += 4;
   3344             if( !(arg.flags & KernelArg::NO_SIZE) )
   3345             {
   3346                 int cols = u3d.cols*arg.wscale/arg.iwscale;
   3347                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, sizeof(u3d.slices), &u3d.rows) == CL_SUCCESS);
   3348                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+1), sizeof(u3d.rows), &u3d.rows) == CL_SUCCESS);
   3349                 CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)(i+2), sizeof(u3d.cols), &cols) == CL_SUCCESS);
   3350                 i += 3;
   3351             }
   3352         }
   3353         p->addUMat(*arg.m, (accessFlags & ACCESS_WRITE) != 0);
   3354         return i;
   3355     }
   3356     CV_OclDbgAssert(clSetKernelArg(p->handle, (cl_uint)i, arg.sz, arg.obj) == CL_SUCCESS);
   3357     return i+1;
   3358 }
   3359 
   3360 
   3361 bool Kernel::run(int dims, size_t _globalsize[], size_t _localsize[],
   3362                  bool sync, const Queue& q)
   3363 {
   3364     if(!p || !p->handle || p->e != 0)
   3365         return false;
   3366 
   3367     cl_command_queue qq = getQueue(q);
   3368     size_t offset[CV_MAX_DIM] = {0}, globalsize[CV_MAX_DIM] = {1,1,1};
   3369     size_t total = 1;
   3370     CV_Assert(_globalsize != 0);
   3371     for (int i = 0; i < dims; i++)
   3372     {
   3373         size_t val = _localsize ? _localsize[i] :
   3374             dims == 1 ? 64 : dims == 2 ? (i == 0 ? 256 : 8) : dims == 3 ? (8>>(int)(i>0)) : 1;
   3375         CV_Assert( val > 0 );
   3376         total *= _globalsize[i];
   3377         globalsize[i] = ((_globalsize[i] + val - 1)/val)*val;
   3378     }
   3379     if( total == 0 )
   3380         return true;
   3381     if( p->haveTempDstUMats )
   3382         sync = true;
   3383     cl_int retval = clEnqueueNDRangeKernel(qq, p->handle, (cl_uint)dims,
   3384                                            offset, globalsize, _localsize, 0, 0,
   3385                                            sync ? 0 : &p->e);
   3386 #if CV_OPENCL_SHOW_RUN_ERRORS
   3387     if (retval != CL_SUCCESS)
   3388     {
   3389         printf("OpenCL program returns error: %d\n", retval);
   3390         fflush(stdout);
   3391     }
   3392 #endif
   3393     if( sync || retval != CL_SUCCESS )
   3394     {
   3395         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
   3396         p->cleanupUMats();
   3397     }
   3398     else
   3399     {
   3400         p->addref();
   3401         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
   3402     }
   3403     return retval == CL_SUCCESS;
   3404 }
   3405 
   3406 bool Kernel::runTask(bool sync, const Queue& q)
   3407 {
   3408     if(!p || !p->handle || p->e != 0)
   3409         return false;
   3410 
   3411     cl_command_queue qq = getQueue(q);
   3412     cl_int retval = clEnqueueTask(qq, p->handle, 0, 0, sync ? 0 : &p->e);
   3413     if( sync || retval != CL_SUCCESS )
   3414     {
   3415         CV_OclDbgAssert(clFinish(qq) == CL_SUCCESS);
   3416         p->cleanupUMats();
   3417     }
   3418     else
   3419     {
   3420         p->addref();
   3421         CV_OclDbgAssert(clSetEventCallback(p->e, CL_COMPLETE, oclCleanupCallback, p) == CL_SUCCESS);
   3422     }
   3423     return retval == CL_SUCCESS;
   3424 }
   3425 
   3426 
   3427 size_t Kernel::workGroupSize() const
   3428 {
   3429     if(!p || !p->handle)
   3430         return 0;
   3431     size_t val = 0, retsz = 0;
   3432     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
   3433     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_WORK_GROUP_SIZE,
   3434                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
   3435 }
   3436 
   3437 size_t Kernel::preferedWorkGroupSizeMultiple() const
   3438 {
   3439     if(!p || !p->handle)
   3440         return 0;
   3441     size_t val = 0, retsz = 0;
   3442     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
   3443     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
   3444                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? val : 0;
   3445 }
   3446 
   3447 bool Kernel::compileWorkGroupSize(size_t wsz[]) const
   3448 {
   3449     if(!p || !p->handle || !wsz)
   3450         return 0;
   3451     size_t retsz = 0;
   3452     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
   3453     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_COMPILE_WORK_GROUP_SIZE,
   3454                                     sizeof(wsz[0])*3, wsz, &retsz) == CL_SUCCESS;
   3455 }
   3456 
   3457 size_t Kernel::localMemSize() const
   3458 {
   3459     if(!p || !p->handle)
   3460         return 0;
   3461     size_t retsz = 0;
   3462     cl_ulong val = 0;
   3463     cl_device_id dev = (cl_device_id)Device::getDefault().ptr();
   3464     return clGetKernelWorkGroupInfo(p->handle, dev, CL_KERNEL_LOCAL_MEM_SIZE,
   3465                                     sizeof(val), &val, &retsz) == CL_SUCCESS ? (size_t)val : 0;
   3466 }
   3467 
   3468 /////////////////////////////////////////// Program /////////////////////////////////////////////
   3469 
   3470 struct Program::Impl
   3471 {
   3472     Impl(const ProgramSource& _src,
   3473          const String& _buildflags, String& errmsg)
   3474     {
   3475         refcount = 1;
   3476         const Context& ctx = Context::getDefault();
   3477         src = _src;
   3478         buildflags = _buildflags;
   3479         const String& srcstr = src.source();
   3480         const char* srcptr = srcstr.c_str();
   3481         size_t srclen = srcstr.size();
   3482         cl_int retval = 0;
   3483 
   3484         handle = clCreateProgramWithSource((cl_context)ctx.ptr(), 1, &srcptr, &srclen, &retval);
   3485         if( handle && retval == CL_SUCCESS )
   3486         {
   3487             int i, n = (int)ctx.ndevices();
   3488             AutoBuffer<void*> deviceListBuf(n+1);
   3489             void** deviceList = deviceListBuf;
   3490             for( i = 0; i < n; i++ )
   3491                 deviceList[i] = ctx.device(i).ptr();
   3492 
   3493             Device device = Device::getDefault();
   3494             if (device.isAMD())
   3495                 buildflags += " -D AMD_DEVICE";
   3496             else if (device.isIntel())
   3497                 buildflags += " -D INTEL_DEVICE";
   3498 
   3499             retval = clBuildProgram(handle, n,
   3500                                     (const cl_device_id*)deviceList,
   3501                                     buildflags.c_str(), 0, 0);
   3502 #if !CV_OPENCL_ALWAYS_SHOW_BUILD_LOG
   3503             if( retval != CL_SUCCESS )
   3504 #endif
   3505             {
   3506                 size_t retsz = 0;
   3507                 cl_int buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
   3508                                                CL_PROGRAM_BUILD_LOG, 0, 0, &retsz);
   3509                 if (buildInfo_retval == CL_SUCCESS && retsz > 1)
   3510                 {
   3511                     AutoBuffer<char> bufbuf(retsz + 16);
   3512                     char* buf = bufbuf;
   3513                     buildInfo_retval = clGetProgramBuildInfo(handle, (cl_device_id)deviceList[0],
   3514                                                    CL_PROGRAM_BUILD_LOG, retsz+1, buf, &retsz);
   3515                     if (buildInfo_retval == CL_SUCCESS)
   3516                     {
   3517                         // TODO It is useful to see kernel name & program file name also
   3518                         errmsg = String(buf);
   3519                         printf("OpenCL program build log: %s\n%s\n", buildflags.c_str(), errmsg.c_str());
   3520                         fflush(stdout);
   3521                     }
   3522                 }
   3523                 if (retval != CL_SUCCESS && handle)
   3524                 {
   3525                     clReleaseProgram(handle);
   3526                     handle = NULL;
   3527                 }
   3528             }
   3529         }
   3530     }
   3531 
   3532     Impl(const String& _buf, const String& _buildflags)
   3533     {
   3534         refcount = 1;
   3535         handle = 0;
   3536         buildflags = _buildflags;
   3537         if(_buf.empty())
   3538             return;
   3539         String prefix0 = Program::getPrefix(buildflags);
   3540         const Context& ctx = Context::getDefault();
   3541         const Device& dev = Device::getDefault();
   3542         const char* pos0 = _buf.c_str();
   3543         const char* pos1 = strchr(pos0, '\n');
   3544         if(!pos1)
   3545             return;
   3546         const char* pos2 = strchr(pos1+1, '\n');
   3547         if(!pos2)
   3548             return;
   3549         const char* pos3 = strchr(pos2+1, '\n');
   3550         if(!pos3)
   3551             return;
   3552         size_t prefixlen = (pos3 - pos0)+1;
   3553         String prefix(pos0, prefixlen);
   3554         if( prefix != prefix0 )
   3555             return;
   3556         const uchar* bin = (uchar*)(pos3+1);
   3557         void* devid = dev.ptr();
   3558         size_t codelen = _buf.length() - prefixlen;
   3559         cl_int binstatus = 0, retval = 0;
   3560         handle = clCreateProgramWithBinary((cl_context)ctx.ptr(), 1, (cl_device_id*)&devid,
   3561                                            &codelen, &bin, &binstatus, &retval);
   3562         CV_OclDbgAssert(retval == CL_SUCCESS);
   3563     }
   3564 
   3565     String store()
   3566     {
   3567         if(!handle)
   3568             return String();
   3569         size_t progsz = 0, retsz = 0;
   3570         String prefix = Program::getPrefix(buildflags);
   3571         size_t prefixlen = prefix.length();
   3572         if(clGetProgramInfo(handle, CL_PROGRAM_BINARY_SIZES, sizeof(progsz), &progsz, &retsz) != CL_SUCCESS)
   3573             return String();
   3574         AutoBuffer<uchar> bufbuf(prefixlen + progsz + 16);
   3575         uchar* buf = bufbuf;
   3576         memcpy(buf, prefix.c_str(), prefixlen);
   3577         buf += prefixlen;
   3578         if(clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(buf), &buf, &retsz) != CL_SUCCESS)
   3579             return String();
   3580         buf[progsz] = (uchar)'\0';
   3581         return String((const char*)(uchar*)bufbuf, prefixlen + progsz);
   3582     }
   3583 
   3584     ~Impl()
   3585     {
   3586         if( handle )
   3587         {
   3588 #ifdef _WIN32
   3589             if (!cv::__termination)
   3590 #endif
   3591             {
   3592                 clReleaseProgram(handle);
   3593             }
   3594             handle = NULL;
   3595         }
   3596     }
   3597 
   3598     IMPLEMENT_REFCOUNTABLE();
   3599 
   3600     ProgramSource src;
   3601     String buildflags;
   3602     cl_program handle;
   3603 };
   3604 
   3605 
   3606 Program::Program() { p = 0; }
   3607 
   3608 Program::Program(const ProgramSource& src,
   3609         const String& buildflags, String& errmsg)
   3610 {
   3611     p = 0;
   3612     create(src, buildflags, errmsg);
   3613 }
   3614 
   3615 Program::Program(const Program& prog)
   3616 {
   3617     p = prog.p;
   3618     if(p)
   3619         p->addref();
   3620 }
   3621 
   3622 Program& Program::operator = (const Program& prog)
   3623 {
   3624     Impl* newp = (Impl*)prog.p;
   3625     if(newp)
   3626         newp->addref();
   3627     if(p)
   3628         p->release();
   3629     p = newp;
   3630     return *this;
   3631 }
   3632 
   3633 Program::~Program()
   3634 {
   3635     if(p)
   3636         p->release();
   3637 }
   3638 
   3639 bool Program::create(const ProgramSource& src,
   3640             const String& buildflags, String& errmsg)
   3641 {
   3642     if(p)
   3643         p->release();
   3644     p = new Impl(src, buildflags, errmsg);
   3645     if(!p->handle)
   3646     {
   3647         p->release();
   3648         p = 0;
   3649     }
   3650     return p != 0;
   3651 }
   3652 
   3653 const ProgramSource& Program::source() const
   3654 {
   3655     static ProgramSource dummy;
   3656     return p ? p->src : dummy;
   3657 }
   3658 
   3659 void* Program::ptr() const
   3660 {
   3661     return p ? p->handle : 0;
   3662 }
   3663 
   3664 bool Program::read(const String& bin, const String& buildflags)
   3665 {
   3666     if(p)
   3667         p->release();
   3668     p = new Impl(bin, buildflags);
   3669     return p->handle != 0;
   3670 }
   3671 
   3672 bool Program::write(String& bin) const
   3673 {
   3674     if(!p)
   3675         return false;
   3676     bin = p->store();
   3677     return !bin.empty();
   3678 }
   3679 
   3680 String Program::getPrefix() const
   3681 {
   3682     if(!p)
   3683         return String();
   3684     return getPrefix(p->buildflags);
   3685 }
   3686 
   3687 String Program::getPrefix(const String& buildflags)
   3688 {
   3689     const Context& ctx = Context::getDefault();
   3690     const Device& dev = ctx.device(0);
   3691     return format("name=%s\ndriver=%s\nbuildflags=%s\n",
   3692                   dev.name().c_str(), dev.driverVersion().c_str(), buildflags.c_str());
   3693 }
   3694 
   3695 ///////////////////////////////////////// ProgramSource ///////////////////////////////////////////////
   3696 
   3697 struct ProgramSource::Impl
   3698 {
   3699     Impl(const char* _src)
   3700     {
   3701         init(String(_src));
   3702     }
   3703     Impl(const String& _src)
   3704     {
   3705         init(_src);
   3706     }
   3707     void init(const String& _src)
   3708     {
   3709         refcount = 1;
   3710         src = _src;
   3711         h = crc64((uchar*)src.c_str(), src.size());
   3712     }
   3713 
   3714     IMPLEMENT_REFCOUNTABLE();
   3715     String src;
   3716     ProgramSource::hash_t h;
   3717 };
   3718 
   3719 
   3720 ProgramSource::ProgramSource()
   3721 {
   3722     p = 0;
   3723 }
   3724 
   3725 ProgramSource::ProgramSource(const char* prog)
   3726 {
   3727     p = new Impl(prog);
   3728 }
   3729 
   3730 ProgramSource::ProgramSource(const String& prog)
   3731 {
   3732     p = new Impl(prog);
   3733 }
   3734 
   3735 ProgramSource::~ProgramSource()
   3736 {
   3737     if(p)
   3738         p->release();
   3739 }
   3740 
   3741 ProgramSource::ProgramSource(const ProgramSource& prog)
   3742 {
   3743     p = prog.p;
   3744     if(p)
   3745         p->addref();
   3746 }
   3747 
   3748 ProgramSource& ProgramSource::operator = (const ProgramSource& prog)
   3749 {
   3750     Impl* newp = (Impl*)prog.p;
   3751     if(newp)
   3752         newp->addref();
   3753     if(p)
   3754         p->release();
   3755     p = newp;
   3756     return *this;
   3757 }
   3758 
   3759 const String& ProgramSource::source() const
   3760 {
   3761     static String dummy;
   3762     return p ? p->src : dummy;
   3763 }
   3764 
   3765 ProgramSource::hash_t ProgramSource::hash() const
   3766 {
   3767     return p ? p->h : 0;
   3768 }
   3769 
   3770 //////////////////////////////////////////// OpenCLAllocator //////////////////////////////////////////////////
   3771 
   3772 template<typename T>
   3773 class OpenCLBufferPool
   3774 {
   3775 protected:
   3776     ~OpenCLBufferPool() { }
   3777 public:
   3778     virtual T allocate(size_t size) = 0;
   3779     virtual void release(T buffer) = 0;
   3780 };
   3781 
   3782 template <typename Derived, typename BufferEntry, typename T>
   3783 class OpenCLBufferPoolBaseImpl : public BufferPoolController, public OpenCLBufferPool<T>
   3784 {
   3785 private:
   3786     inline Derived& derived() { return *static_cast<Derived*>(this); }
   3787 protected:
   3788     Mutex mutex_;
   3789 
   3790     size_t currentReservedSize;
   3791     size_t maxReservedSize;
   3792 
   3793     std::list<BufferEntry> allocatedEntries_; // Allocated and used entries
   3794     std::list<BufferEntry> reservedEntries_; // LRU order. Allocated, but not used entries
   3795 
   3796     // synchronized
   3797     bool _findAndRemoveEntryFromAllocatedList(CV_OUT BufferEntry& entry, T buffer)
   3798     {
   3799         typename std::list<BufferEntry>::iterator i = allocatedEntries_.begin();
   3800         for (; i != allocatedEntries_.end(); ++i)
   3801         {
   3802             BufferEntry& e = *i;
   3803             if (e.clBuffer_ == buffer)
   3804             {
   3805                 entry = e;
   3806                 allocatedEntries_.erase(i);
   3807                 return true;
   3808             }
   3809         }
   3810         return false;
   3811     }
   3812 
   3813     // synchronized
   3814     bool _findAndRemoveEntryFromReservedList(CV_OUT BufferEntry& entry, const size_t size)
   3815     {
   3816         if (reservedEntries_.empty())
   3817             return false;
   3818         typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
   3819         typename std::list<BufferEntry>::iterator result_pos = reservedEntries_.end();
   3820         BufferEntry result;
   3821         size_t minDiff = (size_t)(-1);
   3822         for (; i != reservedEntries_.end(); ++i)
   3823         {
   3824             BufferEntry& e = *i;
   3825             if (e.capacity_ >= size)
   3826             {
   3827                 size_t diff = e.capacity_ - size;
   3828                 if (diff < size / 8 && (result_pos == reservedEntries_.end() || diff < minDiff))
   3829                 {
   3830                     minDiff = diff;
   3831                     result_pos = i;
   3832                     result = e;
   3833                     if (diff == 0)
   3834                         break;
   3835                 }
   3836             }
   3837         }
   3838         if (result_pos != reservedEntries_.end())
   3839         {
   3840             //CV_DbgAssert(result == *result_pos);
   3841             reservedEntries_.erase(result_pos);
   3842             entry = result;
   3843             currentReservedSize -= entry.capacity_;
   3844             allocatedEntries_.push_back(entry);
   3845             return true;
   3846         }
   3847         return false;
   3848     }
   3849 
   3850     // synchronized
   3851     void _checkSizeOfReservedEntries()
   3852     {
   3853         while (currentReservedSize > maxReservedSize)
   3854         {
   3855             CV_DbgAssert(!reservedEntries_.empty());
   3856             const BufferEntry& entry = reservedEntries_.back();
   3857             CV_DbgAssert(currentReservedSize >= entry.capacity_);
   3858             currentReservedSize -= entry.capacity_;
   3859             derived()._releaseBufferEntry(entry);
   3860             reservedEntries_.pop_back();
   3861         }
   3862     }
   3863 
   3864     inline size_t _allocationGranularity(size_t size)
   3865     {
   3866         // heuristic values
   3867         if (size < 1024)
   3868             return 16;
   3869         else if (size < 64*1024)
   3870             return 64;
   3871         else if (size < 1024*1024)
   3872             return 4096;
   3873         else if (size < 16*1024*1024)
   3874             return 64*1024;
   3875         else
   3876             return 1024*1024;
   3877     }
   3878 
   3879 public:
   3880     OpenCLBufferPoolBaseImpl()
   3881         : currentReservedSize(0),
   3882           maxReservedSize(0)
   3883     {
   3884         // nothing
   3885     }
   3886     virtual ~OpenCLBufferPoolBaseImpl()
   3887     {
   3888         freeAllReservedBuffers();
   3889         CV_Assert(reservedEntries_.empty());
   3890     }
   3891 public:
   3892     virtual T allocate(size_t size)
   3893     {
   3894         AutoLock locker(mutex_);
   3895         BufferEntry entry;
   3896         if (maxReservedSize > 0 && _findAndRemoveEntryFromReservedList(entry, size))
   3897         {
   3898             CV_DbgAssert(size <= entry.capacity_);
   3899             LOG_BUFFER_POOL("Reuse reserved buffer: %p\n", entry.clBuffer_);
   3900         }
   3901         else
   3902         {
   3903             derived()._allocateBufferEntry(entry, size);
   3904         }
   3905         return entry.clBuffer_;
   3906     }
   3907     virtual void release(T buffer)
   3908     {
   3909         AutoLock locker(mutex_);
   3910         BufferEntry entry;
   3911         CV_Assert(_findAndRemoveEntryFromAllocatedList(entry, buffer));
   3912         if (maxReservedSize == 0 || entry.capacity_ > maxReservedSize / 8)
   3913         {
   3914             derived()._releaseBufferEntry(entry);
   3915         }
   3916         else
   3917         {
   3918             reservedEntries_.push_front(entry);
   3919             currentReservedSize += entry.capacity_;
   3920             _checkSizeOfReservedEntries();
   3921         }
   3922     }
   3923 
   3924     virtual size_t getReservedSize() const { return currentReservedSize; }
   3925     virtual size_t getMaxReservedSize() const { return maxReservedSize; }
   3926     virtual void setMaxReservedSize(size_t size)
   3927     {
   3928         AutoLock locker(mutex_);
   3929         size_t oldMaxReservedSize = maxReservedSize;
   3930         maxReservedSize = size;
   3931         if (maxReservedSize < oldMaxReservedSize)
   3932         {
   3933             typename std::list<BufferEntry>::iterator i = reservedEntries_.begin();
   3934             for (; i != reservedEntries_.end();)
   3935             {
   3936                 const BufferEntry& entry = *i;
   3937                 if (entry.capacity_ > maxReservedSize / 8)
   3938                 {
   3939                     CV_DbgAssert(currentReservedSize >= entry.capacity_);
   3940                     currentReservedSize -= entry.capacity_;
   3941                     derived()._releaseBufferEntry(entry);
   3942                     i = reservedEntries_.erase(i);
   3943                     continue;
   3944                 }
   3945                 ++i;
   3946             }
   3947             _checkSizeOfReservedEntries();
   3948         }
   3949     }
   3950     virtual void freeAllReservedBuffers()
   3951     {
   3952         AutoLock locker(mutex_);
   3953         typename std::list<BufferEntry>::const_iterator i = reservedEntries_.begin();
   3954         for (; i != reservedEntries_.end(); ++i)
   3955         {
   3956             const BufferEntry& entry = *i;
   3957             derived()._releaseBufferEntry(entry);
   3958         }
   3959         reservedEntries_.clear();
   3960         currentReservedSize = 0;
   3961     }
   3962 };
   3963 
   3964 struct CLBufferEntry
   3965 {
   3966     cl_mem clBuffer_;
   3967     size_t capacity_;
   3968     CLBufferEntry() : clBuffer_((cl_mem)NULL), capacity_(0) { }
   3969 };
   3970 
   3971 class OpenCLBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLBufferPoolImpl, CLBufferEntry, cl_mem>
   3972 {
   3973 public:
   3974     typedef struct CLBufferEntry BufferEntry;
   3975 protected:
   3976     int createFlags_;
   3977 public:
   3978     OpenCLBufferPoolImpl(int createFlags = 0)
   3979         : createFlags_(createFlags)
   3980     {
   3981     }
   3982 
   3983     void _allocateBufferEntry(BufferEntry& entry, size_t size)
   3984     {
   3985         CV_DbgAssert(entry.clBuffer_ == NULL);
   3986         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
   3987         Context& ctx = Context::getDefault();
   3988         cl_int retval = CL_SUCCESS;
   3989         entry.clBuffer_ = clCreateBuffer((cl_context)ctx.ptr(), CL_MEM_READ_WRITE|createFlags_, entry.capacity_, 0, &retval);
   3990         CV_Assert(retval == CL_SUCCESS);
   3991         CV_Assert(entry.clBuffer_ != NULL);
   3992         if(retval == CL_SUCCESS)
   3993         {
   3994             CV_IMPL_ADD(CV_IMPL_OCL);
   3995         }
   3996         LOG_BUFFER_POOL("OpenCL allocate %lld (0x%llx) bytes: %p\n",
   3997                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
   3998         allocatedEntries_.push_back(entry);
   3999     }
   4000 
   4001     void _releaseBufferEntry(const BufferEntry& entry)
   4002     {
   4003         CV_Assert(entry.capacity_ != 0);
   4004         CV_Assert(entry.clBuffer_ != NULL);
   4005         LOG_BUFFER_POOL("OpenCL release buffer: %p, %lld (0x%llx) bytes\n",
   4006                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
   4007         clReleaseMemObject(entry.clBuffer_);
   4008     }
   4009 };
   4010 
   4011 #ifdef HAVE_OPENCL_SVM
   4012 struct CLSVMBufferEntry
   4013 {
   4014     void* clBuffer_;
   4015     size_t capacity_;
   4016     CLSVMBufferEntry() : clBuffer_(NULL), capacity_(0) { }
   4017 };
   4018 class OpenCLSVMBufferPoolImpl : public OpenCLBufferPoolBaseImpl<OpenCLSVMBufferPoolImpl, CLSVMBufferEntry, void*>
   4019 {
   4020 public:
   4021     typedef struct CLSVMBufferEntry BufferEntry;
   4022 public:
   4023     OpenCLSVMBufferPoolImpl()
   4024     {
   4025     }
   4026 
   4027     void _allocateBufferEntry(BufferEntry& entry, size_t size)
   4028     {
   4029         CV_DbgAssert(entry.clBuffer_ == NULL);
   4030         entry.capacity_ = alignSize(size, (int)_allocationGranularity(size));
   4031 
   4032         Context& ctx = Context::getDefault();
   4033         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
   4034         bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
   4035         cl_svm_mem_flags memFlags = CL_MEM_READ_WRITE |
   4036                 (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
   4037 
   4038         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   4039         CV_DbgAssert(svmFns->isValid());
   4040 
   4041         CV_OPENCL_SVM_TRACE_P("clSVMAlloc: %d\n", (int)entry.capacity_);
   4042         void *buf = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, entry.capacity_, 0);
   4043         CV_Assert(buf);
   4044 
   4045         entry.clBuffer_ = buf;
   4046         {
   4047             CV_IMPL_ADD(CV_IMPL_OCL);
   4048         }
   4049         LOG_BUFFER_POOL("OpenCL SVM allocate %lld (0x%llx) bytes: %p\n",
   4050                 (long long)entry.capacity_, (long long)entry.capacity_, entry.clBuffer_);
   4051         allocatedEntries_.push_back(entry);
   4052     }
   4053 
   4054     void _releaseBufferEntry(const BufferEntry& entry)
   4055     {
   4056         CV_Assert(entry.capacity_ != 0);
   4057         CV_Assert(entry.clBuffer_ != NULL);
   4058         LOG_BUFFER_POOL("OpenCL release SVM buffer: %p, %lld (0x%llx) bytes\n",
   4059                 entry.clBuffer_, (long long)entry.capacity_, (long long)entry.capacity_);
   4060         Context& ctx = Context::getDefault();
   4061         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   4062         CV_DbgAssert(svmFns->isValid());
   4063         CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n",  entry.clBuffer_);
   4064         svmFns->fn_clSVMFree((cl_context)ctx.ptr(), entry.clBuffer_);
   4065     }
   4066 };
   4067 #endif
   4068 
   4069 
   4070 
   4071 #if defined _MSC_VER
   4072 #pragma warning(disable:4127) // conditional expression is constant
   4073 #endif
   4074 template <bool readAccess, bool writeAccess>
   4075 class AlignedDataPtr
   4076 {
   4077 protected:
   4078     const size_t size_;
   4079     uchar* const originPtr_;
   4080     const size_t alignment_;
   4081     uchar* ptr_;
   4082     uchar* allocatedPtr_;
   4083 
   4084 public:
   4085     AlignedDataPtr(uchar* ptr, size_t size, size_t alignment)
   4086         : size_(size), originPtr_(ptr), alignment_(alignment), ptr_(ptr), allocatedPtr_(NULL)
   4087     {
   4088         CV_DbgAssert((alignment & (alignment - 1)) == 0); // check for 2^n
   4089         if (((size_t)ptr_ & (alignment - 1)) != 0)
   4090         {
   4091             allocatedPtr_ = new uchar[size_ + alignment - 1];
   4092             ptr_ = (uchar*)(((uintptr_t)allocatedPtr_ + (alignment - 1)) & ~(alignment - 1));
   4093             if (readAccess)
   4094             {
   4095                 memcpy(ptr_, originPtr_, size_);
   4096             }
   4097         }
   4098     }
   4099 
   4100     uchar* getAlignedPtr() const
   4101     {
   4102         CV_DbgAssert(((size_t)ptr_ & (alignment_ - 1)) == 0);
   4103         return ptr_;
   4104     }
   4105 
   4106     ~AlignedDataPtr()
   4107     {
   4108         if (allocatedPtr_)
   4109         {
   4110             if (writeAccess)
   4111             {
   4112                 memcpy(originPtr_, ptr_, size_);
   4113             }
   4114             delete[] allocatedPtr_;
   4115             allocatedPtr_ = NULL;
   4116         }
   4117         ptr_ = NULL;
   4118     }
   4119 private:
   4120     AlignedDataPtr(const AlignedDataPtr&); // disabled
   4121     AlignedDataPtr& operator=(const AlignedDataPtr&); // disabled
   4122 };
   4123 #if defined _MSC_VER
   4124 #pragma warning(default:4127) // conditional expression is constant
   4125 #endif
   4126 
   4127 #ifndef CV_OPENCL_DATA_PTR_ALIGNMENT
   4128 #define CV_OPENCL_DATA_PTR_ALIGNMENT 16
   4129 #endif
   4130 
   4131 class OpenCLAllocator : public MatAllocator
   4132 {
   4133     mutable OpenCLBufferPoolImpl bufferPool;
   4134     mutable OpenCLBufferPoolImpl bufferPoolHostPtr;
   4135 #ifdef  HAVE_OPENCL_SVM
   4136     mutable OpenCLSVMBufferPoolImpl bufferPoolSVM;
   4137 #endif
   4138 
   4139     enum AllocatorFlags
   4140     {
   4141         ALLOCATOR_FLAGS_BUFFER_POOL_USED = 1 << 0,
   4142         ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED = 1 << 1
   4143 #ifdef HAVE_OPENCL_SVM
   4144         ,ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED = 1 << 2
   4145 #endif
   4146     };
   4147 public:
   4148     OpenCLAllocator()
   4149         : bufferPool(0),
   4150           bufferPoolHostPtr(CL_MEM_ALLOC_HOST_PTR)
   4151     {
   4152         size_t defaultPoolSize, poolSize;
   4153         defaultPoolSize = ocl::Device::getDefault().isIntel() ? 1 << 27 : 0;
   4154         poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_BUFFERPOOL_LIMIT", defaultPoolSize);
   4155         bufferPool.setMaxReservedSize(poolSize);
   4156         poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_HOST_PTR_BUFFERPOOL_LIMIT", defaultPoolSize);
   4157         bufferPoolHostPtr.setMaxReservedSize(poolSize);
   4158 #ifdef HAVE_OPENCL_SVM
   4159         poolSize = getConfigurationParameterForSize("OPENCV_OPENCL_SVM_BUFFERPOOL_LIMIT", defaultPoolSize);
   4160         bufferPoolSVM.setMaxReservedSize(poolSize);
   4161 #endif
   4162 
   4163         matStdAllocator = Mat::getStdAllocator();
   4164     }
   4165 
   4166     UMatData* defaultAllocate(int dims, const int* sizes, int type, void* data, size_t* step,
   4167             int flags, UMatUsageFlags usageFlags) const
   4168     {
   4169         UMatData* u = matStdAllocator->allocate(dims, sizes, type, data, step, flags, usageFlags);
   4170         return u;
   4171     }
   4172 
   4173     void getBestFlags(const Context& ctx, int /*flags*/, UMatUsageFlags usageFlags, int& createFlags, int& flags0) const
   4174     {
   4175         const Device& dev = ctx.device(0);
   4176         createFlags = 0;
   4177         if ((usageFlags & USAGE_ALLOCATE_HOST_MEMORY) != 0)
   4178             createFlags |= CL_MEM_ALLOC_HOST_PTR;
   4179 
   4180         if( dev.hostUnifiedMemory() )
   4181             flags0 = 0;
   4182         else
   4183             flags0 = UMatData::COPY_ON_MAP;
   4184     }
   4185 
   4186     UMatData* allocate(int dims, const int* sizes, int type,
   4187                        void* data, size_t* step, int flags, UMatUsageFlags usageFlags) const
   4188     {
   4189         if(!useOpenCL())
   4190             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
   4191         CV_Assert(data == 0);
   4192         size_t total = CV_ELEM_SIZE(type);
   4193         for( int i = dims-1; i >= 0; i-- )
   4194         {
   4195             if( step )
   4196                 step[i] = total;
   4197             total *= sizes[i];
   4198         }
   4199 
   4200         Context& ctx = Context::getDefault();
   4201 
   4202         int createFlags = 0, flags0 = 0;
   4203         getBestFlags(ctx, flags, usageFlags, createFlags, flags0);
   4204 
   4205         void* handle = NULL;
   4206         int allocatorFlags = 0;
   4207 
   4208 #ifdef HAVE_OPENCL_SVM
   4209         const svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
   4210         if (ctx.useSVM() && svm::useSVM(usageFlags) && !svmCaps.isNoSVMSupport())
   4211         {
   4212             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED;
   4213             handle = bufferPoolSVM.allocate(total);
   4214 
   4215             // this property is constant, so single buffer pool can be used here
   4216             bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
   4217             allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
   4218         }
   4219         else
   4220 #endif
   4221         if (createFlags == 0)
   4222         {
   4223             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_USED;
   4224             handle = bufferPool.allocate(total);
   4225         }
   4226         else if (createFlags == CL_MEM_ALLOC_HOST_PTR)
   4227         {
   4228             allocatorFlags = ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED;
   4229             handle = bufferPoolHostPtr.allocate(total);
   4230         }
   4231         else
   4232         {
   4233             CV_Assert(handle != NULL); // Unsupported, throw
   4234         }
   4235 
   4236         if (!handle)
   4237             return defaultAllocate(dims, sizes, type, data, step, flags, usageFlags);
   4238 
   4239         UMatData* u = new UMatData(this);
   4240         u->data = 0;
   4241         u->size = total;
   4242         u->handle = handle;
   4243         u->flags = flags0;
   4244         u->allocatorFlags_ = allocatorFlags;
   4245         CV_DbgAssert(!u->tempUMat()); // for bufferPool.release() consistency in deallocate()
   4246         return u;
   4247     }
   4248 
   4249     bool allocate(UMatData* u, int accessFlags, UMatUsageFlags usageFlags) const
   4250     {
   4251         if(!u)
   4252             return false;
   4253 
   4254         UMatDataAutoLock lock(u);
   4255 
   4256         if(u->handle == 0)
   4257         {
   4258             CV_Assert(u->origdata != 0);
   4259             Context& ctx = Context::getDefault();
   4260             int createFlags = 0, flags0 = 0;
   4261             getBestFlags(ctx, accessFlags, usageFlags, createFlags, flags0);
   4262 
   4263             cl_context ctx_handle = (cl_context)ctx.ptr();
   4264             int allocatorFlags = 0;
   4265             int tempUMatFlags = 0;
   4266             void* handle = NULL;
   4267             cl_int retval = CL_SUCCESS;
   4268 
   4269 #ifdef HAVE_OPENCL_SVM
   4270             svm::SVMCapabilities svmCaps = svm::getSVMCapabilitites(ctx);
   4271             bool useSVM = ctx.useSVM() && svm::useSVM(usageFlags);
   4272             if (useSVM && svmCaps.isSupportFineGrainSystem())
   4273             {
   4274                 allocatorFlags = svm::OPENCL_SVM_FINE_GRAIN_SYSTEM;
   4275                 tempUMatFlags = UMatData::TEMP_UMAT;
   4276                 handle = u->origdata;
   4277                 CV_OPENCL_SVM_TRACE_P("Use fine grain system: %d (%p)\n", (int)u->size, handle);
   4278             }
   4279             else if (useSVM && (svmCaps.isSupportFineGrainBuffer() || svmCaps.isSupportCoarseGrainBuffer()))
   4280             {
   4281                 if (!(accessFlags & ACCESS_FAST)) // memcpy used
   4282                 {
   4283                     bool isFineGrainBuffer = svmCaps.isSupportFineGrainBuffer();
   4284 
   4285                     cl_svm_mem_flags memFlags = createFlags |
   4286                             (isFineGrainBuffer ? CL_MEM_SVM_FINE_GRAIN_BUFFER : 0);
   4287 
   4288                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   4289                     CV_DbgAssert(svmFns->isValid());
   4290 
   4291                     CV_OPENCL_SVM_TRACE_P("clSVMAlloc + copy: %d\n", (int)u->size);
   4292                     handle = svmFns->fn_clSVMAlloc((cl_context)ctx.ptr(), memFlags, u->size, 0);
   4293                     CV_Assert(handle);
   4294 
   4295                     cl_command_queue q = NULL;
   4296                     if (!isFineGrainBuffer)
   4297                     {
   4298                         q = (cl_command_queue)Queue::getDefault().ptr();
   4299                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", handle, (int)u->size);
   4300                         cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_TRUE, CL_MAP_WRITE,
   4301                                 handle, u->size,
   4302                                 0, NULL, NULL);
   4303                         CV_Assert(status == CL_SUCCESS);
   4304 
   4305                     }
   4306                     memcpy(handle, u->origdata, u->size);
   4307                     if (!isFineGrainBuffer)
   4308                     {
   4309                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", handle);
   4310                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, handle, 0, NULL, NULL);
   4311                         CV_Assert(status == CL_SUCCESS);
   4312                     }
   4313 
   4314                     tempUMatFlags = UMatData::TEMP_UMAT | UMatData::TEMP_COPIED_UMAT;
   4315                     allocatorFlags |= isFineGrainBuffer ? svm::OPENCL_SVM_FINE_GRAIN_BUFFER
   4316                                                 : svm::OPENCL_SVM_COARSE_GRAIN_BUFFER;
   4317                 }
   4318             }
   4319             else
   4320 #endif
   4321             {
   4322                 tempUMatFlags = UMatData::TEMP_UMAT;
   4323                 handle = clCreateBuffer(ctx_handle, CL_MEM_USE_HOST_PTR|createFlags,
   4324                                            u->size, u->origdata, &retval);
   4325                 if((!handle || retval < 0) && !(accessFlags & ACCESS_FAST))
   4326                 {
   4327                     handle = clCreateBuffer(ctx_handle, CL_MEM_COPY_HOST_PTR|CL_MEM_READ_WRITE|createFlags,
   4328                                                u->size, u->origdata, &retval);
   4329                     tempUMatFlags |= UMatData::TEMP_COPIED_UMAT;
   4330                 }
   4331             }
   4332             if(!handle || retval != CL_SUCCESS)
   4333                 return false;
   4334             u->handle = handle;
   4335             u->prevAllocator = u->currAllocator;
   4336             u->currAllocator = this;
   4337             u->flags |= tempUMatFlags;
   4338             u->allocatorFlags_ = allocatorFlags;
   4339         }
   4340         if(accessFlags & ACCESS_WRITE)
   4341             u->markHostCopyObsolete(true);
   4342         return true;
   4343     }
   4344 
   4345     /*void sync(UMatData* u) const
   4346     {
   4347         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
   4348         UMatDataAutoLock lock(u);
   4349 
   4350         if( u->hostCopyObsolete() && u->handle && u->refcount > 0 && u->origdata)
   4351         {
   4352             if( u->tempCopiedUMat() )
   4353             {
   4354                 clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
   4355                                     u->size, u->origdata, 0, 0, 0);
   4356             }
   4357             else
   4358             {
   4359                 cl_int retval = 0;
   4360                 void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
   4361                                                 (CL_MAP_READ | CL_MAP_WRITE),
   4362                                                 0, u->size, 0, 0, 0, &retval);
   4363                 clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0);
   4364                 clFinish(q);
   4365             }
   4366             u->markHostCopyObsolete(false);
   4367         }
   4368         else if( u->copyOnMap() && u->deviceCopyObsolete() && u->data )
   4369         {
   4370             clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
   4371                                  u->size, u->data, 0, 0, 0);
   4372         }
   4373     }*/
   4374 
   4375     void deallocate(UMatData* u) const
   4376     {
   4377         if(!u)
   4378             return;
   4379 
   4380         CV_Assert(u->urefcount >= 0);
   4381         CV_Assert(u->refcount >= 0);
   4382 
   4383         CV_Assert(u->handle != 0 && u->urefcount == 0);
   4384         if(u->tempUMat())
   4385         {
   4386 //            UMatDataAutoLock lock(u);
   4387 
   4388             if( u->hostCopyObsolete() && u->refcount > 0 )
   4389             {
   4390 #ifdef HAVE_OPENCL_SVM
   4391                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
   4392                 {
   4393                     Context& ctx = Context::getDefault();
   4394                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   4395                     CV_DbgAssert(svmFns->isValid());
   4396 
   4397                     if( u->tempCopiedUMat() )
   4398                     {
   4399                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
   4400                                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER);
   4401                         bool isFineGrainBuffer = (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER;
   4402                         cl_command_queue q = NULL;
   4403                         if (!isFineGrainBuffer)
   4404                         {
   4405                             CV_DbgAssert(((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0));
   4406                             q = (cl_command_queue)Queue::getDefault().ptr();
   4407                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
   4408                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
   4409                                     u->handle, u->size,
   4410                                     0, NULL, NULL);
   4411                             CV_Assert(status == CL_SUCCESS);
   4412                         }
   4413                         clFinish(q);
   4414                         memcpy(u->origdata, u->handle, u->size);
   4415                         if (!isFineGrainBuffer)
   4416                         {
   4417                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
   4418                             cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
   4419                             CV_Assert(status == CL_SUCCESS);
   4420                         }
   4421                     }
   4422                     else
   4423                     {
   4424                         CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM);
   4425                         // nothing
   4426                     }
   4427                 }
   4428                 else
   4429 #endif
   4430                 {
   4431                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
   4432                     if( u->tempCopiedUMat() )
   4433                     {
   4434                         AlignedDataPtr<false, true> alignedPtr(u->origdata, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
   4435                         CV_OclDbgAssert(clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
   4436                                             u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS);
   4437                     }
   4438                     else
   4439                     {
   4440                         // TODO Is it really needed for clCreateBuffer with CL_MEM_USE_HOST_PTR?
   4441                         cl_int retval = 0;
   4442                         void* data = clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
   4443                                                         (CL_MAP_READ | CL_MAP_WRITE),
   4444                                                         0, u->size, 0, 0, 0, &retval);
   4445                         CV_OclDbgAssert(retval == CL_SUCCESS);
   4446                         CV_OclDbgAssert(clEnqueueUnmapMemObject(q, (cl_mem)u->handle, data, 0, 0, 0) == CL_SUCCESS);
   4447                         CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
   4448                     }
   4449                 }
   4450                 u->markHostCopyObsolete(false);
   4451             }
   4452 #ifdef HAVE_OPENCL_SVM
   4453             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
   4454             {
   4455                 if( u->tempCopiedUMat() )
   4456                 {
   4457                     Context& ctx = Context::getDefault();
   4458                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   4459                     CV_DbgAssert(svmFns->isValid());
   4460 
   4461                     CV_OPENCL_SVM_TRACE_P("clSVMFree: %p\n", u->handle);
   4462                     svmFns->fn_clSVMFree((cl_context)ctx.ptr(), u->handle);
   4463                 }
   4464             }
   4465             else
   4466 #endif
   4467             {
   4468                 clReleaseMemObject((cl_mem)u->handle);
   4469             }
   4470             u->handle = 0;
   4471             u->currAllocator = u->prevAllocator;
   4472             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
   4473                 fastFree(u->data);
   4474             u->data = u->origdata;
   4475             if(u->refcount == 0)
   4476                 u->currAllocator->deallocate(u);
   4477         }
   4478         else
   4479         {
   4480             CV_Assert(u->refcount == 0);
   4481             if(u->data && u->copyOnMap() && !(u->flags & UMatData::USER_ALLOCATED))
   4482             {
   4483                 fastFree(u->data);
   4484                 u->data = 0;
   4485             }
   4486             if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_USED)
   4487             {
   4488                 bufferPool.release((cl_mem)u->handle);
   4489             }
   4490             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_HOST_PTR_USED)
   4491             {
   4492                 bufferPoolHostPtr.release((cl_mem)u->handle);
   4493             }
   4494 #ifdef HAVE_OPENCL_SVM
   4495             else if (u->allocatorFlags_ & ALLOCATOR_FLAGS_BUFFER_POOL_SVM_USED)
   4496             {
   4497                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
   4498                 {
   4499                     //nothing
   4500                 }
   4501                 else if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
   4502                         (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
   4503                 {
   4504                     Context& ctx = Context::getDefault();
   4505                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   4506                     CV_DbgAssert(svmFns->isValid());
   4507                     cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
   4508 
   4509                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0)
   4510                     {
   4511                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
   4512                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle, 0, NULL, NULL);
   4513                         CV_Assert(status == CL_SUCCESS);
   4514                     }
   4515                 }
   4516                 bufferPoolSVM.release((void*)u->handle);
   4517             }
   4518 #endif
   4519             else
   4520             {
   4521                 clReleaseMemObject((cl_mem)u->handle);
   4522             }
   4523             u->handle = 0;
   4524             delete u;
   4525         }
   4526     }
   4527 
   4528     void map(UMatData* u, int accessFlags) const
   4529     {
   4530         if(!u)
   4531             return;
   4532 
   4533         CV_Assert( u->handle != 0 );
   4534 
   4535         UMatDataAutoLock autolock(u);
   4536 
   4537         if(accessFlags & ACCESS_WRITE)
   4538             u->markDeviceCopyObsolete(true);
   4539 
   4540         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
   4541 
   4542         // FIXIT Workaround for UMat synchronization issue
   4543         // if( u->refcount == 0 )
   4544         {
   4545             if( !u->copyOnMap() )
   4546             {
   4547                 // TODO
   4548                 // because there can be other map requests for the same UMat with different access flags,
   4549                 // we use the universal (read-write) access mode.
   4550 #ifdef HAVE_OPENCL_SVM
   4551                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
   4552                 {
   4553                     if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
   4554                     {
   4555                         Context& ctx = Context::getDefault();
   4556                         const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   4557                         CV_DbgAssert(svmFns->isValid());
   4558 
   4559                         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0)
   4560                         {
   4561                             CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
   4562                             cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
   4563                                     u->handle, u->size,
   4564                                     0, NULL, NULL);
   4565                             CV_Assert(status == CL_SUCCESS);
   4566                             u->allocatorFlags_ |= svm::OPENCL_SVM_BUFFER_MAP;
   4567                         }
   4568                     }
   4569                     clFinish(q);
   4570                     u->data = (uchar*)u->handle;
   4571                     u->markHostCopyObsolete(false);
   4572                     u->markDeviceMemMapped(true);
   4573                     return;
   4574                 }
   4575 #endif
   4576                 if (u->data) // FIXIT Workaround for UMat synchronization issue
   4577                 {
   4578                     //CV_Assert(u->hostCopyObsolete() == false);
   4579                     return;
   4580                 }
   4581 
   4582                 cl_int retval = 0;
   4583                 u->data = (uchar*)clEnqueueMapBuffer(q, (cl_mem)u->handle, CL_TRUE,
   4584                                                      (CL_MAP_READ | CL_MAP_WRITE),
   4585                                                      0, u->size, 0, 0, 0, &retval);
   4586                 if(u->data && retval == CL_SUCCESS)
   4587                 {
   4588                     u->markHostCopyObsolete(false);
   4589                     u->markDeviceMemMapped(true);
   4590                     return;
   4591                 }
   4592 
   4593                 // TODO Is it really a good idea and was it tested well?
   4594                 // if map failed, switch to copy-on-map mode for the particular buffer
   4595                 u->flags |= UMatData::COPY_ON_MAP;
   4596             }
   4597 
   4598             if(!u->data)
   4599             {
   4600                 u->data = (uchar*)fastMalloc(u->size);
   4601                 u->markHostCopyObsolete(true);
   4602             }
   4603         }
   4604 
   4605         if( (accessFlags & ACCESS_READ) != 0 && u->hostCopyObsolete() )
   4606         {
   4607             AlignedDataPtr<false, true> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
   4608 #ifdef HAVE_OPENCL_SVM
   4609             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
   4610 #endif
   4611             CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
   4612                                            u->size, alignedPtr.getAlignedPtr(), 0, 0, 0) == CL_SUCCESS );
   4613             u->markHostCopyObsolete(false);
   4614         }
   4615     }
   4616 
   4617     void unmap(UMatData* u) const
   4618     {
   4619         if(!u)
   4620             return;
   4621 
   4622 
   4623         CV_Assert(u->handle != 0);
   4624 
   4625         UMatDataAutoLock autolock(u);
   4626 
   4627         // FIXIT Workaround for UMat synchronization issue
   4628         if(u->refcount > 0)
   4629             return;
   4630 
   4631         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
   4632         cl_int retval = 0;
   4633         if( !u->copyOnMap() && u->deviceMemMapped() )
   4634         {
   4635             CV_Assert(u->data != NULL);
   4636             u->markDeviceMemMapped(false);
   4637 #ifdef HAVE_OPENCL_SVM
   4638             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
   4639             {
   4640                 if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
   4641                 {
   4642                     Context& ctx = Context::getDefault();
   4643                     const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   4644                     CV_DbgAssert(svmFns->isValid());
   4645 
   4646                     CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) != 0);
   4647                     {
   4648                         CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
   4649                         cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
   4650                                 0, NULL, NULL);
   4651                         CV_Assert(status == CL_SUCCESS);
   4652                         clFinish(q);
   4653                         u->allocatorFlags_ &= ~svm::OPENCL_SVM_BUFFER_MAP;
   4654                     }
   4655                 }
   4656                 u->data = 0;
   4657                 u->markDeviceCopyObsolete(false);
   4658                 u->markHostCopyObsolete(false);
   4659                 return;
   4660             }
   4661 #endif
   4662             CV_Assert( (retval = clEnqueueUnmapMemObject(q,
   4663                                 (cl_mem)u->handle, u->data, 0, 0, 0)) == CL_SUCCESS );
   4664             if (Device::getDefault().isAMD())
   4665             {
   4666                 // required for multithreaded applications (see stitching test)
   4667                 CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
   4668             }
   4669             u->data = 0;
   4670         }
   4671         else if( u->copyOnMap() && u->deviceCopyObsolete() )
   4672         {
   4673             AlignedDataPtr<true, false> alignedPtr(u->data, u->size, CV_OPENCL_DATA_PTR_ALIGNMENT);
   4674 #ifdef HAVE_OPENCL_SVM
   4675             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == 0);
   4676 #endif
   4677             CV_Assert( (retval = clEnqueueWriteBuffer(q, (cl_mem)u->handle, CL_TRUE, 0,
   4678                                 u->size, alignedPtr.getAlignedPtr(), 0, 0, 0)) == CL_SUCCESS );
   4679         }
   4680         u->markDeviceCopyObsolete(false);
   4681         u->markHostCopyObsolete(false);
   4682     }
   4683 
   4684     bool checkContinuous(int dims, const size_t sz[],
   4685                          const size_t srcofs[], const size_t srcstep[],
   4686                          const size_t dstofs[], const size_t dststep[],
   4687                          size_t& total, size_t new_sz[],
   4688                          size_t& srcrawofs, size_t new_srcofs[], size_t new_srcstep[],
   4689                          size_t& dstrawofs, size_t new_dstofs[], size_t new_dststep[]) const
   4690     {
   4691         bool iscontinuous = true;
   4692         srcrawofs = srcofs ? srcofs[dims-1] : 0;
   4693         dstrawofs = dstofs ? dstofs[dims-1] : 0;
   4694         total = sz[dims-1];
   4695         for( int i = dims-2; i >= 0; i-- )
   4696         {
   4697             if( i >= 0 && (total != srcstep[i] || total != dststep[i]) )
   4698                 iscontinuous = false;
   4699             total *= sz[i];
   4700             if( srcofs )
   4701                 srcrawofs += srcofs[i]*srcstep[i];
   4702             if( dstofs )
   4703                 dstrawofs += dstofs[i]*dststep[i];
   4704         }
   4705 
   4706         if( !iscontinuous )
   4707         {
   4708             // OpenCL uses {x, y, z} order while OpenCV uses {z, y, x} order.
   4709             if( dims == 2 )
   4710             {
   4711                 new_sz[0] = sz[1]; new_sz[1] = sz[0]; new_sz[2] = 1;
   4712                 // we assume that new_... arrays are initialized by caller
   4713                 // with 0's, so there is no else branch
   4714                 if( srcofs )
   4715                 {
   4716                     new_srcofs[0] = srcofs[1];
   4717                     new_srcofs[1] = srcofs[0];
   4718                     new_srcofs[2] = 0;
   4719                 }
   4720 
   4721                 if( dstofs )
   4722                 {
   4723                     new_dstofs[0] = dstofs[1];
   4724                     new_dstofs[1] = dstofs[0];
   4725                     new_dstofs[2] = 0;
   4726                 }
   4727 
   4728                 new_srcstep[0] = srcstep[0]; new_srcstep[1] = 0;
   4729                 new_dststep[0] = dststep[0]; new_dststep[1] = 0;
   4730             }
   4731             else
   4732             {
   4733                 // we could check for dims == 3 here,
   4734                 // but from user perspective this one is more informative
   4735                 CV_Assert(dims <= 3);
   4736                 new_sz[0] = sz[2]; new_sz[1] = sz[1]; new_sz[2] = sz[0];
   4737                 if( srcofs )
   4738                 {
   4739                     new_srcofs[0] = srcofs[2];
   4740                     new_srcofs[1] = srcofs[1];
   4741                     new_srcofs[2] = srcofs[0];
   4742                 }
   4743 
   4744                 if( dstofs )
   4745                 {
   4746                     new_dstofs[0] = dstofs[2];
   4747                     new_dstofs[1] = dstofs[1];
   4748                     new_dstofs[2] = dstofs[0];
   4749                 }
   4750 
   4751                 new_srcstep[0] = srcstep[1]; new_srcstep[1] = srcstep[0];
   4752                 new_dststep[0] = dststep[1]; new_dststep[1] = dststep[0];
   4753             }
   4754         }
   4755         return iscontinuous;
   4756     }
   4757 
   4758     void download(UMatData* u, void* dstptr, int dims, const size_t sz[],
   4759                   const size_t srcofs[], const size_t srcstep[],
   4760                   const size_t dststep[]) const
   4761     {
   4762         if(!u)
   4763             return;
   4764         UMatDataAutoLock autolock(u);
   4765 
   4766         if( u->data && !u->hostCopyObsolete() )
   4767         {
   4768             Mat::getStdAllocator()->download(u, dstptr, dims, sz, srcofs, srcstep, dststep);
   4769             return;
   4770         }
   4771         CV_Assert( u->handle != 0 );
   4772 
   4773         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
   4774 
   4775         size_t total = 0, new_sz[] = {0, 0, 0};
   4776         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
   4777         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
   4778 
   4779         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, 0, dststep,
   4780                                             total, new_sz,
   4781                                             srcrawofs, new_srcofs, new_srcstep,
   4782                                             dstrawofs, new_dstofs, new_dststep);
   4783 
   4784 #ifdef HAVE_OPENCL_SVM
   4785         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
   4786         {
   4787             CV_DbgAssert(u->data == NULL || u->data == u->handle);
   4788             Context& ctx = Context::getDefault();
   4789             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   4790             CV_DbgAssert(svmFns->isValid());
   4791 
   4792             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
   4793             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
   4794             {
   4795                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
   4796                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_READ,
   4797                         u->handle, u->size,
   4798                         0, NULL, NULL);
   4799                 CV_Assert(status == CL_SUCCESS);
   4800             }
   4801             clFinish(q);
   4802             if( iscontinuous )
   4803             {
   4804                 memcpy(dstptr, (uchar*)u->handle + srcrawofs, total);
   4805             }
   4806             else
   4807             {
   4808                 // This code is from MatAllocator::download()
   4809                 int isz[CV_MAX_DIM];
   4810                 uchar* srcptr = (uchar*)u->handle;
   4811                 for( int i = 0; i < dims; i++ )
   4812                 {
   4813                     CV_Assert( sz[i] <= (size_t)INT_MAX );
   4814                     if( sz[i] == 0 )
   4815                     return;
   4816                     if( srcofs )
   4817                     srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
   4818                     isz[i] = (int)sz[i];
   4819                 }
   4820 
   4821                 Mat src(dims, isz, CV_8U, srcptr, srcstep);
   4822                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
   4823 
   4824                 const Mat* arrays[] = { &src, &dst };
   4825                 uchar* ptrs[2];
   4826                 NAryMatIterator it(arrays, ptrs, 2);
   4827                 size_t j, planesz = it.size;
   4828 
   4829                 for( j = 0; j < it.nplanes; j++, ++it )
   4830                     memcpy(ptrs[1], ptrs[0], planesz);
   4831             }
   4832             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
   4833             {
   4834                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
   4835                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
   4836                         0, NULL, NULL);
   4837                 CV_Assert(status == CL_SUCCESS);
   4838                 clFinish(q);
   4839             }
   4840         }
   4841         else
   4842 #endif
   4843         {
   4844             AlignedDataPtr<false, true> alignedPtr((uchar*)dstptr, sz[0] * dststep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
   4845             if( iscontinuous )
   4846             {
   4847                 CV_Assert( clEnqueueReadBuffer(q, (cl_mem)u->handle, CL_TRUE,
   4848                                                srcrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
   4849             }
   4850             else
   4851             {
   4852                 CV_Assert( clEnqueueReadBufferRect(q, (cl_mem)u->handle, CL_TRUE,
   4853                                 new_srcofs, new_dstofs, new_sz, new_srcstep[0], new_srcstep[1],
   4854                                 new_dststep[0], new_dststep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
   4855             }
   4856         }
   4857     }
   4858 
   4859     void upload(UMatData* u, const void* srcptr, int dims, const size_t sz[],
   4860                 const size_t dstofs[], const size_t dststep[],
   4861                 const size_t srcstep[]) const
   4862     {
   4863         if(!u)
   4864             return;
   4865 
   4866         // there should be no user-visible CPU copies of the UMat which we are going to copy to
   4867         CV_Assert(u->refcount == 0 || u->tempUMat());
   4868 
   4869         size_t total = 0, new_sz[] = {0, 0, 0};
   4870         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
   4871         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
   4872 
   4873         bool iscontinuous = checkContinuous(dims, sz, 0, srcstep, dstofs, dststep,
   4874                                             total, new_sz,
   4875                                             srcrawofs, new_srcofs, new_srcstep,
   4876                                             dstrawofs, new_dstofs, new_dststep);
   4877 
   4878         UMatDataAutoLock autolock(u);
   4879 
   4880         // if there is cached CPU copy of the GPU matrix,
   4881         // we could use it as a destination.
   4882         // we can do it in 2 cases:
   4883         //    1. we overwrite the whole content
   4884         //    2. we overwrite part of the matrix, but the GPU copy is out-of-date
   4885         if( u->data && (u->hostCopyObsolete() < u->deviceCopyObsolete() || total == u->size))
   4886         {
   4887             Mat::getStdAllocator()->upload(u, srcptr, dims, sz, dstofs, dststep, srcstep);
   4888             u->markHostCopyObsolete(false);
   4889             u->markDeviceCopyObsolete(true);
   4890             return;
   4891         }
   4892 
   4893         CV_Assert( u->handle != 0 );
   4894         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
   4895 
   4896 #ifdef HAVE_OPENCL_SVM
   4897         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
   4898         {
   4899             CV_DbgAssert(u->data == NULL || u->data == u->handle);
   4900             Context& ctx = Context::getDefault();
   4901             const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   4902             CV_DbgAssert(svmFns->isValid());
   4903 
   4904             CV_DbgAssert((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MAP) == 0);
   4905             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
   4906             {
   4907                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMap: %p (%d)\n", u->handle, (int)u->size);
   4908                 cl_int status = svmFns->fn_clEnqueueSVMMap(q, CL_FALSE, CL_MAP_WRITE,
   4909                         u->handle, u->size,
   4910                         0, NULL, NULL);
   4911                 CV_Assert(status == CL_SUCCESS);
   4912             }
   4913             clFinish(q);
   4914             if( iscontinuous )
   4915             {
   4916                 memcpy((uchar*)u->handle + dstrawofs, srcptr, total);
   4917             }
   4918             else
   4919             {
   4920                 // This code is from MatAllocator::upload()
   4921                 int isz[CV_MAX_DIM];
   4922                 uchar* dstptr = (uchar*)u->handle;
   4923                 for( int i = 0; i < dims; i++ )
   4924                 {
   4925                     CV_Assert( sz[i] <= (size_t)INT_MAX );
   4926                     if( sz[i] == 0 )
   4927                     return;
   4928                     if( dstofs )
   4929                     dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
   4930                     isz[i] = (int)sz[i];
   4931                 }
   4932 
   4933                 Mat src(dims, isz, CV_8U, (void*)srcptr, srcstep);
   4934                 Mat dst(dims, isz, CV_8U, dstptr, dststep);
   4935 
   4936                 const Mat* arrays[] = { &src, &dst };
   4937                 uchar* ptrs[2];
   4938                 NAryMatIterator it(arrays, ptrs, 2);
   4939                 size_t j, planesz = it.size;
   4940 
   4941                 for( j = 0; j < it.nplanes; j++, ++it )
   4942                     memcpy(ptrs[1], ptrs[0], planesz);
   4943             }
   4944             if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_COARSE_GRAIN_BUFFER)
   4945             {
   4946                 CV_OPENCL_SVM_TRACE_P("clEnqueueSVMUnmap: %p\n", u->handle);
   4947                 cl_int status = svmFns->fn_clEnqueueSVMUnmap(q, u->handle,
   4948                         0, NULL, NULL);
   4949                 CV_Assert(status == CL_SUCCESS);
   4950                 clFinish(q);
   4951             }
   4952         }
   4953         else
   4954 #endif
   4955         {
   4956             AlignedDataPtr<true, false> alignedPtr((uchar*)srcptr, sz[0] * srcstep[0], CV_OPENCL_DATA_PTR_ALIGNMENT);
   4957             if( iscontinuous )
   4958             {
   4959                 CV_Assert( clEnqueueWriteBuffer(q, (cl_mem)u->handle,
   4960                     CL_TRUE, dstrawofs, total, alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
   4961             }
   4962             else
   4963             {
   4964                 CV_Assert( clEnqueueWriteBufferRect(q, (cl_mem)u->handle, CL_TRUE,
   4965                     new_dstofs, new_srcofs, new_sz, new_dststep[0], new_dststep[1],
   4966                     new_srcstep[0], new_srcstep[1], alignedPtr.getAlignedPtr(), 0, 0, 0) >= 0 );
   4967             }
   4968         }
   4969         u->markHostCopyObsolete(true);
   4970 #ifdef HAVE_OPENCL_SVM
   4971         if ((u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
   4972                 (u->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
   4973         {
   4974             // nothing
   4975         }
   4976         else
   4977 #endif
   4978         {
   4979             u->markHostCopyObsolete(true);
   4980         }
   4981         u->markDeviceCopyObsolete(false);
   4982     }
   4983 
   4984     void copy(UMatData* src, UMatData* dst, int dims, const size_t sz[],
   4985               const size_t srcofs[], const size_t srcstep[],
   4986               const size_t dstofs[], const size_t dststep[], bool _sync) const
   4987     {
   4988         if(!src || !dst)
   4989             return;
   4990 
   4991         size_t total = 0, new_sz[] = {0, 0, 0};
   4992         size_t srcrawofs = 0, new_srcofs[] = {0, 0, 0}, new_srcstep[] = {0, 0, 0};
   4993         size_t dstrawofs = 0, new_dstofs[] = {0, 0, 0}, new_dststep[] = {0, 0, 0};
   4994 
   4995         bool iscontinuous = checkContinuous(dims, sz, srcofs, srcstep, dstofs, dststep,
   4996                                             total, new_sz,
   4997                                             srcrawofs, new_srcofs, new_srcstep,
   4998                                             dstrawofs, new_dstofs, new_dststep);
   4999 
   5000         UMatDataAutoLock src_autolock(src);
   5001         UMatDataAutoLock dst_autolock(dst);
   5002 
   5003         if( !src->handle || (src->data && src->hostCopyObsolete() < src->deviceCopyObsolete()) )
   5004         {
   5005             upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
   5006             return;
   5007         }
   5008         if( !dst->handle || (dst->data && dst->hostCopyObsolete() < dst->deviceCopyObsolete()) )
   5009         {
   5010             download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
   5011             dst->markHostCopyObsolete(false);
   5012 #ifdef HAVE_OPENCL_SVM
   5013             if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
   5014                     (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
   5015             {
   5016                 // nothing
   5017             }
   5018             else
   5019 #endif
   5020             {
   5021                 dst->markDeviceCopyObsolete(true);
   5022             }
   5023             return;
   5024         }
   5025 
   5026         // there should be no user-visible CPU copies of the UMat which we are going to copy to
   5027         CV_Assert(dst->refcount == 0);
   5028         cl_command_queue q = (cl_command_queue)Queue::getDefault().ptr();
   5029 
   5030         cl_int retval = CL_SUCCESS;
   5031 #ifdef HAVE_OPENCL_SVM
   5032         if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 ||
   5033                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
   5034         {
   5035             if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0 &&
   5036                             (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
   5037             {
   5038                 Context& ctx = Context::getDefault();
   5039                 const svm::SVMFunctions* svmFns = svm::getSVMFunctions(ctx);
   5040                 CV_DbgAssert(svmFns->isValid());
   5041 
   5042                 if( iscontinuous )
   5043                 {
   5044                     CV_OPENCL_SVM_TRACE_P("clEnqueueSVMMemcpy: %p <-- %p (%d)\n",
   5045                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs, (int)total);
   5046                     cl_int status = svmFns->fn_clEnqueueSVMMemcpy(q, CL_TRUE,
   5047                             (uchar*)dst->handle + dstrawofs, (uchar*)src->handle + srcrawofs,
   5048                             total, 0, NULL, NULL);
   5049                     CV_Assert(status == CL_SUCCESS);
   5050                 }
   5051                 else
   5052                 {
   5053                     clFinish(q);
   5054                     // This code is from MatAllocator::download()/upload()
   5055                     int isz[CV_MAX_DIM];
   5056                     uchar* srcptr = (uchar*)src->handle;
   5057                     for( int i = 0; i < dims; i++ )
   5058                     {
   5059                         CV_Assert( sz[i] <= (size_t)INT_MAX );
   5060                         if( sz[i] == 0 )
   5061                         return;
   5062                         if( srcofs )
   5063                         srcptr += srcofs[i]*(i <= dims-2 ? srcstep[i] : 1);
   5064                         isz[i] = (int)sz[i];
   5065                     }
   5066                     Mat m_src(dims, isz, CV_8U, srcptr, srcstep);
   5067 
   5068                     uchar* dstptr = (uchar*)dst->handle;
   5069                     for( int i = 0; i < dims; i++ )
   5070                     {
   5071                         if( dstofs )
   5072                         dstptr += dstofs[i]*(i <= dims-2 ? dststep[i] : 1);
   5073                     }
   5074                     Mat m_dst(dims, isz, CV_8U, dstptr, dststep);
   5075 
   5076                     const Mat* arrays[] = { &m_src, &m_dst };
   5077                     uchar* ptrs[2];
   5078                     NAryMatIterator it(arrays, ptrs, 2);
   5079                     size_t j, planesz = it.size;
   5080 
   5081                     for( j = 0; j < it.nplanes; j++, ++it )
   5082                         memcpy(ptrs[1], ptrs[0], planesz);
   5083                 }
   5084             }
   5085             else
   5086             {
   5087                 if ((src->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) != 0)
   5088                 {
   5089                     map(src, ACCESS_READ);
   5090                     upload(dst, src->data + srcrawofs, dims, sz, dstofs, dststep, srcstep);
   5091                     unmap(src);
   5092                 }
   5093                 else
   5094                 {
   5095                     map(dst, ACCESS_WRITE);
   5096                     download(src, dst->data + dstrawofs, dims, sz, srcofs, srcstep, dststep);
   5097                     unmap(dst);
   5098                 }
   5099             }
   5100         }
   5101         else
   5102 #endif
   5103         {
   5104             if( iscontinuous )
   5105             {
   5106                 CV_Assert( (retval = clEnqueueCopyBuffer(q, (cl_mem)src->handle, (cl_mem)dst->handle,
   5107                                                srcrawofs, dstrawofs, total, 0, 0, 0)) == CL_SUCCESS );
   5108             }
   5109             else
   5110             {
   5111                 CV_Assert( (retval = clEnqueueCopyBufferRect(q, (cl_mem)src->handle, (cl_mem)dst->handle,
   5112                                                    new_srcofs, new_dstofs, new_sz,
   5113                                                    new_srcstep[0], new_srcstep[1],
   5114                                                    new_dststep[0], new_dststep[1],
   5115                                                    0, 0, 0)) == CL_SUCCESS );
   5116             }
   5117         }
   5118         if (retval == CL_SUCCESS)
   5119         {
   5120             CV_IMPL_ADD(CV_IMPL_OCL)
   5121         }
   5122 
   5123 #ifdef HAVE_OPENCL_SVM
   5124         if ((dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_BUFFER ||
   5125                 (dst->allocatorFlags_ & svm::OPENCL_SVM_BUFFER_MASK) == svm::OPENCL_SVM_FINE_GRAIN_SYSTEM)
   5126         {
   5127             // nothing
   5128         }
   5129         else
   5130 #endif
   5131         {
   5132             dst->markHostCopyObsolete(true);
   5133         }
   5134         dst->markDeviceCopyObsolete(false);
   5135 
   5136         if( _sync )
   5137         {
   5138             CV_OclDbgAssert(clFinish(q) == CL_SUCCESS);
   5139         }
   5140     }
   5141 
   5142     BufferPoolController* getBufferPoolController(const char* id) const {
   5143 #ifdef HAVE_OPENCL_SVM
   5144         if ((svm::checkForceSVMUmatUsage() && (id == NULL || strcmp(id, "OCL") == 0)) || (id != NULL && strcmp(id, "SVM") == 0))
   5145         {
   5146             return &bufferPoolSVM;
   5147         }
   5148 #endif
   5149         if (id != NULL && strcmp(id, "HOST_ALLOC") == 0)
   5150         {
   5151             return &bufferPoolHostPtr;
   5152         }
   5153         if (id != NULL && strcmp(id, "OCL") != 0)
   5154         {
   5155             CV_ErrorNoReturn(cv::Error::StsBadArg, "getBufferPoolController(): unknown BufferPool ID\n");
   5156         }
   5157         return &bufferPool;
   5158     }
   5159 
   5160     MatAllocator* matStdAllocator;
   5161 };
   5162 
   5163 MatAllocator* getOpenCLAllocator()
   5164 {
   5165     static MatAllocator * allocator = new OpenCLAllocator();
   5166     return allocator;
   5167 }
   5168 
   5169 ///////////////////////////////////////////// Utility functions /////////////////////////////////////////////////
   5170 
   5171 static void getDevices(std::vector<cl_device_id>& devices, cl_platform_id platform)
   5172 {
   5173     cl_uint numDevices = 0;
   5174     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
   5175                                 0, NULL, &numDevices) == CL_SUCCESS);
   5176 
   5177     if (numDevices == 0)
   5178     {
   5179         devices.clear();
   5180         return;
   5181     }
   5182 
   5183     devices.resize((size_t)numDevices);
   5184     CV_OclDbgAssert(clGetDeviceIDs(platform, (cl_device_type)Device::TYPE_ALL,
   5185                                 numDevices, &devices[0], &numDevices) == CL_SUCCESS);
   5186 }
   5187 
   5188 struct PlatformInfo::Impl
   5189 {
   5190     Impl(void* id)
   5191     {
   5192         refcount = 1;
   5193         handle = *(cl_platform_id*)id;
   5194         getDevices(devices, handle);
   5195     }
   5196 
   5197     String getStrProp(cl_device_info prop) const
   5198     {
   5199         char buf[1024];
   5200         size_t sz=0;
   5201         return clGetPlatformInfo(handle, prop, sizeof(buf)-16, buf, &sz) == CL_SUCCESS &&
   5202             sz < sizeof(buf) ? String(buf) : String();
   5203     }
   5204 
   5205     IMPLEMENT_REFCOUNTABLE();
   5206     std::vector<cl_device_id> devices;
   5207     cl_platform_id handle;
   5208 };
   5209 
   5210 PlatformInfo::PlatformInfo()
   5211 {
   5212     p = 0;
   5213 }
   5214 
   5215 PlatformInfo::PlatformInfo(void* platform_id)
   5216 {
   5217     p = new Impl(platform_id);
   5218 }
   5219 
   5220 PlatformInfo::~PlatformInfo()
   5221 {
   5222     if(p)
   5223         p->release();
   5224 }
   5225 
   5226 PlatformInfo::PlatformInfo(const PlatformInfo& i)
   5227 {
   5228     if (i.p)
   5229         i.p->addref();
   5230     p = i.p;
   5231 }
   5232 
   5233 PlatformInfo& PlatformInfo::operator =(const PlatformInfo& i)
   5234 {
   5235     if (i.p != p)
   5236     {
   5237         if (i.p)
   5238             i.p->addref();
   5239         if (p)
   5240             p->release();
   5241         p = i.p;
   5242     }
   5243     return *this;
   5244 }
   5245 
   5246 int PlatformInfo::deviceNumber() const
   5247 {
   5248     return p ? (int)p->devices.size() : 0;
   5249 }
   5250 
   5251 void PlatformInfo::getDevice(Device& device, int d) const
   5252 {
   5253     CV_Assert(p && d < (int)p->devices.size() );
   5254     if(p)
   5255         device.set(p->devices[d]);
   5256 }
   5257 
   5258 String PlatformInfo::name() const
   5259 {
   5260     return p ? p->getStrProp(CL_PLATFORM_NAME) : String();
   5261 }
   5262 
   5263 String PlatformInfo::vendor() const
   5264 {
   5265     return p ? p->getStrProp(CL_PLATFORM_VENDOR) : String();
   5266 }
   5267 
   5268 String PlatformInfo::version() const
   5269 {
   5270     return p ? p->getStrProp(CL_PLATFORM_VERSION) : String();
   5271 }
   5272 
   5273 static void getPlatforms(std::vector<cl_platform_id>& platforms)
   5274 {
   5275     cl_uint numPlatforms = 0;
   5276     CV_OclDbgAssert(clGetPlatformIDs(0, NULL, &numPlatforms) == CL_SUCCESS);
   5277 
   5278     if (numPlatforms == 0)
   5279     {
   5280         platforms.clear();
   5281         return;
   5282     }
   5283 
   5284     platforms.resize((size_t)numPlatforms);
   5285     CV_OclDbgAssert(clGetPlatformIDs(numPlatforms, &platforms[0], &numPlatforms) == CL_SUCCESS);
   5286 }
   5287 
   5288 void getPlatfomsInfo(std::vector<PlatformInfo>& platformsInfo)
   5289 {
   5290     std::vector<cl_platform_id> platforms;
   5291     getPlatforms(platforms);
   5292 
   5293     for (size_t i = 0; i < platforms.size(); i++)
   5294         platformsInfo.push_back( PlatformInfo((void*)&platforms[i]) );
   5295 }
   5296 
   5297 const char* typeToStr(int type)
   5298 {
   5299     static const char* tab[]=
   5300     {
   5301         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
   5302         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
   5303         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
   5304         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
   5305         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
   5306         "float", "float2", "float3", "float4", 0, 0, 0, "float8", 0, 0, 0, 0, 0, 0, 0, "float16",
   5307         "double", "double2", "double3", "double4", 0, 0, 0, "double8", 0, 0, 0, 0, 0, 0, 0, "double16",
   5308         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
   5309     };
   5310     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
   5311     return cn > 16 ? "?" : tab[depth*16 + cn-1];
   5312 }
   5313 
   5314 const char* memopTypeToStr(int type)
   5315 {
   5316     static const char* tab[] =
   5317     {
   5318         "uchar", "uchar2", "uchar3", "uchar4", 0, 0, 0, "uchar8", 0, 0, 0, 0, 0, 0, 0, "uchar16",
   5319         "char", "char2", "char3", "char4", 0, 0, 0, "char8", 0, 0, 0, 0, 0, 0, 0, "char16",
   5320         "ushort", "ushort2", "ushort3", "ushort4",0, 0, 0, "ushort8", 0, 0, 0, 0, 0, 0, 0, "ushort16",
   5321         "short", "short2", "short3", "short4", 0, 0, 0, "short8", 0, 0, 0, 0, 0, 0, 0, "short16",
   5322         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
   5323         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
   5324         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
   5325         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
   5326     };
   5327     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
   5328     return cn > 16 ? "?" : tab[depth*16 + cn-1];
   5329 }
   5330 
   5331 const char* vecopTypeToStr(int type)
   5332 {
   5333     static const char* tab[] =
   5334     {
   5335         "uchar", "short", "uchar3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
   5336         "char", "short", "char3", "int", 0, 0, 0, "int2", 0, 0, 0, 0, 0, 0, 0, "int4",
   5337         "ushort", "int", "ushort3", "int2",0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
   5338         "short", "int", "short3", "int2", 0, 0, 0, "int4", 0, 0, 0, 0, 0, 0, 0, "int8",
   5339         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
   5340         "int", "int2", "int3", "int4", 0, 0, 0, "int8", 0, 0, 0, 0, 0, 0, 0, "int16",
   5341         "ulong", "ulong2", "ulong3", "ulong4", 0, 0, 0, "ulong8", 0, 0, 0, 0, 0, 0, 0, "ulong16",
   5342         "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", "?"
   5343     };
   5344     int cn = CV_MAT_CN(type), depth = CV_MAT_DEPTH(type);
   5345     return cn > 16 ? "?" : tab[depth*16 + cn-1];
   5346 }
   5347 
   5348 const char* convertTypeStr(int sdepth, int ddepth, int cn, char* buf)
   5349 {
   5350     if( sdepth == ddepth )
   5351         return "noconvert";
   5352     const char *typestr = typeToStr(CV_MAKETYPE(ddepth, cn));
   5353     if( ddepth >= CV_32F ||
   5354         (ddepth == CV_32S && sdepth < CV_32S) ||
   5355         (ddepth == CV_16S && sdepth <= CV_8S) ||
   5356         (ddepth == CV_16U && sdepth == CV_8U))
   5357     {
   5358         sprintf(buf, "convert_%s", typestr);
   5359     }
   5360     else if( sdepth >= CV_32F )
   5361         sprintf(buf, "convert_%s%s_rte", typestr, (ddepth < CV_32S ? "_sat" : ""));
   5362     else
   5363         sprintf(buf, "convert_%s_sat", typestr);
   5364 
   5365     return buf;
   5366 }
   5367 
   5368 template <typename T>
   5369 static std::string kerToStr(const Mat & k)
   5370 {
   5371     int width = k.cols - 1, depth = k.depth();
   5372     const T * const data = k.ptr<T>();
   5373 
   5374     std::ostringstream stream;
   5375     stream.precision(10);
   5376 
   5377     if (depth <= CV_8S)
   5378     {
   5379         for (int i = 0; i < width; ++i)
   5380             stream << "DIG(" << (int)data[i] << ")";
   5381         stream << "DIG(" << (int)data[width] << ")";
   5382     }
   5383     else if (depth == CV_32F)
   5384     {
   5385         stream.setf(std::ios_base::showpoint);
   5386         for (int i = 0; i < width; ++i)
   5387             stream << "DIG(" << data[i] << "f)";
   5388         stream << "DIG(" << data[width] << "f)";
   5389     }
   5390     else
   5391     {
   5392         for (int i = 0; i < width; ++i)
   5393             stream << "DIG(" << data[i] << ")";
   5394         stream << "DIG(" << data[width] << ")";
   5395     }
   5396 
   5397     return stream.str();
   5398 }
   5399 
   5400 String kernelToStr(InputArray _kernel, int ddepth, const char * name)
   5401 {
   5402     Mat kernel = _kernel.getMat().reshape(1, 1);
   5403 
   5404     int depth = kernel.depth();
   5405     if (ddepth < 0)
   5406         ddepth = depth;
   5407 
   5408     if (ddepth != depth)
   5409         kernel.convertTo(kernel, ddepth);
   5410 
   5411     typedef std::string (* func_t)(const Mat &);
   5412     static const func_t funcs[] = { kerToStr<uchar>, kerToStr<char>, kerToStr<ushort>, kerToStr<short>,
   5413                                     kerToStr<int>, kerToStr<float>, kerToStr<double>, 0 };
   5414     const func_t func = funcs[ddepth];
   5415     CV_Assert(func != 0);
   5416 
   5417     return cv::format(" -D %s=%s", name ? name : "COEFF", func(kernel).c_str());
   5418 }
   5419 
   5420 #define PROCESS_SRC(src) \
   5421     do \
   5422     { \
   5423         if (!src.empty()) \
   5424         { \
   5425             CV_Assert(src.isMat() || src.isUMat()); \
   5426             Size csize = src.size(); \
   5427             int ctype = src.type(), ccn = CV_MAT_CN(ctype), cdepth = CV_MAT_DEPTH(ctype), \
   5428                 ckercn = vectorWidths[cdepth], cwidth = ccn * csize.width; \
   5429             if (cwidth < ckercn || ckercn <= 0) \
   5430                 return 1; \
   5431             cols.push_back(cwidth); \
   5432             if (strat == OCL_VECTOR_OWN && ctype != ref_type) \
   5433                 return 1; \
   5434             offsets.push_back(src.offset()); \
   5435             steps.push_back(src.step()); \
   5436             dividers.push_back(ckercn * CV_ELEM_SIZE1(ctype)); \
   5437             kercns.push_back(ckercn); \
   5438         } \
   5439     } \
   5440     while ((void)0, 0)
   5441 
   5442 int predictOptimalVectorWidth(InputArray src1, InputArray src2, InputArray src3,
   5443                               InputArray src4, InputArray src5, InputArray src6,
   5444                               InputArray src7, InputArray src8, InputArray src9,
   5445                               OclVectorStrategy strat)
   5446 {
   5447     const ocl::Device & d = ocl::Device::getDefault();
   5448 
   5449     int vectorWidths[] = { d.preferredVectorWidthChar(), d.preferredVectorWidthChar(),
   5450         d.preferredVectorWidthShort(), d.preferredVectorWidthShort(),
   5451         d.preferredVectorWidthInt(), d.preferredVectorWidthFloat(),
   5452         d.preferredVectorWidthDouble(), -1 };
   5453 
   5454     // if the device says don't use vectors
   5455     if (vectorWidths[0] == 1)
   5456     {
   5457         // it's heuristic
   5458         vectorWidths[CV_8U] = vectorWidths[CV_8S] = 4;
   5459         vectorWidths[CV_16U] = vectorWidths[CV_16S] = 2;
   5460         vectorWidths[CV_32S] = vectorWidths[CV_32F] = vectorWidths[CV_64F] = 1;
   5461     }
   5462 
   5463     return checkOptimalVectorWidth(vectorWidths, src1, src2, src3, src4, src5, src6, src7, src8, src9, strat);
   5464 }
   5465 
   5466 int checkOptimalVectorWidth(const int *vectorWidths,
   5467                             InputArray src1, InputArray src2, InputArray src3,
   5468                             InputArray src4, InputArray src5, InputArray src6,
   5469                             InputArray src7, InputArray src8, InputArray src9,
   5470                             OclVectorStrategy strat)
   5471 {
   5472     CV_Assert(vectorWidths);
   5473 
   5474     int ref_type = src1.type();
   5475 
   5476     std::vector<size_t> offsets, steps, cols;
   5477     std::vector<int> dividers, kercns;
   5478     PROCESS_SRC(src1);
   5479     PROCESS_SRC(src2);
   5480     PROCESS_SRC(src3);
   5481     PROCESS_SRC(src4);
   5482     PROCESS_SRC(src5);
   5483     PROCESS_SRC(src6);
   5484     PROCESS_SRC(src7);
   5485     PROCESS_SRC(src8);
   5486     PROCESS_SRC(src9);
   5487 
   5488     size_t size = offsets.size();
   5489 
   5490     for (size_t i = 0; i < size; ++i)
   5491         while (offsets[i] % dividers[i] != 0 || steps[i] % dividers[i] != 0 || cols[i] % kercns[i] != 0)
   5492             dividers[i] >>= 1, kercns[i] >>= 1;
   5493 
   5494     // default strategy
   5495     int kercn = *std::min_element(kercns.begin(), kercns.end());
   5496 
   5497     return kercn;
   5498 }
   5499 
   5500 int predictOptimalVectorWidthMax(InputArray src1, InputArray src2, InputArray src3,
   5501                                  InputArray src4, InputArray src5, InputArray src6,
   5502                                  InputArray src7, InputArray src8, InputArray src9)
   5503 {
   5504     return predictOptimalVectorWidth(src1, src2, src3, src4, src5, src6, src7, src8, src9, OCL_VECTOR_MAX);
   5505 }
   5506 
   5507 #undef PROCESS_SRC
   5508 
   5509 
   5510 // TODO Make this as a method of OpenCL "BuildOptions" class
   5511 void buildOptionsAddMatrixDescription(String& buildOptions, const String& name, InputArray _m)
   5512 {
   5513     if (!buildOptions.empty())
   5514         buildOptions += " ";
   5515     int type = _m.type(), depth = CV_MAT_DEPTH(type);
   5516     buildOptions += format(
   5517             "-D %s_T=%s -D %s_T1=%s -D %s_CN=%d -D %s_TSIZE=%d -D %s_T1SIZE=%d -D %s_DEPTH=%d",
   5518             name.c_str(), ocl::typeToStr(type),
   5519             name.c_str(), ocl::typeToStr(CV_MAKE_TYPE(depth, 1)),
   5520             name.c_str(), (int)CV_MAT_CN(type),
   5521             name.c_str(), (int)CV_ELEM_SIZE(type),
   5522             name.c_str(), (int)CV_ELEM_SIZE1(type),
   5523             name.c_str(), (int)depth
   5524             );
   5525 }
   5526 
   5527 
   5528 struct Image2D::Impl
   5529 {
   5530     Impl(const UMat &src, bool norm, bool alias)
   5531     {
   5532         handle = 0;
   5533         refcount = 1;
   5534         init(src, norm, alias);
   5535     }
   5536 
   5537     ~Impl()
   5538     {
   5539         if (handle)
   5540             clReleaseMemObject(handle);
   5541     }
   5542 
   5543     static cl_image_format getImageFormat(int depth, int cn, bool norm)
   5544     {
   5545         cl_image_format format;
   5546         static const int channelTypes[] = { CL_UNSIGNED_INT8, CL_SIGNED_INT8, CL_UNSIGNED_INT16,
   5547                                        CL_SIGNED_INT16, CL_SIGNED_INT32, CL_FLOAT, -1, -1 };
   5548         static const int channelTypesNorm[] = { CL_UNORM_INT8, CL_SNORM_INT8, CL_UNORM_INT16,
   5549                                                 CL_SNORM_INT16, -1, -1, -1, -1 };
   5550         static const int channelOrders[] = { -1, CL_R, CL_RG, -1, CL_RGBA };
   5551 
   5552         int channelType = norm ? channelTypesNorm[depth] : channelTypes[depth];
   5553         int channelOrder = channelOrders[cn];
   5554         format.image_channel_data_type = (cl_channel_type)channelType;
   5555         format.image_channel_order = (cl_channel_order)channelOrder;
   5556         return format;
   5557     }
   5558 
   5559     static bool isFormatSupported(cl_image_format format)
   5560     {
   5561         if (!haveOpenCL())
   5562             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
   5563 
   5564         cl_context context = (cl_context)Context::getDefault().ptr();
   5565         // Figure out how many formats are supported by this context.
   5566         cl_uint numFormats = 0;
   5567         cl_int err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
   5568                                                 CL_MEM_OBJECT_IMAGE2D, numFormats,
   5569                                                 NULL, &numFormats);
   5570         AutoBuffer<cl_image_format> formats(numFormats);
   5571         err = clGetSupportedImageFormats(context, CL_MEM_READ_WRITE,
   5572                                          CL_MEM_OBJECT_IMAGE2D, numFormats,
   5573                                          formats, NULL);
   5574         CV_OclDbgAssert(err == CL_SUCCESS);
   5575         for (cl_uint i = 0; i < numFormats; ++i)
   5576         {
   5577             if (!memcmp(&formats[i], &format, sizeof(format)))
   5578             {
   5579                 return true;
   5580             }
   5581         }
   5582         return false;
   5583     }
   5584 
   5585     void init(const UMat &src, bool norm, bool alias)
   5586     {
   5587         if (!haveOpenCL())
   5588             CV_Error(Error::OpenCLApiCallError, "OpenCL runtime not found!");
   5589 
   5590         CV_Assert(!src.empty());
   5591         CV_Assert(ocl::Device::getDefault().imageSupport());
   5592 
   5593         int err, depth = src.depth(), cn = src.channels();
   5594         CV_Assert(cn <= 4);
   5595         cl_image_format format = getImageFormat(depth, cn, norm);
   5596 
   5597         if (!isFormatSupported(format))
   5598             CV_Error(Error::OpenCLApiCallError, "Image format is not supported");
   5599 
   5600         if (alias && !src.handle(ACCESS_RW))
   5601             CV_Error(Error::OpenCLApiCallError, "Incorrect UMat, handle is null");
   5602 
   5603         cl_context context = (cl_context)Context::getDefault().ptr();
   5604         cl_command_queue queue = (cl_command_queue)Queue::getDefault().ptr();
   5605 
   5606 #ifdef CL_VERSION_1_2
   5607         // this enables backwards portability to
   5608         // run on OpenCL 1.1 platform if library binaries are compiled with OpenCL 1.2 support
   5609         const Device & d = ocl::Device::getDefault();
   5610         int minor = d.deviceVersionMinor(), major = d.deviceVersionMajor();
   5611         CV_Assert(!alias || canCreateAlias(src));
   5612         if (1 < major || (1 == major && 2 <= minor))
   5613         {
   5614             cl_image_desc desc;
   5615             desc.image_type       = CL_MEM_OBJECT_IMAGE2D;
   5616             desc.image_width      = src.cols;
   5617             desc.image_height     = src.rows;
   5618             desc.image_depth      = 0;
   5619             desc.image_array_size = 1;
   5620             desc.image_row_pitch  = alias ? src.step[0] : 0;
   5621             desc.image_slice_pitch = 0;
   5622             desc.buffer           = alias ? (cl_mem)src.handle(ACCESS_RW) : 0;
   5623             desc.num_mip_levels   = 0;
   5624             desc.num_samples      = 0;
   5625             handle = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
   5626         }
   5627         else
   5628 #endif
   5629         {
   5630             CV_SUPPRESS_DEPRECATED_START
   5631             CV_Assert(!alias);  // This is an OpenCL 1.2 extension
   5632             handle = clCreateImage2D(context, CL_MEM_READ_WRITE, &format, src.cols, src.rows, 0, NULL, &err);
   5633             CV_SUPPRESS_DEPRECATED_END
   5634         }
   5635         CV_OclDbgAssert(err == CL_SUCCESS);
   5636 
   5637         size_t origin[] = { 0, 0, 0 };
   5638         size_t region[] = { static_cast<size_t>(src.cols), static_cast<size_t>(src.rows), 1 };
   5639 
   5640         cl_mem devData;
   5641         if (!alias && !src.isContinuous())
   5642         {
   5643             devData = clCreateBuffer(context, CL_MEM_READ_ONLY, src.cols * src.rows * src.elemSize(), NULL, &err);
   5644             CV_OclDbgAssert(err == CL_SUCCESS);
   5645 
   5646             const size_t roi[3] = {static_cast<size_t>(src.cols) * src.elemSize(), static_cast<size_t>(src.rows), 1};
   5647             CV_Assert(clEnqueueCopyBufferRect(queue, (cl_mem)src.handle(ACCESS_READ), devData, origin, origin,
   5648                 roi, src.step, 0, src.cols * src.elemSize(), 0, 0, NULL, NULL) == CL_SUCCESS);
   5649             CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
   5650         }
   5651         else
   5652         {
   5653             devData = (cl_mem)src.handle(ACCESS_READ);
   5654         }
   5655         CV_Assert(devData != NULL);
   5656 
   5657         if (!alias)
   5658         {
   5659             CV_OclDbgAssert(clEnqueueCopyBufferToImage(queue, devData, handle, 0, origin, region, 0, NULL, 0) == CL_SUCCESS);
   5660             if (!src.isContinuous())
   5661             {
   5662                 CV_OclDbgAssert(clFlush(queue) == CL_SUCCESS);
   5663                 CV_OclDbgAssert(clReleaseMemObject(devData) == CL_SUCCESS);
   5664             }
   5665         }
   5666     }
   5667 
   5668     IMPLEMENT_REFCOUNTABLE();
   5669 
   5670     cl_mem handle;
   5671 };
   5672 
   5673 Image2D::Image2D()
   5674 {
   5675     p = NULL;
   5676 }
   5677 
   5678 Image2D::Image2D(const UMat &src, bool norm, bool alias)
   5679 {
   5680     p = new Impl(src, norm, alias);
   5681 }
   5682 
   5683 bool Image2D::canCreateAlias(const UMat &m)
   5684 {
   5685     bool ret = false;
   5686     const Device & d = ocl::Device::getDefault();
   5687     if (d.imageFromBufferSupport() && !m.empty())
   5688     {
   5689         // This is the required pitch alignment in pixels
   5690         uint pitchAlign = d.imagePitchAlignment();
   5691         if (pitchAlign && !(m.step % (pitchAlign * m.elemSize())))
   5692         {
   5693             // We don't currently handle the case where the buffer was created
   5694             // with CL_MEM_USE_HOST_PTR
   5695             if (!m.u->tempUMat())
   5696             {
   5697                 ret = true;
   5698             }
   5699         }
   5700     }
   5701     return ret;
   5702 }
   5703 
   5704 bool Image2D::isFormatSupported(int depth, int cn, bool norm)
   5705 {
   5706     cl_image_format format = Impl::getImageFormat(depth, cn, norm);
   5707 
   5708     return Impl::isFormatSupported(format);
   5709 }
   5710 
   5711 Image2D::Image2D(const Image2D & i)
   5712 {
   5713     p = i.p;
   5714     if (p)
   5715         p->addref();
   5716 }
   5717 
   5718 Image2D & Image2D::operator = (const Image2D & i)
   5719 {
   5720     if (i.p != p)
   5721     {
   5722         if (i.p)
   5723             i.p->addref();
   5724         if (p)
   5725             p->release();
   5726         p = i.p;
   5727     }
   5728     return *this;
   5729 }
   5730 
   5731 Image2D::~Image2D()
   5732 {
   5733     if (p)
   5734         p->release();
   5735 }
   5736 
   5737 void* Image2D::ptr() const
   5738 {
   5739     return p ? p->handle : 0;
   5740 }
   5741 
   5742 bool internal::isPerformanceCheckBypassed()
   5743 {
   5744     static bool initialized = false;
   5745     static bool value = false;
   5746     if (!initialized)
   5747     {
   5748         value = getBoolParameter("OPENCV_OPENCL_PERF_CHECK_BYPASS", false);
   5749         initialized = true;
   5750     }
   5751     return value;
   5752 }
   5753 
   5754 bool internal::isCLBuffer(UMat& u)
   5755 {
   5756     void* h = u.handle(ACCESS_RW);
   5757     if (!h)
   5758         return true;
   5759     CV_DbgAssert(u.u->currAllocator == getOpenCLAllocator());
   5760 #if 1
   5761     if ((u.u->allocatorFlags_ & 0xffff0000) != 0) // OpenCL SVM flags are stored here
   5762         return false;
   5763 #else
   5764     cl_mem_object_type type = 0;
   5765     cl_int ret = clGetMemObjectInfo((cl_mem)h, CL_MEM_TYPE, sizeof(type), &type, NULL);
   5766     if (ret != CL_SUCCESS || type != CL_MEM_OBJECT_BUFFER)
   5767         return false;
   5768 #endif
   5769     return true;
   5770 }
   5771 
   5772 }}
   5773