Home | History | Annotate | Download | only in features2d
      1 // This file is auto-generated. Do not edit!
      2 
      3 #include "precomp.hpp"
      4 #include "opencl_kernels_features2d.hpp"
      5 
      6 namespace cv
      7 {
      8 namespace ocl
      9 {
     10 namespace features2d
     11 {
     12 
     13 const struct ProgramEntry brute_force_match={"brute_force_match",
     14 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable\n"
     15 "#define MAX_FLOAT 3.40282e+038f\n"
     16 "#ifndef T\n"
     17 "#define T float\n"
     18 "#endif\n"
     19 "#ifndef BLOCK_SIZE\n"
     20 "#define BLOCK_SIZE 16\n"
     21 "#endif\n"
     22 "#ifndef MAX_DESC_LEN\n"
     23 "#define MAX_DESC_LEN 64\n"
     24 "#endif\n"
     25 "#define BLOCK_SIZE_ODD          (BLOCK_SIZE + 1)\n"
     26 "#ifndef SHARED_MEM_SZ\n"
     27 "#  if (BLOCK_SIZE < MAX_DESC_LEN)\n"
     28 "#    define SHARED_MEM_SZ      (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))\n"
     29 "#  else\n"
     30 "#    define SHARED_MEM_SZ      (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)\n"
     31 "#  endif\n"
     32 "#endif\n"
     33 "#ifndef DIST_TYPE\n"
     34 "#define DIST_TYPE 2\n"
     35 "#endif\n"
     36 "#if (DIST_TYPE == 2)\n"
     37 "#   ifdef T_FLOAT\n"
     38 "typedef float result_type;\n"
     39 "#       if (8 == kercn)\n"
     40 "typedef float8 value_type;\n"
     41 "#           define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}\n"
     42 "#       elif (4 == kercn)\n"
     43 "typedef float4 value_type;\n"
     44 "#           define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}\n"
     45 "#       else\n"
     46 "typedef float value_type;\n"
     47 "#           define DIST(x, y) result += fabs((x) - (y))\n"
     48 "#       endif\n"
     49 "#   else\n"
     50 "typedef int result_type;\n"
     51 "#       if (8 == kercn)\n"
     52 "typedef int8 value_type;\n"
     53 "#           define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}\n"
     54 "#       elif (4 == kercn)\n"
     55 "typedef int4 value_type;\n"
     56 "#           define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}\n"
     57 "#       else\n"
     58 "typedef int  value_type;\n"
     59 "#           define DIST(x, y) result += abs((x) - (y))\n"
     60 "#       endif\n"
     61 "#   endif\n"
     62 "#   define DIST_RES(x) (x)\n"
     63 "#elif (DIST_TYPE == 4)\n"
     64 "typedef float result_type;\n"
     65 "#   if (8 == kercn)\n"
     66 "typedef float8 value_type;\n"
     67 "#       define DIST(x, y)   {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}\n"
     68 "#   elif (4 == kercn)\n"
     69 "typedef float4      value_type;\n"
     70 "#       define DIST(x, y)   {value_type d = ((x) - (y)); result += dot(d, d);}\n"
     71 "#   else\n"
     72 "typedef float       value_type;\n"
     73 "#       define DIST(x, y)   {value_type d = ((x) - (y)); result = mad(d, d, result);}\n"
     74 "#   endif\n"
     75 "#   define DIST_RES(x) sqrt(x)\n"
     76 "#elif (DIST_TYPE == 6)\n"
     77 "#   if (8 == kercn)\n"
     78 "typedef int8 value_type;\n"
     79 "#   elif (4 == kercn)\n"
     80 "typedef int4 value_type;\n"
     81 "#   else\n"
     82 "typedef int value_type;\n"
     83 "#   endif\n"
     84 "typedef int result_type;\n"
     85 "#   define DIST(x, y) result += popcount( (x) ^ (y) )\n"
     86 "#   define DIST_RES(x) (x)\n"
     87 "#endif\n"
     88 "inline result_type reduce_block(\n"
     89 "__local value_type *s_query,\n"
     90 "__local value_type *s_train,\n"
     91 "int lidx,\n"
     92 "int lidy\n"
     93 ")\n"
     94 "{\n"
     95 "result_type result = 0;\n"
     96 "#pragma unroll\n"
     97 "for (int j = 0 ; j < BLOCK_SIZE ; j++)\n"
     98 "{\n"
     99 "DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);\n"
    100 "}\n"
    101 "return DIST_RES(result);\n"
    102 "}\n"
    103 "inline result_type reduce_block_match(\n"
    104 "__local value_type *s_query,\n"
    105 "__local value_type *s_train,\n"
    106 "int lidx,\n"
    107 "int lidy\n"
    108 ")\n"
    109 "{\n"
    110 "result_type result = 0;\n"
    111 "#pragma unroll\n"
    112 "for (int j = 0 ; j < BLOCK_SIZE ; j++)\n"
    113 "{\n"
    114 "DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);\n"
    115 "}\n"
    116 "return result;\n"
    117 "}\n"
    118 "inline result_type reduce_multi_block(\n"
    119 "__local value_type *s_query,\n"
    120 "__local value_type *s_train,\n"
    121 "int block_index,\n"
    122 "int lidx,\n"
    123 "int lidy\n"
    124 ")\n"
    125 "{\n"
    126 "result_type result = 0;\n"
    127 "#pragma unroll\n"
    128 "for (int j = 0 ; j < BLOCK_SIZE ; j++)\n"
    129 "{\n"
    130 "DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);\n"
    131 "}\n"
    132 "return result;\n"
    133 "}\n"
    134 "__kernel void BruteForceMatch_Match(\n"
    135 "__global T *query,\n"
    136 "__global T *train,\n"
    137 "__global int *bestTrainIdx,\n"
    138 "__global float *bestDistance,\n"
    139 "int query_rows,\n"
    140 "int query_cols,\n"
    141 "int train_rows,\n"
    142 "int train_cols,\n"
    143 "int step\n"
    144 ")\n"
    145 "{\n"
    146 "const int lidx = get_local_id(0);\n"
    147 "const int lidy = get_local_id(1);\n"
    148 "const int groupidx = get_group_id(0);\n"
    149 "const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);\n"
    150 "const int queryOffset = min(queryIdx, query_rows - 1) * step;\n"
    151 "__global TN *query_vec = (__global TN *)(query + queryOffset);\n"
    152 "query_cols /= kercn;\n"
    153 "__local float sharebuffer[SHARED_MEM_SZ];\n"
    154 "__local value_type *s_query = (__local value_type *)sharebuffer;\n"
    155 "#if 0 < MAX_DESC_LEN\n"
    156 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;\n"
    157 "#pragma unroll\n"
    158 "for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)\n"
    159 "{\n"
    160 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
    161 "s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;\n"
    162 "}\n"
    163 "#else\n"
    164 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;\n"
    165 "const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);\n"
    166 "const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);\n"
    167 "#endif\n"
    168 "float myBestDistance = MAX_FLOAT;\n"
    169 "int myBestTrainIdx = -1;\n"
    170 "for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)\n"
    171 "{\n"
    172 "result_type result = 0;\n"
    173 "const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;\n"
    174 "__global TN *train_vec = (__global TN *)(train + trainOffset);\n"
    175 "#if 0 < MAX_DESC_LEN\n"
    176 "#pragma unroll\n"
    177 "for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)\n"
    178 "{\n"
    179 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
    180 "s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;\n"
    181 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    182 "result += reduce_multi_block(s_query, s_train, i, lidx, lidy);\n"
    183 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    184 "}\n"
    185 "#else\n"
    186 "for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)\n"
    187 "{\n"
    188 "const int loadx = mad24(i, BLOCK_SIZE, lidx);\n"
    189 "if (loadx < query_cols)\n"
    190 "{\n"
    191 "s_query[s_query_i] = query_vec[loadx];\n"
    192 "s_train[s_train_i] = train_vec[loadx];\n"
    193 "}\n"
    194 "else\n"
    195 "{\n"
    196 "s_query[s_query_i] = 0;\n"
    197 "s_train[s_train_i] = 0;\n"
    198 "}\n"
    199 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    200 "result += reduce_block_match(s_query, s_train, lidx, lidy);\n"
    201 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    202 "}\n"
    203 "#endif\n"
    204 "result = DIST_RES(result);\n"
    205 "const int trainIdx = mad24(BLOCK_SIZE, t, lidx);\n"
    206 "if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance )\n"
    207 "{\n"
    208 "myBestDistance = result;\n"
    209 "myBestTrainIdx = trainIdx;\n"
    210 "}\n"
    211 "}\n"
    212 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    213 "__local float *s_distance = (__local float *)sharebuffer;\n"
    214 "__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);\n"
    215 "s_distance += lidy * BLOCK_SIZE_ODD;\n"
    216 "s_trainIdx += lidy * BLOCK_SIZE_ODD;\n"
    217 "s_distance[lidx] = myBestDistance;\n"
    218 "s_trainIdx[lidx] = myBestTrainIdx;\n"
    219 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    220 "#pragma unroll\n"
    221 "for (int k = 0 ; k < BLOCK_SIZE; k++)\n"
    222 "{\n"
    223 "if (myBestDistance > s_distance[k])\n"
    224 "{\n"
    225 "myBestDistance = s_distance[k];\n"
    226 "myBestTrainIdx = s_trainIdx[k];\n"
    227 "}\n"
    228 "}\n"
    229 "if (queryIdx < query_rows && lidx == 0)\n"
    230 "{\n"
    231 "bestTrainIdx[queryIdx] = myBestTrainIdx;\n"
    232 "bestDistance[queryIdx] = myBestDistance;\n"
    233 "}\n"
    234 "}\n"
    235 "__kernel void BruteForceMatch_RadiusMatch(\n"
    236 "__global T *query,\n"
    237 "__global T *train,\n"
    238 "float maxDistance,\n"
    239 "__global int *bestTrainIdx,\n"
    240 "__global float *bestDistance,\n"
    241 "__global int *nMatches,\n"
    242 "int query_rows,\n"
    243 "int query_cols,\n"
    244 "int train_rows,\n"
    245 "int train_cols,\n"
    246 "int bestTrainIdx_cols,\n"
    247 "int step,\n"
    248 "int ostep\n"
    249 ")\n"
    250 "{\n"
    251 "const int lidx = get_local_id(0);\n"
    252 "const int lidy = get_local_id(1);\n"
    253 "const int groupidx = get_group_id(0);\n"
    254 "const int groupidy = get_group_id(1);\n"
    255 "const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);\n"
    256 "const int queryOffset = min(queryIdx, query_rows - 1) * step;\n"
    257 "__global TN *query_vec = (__global TN *)(query + queryOffset);\n"
    258 "const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);\n"
    259 "const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;\n"
    260 "__global TN *train_vec = (__global TN *)(train + trainOffset);\n"
    261 "query_cols /= kercn;\n"
    262 "__local float sharebuffer[SHARED_MEM_SZ];\n"
    263 "__local value_type *s_query = (__local value_type *)sharebuffer;\n"
    264 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;\n"
    265 "result_type result = 0;\n"
    266 "const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);\n"
    267 "const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);\n"
    268 "for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)\n"
    269 "{\n"
    270 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
    271 "if (loadx < query_cols)\n"
    272 "{\n"
    273 "s_query[s_query_i] = query_vec[loadx];\n"
    274 "s_train[s_train_i] = train_vec[loadx];\n"
    275 "}\n"
    276 "else\n"
    277 "{\n"
    278 "s_query[s_query_i] = 0;\n"
    279 "s_train[s_train_i] = 0;\n"
    280 "}\n"
    281 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    282 "result += reduce_block(s_query, s_train, lidx, lidy);\n"
    283 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    284 "}\n"
    285 "if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)\n"
    286 "{\n"
    287 "int ind = atom_inc(nMatches + queryIdx);\n"
    288 "if(ind < bestTrainIdx_cols)\n"
    289 "{\n"
    290 "bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;\n"
    291 "bestDistance[mad24(queryIdx, ostep, ind)] = result;\n"
    292 "}\n"
    293 "}\n"
    294 "}\n"
    295 "__kernel void BruteForceMatch_knnMatch(\n"
    296 "__global T *query,\n"
    297 "__global T *train,\n"
    298 "__global int2 *bestTrainIdx,\n"
    299 "__global float2 *bestDistance,\n"
    300 "int query_rows,\n"
    301 "int query_cols,\n"
    302 "int train_rows,\n"
    303 "int train_cols,\n"
    304 "int step\n"
    305 ")\n"
    306 "{\n"
    307 "const int lidx = get_local_id(0);\n"
    308 "const int lidy = get_local_id(1);\n"
    309 "const int groupidx = get_group_id(0);\n"
    310 "const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);\n"
    311 "const int queryOffset = min(queryIdx, query_rows - 1) * step;\n"
    312 "__global TN *query_vec = (__global TN *)(query + queryOffset);\n"
    313 "query_cols /= kercn;\n"
    314 "__local float sharebuffer[SHARED_MEM_SZ];\n"
    315 "__local value_type *s_query = (__local value_type *)sharebuffer;\n"
    316 "#if 0 < MAX_DESC_LEN\n"
    317 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;\n"
    318 "#pragma unroll\n"
    319 "for (int i = 0 ;  i <  MAX_DESC_LEN / BLOCK_SIZE; i ++)\n"
    320 "{\n"
    321 "int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
    322 "s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;\n"
    323 "}\n"
    324 "#else\n"
    325 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;\n"
    326 "const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);\n"
    327 "const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);\n"
    328 "#endif\n"
    329 "float myBestDistance1 = MAX_FLOAT;\n"
    330 "float myBestDistance2 = MAX_FLOAT;\n"
    331 "int myBestTrainIdx1 = -1;\n"
    332 "int myBestTrainIdx2 = -1;\n"
    333 "for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)\n"
    334 "{\n"
    335 "result_type result = 0;\n"
    336 "int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;\n"
    337 "__global TN *train_vec = (__global TN *)(train + trainOffset);\n"
    338 "#if 0 < MAX_DESC_LEN\n"
    339 "#pragma unroll\n"
    340 "for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)\n"
    341 "{\n"
    342 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
    343 "s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;\n"
    344 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    345 "result += reduce_multi_block(s_query, s_train, i, lidx, lidy);\n"
    346 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    347 "}\n"
    348 "#else\n"
    349 "for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)\n"
    350 "{\n"
    351 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n"
    352 "if (loadx < query_cols)\n"
    353 "{\n"
    354 "s_query[s_query_i] = query_vec[loadx];\n"
    355 "s_train[s_train_i] = train_vec[loadx];\n"
    356 "}\n"
    357 "else\n"
    358 "{\n"
    359 "s_query[s_query_i] = 0;\n"
    360 "s_train[s_train_i] = 0;\n"
    361 "}\n"
    362 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    363 "result += reduce_block_match(s_query, s_train, lidx, lidy);\n"
    364 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    365 "}\n"
    366 "#endif\n"
    367 "result = DIST_RES(result);\n"
    368 "const int trainIdx = mad24(BLOCK_SIZE, t, lidx);\n"
    369 "if (queryIdx < query_rows && trainIdx < train_rows)\n"
    370 "{\n"
    371 "if (result < myBestDistance1)\n"
    372 "{\n"
    373 "myBestDistance2 = myBestDistance1;\n"
    374 "myBestTrainIdx2 = myBestTrainIdx1;\n"
    375 "myBestDistance1 = result;\n"
    376 "myBestTrainIdx1 = trainIdx;\n"
    377 "}\n"
    378 "else if (result < myBestDistance2)\n"
    379 "{\n"
    380 "myBestDistance2 = result;\n"
    381 "myBestTrainIdx2 = trainIdx;\n"
    382 "}\n"
    383 "}\n"
    384 "}\n"
    385 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    386 "__local float *s_distance = (__local float *)sharebuffer;\n"
    387 "__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);\n"
    388 "s_distance += lidy * BLOCK_SIZE_ODD;\n"
    389 "s_trainIdx += lidy * BLOCK_SIZE_ODD;\n"
    390 "s_distance[lidx] = myBestDistance1;\n"
    391 "s_trainIdx[lidx] = myBestTrainIdx1;\n"
    392 "float bestDistance1 = MAX_FLOAT;\n"
    393 "float bestDistance2 = MAX_FLOAT;\n"
    394 "int bestTrainIdx1 = -1;\n"
    395 "int bestTrainIdx2 = -1;\n"
    396 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    397 "if (lidx == 0)\n"
    398 "{\n"
    399 "for (int i = 0 ; i < BLOCK_SIZE ; i++)\n"
    400 "{\n"
    401 "float val = s_distance[i];\n"
    402 "if (val < bestDistance1)\n"
    403 "{\n"
    404 "bestDistance2 = bestDistance1;\n"
    405 "bestTrainIdx2 = bestTrainIdx1;\n"
    406 "bestDistance1 = val;\n"
    407 "bestTrainIdx1 = s_trainIdx[i];\n"
    408 "}\n"
    409 "else if (val < bestDistance2)\n"
    410 "{\n"
    411 "bestDistance2 = val;\n"
    412 "bestTrainIdx2 = s_trainIdx[i];\n"
    413 "}\n"
    414 "}\n"
    415 "}\n"
    416 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    417 "s_distance[lidx] = myBestDistance2;\n"
    418 "s_trainIdx[lidx] = myBestTrainIdx2;\n"
    419 "barrier(CLK_LOCAL_MEM_FENCE);\n"
    420 "if (lidx == 0)\n"
    421 "{\n"
    422 "for (int i = 0 ; i < BLOCK_SIZE ; i++)\n"
    423 "{\n"
    424 "float val = s_distance[i];\n"
    425 "if (val < bestDistance2)\n"
    426 "{\n"
    427 "bestDistance2 = val;\n"
    428 "bestTrainIdx2 = s_trainIdx[i];\n"
    429 "}\n"
    430 "}\n"
    431 "}\n"
    432 "myBestDistance1 = bestDistance1;\n"
    433 "myBestDistance2 = bestDistance2;\n"
    434 "myBestTrainIdx1 = bestTrainIdx1;\n"
    435 "myBestTrainIdx2 = bestTrainIdx2;\n"
    436 "if (queryIdx < query_rows && lidx == 0)\n"
    437 "{\n"
    438 "bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);\n"
    439 "bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);\n"
    440 "}\n"
    441 "}\n"
    442 , "35c3a1e231d446e4088561e3604fb94f"};
    443 ProgramSource brute_force_match_oclsrc(brute_force_match.programStr);
    444 const struct ProgramEntry fast={"fast",
    445 "inline int cornerScore(__global const uchar* img, int step)\n"
    446 "{\n"
    447 "int k, tofs, v = img[0], a0 = 0, b0;\n"
    448 "int d[16];\n"
    449 "#define LOAD2(idx, ofs) \\\n"
    450 "tofs = ofs; d[idx] = (short)(v - img[tofs]); d[idx+8] = (short)(v - img[-tofs])\n"
    451 "LOAD2(0, 3);\n"
    452 "LOAD2(1, -step+3);\n"
    453 "LOAD2(2, -step*2+2);\n"
    454 "LOAD2(3, -step*3+1);\n"
    455 "LOAD2(4, -step*3);\n"
    456 "LOAD2(5, -step*3-1);\n"
    457 "LOAD2(6, -step*2-2);\n"
    458 "LOAD2(7, -step-3);\n"
    459 "#pragma unroll\n"
    460 "for( k = 0; k < 16; k += 2 )\n"
    461 "{\n"
    462 "int a = min((int)d[(k+1)&15], (int)d[(k+2)&15]);\n"
    463 "a = min(a, (int)d[(k+3)&15]);\n"
    464 "a = min(a, (int)d[(k+4)&15]);\n"
    465 "a = min(a, (int)d[(k+5)&15]);\n"
    466 "a = min(a, (int)d[(k+6)&15]);\n"
    467 "a = min(a, (int)d[(k+7)&15]);\n"
    468 "a = min(a, (int)d[(k+8)&15]);\n"
    469 "a0 = max(a0, min(a, (int)d[k&15]));\n"
    470 "a0 = max(a0, min(a, (int)d[(k+9)&15]));\n"
    471 "}\n"
    472 "b0 = -a0;\n"
    473 "#pragma unroll\n"
    474 "for( k = 0; k < 16; k += 2 )\n"
    475 "{\n"
    476 "int b = max((int)d[(k+1)&15], (int)d[(k+2)&15]);\n"
    477 "b = max(b, (int)d[(k+3)&15]);\n"
    478 "b = max(b, (int)d[(k+4)&15]);\n"
    479 "b = max(b, (int)d[(k+5)&15]);\n"
    480 "b = max(b, (int)d[(k+6)&15]);\n"
    481 "b = max(b, (int)d[(k+7)&15]);\n"
    482 "b = max(b, (int)d[(k+8)&15]);\n"
    483 "b0 = min(b0, max(b, (int)d[k]));\n"
    484 "b0 = min(b0, max(b, (int)d[(k+9)&15]));\n"
    485 "}\n"
    486 "return -b0-1;\n"
    487 "}\n"
    488 "__kernel\n"
    489 "void FAST_findKeypoints(\n"
    490 "__global const uchar * _img, int step, int img_offset,\n"
    491 "int img_rows, int img_cols,\n"
    492 "volatile __global int* kp_loc,\n"
    493 "int max_keypoints, int threshold )\n"
    494 "{\n"
    495 "int j = get_global_id(0) + 3;\n"
    496 "int i = get_global_id(1) + 3;\n"
    497 "if (i < img_rows - 3 && j < img_cols - 3)\n"
    498 "{\n"
    499 "__global const uchar* img = _img + mad24(i, step, j + img_offset);\n"
    500 "int v = img[0], t0 = v - threshold, t1 = v + threshold;\n"
    501 "int k, tofs, v0, v1;\n"
    502 "int m0 = 0, m1 = 0;\n"
    503 "#define UPDATE_MASK(idx, ofs) \\\n"
    504 "tofs = ofs; v0 = img[tofs]; v1 = img[-tofs]; \\\n"
    505 "m0 |= ((v0 < t0) << idx) | ((v1 < t0) << (8 + idx)); \\\n"
    506 "m1 |= ((v0 > t1) << idx) | ((v1 > t1) << (8 + idx))\n"
    507 "UPDATE_MASK(0, 3);\n"
    508 "if( (m0 | m1) == 0 )\n"
    509 "return;\n"
    510 "UPDATE_MASK(2, -step*2+2);\n"
    511 "UPDATE_MASK(4, -step*3);\n"
    512 "UPDATE_MASK(6, -step*2-2);\n"
    513 "#define EVEN_MASK (1+4+16+64)\n"
    514 "if( ((m0 | (m0 >> 8)) & EVEN_MASK) != EVEN_MASK &&\n"
    515 "((m1 | (m1 >> 8)) & EVEN_MASK) != EVEN_MASK )\n"
    516 "return;\n"
    517 "UPDATE_MASK(1, -step+3);\n"
    518 "UPDATE_MASK(3, -step*3+1);\n"
    519 "UPDATE_MASK(5, -step*3-1);\n"
    520 "UPDATE_MASK(7, -step-3);\n"
    521 "if( ((m0 | (m0 >> 8)) & 255) != 255 &&\n"
    522 "((m1 | (m1 >> 8)) & 255) != 255 )\n"
    523 "return;\n"
    524 "m0 |= m0 << 16;\n"
    525 "m1 |= m1 << 16;\n"
    526 "#define CHECK0(i) ((m0 & (511 << i)) == (511 << i))\n"
    527 "#define CHECK1(i) ((m1 & (511 << i)) == (511 << i))\n"
    528 "if( CHECK0(0) + CHECK0(1) + CHECK0(2) + CHECK0(3) +\n"
    529 "CHECK0(4) + CHECK0(5) + CHECK0(6) + CHECK0(7) +\n"
    530 "CHECK0(8) + CHECK0(9) + CHECK0(10) + CHECK0(11) +\n"
    531 "CHECK0(12) + CHECK0(13) + CHECK0(14) + CHECK0(15) +\n"
    532 "CHECK1(0) + CHECK1(1) + CHECK1(2) + CHECK1(3) +\n"
    533 "CHECK1(4) + CHECK1(5) + CHECK1(6) + CHECK1(7) +\n"
    534 "CHECK1(8) + CHECK1(9) + CHECK1(10) + CHECK1(11) +\n"
    535 "CHECK1(12) + CHECK1(13) + CHECK1(14) + CHECK1(15) == 0 )\n"
    536 "return;\n"
    537 "{\n"
    538 "int idx = atomic_inc(kp_loc);\n"
    539 "if( idx < max_keypoints )\n"
    540 "{\n"
    541 "kp_loc[1 + 2*idx] = j;\n"
    542 "kp_loc[2 + 2*idx] = i;\n"
    543 "}\n"
    544 "}\n"
    545 "}\n"
    546 "}\n"
    547 "__kernel\n"
    548 "void FAST_nonmaxSupression(\n"
    549 "__global const int* kp_in, volatile __global int* kp_out,\n"
    550 "__global const uchar * _img, int step, int img_offset,\n"
    551 "int rows, int cols, int counter, int max_keypoints)\n"
    552 "{\n"
    553 "const int idx = get_global_id(0);\n"
    554 "if (idx < counter)\n"
    555 "{\n"
    556 "int x = kp_in[1 + 2*idx];\n"
    557 "int y = kp_in[2 + 2*idx];\n"
    558 "__global const uchar* img = _img + mad24(y, step, x + img_offset);\n"
    559 "int s = cornerScore(img, step);\n"
    560 "if( (x < 4 || s > cornerScore(img-1, step)) +\n"
    561 "(y < 4 || s > cornerScore(img-step, step)) != 2 )\n"
    562 "return;\n"
    563 "if( (x >= cols - 4 || s > cornerScore(img+1, step)) +\n"
    564 "(y >= rows - 4 || s > cornerScore(img+step, step)) +\n"
    565 "(x < 4 || y < 4 || s > cornerScore(img-step-1, step)) +\n"
    566 "(x >= cols - 4 || y < 4 || s > cornerScore(img-step+1, step)) +\n"
    567 "(x < 4 || y >= rows - 4 || s > cornerScore(img+step-1, step)) +\n"
    568 "(x >= cols - 4 || y >= rows - 4 || s > cornerScore(img+step+1, step)) == 6)\n"
    569 "{\n"
    570 "int new_idx = atomic_inc(kp_out);\n"
    571 "if( new_idx < max_keypoints )\n"
    572 "{\n"
    573 "kp_out[1 + 3*new_idx] = x;\n"
    574 "kp_out[2 + 3*new_idx] = y;\n"
    575 "kp_out[3 + 3*new_idx] = s;\n"
    576 "}\n"
    577 "}\n"
    578 "}\n"
    579 "}\n"
    580 , "f5e6f463f21a7ed77bd4d2c753478305"};
    581 ProgramSource fast_oclsrc(fast.programStr);
    582 const struct ProgramEntry orb={"orb",
    583 "#define LAYERINFO_SIZE 1\n"
    584 "#define LAYERINFO_OFS 0\n"
    585 "#define KEYPOINT_SIZE 3\n"
    586 "#define ORIENTED_KEYPOINT_SIZE 4\n"
    587 "#define KEYPOINT_X 0\n"
    588 "#define KEYPOINT_Y 1\n"
    589 "#define KEYPOINT_Z 2\n"
    590 "#define KEYPOINT_ANGLE 3\n"
    591 "#ifdef ORB_RESPONSES\n"
    592 "__kernel void\n"
    593 "ORB_HarrisResponses(__global const uchar* imgbuf, int imgstep, int imgoffset0,\n"
    594 "__global const int* layerinfo, __global const int* keypoints,\n"
    595 "__global float* responses, int nkeypoints )\n"
    596 "{\n"
    597 "int idx = get_global_id(0);\n"
    598 "if( idx < nkeypoints )\n"
    599 "{\n"
    600 "__global const int* kpt = keypoints + idx*KEYPOINT_SIZE;\n"
    601 "__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;\n"
    602 "__global const uchar* img = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +\n"
    603 "(kpt[KEYPOINT_Y] - blockSize/2)*imgstep + (kpt[KEYPOINT_X] - blockSize/2);\n"
    604 "int i, j;\n"
    605 "int a = 0, b = 0, c = 0;\n"
    606 "for( i = 0; i < blockSize; i++, img += imgstep-blockSize )\n"
    607 "{\n"
    608 "for( j = 0; j < blockSize; j++, img++ )\n"
    609 "{\n"
    610 "int Ix = (img[1] - img[-1])*2 + img[-imgstep+1] - img[-imgstep-1] + img[imgstep+1] - img[imgstep-1];\n"
    611 "int Iy = (img[imgstep] - img[-imgstep])*2 + img[imgstep-1] - img[-imgstep-1] + img[imgstep+1] - img[-imgstep+1];\n"
    612 "a += Ix*Ix;\n"
    613 "b += Iy*Iy;\n"
    614 "c += Ix*Iy;\n"
    615 "}\n"
    616 "}\n"
    617 "responses[idx] = ((float)a * b - (float)c * c - HARRIS_K * (float)(a + b) * (a + b))*scale_sq_sq;\n"
    618 "}\n"
    619 "}\n"
    620 "#endif\n"
    621 "#ifdef ORB_ANGLES\n"
    622 "#define _DBL_EPSILON 2.2204460492503131e-16f\n"
    623 "#define atan2_p1 (0.9997878412794807f*57.29577951308232f)\n"
    624 "#define atan2_p3 (-0.3258083974640975f*57.29577951308232f)\n"
    625 "#define atan2_p5 (0.1555786518463281f*57.29577951308232f)\n"
    626 "#define atan2_p7 (-0.04432655554792128f*57.29577951308232f)\n"
    627 "inline float fastAtan2( float y, float x )\n"
    628 "{\n"
    629 "float ax = fabs(x), ay = fabs(y);\n"
    630 "float a, c, c2;\n"
    631 "if( ax >= ay )\n"
    632 "{\n"
    633 "c = ay/(ax + _DBL_EPSILON);\n"
    634 "c2 = c*c;\n"
    635 "a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;\n"
    636 "}\n"
    637 "else\n"
    638 "{\n"
    639 "c = ax/(ay + _DBL_EPSILON);\n"
    640 "c2 = c*c;\n"
    641 "a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;\n"
    642 "}\n"
    643 "if( x < 0 )\n"
    644 "a = 180.f - a;\n"
    645 "if( y < 0 )\n"
    646 "a = 360.f - a;\n"
    647 "return a;\n"
    648 "}\n"
    649 "__kernel void\n"
    650 "ORB_ICAngle(__global const uchar* imgbuf, int imgstep, int imgoffset0,\n"
    651 "__global const int* layerinfo, __global const int* keypoints,\n"
    652 "__global float* responses, const __global int* u_max,\n"
    653 "int nkeypoints, int half_k )\n"
    654 "{\n"
    655 "int idx = get_global_id(0);\n"
    656 "if( idx < nkeypoints )\n"
    657 "{\n"
    658 "__global const int* kpt = keypoints + idx*KEYPOINT_SIZE;\n"
    659 "__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;\n"
    660 "__global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +\n"
    661 "kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];\n"
    662 "int u, v, m_01 = 0, m_10 = 0;\n"
    663 "for( u = -half_k; u <= half_k; u++ )\n"
    664 "m_10 += u * center[u];\n"
    665 "for( v = 1; v <= half_k; v++ )\n"
    666 "{\n"
    667 "int v_sum = 0;\n"
    668 "int d = u_max[v];\n"
    669 "for( u = -d; u <= d; u++ )\n"
    670 "{\n"
    671 "int val_plus = center[u + v*imgstep], val_minus = center[u - v*imgstep];\n"
    672 "v_sum += (val_plus - val_minus);\n"
    673 "m_10 += u * (val_plus + val_minus);\n"
    674 "}\n"
    675 "m_01 += v * v_sum;\n"
    676 "}\n"
    677 "responses[idx] = fastAtan2((float)m_01, (float)m_10);\n"
    678 "}\n"
    679 "}\n"
    680 "#endif\n"
    681 "#ifdef ORB_DESCRIPTORS\n"
    682 "__kernel void\n"
    683 "ORB_computeDescriptor(__global const uchar* imgbuf, int imgstep, int imgoffset0,\n"
    684 "__global const int* layerinfo, __global const int* keypoints,\n"
    685 "__global uchar* _desc, const __global int* pattern,\n"
    686 "int nkeypoints, int dsize )\n"
    687 "{\n"
    688 "int idx = get_global_id(0);\n"
    689 "if( idx < nkeypoints )\n"
    690 "{\n"
    691 "int i;\n"
    692 "__global const int* kpt = keypoints + idx*ORIENTED_KEYPOINT_SIZE;\n"
    693 "__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;\n"
    694 "__global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +\n"
    695 "kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];\n"
    696 "float angle = as_float(kpt[KEYPOINT_ANGLE]);\n"
    697 "angle *= 0.01745329251994329547f;\n"
    698 "float cosa;\n"
    699 "float sina = sincos(angle, &cosa);\n"
    700 "__global uchar* desc = _desc + idx*dsize;\n"
    701 "#define GET_VALUE(idx) \\\n"
    702 "center[mad24(convert_int_rte(pattern[(idx)*2] * sina + pattern[(idx)*2+1] * cosa), imgstep, \\\n"
    703 "convert_int_rte(pattern[(idx)*2] * cosa - pattern[(idx)*2+1] * sina))]\n"
    704 "for( i = 0; i < dsize; i++ )\n"
    705 "{\n"
    706 "int val;\n"
    707 "#if WTA_K == 2\n"
    708 "int t0, t1;\n"
    709 "t0 = GET_VALUE(0); t1 = GET_VALUE(1);\n"
    710 "val = t0 < t1;\n"
    711 "t0 = GET_VALUE(2); t1 = GET_VALUE(3);\n"
    712 "val |= (t0 < t1) << 1;\n"
    713 "t0 = GET_VALUE(4); t1 = GET_VALUE(5);\n"
    714 "val |= (t0 < t1) << 2;\n"
    715 "t0 = GET_VALUE(6); t1 = GET_VALUE(7);\n"
    716 "val |= (t0 < t1) << 3;\n"
    717 "t0 = GET_VALUE(8); t1 = GET_VALUE(9);\n"
    718 "val |= (t0 < t1) << 4;\n"
    719 "t0 = GET_VALUE(10); t1 = GET_VALUE(11);\n"
    720 "val |= (t0 < t1) << 5;\n"
    721 "t0 = GET_VALUE(12); t1 = GET_VALUE(13);\n"
    722 "val |= (t0 < t1) << 6;\n"
    723 "t0 = GET_VALUE(14); t1 = GET_VALUE(15);\n"
    724 "val |= (t0 < t1) << 7;\n"
    725 "pattern += 16*2;\n"
    726 "#elif WTA_K == 3\n"
    727 "int t0, t1, t2;\n"
    728 "t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);\n"
    729 "val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);\n"
    730 "t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);\n"
    731 "val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;\n"
    732 "t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);\n"
    733 "val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;\n"
    734 "t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);\n"
    735 "val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;\n"
    736 "pattern += 12*2;\n"
    737 "#elif WTA_K == 4\n"
    738 "int t0, t1, t2, t3, k;\n"
    739 "int a, b;\n"
    740 "t0 = GET_VALUE(0); t1 = GET_VALUE(1);\n"
    741 "t2 = GET_VALUE(2); t3 = GET_VALUE(3);\n"
    742 "a = 0, b = 2;\n"
    743 "if( t1 > t0 ) t0 = t1, a = 1;\n"
    744 "if( t3 > t2 ) t2 = t3, b = 3;\n"
    745 "k = t0 > t2 ? a : b;\n"
    746 "val = k;\n"
    747 "t0 = GET_VALUE(4); t1 = GET_VALUE(5);\n"
    748 "t2 = GET_VALUE(6); t3 = GET_VALUE(7);\n"
    749 "a = 0, b = 2;\n"
    750 "if( t1 > t0 ) t0 = t1, a = 1;\n"
    751 "if( t3 > t2 ) t2 = t3, b = 3;\n"
    752 "k = t0 > t2 ? a : b;\n"
    753 "val |= k << 2;\n"
    754 "t0 = GET_VALUE(8); t1 = GET_VALUE(9);\n"
    755 "t2 = GET_VALUE(10); t3 = GET_VALUE(11);\n"
    756 "a = 0, b = 2;\n"
    757 "if( t1 > t0 ) t0 = t1, a = 1;\n"
    758 "if( t3 > t2 ) t2 = t3, b = 3;\n"
    759 "k = t0 > t2 ? a : b;\n"
    760 "val |= k << 4;\n"
    761 "t0 = GET_VALUE(12); t1 = GET_VALUE(13);\n"
    762 "t2 = GET_VALUE(14); t3 = GET_VALUE(15);\n"
    763 "a = 0, b = 2;\n"
    764 "if( t1 > t0 ) t0 = t1, a = 1;\n"
    765 "if( t3 > t2 ) t2 = t3, b = 3;\n"
    766 "k = t0 > t2 ? a : b;\n"
    767 "val |= k << 6;\n"
    768 "pattern += 16*2;\n"
    769 "#else\n"
    770 "#error \"unknown/undefined WTA_K value; should be 2, 3 or 4\"\n"
    771 "#endif\n"
    772 "desc[i] = (uchar)val;\n"
    773 "}\n"
    774 "}\n"
    775 "}\n"
    776 "#endif\n"
    777 , "a7c2cfaeda19907b637211b1cc91d253"};
    778 ProgramSource orb_oclsrc(orb.programStr);
    779 }
    780 }}
    781