Home | History | Annotate | Download | only in cuda
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // Redistribution and use in source and binary forms, with or without modification,
     18 // are permitted provided that the following conditions are met:
     19 //
     20 //   * Redistribution's of source code must retain the above copyright notice,
     21 //     this list of conditions and the following disclaimer.
     22 //
     23 //   * Redistribution's in binary form must reproduce the above copyright notice,
     24 //     this list of conditions and the following disclaimer in the documentation
     25 //     and/or other materials provided with the distribution.
     26 //
     27 //   * The name of the copyright holders may not be used to endorse or promote products
     28 //     derived from this software without specific prior written permission.
     29 //
     30 // This software is provided by the copyright holders and contributors "as is" and
     31 // any express or implied warranties, including, but not limited to, the implied
     32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     33 // In no event shall the Intel Corporation or contributors be liable for any direct,
     34 // indirect, incidental, special, exemplary, or consequential damages
     35 // (including, but not limited to, procurement of substitute goods or services;
     36 // loss of use, data, or profits; or business interruption) however caused
     37 // and on any theory of liability, whether in contract, strict liability,
     38 // or tort (including negligence or otherwise) arising in any way out of
     39 // the use of this software, even if advised of the possibility of such damage.
     40 //
     41 //M*/
     42 
     43 #if !defined CUDA_DISABLER
     44 
     45 #include "opencv2/core/cuda/common.hpp"
     46 #include "opencv2/core/cuda/utility.hpp"
     47 #include "opencv2/core/cuda/reduce.hpp"
     48 #include "opencv2/core/cuda/limits.hpp"
     49 #include "opencv2/core/cuda/vec_distance.hpp"
     50 #include "opencv2/core/cuda/datamov_utils.hpp"
     51 #include "opencv2/core/cuda/warp_shuffle.hpp"
     52 
     53 namespace cv { namespace cuda { namespace device
     54 {
     55     namespace bf_knnmatch
     56     {
     57         ///////////////////////////////////////////////////////////////////////////////
     58         // Reduction
     59 
     60         template <int BLOCK_SIZE>
     61         __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
     62                                       int& bestTrainIdx1, int& bestTrainIdx2,
     63                                       float* s_distance, int* s_trainIdx)
     64         {
     65         #if __CUDA_ARCH__ >= 300
     66             (void) s_distance;
     67             (void) s_trainIdx;
     68 
     69             float d1, d2;
     70             int i1, i2;
     71 
     72             #pragma unroll
     73             for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
     74             {
     75                 d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
     76                 d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
     77                 i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
     78                 i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
     79 
     80                 if (bestDistance1 < d1)
     81                 {
     82                     if (d1 < bestDistance2)
     83                     {
     84                         bestDistance2 = d1;
     85                         bestTrainIdx2 = i1;
     86                     }
     87                 }
     88                 else
     89                 {
     90                     bestDistance2 = bestDistance1;
     91                     bestTrainIdx2 = bestTrainIdx1;
     92 
     93                     bestDistance1 = d1;
     94                     bestTrainIdx1 = i1;
     95 
     96                     if (d2 < bestDistance2)
     97                     {
     98                         bestDistance2 = d2;
     99                         bestTrainIdx2 = i2;
    100                     }
    101                 }
    102             }
    103         #else
    104             float myBestDistance1 = numeric_limits<float>::max();
    105             float myBestDistance2 = numeric_limits<float>::max();
    106             int myBestTrainIdx1 = -1;
    107             int myBestTrainIdx2 = -1;
    108 
    109             s_distance += threadIdx.y * BLOCK_SIZE;
    110             s_trainIdx += threadIdx.y * BLOCK_SIZE;
    111 
    112             s_distance[threadIdx.x] = bestDistance1;
    113             s_trainIdx[threadIdx.x] = bestTrainIdx1;
    114 
    115             __syncthreads();
    116 
    117             if (threadIdx.x == 0)
    118             {
    119                 #pragma unroll
    120                 for (int i = 0; i < BLOCK_SIZE; ++i)
    121                 {
    122                     float val = s_distance[i];
    123 
    124                     if (val < myBestDistance1)
    125                     {
    126                         myBestDistance2 = myBestDistance1;
    127                         myBestTrainIdx2 = myBestTrainIdx1;
    128 
    129                         myBestDistance1 = val;
    130                         myBestTrainIdx1 = s_trainIdx[i];
    131                     }
    132                     else if (val < myBestDistance2)
    133                     {
    134                         myBestDistance2 = val;
    135                         myBestTrainIdx2 = s_trainIdx[i];
    136                     }
    137                 }
    138             }
    139 
    140             __syncthreads();
    141 
    142             s_distance[threadIdx.x] = bestDistance2;
    143             s_trainIdx[threadIdx.x] = bestTrainIdx2;
    144 
    145             __syncthreads();
    146 
    147             if (threadIdx.x == 0)
    148             {
    149                 #pragma unroll
    150                 for (int i = 0; i < BLOCK_SIZE; ++i)
    151                 {
    152                     float val = s_distance[i];
    153 
    154                     if (val < myBestDistance2)
    155                     {
    156                         myBestDistance2 = val;
    157                         myBestTrainIdx2 = s_trainIdx[i];
    158                     }
    159                 }
    160             }
    161 
    162             bestDistance1 = myBestDistance1;
    163             bestDistance2 = myBestDistance2;
    164 
    165             bestTrainIdx1 = myBestTrainIdx1;
    166             bestTrainIdx2 = myBestTrainIdx2;
    167         #endif
    168         }
    169 
    170         template <int BLOCK_SIZE>
    171         __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
    172                                        int& bestTrainIdx1, int& bestTrainIdx2,
    173                                        int& bestImgIdx1, int& bestImgIdx2,
    174                                        float* s_distance, int* s_trainIdx, int* s_imgIdx)
    175         {
    176         #if __CUDA_ARCH__ >= 300
    177             (void) s_distance;
    178             (void) s_trainIdx;
    179             (void) s_imgIdx;
    180 
    181             float d1, d2;
    182             int i1, i2;
    183             int j1, j2;
    184 
    185             #pragma unroll
    186             for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
    187             {
    188                 d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
    189                 d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
    190                 i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
    191                 i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
    192                 j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
    193                 j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
    194 
    195                 if (bestDistance1 < d1)
    196                 {
    197                     if (d1 < bestDistance2)
    198                     {
    199                         bestDistance2 = d1;
    200                         bestTrainIdx2 = i1;
    201                         bestImgIdx2 = j1;
    202                     }
    203                 }
    204                 else
    205                 {
    206                     bestDistance2 = bestDistance1;
    207                     bestTrainIdx2 = bestTrainIdx1;
    208                     bestImgIdx2 = bestImgIdx1;
    209 
    210                     bestDistance1 = d1;
    211                     bestTrainIdx1 = i1;
    212                     bestImgIdx1 = j1;
    213 
    214                     if (d2 < bestDistance2)
    215                     {
    216                         bestDistance2 = d2;
    217                         bestTrainIdx2 = i2;
    218                         bestImgIdx2 = j2;
    219                     }
    220                 }
    221             }
    222         #else
    223             float myBestDistance1 = numeric_limits<float>::max();
    224             float myBestDistance2 = numeric_limits<float>::max();
    225             int myBestTrainIdx1 = -1;
    226             int myBestTrainIdx2 = -1;
    227             int myBestImgIdx1 = -1;
    228             int myBestImgIdx2 = -1;
    229 
    230             s_distance += threadIdx.y * BLOCK_SIZE;
    231             s_trainIdx += threadIdx.y * BLOCK_SIZE;
    232             s_imgIdx   += threadIdx.y * BLOCK_SIZE;
    233 
    234             s_distance[threadIdx.x] = bestDistance1;
    235             s_trainIdx[threadIdx.x] = bestTrainIdx1;
    236             s_imgIdx[threadIdx.x]   = bestImgIdx1;
    237 
    238             __syncthreads();
    239 
    240             if (threadIdx.x == 0)
    241             {
    242                 #pragma unroll
    243                 for (int i = 0; i < BLOCK_SIZE; ++i)
    244                 {
    245                     float val = s_distance[i];
    246 
    247                     if (val < myBestDistance1)
    248                     {
    249                         myBestDistance2 = myBestDistance1;
    250                         myBestTrainIdx2 = myBestTrainIdx1;
    251                         myBestImgIdx2   = myBestImgIdx1;
    252 
    253                         myBestDistance1 = val;
    254                         myBestTrainIdx1 = s_trainIdx[i];
    255                         myBestImgIdx1   = s_imgIdx[i];
    256                     }
    257                     else if (val < myBestDistance2)
    258                     {
    259                         myBestDistance2 = val;
    260                         myBestTrainIdx2 = s_trainIdx[i];
    261                         myBestImgIdx2   = s_imgIdx[i];
    262                     }
    263                 }
    264             }
    265 
    266             __syncthreads();
    267 
    268             s_distance[threadIdx.x] = bestDistance2;
    269             s_trainIdx[threadIdx.x] = bestTrainIdx2;
    270             s_imgIdx[threadIdx.x]   = bestImgIdx2;
    271 
    272             __syncthreads();
    273 
    274             if (threadIdx.x == 0)
    275             {
    276                 #pragma unroll
    277                 for (int i = 0; i < BLOCK_SIZE; ++i)
    278                 {
    279                     float val = s_distance[i];
    280 
    281                     if (val < myBestDistance2)
    282                     {
    283                         myBestDistance2 = val;
    284                         myBestTrainIdx2 = s_trainIdx[i];
    285                         myBestImgIdx2   = s_imgIdx[i];
    286                     }
    287                 }
    288             }
    289 
    290             bestDistance1 = myBestDistance1;
    291             bestDistance2 = myBestDistance2;
    292 
    293             bestTrainIdx1 = myBestTrainIdx1;
    294             bestTrainIdx2 = myBestTrainIdx2;
    295 
    296             bestImgIdx1 = myBestImgIdx1;
    297             bestImgIdx2 = myBestImgIdx2;
    298         #endif
    299         }
    300 
    301         ///////////////////////////////////////////////////////////////////////////////
    302         // Match Unrolled Cached
    303 
    304         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
    305         __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
    306         {
    307             #pragma unroll
    308             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    309             {
    310                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
    311                 s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
    312             }
    313         }
    314 
    315         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    316         __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
    317                                            typename Dist::value_type* s_query, typename Dist::value_type* s_train,
    318                                            float& bestDistance1, float& bestDistance2,
    319                                            int& bestTrainIdx1, int& bestTrainIdx2,
    320                                            int& bestImgIdx1, int& bestImgIdx2)
    321         {
    322             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    323             {
    324                 Dist dist;
    325 
    326                 #pragma unroll
    327                 for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    328                 {
    329                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
    330 
    331                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
    332 
    333                     if (loadX < train.cols)
    334                     {
    335                         T val;
    336 
    337                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
    338                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
    339                     }
    340 
    341                     __syncthreads();
    342 
    343                     #pragma unroll
    344                     for (int j = 0; j < BLOCK_SIZE; ++j)
    345                         dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
    346 
    347                     __syncthreads();
    348                 }
    349 
    350                 typename Dist::result_type distVal = dist;
    351 
    352                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
    353 
    354                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
    355                 {
    356                     if (distVal < bestDistance1)
    357                     {
    358                         bestImgIdx2   = bestImgIdx1;
    359                         bestDistance2 = bestDistance1;
    360                         bestTrainIdx2 = bestTrainIdx1;
    361 
    362                         bestImgIdx1   = imgIdx;
    363                         bestDistance1 = distVal;
    364                         bestTrainIdx1 = trainIdx;
    365                     }
    366                     else if (distVal < bestDistance2)
    367                     {
    368                         bestImgIdx2   = imgIdx;
    369                         bestDistance2 = distVal;
    370                         bestTrainIdx2 = trainIdx;
    371                     }
    372                 }
    373             }
    374         }
    375 
    376         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    377         __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
    378         {
    379             extern __shared__ int smem[];
    380 
    381             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
    382 
    383             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    384             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
    385 
    386             loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
    387 
    388             float myBestDistance1 = numeric_limits<float>::max();
    389             float myBestDistance2 = numeric_limits<float>::max();
    390             int myBestTrainIdx1 = -1;
    391             int myBestTrainIdx2 = -1;
    392 
    393             loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
    394 
    395             __syncthreads();
    396 
    397             float* s_distance = (float*)(smem);
    398             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    399 
    400             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
    401 
    402             if (queryIdx < query.rows && threadIdx.x == 0)
    403             {
    404                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
    405                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    406             }
    407         }
    408 
    409         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    410         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
    411                                  const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
    412                                  cudaStream_t stream)
    413         {
    414             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    415             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
    416 
    417             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    418 
    419             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    420             cudaSafeCall( cudaGetLastError() );
    421 
    422             if (stream == 0)
    423                 cudaSafeCall( cudaDeviceSynchronize() );
    424         }
    425 
    426         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    427         __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
    428         {
    429             extern __shared__ int smem[];
    430 
    431             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
    432 
    433             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    434             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
    435 
    436             loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
    437 
    438             float myBestDistance1 = numeric_limits<float>::max();
    439             float myBestDistance2 = numeric_limits<float>::max();
    440             int myBestTrainIdx1 = -1;
    441             int myBestTrainIdx2 = -1;
    442             int myBestImgIdx1 = -1;
    443             int myBestImgIdx2 = -1;
    444 
    445             Mask m = mask;
    446 
    447             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    448             {
    449                 const PtrStepSz<T> train = trains[imgIdx];
    450                 m.next();
    451                 loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    452             }
    453 
    454             __syncthreads();
    455 
    456             float* s_distance = (float*)(smem);
    457             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    458             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
    459 
    460             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
    461 
    462             if (queryIdx < query.rows && threadIdx.x == 0)
    463             {
    464                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
    465                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
    466                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    467             }
    468         }
    469 
    470         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    471         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
    472                                  const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
    473                                  cudaStream_t stream)
    474         {
    475             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    476             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
    477 
    478             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    479 
    480             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    481             cudaSafeCall( cudaGetLastError() );
    482 
    483             if (stream == 0)
    484                 cudaSafeCall( cudaDeviceSynchronize() );
    485         }
    486 
    487         ///////////////////////////////////////////////////////////////////////////////
    488         // Match Unrolled
    489 
    490         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    491         __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
    492                                      typename Dist::value_type* s_query, typename Dist::value_type* s_train,
    493                                      float& bestDistance1, float& bestDistance2,
    494                                      int& bestTrainIdx1, int& bestTrainIdx2,
    495                                      int& bestImgIdx1, int& bestImgIdx2)
    496         {
    497             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    498             {
    499                 Dist dist;
    500 
    501                 #pragma unroll
    502                 for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    503                 {
    504                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
    505 
    506                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
    507                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
    508 
    509                     if (loadX < query.cols)
    510                     {
    511                         T val;
    512 
    513                         ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
    514                         s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
    515 
    516                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
    517                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
    518                     }
    519 
    520                     __syncthreads();
    521 
    522                     #pragma unroll
    523                     for (int j = 0; j < BLOCK_SIZE; ++j)
    524                         dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
    525 
    526                     __syncthreads();
    527                 }
    528 
    529                 typename Dist::result_type distVal = dist;
    530 
    531                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
    532 
    533                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
    534                 {
    535                     if (distVal < bestDistance1)
    536                     {
    537                         bestImgIdx2   = bestImgIdx1;
    538                         bestDistance2 = bestDistance1;
    539                         bestTrainIdx2 = bestTrainIdx1;
    540 
    541                         bestImgIdx1   = imgIdx;
    542                         bestDistance1 = distVal;
    543                         bestTrainIdx1 = trainIdx;
    544                     }
    545                     else if (distVal < bestDistance2)
    546                     {
    547                         bestImgIdx2   = imgIdx;
    548                         bestDistance2 = distVal;
    549                         bestTrainIdx2 = trainIdx;
    550                     }
    551                 }
    552             }
    553         }
    554 
    555         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    556         __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
    557         {
    558             extern __shared__ int smem[];
    559 
    560             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
    561 
    562             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    563             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    564 
    565             float myBestDistance1 = numeric_limits<float>::max();
    566             float myBestDistance2 = numeric_limits<float>::max();
    567             int myBestTrainIdx1 = -1;
    568             int myBestTrainIdx2 = -1;
    569 
    570             loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
    571 
    572             __syncthreads();
    573 
    574             float* s_distance = (float*)(smem);
    575             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    576 
    577             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
    578 
    579             if (queryIdx < query.rows && threadIdx.x == 0)
    580             {
    581                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
    582                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    583             }
    584         }
    585 
    586         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    587         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
    588                            const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
    589                            cudaStream_t stream)
    590         {
    591             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    592             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
    593 
    594             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    595 
    596             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    597             cudaSafeCall( cudaGetLastError() );
    598 
    599             if (stream == 0)
    600                 cudaSafeCall( cudaDeviceSynchronize() );
    601         }
    602 
    603         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    604         __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
    605         {
    606             extern __shared__ int smem[];
    607 
    608             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
    609 
    610             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    611             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    612 
    613             float myBestDistance1 = numeric_limits<float>::max();
    614             float myBestDistance2 = numeric_limits<float>::max();
    615             int myBestTrainIdx1 = -1;
    616             int myBestTrainIdx2 = -1;
    617             int myBestImgIdx1 = -1;
    618             int myBestImgIdx2 = -1;
    619 
    620             Mask m = mask;
    621 
    622             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    623             {
    624                 const PtrStepSz<T> train = trains[imgIdx];
    625                 m.next();
    626                 loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    627             }
    628 
    629             __syncthreads();
    630 
    631             float* s_distance = (float*)(smem);
    632             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    633             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
    634 
    635             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
    636 
    637             if (queryIdx < query.rows && threadIdx.x == 0)
    638             {
    639                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
    640                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
    641                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    642             }
    643         }
    644 
    645         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    646         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
    647                            const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
    648                            cudaStream_t stream)
    649         {
    650             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    651             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
    652 
    653             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    654 
    655             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    656             cudaSafeCall( cudaGetLastError() );
    657 
    658             if (stream == 0)
    659                 cudaSafeCall( cudaDeviceSynchronize() );
    660         }
    661 
    662         ///////////////////////////////////////////////////////////////////////////////
    663         // Match
    664 
    665         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    666         __device__ void loop(int queryIdx, const PtrStepSz<T>& query, int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
    667                              typename Dist::value_type* s_query, typename Dist::value_type* s_train,
    668                              float& bestDistance1, float& bestDistance2,
    669                              int& bestTrainIdx1, int& bestTrainIdx2,
    670                              int& bestImgIdx1, int& bestImgIdx2)
    671         {
    672             for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    673             {
    674                 Dist dist;
    675 
    676                 for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
    677                 {
    678                     const int loadX = threadIdx.x + i * BLOCK_SIZE;
    679 
    680                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
    681                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
    682 
    683                     if (loadX < query.cols)
    684                     {
    685                         T val;
    686 
    687                         ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
    688                         s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;
    689 
    690                         ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
    691                         s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
    692                     }
    693 
    694                     __syncthreads();
    695 
    696                     #pragma unroll
    697                     for (int j = 0; j < BLOCK_SIZE; ++j)
    698                         dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
    699 
    700                     __syncthreads();
    701                 }
    702 
    703                 typename Dist::result_type distVal = dist;
    704 
    705                 const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
    706 
    707                 if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
    708                 {
    709                     if (distVal < bestDistance1)
    710                     {
    711                         bestImgIdx2   = bestImgIdx1;
    712                         bestDistance2 = bestDistance1;
    713                         bestTrainIdx2 = bestTrainIdx1;
    714 
    715                         bestImgIdx1   = imgIdx;
    716                         bestDistance1 = distVal;
    717                         bestTrainIdx1 = trainIdx;
    718                     }
    719                     else if (distVal < bestDistance2)
    720                     {
    721                         bestImgIdx2   = imgIdx;
    722                         bestDistance2 = distVal;
    723                         bestTrainIdx2 = trainIdx;
    724                     }
    725                 }
    726             }
    727         }
    728 
    729         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    730         __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
    731         {
    732             extern __shared__ int smem[];
    733 
    734             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
    735 
    736             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    737             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    738 
    739             float myBestDistance1 = numeric_limits<float>::max();
    740             float myBestDistance2 = numeric_limits<float>::max();
    741             int myBestTrainIdx1 = -1;
    742             int myBestTrainIdx2 = -1;
    743 
    744             loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
    745 
    746             __syncthreads();
    747 
    748             float* s_distance = (float*)(smem);
    749             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    750 
    751             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
    752 
    753             if (queryIdx < query.rows && threadIdx.x == 0)
    754             {
    755                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
    756                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    757             }
    758         }
    759 
    760         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    761         void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
    762                    const PtrStepSz<int2>& trainIdx, const PtrStepSz<float2>& distance,
    763                    cudaStream_t stream)
    764         {
    765             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    766             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
    767 
    768             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    769 
    770             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    771             cudaSafeCall( cudaGetLastError() );
    772 
    773             if (stream == 0)
    774                 cudaSafeCall( cudaDeviceSynchronize() );
    775         }
    776 
    777         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    778         __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
    779         {
    780             extern __shared__ int smem[];
    781 
    782             const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
    783 
    784             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    785             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    786 
    787             float myBestDistance1 = numeric_limits<float>::max();
    788             float myBestDistance2 = numeric_limits<float>::max();
    789             int myBestTrainIdx1 = -1;
    790             int myBestTrainIdx2 = -1;
    791             int myBestImgIdx1 = -1;
    792             int myBestImgIdx2 = -1;
    793 
    794             Mask m = mask;
    795 
    796             for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    797             {
    798                 const PtrStepSz<T> train = trains[imgIdx];
    799                 m.next();
    800                 loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
    801             }
    802 
    803             __syncthreads();
    804 
    805             float* s_distance = (float*)(smem);
    806             int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    807             int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
    808 
    809             findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
    810 
    811             if (queryIdx < query.rows && threadIdx.x == 0)
    812             {
    813                 bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
    814                 bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
    815                 bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
    816             }
    817         }
    818 
    819         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    820         void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
    821                    const PtrStepSz<int2>& trainIdx, const PtrStepSz<int2>& imgIdx, const PtrStepSz<float2>& distance,
    822                    cudaStream_t stream)
    823         {
    824             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    825             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
    826 
    827             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    828 
    829             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    830             cudaSafeCall( cudaGetLastError() );
    831 
    832             if (stream == 0)
    833                 cudaSafeCall( cudaDeviceSynchronize() );
    834         }
    835 
    836         ///////////////////////////////////////////////////////////////////////////////
    837         // knnMatch 2 dispatcher
    838 
    839         template <typename Dist, typename T, typename Mask>
    840         void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
    841                               const PtrStepSzb& trainIdx, const PtrStepSzb& distance,
    842                               cudaStream_t stream)
    843         {
    844             if (query.cols <= 64)
    845             {
    846                 matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    847             }
    848             else if (query.cols <= 128)
    849             {
    850                 matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    851             }
    852             /*else if (query.cols <= 256)
    853             {
    854                 matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    855             }
    856             else if (query.cols <= 512)
    857             {
    858                 matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    859             }
    860             else if (query.cols <= 1024)
    861             {
    862                 matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    863             }*/
    864             else
    865             {
    866                 match<16, Dist>(query, train, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    867             }
    868         }
    869 
    870         template <typename Dist, typename T, typename Mask>
    871         void match2Dispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
    872                               const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
    873                               cudaStream_t stream)
    874         {
    875             if (query.cols <= 64)
    876             {
    877                 matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    878             }
    879             else if (query.cols <= 128)
    880             {
    881                 matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    882             }
    883             /*else if (query.cols <= 256)
    884             {
    885                 matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    886             }
    887             else if (query.cols <= 512)
    888             {
    889                 matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    890             }
    891             else if (query.cols <= 1024)
    892             {
    893                 matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    894             }*/
    895             else
    896             {
    897                 match<16, Dist>(query, trains, n, mask, static_cast< PtrStepSz<int2> >(trainIdx), static_cast< PtrStepSz<int2> >(imgIdx), static_cast< PtrStepSz<float2> > (distance), stream);
    898             }
    899         }
    900 
    901         ///////////////////////////////////////////////////////////////////////////////
    902         // Calc distance kernel
    903 
    904         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    905         __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
    906         {
    907             extern __shared__ int smem[];
    908 
    909             const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    910             const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    911 
    912             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    913             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    914 
    915             Dist dist;
    916 
    917             #pragma unroll
    918             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    919             {
    920                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
    921 
    922                 if (loadX < query.cols)
    923                 {
    924                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
    925                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
    926                 }
    927                 else
    928                 {
    929                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
    930                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
    931                 }
    932 
    933                 __syncthreads();
    934 
    935                 #pragma unroll
    936                 for (int j = 0; j < BLOCK_SIZE; ++j)
    937                     dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
    938 
    939                 __syncthreads();
    940             }
    941 
    942             if (queryIdx < query.rows && trainIdx < train.rows)
    943             {
    944                 float distVal = numeric_limits<float>::max();
    945 
    946                 if (mask(queryIdx, trainIdx))
    947                     distVal = (typename Dist::result_type)dist;
    948 
    949                 allDist.ptr(queryIdx)[trainIdx] = distVal;
    950             }
    951         }
    952 
    953         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    954         void calcDistanceUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
    955         {
    956             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    957             const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
    958 
    959             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    960 
    961             calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
    962             cudaSafeCall( cudaGetLastError() );
    963 
    964             if (stream == 0)
    965                 cudaSafeCall( cudaDeviceSynchronize() );
    966         }
    967 
    968         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    969         __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
    970         {
    971             extern __shared__ int smem[];
    972 
    973             const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    974             const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    975 
    976             typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    977             typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    978 
    979             Dist dist;
    980 
    981             for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
    982             {
    983                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
    984 
    985                 if (loadX < query.cols)
    986                 {
    987                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(::min(queryIdx, query.rows - 1))[loadX];
    988                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(::min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
    989                 }
    990                 else
    991                 {
    992                     s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
    993                     s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
    994                 }
    995 
    996                 __syncthreads();
    997 
    998                 #pragma unroll
    999                 for (int j = 0; j < BLOCK_SIZE; ++j)
   1000                     dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
   1001 
   1002                 __syncthreads();
   1003             }
   1004 
   1005             if (queryIdx < query.rows && trainIdx < train.rows)
   1006             {
   1007                 float distVal = numeric_limits<float>::max();
   1008 
   1009                 if (mask(queryIdx, trainIdx))
   1010                     distVal = (typename Dist::result_type)dist;
   1011 
   1012                 allDist.ptr(queryIdx)[trainIdx] = distVal;
   1013             }
   1014         }
   1015 
   1016         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
   1017         void calcDistance(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask, const PtrStepSzf& allDist, cudaStream_t stream)
   1018         {
   1019             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
   1020             const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
   1021 
   1022             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
   1023 
   1024             calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
   1025             cudaSafeCall( cudaGetLastError() );
   1026 
   1027             if (stream == 0)
   1028                 cudaSafeCall( cudaDeviceSynchronize() );
   1029         }
   1030 
   1031         ///////////////////////////////////////////////////////////////////////////////
   1032         // Calc Distance dispatcher
   1033 
   1034         template <typename Dist, typename T, typename Mask>
   1035         void calcDistanceDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
   1036                                     const PtrStepSzf& allDist,
   1037                                     cudaStream_t stream)
   1038         {
   1039             if (query.cols <= 64)
   1040             {
   1041                 calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
   1042             }
   1043             else if (query.cols <= 128)
   1044             {
   1045                 calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
   1046             }
   1047             /*else if (query.cols <= 256)
   1048             {
   1049                 calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
   1050             }
   1051             else if (query.cols <= 512)
   1052             {
   1053                 calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
   1054             }
   1055             else if (query.cols <= 1024)
   1056             {
   1057                 calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
   1058             }*/
   1059             else
   1060             {
   1061                 calcDistance<16, Dist>(query, train, mask, allDist, stream);
   1062             }
   1063         }
   1064 
   1065         ///////////////////////////////////////////////////////////////////////////////
   1066         // find knn match kernel
   1067 
   1068         template <int BLOCK_SIZE>
   1069         __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
   1070         {
   1071             const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
   1072             __shared__ float s_dist[SMEM_SIZE];
   1073             __shared__ int s_trainIdx[SMEM_SIZE];
   1074 
   1075             const int queryIdx = blockIdx.x;
   1076 
   1077             float* allDistRow = allDist.ptr(queryIdx);
   1078 
   1079             float dist = numeric_limits<float>::max();
   1080             int bestIdx = -1;
   1081 
   1082             for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
   1083             {
   1084                 float reg = allDistRow[i];
   1085                 if (reg < dist)
   1086                 {
   1087                     dist = reg;
   1088                     bestIdx = i;
   1089                 }
   1090             }
   1091 
   1092             s_dist[threadIdx.x] = dist;
   1093             s_trainIdx[threadIdx.x] = bestIdx;
   1094             __syncthreads();
   1095 
   1096             reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
   1097 
   1098             if (threadIdx.x == 0)
   1099             {
   1100                 if (dist < numeric_limits<float>::max())
   1101                 {
   1102                     allDistRow[bestIdx] = numeric_limits<float>::max();
   1103                     trainIdx.ptr(queryIdx)[i] = bestIdx;
   1104                     distance.ptr(queryIdx)[i] = dist;
   1105                 }
   1106             }
   1107         }
   1108 
   1109         template <int BLOCK_SIZE>
   1110         void findKnnMatch(int k, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, const PtrStepSzf& allDist, cudaStream_t stream)
   1111         {
   1112             const dim3 block(BLOCK_SIZE, 1, 1);
   1113             const dim3 grid(trainIdx.rows, 1, 1);
   1114 
   1115             for (int i = 0; i < k; ++i)
   1116             {
   1117                 findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
   1118                 cudaSafeCall( cudaGetLastError() );
   1119             }
   1120 
   1121             if (stream == 0)
   1122                 cudaSafeCall( cudaDeviceSynchronize() );
   1123         }
   1124 
   1125         void findKnnMatchDispatcher(int k, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream)
   1126         {
   1127             findKnnMatch<256>(k, static_cast<PtrStepSzi>(trainIdx), static_cast<PtrStepSzf>(distance), allDist, stream);
   1128         }
   1129 
   1130         ///////////////////////////////////////////////////////////////////////////////
   1131         // knn match Dispatcher
   1132 
   1133         template <typename Dist, typename T, typename Mask>
   1134         void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, int k, const Mask& mask,
   1135             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
   1136             cudaStream_t stream)
   1137         {
   1138             if (k == 2)
   1139             {
   1140                 match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, stream);
   1141             }
   1142             else
   1143             {
   1144                 calcDistanceDispatcher<Dist>(query, train, mask, allDist, stream);
   1145                 findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
   1146             }
   1147         }
   1148 
   1149         ///////////////////////////////////////////////////////////////////////////////
   1150         // knn match caller
   1151 
   1152         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
   1153             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
   1154             cudaStream_t stream)
   1155         {
   1156             if (mask.data)
   1157                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
   1158             else
   1159                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
   1160         }
   1161 
   1162         template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1163         //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1164         template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1165         template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1166         template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1167         template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1168 
   1169         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
   1170             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
   1171             cudaStream_t stream)
   1172         {
   1173             if (mask.data)
   1174                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
   1175             else
   1176                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
   1177         }
   1178 
   1179         //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1180         //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1181         //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1182         //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1183         //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1184         template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1185 
   1186         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, int k, const PtrStepSzb& mask,
   1187             const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist,
   1188             cudaStream_t stream)
   1189         {
   1190             if (mask.data)
   1191                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, stream);
   1192             else
   1193                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, stream);
   1194         }
   1195 
   1196         template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1197         //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1198         template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1199         //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1200         template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, int k, const PtrStepSzb& mask, const PtrStepSzb& trainIdx, const PtrStepSzb& distance, const PtrStepSzf& allDist, cudaStream_t stream);
   1201 
   1202         template <typename T> void match2L1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
   1203             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
   1204             cudaStream_t stream)
   1205         {
   1206             if (masks.data)
   1207                 match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
   1208             else
   1209                 match2Dispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance,  stream);
   1210         }
   1211 
   1212         template void match2L1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1213         //template void match2L1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1214         template void match2L1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1215         template void match2L1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1216         template void match2L1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1217         template void match2L1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1218 
   1219         template <typename T> void match2L2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
   1220             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
   1221             cudaStream_t stream)
   1222         {
   1223             if (masks.data)
   1224                 match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
   1225             else
   1226                 match2Dispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
   1227         }
   1228 
   1229         //template void match2L2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1230         //template void match2L2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1231         //template void match2L2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1232         //template void match2L2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1233         //template void match2L2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1234         template void match2L2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1235 
   1236         template <typename T> void match2Hamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
   1237             const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance,
   1238             cudaStream_t stream)
   1239         {
   1240             if (masks.data)
   1241                 match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, stream);
   1242             else
   1243                 match2Dispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, stream);
   1244         }
   1245 
   1246         template void match2Hamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1247         //template void match2Hamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1248         template void match2Hamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1249         //template void match2Hamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1250         template void match2Hamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzb& trainIdx, const PtrStepSzb& imgIdx, const PtrStepSzb& distance, cudaStream_t stream);
   1251     } // namespace bf_knnmatch
   1252 }}} // namespace cv { namespace cuda { namespace cudev {
   1253 
   1254 
   1255 #endif /* CUDA_DISABLER */
   1256