/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_distance.hpp"
#include "opencv2/core/cuda/datamov_utils.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace bf_match
    {
        ///////////////////////////////////////////////////////////////////////////////
        // Reduction

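        // Each block row (threadIdx.y) processes one query descriptor. After the matching
        // phase, every thread of that row holds a local best candidate; the candidates are
        // staged in shared memory and combined by a key/value min-reduction on the distance.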
        template <int BLOCK_SIZE>
        __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
        {
            s_distance += threadIdx.y * BLOCK_SIZE;
            s_trainIdx += threadIdx.y * BLOCK_SIZE;

            reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
        }

        template <int BLOCK_SIZE>
        __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
        {
            s_distance += threadIdx.y * BLOCK_SIZE;
            s_trainIdx += threadIdx.y * BLOCK_SIZE;
            s_imgIdx   += threadIdx.y * BLOCK_SIZE;

            reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
        }


        ///////////////////////////////////////////////////////////////////////////////
        // Match Unrolled Cached
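        // "Cached" variant: each query descriptor is staged in shared memory once and
        // reused across all train tiles. MAX_DESC_LEN is assumed to be a multiple of
        // BLOCK_SIZE, so the unrolled load loop covers the descriptor exactly.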
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
        __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
        {
            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;
                s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
            }
        }

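        // Walks the train matrix in BLOCK_SIZE-row tiles: each tile of train descriptors
        // is staged (transposed) in shared memory, every thread accumulates the distance
        // between its query row and its train column, and the running best match is
        // updated once per tile.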
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                           float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < train.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

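            // myBestTrainIdx is deliberately passed for both the train-index and the
            // (unused) image-index outputs: inside the loop bestImgIdx is written before
            // bestTrainIdx, so the aliased variable ends up holding the train index.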
            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                                 const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

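            // One buffer serves both phases: matching needs BLOCK_SIZE * MAX_DESC_LEN
            // elements for the cached queries plus BLOCK_SIZE * BLOCK_SIZE for the train
            // tile, while the reduction needs two BLOCK_SIZE * BLOCK_SIZE arrays
            // (distances and train indices); the max() covers whichever is larger.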
            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                            int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                                 const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

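            // Same two-phase reuse as above, but the reduction now needs three
            // BLOCK_SIZE x BLOCK_SIZE arrays (distance, train index, image index),
            // hence the 2 * BLOCK_SIZE lower bound on the query-cache term.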
            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match Unrolled

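        // Non-cached variant: both the query and the train tile are re-staged from global
        // memory on every iteration, so only 2 * BLOCK_SIZE * BLOCK_SIZE shared elements
        // are needed during matching.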
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                     float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

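            // As in matchUnrolledCached, myBestTrainIdx doubles as the unused image-index
            // output; the write order inside loopUnrolled keeps the train index intact.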
            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                           const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                           const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match

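        // Generic fallback for descriptor lengths above the unrolled maximum: the inner
        // loop over descriptor columns is bounded by query.cols at run time instead of
        // being unrolled against MAX_DESC_LEN.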
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

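            // myBestTrainIdx again serves as the dummy image-index output (see the note
            // in matchUnrolledCached).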
            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                   const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            Mask m = mask;
            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                   const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match dispatcher

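        // Selects a kernel from the descriptor length: the fully unrolled, query-cached
        // kernels handle descriptors up to 128 elements; the 256/512/1024 specializations
        // are kept for reference but disabled; everything else falls back to the generic
        // loop.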
        template <typename Dist, typename T, typename Mask>
        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                             cudaStream_t stream)
        {
            if (query.cols <= 64)
            {
                matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
            }
            else if (query.cols <= 128)
            {
                matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
            }
            /*else if (query.cols <= 256)
            {
                matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
            }
            else if (query.cols <= 512)
            {
                matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
            }
            else if (query.cols <= 1024)
            {
                matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
            }*/
            else
            {
                match<16, Dist>(query, train, mask, trainIdx, distance, stream);
            }
        }

        template <typename Dist, typename T, typename Mask>
        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                             cudaStream_t stream)
        {
            if (query.cols <= 64)
            {
                matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }
            else if (query.cols <= 128)
            {
                matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }
            /*else if (query.cols <= 256)
            {
                matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }
            else if (query.cols <= 512)
            {
                matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }
            else if (query.cols <= 1024)
            {
                matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }*/
            else
            {
                match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match caller

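        // Host entry points: each one instantiates the dispatcher with the matching
        // distance functor (L1Dist, L2Dist or HammingDist) and wraps the optional mask
        // in SingleMask, or in the no-op WithOutMask when none is given.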
        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                               const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                               cudaStream_t stream)
        {
            if (mask.data)
            {
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
                    trainIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
                    trainIdx, distance,
                    stream);
            }
        }

        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);

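        // A minimal host-side sketch of how a caller might drive matchL1_gpu (an
        // illustration only, not part of this translation unit; it assumes CV_8U
        // descriptor matrices already uploaded as cv::cuda::GpuMat, which converts
        // implicitly to the PtrStepSz wrappers):
        //
        //   cv::cuda::GpuMat query, train;                        // nQuery x descLen, nTrain x descLen, CV_8U
        //   cv::cuda::GpuMat trainIdx(1, query.rows, CV_32SC1);   // best train index per query
        //   cv::cuda::GpuMat distance(1, query.rows, CV_32FC1);   // corresponding distance
        //   matchL1_gpu<uchar>(query, train, PtrStepSzb(), trainIdx, distance, 0);  // empty mask, default stream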
        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                               const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                               cudaStream_t stream)
        {
            if (mask.data)
            {
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
                    trainIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
                    trainIdx, distance,
                    stream);
            }
        }

        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);

        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                                    const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                                    cudaStream_t stream)
        {
            if (mask.data)
            {
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
                    trainIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
                    trainIdx, distance,
                    stream);
            }
        }

        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);

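        // Multi-image variants: 'trains' packs the PtrStepSz<T> headers of all train
        // images into a single row of device memory (trains.cols is the image count),
        // and 'masks' optionally supplies one mask per train image via MaskCollection.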
        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                               const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                               cudaStream_t stream)
        {
            if (masks.data)
            {
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
                    trainIdx, imgIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
                    trainIdx, imgIdx, distance,
                    stream);
            }
        }

        template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);

        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                               const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                               cudaStream_t stream)
        {
            if (masks.data)
            {
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
                    trainIdx, imgIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
                    trainIdx, imgIdx, distance,
                    stream);
            }
        }

        //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);

        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                    const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                                    cudaStream_t stream)
        {
            if (masks.data)
            {
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
                    trainIdx, imgIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
                    trainIdx, imgIdx, distance,
                    stream);
            }
        }

        template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
    } // namespace bf_match
}}} // namespace cv { namespace cuda { namespace device


#endif /* CUDA_DISABLER */