Home | History | Annotate | Download | only in cuda
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // Redistribution and use in source and binary forms, with or without modification,
     18 // are permitted provided that the following conditions are met:
     19 //
     20 //   * Redistribution's of source code must retain the above copyright notice,
     21 //     this list of conditions and the following disclaimer.
     22 //
     23 //   * Redistribution's in binary form must reproduce the above copyright notice,
     24 //     this list of conditions and the following disclaimer in the documentation
     25 //     and/or other materials provided with the distribution.
     26 //
     27 //   * The name of the copyright holders may not be used to endorse or promote products
     28 //     derived from this software without specific prior written permission.
     29 //
     30 // This software is provided by the copyright holders and contributors "as is" and
     31 // any express or implied warranties, including, but not limited to, the implied
     32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     33 // In no event shall the Intel Corporation or contributors be liable for any direct,
     34 // indirect, incidental, special, exemplary, or consequential damages
     35 // (including, but not limited to, procurement of substitute goods or services;
     36 // loss of use, data, or profits; or business interruption) however caused
     37 // and on any theory of liability, whether in contract, strict liability,
     38 // or tort (including negligence or otherwise) arising in any way out of
     39 // the use of this software, even if advised of the possibility of such damage.
     40 //
     41 //M*/
     42 
     43 #if !defined CUDA_DISABLER
     44 
     45 #include <opencv2/core/cuda/common.hpp>
     46 #include <opencv2/core/cuda/vec_traits.hpp>
     47 #include <opencv2/core/cuda/vec_math.hpp>
     48 #include <opencv2/core/cuda/emulation.hpp>
     49 
     50 #include <iostream>
     51 #include <stdio.h>
     52 
     53 namespace cv { namespace cuda { namespace device
     54 {
     55     namespace ccl
     56     {
     57         enum
     58         {
     59             WARP_SIZE  = 32,
     60             WARP_LOG   = 5,
     61 
     62             CTA_SIZE_X = 32,
     63             CTA_SIZE_Y = 8,
     64 
     65             STA_SIZE_MERGE_Y = 4,
     66             STA_SIZE_MERGE_X = 32,
     67 
     68             TPB_X = 1,
     69             TPB_Y = 4,
     70 
     71             TILE_COLS = CTA_SIZE_X * TPB_X,
     72             TILE_ROWS = CTA_SIZE_Y * TPB_Y
     73         };
     74 
     75         template<typename T> struct IntervalsTraits
     76         {
     77             typedef T elem_type;
     78         };
     79 
     80         template<> struct IntervalsTraits<unsigned char>
     81         {
     82             typedef int dist_type;
     83             enum {ch = 1};
     84         };
     85 
     86         template<> struct IntervalsTraits<uchar3>
     87         {
     88             typedef int3 dist_type;
     89             enum {ch = 3};
     90         };
     91 
     92         template<> struct IntervalsTraits<uchar4>
     93         {
     94             typedef int4 dist_type;
     95             enum {ch = 4};
     96         };
     97 
     98         template<> struct IntervalsTraits<unsigned short>
     99         {
    100             typedef int dist_type;
    101             enum {ch = 1};
    102         };
    103 
    104         template<> struct IntervalsTraits<ushort3>
    105         {
    106             typedef int3 dist_type;
    107             enum {ch = 3};
    108         };
    109 
    110         template<> struct IntervalsTraits<ushort4>
    111         {
    112             typedef int4 dist_type;
    113             enum {ch = 4};
    114         };
    115 
    116         template<> struct IntervalsTraits<float>
    117         {
    118             typedef float dist_type;
    119             enum {ch = 1};
    120         };
    121 
    122         template<> struct IntervalsTraits<int>
    123         {
    124             typedef int dist_type;
    125             enum {ch = 1};
    126         };
    127 
    128         typedef unsigned char component;
    129         enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
    130 
    131         template<typename T, int CH> struct InInterval {};
    132 
    133         template<typename T> struct InInterval<T, 1>
    134         {
    135             typedef typename VecTraits<T>::elem_type E;
    136             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) { }
    137             T lo, hi;
    138 
    139             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
    140             {
    141                 I d = a - b;
    142                 return lo <= d && d <= hi;
    143             }
    144         };
    145 
    146 
    147         template<typename T> struct InInterval<T, 3>
    148         {
    149             typedef typename VecTraits<T>::elem_type E;
    150             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
    151             : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)) { }
    152             T lo, hi;
    153 
    154             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
    155             {
    156                 I d = saturate_cast<I>(a - b);
    157                 return lo.x <= d.x && d.x <= hi.x &&
    158                        lo.y <= d.y && d.y <= hi.y &&
    159                        lo.z <= d.z && d.z <= hi.z;
    160             }
    161         };
    162 
    163         template<typename T> struct InInterval<T, 4>
    164         {
    165             typedef typename VecTraits<T>::elem_type E;
    166             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
    167             : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)) { }
    168             T lo, hi;
    169 
    170             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
    171             {
    172                 I d = saturate_cast<I>(a - b);
    173                 return lo.x <= d.x && d.x <= hi.x &&
    174                        lo.y <= d.y && d.y <= hi.y &&
    175                        lo.z <= d.z && d.z <= hi.z &&
    176                        lo.w <= d.w && d.w <= hi.w;
    177             }
    178         };
    179 
    180 
    181         template<typename T, typename F>
    182         __global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
    183         {
    184             int x = threadIdx.x + blockIdx.x * blockDim.x;
    185             int y = threadIdx.y + blockIdx.y * blockDim.y;
    186 
    187             if (x >= image.cols || y >= image.rows) return;
    188 
    189             T intensity = image(y, x);
    190             component c = 0;
    191 
    192             if ( x > 0 && connected(intensity, image(y, x - 1)))
    193                 c |= LEFT;
    194 
    195             if ( y > 0 && connected(intensity, image(y - 1, x)))
    196                 c |= UP;
    197 
    198             if ( x + 1 < image.cols && connected(intensity, image(y, x + 1)))
    199                 c |= RIGHT;
    200 
    201             if ( y + 1 < image.rows && connected(intensity, image(y + 1, x)))
    202                 c |= DOWN;
    203 
    204             components(y, x) = c;
    205         }
    206 
    207         template< typename T>
    208         void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
    209         {
    210             dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
    211             dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
    212 
    213             typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
    214 
    215             Int_t inInt(lo, hi);
    216             computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);
    217 
    218             cudaSafeCall( cudaGetLastError() );
    219             if (stream == 0)
    220                 cudaSafeCall( cudaDeviceSynchronize() );
    221         }
    222 
    223         template void computeEdges<uchar>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
    224         template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
    225         template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
    226         template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
    227         template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
    228         template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
    229         template void computeEdges<int>    (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
    230         template void computeEdges<float>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
    231 
    232         __global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
    233         {
    234             int x = threadIdx.x + blockIdx.x * TILE_COLS;
    235             int y = threadIdx.y + blockIdx.y * TILE_ROWS;
    236 
    237             if (x >= edges.cols || y >= edges.rows) return;
    238 
    239             //currently x is 1
    240             int bounds = ((y + TPB_Y) < edges.rows);
    241 
    242             __shared__ int labelsTile[TILE_ROWS][TILE_COLS];
    243             __shared__ int  edgesTile[TILE_ROWS][TILE_COLS];
    244 
    245             int new_labels[TPB_Y][TPB_X];
    246             int old_labels[TPB_Y][TPB_X];
    247 
    248             #pragma unroll
    249             for (int i = 0; i < TPB_Y; ++i)
    250                 #pragma unroll
    251                 for (int j = 0; j < TPB_X; ++j)
    252                 {
    253                     int yloc = threadIdx.y + CTA_SIZE_Y * i;
    254                     int xloc = threadIdx.x + CTA_SIZE_X * j;
    255                     component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);
    256 
    257                     if (!xloc) c &= ~LEFT;
    258                     if (!yloc) c &= ~UP;
    259 
    260                     if (xloc == TILE_COLS -1) c &= ~RIGHT;
    261                     if (yloc == TILE_ROWS -1) c &= ~DOWN;
    262 
    263                     new_labels[i][j] = yloc * TILE_COLS + xloc;
    264                     edgesTile[yloc][xloc] = c;
    265                 }
    266 
    267             for (int k = 0; ;++k)
    268             {
    269                 //1. backup
    270                 #pragma unroll
    271                 for (int i = 0; i < TPB_Y; ++i)
    272                     #pragma unroll
    273                     for (int j = 0; j < TPB_X; ++j)
    274                     {
    275                         int yloc = threadIdx.y + CTA_SIZE_Y * i;
    276                         int xloc = threadIdx.x + CTA_SIZE_X * j;
    277 
    278                         old_labels[i][j]       = new_labels[i][j];
    279                         labelsTile[yloc][xloc] = new_labels[i][j];
    280                     }
    281 
    282                 __syncthreads();
    283 
    284                 //2. compare local arrays
    285                 #pragma unroll
    286                 for (int i = 0; i < TPB_Y; ++i)
    287                     #pragma unroll
    288                     for (int j = 0; j < TPB_X; ++j)
    289                     {
    290                         int yloc = threadIdx.y + CTA_SIZE_Y * i;
    291                         int xloc = threadIdx.x + CTA_SIZE_X * j;
    292 
    293                         component c = edgesTile[yloc][xloc];
    294                         int label = new_labels[i][j];
    295 
    296                         if (c & UP)
    297                            label = ::min(label, labelsTile[yloc - 1][xloc]);
    298 
    299                         if (c &  DOWN)
    300                            label = ::min(label, labelsTile[yloc + 1][xloc]);
    301 
    302                         if (c & LEFT)
    303                            label = ::min(label, labelsTile[yloc][xloc - 1]);
    304 
    305                         if (c & RIGHT)
    306                            label = ::min(label, labelsTile[yloc][xloc + 1]);
    307 
    308                        new_labels[i][j] = label;
    309                     }
    310 
    311                 __syncthreads();
    312 
    313                 //3. determine: Is any value changed?
    314                 int changed = 0;
    315                 #pragma unroll
    316                 for (int i = 0; i < TPB_Y; ++i)
    317                     #pragma unroll
    318                     for (int j = 0; j < TPB_X; ++j)
    319                     {
    320                         if (new_labels[i][j] < old_labels[i][j])
    321                         {
    322                             changed = 1;
    323                             Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
    324                         }
    325                     }
    326 
    327                 changed = Emulation::syncthreadsOr(changed);
    328 
    329                 if (!changed)
    330                     break;
    331 
    332                 //4. Compact paths
    333                 const int *labels = &labelsTile[0][0];
    334                 #pragma unroll
    335                 for (int i = 0; i < TPB_Y; ++i)
    336                     #pragma unroll
    337                     for (int j = 0; j < TPB_X; ++j)
    338                     {
    339                         int label = new_labels[i][j];
    340 
    341                         while( labels[label] < label ) label = labels[label];
    342 
    343                         new_labels[i][j] = label;
    344                     }
    345                 __syncthreads();
    346             }
    347 
    348             #pragma unroll
    349             for (int i = 0; i < TPB_Y; ++i)
    350             #pragma unroll
    351                 for (int j = 0; j < TPB_X; ++j)
    352                 {
    353                     int label = new_labels[i][j];
    354                     int yloc = label / TILE_COLS;
    355                     int xloc = label - yloc * TILE_COLS;
    356 
    357                     xloc += blockIdx.x * TILE_COLS;
    358                     yloc += blockIdx.y * TILE_ROWS;
    359 
    360                     label = yloc * edges.cols + xloc;
    361                     // do it for x too.
    362                     if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
    363                 }
    364         }
    365 
    366         __device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
    367         {
    368             while(1)
    369             {
    370                 int y = label / comps.cols;
    371                 int x = label - y * comps.cols;
    372 
    373                 int parent = comps(y, x);
    374 
    375                 if (label == parent) break;
    376 
    377                 label = parent;
    378             }
    379             return label;
    380         }
    381 
    382         __device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
    383         {
    384             int r1 = root(comps, l1);
    385             int r2 = root(comps, l2);
    386 
    387             if (r1 == r2) return;
    388 
    389             int mi = ::min(r1, r2);
    390             int ma = ::max(r1, r2);
    391 
    392             int y = ma / comps.cols;
    393             int x = ma - y * comps.cols;
    394 
    395             atomicMin(&comps.ptr(y)[x], mi);
    396             changed = true;
    397         }
    398 
    399         __global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
    400             const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
    401         {
    402             int tid = threadIdx.y * blockDim.x + threadIdx.x;
    403             int stride = blockDim.y * blockDim.x;
    404 
    405             int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
    406             int yend   = ybegin + tilesNumY * tileSizeY;
    407 
    408             if (blockIdx.y == gridDim.y - 1)
    409             {
    410                 yend -= yIncomplete * tileSizeY;
    411                 yend -= tileSizeY;
    412                 tileSizeY = (edges.rows % tileSizeY);
    413 
    414                 yend += tileSizeY;
    415             }
    416 
    417             int xbegin = blockIdx.x * tilesNumX * tileSizeX;
    418             int xend   = xbegin + tilesNumX * tileSizeX;
    419 
    420             if (blockIdx.x == gridDim.x - 1)
    421             {
    422                 if (xIncomplete) yend = ybegin;
    423                 xend -= xIncomplete * tileSizeX;
    424                 xend -= tileSizeX;
    425                 tileSizeX = (edges.cols % tileSizeX);
    426 
    427                 xend += tileSizeX;
    428             }
    429 
    430             if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
    431             {
    432                 xend = xbegin;
    433             }
    434 
    435             int tasksV = (tilesNumX - 1) * (yend - ybegin);
    436             int tasksH = (tilesNumY - 1) * (xend - xbegin);
    437 
    438             int total = tasksH + tasksV;
    439 
    440             bool changed;
    441             do
    442             {
    443                 changed = false;
    444                 for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
    445                 {
    446                     if (taskIdx < tasksH)
    447                     {
    448                         int indexH = taskIdx;
    449 
    450                         int row = indexH / (xend - xbegin);
    451                         int col = indexH - row * (xend - xbegin);
    452 
    453                         int y = ybegin + (row + 1) * tileSizeY;
    454                         int x = xbegin + col;
    455 
    456                         component e = edges( x, y);
    457                         if (e & UP)
    458                         {
    459                             int lc = comps(y,x);
    460                             int lu = comps(y - 1, x);
    461 
    462                             isConnected(comps, lc, lu, changed);
    463                         }
    464                     }
    465                     else
    466                     {
    467                         int indexV = taskIdx - tasksH;
    468 
    469                         int col = indexV / (yend - ybegin);
    470                         int row = indexV - col * (yend - ybegin);
    471 
    472                         int x = xbegin + (col + 1) * tileSizeX;
    473                         int y = ybegin + row;
    474 
    475                         component e = edges(x, y);
    476                         if (e & LEFT)
    477                         {
    478                             int lc = comps(y, x);
    479                             int ll = comps(y, x - 1);
    480 
    481                             isConnected(comps, lc, ll, changed);
    482                         }
    483                     }
    484                 }
    485             } while (Emulation::syncthreadsOr(changed));
    486         }
    487 
    488         __global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
    489         {
    490             int x = threadIdx.x + blockIdx.x * blockDim.x;
    491             int y = threadIdx.y + blockIdx.y * blockDim.y;
    492 
    493             if( x < comps.cols && y < comps.rows)
    494                 comps(y, x) = root(comps, comps(y, x));
    495         }
    496 
    497         enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
    498 
    499         void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
    500         {
    501             (void) flags;
    502             dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
    503             dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
    504 
    505             lableTiles<<<grid, block, 0, stream>>>(edges, comps);
    506             cudaSafeCall( cudaGetLastError() );
    507 
    508             int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
    509             while (grid.x > 1 || grid.y > 1)
    510             {
    511                 dim3 mergeGrid((int)ceilf(grid.x / 2.f), (int)ceilf(grid.y / 2.f));
    512                 dim3 mergeBlock(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
    513                 // debug log
    514                 // std::cout << "merging: " << grid.y  << " x " << grid.x << " ---> " << mergeGrid.y <<  " x " << mergeGrid.x << " for tiles: " << tileSizeY << " x " << tileSizeX << std::endl;
    515                 crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, (int)ceilf(grid.y / 2.f) - grid.y / 2, (int)ceilf(grid.x / 2.f) - grid.x / 2);
    516                 tileSizeX <<= 1;
    517                 tileSizeY <<= 1;
    518                 grid = mergeGrid;
    519 
    520                 cudaSafeCall( cudaGetLastError() );
    521             }
    522 
    523             grid.x = divUp(edges.cols, block.x);
    524             grid.y = divUp(edges.rows, block.y);
    525             flatten<<<grid, block, 0, stream>>>(edges, comps);
    526             cudaSafeCall( cudaGetLastError() );
    527 
    528             if (stream == 0)
    529                 cudaSafeCall( cudaDeviceSynchronize() );
    530         }
    531     }
    532 } } }
    533 
    534 #endif /* CUDA_DISABLER */
    535