/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;

namespace canny
{
    struct L1 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::abs(x) + ::abs(y);
        }

        __host__ __device__ __forceinline__ L1() {}
        __host__ __device__ __forceinline__ L1(const L1&) {}
    };
    struct L2 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::sqrtf(x * x + y * y);
        }

        __host__ __device__ __forceinline__ L2() {}
        __host__ __device__ __forceinline__ L2(const L2&) {}
    };
}
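
// The two functors above implement the gradient-magnitude norms selected by
// cv::Canny's L2gradient flag. A minimal worked example (illustration only,
// not part of the build): for a Sobel response of (dx, dy) = (3, 4),
//     L1: |3| + |4|       = 7.0f
//     L2: sqrtf(9 + 16)   = 5.0f
// L1 is cheaper to evaluate; L2 is the usual Euclidean gradient magnitude.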

namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
    {
        enum { smart_shift = 4 };
    };
    template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
    struct SrcTex
    {
        int xoff;
        int yoff;
        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}

        __device__ __forceinline__ int operator ()(int y, int x) const
        {
            return tex2D(tex_src, x + xoff, y + yoff);
        }
    };

    template <class Norm> __global__
    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (y >= mag.rows || x >= mag.cols)
            return;

        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));

        dx(y, x) = dxVal;
        dy(y, x) = dyVal;

        mag(y, x) = norm(dxVal, dyVal);
    }
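
    // For reference, dxVal and dyVal above are the correlations of the source
    // with the standard 3x3 Sobel kernels, read through the clamped texture:
    //
    //     dx:  -1  0  +1        dy:  -1  -2  -1
    //          -2  0  +2              0   0   0
    //          -1  0  +1             +1  +2  +1
    //
    // mag(y, x) then stores the chosen norm of the gradient (dxVal, dyVal).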

    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));

        bindTexture(&tex_src, srcWhole);
        SrcTex src(xoff, yoff);

        if (L2Grad)
        {
            L2 norm;
            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
        }
        else
        {
            L1 norm;
            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
        }

        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        if (L2Grad)
        {
            L2 norm;
            transform(dx, dy, mag, norm, WithOutMask(), stream);
        }
        else
        {
            L1 norm;
            transform(dx, dy, mag, norm, WithOutMask(), stream);
        }
    }
}
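
// A minimal host-side sketch of how these kernels are typically reached
// through the public API (assuming a CUDA-enabled OpenCV build with the
// opencv2/cudaimgproc.hpp header; illustration only, not part of this
// translation unit):
//
//     cv::cuda::GpuMat src;    // CV_8UC1 input, uploaded by the caller
//     cv::cuda::GpuMat edges;
//     cv::Ptr<cv::cuda::CannyEdgeDetector> detector =
//         cv::cuda::createCannyEdgeDetector(50.0, 100.0, 3, /*L2gradient=*/true);
//     detector->detect(src, edges);  // drives calcMagnitude, calcMap and the
//                                    // hysteresis passes defined in this file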

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);

    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
    {
        const int CANNY_SHIFT = 15;
        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
            return;

        int dxVal = dx(y, x);
        int dyVal = dy(y, x);

        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
        const float m = tex2D(tex_mag, x, y);

        dxVal = ::abs(dxVal);
        dyVal = ::abs(dyVal);

        // 0 - the pixel cannot belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;

        if (m > low_thresh)
        {
            const int tg22x = dxVal * TG22;
            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);

            dyVal <<= CANNY_SHIFT;

            if (dyVal < tg22x)
            {
                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else if (dyVal > tg67x)
            {
                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else
            {
                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
        }

        map(y, x) = edge_type;
    }
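
    // The sector test above runs in Q15 fixed point to avoid a float division:
    // TG22 is tan(22.5 deg) scaled by 2^15 (about 13573), and tg67x = tg22x +
    // (2 * dxVal << CANNY_SHIFT) uses the identity tan(67.5 deg) = tan(22.5 deg) + 2.
    // With dyVal shifted left by CANNY_SHIFT,
    //     dyVal < tg22x  <=>  dy/dx < tan(22.5 deg)  -> compare left/right,
    //     dyVal > tg67x  <=>  dy/dx > tan(67.5 deg)  -> compare up/down,
    // and everything in between is treated as diagonal, where the sign s picks
    // the correct diagonal pair for the non-maximum suppression.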

    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));

        bindTexture(&tex_mag, mag);

        calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    __device__ int counter = 0;

    __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
    {
        return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
    }

    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st)
    {
        __shared__ volatile int smem[18][18];

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        smem[threadIdx.y + 1][threadIdx.x + 1] = checkIdx(y, x, map.rows, map.cols) ? map(y, x) : 0;
        if (threadIdx.y == 0)
            smem[0][threadIdx.x + 1] = checkIdx(y - 1, x, map.rows, map.cols) ? map(y - 1, x) : 0;
        if (threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][threadIdx.x + 1] = checkIdx(y + 1, x, map.rows, map.cols) ? map(y + 1, x) : 0;
        if (threadIdx.x == 0)
            smem[threadIdx.y + 1][0] = checkIdx(y, x - 1, map.rows, map.cols) ? map(y, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1)
            smem[threadIdx.y + 1][blockDim.x + 1] = checkIdx(y, x + 1, map.rows, map.cols) ? map(y, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == 0)
            smem[0][0] = checkIdx(y - 1, x - 1, map.rows, map.cols) ? map(y - 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
            smem[0][blockDim.x + 1] = checkIdx(y - 1, x + 1, map.rows, map.cols) ? map(y - 1, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][0] = checkIdx(y + 1, x - 1, map.rows, map.cols) ? map(y + 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][blockDim.x + 1] = checkIdx(y + 1, x + 1, map.rows, map.cols) ? map(y + 1, x + 1) : 0;

        __syncthreads();

        if (x >= map.cols || y >= map.rows)
            return;

        int n;

        #pragma unroll
        for (int k = 0; k < 16; ++k)
        {
            n = 0;

            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
            }

            __syncthreads();

            if (n > 0)
                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;

            __syncthreads();
        }

        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

        map(y, x) = e;

        n = 0;

        if (e == 2)
        {
            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
        }

        if (n > 0)
        {
            const int ind = ::atomicAdd(&counter, 1);
            st[ind] = make_short2(x, y);
        }
    }
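
    // The kernel above is the block-local half of the hysteresis: a 16x16 tile
    // plus a one-pixel apron lives in the 18x18 shared array, and 16 relaxation
    // passes (enough to propagate across the tile) promote candidate pixels (1)
    // that touch a confirmed edge (2). Confirmed pixels that still border a
    // candidate (possibly in a neighbouring block) are appended to the global
    // queue st for the grid-wide pass below.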

    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

        cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );

        const dim3 block(16, 16);
        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));

        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1);
        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};

    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count)
    {
        const int stack_size = 512;

        __shared__ int s_counter;
        __shared__ int s_ind;
        __shared__ short2 s_st[stack_size];

        if (threadIdx.x == 0)
            s_counter = 0;

        __syncthreads();

        int ind = blockIdx.y * gridDim.x + blockIdx.x;

        if (ind >= count)
            return;

        short2 pos = st1[ind];

        if (threadIdx.x < 8)
        {
            pos.x += c_dx[threadIdx.x];
            pos.y += c_dy[threadIdx.x];

            if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
            {
                map(pos.y, pos.x) = 2;

                ind = Emulation::smem::atomicAdd(&s_counter, 1);

                s_st[ind] = pos;
            }
        }

        __syncthreads();

        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
        {
            const int subTaskIdx = threadIdx.x >> 3;
            const int portion = ::min(s_counter, blockDim.x >> 3);

            if (subTaskIdx < portion)
                pos = s_st[s_counter - 1 - subTaskIdx];

            __syncthreads();

            if (threadIdx.x == 0)
                s_counter -= portion;

            __syncthreads();

            if (subTaskIdx < portion)
            {
                pos.x += c_dx[threadIdx.x & 7];
                pos.y += c_dy[threadIdx.x & 7];

                if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
                {
                    map(pos.y, pos.x) = 2;

                    ind = Emulation::smem::atomicAdd(&s_counter, 1);

                    s_st[ind] = pos;
                }
            }

            __syncthreads();
        }

        if (s_counter > 0)
        {
            if (threadIdx.x == 0)
            {
                s_ind = ::atomicAdd(&counter, s_counter);

                if (s_ind + s_counter > map.cols * map.rows)
                    s_counter = 0;
            }

            __syncthreads();

            ind = s_ind;

            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                st2[ind + i] = s_st[i];
        }
    }
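
    // Each block of the kernel above grows edges from one seed in st1 using a
    // breadth-first search over the shared-memory stack s_st: threads work in
    // groups of eight (hence the threadIdx.x >> 3 and & 7 indexing), one thread
    // per compass direction from c_dx/c_dy, pushing newly promoted pixels back
    // onto the stack. Once the 512-entry stack can no longer safely absorb
    // another expansion round, whatever remains spills into st2 so the host
    // loop below can launch a follow-up pass.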

    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );

        int count;
        cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
        cudaSafeCall( cudaStreamSynchronize(stream) );

        while (count > 0)
        {
            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );

            const dim3 block(128);
            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, count);
            cudaSafeCall( cudaGetLastError() );

            if (stream == NULL)
                cudaSafeCall( cudaDeviceSynchronize() );

            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
            cudaSafeCall( cudaStreamSynchronize(stream) );

            count = min(count, map.cols * map.rows);

            //std::swap(st1, st2);
            short2* tmp = st1;
            st1 = st2;
            st2 = tmp;
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    struct GetEdges : unary_function<int, uchar>
    {
        __device__ __forceinline__ uchar operator ()(int e) const
        {
            return (uchar)(-(e >> 1));
        }

        __host__ __device__ __forceinline__ GetEdges() {}
        __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
}
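
// GetEdges collapses the three-state map into the final 8-bit edge image:
// e >> 1 is 1 only for confirmed edges (e == 2), and negating that value as a
// uchar wraps to 255, so the mapping is 0 -> 0, 1 -> 0, 2 -> 255.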

namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream)
    {
        transform(map, dst, GetEdges(), WithOutMask(), stream);
    }
}

#endif /* CUDA_DISABLER */