Home | History | Annotate | Download | only in src
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                        Intel License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2014, Itseez Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // Redistribution and use in source and binary forms, with or without modification,
     18 // are permitted provided that the following conditions are met:
     19 //
     20 //   * Redistribution's of source code must retain the above copyright notice,
     21 //     this list of conditions and the following disclaimer.
     22 //
     23 //   * Redistribution's in binary form must reproduce the above copyright notice,
     24 //     this list of conditions and the following disclaimer in the documentation
     25 //     and/or other materials provided with the distribution.
     26 //
     27 //   * The name of Intel Corporation may not be used to endorse or promote products
     28 //     derived from this software without specific prior written permission.
     29 //
     30 // This software is provided by the copyright holders and contributors "as is" and
     31 // any express or implied warranties, including, but not limited to, the implied
     32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     33 // In no event shall the Intel Corporation or contributors be liable for any direct,
     34 // indirect, incidental, special, exemplary, or consequential damages
     35 // (including, but not limited to, procurement of substitute goods or services;
     36 // loss of use, data, or profits; or business interruption) however caused
     37 // and on any theory of liability, whether in contract, strict liability,
     38 // or tort (including negligence or otherwise) arising in any way out of
     39 // the use of this software, even if advised of the possibility of such damage.
     40 //
     41 //M*/
     42 
     43 #include "precomp.hpp"
     44 #include "opencl_kernels_imgproc.hpp"
     45 
     46 
     47 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
     48 #define USE_IPP_CANNY 1
     49 #else
     50 #undef USE_IPP_CANNY
     51 #endif
     52 
     53 
     54 namespace cv
     55 {
     56 
     57 #ifdef USE_IPP_CANNY
     58 static bool ippCanny(const Mat& _src, Mat& _dst, float low,  float high)
     59 {
     60     int size = 0, size1 = 0;
     61     IppiSize roi = { _src.cols, _src.rows };
     62 
     63     if (ippiFilterSobelNegVertGetBufferSize_8u16s_C1R(roi, ippMskSize3x3, &size) < 0)
     64         return false;
     65     if (ippiFilterSobelHorizGetBufferSize_8u16s_C1R(roi, ippMskSize3x3, &size1) < 0)
     66         return false;
     67     size = std::max(size, size1);
     68 
     69     if (ippiCannyGetSize(roi, &size1) < 0)
     70         return false;
     71     size = std::max(size, size1);
     72 
     73     AutoBuffer<uchar> buf(size + 64);
     74     uchar* buffer = alignPtr((uchar*)buf, 32);
     75 
     76     Mat _dx(_src.rows, _src.cols, CV_16S);
     77     if( ippiFilterSobelNegVertBorder_8u16s_C1R(_src.ptr(), (int)_src.step,
     78                     _dx.ptr<short>(), (int)_dx.step, roi,
     79                     ippMskSize3x3, ippBorderRepl, 0, buffer) < 0 )
     80         return false;
     81 
     82     Mat _dy(_src.rows, _src.cols, CV_16S);
     83     if( ippiFilterSobelHorizBorder_8u16s_C1R(_src.ptr(), (int)_src.step,
     84                     _dy.ptr<short>(), (int)_dy.step, roi,
     85                     ippMskSize3x3, ippBorderRepl, 0, buffer) < 0 )
     86         return false;
     87 
     88     if( ippiCanny_16s8u_C1R(_dx.ptr<short>(), (int)_dx.step,
     89                                _dy.ptr<short>(), (int)_dy.step,
     90                               _dst.ptr(), (int)_dst.step, roi, low, high, buffer) < 0 )
     91         return false;
     92     return true;
     93 }
     94 #endif
     95 
     96 #ifdef HAVE_OPENCL
     97 
     98 static bool ocl_Canny(InputArray _src, OutputArray _dst, float low_thresh, float high_thresh,
     99                       int aperture_size, bool L2gradient, int cn, const Size & size)
    100 {
    101     UMat map;
    102 
    103     const ocl::Device &dev = ocl::Device::getDefault();
    104     int max_wg_size = (int)dev.maxWorkGroupSize();
    105 
    106     int lSizeX = 32;
    107     int lSizeY = max_wg_size / 32;
    108 
    109     if (lSizeY == 0)
    110     {
    111         lSizeX = 16;
    112         lSizeY = max_wg_size / 16;
    113     }
    114     if (lSizeY == 0)
    115     {
    116         lSizeY = 1;
    117     }
    118 
    119     if (L2gradient)
    120     {
    121         low_thresh = std::min(32767.0f, low_thresh);
    122         high_thresh = std::min(32767.0f, high_thresh);
    123 
    124         if (low_thresh > 0)
    125             low_thresh *= low_thresh;
    126         if (high_thresh > 0)
    127             high_thresh *= high_thresh;
    128     }
    129     int low = cvFloor(low_thresh), high = cvFloor(high_thresh);
    130 
    131     if (aperture_size == 3 && !_src.isSubmatrix())
    132     {
    133         /*
    134             stage1_with_sobel:
    135                 Sobel operator
    136                 Calc magnitudes
    137                 Non maxima suppression
    138                 Double thresholding
    139         */
    140         char cvt[40];
    141         ocl::Kernel with_sobel("stage1_with_sobel", ocl::imgproc::canny_oclsrc,
    142                                format("-D WITH_SOBEL -D cn=%d -D TYPE=%s -D convert_floatN=%s -D floatN=%s -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s",
    143                                       cn, ocl::memopTypeToStr(_src.depth()),
    144                                       ocl::convertTypeStr(_src.depth(), CV_32F, cn, cvt),
    145                                       ocl::typeToStr(CV_MAKE_TYPE(CV_32F, cn)),
    146                                       lSizeX, lSizeY,
    147                                       L2gradient ? " -D L2GRAD" : ""));
    148         if (with_sobel.empty())
    149             return false;
    150 
    151         UMat src = _src.getUMat();
    152         map.create(size, CV_32S);
    153         with_sobel.args(ocl::KernelArg::ReadOnly(src),
    154                         ocl::KernelArg::WriteOnlyNoSize(map),
    155                         (float) low, (float) high);
    156 
    157         size_t globalsize[2] = { size.width, size.height },
    158                 localsize[2] = { lSizeX, lSizeY };
    159 
    160         if (!with_sobel.run(2, globalsize, localsize, false))
    161             return false;
    162     }
    163     else
    164     {
    165         /*
    166             stage1_without_sobel:
    167                 Calc magnitudes
    168                 Non maxima suppression
    169                 Double thresholding
    170         */
    171         UMat dx, dy;
    172         Sobel(_src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
    173         Sobel(_src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
    174 
    175         ocl::Kernel without_sobel("stage1_without_sobel", ocl::imgproc::canny_oclsrc,
    176                                     format("-D WITHOUT_SOBEL -D cn=%d -D GRP_SIZEX=%d -D GRP_SIZEY=%d%s",
    177                                            cn, lSizeX, lSizeY, L2gradient ? " -D L2GRAD" : ""));
    178         if (without_sobel.empty())
    179             return false;
    180 
    181         map.create(size, CV_32S);
    182         without_sobel.args(ocl::KernelArg::ReadOnlyNoSize(dx), ocl::KernelArg::ReadOnlyNoSize(dy),
    183                            ocl::KernelArg::WriteOnly(map),
    184                            low, high);
    185 
    186         size_t globalsize[2] = { size.width, size.height },
    187                 localsize[2] = { lSizeX, lSizeY };
    188 
    189         if (!without_sobel.run(2, globalsize, localsize, false))
    190             return false;
    191     }
    192 
    193     int PIX_PER_WI = 8;
    194     /*
    195         stage2:
    196             hysteresis (add weak edges if they are connected with strong edges)
    197     */
    198 
    199     int sizey = lSizeY / PIX_PER_WI;
    200     if (sizey == 0)
    201         sizey = 1;
    202 
    203     size_t globalsize[2] = { size.width, (size.height + PIX_PER_WI - 1) / PIX_PER_WI }, localsize[2] = { lSizeX, sizey };
    204 
    205     ocl::Kernel edgesHysteresis("stage2_hysteresis", ocl::imgproc::canny_oclsrc,
    206                                 format("-D STAGE2 -D PIX_PER_WI=%d -D LOCAL_X=%d -D LOCAL_Y=%d",
    207                                 PIX_PER_WI, lSizeX, sizey));
    208 
    209     if (edgesHysteresis.empty())
    210         return false;
    211 
    212     edgesHysteresis.args(ocl::KernelArg::ReadWrite(map));
    213     if (!edgesHysteresis.run(2, globalsize, localsize, false))
    214         return false;
    215 
    216     // get edges
    217 
    218     ocl::Kernel getEdgesKernel("getEdges", ocl::imgproc::canny_oclsrc,
    219                                 format("-D GET_EDGES -D PIX_PER_WI=%d", PIX_PER_WI));
    220     if (getEdgesKernel.empty())
    221         return false;
    222 
    223     _dst.create(size, CV_8UC1);
    224     UMat dst = _dst.getUMat();
    225 
    226     getEdgesKernel.args(ocl::KernelArg::ReadOnly(map), ocl::KernelArg::WriteOnlyNoSize(dst));
    227 
    228     return getEdgesKernel.run(2, globalsize, NULL, false);
    229 }
    230 
    231 #endif
    232 
    233 #ifdef HAVE_TBB
    234 
    235 // Queue with peaks that will processed serially.
    236 static tbb::concurrent_queue<uchar*> borderPeaks;
    237 
    238 class tbbCanny
    239 {
    240 public:
    241     tbbCanny(const Range _boundaries, const Mat& _src, uchar* _map, int _low,
    242             int _high, int _aperture_size, bool _L2gradient)
    243         : boundaries(_boundaries), src(_src), map(_map), low(_low), high(_high),
    244           aperture_size(_aperture_size), L2gradient(_L2gradient)
    245     {}
    246 
    247     // This parallel version of Canny algorithm splits the src image in threadsNumber horizontal slices.
    248     // The first row of each slice contains the last row of the previous slice and
    249     // the last row of each slice contains the first row of the next slice
    250     // so that each slice is independent and no mutexes are required.
    251     void operator()() const
    252     {
    253 #if CV_SSE2
    254         bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
    255 #endif
    256 
    257         const int type = src.type(), cn = CV_MAT_CN(type);
    258 
    259         Mat dx, dy;
    260 
    261         ptrdiff_t mapstep = src.cols + 2;
    262 
    263         // In sobel transform we calculate ksize2 extra lines for the first and last rows of each slice
    264         // because IPPDerivSobel expects only isolated ROIs, in contrast with the opencv version which
    265         // uses the pixels outside of the ROI to form a border.
    266         uchar ksize2 = aperture_size / 2;
    267 
    268         if (boundaries.start == 0 && boundaries.end == src.rows)
    269         {
    270             Mat tempdx(boundaries.end - boundaries.start + 2, src.cols, CV_16SC(cn));
    271             Mat tempdy(boundaries.end - boundaries.start + 2, src.cols, CV_16SC(cn));
    272 
    273             memset(tempdx.ptr<short>(0), 0, cn * src.cols*sizeof(short));
    274             memset(tempdy.ptr<short>(0), 0, cn * src.cols*sizeof(short));
    275             memset(tempdx.ptr<short>(tempdx.rows - 1), 0, cn * src.cols*sizeof(short));
    276             memset(tempdy.ptr<short>(tempdy.rows - 1), 0, cn * src.cols*sizeof(short));
    277 
    278             Sobel(src, tempdx.rowRange(1, tempdx.rows - 1), CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
    279             Sobel(src, tempdy.rowRange(1, tempdy.rows - 1), CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
    280 
    281             dx = tempdx;
    282             dy = tempdy;
    283         }
    284         else if (boundaries.start == 0)
    285         {
    286             Mat tempdx(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
    287             Mat tempdy(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
    288 
    289             memset(tempdx.ptr<short>(0), 0, cn * src.cols*sizeof(short));
    290             memset(tempdy.ptr<short>(0), 0, cn * src.cols*sizeof(short));
    291 
    292             Sobel(src.rowRange(boundaries.start, boundaries.end + 1 + ksize2), tempdx.rowRange(1, tempdx.rows),
    293                     CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
    294             Sobel(src.rowRange(boundaries.start, boundaries.end + 1 + ksize2), tempdy.rowRange(1, tempdy.rows),
    295                     CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
    296 
    297             dx = tempdx.rowRange(0, tempdx.rows - ksize2);
    298             dy = tempdy.rowRange(0, tempdy.rows - ksize2);
    299         }
    300         else if (boundaries.end == src.rows)
    301         {
    302             Mat tempdx(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
    303             Mat tempdy(boundaries.end - boundaries.start + 2 + ksize2, src.cols, CV_16SC(cn));
    304 
    305             memset(tempdx.ptr<short>(tempdx.rows - 1), 0, cn * src.cols*sizeof(short));
    306             memset(tempdy.ptr<short>(tempdy.rows - 1), 0, cn * src.cols*sizeof(short));
    307 
    308             Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end), tempdx.rowRange(0, tempdx.rows - 1),
    309                     CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
    310             Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end), tempdy.rowRange(0, tempdy.rows - 1),
    311                     CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
    312 
    313             dx = tempdx.rowRange(ksize2, tempdx.rows);
    314             dy = tempdy.rowRange(ksize2, tempdy.rows);
    315         }
    316         else
    317         {
    318             Mat tempdx(boundaries.end - boundaries.start + 2 + 2*ksize2, src.cols, CV_16SC(cn));
    319             Mat tempdy(boundaries.end - boundaries.start + 2 + 2*ksize2, src.cols, CV_16SC(cn));
    320 
    321             Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end + 1 + ksize2), tempdx,
    322                     CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
    323             Sobel(src.rowRange(boundaries.start - 1 - ksize2, boundaries.end + 1 + ksize2), tempdy,
    324                     CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
    325 
    326             dx = tempdx.rowRange(ksize2, tempdx.rows - ksize2);
    327             dy = tempdy.rowRange(ksize2, tempdy.rows - ksize2);
    328         }
    329 
    330         int maxsize = std::max(1 << 10, src.cols * (boundaries.end - boundaries.start) / 10);
    331         std::vector<uchar*> stack(maxsize);
    332         uchar **stack_top = &stack[0];
    333         uchar **stack_bottom = &stack[0];
    334 
    335         AutoBuffer<uchar> buffer(cn * mapstep * 3 * sizeof(int));
    336 
    337         int* mag_buf[3];
    338         mag_buf[0] = (int*)(uchar*)buffer;
    339         mag_buf[1] = mag_buf[0] + mapstep*cn;
    340         mag_buf[2] = mag_buf[1] + mapstep*cn;
    341 
    342         // calculate magnitude and angle of gradient, perform non-maxima suppression.
    343         // fill the map with one of the following values:
    344         //   0 - the pixel might belong to an edge
    345         //   1 - the pixel can not belong to an edge
    346         //   2 - the pixel does belong to an edge
    347         for (int i = boundaries.start - 1; i <= boundaries.end; i++)
    348         {
    349             int* _norm = mag_buf[(i > boundaries.start) - (i == boundaries.start - 1) + 1] + 1;
    350 
    351             short* _dx = dx.ptr<short>(i - boundaries.start + 1);
    352             short* _dy = dy.ptr<short>(i - boundaries.start + 1);
    353 
    354             if (!L2gradient)
    355             {
    356                 int j = 0, width = src.cols * cn;
    357 #if CV_SSE2
    358                 if (haveSSE2)
    359                 {
    360                     __m128i v_zero = _mm_setzero_si128();
    361                     for ( ; j <= width - 8; j += 8)
    362                     {
    363                         __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
    364                         __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
    365                         v_dx = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx));
    366                         v_dy = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy));
    367 
    368                         __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx, v_zero), _mm_unpacklo_epi16(v_dy, v_zero));
    369                         _mm_storeu_si128((__m128i *)(_norm + j), v_norm);
    370 
    371                         v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx, v_zero), _mm_unpackhi_epi16(v_dy, v_zero));
    372                         _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
    373                     }
    374                 }
    375 #elif CV_NEON
    376                 for ( ; j <= width - 8; j += 8)
    377                 {
    378                     int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
    379                     vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
    380                                                    vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
    381                     vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
    382                                                        vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
    383                 }
    384 #endif
    385                 for ( ; j < width; ++j)
    386                     _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
    387             }
    388             else
    389             {
    390                 int j = 0, width = src.cols * cn;
    391 #if CV_SSE2
    392                 if (haveSSE2)
    393                 {
    394                     for ( ; j <= width - 8; j += 8)
    395                     {
    396                         __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
    397                         __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
    398 
    399                         __m128i v_dx_ml = _mm_mullo_epi16(v_dx, v_dx), v_dx_mh = _mm_mulhi_epi16(v_dx, v_dx);
    400                         __m128i v_dy_ml = _mm_mullo_epi16(v_dy, v_dy), v_dy_mh = _mm_mulhi_epi16(v_dy, v_dy);
    401 
    402                         __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx_ml, v_dx_mh), _mm_unpacklo_epi16(v_dy_ml, v_dy_mh));
    403                         _mm_storeu_si128((__m128i *)(_norm + j), v_norm);
    404 
    405                         v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx_ml, v_dx_mh), _mm_unpackhi_epi16(v_dy_ml, v_dy_mh));
    406                         _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
    407                     }
    408                 }
    409 #elif CV_NEON
    410                 for ( ; j <= width - 8; j += 8)
    411                 {
    412                     int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
    413                     int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
    414                     int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
    415                     vst1q_s32(_norm + j, v_dst);
    416 
    417                     v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
    418                     v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
    419                     vst1q_s32(_norm + j + 4, v_dst);
    420                 }
    421 #endif
    422                 for ( ; j < width; ++j)
    423                     _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
    424             }
    425 
    426             if (cn > 1)
    427             {
    428                 for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
    429                 {
    430                     int maxIdx = jn;
    431                     for(int k = 1; k < cn; ++k)
    432                         if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
    433                     _norm[j] = _norm[maxIdx];
    434                     _dx[j] = _dx[maxIdx];
    435                     _dy[j] = _dy[maxIdx];
    436                 }
    437             }
    438             _norm[-1] = _norm[src.cols] = 0;
    439 
    440             // at the very beginning we do not have a complete ring
    441             // buffer of 3 magnitude rows for non-maxima suppression
    442             if (i <= boundaries.start)
    443                 continue;
    444 
    445             uchar* _map = map + mapstep*i + 1;
    446             _map[-1] = _map[src.cols] = 1;
    447 
    448             int* _mag = mag_buf[1] + 1; // take the central row
    449             ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
    450             ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];
    451 
    452             const short* _x = dx.ptr<short>(i - boundaries.start);
    453             const short* _y = dy.ptr<short>(i - boundaries.start);
    454 
    455             if ((stack_top - stack_bottom) + src.cols > maxsize)
    456             {
    457                 int sz = (int)(stack_top - stack_bottom);
    458                 maxsize = std::max(maxsize * 3/2, sz + src.cols);
    459                 stack.resize(maxsize);
    460                 stack_bottom = &stack[0];
    461                 stack_top = stack_bottom + sz;
    462             }
    463 
    464 #define CANNY_PUSH(d)    *(d) = uchar(2), *stack_top++ = (d)
    465 #define CANNY_POP(d)     (d) = *--stack_top
    466 
    467             int prev_flag = 0;
    468             bool canny_push = false;
    469             for (int j = 0; j < src.cols; j++)
    470             {
    471                 #define CANNY_SHIFT 15
    472                 const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
    473 
    474                 int m = _mag[j];
    475 
    476                 if (m > low)
    477                 {
    478                     int xs = _x[j];
    479                     int ys = _y[j];
    480                     int x = std::abs(xs);
    481                     int y = std::abs(ys) << CANNY_SHIFT;
    482 
    483                     int tg22x = x * TG22;
    484 
    485                     if (y < tg22x)
    486                     {
    487                         if (m > _mag[j-1] && m >= _mag[j+1]) canny_push = true;
    488                     }
    489                     else
    490                     {
    491                         int tg67x = tg22x + (x << (CANNY_SHIFT+1));
    492                         if (y > tg67x)
    493                         {
    494                             if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) canny_push = true;
    495                         }
    496                         else
    497                         {
    498                             int s = (xs ^ ys) < 0 ? -1 : 1;
    499                             if (m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) canny_push = true;
    500                         }
    501                     }
    502                 }
    503                 if (!canny_push)
    504                 {
    505                     prev_flag = 0;
    506                     _map[j] = uchar(1);
    507                     continue;
    508                 }
    509                 else
    510                 {
    511                     // _map[j-mapstep] is short-circuited at the start because previous thread is
    512                     // responsible for initializing it.
    513                     if (!prev_flag && m > high && (i <= boundaries.start+1 || _map[j-mapstep] != 2) )
    514                     {
    515                         CANNY_PUSH(_map + j);
    516                         prev_flag = 1;
    517                     }
    518                     else
    519                         _map[j] = 0;
    520 
    521                     canny_push = false;
    522                 }
    523             }
    524 
    525             // scroll the ring buffer
    526             _mag = mag_buf[0];
    527             mag_buf[0] = mag_buf[1];
    528             mag_buf[1] = mag_buf[2];
    529             mag_buf[2] = _mag;
    530         }
    531 
    532         // now track the edges (hysteresis thresholding)
    533         while (stack_top > stack_bottom)
    534         {
    535             if ((stack_top - stack_bottom) + 8 > maxsize)
    536             {
    537                 int sz = (int)(stack_top - stack_bottom);
    538                 maxsize = maxsize * 3/2;
    539                 stack.resize(maxsize);
    540                 stack_bottom = &stack[0];
    541                 stack_top = stack_bottom + sz;
    542             }
    543 
    544             uchar* m;
    545             CANNY_POP(m);
    546 
    547             // Stops thresholding from expanding to other slices by sending pixels in the borders of each
    548             // slice in a queue to be serially processed later.
    549             if ( (m < map + (boundaries.start + 2) * mapstep) || (m >= map + boundaries.end * mapstep) )
    550             {
    551                 borderPeaks.push(m);
    552                 continue;
    553             }
    554 
    555             if (!m[-1])         CANNY_PUSH(m - 1);
    556             if (!m[1])          CANNY_PUSH(m + 1);
    557             if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
    558             if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
    559             if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
    560             if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
    561             if (!m[mapstep])    CANNY_PUSH(m + mapstep);
    562             if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
    563         }
    564     }
    565 
    566 private:
    567     const Range boundaries;
    568     const Mat& src;
    569     uchar* map;
    570     int low;
    571     int high;
    572     int aperture_size;
    573     bool L2gradient;
    574 };
    575 
    576 #endif
    577 
    578 } // namespace cv
    579 
    580 void cv::Canny( InputArray _src, OutputArray _dst,
    581                 double low_thresh, double high_thresh,
    582                 int aperture_size, bool L2gradient )
    583 {
    584     const int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    585     const Size size = _src.size();
    586 
    587     CV_Assert( depth == CV_8U );
    588     _dst.create(size, CV_8U);
    589 
    590     if (!L2gradient && (aperture_size & CV_CANNY_L2_GRADIENT) == CV_CANNY_L2_GRADIENT)
    591     {
    592         // backward compatibility
    593         aperture_size &= ~CV_CANNY_L2_GRADIENT;
    594         L2gradient = true;
    595     }
    596 
    597     if ((aperture_size & 1) == 0 || (aperture_size != -1 && (aperture_size < 3 || aperture_size > 7)))
    598         CV_Error(CV_StsBadFlag, "Aperture size should be odd");
    599 
    600     if (low_thresh > high_thresh)
    601         std::swap(low_thresh, high_thresh);
    602 
    603     CV_OCL_RUN(_dst.isUMat() && (cn == 1 || cn == 3),
    604                ocl_Canny(_src, _dst, (float)low_thresh, (float)high_thresh, aperture_size, L2gradient, cn, size))
    605 
    606     Mat src = _src.getMat(), dst = _dst.getMat();
    607 
    608 #ifdef HAVE_TEGRA_OPTIMIZATION
    609     if (tegra::useTegra() && tegra::canny(src, dst, low_thresh, high_thresh, aperture_size, L2gradient))
    610         return;
    611 #endif
    612 
    613 #ifdef USE_IPP_CANNY
    614     CV_IPP_CHECK()
    615     {
    616         if( aperture_size == 3 && !L2gradient && 1 == cn )
    617         {
    618             if (ippCanny(src, dst, (float)low_thresh, (float)high_thresh))
    619             {
    620                 CV_IMPL_ADD(CV_IMPL_IPP);
    621                 return;
    622             }
    623             setIppErrorStatus();
    624         }
    625     }
    626 #endif
    627 
    628 #ifdef HAVE_TBB
    629 
    630 if (L2gradient)
    631 {
    632     low_thresh = std::min(32767.0, low_thresh);
    633     high_thresh = std::min(32767.0, high_thresh);
    634 
    635     if (low_thresh > 0) low_thresh *= low_thresh;
    636     if (high_thresh > 0) high_thresh *= high_thresh;
    637 }
    638 int low = cvFloor(low_thresh);
    639 int high = cvFloor(high_thresh);
    640 
    641 ptrdiff_t mapstep = src.cols + 2;
    642 AutoBuffer<uchar> buffer((src.cols+2)*(src.rows+2));
    643 
    644 uchar* map = (uchar*)buffer;
    645 memset(map, 1, mapstep);
    646 memset(map + mapstep*(src.rows + 1), 1, mapstep);
    647 
    648 int threadsNumber = tbb::task_scheduler_init::default_num_threads();
    649 int grainSize = src.rows / threadsNumber;
    650 
    651 // Make a fallback for pictures with too few rows.
    652 uchar ksize2 = aperture_size / 2;
    653 int minGrainSize = 1 + ksize2;
    654 int maxGrainSize = src.rows - 2 - 2*ksize2;
    655 if ( !( minGrainSize <= grainSize && grainSize <= maxGrainSize ) )
    656 {
    657     threadsNumber = 1;
    658     grainSize = src.rows;
    659 }
    660 
    661 tbb::task_group g;
    662 
    663 for (int i = 0; i < threadsNumber; ++i)
    664 {
    665     if (i < threadsNumber - 1)
    666         g.run(tbbCanny(Range(i * grainSize, (i + 1) * grainSize), src, map, low, high, aperture_size, L2gradient));
    667     else
    668         g.run(tbbCanny(Range(i * grainSize, src.rows), src, map, low, high, aperture_size, L2gradient));
    669 }
    670 
    671 g.wait();
    672 
    673 #define CANNY_PUSH_SERIAL(d)    *(d) = uchar(2), borderPeaks.push(d)
    674 
    675 // now track the edges (hysteresis thresholding)
    676 uchar* m;
    677 while (borderPeaks.try_pop(m))
    678 {
    679     if (!m[-1])         CANNY_PUSH_SERIAL(m - 1);
    680     if (!m[1])          CANNY_PUSH_SERIAL(m + 1);
    681     if (!m[-mapstep-1]) CANNY_PUSH_SERIAL(m - mapstep - 1);
    682     if (!m[-mapstep])   CANNY_PUSH_SERIAL(m - mapstep);
    683     if (!m[-mapstep+1]) CANNY_PUSH_SERIAL(m - mapstep + 1);
    684     if (!m[mapstep-1])  CANNY_PUSH_SERIAL(m + mapstep - 1);
    685     if (!m[mapstep])    CANNY_PUSH_SERIAL(m + mapstep);
    686     if (!m[mapstep+1])  CANNY_PUSH_SERIAL(m + mapstep + 1);
    687 }
    688 
    689 #else
    690 
    691     Mat dx(src.rows, src.cols, CV_16SC(cn));
    692     Mat dy(src.rows, src.cols, CV_16SC(cn));
    693 
    694     Sobel(src, dx, CV_16S, 1, 0, aperture_size, 1, 0, BORDER_REPLICATE);
    695     Sobel(src, dy, CV_16S, 0, 1, aperture_size, 1, 0, BORDER_REPLICATE);
    696 
    697     if (L2gradient)
    698     {
    699         low_thresh = std::min(32767.0, low_thresh);
    700         high_thresh = std::min(32767.0, high_thresh);
    701 
    702         if (low_thresh > 0) low_thresh *= low_thresh;
    703         if (high_thresh > 0) high_thresh *= high_thresh;
    704     }
    705     int low = cvFloor(low_thresh);
    706     int high = cvFloor(high_thresh);
    707 
    708     ptrdiff_t mapstep = src.cols + 2;
    709     AutoBuffer<uchar> buffer((src.cols+2)*(src.rows+2) + cn * mapstep * 3 * sizeof(int));
    710 
    711     int* mag_buf[3];
    712     mag_buf[0] = (int*)(uchar*)buffer;
    713     mag_buf[1] = mag_buf[0] + mapstep*cn;
    714     mag_buf[2] = mag_buf[1] + mapstep*cn;
    715     memset(mag_buf[0], 0, /* cn* */mapstep*sizeof(int));
    716 
    717     uchar* map = (uchar*)(mag_buf[2] + mapstep*cn);
    718     memset(map, 1, mapstep);
    719     memset(map + mapstep*(src.rows + 1), 1, mapstep);
    720 
    721     int maxsize = std::max(1 << 10, src.cols * src.rows / 10);
    722     std::vector<uchar*> stack(maxsize);
    723     uchar **stack_top = &stack[0];
    724     uchar **stack_bottom = &stack[0];
    725 
    726     /* sector numbers
    727        (Top-Left Origin)
    728 
    729         1   2   3
    730          *  *  *
    731           * * *
    732         0*******0
    733           * * *
    734          *  *  *
    735         3   2   1
    736     */
    737 
    738     #define CANNY_PUSH(d)    *(d) = uchar(2), *stack_top++ = (d)
    739     #define CANNY_POP(d)     (d) = *--stack_top
    740 
    741 #if CV_SSE2
    742     bool haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
    743 #endif
    744 
    745     // calculate magnitude and angle of gradient, perform non-maxima suppression.
    746     // fill the map with one of the following values:
    747     //   0 - the pixel might belong to an edge
    748     //   1 - the pixel can not belong to an edge
    749     //   2 - the pixel does belong to an edge
    750     for (int i = 0; i <= src.rows; i++)
    751     {
    752         int* _norm = mag_buf[(i > 0) + 1] + 1;
    753         if (i < src.rows)
    754         {
    755             short* _dx = dx.ptr<short>(i);
    756             short* _dy = dy.ptr<short>(i);
    757 
    758             if (!L2gradient)
    759             {
    760                 int j = 0, width = src.cols * cn;
    761 #if CV_SSE2
    762                 if (haveSSE2)
    763                 {
    764                     __m128i v_zero = _mm_setzero_si128();
    765                     for ( ; j <= width - 8; j += 8)
    766                     {
    767                         __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
    768                         __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
    769                         v_dx = _mm_max_epi16(v_dx, _mm_sub_epi16(v_zero, v_dx));
    770                         v_dy = _mm_max_epi16(v_dy, _mm_sub_epi16(v_zero, v_dy));
    771 
    772                         __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx, v_zero), _mm_unpacklo_epi16(v_dy, v_zero));
    773                         _mm_storeu_si128((__m128i *)(_norm + j), v_norm);
    774 
    775                         v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx, v_zero), _mm_unpackhi_epi16(v_dy, v_zero));
    776                         _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
    777                     }
    778                 }
    779 #elif CV_NEON
    780                 for ( ; j <= width - 8; j += 8)
    781                 {
    782                     int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
    783                     vst1q_s32(_norm + j, vaddq_s32(vabsq_s32(vmovl_s16(vget_low_s16(v_dx))),
    784                                                    vabsq_s32(vmovl_s16(vget_low_s16(v_dy)))));
    785                     vst1q_s32(_norm + j + 4, vaddq_s32(vabsq_s32(vmovl_s16(vget_high_s16(v_dx))),
    786                                                        vabsq_s32(vmovl_s16(vget_high_s16(v_dy)))));
    787                 }
    788 #endif
    789                 for ( ; j < width; ++j)
    790                     _norm[j] = std::abs(int(_dx[j])) + std::abs(int(_dy[j]));
    791             }
    792             else
    793             {
    794                 int j = 0, width = src.cols * cn;
    795 #if CV_SSE2
    796                 if (haveSSE2)
    797                 {
    798                     for ( ; j <= width - 8; j += 8)
    799                     {
    800                         __m128i v_dx = _mm_loadu_si128((const __m128i *)(_dx + j));
    801                         __m128i v_dy = _mm_loadu_si128((const __m128i *)(_dy + j));
    802 
    803                         __m128i v_dx_ml = _mm_mullo_epi16(v_dx, v_dx), v_dx_mh = _mm_mulhi_epi16(v_dx, v_dx);
    804                         __m128i v_dy_ml = _mm_mullo_epi16(v_dy, v_dy), v_dy_mh = _mm_mulhi_epi16(v_dy, v_dy);
    805 
    806                         __m128i v_norm = _mm_add_epi32(_mm_unpacklo_epi16(v_dx_ml, v_dx_mh), _mm_unpacklo_epi16(v_dy_ml, v_dy_mh));
    807                         _mm_storeu_si128((__m128i *)(_norm + j), v_norm);
    808 
    809                         v_norm = _mm_add_epi32(_mm_unpackhi_epi16(v_dx_ml, v_dx_mh), _mm_unpackhi_epi16(v_dy_ml, v_dy_mh));
    810                         _mm_storeu_si128((__m128i *)(_norm + j + 4), v_norm);
    811                     }
    812                 }
    813 #elif CV_NEON
    814                 for ( ; j <= width - 8; j += 8)
    815                 {
    816                     int16x8_t v_dx = vld1q_s16(_dx + j), v_dy = vld1q_s16(_dy + j);
    817                     int16x4_t v_dxp = vget_low_s16(v_dx), v_dyp = vget_low_s16(v_dy);
    818                     int32x4_t v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
    819                     vst1q_s32(_norm + j, v_dst);
    820 
    821                     v_dxp = vget_high_s16(v_dx), v_dyp = vget_high_s16(v_dy);
    822                     v_dst = vmlal_s16(vmull_s16(v_dxp, v_dxp), v_dyp, v_dyp);
    823                     vst1q_s32(_norm + j + 4, v_dst);
    824                 }
    825 #endif
    826                 for ( ; j < width; ++j)
    827                     _norm[j] = int(_dx[j])*_dx[j] + int(_dy[j])*_dy[j];
    828             }
    829 
    830             if (cn > 1)
    831             {
    832                 for(int j = 0, jn = 0; j < src.cols; ++j, jn += cn)
    833                 {
    834                     int maxIdx = jn;
    835                     for(int k = 1; k < cn; ++k)
    836                         if(_norm[jn + k] > _norm[maxIdx]) maxIdx = jn + k;
    837                     _norm[j] = _norm[maxIdx];
    838                     _dx[j] = _dx[maxIdx];
    839                     _dy[j] = _dy[maxIdx];
    840                 }
    841             }
    842             _norm[-1] = _norm[src.cols] = 0;
    843         }
    844         else
    845             memset(_norm-1, 0, /* cn* */mapstep*sizeof(int));
    846 
    847         // at the very beginning we do not have a complete ring
    848         // buffer of 3 magnitude rows for non-maxima suppression
    849         if (i == 0)
    850             continue;
    851 
    852         uchar* _map = map + mapstep*i + 1;
    853         _map[-1] = _map[src.cols] = 1;
    854 
    855         int* _mag = mag_buf[1] + 1; // take the central row
    856         ptrdiff_t magstep1 = mag_buf[2] - mag_buf[1];
    857         ptrdiff_t magstep2 = mag_buf[0] - mag_buf[1];
    858 
    859         const short* _x = dx.ptr<short>(i-1);
    860         const short* _y = dy.ptr<short>(i-1);
    861 
    862         if ((stack_top - stack_bottom) + src.cols > maxsize)
    863         {
    864             int sz = (int)(stack_top - stack_bottom);
    865             maxsize = std::max(maxsize * 3/2, sz + src.cols);
    866             stack.resize(maxsize);
    867             stack_bottom = &stack[0];
    868             stack_top = stack_bottom + sz;
    869         }
    870 
    871         int prev_flag = 0;
    872         for (int j = 0; j < src.cols; j++)
    873         {
    874             #define CANNY_SHIFT 15
    875             const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);
    876 
    877             int m = _mag[j];
    878 
    879             if (m > low)
    880             {
    881                 int xs = _x[j];
    882                 int ys = _y[j];
    883                 int x = std::abs(xs);
    884                 int y = std::abs(ys) << CANNY_SHIFT;
    885 
    886                 int tg22x = x * TG22;
    887 
    888                 if (y < tg22x)
    889                 {
    890                     if (m > _mag[j-1] && m >= _mag[j+1]) goto __ocv_canny_push;
    891                 }
    892                 else
    893                 {
    894                     int tg67x = tg22x + (x << (CANNY_SHIFT+1));
    895                     if (y > tg67x)
    896                     {
    897                         if (m > _mag[j+magstep2] && m >= _mag[j+magstep1]) goto __ocv_canny_push;
    898                     }
    899                     else
    900                     {
    901                         int s = (xs ^ ys) < 0 ? -1 : 1;
    902                         if (m > _mag[j+magstep2-s] && m > _mag[j+magstep1+s]) goto __ocv_canny_push;
    903                     }
    904                 }
    905             }
    906             prev_flag = 0;
    907             _map[j] = uchar(1);
    908             continue;
    909 __ocv_canny_push:
    910             if (!prev_flag && m > high && _map[j-mapstep] != 2)
    911             {
    912                 CANNY_PUSH(_map + j);
    913                 prev_flag = 1;
    914             }
    915             else
    916                 _map[j] = 0;
    917         }
    918 
    919         // scroll the ring buffer
    920         _mag = mag_buf[0];
    921         mag_buf[0] = mag_buf[1];
    922         mag_buf[1] = mag_buf[2];
    923         mag_buf[2] = _mag;
    924     }
    925 
    926     // now track the edges (hysteresis thresholding)
    927     while (stack_top > stack_bottom)
    928     {
    929         uchar* m;
    930         if ((stack_top - stack_bottom) + 8 > maxsize)
    931         {
    932             int sz = (int)(stack_top - stack_bottom);
    933             maxsize = maxsize * 3/2;
    934             stack.resize(maxsize);
    935             stack_bottom = &stack[0];
    936             stack_top = stack_bottom + sz;
    937         }
    938 
    939         CANNY_POP(m);
    940 
    941         if (!m[-1])         CANNY_PUSH(m - 1);
    942         if (!m[1])          CANNY_PUSH(m + 1);
    943         if (!m[-mapstep-1]) CANNY_PUSH(m - mapstep - 1);
    944         if (!m[-mapstep])   CANNY_PUSH(m - mapstep);
    945         if (!m[-mapstep+1]) CANNY_PUSH(m - mapstep + 1);
    946         if (!m[mapstep-1])  CANNY_PUSH(m + mapstep - 1);
    947         if (!m[mapstep])    CANNY_PUSH(m + mapstep);
    948         if (!m[mapstep+1])  CANNY_PUSH(m + mapstep + 1);
    949     }
    950 
    951 #endif
    952 
    953     // the final pass, form the final image
    954     const uchar* pmap = map + mapstep + 1;
    955     uchar* pdst = dst.ptr();
    956     for (int i = 0; i < src.rows; i++, pmap += mapstep, pdst += dst.step)
    957     {
    958         for (int j = 0; j < src.cols; j++)
    959             pdst[j] = (uchar)-(pmap[j] >> 1);
    960     }
    961 }
    962 
    963 void cvCanny( const CvArr* image, CvArr* edges, double threshold1,
    964               double threshold2, int aperture_size )
    965 {
    966     cv::Mat src = cv::cvarrToMat(image), dst = cv::cvarrToMat(edges);
    967     CV_Assert( src.size == dst.size && src.depth() == CV_8U && dst.type() == CV_8U );
    968 
    969     cv::Canny(src, dst, threshold1, threshold2, aperture_size & 255,
    970               (aperture_size & CV_CANNY_L2_GRADIENT) != 0);
    971 }
    972 
    973 /* End of file. */
    974