Home | History | Annotate | Download | only in src
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
     15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
     16 // Third party copyrights are property of their respective owners.
     17 //
     18 // Redistribution and use in source and binary forms, with or without modification,
     19 // are permitted provided that the following conditions are met:
     20 //
     21 //   * Redistribution's of source code must retain the above copyright notice,
     22 //     this list of conditions and the following disclaimer.
     23 //
     24 //   * Redistribution's in binary form must reproduce the above copyright notice,
     25 //     this list of conditions and the following disclaimer in the documentation
     26 //     and/or other materials provided with the distribution.
     27 //
     28 //   * The name of the copyright holders may not be used to endorse or promote products
     29 //     derived from this software without specific prior written permission.
     30 //
     31 // This software is provided by the copyright holders and contributors "as is" and
     32 // any express or implied warranties, including, but not limited to, the implied
     33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     34 // In no event shall the Intel Corporation or contributors be liable for any direct,
     35 // indirect, incidental, special, exemplary, or consequential damages
     36 // (including, but not limited to, procurement of substitute goods or services;
     37 // loss of use, data, or profits; or business interruption) however caused
     38 // and on any theory of liability, whether in contract, strict liability,
     39 // or tort (including negligence or otherwise) arising in any way out of
     40 // the use of this software, even if advised of the possibility of such damage.
     41 //
     42 //M*/
     43 
     44 /********************************* COPYRIGHT NOTICE *******************************\
     45   The function for RGB to Lab conversion is based on the MATLAB script
     46   RGB2Lab.m translated by Mark Ruzon from C code by Yossi Rubner, 23 September 1997.
     47   See the page [http://vision.stanford.edu/~ruzon/software/rgblab.html]
     48 \**********************************************************************************/
     49 
     50 /********************************* COPYRIGHT NOTICE *******************************\
     51   Original code for Bayer->BGR/RGB conversion is provided by Dirk Schaefer
     52   from MD-Mathematische Dienste GmbH. Below is the copyright notice:
     53 
     54     IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
     55     By downloading, copying, installing or using the software you agree
     56     to this license. If you do not agree to this license, do not download,
     57     install, copy or use the software.
     58 
     59     Contributors License Agreement:
     60 
     61       Copyright (c) 2002,
     62       MD-Mathematische Dienste GmbH
     63       Im Defdahl 5-10
     64       44141 Dortmund
     65       Germany
     66       www.md-it.de
     67 
     68     Redistribution and use in source and binary forms,
     69     with or without modification, are permitted provided
     70     that the following conditions are met:
     71 
     72     Redistributions of source code must retain
     73     the above copyright notice, this list of conditions and the following disclaimer.
     74     Redistributions in binary form must reproduce the above copyright notice,
     75     this list of conditions and the following disclaimer in the documentation
     76     and/or other materials provided with the distribution.
     77     The name of Contributor may not be used to endorse or promote products
     78     derived from this software without specific prior written permission.
     79 
     80     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
     81     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
     82     THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     83     PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE
     84     FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     85     DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     86     OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     87     HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
     88     STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
     89     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
     90     THE POSSIBILITY OF SUCH DAMAGE.
     91 \**********************************************************************************/
     92 
     93 #include "precomp.hpp"
     94 #include "opencl_kernels_imgproc.hpp"
     95 #include <limits>
     96 
        // Rounding right shift: adds 2^(n-1) (half of the divisor 2^n) to x before shifting
        // it right by n bits. n must be at least 1, otherwise the inner shift count is negative.
     97 #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
     98 
        // IPP-only constants (compiled when HAVE_IPP is defined and IPP_VERSION_MAJOR >= 7).
     99 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
        // Per-depth constants passed as the trailing argument of the ippiSwapChannels_*_C3C4R
        // calls made by the wrappers further down in this file.
        // NOTE(review): presumably the value written to the added 4th channel - confirm in the IPP docs.
    100 #define MAX_IPP8u   255
    101 #define MAX_IPP16u  65535
    102 #define MAX_IPP32f  1.0
        // ippInit() runs during static initialization; its status is stored in sts.
    103 static IppStatus sts = ippInit();
    104 #endif
    105 
    106 namespace cv
    107 {
    108 
    109 // computes cubic spline coefficients for a function: (xi=i, yi=f[i]), i=0..n
    110 template<typename _Tp> static void splineBuild(const _Tp* f, int n, _Tp* tab)
    111 {
    112     _Tp cn = 0;
    113     int i;
    114     tab[0] = tab[1] = (_Tp)0;
    115 
    116     for(i = 1; i < n-1; i++)
    117     {
    118         _Tp t = 3*(f[i+1] - 2*f[i] + f[i-1]);
    119         _Tp l = 1/(4 - tab[(i-1)*4]);
    120         tab[i*4] = l; tab[i*4+1] = (t - tab[(i-1)*4+1])*l;
    121     }
    122 
    123     for(i = n-1; i >= 0; i--)
    124     {
    125         _Tp c = tab[i*4+1] - tab[i*4]*cn;
    126         _Tp b = f[i+1] - f[i] - (cn + c*2)*(_Tp)0.3333333333333333;
    127         _Tp d = (cn - c)*(_Tp)0.3333333333333333;
    128         tab[i*4] = f[i]; tab[i*4+1] = b;
    129         tab[i*4+2] = c; tab[i*4+3] = d;
    130         cn = c;
    131     }
    132 }
    133 
    134 // interpolates value of a function at x, 0 <= x <= n using a cubic spline.
        // x   - evaluation point
        // tab - spline table with 4 coefficients per segment
        // n   - number of segments in tab
        // Returns the value of the segment polynomial selected by int(x).
    135 template<typename _Tp> static inline _Tp splineInterpolate(_Tp x, const _Tp* tab, int n)
    136 {
    137     // don't touch this function without urgent need - some versions of gcc fail to inline it correctly
        // segment index: int(x) clamped to [0, n-1], so out-of-range x reuses the first/last segment
    138     int ix = std::min(std::max(int(x), 0), n-1);
        // x becomes the offset from the start of the selected segment
    139     x -= ix;
    140     tab += ix*4;
        // Horner evaluation of tab[0] + tab[1]*x + tab[2]*x^2 + tab[3]*x^3
    141     return ((tab[3]*x + tab[2])*x + tab[1])*x + tab[0];
    142 }
    143 
    144 
    145 template<typename _Tp> struct ColorChannel
    146 {
    147     typedef float worktype_f;
    148     static _Tp max() { return std::numeric_limits<_Tp>::max(); }
    149     static _Tp half() { return (_Tp)(max()/2 + 1); }
    150 };
    151 
    152 template<> struct ColorChannel<float>
    153 {
    154     typedef float worktype_f;
    155     static float max() { return 1.f; }
    156     static float half() { return 0.5f; }
    157 };
    158 
    159 /*template<> struct ColorChannel<double>
    160 {
    161     typedef double worktype_f;
    162     static double max() { return 1.; }
    163     static double half() { return 0.5; }
    164 };*/
    165 
    166 
    167 ///////////////////////////// Top-level template function ////////////////////////////////
    168 
    169 template <typename Cvt>
    170 class CvtColorLoop_Invoker : public ParallelLoopBody
    171 {
    172     typedef typename Cvt::channel_type _Tp;
    173 public:
    174 
    175     CvtColorLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt) :
    176         ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt)
    177     {
    178     }
    179 
    180     virtual void operator()(const Range& range) const
    181     {
    182         const uchar* yS = src.ptr<uchar>(range.start);
    183         uchar* yD = dst.ptr<uchar>(range.start);
    184 
    185         for( int i = range.start; i < range.end; ++i, yS += src.step, yD += dst.step )
    186             cvt((const _Tp*)yS, (_Tp*)yD, src.cols);
    187     }
    188 
    189 private:
    190     const Mat& src;
    191     Mat& dst;
    192     const Cvt& cvt;
    193 
    194     const CvtColorLoop_Invoker& operator= (const CvtColorLoop_Invoker&);
    195 };
    196 
    197 template <typename Cvt>
    198 void CvtColorLoop(const Mat& src, Mat& dst, const Cvt& cvt)
    199 {
    200     parallel_for_(Range(0, src.rows), CvtColorLoop_Invoker<Cvt>(src, dst, cvt), src.total()/(double)(1<<16) );
    201 }
    202 
    203 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
    204 
        // Uniform signatures that the IPP entry points are cast to. As called by the functors
        // in this file the arguments are (src, srcStep, dst, dstStep, roiSize[, extra]).
        // ippiReorderFunc: extra is the int channel-order array.
    205 typedef IppStatus (CV_STDCALL* ippiReorderFunc)(const void *, int, void *, int, IppiSize, const int *);
        // ippiGeneralFunc: no extra argument.
    206 typedef IppStatus (CV_STDCALL* ippiGeneralFunc)(const void *, int, void *, int, IppiSize);
        // ippiColor2GrayFunc: extra is the Ipp32f coefficient array.
    207 typedef IppStatus (CV_STDCALL* ippiColor2GrayFunc)(const void *, int, void *, int, IppiSize, const Ipp32f *);
    208 
    209 template <typename Cvt>
    210 class CvtColorIPPLoop_Invoker :
    211         public ParallelLoopBody
    212 {
    213 public:
    214 
    215     CvtColorIPPLoop_Invoker(const Mat& _src, Mat& _dst, const Cvt& _cvt, bool *_ok) :
    216         ParallelLoopBody(), src(_src), dst(_dst), cvt(_cvt), ok(_ok)
    217     {
    218         *ok = true;
    219     }
    220 
    221     virtual void operator()(const Range& range) const
    222     {
    223         const void *yS = src.ptr<uchar>(range.start);
    224         void *yD = dst.ptr<uchar>(range.start);
    225         if( !cvt(yS, (int)src.step[0], yD, (int)dst.step[0], src.cols, range.end - range.start) )
    226             *ok = false;
    227         else
    228         {
    229             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
    230         }
    231     }
    232 
    233 private:
    234     const Mat& src;
    235     Mat& dst;
    236     const Cvt& cvt;
    237     bool *ok;
    238 
    239     const CvtColorIPPLoop_Invoker& operator= (const CvtColorIPPLoop_Invoker&);
    240 };
    241 
    242 template <typename Cvt>
    243 bool CvtColorIPPLoop(const Mat& src, Mat& dst, const Cvt& cvt)
    244 {
    245     bool ok;
    246     parallel_for_(Range(0, src.rows), CvtColorIPPLoop_Invoker<Cvt>(src, dst, cvt, &ok), src.total()/(double)(1<<16) );
    247     return ok;
    248 }
    249 
    250 template <typename Cvt>
    251 bool CvtColorIPPLoopCopy(Mat& src, Mat& dst, const Cvt& cvt)
    252 {
    253     Mat temp;
    254     Mat &source = src;
    255     if( src.data == dst.data )
    256     {
    257         src.copyTo(temp);
    258         source = temp;
    259     }
    260     bool ok;
    261     parallel_for_(Range(0, source.rows), CvtColorIPPLoop_Invoker<Cvt>(source, dst, cvt, &ok),
    262                   source.total()/(double)(1<<16) );
    263     return ok;
    264 }
    265 
    266 static IppStatus CV_STDCALL ippiSwapChannels_8u_C3C4Rf(const Ipp8u* pSrc, int srcStep, Ipp8u* pDst, int dstStep,
    267          IppiSize roiSize, const int *dstOrder)
    268 {
    269     return ippiSwapChannels_8u_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP8u);
    270 }
    271 
    272 static IppStatus CV_STDCALL ippiSwapChannels_16u_C3C4Rf(const Ipp16u* pSrc, int srcStep, Ipp16u* pDst, int dstStep,
    273          IppiSize roiSize, const int *dstOrder)
    274 {
    275     return ippiSwapChannels_16u_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP16u);
    276 }
    277 
    278 static IppStatus CV_STDCALL ippiSwapChannels_32f_C3C4Rf(const Ipp32f* pSrc, int srcStep, Ipp32f* pDst, int dstStep,
    279          IppiSize roiSize, const int *dstOrder)
    280 {
    281     return ippiSwapChannels_32f_C3C4R(pSrc, srcStep, pDst, dstStep, roiSize, dstOrder, MAX_IPP32f);
    282 }
    283 
    284 static ippiReorderFunc ippiSwapChannelsC3C4RTab[] =
    285 {
    286     (ippiReorderFunc)ippiSwapChannels_8u_C3C4Rf, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3C4Rf, 0,
    287     0, (ippiReorderFunc)ippiSwapChannels_32f_C3C4Rf, 0, 0
    288 };
    289 
    290 static ippiGeneralFunc ippiCopyAC4C3RTab[] =
    291 {
    292     (ippiGeneralFunc)ippiCopy_8u_AC4C3R, 0, (ippiGeneralFunc)ippiCopy_16u_AC4C3R, 0,
    293     0, (ippiGeneralFunc)ippiCopy_32f_AC4C3R, 0, 0
    294 };
    295 
    296 static ippiReorderFunc ippiSwapChannelsC4C3RTab[] =
    297 {
    298     (ippiReorderFunc)ippiSwapChannels_8u_C4C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4C3R, 0,
    299     0, (ippiReorderFunc)ippiSwapChannels_32f_C4C3R, 0, 0
    300 };
    301 
    302 static ippiReorderFunc ippiSwapChannelsC3RTab[] =
    303 {
    304     (ippiReorderFunc)ippiSwapChannels_8u_C3R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C3R, 0,
    305     0, (ippiReorderFunc)ippiSwapChannels_32f_C3R, 0, 0
    306 };
    307 
    308 #if IPP_VERSION_X100 >= 801
    309 static ippiReorderFunc ippiSwapChannelsC4RTab[] =
    310 {
    311     (ippiReorderFunc)ippiSwapChannels_8u_C4R, 0, (ippiReorderFunc)ippiSwapChannels_16u_C4R, 0,
    312     0, (ippiReorderFunc)ippiSwapChannels_32f_C4R, 0, 0
    313 };
    314 #endif
    315 
    316 static ippiColor2GrayFunc ippiColor2GrayC3Tab[] =
    317 {
    318     (ippiColor2GrayFunc)ippiColorToGray_8u_C3C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_C3C1R, 0,
    319     0, (ippiColor2GrayFunc)ippiColorToGray_32f_C3C1R, 0, 0
    320 };
    321 
    322 static ippiColor2GrayFunc ippiColor2GrayC4Tab[] =
    323 {
    324     (ippiColor2GrayFunc)ippiColorToGray_8u_AC4C1R, 0, (ippiColor2GrayFunc)ippiColorToGray_16u_AC4C1R, 0,
    325     0, (ippiColor2GrayFunc)ippiColorToGray_32f_AC4C1R, 0, 0
    326 };
    327 
    328 static ippiGeneralFunc ippiRGB2GrayC3Tab[] =
    329 {
    330     (ippiGeneralFunc)ippiRGBToGray_8u_C3C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_C3C1R, 0,
    331     0, (ippiGeneralFunc)ippiRGBToGray_32f_C3C1R, 0, 0
    332 };
    333 
    334 static ippiGeneralFunc ippiRGB2GrayC4Tab[] =
    335 {
    336     (ippiGeneralFunc)ippiRGBToGray_8u_AC4C1R, 0, (ippiGeneralFunc)ippiRGBToGray_16u_AC4C1R, 0,
    337     0, (ippiGeneralFunc)ippiRGBToGray_32f_AC4C1R, 0, 0
    338 };
    339 
    340 static ippiGeneralFunc ippiCopyP3C3RTab[] =
    341 {
    342     (ippiGeneralFunc)ippiCopy_8u_P3C3R, 0, (ippiGeneralFunc)ippiCopy_16u_P3C3R, 0,
    343     0, (ippiGeneralFunc)ippiCopy_32f_P3C3R, 0, 0
    344 };
    345 
    346 static ippiGeneralFunc ippiRGB2XYZTab[] =
    347 {
    348     (ippiGeneralFunc)ippiRGBToXYZ_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToXYZ_16u_C3R, 0,
    349     0, (ippiGeneralFunc)ippiRGBToXYZ_32f_C3R, 0, 0
    350 };
    351 
    352 static ippiGeneralFunc ippiXYZ2RGBTab[] =
    353 {
    354     (ippiGeneralFunc)ippiXYZToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiXYZToRGB_16u_C3R, 0,
    355     0, (ippiGeneralFunc)ippiXYZToRGB_32f_C3R, 0, 0
    356 };
    357 
    358 static ippiGeneralFunc ippiRGB2HSVTab[] =
    359 {
    360     (ippiGeneralFunc)ippiRGBToHSV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHSV_16u_C3R, 0,
    361     0, 0, 0, 0
    362 };
    363 
    364 static ippiGeneralFunc ippiHSV2RGBTab[] =
    365 {
    366     (ippiGeneralFunc)ippiHSVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHSVToRGB_16u_C3R, 0,
    367     0, 0, 0, 0
    368 };
    369 
    370 static ippiGeneralFunc ippiRGB2HLSTab[] =
    371 {
    372     (ippiGeneralFunc)ippiRGBToHLS_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToHLS_16u_C3R, 0,
    373     0, (ippiGeneralFunc)ippiRGBToHLS_32f_C3R, 0, 0
    374 };
    375 
    376 static ippiGeneralFunc ippiHLS2RGBTab[] =
    377 {
    378     (ippiGeneralFunc)ippiHLSToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiHLSToRGB_16u_C3R, 0,
    379     0, (ippiGeneralFunc)ippiHLSToRGB_32f_C3R, 0, 0
    380 };
    381 
    382 #if !defined(HAVE_IPP_ICV_ONLY) && 0
    383 static ippiGeneralFunc ippiRGBToLUVTab[] =
    384 {
    385     (ippiGeneralFunc)ippiRGBToLUV_8u_C3R, 0, (ippiGeneralFunc)ippiRGBToLUV_16u_C3R, 0,
    386     0, (ippiGeneralFunc)ippiRGBToLUV_32f_C3R, 0, 0
    387 };
    388 
    389 static ippiGeneralFunc ippiLUVToRGBTab[] =
    390 {
    391     (ippiGeneralFunc)ippiLUVToRGB_8u_C3R, 0, (ippiGeneralFunc)ippiLUVToRGB_16u_C3R, 0,
    392     0, (ippiGeneralFunc)ippiLUVToRGB_32f_C3R, 0, 0
    393 };
    394 #endif
    395 
    396 struct IPPGeneralFunctor
    397 {
    398     IPPGeneralFunctor(ippiGeneralFunc _func) : func(_func){}
    399     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    400     {
    401         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0 : false;
    402     }
    403 private:
    404     ippiGeneralFunc func;
    405 };
    406 
    407 struct IPPReorderFunctor
    408 {
    409     IPPReorderFunctor(ippiReorderFunc _func, int _order0, int _order1, int _order2) : func(_func)
    410     {
    411         order[0] = _order0;
    412         order[1] = _order1;
    413         order[2] = _order2;
    414         order[3] = 3;
    415     }
    416     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    417     {
    418         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows), order) >= 0 : false;
    419     }
    420 private:
    421     ippiReorderFunc func;
    422     int order[4];
    423 };
    424 
    425 struct IPPColor2GrayFunctor
    426 {
    427     IPPColor2GrayFunctor(ippiColor2GrayFunc _func) :
    428         func(_func)
    429     {
    430         coeffs[0] = 0.114f;
    431         coeffs[1] = 0.587f;
    432         coeffs[2] = 0.299f;
    433     }
    434     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    435     {
    436         return func ? func(src, srcStep, dst, dstStep, ippiSize(cols, rows), coeffs) >= 0 : false;
    437     }
    438 private:
    439     ippiColor2GrayFunc func;
    440     Ipp32f coeffs[3];
    441 };
    442 
    443 struct IPPGray2BGRFunctor
    444 {
    445     IPPGray2BGRFunctor(ippiGeneralFunc _func) :
    446         func(_func)
    447     {
    448     }
    449 
    450     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    451     {
    452         if (func == 0)
    453             return false;
    454 
    455         const void* srcarray[3] = { src, src, src };
    456         return func(srcarray, srcStep, dst, dstStep, ippiSize(cols, rows)) >= 0;
    457     }
    458 private:
    459     ippiGeneralFunc func;
    460 };
    461 
    462 struct IPPGray2BGRAFunctor
    463 {
    464     IPPGray2BGRAFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _depth) :
    465         func1(_func1), func2(_func2), depth(_depth)
    466     {
    467     }
    468 
    469     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    470     {
    471         if (func1 == 0 || func2 == 0)
    472             return false;
    473 
    474         const void* srcarray[3] = { src, src, src };
    475         Mat temp(rows, cols, CV_MAKETYPE(depth, 3));
    476         if(func1(srcarray, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
    477             return false;
    478         int order[4] = {0, 1, 2, 3};
    479         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
    480     }
    481 private:
    482     ippiGeneralFunc func1;
    483     ippiReorderFunc func2;
    484     int depth;
    485 };
    486 
    487 struct IPPReorderGeneralFunctor
    488 {
    489     IPPReorderGeneralFunctor(ippiReorderFunc _func1, ippiGeneralFunc _func2, int _order0, int _order1, int _order2, int _depth) :
    490         func1(_func1), func2(_func2), depth(_depth)
    491     {
    492         order[0] = _order0;
    493         order[1] = _order1;
    494         order[2] = _order2;
    495         order[3] = 3;
    496     }
    497     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    498     {
    499         if (func1 == 0 || func2 == 0)
    500             return false;
    501 
    502         Mat temp;
    503         temp.create(rows, cols, CV_MAKETYPE(depth, 3));
    504         if(func1(src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows), order) < 0)
    505             return false;
    506         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows)) >= 0;
    507     }
    508 private:
    509     ippiReorderFunc func1;
    510     ippiGeneralFunc func2;
    511     int order[4];
    512     int depth;
    513 };
    514 
    515 struct IPPGeneralReorderFunctor
    516 {
    517     IPPGeneralReorderFunctor(ippiGeneralFunc _func1, ippiReorderFunc _func2, int _order0, int _order1, int _order2, int _depth) :
    518         func1(_func1), func2(_func2), depth(_depth)
    519     {
    520         order[0] = _order0;
    521         order[1] = _order1;
    522         order[2] = _order2;
    523         order[3] = 3;
    524     }
    525     bool operator()(const void *src, int srcStep, void *dst, int dstStep, int cols, int rows) const
    526     {
    527         if (func1 == 0 || func2 == 0)
    528             return false;
    529 
    530         Mat temp;
    531         temp.create(rows, cols, CV_MAKETYPE(depth, 3));
    532         if(func1(src, srcStep, temp.ptr(), (int)temp.step[0], ippiSize(cols, rows)) < 0)
    533             return false;
    534         return func2(temp.ptr(), (int)temp.step[0], dst, dstStep, ippiSize(cols, rows), order) >= 0;
    535     }
    536 private:
    537     ippiGeneralFunc func1;
    538     ippiReorderFunc func2;
    539     int order[4];
    540     int depth;
    541 };
    542 
    543 #endif
    544 
    545 ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
    546 
    547 template<typename _Tp> struct RGB2RGB
    548 {
    549     typedef _Tp channel_type;
    550 
    551     RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) {}
    552     void operator()(const _Tp* src, _Tp* dst, int n) const
    553     {
    554         int scn = srccn, dcn = dstcn, bidx = blueIdx;
    555         if( dcn == 3 )
    556         {
    557             n *= 3;
    558             for( int i = 0; i < n; i += 3, src += scn )
    559             {
    560                 _Tp t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
    561                 dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
    562             }
    563         }
    564         else if( scn == 3 )
    565         {
    566             n *= 3;
    567             _Tp alpha = ColorChannel<_Tp>::max();
    568             for( int i = 0; i < n; i += 3, dst += 4 )
    569             {
    570                 _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2];
    571                 dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
    572             }
    573         }
    574         else
    575         {
    576             n *= 4;
    577             for( int i = 0; i < n; i += 4 )
    578             {
    579                 _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
    580                 dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
    581             }
    582         }
    583     }
    584 
    585     int srccn, dstcn, blueIdx;
    586 };
    587 
    588 #if CV_NEON
    589 
    590 template<> struct RGB2RGB<uchar>
    591 {
    592     typedef uchar channel_type;
    593 
    594     RGB2RGB(int _srccn, int _dstcn, int _blueIdx) :
    595         srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx)
    596     {
    597         v_alpha = vdupq_n_u8(ColorChannel<uchar>::max());
    598         v_alpha2 = vget_low_u8(v_alpha);
    599     }
    600 
    601     void operator()(const uchar * src, uchar * dst, int n) const
    602     {
    603         int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0;
    604         if (dcn == 3)
    605         {
    606             n *= 3;
    607             if (scn == 3)
    608             {
    609                 for ( ; i <= n - 48; i += 48, src += 48 )
    610                 {
    611                     uint8x16x3_t v_src = vld3q_u8(src), v_dst;
    612                     v_dst.val[0] = v_src.val[bidx];
    613                     v_dst.val[1] = v_src.val[1];
    614                     v_dst.val[2] = v_src.val[bidx ^ 2];
    615                     vst3q_u8(dst + i, v_dst);
    616                 }
    617                 for ( ; i <= n - 24; i += 24, src += 24 )
    618                 {
    619                     uint8x8x3_t v_src = vld3_u8(src), v_dst;
    620                     v_dst.val[0] = v_src.val[bidx];
    621                     v_dst.val[1] = v_src.val[1];
    622                     v_dst.val[2] = v_src.val[bidx ^ 2];
    623                     vst3_u8(dst + i, v_dst);
    624                 }
    625                 for ( ; i < n; i += 3, src += 3 )
    626                 {
    627                     uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
    628                     dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
    629                 }
    630             }
    631             else
    632             {
    633                 for ( ; i <= n - 48; i += 48, src += 64 )
    634                 {
    635                     uint8x16x4_t v_src = vld4q_u8(src);
    636                     uint8x16x3_t v_dst;
    637                     v_dst.val[0] = v_src.val[bidx];
    638                     v_dst.val[1] = v_src.val[1];
    639                     v_dst.val[2] = v_src.val[bidx ^ 2];
    640                     vst3q_u8(dst + i, v_dst);
    641                 }
    642                 for ( ; i <= n - 24; i += 24, src += 32 )
    643                 {
    644                     uint8x8x4_t v_src = vld4_u8(src);
    645                     uint8x8x3_t v_dst;
    646                     v_dst.val[0] = v_src.val[bidx];
    647                     v_dst.val[1] = v_src.val[1];
    648                     v_dst.val[2] = v_src.val[bidx ^ 2];
    649                     vst3_u8(dst + i, v_dst);
    650                 }
    651                 for ( ; i < n; i += 3, src += 4 )
    652                 {
    653                     uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
    654                     dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
    655                 }
    656             }
    657         }
    658         else if (scn == 3)
    659         {
    660             n *= 3;
    661             for ( ; i <= n - 48; i += 48, dst += 64 )
    662             {
    663                 uint8x16x3_t v_src = vld3q_u8(src + i);
    664                 uint8x16x4_t v_dst;
    665                 v_dst.val[bidx] = v_src.val[0];
    666                 v_dst.val[1] = v_src.val[1];
    667                 v_dst.val[bidx ^ 2] = v_src.val[2];
    668                 v_dst.val[3] = v_alpha;
    669                 vst4q_u8(dst, v_dst);
    670             }
    671             for ( ; i <= n - 24; i += 24, dst += 32 )
    672             {
    673                 uint8x8x3_t v_src = vld3_u8(src + i);
    674                 uint8x8x4_t v_dst;
    675                 v_dst.val[bidx] = v_src.val[0];
    676                 v_dst.val[1] = v_src.val[1];
    677                 v_dst.val[bidx ^ 2] = v_src.val[2];
    678                 v_dst.val[3] = v_alpha2;
    679                 vst4_u8(dst, v_dst);
    680             }
    681             uchar alpha = ColorChannel<uchar>::max();
    682             for (; i < n; i += 3, dst += 4 )
    683             {
    684                 uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2];
    685                 dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
    686             }
    687         }
    688         else
    689         {
    690             n *= 4;
    691             for ( ; i <= n - 64; i += 64 )
    692             {
    693                 uint8x16x4_t v_src = vld4q_u8(src + i), v_dst;
    694                 v_dst.val[0] = v_src.val[2];
    695                 v_dst.val[1] = v_src.val[1];
    696                 v_dst.val[2] = v_src.val[0];
    697                 v_dst.val[3] = v_src.val[3];
    698                 vst4q_u8(dst + i, v_dst);
    699             }
    700             for ( ; i <= n - 32; i += 32 )
    701             {
    702                 uint8x8x4_t v_src = vld4_u8(src + i), v_dst;
    703                 v_dst.val[0] = v_src.val[2];
    704                 v_dst.val[1] = v_src.val[1];
    705                 v_dst.val[2] = v_src.val[0];
    706                 v_dst.val[3] = v_src.val[3];
    707                 vst4_u8(dst + i, v_dst);
    708             }
    709             for ( ; i < n; i += 4)
    710             {
    711                 uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
    712                 dst[i] = t2; dst[i+1] = t1; dst[i+2] = t0; dst[i+3] = t3;
    713             }
    714         }
    715     }
    716 
    717     int srccn, dstcn, blueIdx;
    718 
    719     uint8x16_t v_alpha;
    720     uint8x8_t v_alpha2;
    721 };
    722 
    723 #endif
    724 
    725 /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
    726 
    727 struct RGB5x52RGB
    728 {
    729     typedef uchar channel_type;
    730 
    731     RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits)
    732         : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits)
    733     {
    734         #if CV_NEON
    735         v_n3 = vdupq_n_u16(~3);
    736         v_n7 = vdupq_n_u16(~7);
    737         v_255 = vdupq_n_u8(255);
    738         v_0 = vdupq_n_u8(0);
    739         v_mask = vdupq_n_u16(0x8000);
    740         #endif
    741     }
    742 
    743     void operator()(const uchar* src, uchar* dst, int n) const
    744     {
    745         int dcn = dstcn, bidx = blueIdx, i = 0;
    746         if( greenBits == 6 )
    747         {
    748             #if CV_NEON
    749             for ( ; i <= n - 16; i += 16, dst += dcn * 16)
    750             {
    751                 uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
    752                 uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
    753                 uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 3), v_n3)),
    754                                              vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 3), v_n3)));
    755                 uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 8), v_n7)),
    756                                              vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 8), v_n7)));
    757                 if (dcn == 3)
    758                 {
    759                     uint8x16x3_t v_dst;
    760                     v_dst.val[bidx] = v_b;
    761                     v_dst.val[1] = v_g;
    762                     v_dst.val[bidx^2] = v_r;
    763                     vst3q_u8(dst, v_dst);
    764                 }
    765                 else
    766                 {
    767                     uint8x16x4_t v_dst;
    768                     v_dst.val[bidx] = v_b;
    769                     v_dst.val[1] = v_g;
    770                     v_dst.val[bidx^2] = v_r;
    771                     v_dst.val[3] = v_255;
    772                     vst4q_u8(dst, v_dst);
    773                 }
    774             }
    775             #endif
    776             for( ; i < n; i++, dst += dcn )
    777             {
    778                 unsigned t = ((const ushort*)src)[i];
    779                 dst[bidx] = (uchar)(t << 3);
    780                 dst[1] = (uchar)((t >> 3) & ~3);
    781                 dst[bidx ^ 2] = (uchar)((t >> 8) & ~7);
    782                 if( dcn == 4 )
    783                     dst[3] = 255;
    784             }
    785         }
    786         else
    787         {
    788             #if CV_NEON
    789             for ( ; i <= n - 16; i += 16, dst += dcn * 16)
    790             {
    791                 uint16x8_t v_src0 = vld1q_u16((const ushort *)src + i), v_src1 = vld1q_u16((const ushort *)src + i + 8);
    792                 uint8x16_t v_b = vcombine_u8(vmovn_u16(vshlq_n_u16(v_src0, 3)), vmovn_u16(vshlq_n_u16(v_src1, 3)));
    793                 uint8x16_t v_g = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 2), v_n7)),
    794                                              vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 2), v_n7)));
    795                 uint8x16_t v_r = vcombine_u8(vmovn_u16(vandq_u16(vshrq_n_u16(v_src0, 7), v_n7)),
    796                                              vmovn_u16(vandq_u16(vshrq_n_u16(v_src1, 7), v_n7)));
    797                 if (dcn == 3)
    798                 {
    799                     uint8x16x3_t v_dst;
    800                     v_dst.val[bidx] = v_b;
    801                     v_dst.val[1] = v_g;
    802                     v_dst.val[bidx^2] = v_r;
    803                     vst3q_u8(dst, v_dst);
    804                 }
    805                 else
    806                 {
    807                     uint8x16x4_t v_dst;
    808                     v_dst.val[bidx] = v_b;
    809                     v_dst.val[1] = v_g;
    810                     v_dst.val[bidx^2] = v_r;
    811                     v_dst.val[3] = vbslq_u8(vcombine_u8(vqmovn_u16(vandq_u16(v_src0, v_mask)),
    812                                                         vqmovn_u16(vandq_u16(v_src1, v_mask))), v_255, v_0);
    813                     vst4q_u8(dst, v_dst);
    814                 }
    815             }
    816             #endif
    817             for( ; i < n; i++, dst += dcn )
    818             {
    819                 unsigned t = ((const ushort*)src)[i];
    820                 dst[bidx] = (uchar)(t << 3);
    821                 dst[1] = (uchar)((t >> 2) & ~7);
    822                 dst[bidx ^ 2] = (uchar)((t >> 7) & ~7);
    823                 if( dcn == 4 )
    824                     dst[3] = t & 0x8000 ? 255 : 0;
    825             }
    826         }
    827     }
    828 
    829     int dstcn, blueIdx, greenBits;
    830     #if CV_NEON
    831     uint16x8_t v_n3, v_n7, v_mask;
    832     uint8x16_t v_255, v_0;
    833     #endif
    834 };
    835 
    836 
    837 struct RGB2RGB5x5
    838 {
    839     typedef uchar channel_type;
    840 
    841     RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits)
    842         : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits)
    843     {
    844         #if CV_NEON
    845         v_n3 = vdup_n_u8(~3);
    846         v_n7 = vdup_n_u8(~7);
    847         v_mask = vdupq_n_u16(0x8000);
    848         v_0 = vdupq_n_u16(0);
    849         v_full = vdupq_n_u16(0xffff);
    850         #endif
    851     }
    852 
    853     void operator()(const uchar* src, uchar* dst, int n) const
    854     {
    855         int scn = srccn, bidx = blueIdx, i = 0;
    856         if (greenBits == 6)
    857         {
    858             if (scn == 3)
    859             {
    860                 #if CV_NEON
    861                 for ( ; i <= n - 8; i += 8, src += 24 )
    862                 {
    863                     uint8x8x3_t v_src = vld3_u8(src);
    864                     uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
    865                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
    866                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
    867                     vst1q_u16((ushort *)dst + i, v_dst);
    868                 }
    869                 #endif
    870                 for ( ; i < n; i++, src += 3 )
    871                     ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
    872             }
    873             else
    874             {
    875                 #if CV_NEON
    876                 for ( ; i <= n - 8; i += 8, src += 32 )
    877                 {
    878                     uint8x8x4_t v_src = vld4_u8(src);
    879                     uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
    880                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n3)), 3));
    881                     v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 8));
    882                     vst1q_u16((ushort *)dst + i, v_dst);
    883                 }
    884                 #endif
    885                 for ( ; i < n; i++, src += 4 )
    886                     ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
    887             }
    888         }
    889         else if (scn == 3)
    890         {
    891             #if CV_NEON
    892             for ( ; i <= n - 8; i += 8, src += 24 )
    893             {
    894                 uint8x8x3_t v_src = vld3_u8(src);
    895                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
    896                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
    897                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7));
    898                 vst1q_u16((ushort *)dst + i, v_dst);
    899             }
    900             #endif
    901             for ( ; i < n; i++, src += 3 )
    902                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
    903         }
    904         else
    905         {
    906             #if CV_NEON
    907             for ( ; i <= n - 8; i += 8, src += 32 )
    908             {
    909                 uint8x8x4_t v_src = vld4_u8(src);
    910                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src.val[bidx], 3));
    911                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[1], v_n7)), 2));
    912                 v_dst = vorrq_u16(v_dst, vorrq_u16(vshlq_n_u16(vmovl_u8(vand_u8(v_src.val[bidx^2], v_n7)), 7),
    913                                                    vbslq_u16(veorq_u16(vceqq_u16(vmovl_u8(v_src.val[3]), v_0), v_full), v_mask, v_0)));
    914                 vst1q_u16((ushort *)dst + i, v_dst);
    915             }
    916             #endif
    917             for ( ; i < n; i++, src += 4 )
    918                 ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|
    919                     ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0));
    920         }
    921     }
    922 
    923     int srccn, blueIdx, greenBits;
    924     #if CV_NEON
    925     uint8x8_t v_n3, v_n7;
    926     uint16x8_t v_mask, v_0, v_full;
    927     #endif
    928 };
    929 
    930 ///////////////////////////////// Color to/from Grayscale ////////////////////////////////
    931 
    932 template<typename _Tp>
    933 struct Gray2RGB
    934 {
    935     typedef _Tp channel_type;
    936 
    937     Gray2RGB(int _dstcn) : dstcn(_dstcn) {}
    938     void operator()(const _Tp* src, _Tp* dst, int n) const
    939     {
    940         if( dstcn == 3 )
    941             for( int i = 0; i < n; i++, dst += 3 )
    942             {
    943                 dst[0] = dst[1] = dst[2] = src[i];
    944             }
    945         else
    946         {
    947             _Tp alpha = ColorChannel<_Tp>::max();
    948             for( int i = 0; i < n; i++, dst += 4 )
    949             {
    950                 dst[0] = dst[1] = dst[2] = src[i];
    951                 dst[3] = alpha;
    952             }
    953         }
    954     }
    955 
    956     int dstcn;
    957 };
    958 
    959 
    960 struct Gray2RGB5x5
    961 {
    962     typedef uchar channel_type;
    963 
    964     Gray2RGB5x5(int _greenBits) : greenBits(_greenBits)
    965     {
    966         #if CV_NEON
    967         v_n7 = vdup_n_u8(~7);
    968         v_n3 = vdup_n_u8(~3);
    969         #elif CV_SSE2
    970         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
    971         v_n7 = _mm_set1_epi16(~7);
    972         v_n3 = _mm_set1_epi16(~3);
    973         v_zero = _mm_setzero_si128();
    974         #endif
    975     }
    976 
    977     void operator()(const uchar* src, uchar* dst, int n) const
    978     {
    979         int i = 0;
    980         if( greenBits == 6 )
    981         {
    982             #if CV_NEON
    983             for ( ; i <= n - 8; i += 8 )
    984             {
    985                 uint8x8_t v_src = vld1_u8(src + i);
    986                 uint16x8_t v_dst = vmovl_u8(vshr_n_u8(v_src, 3));
    987                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n3)), 3));
    988                 v_dst = vorrq_u16(v_dst, vshlq_n_u16(vmovl_u8(vand_u8(v_src, v_n7)), 8));
    989                 vst1q_u16((ushort *)dst + i, v_dst);
    990             }
    991             #elif CV_SSE2
    992             if (haveSIMD)
    993             {
    994                 for ( ; i <= n - 16; i += 16 )
    995                 {
    996                     __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));
    997 
    998                     __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
    999                     __m128i v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
   1000                                     _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
   1001                                                  _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
   1002                     _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);
   1003 
   1004                     v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
   1005                     v_dst = _mm_or_si128(_mm_srli_epi16(v_src_p, 3),
   1006                             _mm_or_si128(_mm_slli_epi16(_mm_and_si128(v_src_p, v_n3), 3),
   1007                                          _mm_slli_epi16(_mm_and_si128(v_src_p, v_n7), 8)));
   1008                     _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
   1009                 }
   1010             }
   1011             #endif
   1012             for ( ; i < n; i++ )
   1013             {
   1014                 int t = src[i];
   1015                 ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
   1016             }
   1017         }
   1018         else
   1019         {
   1020             #if CV_NEON
   1021             for ( ; i <= n - 8; i += 8 )
   1022             {
   1023                 uint16x8_t v_src = vmovl_u8(vshr_n_u8(vld1_u8(src + i), 3));
   1024                 uint16x8_t v_dst = vorrq_u16(vorrq_u16(v_src, vshlq_n_u16(v_src, 5)), vshlq_n_u16(v_src, 10));
   1025                 vst1q_u16((ushort *)dst + i, v_dst);
   1026             }
   1027             #elif CV_SSE2
   1028             if (haveSIMD)
   1029             {
   1030                 for ( ; i <= n - 16; i += 8 )
   1031                 {
   1032                     __m128i v_src = _mm_loadu_si128((__m128i const *)(src + i));
   1033 
   1034                     __m128i v_src_p = _mm_srli_epi16(_mm_unpacklo_epi8(v_src, v_zero), 3);
   1035                     __m128i v_dst = _mm_or_si128(v_src_p,
   1036                                     _mm_or_si128(_mm_slli_epi32(v_src_p, 5),
   1037                                                  _mm_slli_epi16(v_src_p, 10)));
   1038                     _mm_storeu_si128((__m128i *)((ushort *)dst + i), v_dst);
   1039 
   1040                     v_src_p = _mm_srli_epi16(_mm_unpackhi_epi8(v_src, v_zero), 3);
   1041                     v_dst = _mm_or_si128(v_src_p,
   1042                             _mm_or_si128(_mm_slli_epi16(v_src_p, 5),
   1043                                          _mm_slli_epi16(v_src_p, 10)));
   1044                     _mm_storeu_si128((__m128i *)((ushort *)dst + i + 8), v_dst);
   1045                 }
   1046             }
   1047             #endif
   1048             for( ; i < n; i++ )
   1049             {
   1050                 int t = src[i] >> 3;
   1051                 ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10));
   1052             }
   1053         }
   1054     }
   1055     int greenBits;
   1056 
   1057     #if CV_NEON
   1058     uint8x8_t v_n7, v_n3;
   1059     #elif CV_SSE2
   1060     __m128i v_n7, v_n3, v_zero;
   1061     bool haveSIMD;
   1062     #endif
   1063 };
   1064 
   1065 
   // Drop any macro definitions of these names so the enumerators below can
   // be declared.
   #undef R2Y
   #undef G2Y
   #undef B2Y

   // Integer constants shared by the conversion functors in this file.
   enum
   {
       // Number of fractional bits carried by R2Y/G2Y/B2Y.
       yuv_shift = 14,

       // Fixed-point luma weights; they add up to 1 << yuv_shift and
       // correspond to 0.299, 0.587 and 0.114 respectively.
       R2Y = 4899,
       G2Y = 9617,
       B2Y = 1868,

       // NOTE(review): presumably the fractional-bit count for the XYZ
       // conversions; not used in this part of the file - confirm at use site.
       xyz_shift = 12,

       // NOTE(review): presumably a per-pass pixel block length; not used in
       // this part of the file - confirm at use site.
       BLOCK_SIZE = 256
   };
   1079 
   1080 
   1081 struct RGB5x52Gray
   1082 {
   1083     typedef uchar channel_type;
   1084 
   1085     RGB5x52Gray(int _greenBits) : greenBits(_greenBits)
   1086     {
   1087         #if CV_NEON
   1088         v_b2y = vdup_n_u16(B2Y);
   1089         v_g2y = vdup_n_u16(G2Y);
   1090         v_r2y = vdup_n_u16(R2Y);
   1091         v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
   1092         v_f8 = vdupq_n_u16(0xf8);
   1093         v_fc = vdupq_n_u16(0xfc);
   1094         #elif CV_SSE2
   1095         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   1096         v_b2y = _mm_set1_epi16(B2Y);
   1097         v_g2y = _mm_set1_epi16(G2Y);
   1098         v_r2y = _mm_set1_epi16(R2Y);
   1099         v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));
   1100         v_f8 = _mm_set1_epi16(0xf8);
   1101         v_fc = _mm_set1_epi16(0xfc);
   1102         #endif
   1103     }
   1104 
   1105     void operator()(const uchar* src, uchar* dst, int n) const
   1106     {
   1107         int i = 0;
   1108         if( greenBits == 6 )
   1109         {
   1110             #if CV_NEON
   1111             for ( ; i <= n - 8; i += 8)
   1112             {
   1113                 uint16x8_t v_src = vld1q_u16((ushort *)src + i);
   1114                 uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
   1115                            v_t1 = vandq_u16(vshrq_n_u16(v_src, 3), v_fc),
   1116                            v_t2 = vandq_u16(vshrq_n_u16(v_src, 8), v_f8);
   1117 
   1118                 uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
   1119                                               vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
   1120                 uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
   1121                                               vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
   1122                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
   1123                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);
   1124 
   1125                 vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
   1126             }
   1127             #elif CV_SSE2
   1128             if (haveSIMD)
   1129             {
   1130                 __m128i v_zero = _mm_setzero_si128();
   1131 
   1132                 for ( ; i <= n - 8; i += 8)
   1133                 {
   1134                     __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
   1135                     __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8),
   1136                             v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 3), v_fc),
   1137                             v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 8), v_f8);
   1138 
   1139                     __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y);
   1140                     __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y);
   1141                     __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y);
   1142                     __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y);
   1143                     __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y);
   1144                     __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y);
   1145 
   1146                     __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b),
   1147                                                    _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
   1148                     v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta),
   1149                                            _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r));
   1150 
   1151                     __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b),
   1152                                                    _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
   1153                     v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta),
   1154                                            _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r));
   1155 
   1156                     v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift);
   1157                     v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift);
   1158 
   1159                     __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1);
   1160                     _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero));
   1161                 }
   1162             }
   1163             #endif
   1164             for ( ; i < n; i++)
   1165             {
   1166                 int t = ((ushort*)src)[i];
   1167                 dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
   1168                                            ((t >> 3) & 0xfc)*G2Y +
   1169                                            ((t >> 8) & 0xf8)*R2Y, yuv_shift);
   1170             }
   1171         }
   1172         else
   1173         {
   1174             #if CV_NEON
   1175             for ( ; i <= n - 8; i += 8)
   1176             {
   1177                 uint16x8_t v_src = vld1q_u16((ushort *)src + i);
   1178                 uint16x8_t v_t0 = vandq_u16(vshlq_n_u16(v_src, 3), v_f8),
   1179                            v_t1 = vandq_u16(vshrq_n_u16(v_src, 2), v_f8),
   1180                            v_t2 = vandq_u16(vshrq_n_u16(v_src, 7), v_f8);
   1181 
   1182                 uint32x4_t v_dst0 = vmlal_u16(vmlal_u16(vmull_u16(vget_low_u16(v_t0), v_b2y),
   1183                                               vget_low_u16(v_t1), v_g2y), vget_low_u16(v_t2), v_r2y);
   1184                 uint32x4_t v_dst1 = vmlal_u16(vmlal_u16(vmull_u16(vget_high_u16(v_t0), v_b2y),
   1185                                               vget_high_u16(v_t1), v_g2y), vget_high_u16(v_t2), v_r2y);
   1186                 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_delta), yuv_shift);
   1187                 v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_delta), yuv_shift);
   1188 
   1189                 vst1_u8(dst + i, vmovn_u16(vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))));
   1190             }
   1191             #elif CV_SSE2
   1192             if (haveSIMD)
   1193             {
   1194                 __m128i v_zero = _mm_setzero_si128();
   1195 
   1196                 for ( ; i <= n - 8; i += 8)
   1197                 {
   1198                     __m128i v_src = _mm_loadu_si128((__m128i const *)((ushort *)src + i));
   1199                     __m128i v_t0 = _mm_and_si128(_mm_slli_epi16(v_src, 3), v_f8),
   1200                             v_t1 = _mm_and_si128(_mm_srli_epi16(v_src, 2), v_f8),
   1201                             v_t2 = _mm_and_si128(_mm_srli_epi16(v_src, 7), v_f8);
   1202 
   1203                     __m128i v_mullo_b = _mm_mullo_epi16(v_t0, v_b2y);
   1204                     __m128i v_mullo_g = _mm_mullo_epi16(v_t1, v_g2y);
   1205                     __m128i v_mullo_r = _mm_mullo_epi16(v_t2, v_r2y);
   1206                     __m128i v_mulhi_b = _mm_mulhi_epi16(v_t0, v_b2y);
   1207                     __m128i v_mulhi_g = _mm_mulhi_epi16(v_t1, v_g2y);
   1208                     __m128i v_mulhi_r = _mm_mulhi_epi16(v_t2, v_r2y);
   1209 
   1210                     __m128i v_dst0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b),
   1211                                                    _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
   1212                     v_dst0 = _mm_add_epi32(_mm_add_epi32(v_dst0, v_delta),
   1213                                            _mm_unpacklo_epi16(v_mullo_r, v_mulhi_r));
   1214 
   1215                     __m128i v_dst1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b),
   1216                                                    _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
   1217                     v_dst1 = _mm_add_epi32(_mm_add_epi32(v_dst1, v_delta),
   1218                                            _mm_unpackhi_epi16(v_mullo_r, v_mulhi_r));
   1219 
   1220                     v_dst0 = _mm_srli_epi32(v_dst0, yuv_shift);
   1221                     v_dst1 = _mm_srli_epi32(v_dst1, yuv_shift);
   1222 
   1223                     __m128i v_dst = _mm_packs_epi32(v_dst0, v_dst1);
   1224                     _mm_storel_epi64((__m128i *)(dst + i), _mm_packus_epi16(v_dst, v_zero));
   1225                 }
   1226             }
   1227             #endif
   1228             for ( ; i < n; i++)
   1229             {
   1230                 int t = ((ushort*)src)[i];
   1231                 dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
   1232                                            ((t >> 2) & 0xf8)*G2Y +
   1233                                            ((t >> 7) & 0xf8)*R2Y, yuv_shift);
   1234             }
   1235         }
   1236     }
   1237     int greenBits;
   1238 
   1239     #if CV_NEON
   1240     uint16x4_t v_b2y, v_g2y, v_r2y;
   1241     uint32x4_t v_delta;
   1242     uint16x8_t v_f8, v_fc;
   1243     #elif CV_SSE2
   1244     bool haveSIMD;
   1245     __m128i v_b2y, v_g2y, v_r2y;
   1246     __m128i v_delta;
   1247     __m128i v_f8, v_fc;
   1248     #endif
   1249 };
   1250 
   1251 
   1252 template<typename _Tp> struct RGB2Gray
   1253 {
   1254     typedef _Tp channel_type;
   1255 
   1256     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
   1257     {
   1258         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
   1259         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
   1260         if(blueIdx == 0)
   1261             std::swap(coeffs[0], coeffs[2]);
   1262     }
   1263 
   1264     void operator()(const _Tp* src, _Tp* dst, int n) const
   1265     {
   1266         int scn = srccn;
   1267         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
   1268         for(int i = 0; i < n; i++, src += scn)
   1269             dst[i] = saturate_cast<_Tp>(src[0]*cb + src[1]*cg + src[2]*cr);
   1270     }
   1271     int srccn;
   1272     float coeffs[3];
   1273 };
   1274 
   1275 template<> struct RGB2Gray<uchar>
   1276 {
   1277     typedef uchar channel_type;
   1278 
   1279     RGB2Gray(int _srccn, int blueIdx, const int* coeffs) : srccn(_srccn)
   1280     {
   1281         const int coeffs0[] = { R2Y, G2Y, B2Y };
   1282         if(!coeffs) coeffs = coeffs0;
   1283 
   1284         int b = 0, g = 0, r = (1 << (yuv_shift-1));
   1285         int db = coeffs[blueIdx^2], dg = coeffs[1], dr = coeffs[blueIdx];
   1286 
   1287         for( int i = 0; i < 256; i++, b += db, g += dg, r += dr )
   1288         {
   1289             tab[i] = b;
   1290             tab[i+256] = g;
   1291             tab[i+512] = r;
   1292         }
   1293     }
   1294     void operator()(const uchar* src, uchar* dst, int n) const
   1295     {
   1296         int scn = srccn;
   1297         const int* _tab = tab;
   1298         for(int i = 0; i < n; i++, src += scn)
   1299             dst[i] = (uchar)((_tab[src[0]] + _tab[src[1]+256] + _tab[src[2]+512]) >> yuv_shift);
   1300     }
   1301     int srccn;
   1302     int tab[256*3];
   1303 };
   1304 
   1305 #if CV_NEON
   1306 
   1307 template <>
   1308 struct RGB2Gray<ushort>
   1309 {
   1310     typedef ushort channel_type;
   1311 
   1312     RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
   1313         srccn(_srccn)
   1314     {
   1315         static const int coeffs0[] = { R2Y, G2Y, B2Y };
   1316         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
   1317         if( blueIdx == 0 )
   1318             std::swap(coeffs[0], coeffs[2]);
   1319 
   1320         v_cb = vdup_n_u16(coeffs[0]);
   1321         v_cg = vdup_n_u16(coeffs[1]);
   1322         v_cr = vdup_n_u16(coeffs[2]);
   1323         v_delta = vdupq_n_u32(1 << (yuv_shift - 1));
   1324     }
   1325 
   1326     void operator()(const ushort* src, ushort* dst, int n) const
   1327     {
   1328         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
   1329 
   1330         for ( ; i <= n - 8; i += 8, src += scn * 8)
   1331         {
   1332             uint16x8_t v_b, v_r, v_g;
   1333             if (scn == 3)
   1334             {
   1335                 uint16x8x3_t v_src = vld3q_u16(src);
   1336                 v_b = v_src.val[0];
   1337                 v_g = v_src.val[1];
   1338                 v_r = v_src.val[2];
   1339             }
   1340             else
   1341             {
   1342                 uint16x8x4_t v_src = vld4q_u16(src);
   1343                 v_b = v_src.val[0];
   1344                 v_g = v_src.val[1];
   1345                 v_r = v_src.val[2];
   1346             }
   1347 
   1348             uint32x4_t v_dst0_ = vmlal_u16(vmlal_u16(
   1349                                            vmull_u16(vget_low_u16(v_b), v_cb),
   1350                                                      vget_low_u16(v_g), v_cg),
   1351                                                      vget_low_u16(v_r), v_cr);
   1352             uint32x4_t v_dst1_ = vmlal_u16(vmlal_u16(
   1353                                            vmull_u16(vget_high_u16(v_b), v_cb),
   1354                                                      vget_high_u16(v_g), v_cg),
   1355                                                      vget_high_u16(v_r), v_cr);
   1356 
   1357             uint16x4_t v_dst0 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0_, v_delta), yuv_shift));
   1358             uint16x4_t v_dst1 = vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1_, v_delta), yuv_shift));
   1359 
   1360             vst1q_u16(dst + i, vcombine_u16(v_dst0, v_dst1));
   1361         }
   1362 
   1363         for ( ; i <= n - 4; i += 4, src += scn * 4)
   1364         {
   1365             uint16x4_t v_b, v_r, v_g;
   1366             if (scn == 3)
   1367             {
   1368                 uint16x4x3_t v_src = vld3_u16(src);
   1369                 v_b = v_src.val[0];
   1370                 v_g = v_src.val[1];
   1371                 v_r = v_src.val[2];
   1372             }
   1373             else
   1374             {
   1375                 uint16x4x4_t v_src = vld4_u16(src);
   1376                 v_b = v_src.val[0];
   1377                 v_g = v_src.val[1];
   1378                 v_r = v_src.val[2];
   1379             }
   1380 
   1381             uint32x4_t v_dst = vmlal_u16(vmlal_u16(
   1382                                          vmull_u16(v_b, v_cb),
   1383                                                    v_g, v_cg),
   1384                                                    v_r, v_cr);
   1385 
   1386             vst1_u16(dst + i, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_delta), yuv_shift)));
   1387         }
   1388 
   1389         for( ; i < n; i++, src += scn)
   1390             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
   1391     }
   1392 
   1393     int srccn, coeffs[3];
   1394     uint16x4_t v_cb, v_cg, v_cr;
   1395     uint32x4_t v_delta;
   1396 };
   1397 
   1398 template <>
   1399 struct RGB2Gray<float>
   1400 {
   1401     typedef float channel_type;
   1402 
   1403     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
   1404     {
   1405         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
   1406         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
   1407         if(blueIdx == 0)
   1408             std::swap(coeffs[0], coeffs[2]);
   1409 
   1410         v_cb = vdupq_n_f32(coeffs[0]);
   1411         v_cg = vdupq_n_f32(coeffs[1]);
   1412         v_cr = vdupq_n_f32(coeffs[2]);
   1413     }
   1414 
   1415     void operator()(const float * src, float * dst, int n) const
   1416     {
   1417         int scn = srccn, i = 0;
   1418         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
   1419 
   1420         if (scn == 3)
   1421         {
   1422             for ( ; i <= n - 8; i += 8, src += scn * 8)
   1423             {
   1424                 float32x4x3_t v_src = vld3q_f32(src);
   1425                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
   1426 
   1427                 v_src = vld3q_f32(src + scn * 4);
   1428                 vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
   1429             }
   1430 
   1431             for ( ; i <= n - 4; i += 4, src += scn * 4)
   1432             {
   1433                 float32x4x3_t v_src = vld3q_f32(src);
   1434                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
   1435             }
   1436         }
   1437         else
   1438         {
   1439             for ( ; i <= n - 8; i += 8, src += scn * 8)
   1440             {
   1441                 float32x4x4_t v_src = vld4q_f32(src);
   1442                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
   1443 
   1444                 v_src = vld4q_f32(src + scn * 4);
   1445                 vst1q_f32(dst + i + 4, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
   1446             }
   1447 
   1448             for ( ; i <= n - 4; i += 4, src += scn * 4)
   1449             {
   1450                 float32x4x4_t v_src = vld4q_f32(src);
   1451                 vst1q_f32(dst + i, vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_cb), v_src.val[1], v_cg), v_src.val[2], v_cr));
   1452             }
   1453         }
   1454 
   1455         for ( ; i < n; i++, src += scn)
   1456             dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
   1457     }
   1458 
   1459     int srccn;
   1460     float coeffs[3];
   1461     float32x4_t v_cb, v_cg, v_cr;
   1462 };
   1463 
   1464 #elif CV_SSE2
   1465 
   1466 #if CV_SSE4_1
   1467 
   1468 template <>
   1469 struct RGB2Gray<ushort>
   1470 {
   1471     typedef ushort channel_type;
   1472 
   1473     RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) :
   1474         srccn(_srccn)
   1475     {
   1476         static const int coeffs0[] = { R2Y, G2Y, B2Y };
   1477         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
   1478         if( blueIdx == 0 )
   1479             std::swap(coeffs[0], coeffs[2]);
   1480 
   1481         v_cb = _mm_set1_epi16((short)coeffs[0]);
   1482         v_cg = _mm_set1_epi16((short)coeffs[1]);
   1483         v_cr = _mm_set1_epi16((short)coeffs[2]);
   1484         v_delta = _mm_set1_epi32(1 << (yuv_shift - 1));
   1485 
   1486         haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
   1487     }
   1488 
   1489     // 16s x 8
   1490     void process(__m128i v_b, __m128i v_g, __m128i v_r,
   1491                  __m128i & v_gray) const
   1492     {
   1493         __m128i v_mullo_r = _mm_mullo_epi16(v_r, v_cr);
   1494         __m128i v_mullo_g = _mm_mullo_epi16(v_g, v_cg);
   1495         __m128i v_mullo_b = _mm_mullo_epi16(v_b, v_cb);
   1496         __m128i v_mulhi_r = _mm_mulhi_epu16(v_r, v_cr);
   1497         __m128i v_mulhi_g = _mm_mulhi_epu16(v_g, v_cg);
   1498         __m128i v_mulhi_b = _mm_mulhi_epu16(v_b, v_cb);
   1499 
   1500         __m128i v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_r, v_mulhi_r),
   1501                                         _mm_unpacklo_epi16(v_mullo_g, v_mulhi_g));
   1502         v_gray0 = _mm_add_epi32(_mm_unpacklo_epi16(v_mullo_b, v_mulhi_b), v_gray0);
   1503         v_gray0 = _mm_srli_epi32(_mm_add_epi32(v_gray0, v_delta), yuv_shift);
   1504 
   1505         __m128i v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_r, v_mulhi_r),
   1506                                         _mm_unpackhi_epi16(v_mullo_g, v_mulhi_g));
   1507         v_gray1 = _mm_add_epi32(_mm_unpackhi_epi16(v_mullo_b, v_mulhi_b), v_gray1);
   1508         v_gray1 = _mm_srli_epi32(_mm_add_epi32(v_gray1, v_delta), yuv_shift);
   1509 
   1510         v_gray = _mm_packus_epi32(v_gray0, v_gray1);
   1511     }
   1512 
   1513     void operator()(const ushort* src, ushort* dst, int n) const
   1514     {
   1515         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2], i = 0;
   1516 
   1517         if (scn == 3 && haveSIMD)
   1518         {
   1519             for ( ; i <= n - 16; i += 16, src += scn * 16)
   1520             {
   1521                 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
   1522                 __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
   1523                 __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
   1524                 __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
   1525                 __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
   1526                 __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
   1527 
   1528                 _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   1529 
   1530                 __m128i v_gray0;
   1531                 process(v_r0, v_g0, v_b0,
   1532                         v_gray0);
   1533 
   1534                 __m128i v_gray1;
   1535                 process(v_r1, v_g1, v_b1,
   1536                         v_gray1);
   1537 
   1538                 _mm_storeu_si128((__m128i *)(dst + i), v_gray0);
   1539                 _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
   1540             }
   1541         }
   1542         else if (scn == 4 && haveSIMD)
   1543         {
   1544             for ( ; i <= n - 16; i += 16, src += scn * 16)
   1545             {
   1546                 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
   1547                 __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
   1548                 __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
   1549                 __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
   1550                 __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
   1551                 __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
   1552                 __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48));
   1553                 __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56));
   1554 
   1555                 _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
   1556 
   1557                 __m128i v_gray0;
   1558                 process(v_r0, v_g0, v_b0,
   1559                         v_gray0);
   1560 
   1561                 __m128i v_gray1;
   1562                 process(v_r1, v_g1, v_b1,
   1563                         v_gray1);
   1564 
   1565                 _mm_storeu_si128((__m128i *)(dst + i), v_gray0);
   1566                 _mm_storeu_si128((__m128i *)(dst + i + 8), v_gray1);
   1567             }
   1568         }
   1569 
   1570         for( ; i < n; i++, src += scn)
   1571             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
   1572     }
   1573 
   1574     int srccn, coeffs[3];
   1575     __m128i v_cb, v_cg, v_cr;
   1576     __m128i v_delta;
   1577     bool haveSIMD;
   1578 };
   1579 
   1580 #endif // CV_SSE4_1
   1581 
   1582 template <>
   1583 struct RGB2Gray<float>
   1584 {
   1585     typedef float channel_type;
   1586 
   1587     RGB2Gray(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
   1588     {
   1589         static const float coeffs0[] = { 0.299f, 0.587f, 0.114f };
   1590         memcpy( coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]) );
   1591         if(blueIdx == 0)
   1592             std::swap(coeffs[0], coeffs[2]);
   1593 
   1594         v_cb = _mm_set1_ps(coeffs[0]);
   1595         v_cg = _mm_set1_ps(coeffs[1]);
   1596         v_cr = _mm_set1_ps(coeffs[2]);
   1597 
   1598         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   1599     }
   1600 
   1601     void process(__m128 v_b, __m128 v_g, __m128 v_r,
   1602                  __m128 & v_gray) const
   1603     {
   1604         v_gray = _mm_mul_ps(v_r, v_cr);
   1605         v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_g, v_cg));
   1606         v_gray = _mm_add_ps(v_gray, _mm_mul_ps(v_b, v_cb));
   1607     }
   1608 
   1609     void operator()(const float * src, float * dst, int n) const
   1610     {
   1611         int scn = srccn, i = 0;
   1612         float cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
   1613 
   1614         if (scn == 3 && haveSIMD)
   1615         {
   1616             for ( ; i <= n - 8; i += 8, src += scn * 8)
   1617             {
   1618                 __m128 v_r0 = _mm_loadu_ps(src);
   1619                 __m128 v_r1 = _mm_loadu_ps(src + 4);
   1620                 __m128 v_g0 = _mm_loadu_ps(src + 8);
   1621                 __m128 v_g1 = _mm_loadu_ps(src + 12);
   1622                 __m128 v_b0 = _mm_loadu_ps(src + 16);
   1623                 __m128 v_b1 = _mm_loadu_ps(src + 20);
   1624 
   1625                 _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   1626 
   1627                 __m128 v_gray0;
   1628                 process(v_r0, v_g0, v_b0,
   1629                         v_gray0);
   1630 
   1631                 __m128 v_gray1;
   1632                 process(v_r1, v_g1, v_b1,
   1633                         v_gray1);
   1634 
   1635                 _mm_storeu_ps(dst + i, v_gray0);
   1636                 _mm_storeu_ps(dst + i + 4, v_gray1);
   1637             }
   1638         }
   1639         else if (scn == 4 && haveSIMD)
   1640         {
   1641             for ( ; i <= n - 8; i += 8, src += scn * 8)
   1642             {
   1643                 __m128 v_r0 = _mm_loadu_ps(src);
   1644                 __m128 v_r1 = _mm_loadu_ps(src + 4);
   1645                 __m128 v_g0 = _mm_loadu_ps(src + 8);
   1646                 __m128 v_g1 = _mm_loadu_ps(src + 12);
   1647                 __m128 v_b0 = _mm_loadu_ps(src + 16);
   1648                 __m128 v_b1 = _mm_loadu_ps(src + 20);
   1649                 __m128 v_a0 = _mm_loadu_ps(src + 24);
   1650                 __m128 v_a1 = _mm_loadu_ps(src + 28);
   1651 
   1652                 _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1, v_a0, v_a1);
   1653 
   1654                 __m128 v_gray0;
   1655                 process(v_r0, v_g0, v_b0,
   1656                         v_gray0);
   1657 
   1658                 __m128 v_gray1;
   1659                 process(v_r1, v_g1, v_b1,
   1660                         v_gray1);
   1661 
   1662                 _mm_storeu_ps(dst + i, v_gray0);
   1663                 _mm_storeu_ps(dst + i + 4, v_gray1);
   1664             }
   1665         }
   1666 
   1667         for ( ; i < n; i++, src += scn)
   1668             dst[i] = src[0]*cb + src[1]*cg + src[2]*cr;
   1669     }
   1670 
   1671     int srccn;
   1672     float coeffs[3];
   1673     __m128 v_cb, v_cg, v_cr;
   1674     bool haveSIMD;
   1675 };
   1676 
   1677 #else
   1678 
   1679 template<> struct RGB2Gray<ushort>
   1680 {
   1681     typedef ushort channel_type;
   1682 
   1683     RGB2Gray(int _srccn, int blueIdx, const int* _coeffs) : srccn(_srccn)
   1684     {
   1685         static const int coeffs0[] = { R2Y, G2Y, B2Y };
   1686         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 3*sizeof(coeffs[0]));
   1687         if( blueIdx == 0 )
   1688             std::swap(coeffs[0], coeffs[2]);
   1689     }
   1690 
   1691     void operator()(const ushort* src, ushort* dst, int n) const
   1692     {
   1693         int scn = srccn, cb = coeffs[0], cg = coeffs[1], cr = coeffs[2];
   1694         for(int i = 0; i < n; i++, src += scn)
   1695             dst[i] = (ushort)CV_DESCALE((unsigned)(src[0]*cb + src[1]*cg + src[2]*cr), yuv_shift);
   1696     }
   1697     int srccn;
   1698     int coeffs[3];
   1699 };
   1700 
   1701 #endif
   1702 
   1703 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
   1704 
   1705 template<typename _Tp> struct RGB2YCrCb_f
   1706 {
   1707     typedef _Tp channel_type;
   1708 
   1709     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : srccn(_srccn), blueIdx(_blueIdx)
   1710     {
   1711         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
   1712         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
   1713         if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
   1714     }
   1715 
   1716     void operator()(const _Tp* src, _Tp* dst, int n) const
   1717     {
   1718         int scn = srccn, bidx = blueIdx;
   1719         const _Tp delta = ColorChannel<_Tp>::half();
   1720         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
   1721         n *= 3;
   1722         for(int i = 0; i < n; i += 3, src += scn)
   1723         {
   1724             _Tp Y = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
   1725             _Tp Cr = saturate_cast<_Tp>((src[bidx^2] - Y)*C3 + delta);
   1726             _Tp Cb = saturate_cast<_Tp>((src[bidx] - Y)*C4 + delta);
   1727             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
   1728         }
   1729     }
   1730     int srccn, blueIdx;
   1731     float coeffs[5];
   1732 };
   1733 
   1734 #if CV_NEON
   1735 
   1736 template <>
   1737 struct RGB2YCrCb_f<float>
   1738 {
   1739     typedef float channel_type;
   1740 
   1741     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
   1742         srccn(_srccn), blueIdx(_blueIdx)
   1743     {
   1744         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
   1745         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
   1746         if(blueIdx==0)
   1747             std::swap(coeffs[0], coeffs[2]);
   1748 
   1749         v_c0 = vdupq_n_f32(coeffs[0]);
   1750         v_c1 = vdupq_n_f32(coeffs[1]);
   1751         v_c2 = vdupq_n_f32(coeffs[2]);
   1752         v_c3 = vdupq_n_f32(coeffs[3]);
   1753         v_c4 = vdupq_n_f32(coeffs[4]);
   1754         v_delta = vdupq_n_f32(ColorChannel<float>::half());
   1755     }
   1756 
   1757     void operator()(const float * src, float * dst, int n) const
   1758     {
   1759         int scn = srccn, bidx = blueIdx, i = 0;
   1760         const float delta = ColorChannel<float>::half();
   1761         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
   1762         n *= 3;
   1763 
   1764         if (scn == 3)
   1765             for ( ; i <= n - 12; i += 12, src += 12)
   1766             {
   1767                 float32x4x3_t v_src = vld3q_f32(src), v_dst;
   1768                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
   1769                 v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
   1770                 v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
   1771 
   1772                 vst3q_f32(dst + i, v_dst);
   1773             }
   1774         else
   1775             for ( ; i <= n - 12; i += 12, src += 16)
   1776             {
   1777                 float32x4x4_t v_src = vld4q_f32(src);
   1778                 float32x4x3_t v_dst;
   1779                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
   1780                 v_dst.val[1] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx^2], v_dst.val[0]), v_c3);
   1781                 v_dst.val[2] = vmlaq_f32(v_delta, vsubq_f32(v_src.val[bidx], v_dst.val[0]), v_c4);
   1782 
   1783                 vst3q_f32(dst + i, v_dst);
   1784             }
   1785 
   1786         for ( ; i < n; i += 3, src += scn)
   1787         {
   1788             float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
   1789             float Cr = (src[bidx^2] - Y)*C3 + delta;
   1790             float Cb = (src[bidx] - Y)*C4 + delta;
   1791             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
   1792         }
   1793     }
   1794     int srccn, blueIdx;
   1795     float coeffs[5];
   1796     float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
   1797 };
   1798 
   1799 #elif CV_SSE2
   1800 
   1801 template <>
   1802 struct RGB2YCrCb_f<float>
   1803 {
   1804     typedef float channel_type;
   1805 
   1806     RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) :
   1807         srccn(_srccn), blueIdx(_blueIdx)
   1808     {
   1809         static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
   1810         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
   1811         if (blueIdx==0)
   1812             std::swap(coeffs[0], coeffs[2]);
   1813 
   1814         v_c0 = _mm_set1_ps(coeffs[0]);
   1815         v_c1 = _mm_set1_ps(coeffs[1]);
   1816         v_c2 = _mm_set1_ps(coeffs[2]);
   1817         v_c3 = _mm_set1_ps(coeffs[3]);
   1818         v_c4 = _mm_set1_ps(coeffs[4]);
   1819         v_delta = _mm_set1_ps(ColorChannel<float>::half());
   1820 
   1821         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   1822     }
   1823 
   1824     void process(__m128 v_r, __m128 v_g, __m128 v_b,
   1825                  __m128 & v_y, __m128 & v_cr, __m128 & v_cb) const
   1826     {
   1827         v_y = _mm_mul_ps(v_r, v_c0);
   1828         v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c1));
   1829         v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c2));
   1830 
   1831         v_cr = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 0 ? v_b : v_r, v_y), v_c3), v_delta);
   1832         v_cb = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(blueIdx == 2 ? v_b : v_r, v_y), v_c4), v_delta);
   1833     }
   1834 
   1835     void operator()(const float * src, float * dst, int n) const
   1836     {
   1837         int scn = srccn, bidx = blueIdx, i = 0;
   1838         const float delta = ColorChannel<float>::half();
   1839         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
   1840         n *= 3;
   1841 
   1842         if (haveSIMD)
   1843         {
   1844             for ( ; i <= n - 24; i += 24, src += 8 * scn)
   1845             {
   1846                 __m128 v_r0 = _mm_loadu_ps(src);
   1847                 __m128 v_r1 = _mm_loadu_ps(src + 4);
   1848                 __m128 v_g0 = _mm_loadu_ps(src + 8);
   1849                 __m128 v_g1 = _mm_loadu_ps(src + 12);
   1850                 __m128 v_b0 = _mm_loadu_ps(src + 16);
   1851                 __m128 v_b1 = _mm_loadu_ps(src + 20);
   1852 
   1853                 if (scn == 4)
   1854                 {
   1855                     __m128 v_a0 = _mm_loadu_ps(src + 24);
   1856                     __m128 v_a1 = _mm_loadu_ps(src + 28);
   1857                     _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1,
   1858                                         v_b0, v_b1, v_a0, v_a1);
   1859                 }
   1860                 else
   1861                     _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   1862 
   1863                 __m128 v_y0, v_cr0, v_cb0;
   1864                 process(v_r0, v_g0, v_b0,
   1865                         v_y0, v_cr0, v_cb0);
   1866 
   1867                 __m128 v_y1, v_cr1, v_cb1;
   1868                 process(v_r1, v_g1, v_b1,
   1869                         v_y1, v_cr1, v_cb1);
   1870 
   1871                 _mm_interleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
   1872 
   1873                 _mm_storeu_ps(dst + i, v_y0);
   1874                 _mm_storeu_ps(dst + i + 4, v_y1);
   1875                 _mm_storeu_ps(dst + i + 8, v_cr0);
   1876                 _mm_storeu_ps(dst + i + 12, v_cr1);
   1877                 _mm_storeu_ps(dst + i + 16, v_cb0);
   1878                 _mm_storeu_ps(dst + i + 20, v_cb1);
   1879             }
   1880         }
   1881 
   1882         for ( ; i < n; i += 3, src += scn)
   1883         {
   1884             float Y = src[0]*C0 + src[1]*C1 + src[2]*C2;
   1885             float Cr = (src[bidx^2] - Y)*C3 + delta;
   1886             float Cb = (src[bidx] - Y)*C4 + delta;
   1887             dst[i] = Y; dst[i+1] = Cr; dst[i+2] = Cb;
   1888         }
   1889     }
   1890     int srccn, blueIdx;
   1891     float coeffs[5];
   1892     __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_delta;
   1893     bool haveSIMD;
   1894 };
   1895 
   1896 #endif
   1897 
   1898 template<typename _Tp> struct RGB2YCrCb_i
   1899 {
   1900     typedef _Tp channel_type;
   1901 
   1902     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
   1903         : srccn(_srccn), blueIdx(_blueIdx)
   1904     {
   1905         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
   1906         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
   1907         if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
   1908     }
   1909     void operator()(const _Tp* src, _Tp* dst, int n) const
   1910     {
   1911         int scn = srccn, bidx = blueIdx;
   1912         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
   1913         int delta = ColorChannel<_Tp>::half()*(1 << yuv_shift);
   1914         n *= 3;
   1915         for(int i = 0; i < n; i += 3, src += scn)
   1916         {
   1917             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
   1918             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
   1919             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
   1920             dst[i] = saturate_cast<_Tp>(Y);
   1921             dst[i+1] = saturate_cast<_Tp>(Cr);
   1922             dst[i+2] = saturate_cast<_Tp>(Cb);
   1923         }
   1924     }
   1925     int srccn, blueIdx;
   1926     int coeffs[5];
   1927 };
   1928 
   1929 #if CV_NEON
   1930 
   1931 template <>
   1932 struct RGB2YCrCb_i<uchar>
   1933 {
   1934     typedef uchar channel_type;
   1935 
   1936     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
   1937         : srccn(_srccn), blueIdx(_blueIdx)
   1938     {
   1939         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
   1940         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
   1941         if (blueIdx==0)
   1942             std::swap(coeffs[0], coeffs[2]);
   1943 
   1944         v_c0 = vdup_n_s16(coeffs[0]);
   1945         v_c1 = vdup_n_s16(coeffs[1]);
   1946         v_c2 = vdup_n_s16(coeffs[2]);
   1947         v_c3 = vdupq_n_s32(coeffs[3]);
   1948         v_c4 = vdupq_n_s32(coeffs[4]);
   1949         v_delta = vdupq_n_s32(ColorChannel<uchar>::half()*(1 << yuv_shift));
   1950         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
   1951     }
   1952 
   1953     void operator()(const uchar * src, uchar * dst, int n) const
   1954     {
   1955         int scn = srccn, bidx = blueIdx, i = 0;
   1956         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
   1957         int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
   1958         n *= 3;
   1959 
   1960         for ( ; i <= n - 24; i += 24, src += scn * 8)
   1961         {
   1962             uint8x8x3_t v_dst;
   1963             int16x8x3_t v_src16;
   1964 
   1965             if (scn == 3)
   1966             {
   1967                 uint8x8x3_t v_src = vld3_u8(src);
   1968                 v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
   1969                 v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
   1970                 v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
   1971             }
   1972             else
   1973             {
   1974                 uint8x8x4_t v_src = vld4_u8(src);
   1975                 v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
   1976                 v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
   1977                 v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
   1978             }
   1979 
   1980             int16x4x3_t v_src0;
   1981             v_src0.val[0] = vget_low_s16(v_src16.val[0]);
   1982             v_src0.val[1] = vget_low_s16(v_src16.val[1]);
   1983             v_src0.val[2] = vget_low_s16(v_src16.val[2]);
   1984 
   1985             int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
   1986             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
   1987             int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y0), v_c3);
   1988             v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
   1989             int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y0), v_c4);
   1990             v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
   1991 
   1992             v_src0.val[0] = vget_high_s16(v_src16.val[0]);
   1993             v_src0.val[1] = vget_high_s16(v_src16.val[1]);
   1994             v_src0.val[2] = vget_high_s16(v_src16.val[2]);
   1995 
   1996             int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
   1997             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
   1998             int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx^2]), v_Y1), v_c3);
   1999             v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
   2000             int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(vmovl_s16(v_src0.val[bidx]), v_Y1), v_c4);
   2001             v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
   2002 
   2003             v_dst.val[0] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
   2004             v_dst.val[1] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cr0), vqmovn_s32(v_Cr1)));
   2005             v_dst.val[2] = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Cb0), vqmovn_s32(v_Cb1)));
   2006 
   2007             vst3_u8(dst + i, v_dst);
   2008         }
   2009 
   2010         for ( ; i < n; i += 3, src += scn)
   2011         {
   2012             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
   2013             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
   2014             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
   2015             dst[i] = saturate_cast<uchar>(Y);
   2016             dst[i+1] = saturate_cast<uchar>(Cr);
   2017             dst[i+2] = saturate_cast<uchar>(Cb);
   2018         }
   2019     }
   2020     int srccn, blueIdx, coeffs[5];
   2021     int16x4_t v_c0, v_c1, v_c2;
   2022     int32x4_t v_c3, v_c4, v_delta, v_delta2;
   2023 };
   2024 
   2025 template <>
   2026 struct RGB2YCrCb_i<ushort>
   2027 {
   2028     typedef ushort channel_type;
   2029 
   2030     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
   2031         : srccn(_srccn), blueIdx(_blueIdx)
   2032     {
   2033         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
   2034         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
   2035         if (blueIdx==0)
   2036             std::swap(coeffs[0], coeffs[2]);
   2037 
   2038         v_c0 = vdupq_n_s32(coeffs[0]);
   2039         v_c1 = vdupq_n_s32(coeffs[1]);
   2040         v_c2 = vdupq_n_s32(coeffs[2]);
   2041         v_c3 = vdupq_n_s32(coeffs[3]);
   2042         v_c4 = vdupq_n_s32(coeffs[4]);
   2043         v_delta = vdupq_n_s32(ColorChannel<ushort>::half()*(1 << yuv_shift));
   2044         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
   2045     }
   2046 
   2047     void operator()(const ushort * src, ushort * dst, int n) const
   2048     {
   2049         int scn = srccn, bidx = blueIdx, i = 0;
   2050         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
   2051         int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
   2052         n *= 3;
   2053 
   2054         for ( ; i <= n - 24; i += 24, src += scn * 8)
   2055         {
   2056             uint16x8x3_t v_src, v_dst;
   2057             int32x4x3_t v_src0;
   2058 
   2059             if (scn == 3)
   2060                 v_src = vld3q_u16(src);
   2061             else
   2062             {
   2063                 uint16x8x4_t v_src_ = vld4q_u16(src);
   2064                 v_src.val[0] = v_src_.val[0];
   2065                 v_src.val[1] = v_src_.val[1];
   2066                 v_src.val[2] = v_src_.val[2];
   2067             }
   2068 
   2069             v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0])));
   2070             v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1])));
   2071             v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
   2072 
   2073             int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
   2074             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta2), yuv_shift);
   2075             int32x4_t v_Cr0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y0), v_c3);
   2076             v_Cr0 = vshrq_n_s32(vaddq_s32(v_Cr0, v_delta2), yuv_shift);
   2077             int32x4_t v_Cb0 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y0), v_c4);
   2078             v_Cb0 = vshrq_n_s32(vaddq_s32(v_Cb0, v_delta2), yuv_shift);
   2079 
   2080             v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
   2081             v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
   2082             v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
   2083 
   2084             int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
   2085             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta2), yuv_shift);
   2086             int32x4_t v_Cr1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y1), v_c3);
   2087             v_Cr1 = vshrq_n_s32(vaddq_s32(v_Cr1, v_delta2), yuv_shift);
   2088             int32x4_t v_Cb1 = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y1), v_c4);
   2089             v_Cb1 = vshrq_n_s32(vaddq_s32(v_Cb1, v_delta2), yuv_shift);
   2090 
   2091             v_dst.val[0] = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
   2092             v_dst.val[1] = vcombine_u16(vqmovun_s32(v_Cr0), vqmovun_s32(v_Cr1));
   2093             v_dst.val[2] = vcombine_u16(vqmovun_s32(v_Cb0), vqmovun_s32(v_Cb1));
   2094 
   2095             vst3q_u16(dst + i, v_dst);
   2096         }
   2097 
   2098         for ( ; i <= n - 12; i += 12, src += scn * 4)
   2099         {
   2100             uint16x4x3_t v_dst;
   2101             int32x4x3_t v_src0;
   2102 
   2103             if (scn == 3)
   2104             {
   2105                 uint16x4x3_t v_src = vld3_u16(src);
   2106                 v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
   2107                 v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
   2108                 v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
   2109             }
   2110             else
   2111             {
   2112                 uint16x4x4_t v_src = vld4_u16(src);
   2113                 v_src0.val[0] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0]));
   2114                 v_src0.val[1] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1]));
   2115                 v_src0.val[2] = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
   2116             }
   2117 
   2118             int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_src0.val[0], v_c0), v_src0.val[1], v_c1), v_src0.val[2], v_c2);
   2119             v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta2), yuv_shift);
   2120             int32x4_t v_Cr = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx^2], v_Y), v_c3);
   2121             v_Cr = vshrq_n_s32(vaddq_s32(v_Cr, v_delta2), yuv_shift);
   2122             int32x4_t v_Cb = vmlaq_s32(v_delta, vsubq_s32(v_src0.val[bidx], v_Y), v_c4);
   2123             v_Cb = vshrq_n_s32(vaddq_s32(v_Cb, v_delta2), yuv_shift);
   2124 
   2125             v_dst.val[0] = vqmovun_s32(v_Y);
   2126             v_dst.val[1] = vqmovun_s32(v_Cr);
   2127             v_dst.val[2] = vqmovun_s32(v_Cb);
   2128 
   2129             vst3_u16(dst + i, v_dst);
   2130         }
   2131 
   2132         for ( ; i < n; i += 3, src += scn)
   2133         {
   2134             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
   2135             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
   2136             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
   2137             dst[i] = saturate_cast<ushort>(Y);
   2138             dst[i+1] = saturate_cast<ushort>(Cr);
   2139             dst[i+2] = saturate_cast<ushort>(Cb);
   2140         }
   2141     }
   2142     int srccn, blueIdx, coeffs[5];
   2143     int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_delta, v_delta2;
   2144 };
   2145 
   2146 #elif CV_SSE4_1
   2147 
   2148 template <>
   2149 struct RGB2YCrCb_i<uchar>
   2150 {
            // 8-bit specialization of the integer RGB -> Y/Cr/Cb functor, built when
            // CV_SSE4_1 is enabled. Input is interleaved with srccn channels per pixel,
            // output is interleaved Y, Cr, Cb (3 values per pixel). All arithmetic is
            // fixed point: products are rounded and shifted right by yuv_shift.
   2151     typedef uchar channel_type;
   2152 
            // _srccn   - number of channels per source pixel (4 is handled explicitly
            //            by the vector path; anything else is treated as 3 there).
            // _blueIdx - index (0 or 2 in the visible logic) used to pick the channels
            //            feeding Cr (src[blueIdx^2]) and Cb (src[blueIdx]).
            // _coeffs  - optional array of 5 ints {C0, C1, C2, C3, C4}; when null the
            //            built-in table below is used.
   2153     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
   2154         : srccn(_srccn), blueIdx(_blueIdx)
   2155     {
   2156         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
   2157         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
                // Swap the first and third luma weights so that C0/C1/C2 line up with
                // src[0]/src[1]/src[2] as they are used in operator().
   2158         if (blueIdx==0)
   2159             std::swap(coeffs[0], coeffs[2]);
   2160 
                // Broadcast the coefficients once so the per-row loop only multiplies.
   2161         v_c0 = _mm_set1_epi32(coeffs[0]);
   2162         v_c1 = _mm_set1_epi32(coeffs[1]);
   2163         v_c2 = _mm_set1_epi32(coeffs[2]);
   2164         v_c3 = _mm_set1_epi32(coeffs[3]);
   2165         v_c4 = _mm_set1_epi32(coeffs[4]);
                // v_delta2 is the rounding term (half of 1 << yuv_shift); v_delta is the
                // chroma offset in fixed point with the rounding term already folded in.
   2166         v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
   2167         v_delta = _mm_set1_epi32(ColorChannel<uchar>::half()*(1 << yuv_shift));
   2168         v_delta = _mm_add_epi32(v_delta, v_delta2);
   2169         v_zero = _mm_setzero_si128();
   2170 
                // Result gates the vector loop in operator(); presumably a runtime CPU
                // feature query (helper defined elsewhere).
   2171         haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
   2172     }
   2173 
   2174     // 16u x 8
            // Converts eight pixels whose channels are held as 16-bit lanes.
            // v_r, v_g, v_b are source channels 0, 1 and 2 respectively (the names do
            // not depend on blueIdx). Each half (low/high four lanes) is widened to
            // 32 bits, converted, and the two halves are packed back to 16-bit lanes
            // with signed saturation into v_y, v_cr, v_cb.
   2175     void process(__m128i v_r, __m128i v_g, __m128i v_b,
   2176                  __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const
   2177     {
                // Zero-extend the low four lanes to 32 bits.
   2178         __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero);
   2179         __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero);
   2180         __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero);
   2181 
                // Y = (ch0*C0 + ch1*C1 + ch2*C2 + round) >> yuv_shift
   2182         __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
   2183                        _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
   2184                                      _mm_mullo_epi32(v_b_p, v_c2)));
   2185         v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift);
   2186 
                // Cr uses channel blueIdx^2 and Cb uses channel blueIdx, matching the
                // scalar tail in operator(). The shift is arithmetic because the
                // difference (channel - Y) can be negative.
   2187         __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3);
   2188         __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4);
   2189         v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift);
   2190         v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift);
   2191 
                // Same computation for the high four lanes.
   2192         v_r_p = _mm_unpackhi_epi16(v_r, v_zero);
   2193         v_g_p = _mm_unpackhi_epi16(v_g, v_zero);
   2194         v_b_p = _mm_unpackhi_epi16(v_b, v_zero);
   2195 
   2196         __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
   2197                        _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
   2198                                      _mm_mullo_epi32(v_b_p, v_c2)));
   2199         v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift);
   2200 
   2201         __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3);
   2202         __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4);
   2203         v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift);
   2204         v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift);
   2205 
                // Narrow 32 -> 16 bits with signed saturation; the caller performs the
                // final unsigned 16 -> 8 bit saturation.
   2206         v_y = _mm_packs_epi32(v_y0, v_y1);
   2207         v_cr = _mm_packs_epi32(v_cr0, v_cr1);
   2208         v_cb = _mm_packs_epi32(v_cb0, v_cb1);
   2209     }
   2210 
            // Converts n pixels from src (srccn interleaved channels each) into dst
            // (3 interleaved channels each: Y, Cr, Cb).
   2211     void operator()(const uchar * src, uchar * dst, int n) const
   2212     {
   2213         int scn = srccn, bidx = blueIdx, i = 0;
   2214         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
   2215         int delta = ColorChannel<uchar>::half()*(1 << yuv_shift);
                // From here on n counts output elements (3 per pixel), and i indexes dst.
   2216         n *= 3;
   2217 
   2218         if (haveSIMD)
   2219         {
                    // 32 pixels per iteration: 96 output bytes, scn*32 input bytes.
   2220             for ( ; i <= n - 96; i += 96, src += scn * 32)
   2221             {
   2222                 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
   2223                 __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 16));
   2224                 __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 32));
   2225                 __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 48));
   2226                 __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 64));
   2227                 __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 80));
   2228 
                        // The deinterleave helpers are defined elsewhere; presumably they
                        // regroup the interleaved bytes into per-channel registers in
                        // place. For 4-channel input the fourth channel is loaded only
                        // so the helper can separate it; it is not used afterwards.
   2229                 if (scn == 4)
   2230                 {
   2231                     __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 96));
   2232                     __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 112));
   2233                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1,
   2234                                           v_b0, v_b1, v_a0, v_a1);
   2235                 }
   2236                 else
   2237                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   2238 
                        // First 16 pixels: widen bytes to 16-bit lanes (8 pixels per
                        // process() call), then saturate the results back to bytes.
   2239                 __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero;
   2240                 process(_mm_unpacklo_epi8(v_r0, v_zero),
   2241                         _mm_unpacklo_epi8(v_g0, v_zero),
   2242                         _mm_unpacklo_epi8(v_b0, v_zero),
   2243                         v_y0, v_cr0, v_cb0);
   2244 
   2245                 __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero;
   2246                 process(_mm_unpackhi_epi8(v_r0, v_zero),
   2247                         _mm_unpackhi_epi8(v_g0, v_zero),
   2248                         _mm_unpackhi_epi8(v_b0, v_zero),
   2249                         v_y1, v_cr1, v_cb1);
   2250 
   2251                 __m128i v_y_0 = _mm_packus_epi16(v_y0, v_y1);
   2252                 __m128i v_cr_0 = _mm_packus_epi16(v_cr0, v_cr1);
   2253                 __m128i v_cb_0 = _mm_packus_epi16(v_cb0, v_cb1);
   2254 
                        // Second 16 pixels, reusing the temporaries.
   2255                 process(_mm_unpacklo_epi8(v_r1, v_zero),
   2256                         _mm_unpacklo_epi8(v_g1, v_zero),
   2257                         _mm_unpacklo_epi8(v_b1, v_zero),
   2258                         v_y0, v_cr0, v_cb0);
   2259 
   2260                 process(_mm_unpackhi_epi8(v_r1, v_zero),
   2261                         _mm_unpackhi_epi8(v_g1, v_zero),
   2262                         _mm_unpackhi_epi8(v_b1, v_zero),
   2263                         v_y1, v_cr1, v_cb1);
   2264 
   2265                 __m128i v_y_1 = _mm_packus_epi16(v_y0, v_y1);
   2266                 __m128i v_cr_1 = _mm_packus_epi16(v_cr0, v_cr1);
   2267                 __m128i v_cb_1 = _mm_packus_epi16(v_cb0, v_cb1);
   2268 
                        // Helper defined elsewhere; presumably rewrites the six registers
                        // in place so that storing them in argument order yields
                        // interleaved Y, Cr, Cb triples.
   2269                 _mm_interleave_epi8(v_y_0, v_y_1, v_cr_0, v_cr_1, v_cb_0, v_cb_1);
   2270 
   2271                 _mm_storeu_si128((__m128i *)(dst + i), v_y_0);
   2272                 _mm_storeu_si128((__m128i *)(dst + i + 16), v_y_1);
   2273                 _mm_storeu_si128((__m128i *)(dst + i + 32), v_cr_0);
   2274                 _mm_storeu_si128((__m128i *)(dst + i + 48), v_cr_1);
   2275                 _mm_storeu_si128((__m128i *)(dst + i + 64), v_cb_0);
   2276                 _mm_storeu_si128((__m128i *)(dst + i + 80), v_cb_1);
   2277             }
   2278         }
   2279 
                // Scalar path for the remaining pixels (or all pixels without SIMD).
   2280         for ( ; i < n; i += 3, src += scn)
   2281         {
   2282             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
   2283             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
   2284             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
   2285             dst[i] = saturate_cast<uchar>(Y);
   2286             dst[i+1] = saturate_cast<uchar>(Cr);
   2287             dst[i+2] = saturate_cast<uchar>(Cb);
   2288         }
   2289     }
   2290 
            // Scalar configuration plus the broadcast copies used by the vector path.
   2291     int srccn, blueIdx, coeffs[5];
   2292     __m128i v_c0, v_c1, v_c2;
   2293     __m128i v_c3, v_c4, v_delta, v_delta2;
   2294     __m128i v_zero;
   2295     bool haveSIMD;
   2296 };
   2297 
   2298 template <>
   2299 struct RGB2YCrCb_i<ushort>
   2300 {
            // 16-bit specialization of the integer RGB -> Y/Cr/Cb functor, built when
            // CV_SSE4_1 is enabled. Input is interleaved with srccn channels per pixel,
            // output is interleaved Y, Cr, Cb. Arithmetic is fixed point with results
            // rounded and shifted right by yuv_shift.
   2301     typedef ushort channel_type;
   2302 
            // _srccn   - number of channels per source pixel (4 is handled explicitly
            //            by the vector path; anything else is treated as 3 there).
            // _blueIdx - index selecting the channels used for Cr (src[blueIdx^2]) and
            //            Cb (src[blueIdx]).
            // _coeffs  - optional array of 5 ints; when null the table below is used.
   2303     RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
   2304         : srccn(_srccn), blueIdx(_blueIdx)
   2305     {
   2306         static const int coeffs0[] = {R2Y, G2Y, B2Y, 11682, 9241};
   2307         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
                // Swap the first and third luma weights so that C0/C1/C2 line up with
                // src[0]/src[1]/src[2] as they are used in operator().
   2308         if (blueIdx==0)
   2309             std::swap(coeffs[0], coeffs[2]);
   2310 
   2311         v_c0 = _mm_set1_epi32(coeffs[0]);
   2312         v_c1 = _mm_set1_epi32(coeffs[1]);
   2313         v_c2 = _mm_set1_epi32(coeffs[2]);
   2314         v_c3 = _mm_set1_epi32(coeffs[3]);
   2315         v_c4 = _mm_set1_epi32(coeffs[4]);
                // v_delta2 is the rounding term; v_delta is the fixed-point chroma offset
                // with the rounding term already added.
   2316         v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
   2317         v_delta = _mm_set1_epi32(ColorChannel<ushort>::half()*(1 << yuv_shift));
   2318         v_delta = _mm_add_epi32(v_delta, v_delta2);
   2319         v_zero = _mm_setzero_si128();
   2320 
                // Result gates the vector loop in operator(); presumably a runtime CPU
                // feature query (helper defined elsewhere).
   2321         haveSIMD = checkHardwareSupport(CV_CPU_SSE4_1);
   2322     }
   2323 
   2324     // 16u x 8
            // Converts eight pixels held as 16-bit lanes. v_r, v_g, v_b are source
            // channels 0, 1 and 2 (independent of blueIdx). Each half is widened to
            // 32 bits, converted, and packed back to 16 bits with unsigned saturation.
   2325     void process(__m128i v_r, __m128i v_g, __m128i v_b,
   2326                  __m128i & v_y, __m128i & v_cr, __m128i & v_cb) const
   2327     {
                // Zero-extend the low four lanes to 32 bits.
   2328         __m128i v_r_p = _mm_unpacklo_epi16(v_r, v_zero);
   2329         __m128i v_g_p = _mm_unpacklo_epi16(v_g, v_zero);
   2330         __m128i v_b_p = _mm_unpacklo_epi16(v_b, v_zero);
   2331 
                // Y = (ch0*C0 + ch1*C1 + ch2*C2 + round) >> yuv_shift
   2332         __m128i v_y0 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
   2333                        _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
   2334                                      _mm_mullo_epi32(v_b_p, v_c2)));
   2335         v_y0 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y0), yuv_shift);
   2336 
                // Cr uses channel blueIdx^2, Cb uses channel blueIdx (same as the scalar
                // tail). Arithmetic shift because (channel - Y) can be negative.
   2337         __m128i v_cr0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y0), v_c3);
   2338         __m128i v_cb0 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y0), v_c4);
   2339         v_cr0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr0), yuv_shift);
   2340         v_cb0 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb0), yuv_shift);
   2341 
                // Same computation for the high four lanes.
   2342         v_r_p = _mm_unpackhi_epi16(v_r, v_zero);
   2343         v_g_p = _mm_unpackhi_epi16(v_g, v_zero);
   2344         v_b_p = _mm_unpackhi_epi16(v_b, v_zero);
   2345 
   2346         __m128i v_y1 = _mm_add_epi32(_mm_mullo_epi32(v_r_p, v_c0),
   2347                        _mm_add_epi32(_mm_mullo_epi32(v_g_p, v_c1),
   2348                                      _mm_mullo_epi32(v_b_p, v_c2)));
   2349         v_y1 = _mm_srli_epi32(_mm_add_epi32(v_delta2, v_y1), yuv_shift);
   2350 
   2351         __m128i v_cr1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 2 ? v_r_p : v_b_p, v_y1), v_c3);
   2352         __m128i v_cb1 = _mm_mullo_epi32(_mm_sub_epi32(blueIdx == 0 ? v_r_p : v_b_p, v_y1), v_c4);
   2353         v_cr1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cr1), yuv_shift);
   2354         v_cb1 = _mm_srai_epi32(_mm_add_epi32(v_delta, v_cb1), yuv_shift);
   2355 
                // Narrow 32 -> 16 bits with unsigned saturation (the SSE4.1 instruction
                // this specialization depends on).
   2356         v_y = _mm_packus_epi32(v_y0, v_y1);
   2357         v_cr = _mm_packus_epi32(v_cr0, v_cr1);
   2358         v_cb = _mm_packus_epi32(v_cb0, v_cb1);
   2359     }
   2360 
            // Converts n pixels from src (srccn interleaved channels each) into dst
            // (3 interleaved channels each: Y, Cr, Cb).
   2361     void operator()(const ushort * src, ushort * dst, int n) const
   2362     {
   2363         int scn = srccn, bidx = blueIdx, i = 0;
   2364         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3], C4 = coeffs[4];
   2365         int delta = ColorChannel<ushort>::half()*(1 << yuv_shift);
                // From here on n counts output elements (3 per pixel), and i indexes dst.
   2366         n *= 3;
   2367 
   2368         if (haveSIMD)
   2369         {
                    // 16 pixels per iteration: 48 output elements, scn*16 input elements.
   2370             for ( ; i <= n - 48; i += 48, src += scn * 16)
   2371             {
   2372                 __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src));
   2373                 __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + 8));
   2374                 __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + 16));
   2375                 __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + 24));
   2376                 __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + 32));
   2377                 __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + 40));
   2378 
                        // Deinterleave helpers are defined elsewhere; presumably they
                        // regroup the data into per-channel registers in place. The
                        // fourth channel is loaded only so it can be separated out.
   2379                 if (scn == 4)
   2380                 {
   2381                     __m128i v_a0 = _mm_loadu_si128((__m128i const *)(src + 48));
   2382                     __m128i v_a1 = _mm_loadu_si128((__m128i const *)(src + 56));
   2383 
   2384                     _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1,
   2385                                            v_b0, v_b1, v_a0, v_a1);
   2386                 }
   2387                 else
   2388                     _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   2389 
                        // Each process() call converts 8 pixels.
   2390                 __m128i v_y0 = v_zero, v_cr0 = v_zero, v_cb0 = v_zero;
   2391                 process(v_r0, v_g0, v_b0,
   2392                         v_y0, v_cr0, v_cb0);
   2393 
   2394                 __m128i v_y1 = v_zero, v_cr1 = v_zero, v_cb1 = v_zero;
   2395                 process(v_r1, v_g1, v_b1,
   2396                         v_y1, v_cr1, v_cb1);
   2397 
                        // Helper defined elsewhere; presumably rewrites the registers in
                        // place so that storing them in argument order yields
                        // interleaved Y, Cr, Cb triples.
   2398                 _mm_interleave_epi16(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
   2399 
   2400                 _mm_storeu_si128((__m128i *)(dst + i), v_y0);
   2401                 _mm_storeu_si128((__m128i *)(dst + i + 8), v_y1);
   2402                 _mm_storeu_si128((__m128i *)(dst + i + 16), v_cr0);
   2403                 _mm_storeu_si128((__m128i *)(dst + i + 24), v_cr1);
   2404                 _mm_storeu_si128((__m128i *)(dst + i + 32), v_cb0);
   2405                 _mm_storeu_si128((__m128i *)(dst + i + 40), v_cb1);
   2406             }
   2407         }
   2408 
                // Scalar path for the remaining pixels (or all pixels without SIMD).
   2409         for ( ; i < n; i += 3, src += scn)
   2410         {
   2411             int Y = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, yuv_shift);
   2412             int Cr = CV_DESCALE((src[bidx^2] - Y)*C3 + delta, yuv_shift);
   2413             int Cb = CV_DESCALE((src[bidx] - Y)*C4 + delta, yuv_shift);
   2414             dst[i] = saturate_cast<ushort>(Y);
   2415             dst[i+1] = saturate_cast<ushort>(Cr);
   2416             dst[i+2] = saturate_cast<ushort>(Cb);
   2417         }
   2418     }
   2419 
            // Scalar configuration plus the broadcast copies used by the vector path.
   2420     int srccn, blueIdx, coeffs[5];
   2421     __m128i v_c0, v_c1, v_c2;
   2422     __m128i v_c3, v_c4, v_delta, v_delta2;
   2423     __m128i v_zero;
   2424     bool haveSIMD;
   2425 };
   2426 
   2427 #endif // CV_SSE4_1
   2428 
   2429 template<typename _Tp> struct YCrCb2RGB_f
   2430 {
            // Generic floating-point-coefficient Y/Cr/Cb -> RGB functor.
            // Reads 3 interleaved values per pixel (Y, Cr, Cb) and writes dstcn values
            // per pixel, placing blue at index blueIdx, green at 1 and red at
            // blueIdx^2; when dstcn is 4 the fourth value is set to the channel max.
   2431     typedef _Tp channel_type;
   2432 
            // _dstcn   - number of channels per destination pixel.
            // _blueIdx - destination index of the blue channel.
            // _coeffs  - optional array of 4 floats {C0, C1, C2, C3}; when null the
            //            table below is used.
   2433     YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
   2434         : dstcn(_dstcn), blueIdx(_blueIdx)
   2435     {
   2436         static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
   2437         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
   2438     }
            // Converts n pixels from src (3 values each) to dst (dstcn values each).
   2439     void operator()(const _Tp* src, _Tp* dst, int n) const
   2440     {
   2441         int dcn = dstcn, bidx = blueIdx;
   2442         const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
   2443         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
                // n now counts source elements; i indexes src while dst advances by dcn.
   2444         n *= 3;
   2445         for(int i = 0; i < n; i += 3, dst += dcn)
   2446         {
   2447             _Tp Y = src[i];
   2448             _Tp Cr = src[i+1];
   2449             _Tp Cb = src[i+2];
   2450 
                    // Chroma is re-centred around zero by subtracting delta before
                    // being weighted and added to luma.
   2451             _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3);
   2452             _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1);
   2453             _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0);
   2454 
   2455             dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
   2456             if( dcn == 4 )
   2457                 dst[3] = alpha;
   2458         }
   2459     }
   2460     int dstcn, blueIdx;
   2461     float coeffs[4];
   2462 };
   2463 
   2464 #if CV_NEON
   2465 
   2466 template <>
   2467 struct YCrCb2RGB_f<float>
   2468 {
            // NEON specialization of the Y/Cr/Cb -> RGB functor for float channels.
            // The vector loops convert 4 pixels at a time; a scalar loop finishes the
            // remainder using the same formulas.
   2469     typedef float channel_type;
   2470 
            // _dstcn   - number of channels per destination pixel (3 selects the
            //            3-channel store loop, anything else the 4-channel one).
            // _blueIdx - destination index of the blue channel.
            // _coeffs  - optional array of 4 floats; when null the table below is used.
   2471     YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
   2472         : dstcn(_dstcn), blueIdx(_blueIdx)
   2473     {
   2474         static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
   2475         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
   2476 
                // Broadcast coefficients and constants once for the vector loops.
   2477         v_c0 = vdupq_n_f32(coeffs[0]);
   2478         v_c1 = vdupq_n_f32(coeffs[1]);
   2479         v_c2 = vdupq_n_f32(coeffs[2]);
   2480         v_c3 = vdupq_n_f32(coeffs[3]);
   2481         v_delta = vdupq_n_f32(ColorChannel<float>::half());
   2482         v_alpha = vdupq_n_f32(ColorChannel<float>::max());
   2483     }
   2484 
            // Converts n pixels from src (3 floats each) to dst (dstcn floats each).
   2485     void operator()(const float* src, float* dst, int n) const
   2486     {
   2487         int dcn = dstcn, bidx = blueIdx, i = 0;
   2488         const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
   2489         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
                // n now counts source elements; i indexes src.
   2490         n *= 3;
   2491 
                // 3-channel output: 4 pixels per iteration, 12 floats in and 12 out.
   2492         if (dcn == 3)
   2493             for ( ; i <= n - 12; i += 12, dst += 12)
   2494             {
                        // vld3q_f32 de-interleaves into Y, Cr, Cb vectors.
   2495                 float32x4x3_t v_src = vld3q_f32(src + i), v_dst;
   2496                 float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
   2497 
                        // b = Y + (Cb - delta)*C3
                        // g = (Cb - delta)*C2 + (Cr - delta)*C1 + Y
                        // r = Y + (Cr - delta)*C0
   2498                 v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
   2499                 v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
   2500                 v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
   2501 
   2502                 vst3q_f32(dst, v_dst);
   2503             }
                // 4-channel output: same math, plus a constant alpha plane; 16 floats out.
   2504         else
   2505             for ( ; i <= n - 12; i += 12, dst += 16)
   2506             {
   2507                 float32x4x3_t v_src = vld3q_f32(src + i);
   2508                 float32x4x4_t v_dst;
   2509                 float32x4_t v_Y = v_src.val[0], v_Cr = v_src.val[1], v_Cb = v_src.val[2];
   2510 
   2511                 v_dst.val[bidx] = vmlaq_f32(v_Y, vsubq_f32(v_Cb, v_delta), v_c3);
   2512                 v_dst.val[1] = vaddq_f32(vmlaq_f32(vmulq_f32(vsubq_f32(v_Cb, v_delta), v_c2), vsubq_f32(v_Cr, v_delta), v_c1), v_Y);
   2513                 v_dst.val[bidx^2] = vmlaq_f32(v_Y, vsubq_f32(v_Cr, v_delta), v_c0);
   2514                 v_dst.val[3] = v_alpha;
   2515 
   2516                 vst4q_f32(dst, v_dst);
   2517             }
   2518 
                // Scalar tail for the pixels the vector loops did not cover.
   2519         for ( ; i < n; i += 3, dst += dcn)
   2520         {
   2521             float Y = src[i], Cr = src[i+1], Cb = src[i+2];
   2522 
   2523             float b = Y + (Cb - delta)*C3;
   2524             float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
   2525             float r = Y + (Cr - delta)*C0;
   2526 
   2527             dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
   2528             if( dcn == 4 )
   2529                 dst[3] = alpha;
   2530         }
   2531     }
   2532     int dstcn, blueIdx;
   2533     float coeffs[4];
            // Broadcast copies of the coefficients and constants for the NEON loops.
   2534     float32x4_t v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
   2535 };
   2536 
   2537 #elif CV_SSE2
   2538 
   2539 template <>
   2540 struct YCrCb2RGB_f<float>
   2541 {
            // SSE2 specialization of the Y/Cr/Cb -> RGB functor for float channels.
            // The vector loop converts 8 pixels per iteration when haveSIMD is set;
            // a scalar loop handles the remainder with the same formulas.
   2542     typedef float channel_type;
   2543 
            // _dstcn   - number of channels per destination pixel (3 or 4 in the
            //            vector path).
            // _blueIdx - destination index of the blue channel.
            // _coeffs  - optional array of 4 floats; when null the table below is used.
   2544     YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
   2545         : dstcn(_dstcn), blueIdx(_blueIdx)
   2546     {
   2547         static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
   2548         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
   2549 
                // Broadcast coefficients and constants once for the vector loop.
   2550         v_c0 = _mm_set1_ps(coeffs[0]);
   2551         v_c1 = _mm_set1_ps(coeffs[1]);
   2552         v_c2 = _mm_set1_ps(coeffs[2]);
   2553         v_c3 = _mm_set1_ps(coeffs[3]);
   2554         v_delta = _mm_set1_ps(ColorChannel<float>::half());
   2555         v_alpha = _mm_set1_ps(ColorChannel<float>::max());
   2556 
                // Result gates the vector loop in operator(); presumably a runtime CPU
                // feature query (helper defined elsewhere).
   2557         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   2558     }
   2559 
            // Converts four pixels given as separate Y, Cr, Cb vectors.
            // On return v_r/v_g/v_b hold the values for destination channels 0, 1, 2
            // in that order: when blueIdx == 0 the red and blue results are swapped so
            // the caller can always store in (v_r, v_g, v_b) order.
   2560     void process(__m128 v_y, __m128 v_cr, __m128 v_cb,
   2561                  __m128 & v_r, __m128 & v_g, __m128 & v_b) const
   2562     {
                // Re-centre chroma around zero.
   2563         v_cb = _mm_sub_ps(v_cb, v_delta);
   2564         v_cr = _mm_sub_ps(v_cr, v_delta);
   2565 
   2566         v_b = _mm_mul_ps(v_cb, v_c3);
   2567         v_g = _mm_add_ps(_mm_mul_ps(v_cb, v_c2), _mm_mul_ps(v_cr, v_c1));
   2568         v_r = _mm_mul_ps(v_cr, v_c0);
   2569 
   2570         v_b = _mm_add_ps(v_b, v_y);
   2571         v_g = _mm_add_ps(v_g, v_y);
   2572         v_r = _mm_add_ps(v_r, v_y);
   2573 
   2574         if (blueIdx == 0)
   2575             std::swap(v_b, v_r);
   2576     }
   2577 
            // Converts n pixels from src (3 floats each) to dst (dstcn floats each).
   2578     void operator()(const float* src, float* dst, int n) const
   2579     {
   2580         int dcn = dstcn, bidx = blueIdx, i = 0;
   2581         const float delta = ColorChannel<float>::half(), alpha = ColorChannel<float>::max();
   2582         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
                // n now counts source elements; i indexes src.
   2583         n *= 3;
   2584 
   2585         if (haveSIMD)
   2586         {
                    // 8 pixels per iteration: 24 floats in, 8*dcn floats out.
   2587             for ( ; i <= n - 24; i += 24, dst += 8 * dcn)
   2588             {
   2589                 __m128 v_y0 = _mm_loadu_ps(src + i);
   2590                 __m128 v_y1 = _mm_loadu_ps(src + i + 4);
   2591                 __m128 v_cr0 = _mm_loadu_ps(src + i + 8);
   2592                 __m128 v_cr1 = _mm_loadu_ps(src + i + 12);
   2593                 __m128 v_cb0 = _mm_loadu_ps(src + i + 16);
   2594                 __m128 v_cb1 = _mm_loadu_ps(src + i + 20);
   2595 
                        // Helper defined elsewhere; presumably regroups the interleaved
                        // floats into per-channel registers in place.
   2596                 _mm_deinterleave_ps(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
   2597 
   2598                 __m128 v_r0, v_g0, v_b0;
   2599                 process(v_y0, v_cr0, v_cb0,
   2600                         v_r0, v_g0, v_b0);
   2601 
   2602                 __m128 v_r1, v_g1, v_b1;
   2603                 process(v_y1, v_cr1, v_cb1,
   2604                         v_r1, v_g1, v_b1);
   2605 
                        // Alpha registers are only interleaved and stored when dcn == 4.
   2606                 __m128 v_a0 = v_alpha, v_a1 = v_alpha;
   2607 
   2608                 if (dcn == 3)
   2609                     _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   2610                 else
   2611                     _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1,
   2612                                       v_b0, v_b1, v_a0, v_a1);
   2613 
   2614                 _mm_storeu_ps(dst, v_r0);
   2615                 _mm_storeu_ps(dst + 4, v_r1);
   2616                 _mm_storeu_ps(dst + 8, v_g0);
   2617                 _mm_storeu_ps(dst + 12, v_g1);
   2618                 _mm_storeu_ps(dst + 16, v_b0);
   2619                 _mm_storeu_ps(dst + 20, v_b1);
   2620 
   2621                 if (dcn == 4)
   2622                 {
   2623                     _mm_storeu_ps(dst + 24, v_a0);
   2624                     _mm_storeu_ps(dst + 28, v_a1);
   2625                 }
   2626             }
   2627         }
   2628 
                // Scalar path for the remaining pixels (or all pixels without SIMD).
   2629         for ( ; i < n; i += 3, dst += dcn)
   2630         {
   2631             float Y = src[i], Cr = src[i+1], Cb = src[i+2];
   2632 
   2633             float b = Y + (Cb - delta)*C3;
   2634             float g = Y + (Cb - delta)*C2 + (Cr - delta)*C1;
   2635             float r = Y + (Cr - delta)*C0;
   2636 
   2637             dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
   2638             if( dcn == 4 )
   2639                 dst[3] = alpha;
   2640         }
   2641     }
   2642     int dstcn, blueIdx;
   2643     float coeffs[4];
   2644 
            // Broadcast copies for the SSE loop and the flag that enables it.
   2645     __m128 v_c0, v_c1, v_c2, v_c3, v_alpha, v_delta;
   2646     bool haveSIMD;
   2647 };
   2648 
   2649 #endif
   2650 
   2651 template<typename _Tp> struct YCrCb2RGB_i
   2652 {
            // Generic fixed-point Y/Cr/Cb -> RGB functor for integer channel types.
            // Reads 3 interleaved values per pixel (Y, Cr, Cb) and writes dstcn values
            // per pixel: blue at index blueIdx, green at 1, red at blueIdx^2, and the
            // channel maximum at index 3 when dstcn is 4.
   2653     typedef _Tp channel_type;
   2654 
            // _dstcn   - number of channels per destination pixel.
            // _blueIdx - destination index of the blue channel.
            // _coeffs  - optional array of 4 ints {C0, C1, C2, C3} in fixed point
            //            (products are descaled by yuv_shift); when null the table
            //            below is used.
   2655     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
   2656         : dstcn(_dstcn), blueIdx(_blueIdx)
   2657     {
   2658         static const int coeffs0[] = {22987, -11698, -5636, 29049};
   2659         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
   2660     }
   2661 
            // Converts n pixels from src (3 values each) to dst (dstcn values each).
   2662     void operator()(const _Tp* src, _Tp* dst, int n) const
   2663     {
   2664         int dcn = dstcn, bidx = blueIdx;
   2665         const _Tp delta = ColorChannel<_Tp>::half(), alpha = ColorChannel<_Tp>::max();
   2666         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
                // n now counts source elements; i indexes src while dst advances by dcn.
   2667         n *= 3;
   2668         for(int i = 0; i < n; i += 3, dst += dcn)
   2669         {
   2670             _Tp Y = src[i];
   2671             _Tp Cr = src[i+1];
   2672             _Tp Cb = src[i+2];
   2673 
                    // Chroma is re-centred by subtracting delta; the weighted term is
                    // descaled by yuv_shift before being added to luma. Results stay
                    // in int so saturation happens only at the final store.
   2674             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
   2675             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
   2676             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
   2677 
   2678             dst[bidx] = saturate_cast<_Tp>(b);
   2679             dst[1] = saturate_cast<_Tp>(g);
   2680             dst[bidx^2] = saturate_cast<_Tp>(r);
   2681             if( dcn == 4 )
   2682                 dst[3] = alpha;
   2683         }
   2684     }
   2685     int dstcn, blueIdx;
   2686     int coeffs[4];
   2687 };
   2688 
   2689 #if CV_NEON
   2690 
   2691 template <>
   2692 struct YCrCb2RGB_i<uchar>
   2693 {
            // NEON specialization of the fixed-point Y/Cr/Cb -> RGB functor for 8-bit
            // channels. The vector loop converts 8 pixels per iteration; a scalar loop
            // finishes the remainder with the same formulas.
   2694     typedef uchar channel_type;
   2695 
            // _dstcn   - number of channels per destination pixel (3 selects the
            //            3-channel store, anything else the 4-channel store).
            // _blueIdx - destination index of the blue channel.
            // _coeffs  - optional array of 4 ints; when null the table below is used.
   2696     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
   2697         : dstcn(_dstcn), blueIdx(_blueIdx)
   2698     {
   2699         static const int coeffs0[] = {22987, -11698, -5636, 29049};
   2700         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
   2701 
                // Broadcast coefficients, the chroma offset (16-bit lanes), the rounding
                // term for the yuv_shift descale, and the constant alpha value.
   2702         v_c0 = vdupq_n_s32(coeffs[0]);
   2703         v_c1 = vdupq_n_s32(coeffs[1]);
   2704         v_c2 = vdupq_n_s32(coeffs[2]);
   2705         v_c3 = vdupq_n_s32(coeffs[3]);
   2706         v_delta = vdup_n_s16(ColorChannel<uchar>::half());
   2707         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
   2708         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
   2709     }
   2710 
            // Converts n pixels from src (3 bytes each) to dst (dstcn bytes each).
   2711     void operator()(const uchar* src, uchar* dst, int n) const
   2712     {
   2713         int dcn = dstcn, bidx = blueIdx, i = 0;
   2714         const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
   2715         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
                // n now counts source elements; i indexes src.
   2716         n *= 3;
   2717 
                // 8 pixels per iteration: 24 bytes in, dcn*8 bytes out.
   2718         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
   2719         {
                    // De-interleave into Y, Cr, Cb and widen each to signed 16-bit lanes.
   2720             uint8x8x3_t v_src = vld3_u8(src + i);
   2721             int16x8x3_t v_src16;
   2722             v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
   2723             v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
   2724             v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
   2725 
                    // Low four pixels.
   2726             int16x4_t v_Y = vget_low_s16(v_src16.val[0]),
   2727                       v_Cr = vget_low_s16(v_src16.val[1]),
   2728                       v_Cb = vget_low_s16(v_src16.val[2]);
   2729 
                    // Each channel: widen (chroma - delta) to 32 bits, multiply by the
                    // coefficient(s), add the rounding term, shift by yuv_shift, add Y.
   2730             int32x4_t v_b0 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
   2731             v_b0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
   2732             int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
   2733             v_g0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
   2734             int32x4_t v_r0 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
   2735             v_r0 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
   2736 
                    // High four pixels.
   2737             v_Y = vget_high_s16(v_src16.val[0]);
   2738             v_Cr = vget_high_s16(v_src16.val[1]);
   2739             v_Cb = vget_high_s16(v_src16.val[2]);
   2740 
   2741             int32x4_t v_b1 = vmulq_s32(v_c3, vsubl_s16(v_Cb, v_delta));
   2742             v_b1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
   2743             int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubl_s16(v_Cr, v_delta), v_c1), vsubl_s16(v_Cb, v_delta), v_c2);
   2744             v_g1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
   2745             int32x4_t v_r1 = vmulq_s32(v_c0, vsubl_s16(v_Cr, v_delta));
   2746             v_r1 = vaddw_s16(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
   2747 
                    // Narrow 32 -> 16 bits, then saturate 16 -> unsigned 8 bits.
                    // NOTE(review): vmovn_s32 narrows without saturation, whereas the
                    // scalar tail saturates from a full int; this matches only while
                    // the 32-bit results fit in int16 - confirm for custom coefficients.
   2748             uint8x8_t v_b = vqmovun_s16(vcombine_s16(vmovn_s32(v_b0), vmovn_s32(v_b1)));
   2749             uint8x8_t v_g = vqmovun_s16(vcombine_s16(vmovn_s32(v_g0), vmovn_s32(v_g1)));
   2750             uint8x8_t v_r = vqmovun_s16(vcombine_s16(vmovn_s32(v_r0), vmovn_s32(v_r1)));
   2751 
                    // Interleaving store; channel placement follows bidx as in the
                    // scalar tail.
   2752             if (dcn == 3)
   2753             {
   2754                 uint8x8x3_t v_dst;
   2755                 v_dst.val[bidx] = v_b;
   2756                 v_dst.val[1] = v_g;
   2757                 v_dst.val[bidx^2] = v_r;
   2758                 vst3_u8(dst, v_dst);
   2759             }
   2760             else
   2761             {
   2762                 uint8x8x4_t v_dst;
   2763                 v_dst.val[bidx] = v_b;
   2764                 v_dst.val[1] = v_g;
   2765                 v_dst.val[bidx^2] = v_r;
   2766                 v_dst.val[3] = v_alpha;
   2767                 vst4_u8(dst, v_dst);
   2768             }
   2769         }
   2770 
                // Scalar tail for the pixels the vector loop did not cover.
   2771         for ( ; i < n; i += 3, dst += dcn)
   2772         {
   2773             uchar Y = src[i];
   2774             uchar Cr = src[i+1];
   2775             uchar Cb = src[i+2];
   2776 
   2777             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
   2778             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
   2779             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
   2780 
   2781             dst[bidx] = saturate_cast<uchar>(b);
   2782             dst[1] = saturate_cast<uchar>(g);
   2783             dst[bidx^2] = saturate_cast<uchar>(r);
   2784             if( dcn == 4 )
   2785                 dst[3] = alpha;
   2786         }
   2787     }
   2788     int dstcn, blueIdx;
   2789     int coeffs[4];
   2790 
            // Broadcast copies of the coefficients and constants for the NEON loop.
   2791     int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2;
   2792     int16x4_t v_delta;
   2793     uint8x8_t v_alpha;
   2794 };
   2795 
   2796 template <>
   2797 struct YCrCb2RGB_i<ushort>
   2798 {
   2799     typedef ushort channel_type;
   2800 
   2801     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
   2802         : dstcn(_dstcn), blueIdx(_blueIdx)
   2803     {
   2804         static const int coeffs0[] = {22987, -11698, -5636, 29049};
   2805         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
   2806 
   2807         v_c0 = vdupq_n_s32(coeffs[0]);
   2808         v_c1 = vdupq_n_s32(coeffs[1]);
   2809         v_c2 = vdupq_n_s32(coeffs[2]);
   2810         v_c3 = vdupq_n_s32(coeffs[3]);
   2811         v_delta = vdupq_n_s32(ColorChannel<ushort>::half());
   2812         v_delta2 = vdupq_n_s32(1 << (yuv_shift - 1));
   2813         v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
   2814         v_alpha2 = vget_low_u16(v_alpha);
   2815     }
   2816 
   2817     void operator()(const ushort* src, ushort* dst, int n) const
   2818     {
                // Converts n pixels of interleaved 16-bit (Y, Cr, Cb) triples read from src
                // into dstcn-channel pixels written to dst. Blue is stored at index bidx,
                // green at index 1 and red at index bidx^2; an alpha equal to
                // ColorChannel<ushort>::max() is stored as the 4th channel (vector loops:
                // whenever dcn != 3, scalar tail: when dcn == 4).
                // Every channel is computed in fixed point as
                //   Y + ((coeff * (chroma - delta) + rounding) >> yuv_shift)
                // and then saturated to the ushort range.
   2819         int dcn = dstcn, bidx = blueIdx, i = 0;
   2820         const ushort delta = ColorChannel<ushort>::half(), alpha = ColorChannel<ushort>::max();
   2821         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
                // From here on n counts source elements (3 per pixel), not pixels.
   2822         n *= 3;
   2823 
                // Main vector loop: 8 pixels (24 source elements) per iteration.
   2824         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
   2825         {
                    // De-interleaving load: val[0] = Y, val[1] = Cr, val[2] = Cb.
   2826             uint16x8x3_t v_src = vld3q_u16(src + i);
   2827 
                    // Widen the low four pixels to 32-bit lanes so the products cannot wrap in 16 bits.
   2828             int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
   2829                       v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
   2830                       v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
   2831 
                    // b = Y + round(C3*(Cb - delta)), g = Y + round(C1*(Cr - delta) + C2*(Cb - delta)),
                    // r = Y + round(C0*(Cr - delta)); v_delta2 is the rounding term added before the shift.
   2832             int32x4_t v_b0 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
   2833             v_b0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b0, v_delta2), yuv_shift), v_Y);
   2834             int32x4_t v_g0 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
   2835             v_g0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g0, v_delta2), yuv_shift), v_Y);
   2836             int32x4_t v_r0 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
   2837             v_r0 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r0, v_delta2), yuv_shift), v_Y);
   2838 
                    // Same computation for the high four pixels (the three assignments are
                    // chained with the comma operator).
   2839             v_Y = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0]))),
   2840             v_Cr = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1]))),
   2841             v_Cb = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
   2842 
   2843             int32x4_t v_b1 = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
   2844             v_b1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b1, v_delta2), yuv_shift), v_Y);
   2845             int32x4_t v_g1 = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
   2846             v_g1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g1, v_delta2), yuv_shift), v_Y);
   2847             int32x4_t v_r1 = vmulq_s32(v_c0, vsubq_s32(v_Cr, v_delta));
   2848             v_r1 = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r1, v_delta2), yuv_shift), v_Y);
   2849 
                    // Narrow back to 16 bits with signed-to-unsigned saturation.
   2850             uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_b0), vqmovun_s32(v_b1));
   2851             uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_g0), vqmovun_s32(v_g1));
   2852             uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_r0), vqmovun_s32(v_r1));
   2853 
                    // Interleaving store; the channel order is selected by bidx.
   2854             if (dcn == 3)
   2855             {
   2856                 uint16x8x3_t v_dst;
   2857                 v_dst.val[bidx] = v_b;
   2858                 v_dst.val[1] = v_g;
   2859                 v_dst.val[bidx^2] = v_r;
   2860                 vst3q_u16(dst, v_dst);
   2861             }
   2862             else
   2863             {
   2864                 uint16x8x4_t v_dst;
   2865                 v_dst.val[bidx] = v_b;
   2866                 v_dst.val[1] = v_g;
   2867                 v_dst.val[bidx^2] = v_r;
   2868                 v_dst.val[3] = v_alpha;
   2869                 vst4q_u16(dst, v_dst);
   2870             }
   2871         }
   2872 
                // Half-width vector loop: 4 pixels (12 source elements) per iteration.
   2873         for ( ; i <= n - 12; i += 12, dst += dcn * 4)
   2874         {
   2875             uint16x4x3_t v_src = vld3_u16(src + i);
   2876 
   2877             int32x4_t v_Y = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
   2878                       v_Cr = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
   2879                       v_Cb = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
   2880 
   2881             int32x4_t v_b = vmulq_s32(v_c3, vsubq_s32(v_Cb, v_delta));
   2882             v_b = vaddq_s32(vshrq_n_s32(vaddq_s32(v_b, v_delta2), yuv_shift), v_Y);
   2883             int32x4_t v_g = vmlaq_s32(vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c1), vsubq_s32(v_Cb, v_delta), v_c2);
   2884             v_g = vaddq_s32(vshrq_n_s32(vaddq_s32(v_g, v_delta2), yuv_shift), v_Y);
   2885             int32x4_t v_r = vmulq_s32(vsubq_s32(v_Cr, v_delta), v_c0);
   2886             v_r = vaddq_s32(vshrq_n_s32(vaddq_s32(v_r, v_delta2), yuv_shift), v_Y);
   2887 
   2888             uint16x4_t v_bd = vqmovun_s32(v_b);
   2889             uint16x4_t v_gd = vqmovun_s32(v_g);
   2890             uint16x4_t v_rd = vqmovun_s32(v_r);
   2891 
   2892             if (dcn == 3)
   2893             {
   2894                 uint16x4x3_t v_dst;
   2895                 v_dst.val[bidx] = v_bd;
   2896                 v_dst.val[1] = v_gd;
   2897                 v_dst.val[bidx^2] = v_rd;
   2898                 vst3_u16(dst, v_dst);
   2899             }
   2900             else
   2901             {
   2902                 uint16x4x4_t v_dst;
   2903                 v_dst.val[bidx] = v_bd;
   2904                 v_dst.val[1] = v_gd;
   2905                 v_dst.val[bidx^2] = v_rd;
                        // v_alpha2 is the 4-lane half of v_alpha.
   2906                 v_dst.val[3] = v_alpha2;
   2907                 vst4_u16(dst, v_dst);
   2908             }
   2909         }
   2910 
                // Scalar tail: the remaining pixels, one at a time, using the same formulas.
   2911         for ( ; i < n; i += 3, dst += dcn)
   2912         {
   2913             ushort Y = src[i];
   2914             ushort Cr = src[i+1];
   2915             ushort Cb = src[i+2];
   2916 
   2917             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
   2918             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
   2919             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
   2920 
   2921             dst[bidx] = saturate_cast<ushort>(b);
   2922             dst[1] = saturate_cast<ushort>(g);
   2923             dst[bidx^2] = saturate_cast<ushort>(r);
   2924             if( dcn == 4 )
   2925                 dst[3] = alpha;
   2926         }
   2927     }
   2928     int dstcn, blueIdx;
   2929     int coeffs[4];
   2930 
   2931     int32x4_t v_c0, v_c1, v_c2, v_c3, v_delta2, v_delta;
   2932     uint16x8_t v_alpha;
   2933     uint16x4_t v_alpha2;
   2934 };
   2935 
   2936 #elif CV_SSE2
   2937 
   2938 template <>
   2939 struct YCrCb2RGB_i<uchar>
   2940 {
   2941     typedef uchar channel_type;
   2942 
   2943     YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
   2944         : dstcn(_dstcn), blueIdx(_blueIdx)
   2945     {
   2946         static const int coeffs0[] = {22987, -11698, -5636, 29049};
   2947         memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
   2948 
   2949         v_c0 = _mm_set1_epi16((short)coeffs[0]);
   2950         v_c1 = _mm_set1_epi16((short)coeffs[1]);
   2951         v_c2 = _mm_set1_epi16((short)coeffs[2]);
   2952         v_c3 = _mm_set1_epi16((short)coeffs[3]);
   2953         v_delta = _mm_set1_epi16(ColorChannel<uchar>::half());
   2954         v_delta2 = _mm_set1_epi32(1 << (yuv_shift - 1));
   2955         v_zero = _mm_setzero_si128();
   2956 
   2957         uchar alpha = ColorChannel<uchar>::max();
   2958         v_alpha = _mm_set1_epi8(*(char *)&alpha);
   2959 
   2960         useSSE = coeffs[0] <= std::numeric_limits<short>::max();
   2961         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   2962     }
   2963 
   2964     // 16s x 8
   2965     void process(__m128i v_y, __m128i v_cr, __m128i v_cb,
   2966                  __m128i & v_r, __m128i & v_g, __m128i & v_b) const
   2967     {
   2968         v_cr = _mm_sub_epi16(v_cr, v_delta);
   2969         v_cb = _mm_sub_epi16(v_cb, v_delta);
   2970 
   2971         __m128i v_y_p = _mm_unpacklo_epi16(v_y, v_zero);
   2972 
   2973         __m128i v_mullo_3 = _mm_mullo_epi16(v_cb, v_c3);
   2974         __m128i v_mullo_2 = _mm_mullo_epi16(v_cb, v_c2);
   2975         __m128i v_mullo_1 = _mm_mullo_epi16(v_cr, v_c1);
   2976         __m128i v_mullo_0 = _mm_mullo_epi16(v_cr, v_c0);
   2977 
   2978         __m128i v_mulhi_3 = _mm_mulhi_epi16(v_cb, v_c3);
   2979         __m128i v_mulhi_2 = _mm_mulhi_epi16(v_cb, v_c2);
   2980         __m128i v_mulhi_1 = _mm_mulhi_epi16(v_cr, v_c1);
   2981         __m128i v_mulhi_0 = _mm_mulhi_epi16(v_cr, v_c0);
   2982 
   2983         __m128i v_b0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift);
   2984         __m128i v_g0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_2, v_mulhi_2),
   2985                                                                   _mm_unpacklo_epi16(v_mullo_1, v_mulhi_1)), v_delta2),
   2986                                       yuv_shift);
   2987         __m128i v_r0 = _mm_srai_epi32(_mm_add_epi32(_mm_unpacklo_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift);
   2988 
   2989         v_r0 = _mm_add_epi32(v_r0, v_y_p);
   2990         v_g0 = _mm_add_epi32(v_g0, v_y_p);
   2991         v_b0 = _mm_add_epi32(v_b0, v_y_p);
   2992 
   2993         v_y_p = _mm_unpackhi_epi16(v_y, v_zero);
   2994 
   2995         __m128i v_b1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_3, v_mulhi_3), v_delta2), yuv_shift);
   2996         __m128i v_g1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_2, v_mulhi_2),
   2997                                                                   _mm_unpackhi_epi16(v_mullo_1, v_mulhi_1)), v_delta2),
   2998                                       yuv_shift);
   2999         __m128i v_r1 = _mm_srai_epi32(_mm_add_epi32(_mm_unpackhi_epi16(v_mullo_0, v_mulhi_0), v_delta2), yuv_shift);
   3000 
   3001         v_r1 = _mm_add_epi32(v_r1, v_y_p);
   3002         v_g1 = _mm_add_epi32(v_g1, v_y_p);
   3003         v_b1 = _mm_add_epi32(v_b1, v_y_p);
   3004 
   3005         v_r = _mm_packs_epi32(v_r0, v_r1);
   3006         v_g = _mm_packs_epi32(v_g0, v_g1);
   3007         v_b = _mm_packs_epi32(v_b0, v_b1);
   3008     }
   3009 
   3010     void operator()(const uchar* src, uchar* dst, int n) const
   3011     {
   3012         int dcn = dstcn, bidx = blueIdx, i = 0;
   3013         const uchar delta = ColorChannel<uchar>::half(), alpha = ColorChannel<uchar>::max();
   3014         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2], C3 = coeffs[3];
   3015         n *= 3;
   3016 
   3017         if (haveSIMD && useSSE)
   3018         {
   3019             for ( ; i <= n - 96; i += 96, dst += dcn * 32)
   3020             {
   3021                 __m128i v_y0 = _mm_loadu_si128((__m128i const *)(src + i));
   3022                 __m128i v_y1 = _mm_loadu_si128((__m128i const *)(src + i + 16));
   3023                 __m128i v_cr0 = _mm_loadu_si128((__m128i const *)(src + i + 32));
   3024                 __m128i v_cr1 = _mm_loadu_si128((__m128i const *)(src + i + 48));
   3025                 __m128i v_cb0 = _mm_loadu_si128((__m128i const *)(src + i + 64));
   3026                 __m128i v_cb1 = _mm_loadu_si128((__m128i const *)(src + i + 80));
   3027 
   3028                 _mm_deinterleave_epi8(v_y0, v_y1, v_cr0, v_cr1, v_cb0, v_cb1);
   3029 
   3030                 __m128i v_r_0 = v_zero, v_g_0 = v_zero, v_b_0 = v_zero;
   3031                 process(_mm_unpacklo_epi8(v_y0, v_zero),
   3032                         _mm_unpacklo_epi8(v_cr0, v_zero),
   3033                         _mm_unpacklo_epi8(v_cb0, v_zero),
   3034                         v_r_0, v_g_0, v_b_0);
   3035 
   3036                 __m128i v_r_1 = v_zero, v_g_1 = v_zero, v_b_1 = v_zero;
   3037                 process(_mm_unpackhi_epi8(v_y0, v_zero),
   3038                         _mm_unpackhi_epi8(v_cr0, v_zero),
   3039                         _mm_unpackhi_epi8(v_cb0, v_zero),
   3040                         v_r_1, v_g_1, v_b_1);
   3041 
   3042                 __m128i v_r0 = _mm_packus_epi16(v_r_0, v_r_1);
   3043                 __m128i v_g0 = _mm_packus_epi16(v_g_0, v_g_1);
   3044                 __m128i v_b0 = _mm_packus_epi16(v_b_0, v_b_1);
   3045 
   3046                 process(_mm_unpacklo_epi8(v_y1, v_zero),
   3047                         _mm_unpacklo_epi8(v_cr1, v_zero),
   3048                         _mm_unpacklo_epi8(v_cb1, v_zero),
   3049                         v_r_0, v_g_0, v_b_0);
   3050 
   3051                 process(_mm_unpackhi_epi8(v_y1, v_zero),
   3052                         _mm_unpackhi_epi8(v_cr1, v_zero),
   3053                         _mm_unpackhi_epi8(v_cb1, v_zero),
   3054                         v_r_1, v_g_1, v_b_1);
   3055 
   3056                 __m128i v_r1 = _mm_packus_epi16(v_r_0, v_r_1);
   3057                 __m128i v_g1 = _mm_packus_epi16(v_g_0, v_g_1);
   3058                 __m128i v_b1 = _mm_packus_epi16(v_b_0, v_b_1);
   3059 
   3060                 if (bidx == 0)
   3061                 {
   3062                     std::swap(v_r0, v_b0);
   3063                     std::swap(v_r1, v_b1);
   3064                 }
   3065 
   3066                 __m128i v_a0 = v_alpha, v_a1 = v_alpha;
   3067 
   3068                 if (dcn == 3)
   3069                     _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   3070                 else
   3071                     _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1,
   3072                                         v_b0, v_b1, v_a0, v_a1);
   3073 
   3074                 _mm_storeu_si128((__m128i *)(dst), v_r0);
   3075                 _mm_storeu_si128((__m128i *)(dst + 16), v_r1);
   3076                 _mm_storeu_si128((__m128i *)(dst + 32), v_g0);
   3077                 _mm_storeu_si128((__m128i *)(dst + 48), v_g1);
   3078                 _mm_storeu_si128((__m128i *)(dst + 64), v_b0);
   3079                 _mm_storeu_si128((__m128i *)(dst + 80), v_b1);
   3080 
   3081                 if (dcn == 4)
   3082                 {
   3083                     _mm_storeu_si128((__m128i *)(dst + 96), v_a0);
   3084                     _mm_storeu_si128((__m128i *)(dst + 112), v_a1);
   3085                 }
   3086             }
   3087         }
   3088 
   3089         for ( ; i < n; i += 3, dst += dcn)
   3090         {
   3091             uchar Y = src[i];
   3092             uchar Cr = src[i+1];
   3093             uchar Cb = src[i+2];
   3094 
   3095             int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
   3096             int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
   3097             int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
   3098 
   3099             dst[bidx] = saturate_cast<uchar>(b);
   3100             dst[1] = saturate_cast<uchar>(g);
   3101             dst[bidx^2] = saturate_cast<uchar>(r);
   3102             if( dcn == 4 )
   3103                 dst[3] = alpha;
   3104         }
   3105     }
   3106     int dstcn, blueIdx;
   3107     int coeffs[4];
   3108     bool useSSE, haveSIMD;
   3109 
   3110     __m128i v_c0, v_c1, v_c2, v_c3, v_delta2;
   3111     __m128i v_delta, v_alpha, v_zero;
   3112 };
   3113 
   3114 #endif // CV_SSE2
   3115 
   3116 ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
   3117 
   3118 static const float sRGB2XYZ_D65[] =   // row-major 3x3 matrix; default coefficients of the RGB2XYZ_f functors
   3119 {
   3120     0.412453f, 0.357580f, 0.180423f,   // weights producing X
   3121     0.212671f, 0.715160f, 0.072169f,   // weights producing Y
   3122     0.019334f, 0.119193f, 0.950227f    // weights producing Z
   3123 };
   3124 
   3125 static const float XYZ2sRGB_D65[] =   // 9 floats laid out as 3 rows of 3; presumably the XYZ -> RGB counterpart of sRGB2XYZ_D65 (users are outside this section - confirm)
   3126 {
   3127     3.240479f, -1.53715f, -0.498535f,
   3128     -0.969256f, 1.875991f, 0.041556f,
   3129     0.055648f, -0.204043f, 1.057311f
   3130 };
   3131 
   3132 template<typename _Tp> struct RGB2XYZ_f
   3133 {
   3134     typedef _Tp channel_type;
   3135 
   3136     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
   3137     {
   3138         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
   3139         if(blueIdx == 0)
   3140         {
   3141             std::swap(coeffs[0], coeffs[2]);
   3142             std::swap(coeffs[3], coeffs[5]);
   3143             std::swap(coeffs[6], coeffs[8]);
   3144         }
   3145     }
   3146     void operator()(const _Tp* src, _Tp* dst, int n) const
   3147     {
   3148         int scn = srccn;
   3149         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   3150               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   3151               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   3152 
   3153         n *= 3;
   3154         for(int i = 0; i < n; i += 3, src += scn)
   3155         {
   3156             _Tp X = saturate_cast<_Tp>(src[0]*C0 + src[1]*C1 + src[2]*C2);
   3157             _Tp Y = saturate_cast<_Tp>(src[0]*C3 + src[1]*C4 + src[2]*C5);
   3158             _Tp Z = saturate_cast<_Tp>(src[0]*C6 + src[1]*C7 + src[2]*C8);
   3159             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
   3160         }
   3161     }
   3162     int srccn;
   3163     float coeffs[9];
   3164 };
   3165 
   3166 #if CV_NEON
   3167 
   3168 template <>
   3169 struct RGB2XYZ_f<float>
   3170 {
   3171     typedef float channel_type;
   3172 
   3173     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
   3174     {
   3175         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
   3176         if(blueIdx == 0)
   3177         {
   3178             std::swap(coeffs[0], coeffs[2]);
   3179             std::swap(coeffs[3], coeffs[5]);
   3180             std::swap(coeffs[6], coeffs[8]);
   3181         }
   3182 
   3183         v_c0 = vdupq_n_f32(coeffs[0]);
   3184         v_c1 = vdupq_n_f32(coeffs[1]);
   3185         v_c2 = vdupq_n_f32(coeffs[2]);
   3186         v_c3 = vdupq_n_f32(coeffs[3]);
   3187         v_c4 = vdupq_n_f32(coeffs[4]);
   3188         v_c5 = vdupq_n_f32(coeffs[5]);
   3189         v_c6 = vdupq_n_f32(coeffs[6]);
   3190         v_c7 = vdupq_n_f32(coeffs[7]);
   3191         v_c8 = vdupq_n_f32(coeffs[8]);
   3192     }
   3193 
   3194     void operator()(const float* src, float* dst, int n) const
   3195     {
   3196         int scn = srccn, i = 0;
   3197         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   3198               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   3199               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   3200 
   3201         n *= 3;
   3202 
   3203         if (scn == 3)
   3204             for ( ; i <= n - 12; i += 12, src += 12)
   3205             {
   3206                 float32x4x3_t v_src = vld3q_f32(src), v_dst;
   3207                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
   3208                 v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
   3209                 v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
   3210                 vst3q_f32(dst + i, v_dst);
   3211             }
   3212         else
   3213             for ( ; i <= n - 12; i += 12, src += 16)
   3214             {
   3215                 float32x4x4_t v_src = vld4q_f32(src);
   3216                 float32x4x3_t v_dst;
   3217                 v_dst.val[0] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c0), v_src.val[1], v_c1), v_src.val[2], v_c2);
   3218                 v_dst.val[1] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c3), v_src.val[1], v_c4), v_src.val[2], v_c5);
   3219                 v_dst.val[2] = vmlaq_f32(vmlaq_f32(vmulq_f32(v_src.val[0], v_c6), v_src.val[1], v_c7), v_src.val[2], v_c8);
   3220                 vst3q_f32(dst + i, v_dst);
   3221             }
   3222 
   3223         for ( ; i < n; i += 3, src += scn)
   3224         {
   3225             float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
   3226             float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
   3227             float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
   3228             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
   3229         }
   3230     }
   3231 
   3232     int srccn;
   3233     float coeffs[9];
   3234     float32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
   3235 };
   3236 
   3237 #elif CV_SSE2
   3238 
   3239 template <>
   3240 struct RGB2XYZ_f<float>
   3241 {
   3242     typedef float channel_type;
   3243 
   3244     RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
   3245     {
   3246         memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
   3247         if(blueIdx == 0)
   3248         {
   3249             std::swap(coeffs[0], coeffs[2]);
   3250             std::swap(coeffs[3], coeffs[5]);
   3251             std::swap(coeffs[6], coeffs[8]);
   3252         }
   3253 
   3254         v_c0 = _mm_set1_ps(coeffs[0]);
   3255         v_c1 = _mm_set1_ps(coeffs[1]);
   3256         v_c2 = _mm_set1_ps(coeffs[2]);
   3257         v_c3 = _mm_set1_ps(coeffs[3]);
   3258         v_c4 = _mm_set1_ps(coeffs[4]);
   3259         v_c5 = _mm_set1_ps(coeffs[5]);
   3260         v_c6 = _mm_set1_ps(coeffs[6]);
   3261         v_c7 = _mm_set1_ps(coeffs[7]);
   3262         v_c8 = _mm_set1_ps(coeffs[8]);
   3263 
   3264         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   3265     }
   3266 
   3267     void process(__m128 v_r, __m128 v_g, __m128 v_b,
   3268                  __m128 & v_x, __m128 & v_y, __m128 & v_z) const
   3269     {
   3270         v_x = _mm_mul_ps(v_r, v_c0);
   3271         v_x = _mm_add_ps(v_x, _mm_mul_ps(v_g, v_c1));
   3272         v_x = _mm_add_ps(v_x, _mm_mul_ps(v_b, v_c2));
   3273 
   3274         v_y = _mm_mul_ps(v_r, v_c3);
   3275         v_y = _mm_add_ps(v_y, _mm_mul_ps(v_g, v_c4));
   3276         v_y = _mm_add_ps(v_y, _mm_mul_ps(v_b, v_c5));
   3277 
   3278         v_z = _mm_mul_ps(v_r, v_c6);
   3279         v_z = _mm_add_ps(v_z, _mm_mul_ps(v_g, v_c7));
   3280         v_z = _mm_add_ps(v_z, _mm_mul_ps(v_b, v_c8));
   3281     }
   3282 
   3283     void operator()(const float* src, float* dst, int n) const
   3284     {
   3285         int scn = srccn, i = 0;
   3286         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   3287               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   3288               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   3289 
   3290         n *= 3;
   3291 
   3292         if (haveSIMD)
   3293         {
   3294             for ( ; i <= n - 24; i += 24, src += 8 * scn)
   3295             {
   3296                 __m128 v_r0 = _mm_loadu_ps(src);
   3297                 __m128 v_r1 = _mm_loadu_ps(src + 4);
   3298                 __m128 v_g0 = _mm_loadu_ps(src + 8);
   3299                 __m128 v_g1 = _mm_loadu_ps(src + 12);
   3300                 __m128 v_b0 = _mm_loadu_ps(src + 16);
   3301                 __m128 v_b1 = _mm_loadu_ps(src + 20);
   3302 
   3303                 if (scn == 4)
   3304                 {
   3305                     __m128 v_a0 = _mm_loadu_ps(src + 24);
   3306                     __m128 v_a1 = _mm_loadu_ps(src + 28);
   3307 
   3308                     _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1,
   3309                                         v_b0, v_b1, v_a0, v_a1);
   3310                 }
   3311                 else
   3312                     _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   3313 
   3314                 __m128 v_x0, v_y0, v_z0;
   3315                 process(v_r0, v_g0, v_b0,
   3316                         v_x0, v_y0, v_z0);
   3317 
   3318                 __m128 v_x1, v_y1, v_z1;
   3319                 process(v_r1, v_g1, v_b1,
   3320                         v_x1, v_y1, v_z1);
   3321 
   3322                 _mm_interleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);
   3323 
   3324                 _mm_storeu_ps(dst + i, v_x0);
   3325                 _mm_storeu_ps(dst + i + 4, v_x1);
   3326                 _mm_storeu_ps(dst + i + 8, v_y0);
   3327                 _mm_storeu_ps(dst + i + 12, v_y1);
   3328                 _mm_storeu_ps(dst + i + 16, v_z0);
   3329                 _mm_storeu_ps(dst + i + 20, v_z1);
   3330             }
   3331         }
   3332 
   3333         for ( ; i < n; i += 3, src += scn)
   3334         {
   3335             float X = saturate_cast<float>(src[0]*C0 + src[1]*C1 + src[2]*C2);
   3336             float Y = saturate_cast<float>(src[0]*C3 + src[1]*C4 + src[2]*C5);
   3337             float Z = saturate_cast<float>(src[0]*C6 + src[1]*C7 + src[2]*C8);
   3338             dst[i] = X; dst[i+1] = Y; dst[i+2] = Z;
   3339         }
   3340     }
   3341 
   3342     int srccn;
   3343     float coeffs[9];
   3344     __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
   3345     bool haveSIMD;
   3346 };
   3347 
   3348 
   3349 #endif
   3350 
   3351 template<typename _Tp> struct RGB2XYZ_i
   3352 {
   3353     typedef _Tp channel_type;
   3354 
   3355     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
   3356     {
   3357         static const int coeffs0[] =
   3358         {
   3359             1689,    1465,    739,
   3360             871,     2929,    296,
   3361             79,      488,     3892
   3362         };
   3363         for( int i = 0; i < 9; i++ )
   3364             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
   3365         if(blueIdx == 0)
   3366         {
   3367             std::swap(coeffs[0], coeffs[2]);
   3368             std::swap(coeffs[3], coeffs[5]);
   3369             std::swap(coeffs[6], coeffs[8]);
   3370         }
   3371     }
   3372     void operator()(const _Tp* src, _Tp* dst, int n) const
   3373     {
   3374         int scn = srccn;
   3375         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   3376             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   3377             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   3378         n *= 3;
   3379 
   3380         for(int i = 0; i < n; i += 3, src += scn)
   3381         {
   3382             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
   3383             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
   3384             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
   3385             dst[i] = saturate_cast<_Tp>(X); dst[i+1] = saturate_cast<_Tp>(Y);
   3386             dst[i+2] = saturate_cast<_Tp>(Z);
   3387         }
   3388     }
   3389     int srccn;
   3390     int coeffs[9];
   3391 };
   3392 
   3393 #if CV_NEON
   3394 
   3395 template <>
   3396 struct RGB2XYZ_i<uchar>
   3397 {
   3398     typedef uchar channel_type;
   3399 
   3400     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
   3401     {
   3402         static const int coeffs0[] =
   3403         {
   3404             1689,    1465,    739,
   3405             871,     2929,    296,
   3406             79,      488,     3892
   3407         };
   3408         for( int i = 0; i < 9; i++ )
   3409             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
   3410         if(blueIdx == 0)
   3411         {
   3412             std::swap(coeffs[0], coeffs[2]);
   3413             std::swap(coeffs[3], coeffs[5]);
   3414             std::swap(coeffs[6], coeffs[8]);
   3415         }
   3416 
   3417         v_c0 = vdup_n_u16(coeffs[0]);
   3418         v_c1 = vdup_n_u16(coeffs[1]);
   3419         v_c2 = vdup_n_u16(coeffs[2]);
   3420         v_c3 = vdup_n_u16(coeffs[3]);
   3421         v_c4 = vdup_n_u16(coeffs[4]);
   3422         v_c5 = vdup_n_u16(coeffs[5]);
   3423         v_c6 = vdup_n_u16(coeffs[6]);
   3424         v_c7 = vdup_n_u16(coeffs[7]);
   3425         v_c8 = vdup_n_u16(coeffs[8]);
   3426         v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
   3427     }
   3428     void operator()(const uchar * src, uchar * dst, int n) const
   3429     {
   3430         int scn = srccn, i = 0;
   3431         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   3432             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   3433             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   3434         n *= 3;
   3435 
   3436         for ( ; i <= n - 24; i += 24, src += scn * 8)
   3437         {
   3438             uint8x8x3_t v_dst;
   3439             uint16x8x3_t v_src16;
   3440 
   3441             if (scn == 3)
   3442             {
   3443                 uint8x8x3_t v_src = vld3_u8(src);
   3444                 v_src16.val[0] = vmovl_u8(v_src.val[0]);
   3445                 v_src16.val[1] = vmovl_u8(v_src.val[1]);
   3446                 v_src16.val[2] = vmovl_u8(v_src.val[2]);
   3447             }
   3448             else
   3449             {
   3450                 uint8x8x4_t v_src = vld4_u8(src);
   3451                 v_src16.val[0] = vmovl_u8(v_src.val[0]);
   3452                 v_src16.val[1] = vmovl_u8(v_src.val[1]);
   3453                 v_src16.val[2] = vmovl_u8(v_src.val[2]);
   3454             }
   3455 
   3456             uint16x4_t v_s0 = vget_low_u16(v_src16.val[0]),
   3457                        v_s1 = vget_low_u16(v_src16.val[1]),
   3458                        v_s2 = vget_low_u16(v_src16.val[2]);
   3459 
   3460             uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
   3461             uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
   3462             uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
   3463             v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
   3464             v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
   3465             v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
   3466 
   3467             v_s0 = vget_high_u16(v_src16.val[0]),
   3468             v_s1 = vget_high_u16(v_src16.val[1]),
   3469             v_s2 = vget_high_u16(v_src16.val[2]);
   3470 
   3471             uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
   3472             uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
   3473             uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
   3474             v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
   3475             v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
   3476             v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
   3477 
   3478             v_dst.val[0] = vqmovn_u16(vcombine_u16(vmovn_u32(v_X0), vmovn_u32(v_X1)));
   3479             v_dst.val[1] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Y0), vmovn_u32(v_Y1)));
   3480             v_dst.val[2] = vqmovn_u16(vcombine_u16(vmovn_u32(v_Z0), vmovn_u32(v_Z1)));
   3481 
   3482             vst3_u8(dst + i, v_dst);
   3483         }
   3484 
   3485         for ( ; i < n; i += 3, src += scn)
   3486         {
   3487             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
   3488             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
   3489             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
   3490             dst[i] = saturate_cast<uchar>(X);
   3491             dst[i+1] = saturate_cast<uchar>(Y);
   3492             dst[i+2] = saturate_cast<uchar>(Z);
   3493         }
   3494     }
   3495 
   3496     int srccn, coeffs[9];
   3497     uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
   3498     uint32x4_t v_delta;
   3499 };
   3500 
   3501 template <>
   3502 struct RGB2XYZ_i<ushort>
   3503 {
   3504     typedef ushort channel_type;
   3505 
   3506     RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
   3507     {
   3508         static const int coeffs0[] =
   3509         {
   3510             1689,    1465,    739,
   3511             871,     2929,    296,
   3512             79,      488,     3892
   3513         };
   3514         for( int i = 0; i < 9; i++ )
   3515             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
   3516         if(blueIdx == 0)
   3517         {
   3518             std::swap(coeffs[0], coeffs[2]);
   3519             std::swap(coeffs[3], coeffs[5]);
   3520             std::swap(coeffs[6], coeffs[8]);
   3521         }
   3522 
   3523         v_c0 = vdup_n_u16(coeffs[0]);
   3524         v_c1 = vdup_n_u16(coeffs[1]);
   3525         v_c2 = vdup_n_u16(coeffs[2]);
   3526         v_c3 = vdup_n_u16(coeffs[3]);
   3527         v_c4 = vdup_n_u16(coeffs[4]);
   3528         v_c5 = vdup_n_u16(coeffs[5]);
   3529         v_c6 = vdup_n_u16(coeffs[6]);
   3530         v_c7 = vdup_n_u16(coeffs[7]);
   3531         v_c8 = vdup_n_u16(coeffs[8]);
   3532         v_delta = vdupq_n_u32(1 << (xyz_shift - 1));
   3533     }
   3534 
   3535     void operator()(const ushort * src, ushort * dst, int n) const
   3536     {
   3537         int scn = srccn, i = 0;
   3538         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   3539             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   3540             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   3541         n *= 3;
   3542 
   3543         for ( ; i <= n - 24; i += 24, src += scn * 8)
   3544         {
   3545             uint16x8x3_t v_src, v_dst;
   3546 
   3547             if (scn == 3)
   3548                 v_src = vld3q_u16(src);
   3549             else
   3550             {
   3551                 uint16x8x4_t v_src4 = vld4q_u16(src);
   3552                 v_src.val[0] = v_src4.val[0];
   3553                 v_src.val[1] = v_src4.val[1];
   3554                 v_src.val[2] = v_src4.val[2];
   3555             }
   3556 
   3557             uint16x4_t v_s0 = vget_low_u16(v_src.val[0]),
   3558                        v_s1 = vget_low_u16(v_src.val[1]),
   3559                        v_s2 = vget_low_u16(v_src.val[2]);
   3560 
   3561             uint32x4_t v_X0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
   3562             uint32x4_t v_Y0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
   3563             uint32x4_t v_Z0 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
   3564             v_X0 = vshrq_n_u32(vaddq_u32(v_X0, v_delta), xyz_shift);
   3565             v_Y0 = vshrq_n_u32(vaddq_u32(v_Y0, v_delta), xyz_shift);
   3566             v_Z0 = vshrq_n_u32(vaddq_u32(v_Z0, v_delta), xyz_shift);
   3567 
   3568             v_s0 = vget_high_u16(v_src.val[0]),
   3569             v_s1 = vget_high_u16(v_src.val[1]),
   3570             v_s2 = vget_high_u16(v_src.val[2]);
   3571 
   3572             uint32x4_t v_X1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
   3573             uint32x4_t v_Y1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
   3574             uint32x4_t v_Z1 = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
   3575             v_X1 = vshrq_n_u32(vaddq_u32(v_X1, v_delta), xyz_shift);
   3576             v_Y1 = vshrq_n_u32(vaddq_u32(v_Y1, v_delta), xyz_shift);
   3577             v_Z1 = vshrq_n_u32(vaddq_u32(v_Z1, v_delta), xyz_shift);
   3578 
   3579             v_dst.val[0] = vcombine_u16(vqmovn_u32(v_X0), vqmovn_u32(v_X1));
   3580             v_dst.val[1] = vcombine_u16(vqmovn_u32(v_Y0), vqmovn_u32(v_Y1));
   3581             v_dst.val[2] = vcombine_u16(vqmovn_u32(v_Z0), vqmovn_u32(v_Z1));
   3582 
   3583             vst3q_u16(dst + i, v_dst);
   3584         }
   3585 
   3586         for ( ; i <= n - 12; i += 12, src += scn * 4)
   3587         {
   3588             uint16x4x3_t v_dst;
   3589             uint16x4_t v_s0, v_s1, v_s2;
   3590 
   3591             if (scn == 3)
   3592             {
   3593                 uint16x4x3_t v_src = vld3_u16(src);
   3594                 v_s0 = v_src.val[0];
   3595                 v_s1 = v_src.val[1];
   3596                 v_s2 = v_src.val[2];
   3597             }
   3598             else
   3599             {
   3600                 uint16x4x4_t v_src = vld4_u16(src);
   3601                 v_s0 = v_src.val[0];
   3602                 v_s1 = v_src.val[1];
   3603                 v_s2 = v_src.val[2];
   3604             }
   3605 
   3606             uint32x4_t v_X = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
   3607             uint32x4_t v_Y = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
   3608             uint32x4_t v_Z = vmlal_u16(vmlal_u16(vmull_u16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
   3609 
   3610             v_dst.val[0] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_X, v_delta), xyz_shift));
   3611             v_dst.val[1] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Y, v_delta), xyz_shift));
   3612             v_dst.val[2] = vqmovn_u32(vshrq_n_u32(vaddq_u32(v_Z, v_delta), xyz_shift));
   3613 
   3614             vst3_u16(dst + i, v_dst);
   3615         }
   3616 
   3617         for ( ; i < n; i += 3, src += scn)
   3618         {
   3619             int X = CV_DESCALE(src[0]*C0 + src[1]*C1 + src[2]*C2, xyz_shift);
   3620             int Y = CV_DESCALE(src[0]*C3 + src[1]*C4 + src[2]*C5, xyz_shift);
   3621             int Z = CV_DESCALE(src[0]*C6 + src[1]*C7 + src[2]*C8, xyz_shift);
   3622             dst[i] = saturate_cast<ushort>(X);
   3623             dst[i+1] = saturate_cast<ushort>(Y);
   3624             dst[i+2] = saturate_cast<ushort>(Z);
   3625         }
   3626     }
   3627 
   3628     int srccn, coeffs[9];
   3629     uint16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
   3630     uint32x4_t v_delta;
   3631 };
   3632 
   3633 #endif
   3634 
   3635 template<typename _Tp> struct XYZ2RGB_f
   3636 {
   3637     typedef _Tp channel_type;
   3638 
   3639     XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
   3640     : dstcn(_dstcn), blueIdx(_blueIdx)
   3641     {
   3642         memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
   3643         if(blueIdx == 0)
   3644         {
   3645             std::swap(coeffs[0], coeffs[6]);
   3646             std::swap(coeffs[1], coeffs[7]);
   3647             std::swap(coeffs[2], coeffs[8]);
   3648         }
   3649     }
   3650 
   3651     void operator()(const _Tp* src, _Tp* dst, int n) const
   3652     {
   3653         int dcn = dstcn;
   3654         _Tp alpha = ColorChannel<_Tp>::max();
   3655         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   3656               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   3657               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   3658         n *= 3;
   3659         for(int i = 0; i < n; i += 3, dst += dcn)
   3660         {
   3661             _Tp B = saturate_cast<_Tp>(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2);
   3662             _Tp G = saturate_cast<_Tp>(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5);
   3663             _Tp R = saturate_cast<_Tp>(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8);
   3664             dst[0] = B; dst[1] = G; dst[2] = R;
   3665             if( dcn == 4 )
   3666                 dst[3] = alpha;
   3667         }
   3668     }
   3669     int dstcn, blueIdx;
   3670     float coeffs[9];
   3671 };
   3672 
   3673 #if CV_SSE2
   3674 
   3675 template <>
   3676 struct XYZ2RGB_f<float>
   3677 {
   3678     typedef float channel_type;
   3679 
   3680     XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
   3681     : dstcn(_dstcn), blueIdx(_blueIdx)
   3682     {
   3683         memcpy(coeffs, _coeffs ? _coeffs : XYZ2sRGB_D65, 9*sizeof(coeffs[0]));
   3684         if(blueIdx == 0)
   3685         {
   3686             std::swap(coeffs[0], coeffs[6]);
   3687             std::swap(coeffs[1], coeffs[7]);
   3688             std::swap(coeffs[2], coeffs[8]);
   3689         }
   3690 
   3691         v_c0 = _mm_set1_ps(coeffs[0]);
   3692         v_c1 = _mm_set1_ps(coeffs[1]);
   3693         v_c2 = _mm_set1_ps(coeffs[2]);
   3694         v_c3 = _mm_set1_ps(coeffs[3]);
   3695         v_c4 = _mm_set1_ps(coeffs[4]);
   3696         v_c5 = _mm_set1_ps(coeffs[5]);
   3697         v_c6 = _mm_set1_ps(coeffs[6]);
   3698         v_c7 = _mm_set1_ps(coeffs[7]);
   3699         v_c8 = _mm_set1_ps(coeffs[8]);
   3700 
   3701         v_alpha = _mm_set1_ps(ColorChannel<float>::max());
   3702 
   3703         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   3704     }
   3705 
   3706     void process(__m128 v_x, __m128 v_y, __m128 v_z,
   3707                  __m128 & v_r, __m128 & v_g, __m128 & v_b) const
   3708     {
   3709         v_b = _mm_mul_ps(v_x, v_c0);
   3710         v_b = _mm_add_ps(v_b, _mm_mul_ps(v_y, v_c1));
   3711         v_b = _mm_add_ps(v_b, _mm_mul_ps(v_z, v_c2));
   3712 
   3713         v_g = _mm_mul_ps(v_x, v_c3);
   3714         v_g = _mm_add_ps(v_g, _mm_mul_ps(v_y, v_c4));
   3715         v_g = _mm_add_ps(v_g, _mm_mul_ps(v_z, v_c5));
   3716 
   3717         v_r = _mm_mul_ps(v_x, v_c6);
   3718         v_r = _mm_add_ps(v_r, _mm_mul_ps(v_y, v_c7));
   3719         v_r = _mm_add_ps(v_r, _mm_mul_ps(v_z, v_c8));
   3720     }
   3721 
   3722     void operator()(const float* src, float* dst, int n) const
   3723     {
   3724         int dcn = dstcn;
   3725         float alpha = ColorChannel<float>::max();
   3726         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   3727               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   3728               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   3729         n *= 3;
   3730         int i = 0;
   3731 
   3732         if (haveSIMD)
   3733         {
   3734             for ( ; i <= n - 24; i += 24, dst += 8 * dcn)
   3735             {
   3736                 __m128 v_x0 = _mm_loadu_ps(src + i);
   3737                 __m128 v_x1 = _mm_loadu_ps(src + i + 4);
   3738                 __m128 v_y0 = _mm_loadu_ps(src + i + 8);
   3739                 __m128 v_y1 = _mm_loadu_ps(src + i + 12);
   3740                 __m128 v_z0 = _mm_loadu_ps(src + i + 16);
   3741                 __m128 v_z1 = _mm_loadu_ps(src + i + 20);
   3742 
   3743                 _mm_deinterleave_ps(v_x0, v_x1, v_y0, v_y1, v_z0, v_z1);
   3744 
   3745                 __m128 v_r0, v_g0, v_b0;
   3746                 process(v_x0, v_y0, v_z0,
   3747                         v_r0, v_g0, v_b0);
   3748 
   3749                 __m128 v_r1, v_g1, v_b1;
   3750                 process(v_x1, v_y1, v_z1,
   3751                         v_r1, v_g1, v_b1);
   3752 
   3753                 __m128 v_a0 = v_alpha, v_a1 = v_alpha;
   3754 
   3755                 if (dcn == 4)
   3756                     _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1,
   3757                                       v_r0, v_r1, v_a0, v_a1);
   3758                 else
   3759                     _mm_interleave_ps(v_b0, v_b1, v_g0, v_g1, v_r0, v_r1);
   3760 
   3761                 _mm_storeu_ps(dst, v_b0);
   3762                 _mm_storeu_ps(dst + 4, v_b1);
   3763                 _mm_storeu_ps(dst + 8, v_g0);
   3764                 _mm_storeu_ps(dst + 12, v_g1);
   3765                 _mm_storeu_ps(dst + 16, v_r0);
   3766                 _mm_storeu_ps(dst + 20, v_r1);
   3767 
   3768                 if (dcn == 4)
   3769                 {
   3770                     _mm_storeu_ps(dst + 24, v_a0);
   3771                     _mm_storeu_ps(dst + 28, v_a1);
   3772                 }
   3773             }
   3774 
   3775         }
   3776 
   3777         for( ; i < n; i += 3, dst += dcn)
   3778         {
   3779             float B = src[i]*C0 + src[i+1]*C1 + src[i+2]*C2;
   3780             float G = src[i]*C3 + src[i+1]*C4 + src[i+2]*C5;
   3781             float R = src[i]*C6 + src[i+1]*C7 + src[i+2]*C8;
   3782             dst[0] = B; dst[1] = G; dst[2] = R;
   3783             if( dcn == 4 )
   3784                 dst[3] = alpha;
   3785         }
   3786     }
   3787     int dstcn, blueIdx;
   3788     float coeffs[9];
   3789 
   3790     __m128 v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
   3791     __m128 v_alpha;
   3792     bool haveSIMD;
   3793 };
   3794 
   3795 #endif // CV_SSE2
   3796 
   3797 
   3798 template<typename _Tp> struct XYZ2RGB_i
   3799 {
   3800     typedef _Tp channel_type;
   3801 
   3802     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
   3803     : dstcn(_dstcn), blueIdx(_blueIdx)
   3804     {
   3805         static const int coeffs0[] =
   3806         {
   3807             13273,  -6296,  -2042,
   3808             -3970,   7684,    170,
   3809               228,   -836,   4331
   3810         };
   3811         for(int i = 0; i < 9; i++)
   3812             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
   3813 
   3814         if(blueIdx == 0)
   3815         {
   3816             std::swap(coeffs[0], coeffs[6]);
   3817             std::swap(coeffs[1], coeffs[7]);
   3818             std::swap(coeffs[2], coeffs[8]);
   3819         }
   3820     }
   3821     void operator()(const _Tp* src, _Tp* dst, int n) const
   3822     {
   3823         int dcn = dstcn;
   3824         _Tp alpha = ColorChannel<_Tp>::max();
   3825         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   3826             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   3827             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   3828         n *= 3;
   3829         for(int i = 0; i < n; i += 3, dst += dcn)
   3830         {
   3831             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
   3832             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
   3833             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
   3834             dst[0] = saturate_cast<_Tp>(B); dst[1] = saturate_cast<_Tp>(G);
   3835             dst[2] = saturate_cast<_Tp>(R);
   3836             if( dcn == 4 )
   3837                 dst[3] = alpha;
   3838         }
   3839     }
   3840     int dstcn, blueIdx;
   3841     int coeffs[9];
   3842 };
   3843 
   3844 #if CV_NEON
   3845 
   3846 template <>
   3847 struct XYZ2RGB_i<uchar>
   3848 {
   3849     typedef uchar channel_type;
   3850 
   3851     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
   3852     : dstcn(_dstcn), blueIdx(_blueIdx)
   3853     {
   3854         static const int coeffs0[] =
   3855         {
   3856             13273,  -6296,  -2042,
   3857             -3970,   7684,    170,
   3858               228,   -836,   4331
   3859         };
   3860         for(int i = 0; i < 9; i++)
   3861             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
   3862 
   3863         if(blueIdx == 0)
   3864         {
   3865             std::swap(coeffs[0], coeffs[6]);
   3866             std::swap(coeffs[1], coeffs[7]);
   3867             std::swap(coeffs[2], coeffs[8]);
   3868         }
   3869 
   3870         v_c0 = vdup_n_s16(coeffs[0]);
   3871         v_c1 = vdup_n_s16(coeffs[1]);
   3872         v_c2 = vdup_n_s16(coeffs[2]);
   3873         v_c3 = vdup_n_s16(coeffs[3]);
   3874         v_c4 = vdup_n_s16(coeffs[4]);
   3875         v_c5 = vdup_n_s16(coeffs[5]);
   3876         v_c6 = vdup_n_s16(coeffs[6]);
   3877         v_c7 = vdup_n_s16(coeffs[7]);
   3878         v_c8 = vdup_n_s16(coeffs[8]);
   3879         v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
   3880         v_alpha = vmovn_u16(vdupq_n_u16(ColorChannel<uchar>::max()));
   3881     }
   3882 
   3883     void operator()(const uchar* src, uchar* dst, int n) const
   3884     {
   3885         int dcn = dstcn, i = 0;
   3886         uchar alpha = ColorChannel<uchar>::max();
   3887         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   3888             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   3889             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   3890         n *= 3;
   3891 
   3892         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
   3893         {
   3894             uint8x8x3_t v_src = vld3_u8(src + i);
   3895             int16x8x3_t v_src16;
   3896             v_src16.val[0] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[0]));
   3897             v_src16.val[1] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[1]));
   3898             v_src16.val[2] = vreinterpretq_s16_u16(vmovl_u8(v_src.val[2]));
   3899 
   3900             int16x4_t v_s0 = vget_low_s16(v_src16.val[0]),
   3901                        v_s1 = vget_low_s16(v_src16.val[1]),
   3902                        v_s2 = vget_low_s16(v_src16.val[2]);
   3903 
   3904             int32x4_t v_X0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
   3905             int32x4_t v_Y0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
   3906             int32x4_t v_Z0 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
   3907             v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
   3908             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
   3909             v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
   3910 
   3911             v_s0 = vget_high_s16(v_src16.val[0]),
   3912             v_s1 = vget_high_s16(v_src16.val[1]),
   3913             v_s2 = vget_high_s16(v_src16.val[2]);
   3914 
   3915             int32x4_t v_X1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
   3916             int32x4_t v_Y1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
   3917             int32x4_t v_Z1 = vmlal_s16(vmlal_s16(vmull_s16(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
   3918             v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
   3919             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
   3920             v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
   3921 
   3922             uint8x8_t v_b = vqmovun_s16(vcombine_s16(vqmovn_s32(v_X0), vqmovn_s32(v_X1)));
   3923             uint8x8_t v_g = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Y0), vqmovn_s32(v_Y1)));
   3924             uint8x8_t v_r = vqmovun_s16(vcombine_s16(vqmovn_s32(v_Z0), vqmovn_s32(v_Z1)));
   3925 
   3926             if (dcn == 3)
   3927             {
   3928                 uint8x8x3_t v_dst;
   3929                 v_dst.val[0] = v_b;
   3930                 v_dst.val[1] = v_g;
   3931                 v_dst.val[2] = v_r;
   3932                 vst3_u8(dst, v_dst);
   3933             }
   3934             else
   3935             {
   3936                 uint8x8x4_t v_dst;
   3937                 v_dst.val[0] = v_b;
   3938                 v_dst.val[1] = v_g;
   3939                 v_dst.val[2] = v_r;
   3940                 v_dst.val[3] = v_alpha;
   3941                 vst4_u8(dst, v_dst);
   3942             }
   3943         }
   3944 
   3945         for ( ; i < n; i += 3, dst += dcn)
   3946         {
   3947             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
   3948             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
   3949             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
   3950             dst[0] = saturate_cast<uchar>(B); dst[1] = saturate_cast<uchar>(G);
   3951             dst[2] = saturate_cast<uchar>(R);
   3952             if( dcn == 4 )
   3953                 dst[3] = alpha;
   3954         }
   3955     }
   3956     int dstcn, blueIdx;
   3957     int coeffs[9];
   3958 
   3959     int16x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8;
   3960     uint8x8_t v_alpha;
   3961     int32x4_t v_delta;
   3962 };
   3963 
   3964 template <>
   3965 struct XYZ2RGB_i<ushort>
   3966 {
   3967     typedef ushort channel_type;
   3968 
   3969     XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
   3970     : dstcn(_dstcn), blueIdx(_blueIdx)
   3971     {
   3972         static const int coeffs0[] =
   3973         {
   3974             13273,  -6296,  -2042,
   3975             -3970,   7684,    170,
   3976               228,   -836,   4331
   3977         };
   3978         for(int i = 0; i < 9; i++)
   3979             coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
   3980 
   3981         if(blueIdx == 0)
   3982         {
   3983             std::swap(coeffs[0], coeffs[6]);
   3984             std::swap(coeffs[1], coeffs[7]);
   3985             std::swap(coeffs[2], coeffs[8]);
   3986         }
   3987 
   3988         v_c0 = vdupq_n_s32(coeffs[0]);
   3989         v_c1 = vdupq_n_s32(coeffs[1]);
   3990         v_c2 = vdupq_n_s32(coeffs[2]);
   3991         v_c3 = vdupq_n_s32(coeffs[3]);
   3992         v_c4 = vdupq_n_s32(coeffs[4]);
   3993         v_c5 = vdupq_n_s32(coeffs[5]);
   3994         v_c6 = vdupq_n_s32(coeffs[6]);
   3995         v_c7 = vdupq_n_s32(coeffs[7]);
   3996         v_c8 = vdupq_n_s32(coeffs[8]);
   3997         v_delta = vdupq_n_s32(1 << (xyz_shift - 1));
   3998         v_alpha = vdupq_n_u16(ColorChannel<ushort>::max());
   3999         v_alpha2 = vget_low_u16(v_alpha);
   4000     }
   4001 
   4002     void operator()(const ushort* src, ushort* dst, int n) const
   4003     {
   4004         int dcn = dstcn, i = 0;
   4005         ushort alpha = ColorChannel<ushort>::max();
   4006         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   4007             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   4008             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   4009         n *= 3;
   4010 
   4011         for ( ; i <= n - 24; i += 24, dst += dcn * 8)
   4012         {
   4013             uint16x8x3_t v_src = vld3q_u16(src + i);
   4014             int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[0]))),
   4015                       v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[1]))),
   4016                       v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_src.val[2])));
   4017 
   4018             int32x4_t v_X0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
   4019             int32x4_t v_Y0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
   4020             int32x4_t v_Z0 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
   4021             v_X0 = vshrq_n_s32(vaddq_s32(v_X0, v_delta), xyz_shift);
   4022             v_Y0 = vshrq_n_s32(vaddq_s32(v_Y0, v_delta), xyz_shift);
   4023             v_Z0 = vshrq_n_s32(vaddq_s32(v_Z0, v_delta), xyz_shift);
   4024 
   4025             v_s0 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[0])));
   4026             v_s1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[1])));
   4027             v_s2 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(v_src.val[2])));
   4028 
   4029             int32x4_t v_X1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
   4030             int32x4_t v_Y1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
   4031             int32x4_t v_Z1 = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
   4032             v_X1 = vshrq_n_s32(vaddq_s32(v_X1, v_delta), xyz_shift);
   4033             v_Y1 = vshrq_n_s32(vaddq_s32(v_Y1, v_delta), xyz_shift);
   4034             v_Z1 = vshrq_n_s32(vaddq_s32(v_Z1, v_delta), xyz_shift);
   4035 
   4036             uint16x8_t v_b = vcombine_u16(vqmovun_s32(v_X0), vqmovun_s32(v_X1));
   4037             uint16x8_t v_g = vcombine_u16(vqmovun_s32(v_Y0), vqmovun_s32(v_Y1));
   4038             uint16x8_t v_r = vcombine_u16(vqmovun_s32(v_Z0), vqmovun_s32(v_Z1));
   4039 
   4040             if (dcn == 3)
   4041             {
   4042                 uint16x8x3_t v_dst;
   4043                 v_dst.val[0] = v_b;
   4044                 v_dst.val[1] = v_g;
   4045                 v_dst.val[2] = v_r;
   4046                 vst3q_u16(dst, v_dst);
   4047             }
   4048             else
   4049             {
   4050                 uint16x8x4_t v_dst;
   4051                 v_dst.val[0] = v_b;
   4052                 v_dst.val[1] = v_g;
   4053                 v_dst.val[2] = v_r;
   4054                 v_dst.val[3] = v_alpha;
   4055                 vst4q_u16(dst, v_dst);
   4056             }
   4057         }
   4058 
   4059         for ( ; i <= n - 12; i += 12, dst += dcn * 4)
   4060         {
   4061             uint16x4x3_t v_src = vld3_u16(src + i);
   4062             int32x4_t v_s0 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[0])),
   4063                       v_s1 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[1])),
   4064                       v_s2 = vreinterpretq_s32_u32(vmovl_u16(v_src.val[2]));
   4065 
   4066             int32x4_t v_X = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c0), v_s1, v_c1), v_s2, v_c2);
   4067             int32x4_t v_Y = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c3), v_s1, v_c4), v_s2, v_c5);
   4068             int32x4_t v_Z = vmlaq_s32(vmlaq_s32(vmulq_s32(v_s0, v_c6), v_s1, v_c7), v_s2, v_c8);
   4069             v_X = vshrq_n_s32(vaddq_s32(v_X, v_delta), xyz_shift);
   4070             v_Y = vshrq_n_s32(vaddq_s32(v_Y, v_delta), xyz_shift);
   4071             v_Z = vshrq_n_s32(vaddq_s32(v_Z, v_delta), xyz_shift);
   4072 
   4073             uint16x4_t v_b = vqmovun_s32(v_X);
   4074             uint16x4_t v_g = vqmovun_s32(v_Y);
   4075             uint16x4_t v_r = vqmovun_s32(v_Z);
   4076 
   4077             if (dcn == 3)
   4078             {
   4079                 uint16x4x3_t v_dst;
   4080                 v_dst.val[0] = v_b;
   4081                 v_dst.val[1] = v_g;
   4082                 v_dst.val[2] = v_r;
   4083                 vst3_u16(dst, v_dst);
   4084             }
   4085             else
   4086             {
   4087                 uint16x4x4_t v_dst;
   4088                 v_dst.val[0] = v_b;
   4089                 v_dst.val[1] = v_g;
   4090                 v_dst.val[2] = v_r;
   4091                 v_dst.val[3] = v_alpha2;
   4092                 vst4_u16(dst, v_dst);
   4093             }
   4094         }
   4095 
   4096         for ( ; i < n; i += 3, dst += dcn)
   4097         {
   4098             int B = CV_DESCALE(src[i]*C0 + src[i+1]*C1 + src[i+2]*C2, xyz_shift);
   4099             int G = CV_DESCALE(src[i]*C3 + src[i+1]*C4 + src[i+2]*C5, xyz_shift);
   4100             int R = CV_DESCALE(src[i]*C6 + src[i+1]*C7 + src[i+2]*C8, xyz_shift);
   4101             dst[0] = saturate_cast<ushort>(B); dst[1] = saturate_cast<ushort>(G);
   4102             dst[2] = saturate_cast<ushort>(R);
   4103             if( dcn == 4 )
   4104                 dst[3] = alpha;
   4105         }
   4106     }
   4107     int dstcn, blueIdx;
   4108     int coeffs[9];
   4109 
   4110     int32x4_t v_c0, v_c1, v_c2, v_c3, v_c4, v_c5, v_c6, v_c7, v_c8, v_delta;
   4111     uint16x4_t v_alpha2;
   4112     uint16x8_t v_alpha;
   4113 };
   4114 
   4115 #endif
   4116 
   4117 ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
   4118 
   4119 
   4120 struct RGB2HSV_b
   4121 {
   4122     typedef uchar channel_type;
   4123 
   4124     RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)
   4125     : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange)
   4126     {
   4127         CV_Assert( hrange == 180 || hrange == 256 );
   4128     }
   4129 
   4130     void operator()(const uchar* src, uchar* dst, int n) const
   4131     {
   4132         int i, bidx = blueIdx, scn = srccn;
   4133         const int hsv_shift = 12;
   4134 
   4135         static int sdiv_table[256];
   4136         static int hdiv_table180[256];
   4137         static int hdiv_table256[256];
   4138         static volatile bool initialized = false;
   4139 
   4140         int hr = hrange;
   4141         const int* hdiv_table = hr == 180 ? hdiv_table180 : hdiv_table256;
   4142         n *= 3;
   4143 
   4144         if( !initialized )
   4145         {
   4146             sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
   4147             for( i = 1; i < 256; i++ )
   4148             {
   4149                 sdiv_table[i] = saturate_cast<int>((255 << hsv_shift)/(1.*i));
   4150                 hdiv_table180[i] = saturate_cast<int>((180 << hsv_shift)/(6.*i));
   4151                 hdiv_table256[i] = saturate_cast<int>((256 << hsv_shift)/(6.*i));
   4152             }
   4153             initialized = true;
   4154         }
   4155 
   4156         for( i = 0; i < n; i += 3, src += scn )
   4157         {
   4158             int b = src[bidx], g = src[1], r = src[bidx^2];
   4159             int h, s, v = b;
   4160             int vmin = b, diff;
   4161             int vr, vg;
   4162 
   4163             CV_CALC_MAX_8U( v, g );
   4164             CV_CALC_MAX_8U( v, r );
   4165             CV_CALC_MIN_8U( vmin, g );
   4166             CV_CALC_MIN_8U( vmin, r );
   4167 
   4168             diff = v - vmin;
   4169             vr = v == r ? -1 : 0;
   4170             vg = v == g ? -1 : 0;
   4171 
   4172             s = (diff * sdiv_table[v] + (1 << (hsv_shift-1))) >> hsv_shift;
   4173             h = (vr & (g - b)) +
   4174                 (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
   4175             h = (h * hdiv_table[diff] + (1 << (hsv_shift-1))) >> hsv_shift;
   4176             h += h < 0 ? hr : 0;
   4177 
   4178             dst[i] = saturate_cast<uchar>(h);
   4179             dst[i+1] = (uchar)s;
   4180             dst[i+2] = (uchar)v;
   4181         }
   4182     }
   4183 
   4184     int srccn, blueIdx, hrange;
   4185 };
   4186 
   4187 
   4188 struct RGB2HSV_f
   4189 {
   4190     typedef float channel_type;
   4191 
   4192     RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
   4193     : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}
   4194 
   4195     void operator()(const float* src, float* dst, int n) const
   4196     {
   4197         int i, bidx = blueIdx, scn = srccn;
   4198         float hscale = hrange*(1.f/360.f);
   4199         n *= 3;
   4200 
   4201         for( i = 0; i < n; i += 3, src += scn )
   4202         {
   4203             float b = src[bidx], g = src[1], r = src[bidx^2];
   4204             float h, s, v;
   4205 
   4206             float vmin, diff;
   4207 
   4208             v = vmin = r;
   4209             if( v < g ) v = g;
   4210             if( v < b ) v = b;
   4211             if( vmin > g ) vmin = g;
   4212             if( vmin > b ) vmin = b;
   4213 
   4214             diff = v - vmin;
   4215             s = diff/(float)(fabs(v) + FLT_EPSILON);
   4216             diff = (float)(60./(diff + FLT_EPSILON));
   4217             if( v == r )
   4218                 h = (g - b)*diff;
   4219             else if( v == g )
   4220                 h = (b - r)*diff + 120.f;
   4221             else
   4222                 h = (r - g)*diff + 240.f;
   4223 
   4224             if( h < 0 ) h += 360.f;
   4225 
   4226             dst[i] = h*hscale;
   4227             dst[i+1] = s;
   4228             dst[i+2] = v;
   4229         }
   4230     }
   4231 
   4232     int srccn, blueIdx;
   4233     float hrange;
   4234 };
   4235 
   4236 
   4237 struct HSV2RGB_f
   4238 {
   4239     typedef float channel_type;
   4240 
   4241     HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
   4242     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
   4243 
   4244     void operator()(const float* src, float* dst, int n) const
   4245     {
   4246         int i, bidx = blueIdx, dcn = dstcn;
   4247         float _hscale = hscale;
   4248         float alpha = ColorChannel<float>::max();
   4249         n *= 3;
   4250 
   4251         for( i = 0; i < n; i += 3, dst += dcn )
   4252         {
   4253             float h = src[i], s = src[i+1], v = src[i+2];
   4254             float b, g, r;
   4255 
   4256             if( s == 0 )
   4257                 b = g = r = v;
   4258             else
   4259             {
   4260                 static const int sector_data[][3]=
   4261                     {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
   4262                 float tab[4];
   4263                 int sector;
   4264                 h *= _hscale;
   4265                 if( h < 0 )
   4266                     do h += 6; while( h < 0 );
   4267                 else if( h >= 6 )
   4268                     do h -= 6; while( h >= 6 );
   4269                 sector = cvFloor(h);
   4270                 h -= sector;
   4271                 if( (unsigned)sector >= 6u )
   4272                 {
   4273                     sector = 0;
   4274                     h = 0.f;
   4275                 }
   4276 
   4277                 tab[0] = v;
   4278                 tab[1] = v*(1.f - s);
   4279                 tab[2] = v*(1.f - s*h);
   4280                 tab[3] = v*(1.f - s*(1.f - h));
   4281 
   4282                 b = tab[sector_data[sector][0]];
   4283                 g = tab[sector_data[sector][1]];
   4284                 r = tab[sector_data[sector][2]];
   4285             }
   4286 
   4287             dst[bidx] = b;
   4288             dst[1] = g;
   4289             dst[bidx^2] = r;
   4290             if( dcn == 4 )
   4291                 dst[3] = alpha;
   4292         }
   4293     }
   4294 
   4295     int dstcn, blueIdx;
   4296     float hscale;
   4297 };
   4298 
   4299 
   4300 struct HSV2RGB_b
   4301 {
            // 8-bit HSV -> RGB conversion functor.
            //
            // Pixels are handled in blocks of at most BLOCK_SIZE. Each block is expanded
            // into a float buffer (first channel copied unscaled, second and third
            // multiplied by 1/255), converted in place by the float functor `cvt`, then
            // multiplied by 255 and saturated back to uchar. When dstcn == 4 the fourth
            // output channel is set to ColorChannel<uchar>::max().
   4302     typedef uchar channel_type;
   4303 
            // _dstcn   - channels written per output pixel (alpha is added when it is 4).
            // _blueIdx - forwarded to the float converter.
            // _hrange  - forwarded to the float converter as a float.
            // The inner converter is always built for 3 channels because the intermediate
            // buffer holds exactly 3 floats per pixel.
   4304     HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)
   4305     : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
   4306     {
                // Pre-broadcast the 1/255 and 255 scale factors used by the SIMD paths.
   4307         #if CV_NEON
   4308         v_scale_inv = vdupq_n_f32(1.f/255.f);
   4309         v_scale = vdupq_n_f32(255.f);
   4310         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
   4311         #elif CV_SSE2
   4312         v_scale_inv = _mm_set1_ps(1.f/255.f);
   4313         v_scale = _mm_set1_ps(255.0f);
   4314         v_zero = _mm_setzero_si128();
                // The SSE2 code below is additionally gated on this run-time check.
   4315         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   4316         #endif
   4317     }
   4318 
   4319     #if CV_SSE2
            // Converts eight pixels, given as three registers of eight 16-bit values
            // (one register per channel), to 24 floats stored at buf.
            // Despite the r/g/b names, the callers pass the 1st/2nd/3rd source channel;
            // only the 2nd and 3rd are scaled by 1/255, the 1st is stored unscaled.
            // buf must be 16-byte aligned because aligned stores are used.
   4320     // 16s x 8
   4321     void process(__m128i v_r, __m128i v_g, __m128i v_b,
   4322                  float * buf) const
   4323     {
                // Zero-extend the low four 16-bit lanes to 32 bits and convert to float.
   4324         __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
   4325         __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
   4326         __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
   4327 
                // Same for the high four lanes.
   4328         __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
   4329         __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
   4330         __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
   4331 
   4332         v_g0 = _mm_mul_ps(v_g0, v_scale_inv);
   4333         v_b0 = _mm_mul_ps(v_b0, v_scale_inv);
   4334 
   4335         v_g1 = _mm_mul_ps(v_g1, v_scale_inv);
   4336         v_b1 = _mm_mul_ps(v_b1, v_scale_inv);
   4337 
                // NOTE(review): _mm_interleave_ps is a helper defined outside this block;
                // presumably it rewrites the six registers in place into per-pixel
                // interleaved order, so the stores below emit them in memory order.
   4338         _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   4339 
   4340         _mm_store_ps(buf, v_r0);
   4341         _mm_store_ps(buf + 4, v_r1);
   4342         _mm_store_ps(buf + 8, v_g0);
   4343         _mm_store_ps(buf + 12, v_g1);
   4344         _mm_store_ps(buf + 16, v_b0);
   4345         _mm_store_ps(buf + 20, v_b1);
   4346     }
   4347     #endif
   4348 
            // Converts n pixels.
            // src - packed 3-channel 8-bit input (3 bytes are consumed per pixel).
            // dst - packed output with dstcn bytes per pixel.
   4349     void operator()(const uchar* src, uchar* dst, int n) const
   4350     {
   4351         int i, j, dcn = dstcn;
   4352         uchar alpha = ColorChannel<uchar>::max();
                // Scratch buffer for one block; aligned for the aligned SSE loads/stores.
   4353         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
   4354 
                // src is advanced here once per block; dst is advanced inside the
                // output loops below because its stride depends on dcn.
   4355         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
   4356         {
                    // dn = number of pixels in this block; j indexes values (3 per pixel).
   4357             int dn = std::min(n - i, (int)BLOCK_SIZE);
   4358             j = 0;
   4359 
                    // ---- Stage 1: uchar -> float into buf ----
   4360             #if CV_NEON
                    // 8 pixels per iteration, de-interleaved by the vld3 load.
   4361             for ( ; j <= (dn - 8) * 3; j += 24)
   4362             {
   4363                 uint8x8x3_t v_src = vld3_u8(src + j);
   4364                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
   4365                            v_t1 = vmovl_u8(v_src.val[1]),
   4366                            v_t2 = vmovl_u8(v_src.val[2]);
   4367 
                        // First channel stays unscaled; the other two are scaled by 1/255.
   4368                 float32x4x3_t v_dst;
   4369                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
   4370                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
   4371                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
   4372                 vst3q_f32(buf + j, v_dst);
   4373 
   4374                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
   4375                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
   4376                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
   4377                 vst3q_f32(buf + j + 12, v_dst);
   4378             }
   4379             #elif CV_SSE2
   4380             if (haveSIMD)
   4381             {
                        // 32 pixels (96 bytes) per iteration.
   4382                 for ( ; j <= (dn - 32) * 3; j += 96)
   4383                 {
   4384                     __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
   4385                     __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
   4386                     __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
   4387                     __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
   4388                     __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
   4389                     __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
   4390 
                            // NOTE(review): helper defined outside this block; presumably it
                            // splits the six registers in place into per-channel planes.
   4391                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   4392 
                            // Four groups of 8 pixels; each group fills 24 floats of buf.
   4393                     process(_mm_unpacklo_epi8(v_r0, v_zero),
   4394                             _mm_unpacklo_epi8(v_g0, v_zero),
   4395                             _mm_unpacklo_epi8(v_b0, v_zero),
   4396                             buf + j);
   4397 
   4398                     process(_mm_unpackhi_epi8(v_r0, v_zero),
   4399                             _mm_unpackhi_epi8(v_g0, v_zero),
   4400                             _mm_unpackhi_epi8(v_b0, v_zero),
   4401                             buf + j + 24);
   4402 
   4403                     process(_mm_unpacklo_epi8(v_r1, v_zero),
   4404                             _mm_unpacklo_epi8(v_g1, v_zero),
   4405                             _mm_unpacklo_epi8(v_b1, v_zero),
   4406                             buf + j + 48);
   4407 
   4408                     process(_mm_unpackhi_epi8(v_r1, v_zero),
   4409                             _mm_unpackhi_epi8(v_g1, v_zero),
   4410                             _mm_unpackhi_epi8(v_b1, v_zero),
   4411                             buf + j + 72);
   4412                 }
   4413             }
   4414             #endif
   4415 
                    // Scalar path: handles whatever the SIMD loop left over (or everything).
   4416             for( ; j < dn*3; j += 3 )
   4417             {
   4418                 buf[j] = src[j];
   4419                 buf[j+1] = src[j+1]*(1.f/255.f);
   4420                 buf[j+2] = src[j+2]*(1.f/255.f);
   4421             }
                    // ---- Stage 2: float conversion, in place (3 floats in, 3 floats out) ----
   4422             cvt(buf, buf, dn);
   4423 
                    // ---- Stage 3: float -> uchar (x255, saturated) into dst ----
   4424             j = 0;
   4425             #if CV_NEON
   4426             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
   4427             {
   4428                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
   4429                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
   4430                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
   4431                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
   4432                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
   4433                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
   4434                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
   4435 
                        // Interleaving store; the 4-channel variant appends the constant alpha.
   4436                 if (dcn == 4)
   4437                 {
   4438                     uint8x8x4_t v_dst;
   4439                     v_dst.val[0] = v_dst0;
   4440                     v_dst.val[1] = v_dst1;
   4441                     v_dst.val[2] = v_dst2;
   4442                     v_dst.val[3] = v_alpha;
   4443                     vst4_u8(dst, v_dst);
   4444                 }
   4445                 else
   4446                 {
   4447                     uint8x8x3_t v_dst;
   4448                     v_dst.val[0] = v_dst0;
   4449                     v_dst.val[1] = v_dst1;
   4450                     v_dst.val[2] = v_dst2;
   4451                     vst3_u8(dst, v_dst);
   4452                 }
   4453             }
   4454             #elif CV_SSE2
                    // Only the 3-channel layout is vectorized: 16 consecutive floats become
                    // 16 consecutive bytes, which leaves no slot for an alpha byte.
   4455             if (dcn == 3 && haveSIMD)
   4456             {
   4457                 for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
   4458                 {
   4459                     __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
   4460                     __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
   4461                     __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
   4462                     __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
   4463 
                            // float -> int32 -> int16 (signed saturation) -> uint8 (unsigned saturation).
   4464                     __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
   4465                                                      _mm_cvtps_epi32(v_src1));
   4466                     __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
   4467                                                      _mm_cvtps_epi32(v_src3));
   4468 
   4469                     _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
   4470                 }
   4471 
                        // j advanced in steps of 16 values, which need not be a multiple of 3.
                        // Step back to the first value of the last, partly written pixel so
                        // the scalar loop below rewrites that pixel in full.
   4472                 int jr = j % 3;
   4473                 if (jr)
   4474                     dst -= jr, j -= jr;
   4475             }
   4476             #endif
   4477 
                    // Scalar path: remaining pixels, plus alpha when dcn == 4.
   4478             for( ; j < dn*3; j += 3, dst += dcn )
   4479             {
   4480                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
   4481                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
   4482                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
   4483                 if( dcn == 4 )
   4484                     dst[3] = alpha;
   4485             }
   4486         }
   4487     }
   4488 
            // Output channel count and the float converter doing the colour math.
   4489     int dstcn;
   4490     HSV2RGB_f cvt;
            // SIMD constants initialised by the constructor.
   4491     #if CV_NEON
   4492     float32x4_t v_scale, v_scale_inv;
   4493     uint8x8_t v_alpha;
   4494     #elif CV_SSE2
   4495     __m128 v_scale_inv, v_scale;
   4496     __m128i v_zero;
   4497     bool haveSIMD;
   4498     #endif
   4499 };
   4500 
   4501 
   4502 ///////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
   4503 
   4504 struct RGB2HLS_f
   4505 {
   4506     typedef float channel_type;
   4507 
   4508     RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
   4509     : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}
   4510 
   4511     void operator()(const float* src, float* dst, int n) const
   4512     {
   4513         int i, bidx = blueIdx, scn = srccn;
   4514         float hscale = hrange*(1.f/360.f);
   4515         n *= 3;
   4516 
   4517         for( i = 0; i < n; i += 3, src += scn )
   4518         {
   4519             float b = src[bidx], g = src[1], r = src[bidx^2];
   4520             float h = 0.f, s = 0.f, l;
   4521             float vmin, vmax, diff;
   4522 
   4523             vmax = vmin = r;
   4524             if( vmax < g ) vmax = g;
   4525             if( vmax < b ) vmax = b;
   4526             if( vmin > g ) vmin = g;
   4527             if( vmin > b ) vmin = b;
   4528 
   4529             diff = vmax - vmin;
   4530             l = (vmax + vmin)*0.5f;
   4531 
   4532             if( diff > FLT_EPSILON )
   4533             {
   4534                 s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
   4535                 diff = 60.f/diff;
   4536 
   4537                 if( vmax == r )
   4538                     h = (g - b)*diff;
   4539                 else if( vmax == g )
   4540                     h = (b - r)*diff + 120.f;
   4541                 else
   4542                     h = (r - g)*diff + 240.f;
   4543 
   4544                 if( h < 0.f ) h += 360.f;
   4545             }
   4546 
   4547             dst[i] = h*hscale;
   4548             dst[i+1] = l;
   4549             dst[i+2] = s;
   4550         }
   4551     }
   4552 
   4553     int srccn, blueIdx;
   4554     float hrange;
   4555 };
   4556 
   4557 
   4558 struct RGB2HLS_b
   4559 {
            // 8-bit RGB -> HLS conversion functor.
            //
            // Pixels are handled in blocks of at most BLOCK_SIZE. The first three
            // channels of each source pixel are multiplied by 1/255 into a float buffer,
            // converted in place by the float functor `cvt`, and written back as uchar:
            // the first result channel is saturated without further scaling, the
            // second and third are multiplied by 255 first.
   4560     typedef uchar channel_type;
   4561 
            // _srccn   - bytes per source pixel (only the first three are read).
            // _blueIdx - forwarded to the float converter.
            // _hrange  - forwarded to the float converter as a float.
            // The inner converter is always built for 3 channels because the intermediate
            // buffer holds exactly 3 floats per pixel.
   4562     RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)
   4563     : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange)
   4564     {
                // Pre-broadcast the 1/255 and 255 scale factors used by the SIMD paths.
   4565         #if CV_NEON
   4566         v_scale_inv = vdupq_n_f32(1.f/255.f);
   4567         v_scale = vdupq_n_f32(255.f);
                // Initialised here, but not read by operator() of this struct.
   4568         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
   4569         #elif CV_SSE2
   4570         v_scale_inv = _mm_set1_ps(1.f/255.f);
   4571         v_scale = _mm_set1_ps(255.f);
   4572         v_zero = _mm_setzero_si128();
                // The SSE2 code below is additionally gated on this run-time check.
   4573         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   4574         #endif
   4575     }
   4576 
   4577     #if CV_SSE2
            // Reads 24 floats (8 interleaved pixels) from buf and returns, per channel,
            // eight 16-bit values in v_h, v_l and v_s. The second and third channel
            // are multiplied by 255 before conversion; the first is converted unscaled.
            // buf must be 16-byte aligned because aligned loads are used.
   4578     void process(const float * buf,
   4579                  __m128i & v_h, __m128i & v_l, __m128i & v_s) const
   4580     {
   4581         __m128 v_h0f = _mm_load_ps(buf);
   4582         __m128 v_h1f = _mm_load_ps(buf + 4);
   4583         __m128 v_l0f = _mm_load_ps(buf + 8);
   4584         __m128 v_l1f = _mm_load_ps(buf + 12);
   4585         __m128 v_s0f = _mm_load_ps(buf + 16);
   4586         __m128 v_s1f = _mm_load_ps(buf + 20);
   4587 
                // NOTE(review): helper defined outside this block; presumably it turns the
                // six registers, loaded in memory order, into per-channel planes in place.
   4588         _mm_deinterleave_ps(v_h0f, v_h1f, v_l0f, v_l1f, v_s0f, v_s1f);
   4589 
   4590         v_l0f = _mm_mul_ps(v_l0f, v_scale);
   4591         v_l1f = _mm_mul_ps(v_l1f, v_scale);
   4592         v_s0f = _mm_mul_ps(v_s0f, v_scale);
   4593         v_s1f = _mm_mul_ps(v_s1f, v_scale);
   4594 
                // float -> int32, then pack pairs to int16 with signed saturation.
   4595         v_h = _mm_packs_epi32(_mm_cvtps_epi32(v_h0f), _mm_cvtps_epi32(v_h1f));
   4596         v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
   4597         v_s = _mm_packs_epi32(_mm_cvtps_epi32(v_s0f), _mm_cvtps_epi32(v_s1f));
   4598     }
   4599     #endif
   4600 
            // Converts n pixels.
            // src - packed 8-bit input with srccn bytes per pixel.
            // dst - packed 3-channel 8-bit output (3 bytes are produced per pixel).
   4601     void operator()(const uchar* src, uchar* dst, int n) const
   4602     {
   4603         int i, j, scn = srccn;
                // Scratch buffer for one block; aligned for the aligned SSE loads/stores.
   4604         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
   4605 
                // dst is advanced here once per block; src is advanced inside the
                // input loops below because its stride depends on scn.
   4606         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
   4607         {
                    // dn = number of pixels in this block; j indexes values (3 per pixel).
   4608             int dn = std::min(n - i, (int)BLOCK_SIZE);
   4609             j = 0;
   4610 
                    // ---- Stage 1: uchar -> float (x 1/255) into buf ----
   4611             #if CV_NEON
                    // 8 pixels per iteration.
   4612             for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
   4613             {
   4614                 uint16x8_t v_t0, v_t1, v_t2;
   4615 
                        // De-interleaving load; with 4 channels the 4th plane is ignored.
   4616                 if (scn == 3)
   4617                 {
   4618                     uint8x8x3_t v_src = vld3_u8(src);
   4619                     v_t0 = vmovl_u8(v_src.val[0]);
   4620                     v_t1 = vmovl_u8(v_src.val[1]);
   4621                     v_t2 = vmovl_u8(v_src.val[2]);
   4622                 }
   4623                 else
   4624                 {
   4625                     uint8x8x4_t v_src = vld4_u8(src);
   4626                     v_t0 = vmovl_u8(v_src.val[0]);
   4627                     v_t1 = vmovl_u8(v_src.val[1]);
   4628                     v_t2 = vmovl_u8(v_src.val[2]);
   4629                 }
   4630 
   4631                 float32x4x3_t v_dst;
   4632                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
   4633                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
   4634                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
   4635                 vst3q_f32(buf + j, v_dst);
   4636 
   4637                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
   4638                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
   4639                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
   4640                 vst3q_f32(buf + j + 12, v_dst);
   4641             }
   4642             #elif CV_SSE2
                    // Only the 3-channel layout is vectorized: 16 consecutive source bytes
                    // are converted to 16 consecutive floats, which matches buf only when
                    // there is no extra channel to skip.
   4643             if (scn == 3 && haveSIMD)
   4644             {
   4645                 for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
   4646                 {
   4647                     __m128i v_src = _mm_loadu_si128((__m128i const *)src);
   4648 
                            // Low 8 bytes: widen 8 -> 16 -> 32 bits, convert, scale, store.
   4649                     __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
   4650                     _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
   4651                     _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
   4652 
                            // High 8 bytes.
   4653                     v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
   4654                     _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
   4655                     _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
   4656                 }
   4657 
                        // j advanced in steps of 16 values, which need not be a multiple of 3.
                        // Step back to the first value of the last, partly converted pixel so
                        // the scalar loop below redoes that pixel in full.
   4658                 int jr = j % 3;
   4659                 if (jr)
   4660                     src -= jr, j -= jr;
   4661             }
   4662             #endif
                    // Scalar path: remaining pixels; src advances by scn, skipping any 4th channel.
   4663             for( ; j < dn*3; j += 3, src += scn )
   4664             {
   4665                 buf[j] = src[0]*(1.f/255.f);
   4666                 buf[j+1] = src[1]*(1.f/255.f);
   4667                 buf[j+2] = src[2]*(1.f/255.f);
   4668             }
                    // ---- Stage 2: float conversion, in place (3 floats in, 3 floats out) ----
   4669             cvt(buf, buf, dn);
   4670 
                    // ---- Stage 3: float -> uchar into dst (indexed by j within the block) ----
   4671             j = 0;
   4672             #if CV_NEON
   4673             for ( ; j <= (dn - 8) * 3; j += 24)
   4674             {
   4675                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
   4676 
                        // val[0] is narrowed without scaling; val[1] and val[2] are scaled by 255.
   4677                 uint8x8x3_t v_dst;
   4678                 v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_src0.val[0])),
   4679                                                        vqmovn_u32(cv_vrndq_u32_f32(v_src1.val[0]))));
   4680                 v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
   4681                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
   4682                 v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
   4683                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
   4684                 vst3_u8(dst + j, v_dst);
   4685             }
   4686             #elif CV_SSE2
   4687             if (haveSIMD)
   4688             {
                        // 32 pixels (96 floats in, 96 bytes out) per iteration.
   4689                 for ( ; j <= (dn - 32) * 3; j += 96)
   4690                 {
                            // Pixels 0..7 and 8..15 -> first 16 bytes of each channel.
   4691                     __m128i v_h_0, v_l_0, v_s_0;
   4692                     process(buf + j,
   4693                             v_h_0, v_l_0, v_s_0);
   4694 
   4695                     __m128i v_h_1, v_l_1, v_s_1;
   4696                     process(buf + j + 24,
   4697                             v_h_1, v_l_1, v_s_1);
   4698 
                            // int16 -> uint8 with unsigned saturation.
   4699                     __m128i v_h0 = _mm_packus_epi16(v_h_0, v_h_1);
   4700                     __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1);
   4701                     __m128i v_s0 = _mm_packus_epi16(v_s_0, v_s_1);
   4702 
                            // Pixels 16..23 and 24..31 -> second 16 bytes of each channel.
   4703                     process(buf + j + 48,
   4704                             v_h_0, v_l_0, v_s_0);
   4705 
   4706                     process(buf + j + 72,
   4707                             v_h_1, v_l_1, v_s_1);
   4708 
   4709                     __m128i v_h1 = _mm_packus_epi16(v_h_0, v_h_1);
   4710                     __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1);
   4711                     __m128i v_s1 = _mm_packus_epi16(v_s_0, v_s_1);
   4712 
                            // NOTE(review): helper defined outside this block; presumably it
                            // rewrites the six registers in place into per-pixel interleaved
                            // order, so the stores below emit them in memory order.
   4713                     _mm_interleave_epi8(v_h0, v_h1, v_l0, v_l1, v_s0, v_s1);
   4714 
   4715                     _mm_storeu_si128((__m128i *)(dst + j), v_h0);
   4716                     _mm_storeu_si128((__m128i *)(dst + j + 16), v_h1);
   4717                     _mm_storeu_si128((__m128i *)(dst + j + 32), v_l0);
   4718                     _mm_storeu_si128((__m128i *)(dst + j + 48), v_l1);
   4719                     _mm_storeu_si128((__m128i *)(dst + j + 64), v_s0);
   4720                     _mm_storeu_si128((__m128i *)(dst + j + 80), v_s1);
   4721                 }
   4722             }
   4723             #endif
                    // Scalar path: the first channel is stored without the x255 scaling.
   4724             for( ; j < dn*3; j += 3 )
   4725             {
   4726                 dst[j] = saturate_cast<uchar>(buf[j]);
   4727                 dst[j+1] = saturate_cast<uchar>(buf[j+1]*255.f);
   4728                 dst[j+2] = saturate_cast<uchar>(buf[j+2]*255.f);
   4729             }
   4730         }
   4731     }
   4732 
            // Source channel count and the float converter doing the colour math.
   4733     int srccn;
   4734     RGB2HLS_f cvt;
            // SIMD constants initialised by the constructor.
   4735     #if CV_NEON
   4736     float32x4_t v_scale, v_scale_inv;
   4737     uint8x8_t v_alpha;
   4738     #elif CV_SSE2
   4739     __m128 v_scale, v_scale_inv;
   4740     __m128i v_zero;
   4741     bool haveSIMD;
   4742     #endif
   4743 };
   4744 
   4745 
   4746 struct HLS2RGB_f
   4747 {
   4748     typedef float channel_type;
   4749 
   4750     HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
   4751     : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
   4752 
   4753     void operator()(const float* src, float* dst, int n) const
   4754     {
   4755         int i, bidx = blueIdx, dcn = dstcn;
   4756         float _hscale = hscale;
   4757         float alpha = ColorChannel<float>::max();
   4758         n *= 3;
   4759 
   4760         for( i = 0; i < n; i += 3, dst += dcn )
   4761         {
   4762             float h = src[i], l = src[i+1], s = src[i+2];
   4763             float b, g, r;
   4764 
   4765             if( s == 0 )
   4766                 b = g = r = l;
   4767             else
   4768             {
   4769                 static const int sector_data[][3]=
   4770                 {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
   4771                 float tab[4];
   4772                 int sector;
   4773 
   4774                 float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
   4775                 float p1 = 2*l - p2;
   4776 
   4777                 h *= _hscale;
   4778                 if( h < 0 )
   4779                     do h += 6; while( h < 0 );
   4780                 else if( h >= 6 )
   4781                     do h -= 6; while( h >= 6 );
   4782 
   4783                 assert( 0 <= h && h < 6 );
   4784                 sector = cvFloor(h);
   4785                 h -= sector;
   4786 
   4787                 tab[0] = p2;
   4788                 tab[1] = p1;
   4789                 tab[2] = p1 + (p2 - p1)*(1-h);
   4790                 tab[3] = p1 + (p2 - p1)*h;
   4791 
   4792                 b = tab[sector_data[sector][0]];
   4793                 g = tab[sector_data[sector][1]];
   4794                 r = tab[sector_data[sector][2]];
   4795             }
   4796 
   4797             dst[bidx] = b;
   4798             dst[1] = g;
   4799             dst[bidx^2] = r;
   4800             if( dcn == 4 )
   4801                 dst[3] = alpha;
   4802         }
   4803     }
   4804 
   4805     int dstcn, blueIdx;
   4806     float hscale;
   4807 };
   4808 
   4809 
   4810 struct HLS2RGB_b
   4811 {
            // 8-bit HLS -> RGB conversion functor.
            //
            // Pixels are handled in blocks of at most BLOCK_SIZE. Each block is expanded
            // into a float buffer (first channel copied unscaled, second and third
            // multiplied by 1/255), converted in place by the float functor `cvt`, then
            // multiplied by 255 and saturated back to uchar. When dstcn == 4 the fourth
            // output channel is set to ColorChannel<uchar>::max().
   4812     typedef uchar channel_type;
   4813 
            // _dstcn   - channels written per output pixel (alpha is added when it is 4).
            // _blueIdx - forwarded to the float converter.
            // _hrange  - forwarded to the float converter as a float.
            // The inner converter is always built for 3 channels because the intermediate
            // buffer holds exactly 3 floats per pixel.
   4814     HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)
   4815     : dstcn(_dstcn), cvt(3, _blueIdx, (float)_hrange)
   4816     {
                // Pre-broadcast the 1/255 and 255 scale factors used by the SIMD paths.
   4817         #if CV_NEON
   4818         v_scale_inv = vdupq_n_f32(1.f/255.f);
   4819         v_scale = vdupq_n_f32(255.f);
   4820         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
   4821         #elif CV_SSE2
   4822         v_scale_inv = _mm_set1_ps(1.f/255.f);
   4823         v_scale = _mm_set1_ps(255.f);
   4824         v_zero = _mm_setzero_si128();
                // The SSE2 code below is additionally gated on this run-time check.
   4825         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   4826         #endif
   4827     }
   4828 
   4829     #if CV_SSE2
            // Converts eight pixels, given as three registers of eight 16-bit values
            // (one register per channel), to 24 floats stored at buf.
            // Despite the r/g/b names, the callers pass the 1st/2nd/3rd source channel;
            // only the 2nd and 3rd are scaled by 1/255, the 1st is stored unscaled.
            // buf must be 16-byte aligned because aligned stores are used.
   4830     // 16s x 8
   4831     void process(__m128i v_r, __m128i v_g, __m128i v_b,
   4832                  float * buf) const
   4833     {
                // Zero-extend the low four 16-bit lanes to 32 bits and convert to float.
   4834         __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
   4835         __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
   4836         __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
   4837 
                // Same for the high four lanes.
   4838         __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
   4839         __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
   4840         __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
   4841 
   4842         v_g0 = _mm_mul_ps(v_g0, v_scale_inv);
   4843         v_b0 = _mm_mul_ps(v_b0, v_scale_inv);
   4844 
   4845         v_g1 = _mm_mul_ps(v_g1, v_scale_inv);
   4846         v_b1 = _mm_mul_ps(v_b1, v_scale_inv);
   4847 
                // NOTE(review): _mm_interleave_ps is a helper defined outside this block;
                // presumably it rewrites the six registers in place into per-pixel
                // interleaved order, so the stores below emit them in memory order.
   4848         _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   4849 
   4850         _mm_store_ps(buf, v_r0);
   4851         _mm_store_ps(buf + 4, v_r1);
   4852         _mm_store_ps(buf + 8, v_g0);
   4853         _mm_store_ps(buf + 12, v_g1);
   4854         _mm_store_ps(buf + 16, v_b0);
   4855         _mm_store_ps(buf + 20, v_b1);
   4856     }
   4857     #endif
   4858 
            // Converts n pixels.
            // src - packed 3-channel 8-bit input (3 bytes are consumed per pixel).
            // dst - packed output with dstcn bytes per pixel.
   4859     void operator()(const uchar* src, uchar* dst, int n) const
   4860     {
   4861         int i, j, dcn = dstcn;
   4862         uchar alpha = ColorChannel<uchar>::max();
                // Scratch buffer for one block; aligned for the aligned SSE loads/stores.
   4863         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
   4864 
                // src is advanced here once per block; dst is advanced inside the
                // output loops below because its stride depends on dcn.
   4865         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
   4866         {
                    // dn = number of pixels in this block; j indexes values (3 per pixel).
   4867             int dn = std::min(n - i, (int)BLOCK_SIZE);
   4868             j = 0;
   4869 
                    // ---- Stage 1: uchar -> float into buf ----
   4870             #if CV_NEON
                    // 8 pixels per iteration, de-interleaved by the vld3 load.
   4871             for ( ; j <= (dn - 8) * 3; j += 24)
   4872             {
   4873                 uint8x8x3_t v_src = vld3_u8(src + j);
   4874                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
   4875                            v_t1 = vmovl_u8(v_src.val[1]),
   4876                            v_t2 = vmovl_u8(v_src.val[2]);
   4877 
                        // First channel stays unscaled; the other two are scaled by 1/255.
   4878                 float32x4x3_t v_dst;
   4879                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0)));
   4880                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
   4881                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
   4882                 vst3q_f32(buf + j, v_dst);
   4883 
   4884                 v_dst.val[0] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0)));
   4885                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
   4886                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
   4887                 vst3q_f32(buf + j + 12, v_dst);
   4888             }
   4889             #elif CV_SSE2
   4890             if (haveSIMD)
   4891             {
                        // 32 pixels (96 bytes) per iteration.
   4892                 for ( ; j <= (dn - 32) * 3; j += 96)
   4893                 {
   4894                     __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
   4895                     __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
   4896                     __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
   4897                     __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
   4898                     __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
   4899                     __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
   4900 
                            // NOTE(review): helper defined outside this block; presumably it
                            // splits the six registers in place into per-channel planes.
   4901                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   4902 
                            // Four groups of 8 pixels; each group fills 24 floats of buf.
   4903                     process(_mm_unpacklo_epi8(v_r0, v_zero),
   4904                             _mm_unpacklo_epi8(v_g0, v_zero),
   4905                             _mm_unpacklo_epi8(v_b0, v_zero),
   4906                             buf + j);
   4907 
   4908                     process(_mm_unpackhi_epi8(v_r0, v_zero),
   4909                             _mm_unpackhi_epi8(v_g0, v_zero),
   4910                             _mm_unpackhi_epi8(v_b0, v_zero),
   4911                             buf + j + 24);
   4912 
   4913                     process(_mm_unpacklo_epi8(v_r1, v_zero),
   4914                             _mm_unpacklo_epi8(v_g1, v_zero),
   4915                             _mm_unpacklo_epi8(v_b1, v_zero),
   4916                             buf + j + 48);
   4917 
   4918                     process(_mm_unpackhi_epi8(v_r1, v_zero),
   4919                             _mm_unpackhi_epi8(v_g1, v_zero),
   4920                             _mm_unpackhi_epi8(v_b1, v_zero),
   4921                             buf + j + 72);
   4922                 }
   4923             }
   4924             #endif
                    // Scalar path: handles whatever the SIMD loop left over (or everything).
   4925             for( ; j < dn*3; j += 3 )
   4926             {
   4927                 buf[j] = src[j];
   4928                 buf[j+1] = src[j+1]*(1.f/255.f);
   4929                 buf[j+2] = src[j+2]*(1.f/255.f);
   4930             }
                    // ---- Stage 2: float conversion, in place (3 floats in, 3 floats out) ----
   4931             cvt(buf, buf, dn);
   4932 
                    // ---- Stage 3: float -> uchar (x255, saturated) into dst ----
   4933             j = 0;
   4934             #if CV_NEON
   4935             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
   4936             {
   4937                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
   4938                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
   4939                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
   4940                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
   4941                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
   4942                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
   4943                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
   4944 
                        // Interleaving store; the 4-channel variant appends the constant alpha.
   4945                 if (dcn == 4)
   4946                 {
   4947                     uint8x8x4_t v_dst;
   4948                     v_dst.val[0] = v_dst0;
   4949                     v_dst.val[1] = v_dst1;
   4950                     v_dst.val[2] = v_dst2;
   4951                     v_dst.val[3] = v_alpha;
   4952                     vst4_u8(dst, v_dst);
   4953                 }
   4954                 else
   4955                 {
   4956                     uint8x8x3_t v_dst;
   4957                     v_dst.val[0] = v_dst0;
   4958                     v_dst.val[1] = v_dst1;
   4959                     v_dst.val[2] = v_dst2;
   4960                     vst3_u8(dst, v_dst);
   4961                 }
   4962             }
   4963             #elif CV_SSE2
                    // Only the 3-channel layout is vectorized: 16 consecutive floats become
                    // 16 consecutive bytes, which leaves no slot for an alpha byte.
   4964             if (dcn == 3 && haveSIMD)
   4965             {
   4966                 for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
   4967                 {
   4968                     __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
   4969                     __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
   4970                     __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
   4971                     __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
   4972 
                            // float -> int32 -> int16 (signed saturation) -> uint8 (unsigned saturation).
   4973                     __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
   4974                                                      _mm_cvtps_epi32(v_src1));
   4975                     __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
   4976                                                      _mm_cvtps_epi32(v_src3));
   4977 
   4978                     _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
   4979                 }
   4980 
                        // j advanced in steps of 16 values, which need not be a multiple of 3.
                        // Step back to the first value of the last, partly written pixel so
                        // the scalar loop below rewrites that pixel in full.
   4981                 int jr = j % 3;
   4982                 if (jr)
   4983                     dst -= jr, j -= jr;
   4984             }
   4985             #endif
   4986 
                    // Scalar path: remaining pixels, plus alpha when dcn == 4.
   4987             for( ; j < dn*3; j += 3, dst += dcn )
   4988             {
   4989                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
   4990                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
   4991                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
   4992                 if( dcn == 4 )
   4993                     dst[3] = alpha;
   4994             }
   4995         }
   4996     }
   4997 
            // Output channel count and the float converter doing the colour math.
   4998     int dstcn;
   4999     HLS2RGB_f cvt;
            // SIMD constants initialised by the constructor.
   5000     #if CV_NEON
   5001     float32x4_t v_scale, v_scale_inv;
   5002     uint8x8_t v_alpha;
   5003     #elif CV_SSE2
   5004     __m128 v_scale, v_scale_inv;
   5005     __m128i v_zero;
   5006     bool haveSIMD;
   5007     #endif
   5008 };
   5009 
   5010 
   5011 ///////////////////////////////////// RGB <-> L*a*b* /////////////////////////////////////
   5012 
   5013 static const float D65[] = { 0.950456f, 1.f, 1.088754f };
   5014 
   5015 enum { LAB_CBRT_TAB_SIZE = 1024, GAMMA_TAB_SIZE = 1024 };
   5016 static float LabCbrtTab[LAB_CBRT_TAB_SIZE*4];
   5017 static const float LabCbrtTabScale = LAB_CBRT_TAB_SIZE/1.5f;
   5018 
   5019 static float sRGBGammaTab[GAMMA_TAB_SIZE*4], sRGBInvGammaTab[GAMMA_TAB_SIZE*4];
   5020 static const float GammaTabScale = (float)GAMMA_TAB_SIZE;
   5021 
   5022 static ushort sRGBGammaTab_b[256], linearGammaTab_b[256];
   5023 #undef lab_shift
   5024 #define lab_shift xyz_shift
   5025 #define gamma_shift 3
   5026 #define lab_shift2 (lab_shift + gamma_shift)
   5027 #define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
   5028 static ushort LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];
   5029 
   5030 static void initLabTabs()
   5031 {
   5032     static bool initialized = false;
   5033     if(!initialized)
   5034     {
   5035         float f[LAB_CBRT_TAB_SIZE+1], g[GAMMA_TAB_SIZE+1], ig[GAMMA_TAB_SIZE+1], scale = 1.f/LabCbrtTabScale;
   5036         int i;
   5037         for(i = 0; i <= LAB_CBRT_TAB_SIZE; i++)
   5038         {
   5039             float x = i*scale;
   5040             f[i] = x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x);
   5041         }
   5042         splineBuild(f, LAB_CBRT_TAB_SIZE, LabCbrtTab);
   5043 
   5044         scale = 1.f/GammaTabScale;
   5045         for(i = 0; i <= GAMMA_TAB_SIZE; i++)
   5046         {
   5047             float x = i*scale;
   5048             g[i] = x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4);
   5049             ig[i] = x <= 0.0031308 ? x*12.92f : (float)(1.055*std::pow((double)x, 1./2.4) - 0.055);
   5050         }
   5051         splineBuild(g, GAMMA_TAB_SIZE, sRGBGammaTab);
   5052         splineBuild(ig, GAMMA_TAB_SIZE, sRGBInvGammaTab);
   5053 
   5054         for(i = 0; i < 256; i++)
   5055         {
   5056             float x = i*(1.f/255.f);
   5057             sRGBGammaTab_b[i] = saturate_cast<ushort>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)std::pow((double)(x + 0.055)*(1./1.055), 2.4)));
   5058             linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift));
   5059         }
   5060 
   5061         for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)
   5062         {
   5063             float x = i*(1.f/(255.f*(1 << gamma_shift)));
   5064             LabCbrtTab_b[i] = saturate_cast<ushort>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));
   5065         }
   5066         initialized = true;
   5067     }
   5068 }
   5069 
   5070 struct RGB2Lab_b
   5071 {
   5072     typedef uchar channel_type;
   5073 
   5074     RGB2Lab_b(int _srccn, int blueIdx, const float* _coeffs,
   5075               const float* _whitept, bool _srgb)
   5076     : srccn(_srccn), srgb(_srgb)
   5077     {
   5078         static volatile int _3 = 3;
   5079         initLabTabs();
   5080 
   5081         if (!_coeffs)
   5082             _coeffs = sRGB2XYZ_D65;
   5083         if (!_whitept)
   5084             _whitept = D65;
   5085 
   5086         float scale[] =
   5087         {
   5088             (1 << lab_shift)/_whitept[0],
   5089             (float)(1 << lab_shift),
   5090             (1 << lab_shift)/_whitept[2]
   5091         };
   5092 
   5093         for( int i = 0; i < _3; i++ )
   5094         {
   5095             coeffs[i*3+(blueIdx^2)] = cvRound(_coeffs[i*3]*scale[i]);
   5096             coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
   5097             coeffs[i*3+blueIdx] = cvRound(_coeffs[i*3+2]*scale[i]);
   5098 
   5099             CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
   5100                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
   5101         }
   5102     }
   5103 
   5104     void operator()(const uchar* src, uchar* dst, int n) const
   5105     {
   5106         const int Lscale = (116*255+50)/100;
   5107         const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
   5108         const ushort* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
   5109         int i, scn = srccn;
   5110         int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   5111             C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   5112             C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   5113         n *= 3;
   5114 
   5115         for( i = 0; i < n; i += 3, src += scn )
   5116         {
   5117             int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
   5118             int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
   5119             int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
   5120             int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
   5121 
   5122             int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
   5123             int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
   5124             int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
   5125 
   5126             dst[i] = saturate_cast<uchar>(L);
   5127             dst[i+1] = saturate_cast<uchar>(a);
   5128             dst[i+2] = saturate_cast<uchar>(b);
   5129         }
   5130     }
   5131 
   5132     int srccn;
   5133     int coeffs[9];
   5134     bool srgb;
   5135 };
   5136 
   5137 
   5138 #define clip(value) \
   5139     value < 0.0f ? 0.0f : value > 1.0f ? 1.0f : value;
   5140 
   5141 struct RGB2Lab_f
   5142 {
   5143     typedef float channel_type;
   5144 
   5145     RGB2Lab_f(int _srccn, int blueIdx, const float* _coeffs,
   5146               const float* _whitept, bool _srgb)
   5147     : srccn(_srccn), srgb(_srgb)
   5148     {
   5149         volatile int _3 = 3;
   5150         initLabTabs();
   5151 
   5152         if (!_coeffs)
   5153             _coeffs = sRGB2XYZ_D65;
   5154         if (!_whitept)
   5155             _whitept = D65;
   5156 
   5157         float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
   5158 
   5159         for( int i = 0; i < _3; i++ )
   5160         {
   5161             int j = i * 3;
   5162             coeffs[j + (blueIdx ^ 2)] = _coeffs[j] * scale[i];
   5163             coeffs[j + 1] = _coeffs[j + 1] * scale[i];
   5164             coeffs[j + blueIdx] = _coeffs[j + 2] * scale[i];
   5165 
   5166             CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
   5167                        coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*LabCbrtTabScale );
   5168         }
   5169     }
   5170 
   5171     void operator()(const float* src, float* dst, int n) const
   5172     {
   5173         int i, scn = srccn;
   5174         float gscale = GammaTabScale;
   5175         const float* gammaTab = srgb ? sRGBGammaTab : 0;
   5176         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   5177               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   5178               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   5179         n *= 3;
   5180 
   5181         static const float _1_3 = 1.0f / 3.0f;
   5182         static const float _a = 16.0f / 116.0f;
   5183         for (i = 0; i < n; i += 3, src += scn )
   5184         {
   5185             float R = clip(src[0]);
   5186             float G = clip(src[1]);
   5187             float B = clip(src[2]);
   5188 
   5189             if (gammaTab)
   5190             {
   5191                 R = splineInterpolate(R * gscale, gammaTab, GAMMA_TAB_SIZE);
   5192                 G = splineInterpolate(G * gscale, gammaTab, GAMMA_TAB_SIZE);
   5193                 B = splineInterpolate(B * gscale, gammaTab, GAMMA_TAB_SIZE);
   5194             }
   5195             float X = R*C0 + G*C1 + B*C2;
   5196             float Y = R*C3 + G*C4 + B*C5;
   5197             float Z = R*C6 + G*C7 + B*C8;
   5198 
   5199             float FX = X > 0.008856f ? std::pow(X, _1_3) : (7.787f * X + _a);
   5200             float FY = Y > 0.008856f ? std::pow(Y, _1_3) : (7.787f * Y + _a);
   5201             float FZ = Z > 0.008856f ? std::pow(Z, _1_3) : (7.787f * Z + _a);
   5202 
   5203             float L = Y > 0.008856f ? (116.f * FY - 16.f) : (903.3f * Y);
   5204             float a = 500.f * (FX - FY);
   5205             float b = 200.f * (FY - FZ);
   5206 
   5207             dst[i] = L;
   5208             dst[i + 1] = a;
   5209             dst[i + 2] = b;
   5210         }
   5211     }
   5212 
   5213     int srccn;
   5214     float coeffs[9];
   5215     bool srgb;
   5216 };
   5217 
   5218 struct Lab2RGB_f
   5219 {
   5220     typedef float channel_type;
   5221 
   5222     Lab2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
   5223               const float* _whitept, bool _srgb )
   5224     : dstcn(_dstcn), srgb(_srgb)
   5225     {
   5226         initLabTabs();
   5227 
   5228         if(!_coeffs)
   5229             _coeffs = XYZ2sRGB_D65;
   5230         if(!_whitept)
   5231             _whitept = D65;
   5232 
   5233         for( int i = 0; i < 3; i++ )
   5234         {
   5235             coeffs[i+(blueIdx^2)*3] = _coeffs[i]*_whitept[i];
   5236             coeffs[i+3] = _coeffs[i+3]*_whitept[i];
   5237             coeffs[i+blueIdx*3] = _coeffs[i+6]*_whitept[i];
   5238         }
   5239     }
   5240 
   5241     void operator()(const float* src, float* dst, int n) const
   5242     {
   5243         int i, dcn = dstcn;
   5244         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
   5245         float gscale = GammaTabScale;
   5246         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   5247         C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   5248         C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   5249         float alpha = ColorChannel<float>::max();
   5250         n *= 3;
   5251 
   5252         static const float lThresh = 0.008856f * 903.3f;
   5253         static const float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
   5254         for (i = 0; i < n; i += 3, dst += dcn)
   5255         {
   5256             float li = src[i];
   5257             float ai = src[i + 1];
   5258             float bi = src[i + 2];
   5259 
   5260             float y, fy;
   5261             if (li <= lThresh)
   5262             {
   5263                 y = li / 903.3f;
   5264                 fy = 7.787f * y + 16.0f / 116.0f;
   5265             }
   5266             else
   5267             {
   5268                 fy = (li + 16.0f) / 116.0f;
   5269                 y = fy * fy * fy;
   5270             }
   5271 
   5272             float fxz[] = { ai / 500.0f + fy, fy - bi / 200.0f };
   5273 
   5274             for (int j = 0; j < 2; j++)
   5275                 if (fxz[j] <= fThresh)
   5276                     fxz[j] = (fxz[j] - 16.0f / 116.0f) / 7.787f;
   5277                 else
   5278                     fxz[j] = fxz[j] * fxz[j] * fxz[j];
   5279 
   5280 
   5281             float x = fxz[0], z = fxz[1];
   5282             float ro = C0 * x + C1 * y + C2 * z;
   5283             float go = C3 * x + C4 * y + C5 * z;
   5284             float bo = C6 * x + C7 * y + C8 * z;
   5285             ro = clip(ro);
   5286             go = clip(go);
   5287             bo = clip(bo);
   5288 
   5289             if (gammaTab)
   5290             {
   5291                 ro = splineInterpolate(ro * gscale, gammaTab, GAMMA_TAB_SIZE);
   5292                 go = splineInterpolate(go * gscale, gammaTab, GAMMA_TAB_SIZE);
   5293                 bo = splineInterpolate(bo * gscale, gammaTab, GAMMA_TAB_SIZE);
   5294             }
   5295 
   5296             dst[0] = ro, dst[1] = go, dst[2] = bo;
   5297             if( dcn == 4 )
   5298                 dst[3] = alpha;
   5299         }
   5300     }
   5301 
   5302     int dstcn;
   5303     float coeffs[9];
   5304     bool srgb;
   5305 };
   5306 
   5307 #undef clip
   5308 
   5309 struct Lab2RGB_b
   5310 {
   5311     typedef uchar channel_type;
   5312 
   5313     Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
   5314                const float* _whitept, bool _srgb )
   5315     : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb )
   5316     {
   5317         #if CV_NEON
   5318         v_scale_inv = vdupq_n_f32(100.f/255.f);
   5319         v_scale = vdupq_n_f32(255.f);
   5320         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
   5321         v_128 = vdupq_n_f32(128.0f);
   5322         #elif CV_SSE2
   5323         v_scale_inv = _mm_set1_ps(100.f/255.f);
   5324         v_scale = _mm_set1_ps(255.f);
   5325         v_128 = _mm_set1_ps(128.0f);
   5326         v_zero = _mm_setzero_si128();
   5327         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   5328         #endif
   5329     }
   5330 
   5331     #if CV_SSE2
   5332     // 16s x 8
   5333     void process(__m128i v_r, __m128i v_g, __m128i v_b,
   5334                  float * buf) const
   5335     {
   5336         __m128 v_r0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_r, v_zero));
   5337         __m128 v_g0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_g, v_zero));
   5338         __m128 v_b0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_b, v_zero));
   5339 
   5340         __m128 v_r1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_r, v_zero));
   5341         __m128 v_g1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_g, v_zero));
   5342         __m128 v_b1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_b, v_zero));
   5343 
   5344         v_r0 = _mm_mul_ps(v_r0, v_scale_inv);
   5345         v_r1 = _mm_mul_ps(v_r1, v_scale_inv);
   5346 
   5347         v_g0 = _mm_sub_ps(v_g0, v_128);
   5348         v_g1 = _mm_sub_ps(v_g1, v_128);
   5349         v_b0 = _mm_sub_ps(v_b0, v_128);
   5350         v_b1 = _mm_sub_ps(v_b1, v_128);
   5351 
   5352         _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   5353 
   5354         _mm_store_ps(buf, v_r0);
   5355         _mm_store_ps(buf + 4, v_r1);
   5356         _mm_store_ps(buf + 8, v_g0);
   5357         _mm_store_ps(buf + 12, v_g1);
   5358         _mm_store_ps(buf + 16, v_b0);
   5359         _mm_store_ps(buf + 20, v_b1);
   5360     }
   5361     #endif
   5362 
   5363     void operator()(const uchar* src, uchar* dst, int n) const
   5364     {
   5365         int i, j, dcn = dstcn;
   5366         uchar alpha = ColorChannel<uchar>::max();
   5367         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
   5368 
   5369         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
   5370         {
   5371             int dn = std::min(n - i, (int)BLOCK_SIZE);
   5372             j = 0;
   5373 
   5374             #if CV_NEON
   5375             for ( ; j <= (dn - 8) * 3; j += 24)
   5376             {
   5377                 uint8x8x3_t v_src = vld3_u8(src + j);
   5378                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
   5379                            v_t1 = vmovl_u8(v_src.val[1]),
   5380                            v_t2 = vmovl_u8(v_src.val[2]);
   5381 
   5382                 float32x4x3_t v_dst;
   5383                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
   5384                 v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_128);
   5385                 v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_128);
   5386                 vst3q_f32(buf + j, v_dst);
   5387 
   5388                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
   5389                 v_dst.val[1] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_128);
   5390                 v_dst.val[2] = vsubq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_128);
   5391                 vst3q_f32(buf + j + 12, v_dst);
   5392             }
   5393             #elif CV_SSE2
   5394             if (haveSIMD)
   5395             {
   5396                 for ( ; j <= (dn - 32) * 3; j += 96)
   5397                 {
   5398                     __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
   5399                     __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
   5400                     __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
   5401                     __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
   5402                     __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
   5403                     __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
   5404 
   5405                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   5406 
   5407                     process(_mm_unpacklo_epi8(v_r0, v_zero),
   5408                             _mm_unpacklo_epi8(v_g0, v_zero),
   5409                             _mm_unpacklo_epi8(v_b0, v_zero),
   5410                             buf + j);
   5411 
   5412                     process(_mm_unpackhi_epi8(v_r0, v_zero),
   5413                             _mm_unpackhi_epi8(v_g0, v_zero),
   5414                             _mm_unpackhi_epi8(v_b0, v_zero),
   5415                             buf + j + 24);
   5416 
   5417                     process(_mm_unpacklo_epi8(v_r1, v_zero),
   5418                             _mm_unpacklo_epi8(v_g1, v_zero),
   5419                             _mm_unpacklo_epi8(v_b1, v_zero),
   5420                             buf + j + 48);
   5421 
   5422                     process(_mm_unpackhi_epi8(v_r1, v_zero),
   5423                             _mm_unpackhi_epi8(v_g1, v_zero),
   5424                             _mm_unpackhi_epi8(v_b1, v_zero),
   5425                             buf + j + 72);
   5426                 }
   5427             }
   5428             #endif
   5429 
   5430             for( ; j < dn*3; j += 3 )
   5431             {
   5432                 buf[j] = src[j]*(100.f/255.f);
   5433                 buf[j+1] = (float)(src[j+1] - 128);
   5434                 buf[j+2] = (float)(src[j+2] - 128);
   5435             }
   5436             cvt(buf, buf, dn);
   5437             j = 0;
   5438 
   5439             #if CV_NEON
   5440             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
   5441             {
   5442                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
   5443                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
   5444                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
   5445                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
   5446                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
   5447                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
   5448                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
   5449 
   5450                 if (dcn == 4)
   5451                 {
   5452                     uint8x8x4_t v_dst;
   5453                     v_dst.val[0] = v_dst0;
   5454                     v_dst.val[1] = v_dst1;
   5455                     v_dst.val[2] = v_dst2;
   5456                     v_dst.val[3] = v_alpha;
   5457                     vst4_u8(dst, v_dst);
   5458                 }
   5459                 else
   5460                 {
   5461                     uint8x8x3_t v_dst;
   5462                     v_dst.val[0] = v_dst0;
   5463                     v_dst.val[1] = v_dst1;
   5464                     v_dst.val[2] = v_dst2;
   5465                     vst3_u8(dst, v_dst);
   5466                 }
   5467             }
   5468             #elif CV_SSE2
   5469             if (dcn == 3 && haveSIMD)
   5470             {
   5471                 for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
   5472                 {
   5473                     __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
   5474                     __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
   5475                     __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
   5476                     __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
   5477 
   5478                     __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
   5479                                                      _mm_cvtps_epi32(v_src1));
   5480                     __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
   5481                                                      _mm_cvtps_epi32(v_src3));
   5482 
   5483                     _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
   5484                 }
   5485 
   5486                 int jr = j % 3;
   5487                 if (jr)
   5488                     dst -= jr, j -= jr;
   5489             }
   5490             #endif
   5491 
   5492             for( ; j < dn*3; j += 3, dst += dcn )
   5493             {
   5494                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
   5495                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
   5496                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
   5497                 if( dcn == 4 )
   5498                     dst[3] = alpha;
   5499             }
   5500         }
   5501     }
   5502 
   5503     int dstcn;
   5504     Lab2RGB_f cvt;
   5505 
   5506     #if CV_NEON
   5507     float32x4_t v_scale, v_scale_inv, v_128;
   5508     uint8x8_t v_alpha;
   5509     #elif CV_SSE2
   5510     __m128 v_scale, v_scale_inv, v_128;
   5511     __m128i v_zero;
   5512     bool haveSIMD;
   5513     #endif
   5514 };
   5515 
   5516 
   5517 ///////////////////////////////////// RGB <-> L*u*v* /////////////////////////////////////
   5518 
   5519 struct RGB2Luv_f
   5520 {
   5521     typedef float channel_type;
   5522 
   5523     RGB2Luv_f( int _srccn, int blueIdx, const float* _coeffs,
   5524                const float* whitept, bool _srgb )
   5525     : srccn(_srccn), srgb(_srgb)
   5526     {
   5527         volatile int i;
   5528         initLabTabs();
   5529 
   5530         if(!_coeffs) _coeffs = sRGB2XYZ_D65;
   5531         if(!whitept) whitept = D65;
   5532 
   5533         for( i = 0; i < 3; i++ )
   5534         {
   5535             coeffs[i*3] = _coeffs[i*3];
   5536             coeffs[i*3+1] = _coeffs[i*3+1];
   5537             coeffs[i*3+2] = _coeffs[i*3+2];
   5538             if( blueIdx == 0 )
   5539                 std::swap(coeffs[i*3], coeffs[i*3+2]);
   5540             CV_Assert( coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
   5541                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f );
   5542         }
   5543 
   5544         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
   5545         un = 4*whitept[0]*d;
   5546         vn = 9*whitept[1]*d;
   5547 
   5548         CV_Assert(whitept[1] == 1.f);
   5549     }
   5550 
   5551     void operator()(const float* src, float* dst, int n) const
   5552     {
   5553         int i, scn = srccn;
   5554         float gscale = GammaTabScale;
   5555         const float* gammaTab = srgb ? sRGBGammaTab : 0;
   5556         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   5557               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   5558               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   5559         float _un = 13*un, _vn = 13*vn;
   5560         n *= 3;
   5561 
   5562         for( i = 0; i < n; i += 3, src += scn )
   5563         {
   5564             float R = src[0], G = src[1], B = src[2];
   5565             if( gammaTab )
   5566             {
   5567                 R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
   5568                 G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
   5569                 B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
   5570             }
   5571 
   5572             float X = R*C0 + G*C1 + B*C2;
   5573             float Y = R*C3 + G*C4 + B*C5;
   5574             float Z = R*C6 + G*C7 + B*C8;
   5575 
   5576             float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
   5577             L = 116.f*L - 16.f;
   5578 
   5579             float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);
   5580             float u = L*(X*d - _un);
   5581             float v = L*((9*0.25f)*Y*d - _vn);
   5582 
   5583             dst[i] = L; dst[i+1] = u; dst[i+2] = v;
   5584         }
   5585     }
   5586 
   5587     int srccn;
   5588     float coeffs[9], un, vn;
   5589     bool srgb;
   5590 };
   5591 
   5592 
   5593 struct Luv2RGB_f
   5594 {
   5595     typedef float channel_type;
   5596 
   5597     Luv2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
   5598               const float* whitept, bool _srgb )
   5599     : dstcn(_dstcn), srgb(_srgb)
   5600     {
   5601         initLabTabs();
   5602 
   5603         if(!_coeffs) _coeffs = XYZ2sRGB_D65;
   5604         if(!whitept) whitept = D65;
   5605 
   5606         for( int i = 0; i < 3; i++ )
   5607         {
   5608             coeffs[i+(blueIdx^2)*3] = _coeffs[i];
   5609             coeffs[i+3] = _coeffs[i+3];
   5610             coeffs[i+blueIdx*3] = _coeffs[i+6];
   5611         }
   5612 
   5613         float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
   5614         un = 4*whitept[0]*d;
   5615         vn = 9*whitept[1]*d;
   5616 
   5617         CV_Assert(whitept[1] == 1.f);
   5618     }
   5619 
   5620     void operator()(const float* src, float* dst, int n) const
   5621     {
   5622         int i, dcn = dstcn;
   5623         const float* gammaTab = srgb ? sRGBInvGammaTab : 0;
   5624         float gscale = GammaTabScale;
   5625         float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
   5626               C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
   5627               C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
   5628         float alpha = ColorChannel<float>::max();
   5629         float _un = un, _vn = vn;
   5630         n *= 3;
   5631 
   5632         for( i = 0; i < n; i += 3, dst += dcn )
   5633         {
   5634             float L = src[i], u = src[i+1], v = src[i+2], d, X, Y, Z;
   5635             Y = (L + 16.f) * (1.f/116.f);
   5636             Y = Y*Y*Y;
   5637             d = (1.f/13.f)/L;
   5638             u = u*d + _un;
   5639             v = v*d + _vn;
   5640             float iv = 1.f/v;
   5641             X = 2.25f * u * Y * iv ;
   5642             Z = (12 - 3 * u - 20 * v) * Y * 0.25f * iv;
   5643 
   5644             float R = X*C0 + Y*C1 + Z*C2;
   5645             float G = X*C3 + Y*C4 + Z*C5;
   5646             float B = X*C6 + Y*C7 + Z*C8;
   5647 
   5648             R = std::min(std::max(R, 0.f), 1.f);
   5649             G = std::min(std::max(G, 0.f), 1.f);
   5650             B = std::min(std::max(B, 0.f), 1.f);
   5651 
   5652             if( gammaTab )
   5653             {
   5654                 R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
   5655                 G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
   5656                 B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
   5657             }
   5658 
   5659             dst[0] = R; dst[1] = G; dst[2] = B;
   5660             if( dcn == 4 )
   5661                 dst[3] = alpha;
   5662         }
   5663     }
   5664 
   5665     int dstcn;
   5666     float coeffs[9], un, vn;
   5667     bool srgb;
   5668 };
   5669 
   5670 
   5671 struct RGB2Luv_b
   5672 {
   5673     typedef uchar channel_type;
   5674 
   5675     RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs,
   5676                const float* _whitept, bool _srgb )
   5677     : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb)
   5678     {
   5679         #if CV_NEON
   5680         v_scale_inv = vdupq_n_f32(1.f/255.f);
   5681         v_scale = vdupq_n_f32(2.55f);
   5682         v_coeff1 = vdupq_n_f32(0.72033898305084743f);
   5683         v_coeff2 = vdupq_n_f32(96.525423728813564f);
   5684         v_coeff3 = vdupq_n_f32(0.9732824427480916f);
   5685         v_coeff4 = vdupq_n_f32(136.259541984732824f);
   5686         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
   5687         #elif CV_SSE2
   5688         v_zero = _mm_setzero_si128();
   5689         v_scale_inv = _mm_set1_ps(1.f/255.f);
   5690         v_scale = _mm_set1_ps(2.55f);
   5691         v_coeff1 = _mm_set1_ps(0.72033898305084743f);
   5692         v_coeff2 = _mm_set1_ps(96.525423728813564f);
   5693         v_coeff3 = _mm_set1_ps(0.9732824427480916f);
   5694         v_coeff4 = _mm_set1_ps(136.259541984732824f);
   5695         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   5696         #endif
   5697     }
   5698 
   5699     #if CV_SSE2
   5700     void process(const float * buf,
   5701                  __m128i & v_l, __m128i & v_u, __m128i & v_v) const
   5702     {
   5703         __m128 v_l0f = _mm_load_ps(buf);
   5704         __m128 v_l1f = _mm_load_ps(buf + 4);
   5705         __m128 v_u0f = _mm_load_ps(buf + 8);
   5706         __m128 v_u1f = _mm_load_ps(buf + 12);
   5707         __m128 v_v0f = _mm_load_ps(buf + 16);
   5708         __m128 v_v1f = _mm_load_ps(buf + 20);
   5709 
   5710         _mm_deinterleave_ps(v_l0f, v_l1f, v_u0f, v_u1f, v_v0f, v_v1f);
   5711 
   5712         v_l0f = _mm_mul_ps(v_l0f, v_scale);
   5713         v_l1f = _mm_mul_ps(v_l1f, v_scale);
   5714         v_u0f = _mm_add_ps(_mm_mul_ps(v_u0f, v_coeff1), v_coeff2);
   5715         v_u1f = _mm_add_ps(_mm_mul_ps(v_u1f, v_coeff1), v_coeff2);
   5716         v_v0f = _mm_add_ps(_mm_mul_ps(v_v0f, v_coeff3), v_coeff4);
   5717         v_v1f = _mm_add_ps(_mm_mul_ps(v_v1f, v_coeff3), v_coeff4);
   5718 
   5719         v_l = _mm_packs_epi32(_mm_cvtps_epi32(v_l0f), _mm_cvtps_epi32(v_l1f));
   5720         v_u = _mm_packs_epi32(_mm_cvtps_epi32(v_u0f), _mm_cvtps_epi32(v_u1f));
   5721         v_v = _mm_packs_epi32(_mm_cvtps_epi32(v_v0f), _mm_cvtps_epi32(v_v1f));
   5722     }
   5723     #endif
   5724 
   // Converts n pixels of 8-bit input to 8-bit L, u, v triples.
   //   src - input pixels, srccn bytes each; only bytes 0..2 of a pixel are read
   //   dst - output, 3 bytes per pixel
   //   n   - number of pixels
   // The work is done in chunks of at most BLOCK_SIZE pixels: the bytes are
   // scaled to floats in buf, cvt converts buf in place, and the floats are
   // rescaled and saturated to uchar. Each stage has an optional NEON or SSE2
   // fast path followed by a scalar loop that finishes what the fast path left.
   5725     void operator()(const uchar* src, uchar* dst, int n) const
   5726     {
   5727         int i, j, scn = srccn;
                // Scratch for one chunk, 3 floats per pixel. The SSE2 path writes it with
                // aligned stores (_mm_store_ps), hence the 16-byte alignment.
   5728         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
   5729 
                // dst moves one full chunk per outer iteration; src is advanced inside the
                // stage-1 loops instead, because its per-pixel step (scn) may differ from 3.
   5730         for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
   5731         {
   5732             int dn = std::min(n - i, (int)BLOCK_SIZE);
   5733             j = 0;
   5734 
                    // ---- Stage 1: bytes -> scaled floats in buf (j counts floats) ----
   5735             #if CV_NEON
                    // 8 pixels per iteration. vld3/vld4 split the channels apart, so for
                    // scn != 3 the fourth channel is loaded but never used.
                    // v_scale_inv is assigned in the constructor (outside this chunk);
                    // presumably 1/255 like the scalar loop below - TODO confirm.
   5736             for ( ; j <= (dn - 8) * 3; j += 24, src += 8 * scn)
   5737             {
   5738                 uint16x8_t v_t0, v_t1, v_t2;
   5739 
   5740                 if (scn == 3)
   5741                 {
   5742                     uint8x8x3_t v_src = vld3_u8(src);
   5743                     v_t0 = vmovl_u8(v_src.val[0]);
   5744                     v_t1 = vmovl_u8(v_src.val[1]);
   5745                     v_t2 = vmovl_u8(v_src.val[2]);
   5746                 }
   5747                 else
   5748                 {
   5749                     uint8x8x4_t v_src = vld4_u8(src);
   5750                     v_t0 = vmovl_u8(v_src.val[0]);
   5751                     v_t1 = vmovl_u8(v_src.val[1]);
   5752                     v_t2 = vmovl_u8(v_src.val[2]);
   5753                 }
   5754 
                        // Low four pixels, re-interleaved on store.
   5755                 float32x4x3_t v_dst;
   5756                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
   5757                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_scale_inv);
   5758                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_scale_inv);
   5759                 vst3q_f32(buf + j, v_dst);
   5760 
                        // High four pixels.
   5761                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
   5762                 v_dst.val[1] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_scale_inv);
   5763                 v_dst.val[2] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_scale_inv);
   5764                 vst3q_f32(buf + j + 12, v_dst);
   5765             }
   5766             #elif CV_SSE2
                    // Only taken for exactly 3 channels: every byte gets the same scale, so
                    // 16 consecutive bytes are converted per iteration without regard to
                    // pixel boundaries and without any channel shuffling.
   5767             if (scn == 3 && haveSIMD)
   5768             {
   5769                 for ( ; j <= (dn * 3 - 16); j += 16, src += 16)
   5770                 {
   5771                     __m128i v_src = _mm_loadu_si128((__m128i const *)src);
   5772 
                            // Zero-extend 8 -> 16 -> 32 bits, convert to float, scale.
   5773                     __m128i v_src_p = _mm_unpacklo_epi8(v_src, v_zero);
   5774                     _mm_store_ps(buf + j, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
   5775                     _mm_store_ps(buf + j + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
   5776 
   5777                     v_src_p = _mm_unpackhi_epi8(v_src, v_zero);
   5778                     _mm_store_ps(buf + j + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src_p, v_zero)), v_scale_inv));
   5779                     _mm_store_ps(buf + j + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src_p, v_zero)), v_scale_inv));
   5780                 }
   5781 
                        // 16 is not a multiple of 3, so j may have stopped inside a pixel.
                        // Step j and src back to the start of that pixel; the scalar loop
                        // below recomputes it.
   5782                 int jr = j % 3;
   5783                 if (jr)
   5784                     src -= jr, j -= jr;
   5785             }
   5786             #endif
                    // Scalar path: remaining pixels (all of them when no fast path ran).
   5787             for( ; j < dn*3; j += 3, src += scn )
   5788             {
   5789                 buf[j] = src[0]*(1.f/255.f);
   5790                 buf[j+1] = (float)(src[1]*(1.f/255.f));
   5791                 buf[j+2] = (float)(src[2]*(1.f/255.f));
   5792             }
                    // Float conversion, in place on the chunk.
   5793             cvt(buf, buf, dn);
   5794 
                    // ---- Stage 2: float L, u, v -> saturated bytes in dst ----
   5795             j = 0;
   5796             #if CV_NEON
                    // 8 pixels per iteration: L is multiplied by v_scale, u and v get a
                    // scale plus offset (v_coeff1/v_coeff2 and v_coeff3/v_coeff4), then the
                    // values go through cv_vrndq_u32_f32 (defined elsewhere; presumably
                    // float -> rounded u32) and are narrowed with saturation to bytes.
   5797             for ( ; j <= (dn - 8) * 3; j += 24)
   5798             {
   5799                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
   5800 
   5801                 uint8x8x3_t v_dst;
   5802                 v_dst.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
   5803                                                        vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
   5804                 v_dst.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[1], v_coeff1), v_coeff2))),
   5805                                                        vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[1], v_coeff1), v_coeff2)))));
   5806                 v_dst.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src0.val[2], v_coeff3), v_coeff4))),
   5807                                                        vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vmulq_f32(v_src1.val[2], v_coeff3), v_coeff4)))));
   5808 
   5809                 vst3_u8(dst + j, v_dst);
   5810             }
   5811             #elif CV_SSE2
   5812             if (haveSIMD)
   5813             {
                        // 32 pixels (96 floats -> 96 bytes) per iteration. process() is
                        // called on four successive 24-float (8-pixel) slices and hands
                        // back eight 16-bit values per channel each time.
   5814                 for ( ; j <= (dn - 32) * 3; j += 96)
   5815                 {
   5816                     __m128i v_l_0, v_u_0, v_v_0;
   5817                     process(buf + j,
   5818                             v_l_0, v_u_0, v_v_0);
   5819 
   5820                     __m128i v_l_1, v_u_1, v_v_1;
   5821                     process(buf + j + 24,
   5822                             v_l_1, v_u_1, v_v_1);
   5823 
                            // Pixels 0..15: merge two 8x16-bit results into 16 bytes per
                            // channel, saturating to 0..255.
   5824                     __m128i v_l0 = _mm_packus_epi16(v_l_0, v_l_1);
   5825                     __m128i v_u0 = _mm_packus_epi16(v_u_0, v_u_1);
   5826                     __m128i v_v0 = _mm_packus_epi16(v_v_0, v_v_1);
   5827 
   5828                     process(buf + j + 48,
   5829                             v_l_0, v_u_0, v_v_0);
   5830 
   5831                     process(buf + j + 72,
   5832                             v_l_1, v_u_1, v_v_1);
   5833 
                            // Pixels 16..31.
   5834                     __m128i v_l1 = _mm_packus_epi16(v_l_0, v_l_1);
   5835                     __m128i v_u1 = _mm_packus_epi16(v_u_0, v_u_1);
   5836                     __m128i v_v1 = _mm_packus_epi16(v_v_0, v_v_1);
   5837 
                            // _mm_interleave_epi8 is defined elsewhere; presumably it
                            // rearranges the six planar registers in place into
                            // interleaved L,u,v byte order, which is why they can be
                            // stored back to back below - TODO confirm.
   5838                     _mm_interleave_epi8(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
   5839 
   5840                     _mm_storeu_si128((__m128i *)(dst + j), v_l0);
   5841                     _mm_storeu_si128((__m128i *)(dst + j + 16), v_l1);
   5842                     _mm_storeu_si128((__m128i *)(dst + j + 32), v_u0);
   5843                     _mm_storeu_si128((__m128i *)(dst + j + 48), v_u1);
   5844                     _mm_storeu_si128((__m128i *)(dst + j + 64), v_v0);
   5845                     _mm_storeu_si128((__m128i *)(dst + j + 80), v_v1);
   5846                 }
   5847             }
   5848             #endif
   5849 
                    // Scalar path for the remaining pixels. The literals equal 255/100,
                    // 255/354 with offset 134*255/354, and 255/262 with offset 140*255/262.
                    // The SIMD constants are set in the constructor (not visible here) and
                    // are presumably these same values.
   5850             for( ; j < dn*3; j += 3 )
   5851             {
   5852                 dst[j] = saturate_cast<uchar>(buf[j]*2.55f);
   5853                 dst[j+1] = saturate_cast<uchar>(buf[j+1]*0.72033898305084743f + 96.525423728813564f);
   5854                 dst[j+2] = saturate_cast<uchar>(buf[j+2]*0.9732824427480916f + 136.259541984732824f);
   5855             }
   5856         }
   5857     }
   5858 
   5859     int srccn;  // bytes per source pixel; operator() steps src by this amount
   5860     RGB2Luv_f cvt;  // float converter that operator() runs in place on each chunk
   5861 
            // Broadcast constants for the SIMD paths. Their values are assigned in the
            // constructor, which is outside this chunk.
   5862     #if CV_NEON
            // v_scale_inv scales input bytes; v_scale scales L; v_coeff1/v_coeff2 and
            // v_coeff3/v_coeff4 are the scale/offset pairs applied to u and v.
   5863     float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
   5864     uint8x8_t v_alpha;  // NOTE(review): not referenced by the code visible in this chunk
   5865     #elif CV_SSE2
   5866     __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_coeff3, v_coeff4;
   5867     __m128i v_zero;  // second operand of the unpack intrinsics used to zero-extend bytes
   5868     bool haveSIMD;  // gates the SSE2 branches in operator()
   5869     #endif
   5870 };
   5871 
   5872 
   // 8-bit front end for the float converter Luv2RGB_f.
   // operator() rescales 8-bit L, u, v input to floats (L: 0..255 -> 0..100,
   // u: 0..255 -> -134..220, v: 0..255 -> -140..122), runs cvt on them in place
   // and writes the result multiplied by 255 and saturated to uchar, with dstcn
   // bytes per output pixel (the 4th byte, if any, is set to the channel maximum).
   5873 struct Luv2RGB_b
   5874 {
   5875     typedef uchar channel_type;
   5876 
            // _dstcn is the number of bytes written per output pixel. The remaining
            // arguments are forwarded unchanged to the float converter, which is
            // constructed with 3 as its first argument (presumably its own output
            // channel count, since buf holds 3 floats per pixel - confirm against
            // Luv2RGB_f).
   5877     Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
   5878                const float* _whitept, bool _srgb )
   5879     : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb )
   5880     {
                // Broadcast the same constants the scalar loops in operator() use:
                // 100/255 for L, 354/255 and 134 for u, 262/255 and 140 for v,
                // and 255 for the output scale.
   5881         #if CV_NEON
   5882         v_scale_inv = vdupq_n_f32(100.f/255.f);
   5883         v_coeff1 = vdupq_n_f32(1.388235294117647f);
   5884         v_coeff2 = vdupq_n_f32(1.027450980392157f);
   5885         v_134 = vdupq_n_f32(134.f);
   5886         v_140 = vdupq_n_f32(140.f);
   5887         v_scale = vdupq_n_f32(255.f);
   5888         v_alpha = vdup_n_u8(ColorChannel<uchar>::max());
   5889         #elif CV_SSE2
   5890         v_scale_inv = _mm_set1_ps(100.f/255.f);
   5891         v_coeff1 = _mm_set1_ps(1.388235294117647f);
   5892         v_coeff2 = _mm_set1_ps(1.027450980392157f);
   5893         v_134 = _mm_set1_ps(134.f);
   5894         v_140 = _mm_set1_ps(140.f);
   5895         v_scale = _mm_set1_ps(255.f);
   5896         v_zero = _mm_setzero_si128();
                // Run-time check; the SSE2 branches in operator() are skipped when false.
   5897         haveSIMD = checkHardwareSupport(CV_CPU_SSE2);
   5898         #endif
   5899     }
   5900 
   5901     #if CV_SSE2
   5902     // 16s x 8
            // Takes 8 pixels as three registers of eight 16-bit values (one register
            // per channel), converts them to float, applies the L/u/v rescaling and
            // writes 24 floats to buf. buf must be 16-byte aligned (_mm_store_ps).
   5903     void process(__m128i v_l, __m128i v_u, __m128i v_v,
   5904                  float * buf) const
   5905     {
                // Zero-extend 16 -> 32 bits and convert: *0 holds pixels 0..3, *1 pixels 4..7.
   5906         __m128 v_l0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_l, v_zero));
   5907         __m128 v_u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_u, v_zero));
   5908         __m128 v_v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_v, v_zero));
   5909 
   5910         __m128 v_l1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_l, v_zero));
   5911         __m128 v_u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_u, v_zero));
   5912         __m128 v_v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_v, v_zero));
   5913 
   5914         v_l0 = _mm_mul_ps(v_l0, v_scale_inv);
   5915         v_l1 = _mm_mul_ps(v_l1, v_scale_inv);
   5916 
   5917         v_u0 = _mm_sub_ps(_mm_mul_ps(v_u0, v_coeff1), v_134);
   5918         v_u1 = _mm_sub_ps(_mm_mul_ps(v_u1, v_coeff1), v_134);
   5919         v_v0 = _mm_sub_ps(_mm_mul_ps(v_v0, v_coeff2), v_140);
   5920         v_v1 = _mm_sub_ps(_mm_mul_ps(v_v1, v_coeff2), v_140);
   5921 
                // _mm_interleave_ps is defined elsewhere; presumably it rearranges the six
                // planar registers in place into interleaved l,u,v order, which is why
                // they are stored back to back below - TODO confirm.
   5922         _mm_interleave_ps(v_l0, v_l1, v_u0, v_u1, v_v0, v_v1);
   5923 
   5924         _mm_store_ps(buf, v_l0);
   5925         _mm_store_ps(buf + 4, v_l1);
   5926         _mm_store_ps(buf + 8, v_u0);
   5927         _mm_store_ps(buf + 12, v_u1);
   5928         _mm_store_ps(buf + 16, v_v0);
   5929         _mm_store_ps(buf + 20, v_v1);
   5930     }
   5931     #endif
   5932 
            // Converts n pixels.
            //   src - input, 3 bytes per pixel (L, u, v)
            //   dst - output, dstcn bytes per pixel
            //   n   - number of pixels
            // Works in chunks of at most BLOCK_SIZE pixels through the float scratch
            // buffer buf; each stage has an optional NEON or SSE2 fast path followed
            // by a scalar loop that finishes what the fast path left.
   5933     void operator()(const uchar* src, uchar* dst, int n) const
   5934     {
   5935         int i, j, dcn = dstcn;
   5936         uchar alpha = ColorChannel<uchar>::max();
                // 16-byte aligned because the SSE2 paths use aligned loads and stores on it.
   5937         float CV_DECL_ALIGNED(16) buf[3*BLOCK_SIZE];
   5938 
                // src moves one full chunk per outer iteration and is indexed with j inside;
                // dst is advanced inside the stage-2 loops, by dcn bytes per pixel.
   5939         for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
   5940         {
   5941             int dn = std::min(n - i, (int)BLOCK_SIZE);
   5942             j = 0;
   5943 
                    // ---- Stage 1: bytes -> rescaled floats in buf (j counts floats) ----
   5944             #if CV_NEON
                    // 8 pixels per iteration; vld3 splits the channels, vst3q re-interleaves.
   5945             for ( ; j <= (dn - 8) * 3; j += 24)
   5946             {
   5947                 uint8x8x3_t v_src = vld3_u8(src + j);
   5948                 uint16x8_t v_t0 = vmovl_u8(v_src.val[0]),
   5949                            v_t1 = vmovl_u8(v_src.val[1]),
   5950                            v_t2 = vmovl_u8(v_src.val[2]);
   5951 
   5952                 float32x4x3_t v_dst;
   5953                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t0))), v_scale_inv);
   5954                 v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t1))), v_coeff1), v_134);
   5955                 v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_t2))), v_coeff2), v_140);
   5956                 vst3q_f32(buf + j, v_dst);
   5957 
   5958                 v_dst.val[0] = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t0))), v_scale_inv);
   5959                 v_dst.val[1] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t1))), v_coeff1), v_134);
   5960                 v_dst.val[2] = vsubq_f32(vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_t2))), v_coeff2), v_140);
   5961                 vst3q_f32(buf + j + 12, v_dst);
   5962             }
   5963             #elif CV_SSE2
   5964             if (haveSIMD)
   5965             {
                        // 32 pixels (96 bytes) per iteration.
   5966                 for ( ; j <= (dn - 32) * 3; j += 96)
   5967                 {
                            // Despite the r/g/b names these registers carry the three Luv
                            // channels: they are handed to process() as l, u, v.
   5968                     __m128i v_r0 = _mm_loadu_si128((__m128i const *)(src + j));
   5969                     __m128i v_r1 = _mm_loadu_si128((__m128i const *)(src + j + 16));
   5970                     __m128i v_g0 = _mm_loadu_si128((__m128i const *)(src + j + 32));
   5971                     __m128i v_g1 = _mm_loadu_si128((__m128i const *)(src + j + 48));
   5972                     __m128i v_b0 = _mm_loadu_si128((__m128i const *)(src + j + 64));
   5973                     __m128i v_b1 = _mm_loadu_si128((__m128i const *)(src + j + 80));
   5974 
                            // _mm_deinterleave_epi8 is defined elsewhere; presumably it
                            // turns the 96 interleaved bytes into per-channel registers
                            // (x0 = pixels 0..15, x1 = pixels 16..31) - TODO confirm.
   5975                     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1, v_b0, v_b1);
   5976 
                            // Zero-extend bytes to 16 bits and convert 8 pixels per call;
                            // each call fills the next 24 floats of buf.
   5977                     process(_mm_unpacklo_epi8(v_r0, v_zero),
   5978                             _mm_unpacklo_epi8(v_g0, v_zero),
   5979                             _mm_unpacklo_epi8(v_b0, v_zero),
   5980                             buf + j);
   5981 
   5982                     process(_mm_unpackhi_epi8(v_r0, v_zero),
   5983                             _mm_unpackhi_epi8(v_g0, v_zero),
   5984                             _mm_unpackhi_epi8(v_b0, v_zero),
   5985                             buf + j + 24);
   5986 
   5987                     process(_mm_unpacklo_epi8(v_r1, v_zero),
   5988                             _mm_unpacklo_epi8(v_g1, v_zero),
   5989                             _mm_unpacklo_epi8(v_b1, v_zero),
   5990                             buf + j + 48);
   5991 
   5992                     process(_mm_unpackhi_epi8(v_r1, v_zero),
   5993                             _mm_unpackhi_epi8(v_g1, v_zero),
   5994                             _mm_unpackhi_epi8(v_b1, v_zero),
   5995                             buf + j + 72);
   5996                 }
   5997             }
   5998             #endif
                    // Scalar path: remaining pixels (all of them when no fast path ran).
   5999             for( ; j < dn*3; j += 3 )
   6000             {
   6001                 buf[j] = src[j]*(100.f/255.f);
   6002                 buf[j+1] = (float)(src[j+1]*1.388235294117647f - 134.f);
   6003                 buf[j+2] = (float)(src[j+2]*1.027450980392157f - 140.f);
   6004             }
                    // Float conversion, in place on the chunk.
   6005             cvt(buf, buf, dn);
   6006 
                    // ---- Stage 2: floats * 255 -> saturated bytes in dst ----
   6007             j = 0;
   6008             #if CV_NEON
                    // 8 pixels per iteration. cv_vrndq_u32_f32 is defined elsewhere
                    // (presumably float -> rounded u32); vqmovn narrows with saturation.
   6009             for ( ; j <= (dn - 8) * 3; j += 24, dst += dcn * 8)
   6010             {
   6011                 float32x4x3_t v_src0 = vld3q_f32(buf + j), v_src1 = vld3q_f32(buf + j + 12);
   6012                 uint8x8_t v_dst0 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[0], v_scale))),
   6013                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[0], v_scale)))));
   6014                 uint8x8_t v_dst1 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[1], v_scale))),
   6015                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[1], v_scale)))));
   6016                 uint8x8_t v_dst2 = vqmovn_u16(vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src0.val[2], v_scale))),
   6017                                                            vqmovn_u32(cv_vrndq_u32_f32(vmulq_f32(v_src1.val[2], v_scale)))));
   6018 
                        // With 4 output channels the constant v_alpha fills the 4th byte.
   6019                 if (dcn == 4)
   6020                 {
   6021                     uint8x8x4_t v_dst;
   6022                     v_dst.val[0] = v_dst0;
   6023                     v_dst.val[1] = v_dst1;
   6024                     v_dst.val[2] = v_dst2;
   6025                     v_dst.val[3] = v_alpha;
   6026                     vst4_u8(dst, v_dst);
   6027                 }
   6028                 else
   6029                 {
   6030                     uint8x8x3_t v_dst;
   6031                     v_dst.val[0] = v_dst0;
   6032                     v_dst.val[1] = v_dst1;
   6033                     v_dst.val[2] = v_dst2;
   6034                     vst3_u8(dst, v_dst);
   6035                 }
   6036             }
   6037             #elif CV_SSE2
                    // Only taken for 3 output channels: all values get the same scale and
                    // the layout is unchanged, so 16 consecutive floats become 16 bytes per
                    // iteration without regard to pixel boundaries.
   6038             if (dcn == 3 && haveSIMD)
   6039             {
   6040                 for ( ; j <= (dn * 3 - 16); j += 16, dst += 16)
   6041                 {
   6042                     __m128 v_src0 = _mm_mul_ps(_mm_load_ps(buf + j), v_scale);
   6043                     __m128 v_src1 = _mm_mul_ps(_mm_load_ps(buf + j + 4), v_scale);
   6044                     __m128 v_src2 = _mm_mul_ps(_mm_load_ps(buf + j + 8), v_scale);
   6045                     __m128 v_src3 = _mm_mul_ps(_mm_load_ps(buf + j + 12), v_scale);
   6046 
                            // float -> int32 -> int16 (signed saturation) -> uint8
                            // (unsigned saturation).
   6047                     __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(v_src0),
   6048                                                      _mm_cvtps_epi32(v_src1));
   6049                     __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(v_src2),
   6050                                                      _mm_cvtps_epi32(v_src3));
   6051 
   6052                     _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(v_dst0, v_dst1));
   6053                 }
   6054 
                        // 16 is not a multiple of 3, so j may have stopped inside a pixel.
                        // Step j and dst back to the start of that pixel; the scalar loop
                        // below rewrites it.
   6055                 int jr = j % 3;
   6056                 if (jr)
   6057                     dst -= jr, j -= jr;
   6058             }
   6059             #endif
   6060 
                    // Scalar path: remaining pixels (all of them when no fast path ran).
   6061             for( ; j < dn*3; j += 3, dst += dcn )
   6062             {
   6063                 dst[0] = saturate_cast<uchar>(buf[j]*255.f);
   6064                 dst[1] = saturate_cast<uchar>(buf[j+1]*255.f);
   6065                 dst[2] = saturate_cast<uchar>(buf[j+2]*255.f);
   6066                 if( dcn == 4 )
   6067                     dst[3] = alpha;
   6068             }
   6069         }
   6070     }
   6071 
   6072     int dstcn;  // bytes per output pixel
   6073     Luv2RGB_f cvt;  // float converter applied in place to each chunk
   6074 
            // Broadcast constants for the SIMD paths, set in the constructor.
   6075     #if CV_NEON
   6076     float32x4_t v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
   6077     uint8x8_t v_alpha;
   6078     #elif CV_SSE2
   6079     __m128 v_scale, v_scale_inv, v_coeff1, v_coeff2, v_134, v_140;
   6080     __m128i v_zero;
   6081     bool haveSIMD;
   6082     #endif
   6083 };
   6084 
   6085 
   6086 ///////////////////////////////////// YUV420 -> RGB /////////////////////////////////////
   6087 
   // Fixed-point YUV -> RGB coefficients, scaled by 2^ITUR_BT_601_SHIFT
   // (for example 1220542 is about 1.164 * 2^20). CY weights (Y - 16); CUB is
   // the weight of (U - 128) in B, CUG and CVG the weights of (U - 128) and
   // (V - 128) in G, and CVR the weight of (V - 128) in R.
   6088 const int ITUR_BT_601_CY = 1220542;
   6089 const int ITUR_BT_601_CUB = 2116026;
   6090 const int ITUR_BT_601_CUG = -409993;
   6091 const int ITUR_BT_601_CVG = -852492;
   6092 const int ITUR_BT_601_CVR = 1673527;
   6093 const int ITUR_BT_601_SHIFT = 20;
   6094 
   6095 // Coefficients for RGB to YUV420p conversion
        // These are not used by the converters visible in this chunk. Presumably they
        // are scaled by the same 2^ITUR_BT_601_SHIFT - TODO confirm where they are used.
   6096 const int ITUR_BT_601_CRY =  269484;
   6097 const int ITUR_BT_601_CGY =  528482;
   6098 const int ITUR_BT_601_CBY =  102760;
   6099 const int ITUR_BT_601_CRU = -155188;
   6100 const int ITUR_BT_601_CGU = -305135;
   6101 const int ITUR_BT_601_CBU =  460324;
   6102 const int ITUR_BT_601_CGV = -385875;
   6103 const int ITUR_BT_601_CBV = -74448;
   6104 
   // Parallel loop body that converts semi-planar YUV 4:2:0 data (a Y plane plus
   // a plane of interleaved chroma byte pairs) to 3 bytes per pixel in *dst.
   //   bIdx - byte offset of the blue value inside each output pixel; red goes
   //          to 2 - bIdx (so presumably 0 or 2).
   //   uIdx - position of U inside each chroma pair: uv[i + uIdx] is U and the
   //          other byte of the pair is V.
   // Each Range index stands for one pair of output rows.
   6105 template<int bIdx, int uIdx>
   6106 struct YUV420sp2RGB888Invoker : ParallelLoopBody
   6107 {
   6108     Mat* dst;
   6109     const uchar* my1, *muv;
   6110     int width, stride;
   6111 
            // _y1 and _uv point at the start of the Y plane and the chroma plane;
            // _stride is the byte step used for the rows of both planes. The
            // pixel width is taken from the destination.
   6112     YUV420sp2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
   6113         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
   6114 
            // Converts output rows [2 * range.start, 2 * range.end).
   6115     void operator()(const Range& range) const
   6116     {
   6117         int rangeBegin = range.start * 2;
   6118         int rangeEnd = range.end * 2;
   6119 
   6120         //R = 1.164(Y - 16) + 1.596(V - 128)
   6121         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
   6122         //B = 1.164(Y - 16)                  + 2.018(U - 128)
   6123 
   6124         //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
   6125         //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
   6126         //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
   6127 
                // One chroma row serves two luma rows, hence the halved row offset for uv.
   6128         const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
   6129 
                // Optional platform fast path; when it reports success the rows are done.
   6130 #ifdef HAVE_TEGRA_OPTIMIZATION
   6131         if(tegra::useTegra() && tegra::cvtYUV4202RGB(bIdx, uIdx, 3, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
   6132             return;
   6133 #endif
   6134 
                // Two output rows (row1, row2) per iteration, sharing one chroma row.
   6135         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
   6136         {
   6137             uchar* row1 = dst->ptr<uchar>(j);
   6138             uchar* row2 = dst->ptr<uchar>(j + 1);
   6139             const uchar* y2 = y1 + stride;
   6140 
                    // Each iteration fills a 2x2 block of pixels that share one (U, V) pair.
                    // NOTE(review): reads y[i + 1] and writes two pixels per row without a
                    // bounds check, so width is presumably even - confirm at the call site.
   6141             for (int i = 0; i < width; i += 2, row1 += 6, row2 += 6)
   6142             {
   6143                 int u = int(uv[i + 0 + uIdx]) - 128;
   6144                 int v = int(uv[i + 1 - uIdx]) - 128;
   6145 
                        // Chroma contribution per colour, including the rounding term
                        // 1 << (SHIFT - 1); computed once and reused for all four pixels.
   6146                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
   6147                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
   6148                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
   6149 
                        // Top-left pixel. Luma below 16 is clamped to 0 before scaling.
   6150                 int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
   6151                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
   6152                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
   6153                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
   6154 
                        // Top-right pixel.
   6155                 int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
   6156                 row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
   6157                 row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
   6158                 row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
   6159 
                        // Bottom-left pixel.
   6160                 int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
   6161                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
   6162                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
   6163                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
   6164 
                        // Bottom-right pixel.
   6165                 int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
   6166                 row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
   6167                 row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
   6168                 row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
   6169             }
   6170         }
   6171     }
   6172 };
   6173 
   // Parallel loop body that converts semi-planar YUV 4:2:0 data (a Y plane plus
   // a plane of interleaved chroma byte pairs) to 4 bytes per pixel in *dst; the
   // 4th byte of every pixel is set to 0xff.
   //   bIdx - byte offset of the blue value inside each output pixel; red goes
   //          to 2 - bIdx (so presumably 0 or 2).
   //   uIdx - position of U inside each chroma pair: uv[i + uIdx] is U and the
   //          other byte of the pair is V.
   // Each Range index stands for one pair of output rows.
   6174 template<int bIdx, int uIdx>
   6175 struct YUV420sp2RGBA8888Invoker : ParallelLoopBody
   6176 {
   6177     Mat* dst;
   6178     const uchar* my1, *muv;
   6179     int width, stride;
   6180 
            // _y1 and _uv point at the start of the Y plane and the chroma plane;
            // _stride is the byte step used for the rows of both planes. The
            // pixel width is taken from the destination.
   6181     YUV420sp2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _uv)
   6182         : dst(_dst), my1(_y1), muv(_uv), width(_dst->cols), stride(_stride) {}
   6183 
            // Converts output rows [2 * range.start, 2 * range.end).
   6184     void operator()(const Range& range) const
   6185     {
   6186         int rangeBegin = range.start * 2;
   6187         int rangeEnd = range.end * 2;
   6188 
   6189         //R = 1.164(Y - 16) + 1.596(V - 128)
   6190         //G = 1.164(Y - 16) - 0.813(V - 128) - 0.391(U - 128)
   6191         //B = 1.164(Y - 16)                  + 2.018(U - 128)
   6192 
   6193         //R = (1220542(Y - 16) + 1673527(V - 128)                  + (1 << 19)) >> 20
   6194         //G = (1220542(Y - 16) - 852492(V - 128) - 409993(U - 128) + (1 << 19)) >> 20
   6195         //B = (1220542(Y - 16)                  + 2116026(U - 128) + (1 << 19)) >> 20
   6196 
                // One chroma row serves two luma rows, hence the halved row offset for uv.
   6197         const uchar* y1 = my1 + rangeBegin * stride, *uv = muv + rangeBegin * stride / 2;
   6198 
                // Optional platform fast path; when it reports success the rows are done.
   6199 #ifdef HAVE_TEGRA_OPTIMIZATION
   6200         if(tegra::useTegra() && tegra::cvtYUV4202RGB(bIdx, uIdx, 4, y1, uv, stride, dst->ptr<uchar>(rangeBegin), dst->step, rangeEnd - rangeBegin, dst->cols))
   6201             return;
   6202 #endif
   6203 
                // Two output rows (row1, row2) per iteration, sharing one chroma row.
   6204         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, uv += stride)
   6205         {
   6206             uchar* row1 = dst->ptr<uchar>(j);
   6207             uchar* row2 = dst->ptr<uchar>(j + 1);
   6208             const uchar* y2 = y1 + stride;
   6209 
                    // Each iteration fills a 2x2 block of pixels that share one (U, V) pair.
                    // NOTE(review): reads y[i + 1] and writes two pixels per row without a
                    // bounds check, so width is presumably even - confirm at the call site.
   6210             for (int i = 0; i < width; i += 2, row1 += 8, row2 += 8)
   6211             {
   6212                 int u = int(uv[i + 0 + uIdx]) - 128;
   6213                 int v = int(uv[i + 1 - uIdx]) - 128;
   6214 
                        // Chroma contribution per colour, including the rounding term
                        // 1 << (SHIFT - 1); computed once and reused for all four pixels.
   6215                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
   6216                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
   6217                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
   6218 
                        // Top-left pixel. Luma below 16 is clamped to 0 before scaling.
   6219                 int y00 = std::max(0, int(y1[i]) - 16) * ITUR_BT_601_CY;
   6220                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
   6221                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
   6222                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
   6223                 row1[3]      = uchar(0xff);
   6224 
                        // Top-right pixel.
   6225                 int y01 = std::max(0, int(y1[i + 1]) - 16) * ITUR_BT_601_CY;
   6226                 row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
   6227                 row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
   6228                 row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
   6229                 row1[7]      = uchar(0xff);
   6230 
                        // Bottom-left pixel.
   6231                 int y10 = std::max(0, int(y2[i]) - 16) * ITUR_BT_601_CY;
   6232                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
   6233                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
   6234                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
   6235                 row2[3]      = uchar(0xff);
   6236 
                        // Bottom-right pixel.
   6237                 int y11 = std::max(0, int(y2[i + 1]) - 16) * ITUR_BT_601_CY;
   6238                 row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
   6239                 row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
   6240                 row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
   6241                 row2[7]      = uchar(0xff);
   6242             }
   6243         }
   6244     }
   6245 };
   6246 
   // Parallel loop body that converts planar YUV 4:2:0 data (separate Y, U and V
   // planes) to 3 bytes per pixel in *dst.
   //   bIdx - byte offset of the blue value inside each output pixel; red goes
   //          to 2 - bIdx (so presumably 0 or 2).
   // Each Range index stands for one pair of output rows.
   6247 template<int bIdx>
   6248 struct YUV420p2RGB888Invoker : ParallelLoopBody
   6249 {
   6250     Mat* dst;
   6251     const uchar* my1, *mu, *mv;
   6252     int width, stride;
            // Starting indices into the alternating chroma step table used in operator().
   6253     int ustepIdx, vstepIdx;
   6254 
            // _y1, _u and _v point at the start of the three planes; _stride is the
            // byte step of a Y row. The pixel width is taken from the destination.
   6255     YUV420p2RGB888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
   6256         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
   6257 
            // Converts output rows [2 * range.start, 2 * range.end).
   6258     void operator()(const Range& range) const
   6259     {
   6260         const int rangeBegin = range.start * 2;
   6261         const int rangeEnd = range.end * 2;
   6262 
                // A chroma row is width/2 bytes. Moving to the next chroma row alternates
                // between a step of width/2 and a step of stride - width/2, so every two
                // chroma rows advance the pointer by exactly one stride. usIdx and vsIdx
                // select which of the two steps comes next (low bit only).
   6263         int uvsteps[2] = {width/2, stride - width/2};
   6264         int usIdx = ustepIdx, vsIdx = vstepIdx;
   6265 
                // Chroma rows are addressed in pairs: skip range.start / 2 whole strides...
   6266         const uchar* y1 = my1 + rangeBegin * stride;
   6267         const uchar* u1 = mu + (range.start / 2) * stride;
   6268         const uchar* v1 = mv + (range.start / 2) * stride;
   6269 
                // ...and for an odd range.start take one extra alternating step.
   6270         if(range.start % 2 == 1)
   6271         {
   6272             u1 += uvsteps[(usIdx++) & 1];
   6273             v1 += uvsteps[(vsIdx++) & 1];
   6274         }
   6275 
                // Two output rows (row1, row2) per iteration, sharing one U row and one V row.
   6276         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
   6277         {
   6278             uchar* row1 = dst->ptr<uchar>(j);
   6279             uchar* row2 = dst->ptr<uchar>(j + 1);
   6280             const uchar* y2 = y1 + stride;
   6281 
                    // i indexes chroma samples; each one covers a 2x2 block of pixels.
   6282             for (int i = 0; i < width / 2; i += 1, row1 += 6, row2 += 6)
   6283             {
   6284                 int u = int(u1[i]) - 128;
   6285                 int v = int(v1[i]) - 128;
   6286 
                        // Chroma contribution per colour, including the rounding term
                        // 1 << (SHIFT - 1); computed once and reused for all four pixels.
   6287                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
   6288                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
   6289                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
   6290 
                        // Top-left pixel. Luma below 16 is clamped to 0 before scaling.
   6291                 int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
   6292                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
   6293                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
   6294                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
   6295 
                        // Top-right pixel.
   6296                 int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
   6297                 row1[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
   6298                 row1[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
   6299                 row1[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
   6300 
                        // Bottom-left pixel.
   6301                 int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
   6302                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
   6303                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
   6304                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
   6305 
                        // Bottom-right pixel.
   6306                 int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
   6307                 row2[5-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
   6308                 row2[4]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
   6309                 row2[3+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
   6310             }
   6311         }
   6312     }
   6313 };
   6314 
   // Parallel loop body that converts planar YUV 4:2:0 data (separate Y, U and V
   // planes) to 4 bytes per pixel in *dst; the 4th byte of every pixel is set
   // to 0xff.
   //   bIdx - byte offset of the blue value inside each output pixel; red goes
   //          to 2 - bIdx (so presumably 0 or 2).
   // Each Range index stands for one pair of output rows.
   6315 template<int bIdx>
   6316 struct YUV420p2RGBA8888Invoker : ParallelLoopBody
   6317 {
   6318     Mat* dst;
   6319     const uchar* my1, *mu, *mv;
   6320     int width, stride;
            // Starting indices into the alternating chroma step table used in operator().
   6321     int ustepIdx, vstepIdx;
   6322 
            // _y1, _u and _v point at the start of the three planes; _stride is the
            // byte step of a Y row. The pixel width is taken from the destination.
   6323     YUV420p2RGBA8888Invoker(Mat* _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int _ustepIdx, int _vstepIdx)
   6324         : dst(_dst), my1(_y1), mu(_u), mv(_v), width(_dst->cols), stride(_stride), ustepIdx(_ustepIdx), vstepIdx(_vstepIdx) {}
   6325 
            // Converts output rows [2 * range.start, 2 * range.end).
   6326     void operator()(const Range& range) const
   6327     {
   6328         int rangeBegin = range.start * 2;
   6329         int rangeEnd = range.end * 2;
   6330 
                // A chroma row is width/2 bytes. Moving to the next chroma row alternates
                // between a step of width/2 and a step of stride - width/2, so every two
                // chroma rows advance the pointer by exactly one stride. usIdx and vsIdx
                // select which of the two steps comes next (low bit only).
   6331         int uvsteps[2] = {width/2, stride - width/2};
   6332         int usIdx = ustepIdx, vsIdx = vstepIdx;
   6333 
                // Chroma rows are addressed in pairs: skip range.start / 2 whole strides...
   6334         const uchar* y1 = my1 + rangeBegin * stride;
   6335         const uchar* u1 = mu + (range.start / 2) * stride;
   6336         const uchar* v1 = mv + (range.start / 2) * stride;
   6337 
                // ...and for an odd range.start take one extra alternating step.
   6338         if(range.start % 2 == 1)
   6339         {
   6340             u1 += uvsteps[(usIdx++) & 1];
   6341             v1 += uvsteps[(vsIdx++) & 1];
   6342         }
   6343 
                // Two output rows (row1, row2) per iteration, sharing one U row and one V row.
   6344         for (int j = rangeBegin; j < rangeEnd; j += 2, y1 += stride * 2, u1 += uvsteps[(usIdx++) & 1], v1 += uvsteps[(vsIdx++) & 1])
   6345         {
   6346             uchar* row1 = dst->ptr<uchar>(j);
   6347             uchar* row2 = dst->ptr<uchar>(j + 1);
   6348             const uchar* y2 = y1 + stride;
   6349 
                    // i indexes chroma samples; each one covers a 2x2 block of pixels.
   6350             for (int i = 0; i < width / 2; i += 1, row1 += 8, row2 += 8)
   6351             {
   6352                 int u = int(u1[i]) - 128;
   6353                 int v = int(v1[i]) - 128;
   6354 
                        // Chroma contribution per colour, including the rounding term
                        // 1 << (SHIFT - 1); computed once and reused for all four pixels.
   6355                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
   6356                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
   6357                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
   6358 
                        // Top-left pixel. Luma below 16 is clamped to 0 before scaling.
   6359                 int y00 = std::max(0, int(y1[2 * i]) - 16) * ITUR_BT_601_CY;
   6360                 row1[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
   6361                 row1[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
   6362                 row1[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
   6363                 row1[3]      = uchar(0xff);
   6364 
                        // Top-right pixel.
   6365                 int y01 = std::max(0, int(y1[2 * i + 1]) - 16) * ITUR_BT_601_CY;
   6366                 row1[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
   6367                 row1[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
   6368                 row1[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
   6369                 row1[7]      = uchar(0xff);
   6370 
                        // Bottom-left pixel.
   6371                 int y10 = std::max(0, int(y2[2 * i]) - 16) * ITUR_BT_601_CY;
   6372                 row2[2-bIdx] = saturate_cast<uchar>((y10 + ruv) >> ITUR_BT_601_SHIFT);
   6373                 row2[1]      = saturate_cast<uchar>((y10 + guv) >> ITUR_BT_601_SHIFT);
   6374                 row2[bIdx]   = saturate_cast<uchar>((y10 + buv) >> ITUR_BT_601_SHIFT);
   6375                 row2[3]      = uchar(0xff);
   6376 
                        // Bottom-right pixel.
   6377                 int y11 = std::max(0, int(y2[2 * i + 1]) - 16) * ITUR_BT_601_CY;
   6378                 row2[6-bIdx] = saturate_cast<uchar>((y11 + ruv) >> ITUR_BT_601_SHIFT);
   6379                 row2[5]      = saturate_cast<uchar>((y11 + guv) >> ITUR_BT_601_SHIFT);
   6380                 row2[4+bIdx] = saturate_cast<uchar>((y11 + buv) >> ITUR_BT_601_SHIFT);
   6381                 row2[7]      = uchar(0xff);
   6382             }
   6383         }
   6384     }
   6385 };
   6386 
   6387 #define MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION (320*240)
   6388 
   6389 template<int bIdx, int uIdx>
   6390 inline void cvtYUV420sp2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
   6391 {
   6392     YUV420sp2RGB888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
   6393     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
   6394         parallel_for_(Range(0, _dst.rows/2), converter);
   6395     else
   6396         converter(Range(0, _dst.rows/2));
   6397 }
   6398 
   6399 template<int bIdx, int uIdx>
   6400 inline void cvtYUV420sp2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _uv)
   6401 {
   6402     YUV420sp2RGBA8888Invoker<bIdx, uIdx> converter(&_dst, _stride, _y1,  _uv);
   6403     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
   6404         parallel_for_(Range(0, _dst.rows/2), converter);
   6405     else
   6406         converter(Range(0, _dst.rows/2));
   6407 }
   6408 
   6409 template<int bIdx>
   6410 inline void cvtYUV420p2RGB(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
   6411 {
   6412     YUV420p2RGB888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
   6413     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
   6414         parallel_for_(Range(0, _dst.rows/2), converter);
   6415     else
   6416         converter(Range(0, _dst.rows/2));
   6417 }
   6418 
   6419 template<int bIdx>
   6420 inline void cvtYUV420p2RGBA(Mat& _dst, int _stride, const uchar* _y1, const uchar* _u, const uchar* _v, int ustepIdx, int vstepIdx)
   6421 {
   6422     YUV420p2RGBA8888Invoker<bIdx> converter(&_dst, _stride, _y1,  _u, _v, ustepIdx, vstepIdx);
   6423     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV420_CONVERSION)
   6424         parallel_for_(Range(0, _dst.rows/2), converter);
   6425     else
   6426         converter(Range(0, _dst.rows/2));
   6427 }
   6428 
   6429 ///////////////////////////////////// RGB -> YUV420p /////////////////////////////////////
   6430 
   6431 template<int bIdx>
   6432 struct RGB888toYUV420pInvoker: public ParallelLoopBody
   6433 {
   6434     RGB888toYUV420pInvoker( const Mat& src, Mat* dst, const int uIdx )
   6435         : src_(src),
   6436           dst_(dst),
   6437           uIdx_(uIdx) { }
   6438 
   6439     void operator()(const Range& rowRange) const
   6440     {
   6441         const int w = src_.cols;
   6442         const int h = src_.rows;
   6443 
   6444         const int cn = src_.channels();
   6445         for( int i = rowRange.start; i < rowRange.end; i++ )
   6446         {
   6447             const uchar* row0 = src_.ptr<uchar>(2 * i);
   6448             const uchar* row1 = src_.ptr<uchar>(2 * i + 1);
   6449 
   6450             uchar* y = dst_->ptr<uchar>(2*i);
   6451             uchar* u = dst_->ptr<uchar>(h + i/2) + (i % 2) * (w/2);
   6452             uchar* v = dst_->ptr<uchar>(h + (i + h/2)/2) + ((i + h/2) % 2) * (w/2);
   6453             if( uIdx_ == 2 ) std::swap(u, v);
   6454 
   6455             for( int j = 0, k = 0; j < w * cn; j += 2 * cn, k++ )
   6456             {
   6457                 int r00 = row0[2-bIdx + j];      int g00 = row0[1 + j];      int b00 = row0[bIdx + j];
   6458                 int r01 = row0[2-bIdx + cn + j]; int g01 = row0[1 + cn + j]; int b01 = row0[bIdx + cn + j];
   6459                 int r10 = row1[2-bIdx + j];      int g10 = row1[1 + j];      int b10 = row1[bIdx + j];
   6460                 int r11 = row1[2-bIdx + cn + j]; int g11 = row1[1 + cn + j]; int b11 = row1[bIdx + cn + j];
   6461 
   6462                 const int shifted16 = (16 << ITUR_BT_601_SHIFT);
   6463                 const int halfShift = (1 << (ITUR_BT_601_SHIFT - 1));
   6464                 int y00 = ITUR_BT_601_CRY * r00 + ITUR_BT_601_CGY * g00 + ITUR_BT_601_CBY * b00 + halfShift + shifted16;
   6465                 int y01 = ITUR_BT_601_CRY * r01 + ITUR_BT_601_CGY * g01 + ITUR_BT_601_CBY * b01 + halfShift + shifted16;
   6466                 int y10 = ITUR_BT_601_CRY * r10 + ITUR_BT_601_CGY * g10 + ITUR_BT_601_CBY * b10 + halfShift + shifted16;
   6467                 int y11 = ITUR_BT_601_CRY * r11 + ITUR_BT_601_CGY * g11 + ITUR_BT_601_CBY * b11 + halfShift + shifted16;
   6468 
   6469                 y[2*k + 0]            = saturate_cast<uchar>(y00 >> ITUR_BT_601_SHIFT);
   6470                 y[2*k + 1]            = saturate_cast<uchar>(y01 >> ITUR_BT_601_SHIFT);
   6471                 y[2*k + dst_->step + 0] = saturate_cast<uchar>(y10 >> ITUR_BT_601_SHIFT);
   6472                 y[2*k + dst_->step + 1] = saturate_cast<uchar>(y11 >> ITUR_BT_601_SHIFT);
   6473 
   6474                 const int shifted128 = (128 << ITUR_BT_601_SHIFT);
   6475                 int u00 = ITUR_BT_601_CRU * r00 + ITUR_BT_601_CGU * g00 + ITUR_BT_601_CBU * b00 + halfShift + shifted128;
   6476                 int v00 = ITUR_BT_601_CBU * r00 + ITUR_BT_601_CGV * g00 + ITUR_BT_601_CBV * b00 + halfShift + shifted128;
   6477 
   6478                 u[k] = saturate_cast<uchar>(u00 >> ITUR_BT_601_SHIFT);
   6479                 v[k] = saturate_cast<uchar>(v00 >> ITUR_BT_601_SHIFT);
   6480             }
   6481         }
   6482     }
   6483 
   6484     static bool isFit( const Mat& src )
   6485     {
   6486         return (src.total() >= 320*240);
   6487     }
   6488 
   6489 private:
   6490     RGB888toYUV420pInvoker& operator=(const RGB888toYUV420pInvoker&);
   6491 
   6492     const Mat& src_;
   6493     Mat* const dst_;
   6494     const int uIdx_;
   6495 };
   6496 
   6497 template<int bIdx, int uIdx>
   6498 static void cvtRGBtoYUV420p(const Mat& src, Mat& dst)
   6499 {
   6500     RGB888toYUV420pInvoker<bIdx> colorConverter(src, &dst, uIdx);
   6501     if( RGB888toYUV420pInvoker<bIdx>::isFit(src) )
   6502         parallel_for_(Range(0, src.rows/2), colorConverter);
   6503     else
   6504         colorConverter(Range(0, src.rows/2));
   6505 }
   6506 
   6507 ///////////////////////////////////// YUV422 -> RGB /////////////////////////////////////
   6508 
   6509 template<int bIdx, int uIdx, int yIdx>
   6510 struct YUV422toRGB888Invoker : ParallelLoopBody
   6511 {
   6512     Mat* dst;
   6513     const uchar* src;
   6514     int width, stride;
   6515 
   6516     YUV422toRGB888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
   6517         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
   6518 
   6519     void operator()(const Range& range) const
   6520     {
   6521         int rangeBegin = range.start;
   6522         int rangeEnd = range.end;
   6523 
   6524         const int uidx = 1 - yIdx + uIdx * 2;
   6525         const int vidx = (2 + uidx) % 4;
   6526         const uchar* yuv_src = src + rangeBegin * stride;
   6527 
   6528         for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
   6529         {
   6530             uchar* row = dst->ptr<uchar>(j);
   6531 
   6532             for (int i = 0; i < 2 * width; i += 4, row += 6)
   6533             {
   6534                 int u = int(yuv_src[i + uidx]) - 128;
   6535                 int v = int(yuv_src[i + vidx]) - 128;
   6536 
   6537                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
   6538                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
   6539                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
   6540 
   6541                 int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
   6542                 row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
   6543                 row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
   6544                 row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
   6545 
   6546                 int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
   6547                 row[5-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
   6548                 row[4]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
   6549                 row[3+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
   6550             }
   6551         }
   6552     }
   6553 };
   6554 
   6555 template<int bIdx, int uIdx, int yIdx>
   6556 struct YUV422toRGBA8888Invoker : ParallelLoopBody
   6557 {
   6558     Mat* dst;
   6559     const uchar* src;
   6560     int width, stride;
   6561 
   6562     YUV422toRGBA8888Invoker(Mat* _dst, int _stride, const uchar* _yuv)
   6563         : dst(_dst), src(_yuv), width(_dst->cols), stride(_stride) {}
   6564 
   6565     void operator()(const Range& range) const
   6566     {
   6567         int rangeBegin = range.start;
   6568         int rangeEnd = range.end;
   6569 
   6570         const int uidx = 1 - yIdx + uIdx * 2;
   6571         const int vidx = (2 + uidx) % 4;
   6572         const uchar* yuv_src = src + rangeBegin * stride;
   6573 
   6574         for (int j = rangeBegin; j < rangeEnd; j++, yuv_src += stride)
   6575         {
   6576             uchar* row = dst->ptr<uchar>(j);
   6577 
   6578             for (int i = 0; i < 2 * width; i += 4, row += 8)
   6579             {
   6580                 int u = int(yuv_src[i + uidx]) - 128;
   6581                 int v = int(yuv_src[i + vidx]) - 128;
   6582 
   6583                 int ruv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVR * v;
   6584                 int guv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CVG * v + ITUR_BT_601_CUG * u;
   6585                 int buv = (1 << (ITUR_BT_601_SHIFT - 1)) + ITUR_BT_601_CUB * u;
   6586 
   6587                 int y00 = std::max(0, int(yuv_src[i + yIdx]) - 16) * ITUR_BT_601_CY;
   6588                 row[2-bIdx] = saturate_cast<uchar>((y00 + ruv) >> ITUR_BT_601_SHIFT);
   6589                 row[1]      = saturate_cast<uchar>((y00 + guv) >> ITUR_BT_601_SHIFT);
   6590                 row[bIdx]   = saturate_cast<uchar>((y00 + buv) >> ITUR_BT_601_SHIFT);
   6591                 row[3]      = uchar(0xff);
   6592 
   6593                 int y01 = std::max(0, int(yuv_src[i + yIdx + 2]) - 16) * ITUR_BT_601_CY;
   6594                 row[6-bIdx] = saturate_cast<uchar>((y01 + ruv) >> ITUR_BT_601_SHIFT);
   6595                 row[5]      = saturate_cast<uchar>((y01 + guv) >> ITUR_BT_601_SHIFT);
   6596                 row[4+bIdx] = saturate_cast<uchar>((y01 + buv) >> ITUR_BT_601_SHIFT);
   6597                 row[7]      = uchar(0xff);
   6598             }
   6599         }
   6600     }
   6601 };
   6602 
   6603 #define MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION (320*240)
   6604 
   6605 template<int bIdx, int uIdx, int yIdx>
   6606 inline void cvtYUV422toRGB(Mat& _dst, int _stride, const uchar* _yuv)
   6607 {
   6608     YUV422toRGB888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
   6609     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
   6610         parallel_for_(Range(0, _dst.rows), converter);
   6611     else
   6612         converter(Range(0, _dst.rows));
   6613 }
   6614 
   6615 template<int bIdx, int uIdx, int yIdx>
   6616 inline void cvtYUV422toRGBA(Mat& _dst, int _stride, const uchar* _yuv)
   6617 {
   6618     YUV422toRGBA8888Invoker<bIdx, uIdx, yIdx> converter(&_dst, _stride, _yuv);
   6619     if (_dst.total() >= MIN_SIZE_FOR_PARALLEL_YUV422_CONVERSION)
   6620         parallel_for_(Range(0, _dst.rows), converter);
   6621     else
   6622         converter(Range(0, _dst.rows));
   6623 }
   6624 
   6625 /////////////////////////// RGBA <-> mRGBA (alpha premultiplied) //////////////
   6626 
   6627 template<typename _Tp>
   6628 struct RGBA2mRGBA
   6629 {
   6630     typedef _Tp channel_type;
   6631 
   6632     void operator()(const _Tp* src, _Tp* dst, int n) const
   6633     {
   6634         _Tp max_val  = ColorChannel<_Tp>::max();
   6635         _Tp half_val = ColorChannel<_Tp>::half();
   6636         for( int i = 0; i < n; i++ )
   6637         {
   6638             _Tp v0 = *src++;
   6639             _Tp v1 = *src++;
   6640             _Tp v2 = *src++;
   6641             _Tp v3 = *src++;
   6642 
   6643             *dst++ = (v0 * v3 + half_val) / max_val;
   6644             *dst++ = (v1 * v3 + half_val) / max_val;
   6645             *dst++ = (v2 * v3 + half_val) / max_val;
   6646             *dst++ = v3;
   6647         }
   6648     }
   6649 };
   6650 
   6651 
   6652 template<typename _Tp>
   6653 struct mRGBA2RGBA
   6654 {
   6655     typedef _Tp channel_type;
   6656 
   6657     void operator()(const _Tp* src, _Tp* dst, int n) const
   6658     {
   6659         _Tp max_val = ColorChannel<_Tp>::max();
   6660         for( int i = 0; i < n; i++ )
   6661         {
   6662             _Tp v0 = *src++;
   6663             _Tp v1 = *src++;
   6664             _Tp v2 = *src++;
   6665             _Tp v3 = *src++;
   6666             _Tp v3_half = v3 / 2;
   6667 
   6668             *dst++ = (v3==0)? 0 : (v0 * max_val + v3_half) / v3;
   6669             *dst++ = (v3==0)? 0 : (v1 * max_val + v3_half) / v3;
   6670             *dst++ = (v3==0)? 0 : (v2 * max_val + v3_half) / v3;
   6671             *dst++ = v3;
   6672         }
   6673     }
   6674 };
   6675 
   6676 #ifdef HAVE_OPENCL
   6677 
   6678 static bool ocl_cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
   6679 {
   6680     bool ok = false;
   6681     UMat src = _src.getUMat(), dst;
   6682     Size sz = src.size(), dstSz = sz;
   6683     int scn = src.channels(), depth = src.depth(), bidx, uidx, yidx;
   6684     int dims = 2, stripeSize = 1;
   6685     ocl::Kernel k;
   6686 
   6687     if (depth != CV_8U && depth != CV_16U && depth != CV_32F)
   6688         return false;
   6689 
   6690     ocl::Device dev = ocl::Device::getDefault();
   6691     int pxPerWIy = dev.isIntel() && (dev.type() & ocl::Device::TYPE_GPU) ? 4 : 1;
   6692     int pxPerWIx = 1;
   6693 
   6694     size_t globalsize[] = { src.cols, (src.rows + pxPerWIy - 1) / pxPerWIy };
   6695     cv::String opts = format("-D depth=%d -D scn=%d -D PIX_PER_WI_Y=%d ",
   6696                              depth, scn, pxPerWIy);
   6697 
   6698     switch (code)
   6699     {
   6700     case COLOR_BGR2BGRA: case COLOR_RGB2BGRA: case COLOR_BGRA2BGR:
   6701     case COLOR_RGBA2BGR: case COLOR_RGB2BGR: case COLOR_BGRA2RGBA:
   6702     {
   6703         CV_Assert(scn == 3 || scn == 4);
   6704         dcn = code == COLOR_BGR2BGRA || code == COLOR_RGB2BGRA || code == COLOR_BGRA2RGBA ? 4 : 3;
   6705         bool reverse = !(code == COLOR_BGR2BGRA || code == COLOR_BGRA2BGR);
   6706         k.create("RGB", ocl::imgproc::cvtcolor_oclsrc,
   6707                  opts + format("-D dcn=%d -D bidx=0 -D %s", dcn,
   6708                         reverse ? "REVERSE" : "ORDER"));
   6709         break;
   6710     }
   6711     case COLOR_BGR5652BGR: case COLOR_BGR5552BGR: case COLOR_BGR5652RGB: case COLOR_BGR5552RGB:
   6712     case COLOR_BGR5652BGRA: case COLOR_BGR5552BGRA: case COLOR_BGR5652RGBA: case COLOR_BGR5552RGBA:
   6713     {
   6714         dcn = code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA || code == COLOR_BGR5652RGBA || code == COLOR_BGR5552RGBA ? 4 : 3;
   6715         CV_Assert((dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U);
   6716         bidx = code == COLOR_BGR5652BGR || code == COLOR_BGR5552BGR ||
   6717             code == COLOR_BGR5652BGRA || code == COLOR_BGR5552BGRA ? 0 : 2;
   6718         int greenbits = code == COLOR_BGR5652BGR || code == COLOR_BGR5652RGB ||
   6719             code == COLOR_BGR5652BGRA || code == COLOR_BGR5652RGBA ? 6 : 5;
   6720         k.create("RGB5x52RGB", ocl::imgproc::cvtcolor_oclsrc,
   6721                  opts + format("-D dcn=%d -D bidx=%d -D greenbits=%d", dcn, bidx, greenbits));
   6722         break;
   6723     }
   6724     case COLOR_BGR2BGR565: case COLOR_BGR2BGR555: case COLOR_RGB2BGR565: case COLOR_RGB2BGR555:
   6725     case COLOR_BGRA2BGR565: case COLOR_BGRA2BGR555: case COLOR_RGBA2BGR565: case COLOR_RGBA2BGR555:
   6726     {
   6727         CV_Assert((scn == 3 || scn == 4) && depth == CV_8U );
   6728         bidx = code == COLOR_BGR2BGR565 || code == COLOR_BGR2BGR555 ||
   6729             code == COLOR_BGRA2BGR565 || code == COLOR_BGRA2BGR555 ? 0 : 2;
   6730         int greenbits = code == COLOR_BGR2BGR565 || code == COLOR_RGB2BGR565 ||
   6731             code == COLOR_BGRA2BGR565 || code == COLOR_RGBA2BGR565 ? 6 : 5;
   6732         dcn = 2;
   6733         k.create("RGB2RGB5x5", ocl::imgproc::cvtcolor_oclsrc,
   6734                  opts + format("-D dcn=2 -D bidx=%d -D greenbits=%d", bidx, greenbits));
   6735         break;
   6736     }
   6737     case COLOR_BGR5652GRAY: case COLOR_BGR5552GRAY:
   6738     {
   6739         CV_Assert(scn == 2 && depth == CV_8U);
   6740         dcn = 1;
   6741         int greenbits = code == COLOR_BGR5652GRAY ? 6 : 5;
   6742         k.create("BGR5x52Gray", ocl::imgproc::cvtcolor_oclsrc,
   6743                  opts + format("-D dcn=1 -D bidx=0 -D greenbits=%d", greenbits));
   6744         break;
   6745     }
   6746     case COLOR_GRAY2BGR565: case COLOR_GRAY2BGR555:
   6747     {
   6748         CV_Assert(scn == 1 && depth == CV_8U);
   6749         dcn = 2;
   6750         int greenbits = code == COLOR_GRAY2BGR565 ? 6 : 5;
   6751         k.create("Gray2BGR5x5", ocl::imgproc::cvtcolor_oclsrc,
   6752                  opts + format("-D dcn=2 -D bidx=0 -D greenbits=%d", greenbits));
   6753         break;
   6754     }
   6755     case COLOR_BGR2GRAY: case COLOR_BGRA2GRAY:
   6756     case COLOR_RGB2GRAY: case COLOR_RGBA2GRAY:
   6757     {
   6758         CV_Assert(scn == 3 || scn == 4);
   6759         bidx = code == COLOR_BGR2GRAY || code == COLOR_BGRA2GRAY ? 0 : 2;
   6760         dcn = 1;
   6761         k.create("RGB2Gray", ocl::imgproc::cvtcolor_oclsrc,
   6762                  opts + format("-D dcn=1 -D bidx=%d -D STRIPE_SIZE=%d",
   6763                                bidx, stripeSize));
   6764         globalsize[0] = (src.cols + stripeSize-1)/stripeSize;
   6765         break;
   6766     }
   6767     case COLOR_GRAY2BGR:
   6768     case COLOR_GRAY2BGRA:
   6769     {
   6770         CV_Assert(scn == 1);
   6771         dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
   6772         k.create("Gray2RGB", ocl::imgproc::cvtcolor_oclsrc,
   6773                  opts + format("-D bidx=0 -D dcn=%d", dcn));
   6774         break;
   6775     }
   6776     case COLOR_BGR2YUV:
   6777     case COLOR_RGB2YUV:
   6778     {
   6779         CV_Assert(scn == 3 || scn == 4);
   6780         bidx = code == COLOR_RGB2YUV ? 0 : 2;
   6781         dcn = 3;
   6782         k.create("RGB2YUV", ocl::imgproc::cvtcolor_oclsrc,
   6783                  opts + format("-D dcn=3 -D bidx=%d", bidx));
   6784         break;
   6785     }
   6786     case COLOR_YUV2BGR:
   6787     case COLOR_YUV2RGB:
   6788     {
   6789         if(dcn < 0) dcn = 3;
   6790         CV_Assert(dcn == 3 || dcn == 4);
   6791         bidx = code == COLOR_YUV2RGB ? 0 : 2;
   6792         k.create("YUV2RGB", ocl::imgproc::cvtcolor_oclsrc,
   6793                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
   6794         break;
   6795     }
   6796     case COLOR_YUV2RGB_NV12: case COLOR_YUV2BGR_NV12: case COLOR_YUV2RGB_NV21: case COLOR_YUV2BGR_NV21:
   6797     case COLOR_YUV2RGBA_NV12: case COLOR_YUV2BGRA_NV12: case COLOR_YUV2RGBA_NV21: case COLOR_YUV2BGRA_NV21:
   6798     {
   6799         CV_Assert( scn == 1 );
   6800         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
   6801         dcn  = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2RGBA_NV12 ||
   6802                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2RGBA_NV21 ? 4 : 3;
   6803         bidx = code == COLOR_YUV2BGRA_NV12 || code == COLOR_YUV2BGR_NV12 ||
   6804                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 0 : 2;
   6805         uidx = code == COLOR_YUV2RGBA_NV21 || code == COLOR_YUV2RGB_NV21 ||
   6806                code == COLOR_YUV2BGRA_NV21 || code == COLOR_YUV2BGR_NV21 ? 1 : 0;
   6807 
   6808         dstSz = Size(sz.width, sz.height * 2 / 3);
   6809         globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
   6810         k.create("YUV2RGB_NVx", ocl::imgproc::cvtcolor_oclsrc,
   6811                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d", dcn, bidx, uidx));
   6812         break;
   6813     }
   6814     case COLOR_YUV2BGR_YV12: case COLOR_YUV2RGB_YV12: case COLOR_YUV2BGRA_YV12: case COLOR_YUV2RGBA_YV12:
   6815     case COLOR_YUV2BGR_IYUV: case COLOR_YUV2RGB_IYUV: case COLOR_YUV2BGRA_IYUV: case COLOR_YUV2RGBA_IYUV:
   6816     {
   6817         CV_Assert( scn == 1 );
   6818         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
   6819         dcn  = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2RGBA_YV12 ||
   6820                code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2RGBA_IYUV ? 4 : 3;
   6821         bidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
   6822                code == COLOR_YUV2BGRA_IYUV || code == COLOR_YUV2BGR_IYUV ? 0 : 2;
   6823         uidx = code == COLOR_YUV2BGRA_YV12 || code == COLOR_YUV2BGR_YV12 ||
   6824                code == COLOR_YUV2RGBA_YV12 || code == COLOR_YUV2RGB_YV12 ? 1 : 0;
   6825 
   6826         dstSz = Size(sz.width, sz.height * 2 / 3);
   6827         globalsize[0] = dstSz.width / 2; globalsize[1] = (dstSz.height/2 + pxPerWIy - 1) / pxPerWIy;
   6828         k.create("YUV2RGB_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
   6829                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d%s", dcn, bidx, uidx,
   6830                  src.isContinuous() ? " -D SRC_CONT" : ""));
   6831         break;
   6832     }
   6833     case COLOR_YUV2GRAY_420:
   6834     {
   6835         if (dcn <= 0) dcn = 1;
   6836 
   6837         CV_Assert( dcn == 1 );
   6838         CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
   6839 
   6840         dstSz = Size(sz.width, sz.height * 2 / 3);
   6841         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
   6842         dst = _dst.getUMat();
   6843 
   6844         src.rowRange(0, dstSz.height).copyTo(dst);
   6845         return true;
   6846     }
   6847     case COLOR_RGB2YUV_YV12: case COLOR_BGR2YUV_YV12: case COLOR_RGBA2YUV_YV12: case COLOR_BGRA2YUV_YV12:
   6848     case COLOR_RGB2YUV_IYUV: case COLOR_BGR2YUV_IYUV: case COLOR_RGBA2YUV_IYUV: case COLOR_BGRA2YUV_IYUV:
   6849     {
   6850         if (dcn <= 0) dcn = 1;
   6851         bidx = code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ||
   6852                code == COLOR_BGRA2YUV_IYUV || code == COLOR_BGR2YUV_IYUV ? 0 : 2;
   6853         uidx = code == COLOR_RGBA2YUV_YV12 || code == COLOR_RGB2YUV_YV12 ||
   6854                code == COLOR_BGRA2YUV_YV12 || code == COLOR_BGR2YUV_YV12 ? 1 : 0;
   6855 
   6856         CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
   6857         CV_Assert( dcn == 1 );
   6858         CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
   6859 
   6860         dstSz = Size(sz.width, sz.height / 2 * 3);
   6861         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
   6862         dst = _dst.getUMat();
   6863 
   6864         if (dev.isIntel() && src.cols % 4 == 0 && src.step % 4 == 0 && src.offset % 4 == 0 &&
   6865             dst.step % 4 == 0 && dst.offset % 4 == 0)
   6866         {
   6867             pxPerWIx = 2;
   6868         }
   6869         globalsize[0] = dstSz.width / (2 * pxPerWIx); globalsize[1] = (dstSz.height/3 + pxPerWIy - 1) / pxPerWIy;
   6870 
   6871         k.create("RGB2YUV_YV12_IYUV", ocl::imgproc::cvtcolor_oclsrc,
   6872                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D PIX_PER_WI_X=%d", dcn, bidx, uidx, pxPerWIx));
   6873         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
   6874         return k.run(2, globalsize, NULL, false);
   6875     }
   6876     case COLOR_YUV2RGB_UYVY: case COLOR_YUV2BGR_UYVY: case COLOR_YUV2RGBA_UYVY: case COLOR_YUV2BGRA_UYVY:
   6877     case COLOR_YUV2RGB_YUY2: case COLOR_YUV2BGR_YUY2: case COLOR_YUV2RGB_YVYU: case COLOR_YUV2BGR_YVYU:
   6878     case COLOR_YUV2RGBA_YUY2: case COLOR_YUV2BGRA_YUY2: case COLOR_YUV2RGBA_YVYU: case COLOR_YUV2BGRA_YVYU:
   6879     {
   6880         if (dcn <= 0)
   6881             dcn = (code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2RGBA_YUY2 ||
   6882                    code==COLOR_YUV2BGRA_YUY2 || code==COLOR_YUV2RGBA_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 4 : 3;
   6883 
   6884         bidx = (code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY || code==COLOR_YUV2BGRA_YUY2 ||
   6885                 code==COLOR_YUV2BGR_YUY2 || code==COLOR_YUV2BGRA_YVYU || code==COLOR_YUV2BGR_YVYU) ? 0 : 2;
   6886         yidx = (code==COLOR_YUV2RGB_UYVY || code==COLOR_YUV2RGBA_UYVY || code==COLOR_YUV2BGR_UYVY || code==COLOR_YUV2BGRA_UYVY) ? 1 : 0;
   6887         uidx = (code==COLOR_YUV2RGB_YVYU || code==COLOR_YUV2RGBA_YVYU ||
   6888                 code==COLOR_YUV2BGR_YVYU || code==COLOR_YUV2BGRA_YVYU) ? 2 : 0;
   6889         uidx = 1 - yidx + uidx;
   6890 
   6891         CV_Assert( dcn == 3 || dcn == 4 );
   6892         CV_Assert( scn == 2 && depth == CV_8U );
   6893 
   6894         k.create("YUV2RGB_422", ocl::imgproc::cvtcolor_oclsrc,
   6895                  opts + format("-D dcn=%d -D bidx=%d -D uidx=%d -D yidx=%d%s", dcn, bidx, uidx, yidx,
   6896                                 src.offset % 4 == 0 && src.step % 4 == 0 ? " -D USE_OPTIMIZED_LOAD" : ""));
   6897         break;
   6898     }
   6899     case COLOR_BGR2YCrCb:
   6900     case COLOR_RGB2YCrCb:
   6901     {
   6902         CV_Assert(scn == 3 || scn == 4);
   6903         bidx = code == COLOR_BGR2YCrCb ? 0 : 2;
   6904         dcn = 3;
   6905         k.create("RGB2YCrCb", ocl::imgproc::cvtcolor_oclsrc,
   6906                  opts + format("-D dcn=3 -D bidx=%d", bidx));
   6907         break;
   6908     }
   6909     case COLOR_YCrCb2BGR:
   6910     case COLOR_YCrCb2RGB:
   6911     {
   6912         if( dcn <= 0 )
   6913             dcn = 3;
   6914         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
   6915         bidx = code == COLOR_YCrCb2BGR ? 0 : 2;
   6916         k.create("YCrCb2RGB", ocl::imgproc::cvtcolor_oclsrc,
   6917                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
   6918         break;
   6919     }
   6920     case COLOR_BGR2XYZ: case COLOR_RGB2XYZ:
   6921     {
   6922         CV_Assert(scn == 3 || scn == 4);
   6923         bidx = code == COLOR_BGR2XYZ ? 0 : 2;
   6924 
   6925         UMat c;
   6926         if (depth == CV_32F)
   6927         {
   6928             float coeffs[] =
   6929             {
   6930                 0.412453f, 0.357580f, 0.180423f,
   6931                 0.212671f, 0.715160f, 0.072169f,
   6932                 0.019334f, 0.119193f, 0.950227f
   6933             };
   6934             if (bidx == 0)
   6935             {
   6936                 std::swap(coeffs[0], coeffs[2]);
   6937                 std::swap(coeffs[3], coeffs[5]);
   6938                 std::swap(coeffs[6], coeffs[8]);
   6939             }
   6940             Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
   6941         }
   6942         else
   6943         {
   6944             int coeffs[] =
   6945             {
   6946                 1689,    1465,    739,
   6947                 871,     2929,    296,
   6948                 79,      488,     3892
   6949             };
   6950             if (bidx == 0)
   6951             {
   6952                 std::swap(coeffs[0], coeffs[2]);
   6953                 std::swap(coeffs[3], coeffs[5]);
   6954                 std::swap(coeffs[6], coeffs[8]);
   6955             }
   6956             Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
   6957         }
   6958 
   6959         _dst.create(dstSz, CV_MAKETYPE(depth, 3));
   6960         dst = _dst.getUMat();
   6961 
   6962         k.create("RGB2XYZ", ocl::imgproc::cvtcolor_oclsrc,
   6963                  opts + format("-D dcn=3 -D bidx=%d", bidx));
   6964         if (k.empty())
   6965             return false;
   6966         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
   6967         return k.run(2, globalsize, 0, false);
   6968     }
   6969     case COLOR_XYZ2BGR: case COLOR_XYZ2RGB:
   6970     {
   6971         if (dcn <= 0)
   6972             dcn = 3;
   6973         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4));
   6974         bidx = code == COLOR_XYZ2BGR ? 0 : 2;
   6975 
   6976         UMat c;
   6977         if (depth == CV_32F)
   6978         {
   6979             float coeffs[] =
   6980             {
   6981                 3.240479f, -1.53715f, -0.498535f,
   6982                 -0.969256f, 1.875991f, 0.041556f,
   6983                 0.055648f, -0.204043f, 1.057311f
   6984             };
   6985             if (bidx == 0)
   6986             {
   6987                 std::swap(coeffs[0], coeffs[6]);
   6988                 std::swap(coeffs[1], coeffs[7]);
   6989                 std::swap(coeffs[2], coeffs[8]);
   6990             }
   6991             Mat(1, 9, CV_32FC1, &coeffs[0]).copyTo(c);
   6992         }
   6993         else
   6994         {
   6995             int coeffs[] =
   6996             {
   6997                 13273,  -6296,  -2042,
   6998                 -3970,   7684,    170,
   6999                   228,   -836,   4331
   7000             };
   7001             if (bidx == 0)
   7002             {
   7003                 std::swap(coeffs[0], coeffs[6]);
   7004                 std::swap(coeffs[1], coeffs[7]);
   7005                 std::swap(coeffs[2], coeffs[8]);
   7006             }
   7007             Mat(1, 9, CV_32SC1, &coeffs[0]).copyTo(c);
   7008         }
   7009 
   7010         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
   7011         dst = _dst.getUMat();
   7012 
   7013         k.create("XYZ2RGB", ocl::imgproc::cvtcolor_oclsrc,
   7014                  opts + format("-D dcn=%d -D bidx=%d", dcn, bidx));
   7015         if (k.empty())
   7016             return false;
   7017         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(c));
   7018         return k.run(2, globalsize, 0, false);
   7019     }
   7020     case COLOR_BGR2HSV: case COLOR_RGB2HSV: case COLOR_BGR2HSV_FULL: case COLOR_RGB2HSV_FULL:
   7021     case COLOR_BGR2HLS: case COLOR_RGB2HLS: case COLOR_BGR2HLS_FULL: case COLOR_RGB2HLS_FULL:
   7022     {
   7023         CV_Assert((scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F));
   7024         bidx = code == COLOR_BGR2HSV || code == COLOR_BGR2HLS ||
   7025             code == COLOR_BGR2HSV_FULL || code == COLOR_BGR2HLS_FULL ? 0 : 2;
   7026         int hrange = depth == CV_32F ? 360 : code == COLOR_BGR2HSV || code == COLOR_RGB2HSV ||
   7027             code == COLOR_BGR2HLS || code == COLOR_RGB2HLS ? 180 : 256;
   7028         bool is_hsv = code == COLOR_BGR2HSV || code == COLOR_RGB2HSV || code == COLOR_BGR2HSV_FULL || code == COLOR_RGB2HSV_FULL;
   7029         String kernelName = String("RGB2") + (is_hsv ? "HSV" : "HLS");
   7030         dcn = 3;
   7031 
   7032         if (is_hsv && depth == CV_8U)
   7033         {
   7034             static UMat sdiv_data;
   7035             static UMat hdiv_data180;
   7036             static UMat hdiv_data256;
   7037             static int sdiv_table[256];
   7038             static int hdiv_table180[256];
   7039             static int hdiv_table256[256];
   7040             static volatile bool initialized180 = false, initialized256 = false;
   7041             volatile bool & initialized = hrange == 180 ? initialized180 : initialized256;
   7042 
   7043             if (!initialized)
   7044             {
   7045                 int * const hdiv_table = hrange == 180 ? hdiv_table180 : hdiv_table256, hsv_shift = 12;
   7046                 UMat & hdiv_data = hrange == 180 ? hdiv_data180 : hdiv_data256;
   7047 
   7048                 sdiv_table[0] = hdiv_table180[0] = hdiv_table256[0] = 0;
   7049 
   7050                 int v = 255 << hsv_shift;
   7051                 if (!initialized180 && !initialized256)
   7052                 {
   7053                     for(int i = 1; i < 256; i++ )
   7054                         sdiv_table[i] = saturate_cast<int>(v/(1.*i));
   7055                     Mat(1, 256, CV_32SC1, sdiv_table).copyTo(sdiv_data);
   7056                 }
   7057 
   7058                 v = hrange << hsv_shift;
   7059                 for (int i = 1; i < 256; i++ )
   7060                     hdiv_table[i] = saturate_cast<int>(v/(6.*i));
   7061 
   7062                 Mat(1, 256, CV_32SC1, hdiv_table).copyTo(hdiv_data);
   7063                 initialized = true;
   7064             }
   7065 
   7066             _dst.create(dstSz, CV_8UC3);
   7067             dst = _dst.getUMat();
   7068 
   7069             k.create("RGB2HSV", ocl::imgproc::cvtcolor_oclsrc,
   7070                      opts + format("-D hrange=%d -D bidx=%d -D dcn=3",
   7071                                    hrange, bidx));
   7072             if (k.empty())
   7073                 return false;
   7074 
   7075             k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst),
   7076                    ocl::KernelArg::PtrReadOnly(sdiv_data), hrange == 256 ? ocl::KernelArg::PtrReadOnly(hdiv_data256) :
   7077                                                                        ocl::KernelArg::PtrReadOnly(hdiv_data180));
   7078 
   7079             return k.run(2, globalsize, NULL, false);
   7080         }
   7081         else
   7082             k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
   7083                      opts + format("-D hscale=%ff -D bidx=%d -D dcn=3",
   7084                                    hrange*(1.f/360.f), bidx));
   7085         break;
   7086     }
   7087     case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
   7088     case COLOR_HLS2BGR: case COLOR_HLS2RGB: case COLOR_HLS2BGR_FULL: case COLOR_HLS2RGB_FULL:
   7089     {
   7090         if (dcn <= 0)
   7091             dcn = 3;
   7092         CV_Assert(scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F));
   7093         bidx = code == COLOR_HSV2BGR || code == COLOR_HLS2BGR ||
   7094             code == COLOR_HSV2BGR_FULL || code == COLOR_HLS2BGR_FULL ? 0 : 2;
   7095         int hrange = depth == CV_32F ? 360 : code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
   7096             code == COLOR_HLS2BGR || code == COLOR_HLS2RGB ? 180 : 255;
   7097         bool is_hsv = code == COLOR_HSV2BGR || code == COLOR_HSV2RGB ||
   7098                 code == COLOR_HSV2BGR_FULL || code == COLOR_HSV2RGB_FULL;
   7099 
   7100         String kernelName = String(is_hsv ? "HSV" : "HLS") + "2RGB";
   7101         k.create(kernelName.c_str(), ocl::imgproc::cvtcolor_oclsrc,
   7102                  opts + format("-D dcn=%d -D bidx=%d -D hrange=%d -D hscale=%ff",
   7103                                dcn, bidx, hrange, 6.f/hrange));
   7104         break;
   7105     }
   7106     case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:
   7107     {
   7108         CV_Assert(scn == 4 && depth == CV_8U);
   7109         dcn = 4;
   7110 
   7111         k.create(code == COLOR_RGBA2mRGBA ? "RGBA2mRGBA" : "mRGBA2RGBA", ocl::imgproc::cvtcolor_oclsrc,
   7112                  opts + "-D dcn=4 -D bidx=3");
   7113         break;
   7114     }
   7115     case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
   7116     case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
   7117     {
   7118         CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
   7119 
   7120         bidx = code == CV_BGR2Lab || code == CV_LBGR2Lab || code == CV_BGR2Luv || code == CV_LBGR2Luv ? 0 : 2;
   7121         bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_RGB2Luv || code == CV_BGR2Luv;
   7122         bool lab = code == CV_BGR2Lab || code == CV_RGB2Lab || code == CV_LBGR2Lab || code == CV_LRGB2Lab;
   7123         float un, vn;
   7124         dcn = 3;
   7125 
   7126         k.create(format("BGR2%s", lab ? "Lab" : "Luv").c_str(),
   7127                  ocl::imgproc::cvtcolor_oclsrc,
   7128                  opts + format("-D dcn=%d -D bidx=%d%s",
   7129                                dcn, bidx, srgb ? " -D SRGB" : ""));
   7130         if (k.empty())
   7131             return false;
   7132 
   7133         initLabTabs();
   7134 
   7135         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
   7136         dst = _dst.getUMat();
   7137 
   7138         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
   7139                 dstarg = ocl::KernelArg::WriteOnly(dst);
   7140 
   7141         if (depth == CV_8U && lab)
   7142         {
   7143             static UMat usRGBGammaTab, ulinearGammaTab, uLabCbrtTab, ucoeffs;
   7144 
   7145             if (srgb && usRGBGammaTab.empty())
   7146                 Mat(1, 256, CV_16UC1, sRGBGammaTab_b).copyTo(usRGBGammaTab);
   7147             else if (ulinearGammaTab.empty())
   7148                 Mat(1, 256, CV_16UC1, linearGammaTab_b).copyTo(ulinearGammaTab);
   7149             if (uLabCbrtTab.empty())
   7150                 Mat(1, LAB_CBRT_TAB_SIZE_B, CV_16UC1, LabCbrtTab_b).copyTo(uLabCbrtTab);
   7151 
   7152             {
   7153                 int coeffs[9];
   7154                 const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
   7155                 const float scale[] =
   7156                 {
   7157                     (1 << lab_shift)/_whitept[0],
   7158                     (float)(1 << lab_shift),
   7159                     (1 << lab_shift)/_whitept[2]
   7160                 };
   7161 
   7162                 for (int i = 0; i < 3; i++ )
   7163                 {
   7164                     coeffs[i*3+(bidx^2)] = cvRound(_coeffs[i*3]*scale[i]);
   7165                     coeffs[i*3+1] = cvRound(_coeffs[i*3+1]*scale[i]);
   7166                     coeffs[i*3+bidx] = cvRound(_coeffs[i*3+2]*scale[i]);
   7167 
   7168                     CV_Assert( coeffs[i] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
   7169                               coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
   7170                 }
   7171                 Mat(1, 9, CV_32SC1, coeffs).copyTo(ucoeffs);
   7172             }
   7173 
   7174             const int Lscale = (116*255+50)/100;
   7175             const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
   7176 
   7177             k.args(srcarg, dstarg,
   7178                    ocl::KernelArg::PtrReadOnly(srgb ? usRGBGammaTab : ulinearGammaTab),
   7179                    ocl::KernelArg::PtrReadOnly(uLabCbrtTab), ocl::KernelArg::PtrReadOnly(ucoeffs),
   7180                    Lscale, Lshift);
   7181         }
   7182         else
   7183         {
   7184             static UMat usRGBGammaTab, ucoeffs, uLabCbrtTab;
   7185 
   7186             if (srgb && usRGBGammaTab.empty())
   7187                 Mat(1, GAMMA_TAB_SIZE * 4, CV_32FC1, sRGBGammaTab).copyTo(usRGBGammaTab);
   7188             if (!lab && uLabCbrtTab.empty())
   7189                 Mat(1, LAB_CBRT_TAB_SIZE * 4, CV_32FC1, LabCbrtTab).copyTo(uLabCbrtTab);
   7190 
   7191             {
   7192                 float coeffs[9];
   7193                 const float * const _coeffs = sRGB2XYZ_D65, * const _whitept = D65;
   7194                 float scale[] = { 1.0f / _whitept[0], 1.0f, 1.0f / _whitept[2] };
   7195 
   7196                 for (int i = 0; i < 3; i++)
   7197                 {
   7198                     int j = i * 3;
   7199                     coeffs[j + (bidx ^ 2)] = _coeffs[j] * (lab ? scale[i] : 1);
   7200                     coeffs[j + 1] = _coeffs[j + 1] * (lab ? scale[i] : 1);
   7201                     coeffs[j + bidx] = _coeffs[j + 2] * (lab ? scale[i] : 1);
   7202 
   7203                     CV_Assert( coeffs[j] >= 0 && coeffs[j + 1] >= 0 && coeffs[j + 2] >= 0 &&
   7204                                coeffs[j] + coeffs[j + 1] + coeffs[j + 2] < 1.5f*(lab ? LabCbrtTabScale : 1) );
   7205                 }
   7206 
   7207                 float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3);
   7208                 un = 13*4*_whitept[0]*d;
   7209                 vn = 13*9*_whitept[1]*d;
   7210 
   7211                 Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
   7212             }
   7213 
   7214             float _1_3 = 1.0f / 3.0f, _a = 16.0f / 116.0f;
   7215             ocl::KernelArg ucoeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
   7216 
   7217             if (lab)
   7218             {
   7219                 if (srgb)
   7220                     k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
   7221                            ucoeffsarg, _1_3, _a);
   7222                 else
   7223                     k.args(srcarg, dstarg, ucoeffsarg, _1_3, _a);
   7224             }
   7225             else
   7226             {
   7227                 ocl::KernelArg LabCbrtTabarg = ocl::KernelArg::PtrReadOnly(uLabCbrtTab);
   7228                 if (srgb)
   7229                     k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBGammaTab),
   7230                            LabCbrtTabarg, ucoeffsarg, un, vn);
   7231                 else
   7232                     k.args(srcarg, dstarg, LabCbrtTabarg, ucoeffsarg, un, vn);
   7233             }
   7234         }
   7235 
   7236         return k.run(dims, globalsize, NULL, false);
   7237     }
   7238     case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
   7239     case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
   7240     {
   7241         if( dcn <= 0 )
   7242             dcn = 3;
   7243         CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
   7244 
   7245         bidx = code == CV_Lab2BGR || code == CV_Lab2LBGR || code == CV_Luv2BGR || code == CV_Luv2LBGR ? 0 : 2;
   7246         bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Luv2BGR || code == CV_Luv2RGB;
   7247         bool lab = code == CV_Lab2BGR || code == CV_Lab2RGB || code == CV_Lab2LBGR || code == CV_Lab2LRGB;
   7248         float un, vn;
   7249 
   7250         k.create(format("%s2BGR", lab ? "Lab" : "Luv").c_str(),
   7251                  ocl::imgproc::cvtcolor_oclsrc,
   7252                  opts + format("-D dcn=%d -D bidx=%d%s",
   7253                                dcn, bidx, srgb ? " -D SRGB" : ""));
   7254         if (k.empty())
   7255             return false;
   7256 
   7257         initLabTabs();
   7258         static UMat ucoeffs, usRGBInvGammaTab;
   7259 
   7260         if (srgb && usRGBInvGammaTab.empty())
   7261             Mat(1, GAMMA_TAB_SIZE*4, CV_32FC1, sRGBInvGammaTab).copyTo(usRGBInvGammaTab);
   7262 
   7263         {
   7264             float coeffs[9];
   7265             const float * const _coeffs = XYZ2sRGB_D65, * const _whitept = D65;
   7266 
   7267             for( int i = 0; i < 3; i++ )
   7268             {
   7269                 coeffs[i+(bidx^2)*3] = _coeffs[i] * (lab ? _whitept[i] : 1);
   7270                 coeffs[i+3] = _coeffs[i+3] * (lab ? _whitept[i] : 1);
   7271                 coeffs[i+bidx*3] = _coeffs[i+6] * (lab ? _whitept[i] : 1);
   7272             }
   7273 
   7274             float d = 1.f/(_whitept[0] + _whitept[1]*15 + _whitept[2]*3);
   7275             un = 4*_whitept[0]*d;
   7276             vn = 9*_whitept[1]*d;
   7277 
   7278             Mat(1, 9, CV_32FC1, coeffs).copyTo(ucoeffs);
   7279         }
   7280 
   7281         _dst.create(sz, CV_MAKETYPE(depth, dcn));
   7282         dst = _dst.getUMat();
   7283 
   7284         float lThresh = 0.008856f * 903.3f;
   7285         float fThresh = 7.787f * 0.008856f + 16.0f / 116.0f;
   7286 
   7287         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
   7288                 dstarg = ocl::KernelArg::WriteOnly(dst),
   7289                 coeffsarg = ocl::KernelArg::PtrReadOnly(ucoeffs);
   7290 
   7291         if (lab)
   7292         {
   7293             if (srgb)
   7294                 k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
   7295                        coeffsarg, lThresh, fThresh);
   7296             else
   7297                 k.args(srcarg, dstarg, coeffsarg, lThresh, fThresh);
   7298         }
   7299         else
   7300         {
   7301             if (srgb)
   7302                 k.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(usRGBInvGammaTab),
   7303                        coeffsarg, un, vn);
   7304             else
   7305                 k.args(srcarg, dstarg, coeffsarg, un, vn);
   7306         }
   7307 
   7308         return k.run(dims, globalsize, NULL, false);
   7309     }
   7310     default:
   7311         break;
   7312     }
   7313 
   7314     if( !k.empty() )
   7315     {
   7316         _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
   7317         dst = _dst.getUMat();
   7318         k.args(ocl::KernelArg::ReadOnlyNoSize(src), ocl::KernelArg::WriteOnly(dst));
   7319         ok = k.run(dims, globalsize, NULL, false);
   7320     }
   7321     return ok;
   7322 }
   7323 
   7324 #endif
   7325 
   7326 }//namespace cv
   7327 
   7328 //////////////////////////////////////////////////////////////////////////////////////////
   7329 //                                   The main function                                  //
   7330 //////////////////////////////////////////////////////////////////////////////////////////
   7331 
   7332 void cv::cvtColor( InputArray _src, OutputArray _dst, int code, int dcn )
   7333 {
   7334     int stype = _src.type();
   7335     int scn = CV_MAT_CN(stype), depth = CV_MAT_DEPTH(stype), bidx;
   7336 
   7337     CV_OCL_RUN( _src.dims() <= 2 && _dst.isUMat() && !(depth == CV_8U && (code == CV_Luv2BGR || code == CV_Luv2RGB)),
   7338                 ocl_cvtColor(_src, _dst, code, dcn) )
   7339 
   7340     Mat src = _src.getMat(), dst;
   7341     Size sz = src.size();
   7342 
   7343     CV_Assert( depth == CV_8U || depth == CV_16U || depth == CV_32F );
   7344 
   7345     switch( code )
   7346     {
   7347         case CV_BGR2BGRA: case CV_RGB2BGRA: case CV_BGRA2BGR:
   7348         case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
   7349             CV_Assert( scn == 3 || scn == 4 );
   7350             dcn = code == CV_BGR2BGRA || code == CV_RGB2BGRA || code == CV_BGRA2RGBA ? 4 : 3;
   7351             bidx = code == CV_BGR2BGRA || code == CV_BGRA2BGR ? 0 : 2;
   7352 
   7353             _dst.create( sz, CV_MAKETYPE(depth, dcn));
   7354             dst = _dst.getMat();
   7355 
   7356 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
   7357             CV_IPP_CHECK()
   7358             {
   7359                 if( code == CV_BGR2BGRA)
   7360                 {
   7361                     if ( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 0, 1, 2)) )
   7362                     {
   7363                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7364                         return;
   7365                     }
   7366                     setIppErrorStatus();
   7367                 }
   7368                 else if( code == CV_BGRA2BGR )
   7369                 {
   7370                     if ( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiCopyAC4C3RTab[depth])) )
   7371                     {
   7372                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7373                         return;
   7374                     }
   7375                     setIppErrorStatus();
   7376                 }
   7377                 else if( code == CV_BGR2RGBA )
   7378                 {
   7379                     if( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC3C4RTab[depth], 2, 1, 0)) )
   7380                     {
   7381                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7382                         return;
   7383                     }
   7384                     setIppErrorStatus();
   7385                 }
   7386                 else if( code == CV_RGBA2BGR )
   7387                 {
   7388                     if( CvtColorIPPLoop(src, dst, IPPReorderFunctor(ippiSwapChannelsC4C3RTab[depth], 2, 1, 0)) )
   7389                     {
   7390                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7391                         return;
   7392                     }
   7393                     setIppErrorStatus();
   7394                 }
   7395                 else if( code == CV_RGB2BGR )
   7396                 {
   7397                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderFunctor(ippiSwapChannelsC3RTab[depth], 2, 1, 0)) )
   7398                     {
   7399                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7400                         return;
   7401                     }
   7402                     setIppErrorStatus();
   7403                 }
   7404 #if IPP_VERSION_X100 >= 801
   7405                 else if( code == CV_RGBA2BGRA )
   7406                 {
   7407                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderFunctor(ippiSwapChannelsC4RTab[depth], 2, 1, 0)) )
   7408                     {
   7409                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7410                         return;
   7411                     }
   7412                     setIppErrorStatus();
   7413                 }
   7414 #endif
   7415             }
   7416 #endif
   7417 
   7418             if( depth == CV_8U )
   7419             {
   7420 #ifdef HAVE_TEGRA_OPTIMIZATION
   7421                 if(tegra::useTegra() && tegra::cvtBGR2RGB(src, dst, bidx))
   7422                     break;
   7423 #endif
   7424                 CvtColorLoop(src, dst, RGB2RGB<uchar>(scn, dcn, bidx));
   7425             }
   7426             else if( depth == CV_16U )
   7427                 CvtColorLoop(src, dst, RGB2RGB<ushort>(scn, dcn, bidx));
   7428             else
   7429                 CvtColorLoop(src, dst, RGB2RGB<float>(scn, dcn, bidx));
   7430             break;
   7431 
   7432         case CV_BGR2BGR565: case CV_BGR2BGR555: case CV_RGB2BGR565: case CV_RGB2BGR555:
   7433         case CV_BGRA2BGR565: case CV_BGRA2BGR555: case CV_RGBA2BGR565: case CV_RGBA2BGR555:
   7434             CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
   7435             _dst.create(sz, CV_8UC2);
   7436             dst = _dst.getMat();
   7437 
   7438 #if defined(HAVE_IPP) && 0 // breaks OCL accuracy tests
   7439             CV_IPP_CHECK()
   7440             {
   7441                 CV_SUPPRESS_DEPRECATED_START
   7442 
   7443                 if (code == CV_BGR2BGR565 && scn == 3)
   7444                 {
   7445                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R)))
   7446                     {
   7447                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7448                         return;
   7449                     }
   7450                     setIppErrorStatus();
   7451                 }
   7452                 else if (code == CV_BGRA2BGR565 && scn == 4)
   7453                 {
   7454                     if (CvtColorIPPLoopCopy(src, dst,
   7455                                             IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
   7456                                             (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 0, 1, 2, depth)))
   7457                     {
   7458                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7459                         return;
   7460                     }
   7461                     setIppErrorStatus();
   7462                 }
   7463                 else if (code == CV_RGB2BGR565 && scn == 3)
   7464                 {
   7465                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
   7466                                                                                (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) )
   7467                     {
   7468                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7469                         return;
   7470                     }
   7471                     setIppErrorStatus();
   7472                 }
   7473                 else if (code == CV_RGBA2BGR565 && scn == 4)
   7474                 {
   7475                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
   7476                                                                                (ippiGeneralFunc)ippiBGRToBGR565_8u16u_C3R, 2, 1, 0, depth)) )
   7477                     {
   7478                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7479                         return;
   7480                     }
   7481                     setIppErrorStatus();
   7482                 }
   7483                 CV_SUPPRESS_DEPRECATED_END
   7484             }
   7485 #endif
   7486 
   7487 #ifdef HAVE_TEGRA_OPTIMIZATION
   7488             if(code == CV_BGR2BGR565 || code == CV_BGRA2BGR565 || code == CV_RGB2BGR565  || code == CV_RGBA2BGR565)
   7489                 if(tegra::useTegra() && tegra::cvtRGB2RGB565(src, dst, code == CV_RGB2BGR565 || code == CV_RGBA2BGR565 ? 0 : 2))
   7490                     break;
   7491 #endif
   7492 
   7493             CvtColorLoop(src, dst, RGB2RGB5x5(scn,
   7494                       code == CV_BGR2BGR565 || code == CV_BGR2BGR555 ||
   7495                       code == CV_BGRA2BGR565 || code == CV_BGRA2BGR555 ? 0 : 2,
   7496                       code == CV_BGR2BGR565 || code == CV_RGB2BGR565 ||
   7497                       code == CV_BGRA2BGR565 || code == CV_RGBA2BGR565 ? 6 : 5 // green bits
   7498                                               ));
   7499             break;
   7500 
   7501         case CV_BGR5652BGR: case CV_BGR5552BGR: case CV_BGR5652RGB: case CV_BGR5552RGB:
   7502         case CV_BGR5652BGRA: case CV_BGR5552BGRA: case CV_BGR5652RGBA: case CV_BGR5552RGBA:
   7503             if(dcn <= 0) dcn = (code==CV_BGR5652BGRA || code==CV_BGR5552BGRA || code==CV_BGR5652RGBA || code==CV_BGR5552RGBA) ? 4 : 3;
   7504             CV_Assert( (dcn == 3 || dcn == 4) && scn == 2 && depth == CV_8U );
   7505             _dst.create(sz, CV_MAKETYPE(depth, dcn));
   7506             dst = _dst.getMat();
   7507 
   7508 #ifdef HAVE_IPP
   7509             CV_IPP_CHECK()
   7510             {
   7511                 CV_SUPPRESS_DEPRECATED_START
   7512                 if (code == CV_BGR5652BGR && dcn == 3)
   7513                 {
   7514                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R)))
   7515                     {
   7516                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7517                         return;
   7518                     }
   7519                     setIppErrorStatus();
   7520                 }
   7521                 else if (code == CV_BGR5652RGB && dcn == 3)
   7522                 {
   7523                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
   7524                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
   7525                     {
   7526                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7527                         return;
   7528                     }
   7529                     setIppErrorStatus();
   7530                 }
   7531                 else if (code == CV_BGR5652BGRA && dcn == 4)
   7532                 {
   7533                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
   7534                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
   7535                     {
   7536                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7537                         return;
   7538                     }
   7539                     setIppErrorStatus();
   7540                 }
   7541                 else if (code == CV_BGR5652RGBA && dcn == 4)
   7542                 {
   7543                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiBGR565ToBGR_16u8u_C3R,
   7544                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
   7545                     {
   7546                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7547                         return;
   7548                     }
   7549                     setIppErrorStatus();
   7550                 }
   7551                 CV_SUPPRESS_DEPRECATED_END
   7552             }
   7553 #endif
   7554 
   7555             CvtColorLoop(src, dst, RGB5x52RGB(dcn,
   7556                       code == CV_BGR5652BGR || code == CV_BGR5552BGR ||
   7557                       code == CV_BGR5652BGRA || code == CV_BGR5552BGRA ? 0 : 2, // blue idx
   7558                       code == CV_BGR5652BGR || code == CV_BGR5652RGB ||
   7559                       code == CV_BGR5652BGRA || code == CV_BGR5652RGBA ? 6 : 5 // green bits
   7560                       ));
   7561             break;
   7562 
   7563         case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
   7564             CV_Assert( scn == 3 || scn == 4 );
   7565             _dst.create(sz, CV_MAKETYPE(depth, 1));
   7566             dst = _dst.getMat();
   7567 
   7568 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
   7569             CV_IPP_CHECK()
   7570             {
   7571                 if( code == CV_BGR2GRAY && depth == CV_32F )
   7572                 {
   7573                     if( CvtColorIPPLoop(src, dst, IPPColor2GrayFunctor(ippiColor2GrayC3Tab[depth])) )
   7574                     {
   7575                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7576                         return;
   7577                     }
   7578                     setIppErrorStatus();
   7579                 }
   7580                 else if( code == CV_RGB2GRAY && depth == CV_32F )
   7581                 {
   7582                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGB2GrayC3Tab[depth])) )
   7583                     {
   7584                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7585                         return;
   7586                     }
   7587                     setIppErrorStatus();
   7588                 }
   7589                 else if( code == CV_BGRA2GRAY && depth == CV_32F )
   7590                 {
   7591                     if( CvtColorIPPLoop(src, dst, IPPColor2GrayFunctor(ippiColor2GrayC4Tab[depth])) )
   7592                     {
   7593                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7594                         return;
   7595                     }
   7596                     setIppErrorStatus();
   7597                 }
   7598                 else if( code == CV_RGBA2GRAY && depth == CV_32F )
   7599                 {
   7600                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGB2GrayC4Tab[depth])) )
   7601                     {
   7602                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7603                         return;
   7604                     }
   7605                     setIppErrorStatus();
   7606                 }
   7607             }
   7608 #endif
   7609 
   7610             bidx = code == CV_BGR2GRAY || code == CV_BGRA2GRAY ? 0 : 2;
   7611 
   7612             if( depth == CV_8U )
   7613             {
   7614 #ifdef HAVE_TEGRA_OPTIMIZATION
   7615                 if(tegra::useTegra() && tegra::cvtRGB2Gray(src, dst, bidx))
   7616                     break;
   7617 #endif
   7618                 CvtColorLoop(src, dst, RGB2Gray<uchar>(scn, bidx, 0));
   7619             }
   7620             else if( depth == CV_16U )
   7621                 CvtColorLoop(src, dst, RGB2Gray<ushort>(scn, bidx, 0));
   7622             else
   7623                 CvtColorLoop(src, dst, RGB2Gray<float>(scn, bidx, 0));
   7624             break;
   7625 
   7626         case CV_BGR5652GRAY: case CV_BGR5552GRAY:
   7627             CV_Assert( scn == 2 && depth == CV_8U );
   7628             _dst.create(sz, CV_8UC1);
   7629             dst = _dst.getMat();
   7630 
   7631             CvtColorLoop(src, dst, RGB5x52Gray(code == CV_BGR5652GRAY ? 6 : 5));
   7632             break;
   7633 
   7634         case CV_GRAY2BGR: case CV_GRAY2BGRA:
   7635             if( dcn <= 0 ) dcn = (code==CV_GRAY2BGRA) ? 4 : 3;
   7636             CV_Assert( scn == 1 && (dcn == 3 || dcn == 4));
   7637             _dst.create(sz, CV_MAKETYPE(depth, dcn));
   7638             dst = _dst.getMat();
   7639 
   7640 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
   7641             CV_IPP_CHECK()
   7642             {
   7643                 if( code == CV_GRAY2BGR )
   7644                 {
   7645                     if( CvtColorIPPLoop(src, dst, IPPGray2BGRFunctor(ippiCopyP3C3RTab[depth])) )
   7646                     {
   7647                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7648                         return;
   7649                     }
   7650                     setIppErrorStatus();
   7651                 }
   7652                 else if( code == CV_GRAY2BGRA )
   7653                 {
   7654                     if( CvtColorIPPLoop(src, dst, IPPGray2BGRAFunctor(ippiCopyP3C3RTab[depth], ippiSwapChannelsC3C4RTab[depth], depth)) )
   7655                     {
   7656                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7657                         return;
   7658                     }
   7659                     setIppErrorStatus();
   7660                 }
   7661             }
   7662 #endif
   7663 
   7664 
   7665             if( depth == CV_8U )
   7666             {
   7667 #ifdef HAVE_TEGRA_OPTIMIZATION
   7668                 if(tegra::useTegra() && tegra::cvtGray2RGB(src, dst))
   7669                     break;
   7670 #endif
   7671                 CvtColorLoop(src, dst, Gray2RGB<uchar>(dcn));
   7672             }
   7673             else if( depth == CV_16U )
   7674                 CvtColorLoop(src, dst, Gray2RGB<ushort>(dcn));
   7675             else
   7676                 CvtColorLoop(src, dst, Gray2RGB<float>(dcn));
   7677             break;
   7678 
   7679         case CV_GRAY2BGR565: case CV_GRAY2BGR555:
   7680             CV_Assert( scn == 1 && depth == CV_8U );
   7681             _dst.create(sz, CV_8UC2);
   7682             dst = _dst.getMat();
   7683 
   7684             CvtColorLoop(src, dst, Gray2RGB5x5(code == CV_GRAY2BGR565 ? 6 : 5));
   7685             break;
   7686 
   7687         case CV_BGR2YCrCb: case CV_RGB2YCrCb:
   7688         case CV_BGR2YUV: case CV_RGB2YUV:
   7689             {
   7690             CV_Assert( scn == 3 || scn == 4 );
   7691             bidx = code == CV_BGR2YCrCb || code == CV_BGR2YUV ? 0 : 2;
   7692             static const float yuv_f[] = { 0.114f, 0.587f, 0.299f, 0.492f, 0.877f };
   7693             static const int yuv_i[] = { B2Y, G2Y, R2Y, 8061, 14369 };
   7694             const float* coeffs_f = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_f;
   7695             const int* coeffs_i = code == CV_BGR2YCrCb || code == CV_RGB2YCrCb ? 0 : yuv_i;
   7696 
   7697             _dst.create(sz, CV_MAKETYPE(depth, 3));
   7698             dst = _dst.getMat();
   7699 
   7700 #if defined HAVE_IPP && 0
   7701             CV_IPP_CHECK()
   7702             {
   7703                 if (code == CV_RGB2YUV && scn == 3 && depth == CV_8U)
   7704                 {
   7705                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiRGBToYUV_8u_C3R)))
   7706                     {
   7707                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7708                         return;
   7709                     }
   7710                     setIppErrorStatus();
   7711                 }
   7712                 else if (code == CV_BGR2YUV && scn == 3 && depth == CV_8U)
   7713                 {
   7714                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
   7715                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
   7716                     {
   7717                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7718                         return;
   7719                     }
   7720                     setIppErrorStatus();
   7721                 }
   7722                 else if (code == CV_RGB2YUV && scn == 4 && depth == CV_8U)
   7723                 {
   7724                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
   7725                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 0, 1, 2, depth)))
   7726                     {
   7727                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7728                         return;
   7729                     }
   7730                     setIppErrorStatus();
   7731                 }
   7732                 else if (code == CV_BGR2YUV && scn == 4 && depth == CV_8U)
   7733                 {
   7734                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
   7735                                                                            (ippiGeneralFunc)ippiRGBToYUV_8u_C3R, 2, 1, 0, depth)))
   7736                     {
   7737                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7738                         return;
   7739                     }
   7740                     setIppErrorStatus();
   7741                 }
   7742             }
   7743 #endif
   7744 
   7745             if( depth == CV_8U )
   7746             {
   7747 #ifdef HAVE_TEGRA_OPTIMIZATION
   7748                 if((code == CV_RGB2YCrCb || code == CV_BGR2YCrCb) && tegra::useTegra() && tegra::cvtRGB2YCrCb(src, dst, bidx))
   7749                     break;
   7750 #endif
   7751                 CvtColorLoop(src, dst, RGB2YCrCb_i<uchar>(scn, bidx, coeffs_i));
   7752             }
   7753             else if( depth == CV_16U )
   7754                 CvtColorLoop(src, dst, RGB2YCrCb_i<ushort>(scn, bidx, coeffs_i));
   7755             else
   7756                 CvtColorLoop(src, dst, RGB2YCrCb_f<float>(scn, bidx, coeffs_f));
   7757             }
   7758             break;
   7759 
   7760         case CV_YCrCb2BGR: case CV_YCrCb2RGB:
   7761         case CV_YUV2BGR: case CV_YUV2RGB:
   7762             {
   7763             if( dcn <= 0 ) dcn = 3;
   7764             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
   7765             bidx = code == CV_YCrCb2BGR || code == CV_YUV2BGR ? 0 : 2;
   7766             static const float yuv_f[] = { 2.032f, -0.395f, -0.581f, 1.140f };
   7767             static const int yuv_i[] = { 33292, -6472, -9519, 18678 };
   7768             const float* coeffs_f = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_f;
   7769             const int* coeffs_i = code == CV_YCrCb2BGR || code == CV_YCrCb2RGB ? 0 : yuv_i;
   7770 
   7771             _dst.create(sz, CV_MAKETYPE(depth, dcn));
   7772             dst = _dst.getMat();
   7773 
   7774 #if defined HAVE_IPP && 0
   7775             CV_IPP_CHECK()
   7776             {
   7777                 if (code == CV_YUV2RGB && dcn == 3 && depth == CV_8U)
   7778                 {
   7779                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R)))
   7780                     {
   7781                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7782                         return;
   7783                     }
   7784                     setIppErrorStatus();
   7785                 }
   7786                 else if (code == CV_YUV2BGR && dcn == 3 && depth == CV_8U)
   7787                 {
   7788                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
   7789                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)))
   7790                     {
   7791                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7792                         return;
   7793                     }
   7794                     setIppErrorStatus();
   7795                 }
   7796                 else if (code == CV_YUV2RGB && dcn == 4 && depth == CV_8U)
   7797                 {
   7798                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
   7799                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)))
   7800                     {
   7801                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7802                         return;
   7803                     }
   7804                     setIppErrorStatus();
   7805                 }
   7806                 else if (code == CV_YUV2BGR && dcn == 4 && depth == CV_8U)
   7807                 {
   7808                     if (CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiYUVToRGB_8u_C3R,
   7809                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)))
   7810                     {
   7811                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7812                         return;
   7813                     }
   7814                     setIppErrorStatus();
   7815                 }
   7816             }
   7817 #endif
   7818 
   7819             if( depth == CV_8U )
   7820                 CvtColorLoop(src, dst, YCrCb2RGB_i<uchar>(dcn, bidx, coeffs_i));
   7821             else if( depth == CV_16U )
   7822                 CvtColorLoop(src, dst, YCrCb2RGB_i<ushort>(dcn, bidx, coeffs_i));
   7823             else
   7824                 CvtColorLoop(src, dst, YCrCb2RGB_f<float>(dcn, bidx, coeffs_f));
   7825             }
   7826             break;
   7827 
   7828         case CV_BGR2XYZ: case CV_RGB2XYZ:
   7829             CV_Assert( scn == 3 || scn == 4 );
   7830             bidx = code == CV_BGR2XYZ ? 0 : 2;
   7831 
   7832             _dst.create(sz, CV_MAKETYPE(depth, 3));
   7833             dst = _dst.getMat();
   7834 
   7835 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
   7836             CV_IPP_CHECK()
   7837             {
   7838                 if( code == CV_BGR2XYZ && scn == 3 && depth != CV_32F )
   7839                 {
   7840                     if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2XYZTab[depth], 2, 1, 0, depth)) )
   7841                     {
   7842                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7843                         return;
   7844                     }
   7845                     setIppErrorStatus();
   7846                 }
   7847                 else if( code == CV_BGR2XYZ && scn == 4 && depth != CV_32F )
   7848                 {
   7849                     if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2XYZTab[depth], 2, 1, 0, depth)) )
   7850                     {
   7851                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7852                         return;
   7853                     }
   7854                     setIppErrorStatus();
   7855                 }
   7856                 else if( code == CV_RGB2XYZ && scn == 3 && depth != CV_32F )
   7857                 {
   7858                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2XYZTab[depth])) )
   7859                     {
   7860                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7861                         return;
   7862                     }
   7863                     setIppErrorStatus();
   7864                 }
   7865                 else if( code == CV_RGB2XYZ && scn == 4 && depth != CV_32F )
   7866                 {
   7867                     if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2XYZTab[depth], 0, 1, 2, depth)) )
   7868                     {
   7869                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7870                         return;
   7871                     }
   7872                     setIppErrorStatus();
   7873                 }
   7874             }
   7875 #endif
   7876 
   7877             if( depth == CV_8U )
   7878                 CvtColorLoop(src, dst, RGB2XYZ_i<uchar>(scn, bidx, 0));
   7879             else if( depth == CV_16U )
   7880                 CvtColorLoop(src, dst, RGB2XYZ_i<ushort>(scn, bidx, 0));
   7881             else
   7882                 CvtColorLoop(src, dst, RGB2XYZ_f<float>(scn, bidx, 0));
   7883             break;
   7884 
   7885         case CV_XYZ2BGR: case CV_XYZ2RGB:
   7886             if( dcn <= 0 ) dcn = 3;
   7887             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) );
   7888             bidx = code == CV_XYZ2BGR ? 0 : 2;
   7889 
   7890             _dst.create(sz, CV_MAKETYPE(depth, dcn));
   7891             dst = _dst.getMat();
   7892 
   7893 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
   7894             CV_IPP_CHECK()
   7895             {
   7896                 if( code == CV_XYZ2BGR && dcn == 3 && depth != CV_32F )
   7897                 {
   7898                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
   7899                     {
   7900                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7901                         return;
   7902                     }
   7903                     setIppErrorStatus();
   7904                 }
   7905                 else if( code == CV_XYZ2BGR && dcn == 4 && depth != CV_32F )
   7906                 {
   7907                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
   7908                     {
   7909                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7910                         return;
   7911                     }
   7912                     setIppErrorStatus();
   7913                 }
   7914                 if( code == CV_XYZ2RGB && dcn == 3 && depth != CV_32F )
   7915                 {
   7916                     if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiXYZ2RGBTab[depth])) )
   7917                     {
   7918                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7919                         return;
   7920                     }
   7921                     setIppErrorStatus();
   7922                 }
   7923                 else if( code == CV_XYZ2RGB && dcn == 4 && depth != CV_32F )
   7924                 {
   7925                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiXYZ2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
   7926                     {
   7927                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7928                         return;
   7929                     }
   7930                     setIppErrorStatus();
   7931                 }
   7932             }
   7933 #endif
   7934 
   7935             if( depth == CV_8U )
   7936                 CvtColorLoop(src, dst, XYZ2RGB_i<uchar>(dcn, bidx, 0));
   7937             else if( depth == CV_16U )
   7938                 CvtColorLoop(src, dst, XYZ2RGB_i<ushort>(dcn, bidx, 0));
   7939             else
   7940                 CvtColorLoop(src, dst, XYZ2RGB_f<float>(dcn, bidx, 0));
   7941             break;
   7942 
   7943         case CV_BGR2HSV: case CV_RGB2HSV: case CV_BGR2HSV_FULL: case CV_RGB2HSV_FULL:
   7944         case CV_BGR2HLS: case CV_RGB2HLS: case CV_BGR2HLS_FULL: case CV_RGB2HLS_FULL:
   7945             {
   7946             CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
   7947             bidx = code == CV_BGR2HSV || code == CV_BGR2HLS ||
   7948                 code == CV_BGR2HSV_FULL || code == CV_BGR2HLS_FULL ? 0 : 2;
   7949             int hrange = depth == CV_32F ? 360 : code == CV_BGR2HSV || code == CV_RGB2HSV ||
   7950                 code == CV_BGR2HLS || code == CV_RGB2HLS ? 180 : 256;
   7951 
   7952             _dst.create(sz, CV_MAKETYPE(depth, 3));
   7953             dst = _dst.getMat();
   7954 
   7955 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
   7956             CV_IPP_CHECK()
   7957             {
   7958                 if( depth == CV_8U || depth == CV_16U )
   7959                 {
   7960 #if 0 // breaks OCL accuracy tests
   7961                     if( code == CV_BGR2HSV_FULL && scn == 3 )
   7962                     {
   7963                         if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
   7964                         {
   7965                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7966                             return;
   7967                         }
   7968                         setIppErrorStatus();
   7969                     }
   7970                     else if( code == CV_BGR2HSV_FULL && scn == 4 )
   7971                     {
   7972                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 2, 1, 0, depth)) )
   7973                         {
   7974                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7975                             return;
   7976                         }
   7977                         setIppErrorStatus();
   7978                     }
   7979                     else if( code == CV_RGB2HSV_FULL && scn == 4 )
   7980                     {
   7981                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HSVTab[depth], 0, 1, 2, depth)) )
   7982                         {
   7983                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7984                             return;
   7985                         }
   7986                         setIppErrorStatus();
   7987                     } else
   7988 #endif
   7989                     if( code == CV_RGB2HSV_FULL && scn == 3 && depth == CV_16U )
   7990                     {
   7991                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2HSVTab[depth])) )
   7992                         {
   7993                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   7994                             return;
   7995                         }
   7996                         setIppErrorStatus();
   7997                     }
   7998                     else if( code == CV_BGR2HLS_FULL && scn == 3 )
   7999                     {
   8000                         if( CvtColorIPPLoopCopy(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
   8001                         {
   8002                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8003                             return;
   8004                         }
   8005                         setIppErrorStatus();
   8006                     }
   8007                     else if( code == CV_BGR2HLS_FULL && scn == 4 )
   8008                     {
   8009                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 2, 1, 0, depth)) )
   8010                         {
   8011                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8012                             return;
   8013                         }
   8014                         setIppErrorStatus();
   8015                     }
   8016                     else if( code == CV_RGB2HLS_FULL && scn == 3 )
   8017                     {
   8018                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiRGB2HLSTab[depth])) )
   8019                         {
   8020                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8021                             return;
   8022                         }
   8023                         setIppErrorStatus();
   8024                     }
   8025                     else if( code == CV_RGB2HLS_FULL && scn == 4 )
   8026                     {
   8027                         if( CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth], ippiRGB2HLSTab[depth], 0, 1, 2, depth)) )
   8028                         {
   8029                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8030                             return;
   8031                         }
   8032                         setIppErrorStatus();
   8033                     }
   8034                 }
   8035             }
   8036 #endif
   8037 
   8038             if( code == CV_BGR2HSV || code == CV_RGB2HSV ||
   8039                 code == CV_BGR2HSV_FULL || code == CV_RGB2HSV_FULL )
   8040             {
   8041 #ifdef HAVE_TEGRA_OPTIMIZATION
   8042                 if(tegra::useTegra() && tegra::cvtRGB2HSV(src, dst, bidx, hrange))
   8043                     break;
   8044 #endif
   8045                 if( depth == CV_8U )
   8046                     CvtColorLoop(src, dst, RGB2HSV_b(scn, bidx, hrange));
   8047                 else
   8048                     CvtColorLoop(src, dst, RGB2HSV_f(scn, bidx, (float)hrange));
   8049             }
   8050             else
   8051             {
   8052                 if( depth == CV_8U )
   8053                     CvtColorLoop(src, dst, RGB2HLS_b(scn, bidx, hrange));
   8054                 else
   8055                     CvtColorLoop(src, dst, RGB2HLS_f(scn, bidx, (float)hrange));
   8056             }
   8057             }
   8058             break;
   8059 
   8060         case CV_HSV2BGR: case CV_HSV2RGB: case CV_HSV2BGR_FULL: case CV_HSV2RGB_FULL:
   8061         case CV_HLS2BGR: case CV_HLS2RGB: case CV_HLS2BGR_FULL: case CV_HLS2RGB_FULL:
   8062             {
   8063             if( dcn <= 0 ) dcn = 3;
   8064             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
   8065             bidx = code == CV_HSV2BGR || code == CV_HLS2BGR ||
   8066                 code == CV_HSV2BGR_FULL || code == CV_HLS2BGR_FULL ? 0 : 2;
   8067             int hrange = depth == CV_32F ? 360 : code == CV_HSV2BGR || code == CV_HSV2RGB ||
   8068                 code == CV_HLS2BGR || code == CV_HLS2RGB ? 180 : 255;
   8069 
   8070             _dst.create(sz, CV_MAKETYPE(depth, dcn));
   8071             dst = _dst.getMat();
   8072 
   8073 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
   8074             CV_IPP_CHECK()
   8075             {
   8076                 if( depth == CV_8U || depth == CV_16U )
   8077                 {
   8078                     if( code == CV_HSV2BGR_FULL && dcn == 3 )
   8079                     {
   8080                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
   8081                         {
   8082                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8083                             return;
   8084                         }
   8085                         setIppErrorStatus();
   8086                     }
   8087                     else if( code == CV_HSV2BGR_FULL && dcn == 4 )
   8088                     {
   8089                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
   8090                         {
   8091                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8092                             return;
   8093                         }
   8094                         setIppErrorStatus();
   8095                     }
   8096                     else if( code == CV_HSV2RGB_FULL && dcn == 3 )
   8097                     {
   8098                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiHSV2RGBTab[depth])) )
   8099                         {
   8100                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8101                             return;
   8102                         }
   8103                         setIppErrorStatus();
   8104                     }
   8105                     else if( code == CV_HSV2RGB_FULL && dcn == 4 )
   8106                     {
   8107                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHSV2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
   8108                         {
   8109                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8110                             return;
   8111                         }
   8112                         setIppErrorStatus();
   8113                     }
   8114                     else if( code == CV_HLS2BGR_FULL && dcn == 3 )
   8115                     {
   8116                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
   8117                         {
   8118                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8119                             return;
   8120                         }
   8121                         setIppErrorStatus();
   8122                     }
   8123                     else if( code == CV_HLS2BGR_FULL && dcn == 4 )
   8124                     {
   8125                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
   8126                         {
   8127                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8128                             return;
   8129                         }
   8130                         setIppErrorStatus();
   8131                     }
   8132                     else if( code == CV_HLS2RGB_FULL && dcn == 3 )
   8133                     {
   8134                         if( CvtColorIPPLoopCopy(src, dst, IPPGeneralFunctor(ippiHLS2RGBTab[depth])) )
   8135                         {
   8136                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8137                             return;
   8138                         }
   8139                         setIppErrorStatus();
   8140                     }
   8141                     else if( code == CV_HLS2RGB_FULL && dcn == 4 )
   8142                     {
   8143                         if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiHLS2RGBTab[depth], ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
   8144                         {
   8145                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8146                             return;
   8147                         }
   8148                         setIppErrorStatus();
   8149                     }
   8150                 }
   8151             }
   8152 #endif
   8153 
   8154             if( code == CV_HSV2BGR || code == CV_HSV2RGB ||
   8155                 code == CV_HSV2BGR_FULL || code == CV_HSV2RGB_FULL )
   8156             {
   8157                 if( depth == CV_8U )
   8158                     CvtColorLoop(src, dst, HSV2RGB_b(dcn, bidx, hrange));
   8159                 else
   8160                     CvtColorLoop(src, dst, HSV2RGB_f(dcn, bidx, (float)hrange));
   8161             }
   8162             else
   8163             {
   8164                 if( depth == CV_8U )
   8165                     CvtColorLoop(src, dst, HLS2RGB_b(dcn, bidx, hrange));
   8166                 else
   8167                     CvtColorLoop(src, dst, HLS2RGB_f(dcn, bidx, (float)hrange));
   8168             }
   8169             }
   8170             break;
   8171 
   8172         case CV_BGR2Lab: case CV_RGB2Lab: case CV_LBGR2Lab: case CV_LRGB2Lab:
   8173         case CV_BGR2Luv: case CV_RGB2Luv: case CV_LBGR2Luv: case CV_LRGB2Luv:
   8174             {
   8175             CV_Assert( (scn == 3 || scn == 4) && (depth == CV_8U || depth == CV_32F) );
   8176             bidx = code == CV_BGR2Lab || code == CV_BGR2Luv ||
   8177                    code == CV_LBGR2Lab || code == CV_LBGR2Luv ? 0 : 2;
   8178             bool srgb = code == CV_BGR2Lab || code == CV_RGB2Lab ||
   8179                         code == CV_BGR2Luv || code == CV_RGB2Luv;
   8180 
   8181             _dst.create(sz, CV_MAKETYPE(depth, 3));
   8182             dst = _dst.getMat();
   8183 
   8184 #if defined HAVE_IPP && 0
   8185             CV_IPP_CHECK()
   8186             {
   8187                 if (code == CV_LBGR2Lab && scn == 3 && depth == CV_8U)
   8188                 {
   8189                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiBGRToLab_8u_C3R)))
   8190                     {
   8191                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8192                         return;
   8193                     }
   8194                     setIppErrorStatus();
   8195                 }
   8196                 else if (code == CV_LBGR2Lab && scn == 4 && depth == CV_8U)
   8197                 {
   8198                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
   8199                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 0, 1, 2, depth)))
   8200                     {
   8201                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8202                         return;
   8203                     }
   8204                     setIppErrorStatus();
   8205                 }
   8206                 else
   8207                 if (code == CV_LRGB2Lab && scn == 3 && depth == CV_8U) // slower than OpenCV
   8208                 {
   8209                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
   8210                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth)))
   8211                     {
   8212                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8213                         return;
   8214                     }
   8215                     setIppErrorStatus();
   8216                 }
   8217                 else if (code == CV_LRGB2Lab && scn == 4 && depth == CV_8U) // slower than OpenCV
   8218                 {
   8219                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
   8220                                                                            (ippiGeneralFunc)ippiBGRToLab_8u_C3R, 2, 1, 0, depth)))
   8221                     {
   8222                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8223                         return;
   8224                     }
   8225                     setIppErrorStatus();
   8226                 }
   8227                 else if (code == CV_LRGB2Luv && scn == 3)
   8228                 {
   8229                     if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiRGBToLUVTab[depth])))
   8230                     {
   8231                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8232                         return;
   8233                     }
   8234                     setIppErrorStatus();
   8235                 }
   8236                 else if (code == CV_LRGB2Luv && scn == 4)
   8237                 {
   8238                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
   8239                                                                            ippiRGBToLUVTab[depth], 0, 1, 2, depth)))
   8240                     {
   8241                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8242                         return;
   8243                     }
   8244                     setIppErrorStatus();
   8245                 }
   8246                 else if (code == CV_LBGR2Luv && scn == 3)
   8247                 {
   8248                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC3RTab[depth],
   8249                                                                            ippiRGBToLUVTab[depth], 2, 1, 0, depth)))
   8250                     {
   8251                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8252                         return;
   8253                     }
   8254                     setIppErrorStatus();
   8255                 }
   8256                 else if (code == CV_LBGR2Luv && scn == 4)
   8257                 {
   8258                     if (CvtColorIPPLoop(src, dst, IPPReorderGeneralFunctor(ippiSwapChannelsC4C3RTab[depth],
   8259                                                                            ippiRGBToLUVTab[depth], 2, 1, 0, depth)))
   8260                     {
   8261                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8262                         return;
   8263                     }
   8264                     setIppErrorStatus();
   8265                 }
   8266             }
   8267 #endif
   8268 
   8269             if( code == CV_BGR2Lab || code == CV_RGB2Lab ||
   8270                 code == CV_LBGR2Lab || code == CV_LRGB2Lab )
   8271             {
   8272                 if( depth == CV_8U )
   8273                     CvtColorLoop(src, dst, RGB2Lab_b(scn, bidx, 0, 0, srgb));
   8274                 else
   8275                     CvtColorLoop(src, dst, RGB2Lab_f(scn, bidx, 0, 0, srgb));
   8276             }
   8277             else
   8278             {
   8279                 if( depth == CV_8U )
   8280                     CvtColorLoop(src, dst, RGB2Luv_b(scn, bidx, 0, 0, srgb));
   8281                 else
   8282                     CvtColorLoop(src, dst, RGB2Luv_f(scn, bidx, 0, 0, srgb));
   8283             }
   8284             }
   8285             break;
   8286 
   8287         case CV_Lab2BGR: case CV_Lab2RGB: case CV_Lab2LBGR: case CV_Lab2LRGB:
   8288         case CV_Luv2BGR: case CV_Luv2RGB: case CV_Luv2LBGR: case CV_Luv2LRGB:
   8289             {
   8290             if( dcn <= 0 ) dcn = 3;
   8291             CV_Assert( scn == 3 && (dcn == 3 || dcn == 4) && (depth == CV_8U || depth == CV_32F) );
   8292             bidx = code == CV_Lab2BGR || code == CV_Luv2BGR ||
   8293                    code == CV_Lab2LBGR || code == CV_Luv2LBGR ? 0 : 2;
   8294             bool srgb = code == CV_Lab2BGR || code == CV_Lab2RGB ||
   8295                     code == CV_Luv2BGR || code == CV_Luv2RGB;
   8296 
   8297             _dst.create(sz, CV_MAKETYPE(depth, dcn));
   8298             dst = _dst.getMat();
   8299 
   8300 #if defined HAVE_IPP && 0
   8301             CV_IPP_CHECK()
   8302             {
   8303                 if( code == CV_Lab2LBGR && dcn == 3 && depth == CV_8U)
   8304                 {
   8305                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R)) )
   8306                     {
   8307                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8308                         return;
   8309                     }
   8310                     setIppErrorStatus();
   8311                 }
   8312                 else if( code == CV_Lab2LBGR && dcn == 4 && depth == CV_8U )
   8313                 {
   8314                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
   8315                                         ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
   8316                     {
   8317                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8318                         return;
   8319                     }
   8320                     setIppErrorStatus();
   8321                 }
   8322                 if( code == CV_Lab2LRGB && dcn == 3 && depth == CV_8U )
   8323                 {
   8324                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
   8325                                                                                ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
   8326                     {
   8327                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8328                         return;
   8329                     }
   8330                     setIppErrorStatus();
   8331                 }
   8332                 else if( code == CV_Lab2LRGB && dcn == 4 && depth == CV_8U )
   8333                 {
   8334                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor((ippiGeneralFunc)ippiLabToBGR_8u_C3R,
   8335                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
   8336                     {
   8337                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8338                         return;
   8339                     }
   8340                     setIppErrorStatus();
   8341                 }
   8342                 if( code == CV_Luv2LRGB && dcn == 3 )
   8343                 {
   8344                     if( CvtColorIPPLoop(src, dst, IPPGeneralFunctor(ippiLUVToRGBTab[depth])) )
   8345                         return;
   8346                 }
   8347                 else if( code == CV_Luv2LRGB && dcn == 4 )
   8348                 {
   8349                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
   8350                                                                            ippiSwapChannelsC3C4RTab[depth], 0, 1, 2, depth)) )
   8351                     {
   8352                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8353                         return;
   8354                     }
   8355                 }
   8356                 if( code == CV_Luv2LBGR && dcn == 3 )
   8357                 {
   8358                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
   8359                                                                            ippiSwapChannelsC3RTab[depth], 2, 1, 0, depth)) )
   8360                     {
   8361                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8362                         return;
   8363                     }
   8364                 }
   8365                 else if( code == CV_Luv2LBGR && dcn == 4 )
   8366                 {
   8367                     if( CvtColorIPPLoop(src, dst, IPPGeneralReorderFunctor(ippiLUVToRGBTab[depth],
   8368                                                                            ippiSwapChannelsC3C4RTab[depth], 2, 1, 0, depth)) )
   8369                     {
   8370                         CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8371                         return;
   8372                     }
   8373                 }
   8374             }
   8375 #endif
   8376 
   8377             if( code == CV_Lab2BGR || code == CV_Lab2RGB ||
   8378                 code == CV_Lab2LBGR || code == CV_Lab2LRGB )
   8379             {
   8380                 if( depth == CV_8U )
   8381                     CvtColorLoop(src, dst, Lab2RGB_b(dcn, bidx, 0, 0, srgb));
   8382                 else
   8383                     CvtColorLoop(src, dst, Lab2RGB_f(dcn, bidx, 0, 0, srgb));
   8384             }
   8385             else
   8386             {
   8387                 if( depth == CV_8U )
   8388                     CvtColorLoop(src, dst, Luv2RGB_b(dcn, bidx, 0, 0, srgb));
   8389                 else
   8390                     CvtColorLoop(src, dst, Luv2RGB_f(dcn, bidx, 0, 0, srgb));
   8391             }
   8392             }
   8393             break;
   8394 
   8395         case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
   8396         case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
   8397         case CV_BayerBG2BGR_VNG: case CV_BayerGB2BGR_VNG: case CV_BayerRG2BGR_VNG: case CV_BayerGR2BGR_VNG:
   8398         case CV_BayerBG2BGR_EA: case CV_BayerGB2BGR_EA: case CV_BayerRG2BGR_EA: case CV_BayerGR2BGR_EA:
   8399             demosaicing(src, _dst, code, dcn);
   8400             break;
   8401 
   8402         case CV_YUV2BGR_NV21:  case CV_YUV2RGB_NV21:  case CV_YUV2BGR_NV12:  case CV_YUV2RGB_NV12:
   8403         case CV_YUV2BGRA_NV21: case CV_YUV2RGBA_NV21: case CV_YUV2BGRA_NV12: case CV_YUV2RGBA_NV12:
   8404             {
   8405                 // http://www.fourcc.org/yuv.php#NV21 == yuv420sp -> a plane of 8 bit Y samples followed by an interleaved V/U plane containing 8 bit 2x2 subsampled chroma samples
   8406                 // http://www.fourcc.org/yuv.php#NV12 -> a plane of 8 bit Y samples followed by an interleaved U/V plane containing 8 bit 2x2 subsampled colour difference samples
   8407 
   8408                 if (dcn <= 0) dcn = (code==CV_YUV420sp2BGRA || code==CV_YUV420sp2RGBA || code==CV_YUV2BGRA_NV12 || code==CV_YUV2RGBA_NV12) ? 4 : 3;
   8409                 const int bIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2BGR_NV12 || code==CV_YUV2BGRA_NV12) ? 0 : 2;
   8410                 const int uIdx = (code==CV_YUV2BGR_NV21 || code==CV_YUV2BGRA_NV21 || code==CV_YUV2RGB_NV21 || code==CV_YUV2RGBA_NV21) ? 1 : 0;
   8411 
   8412                 CV_Assert( dcn == 3 || dcn == 4 );
   8413                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
   8414 
   8415                 Size dstSz(sz.width, sz.height * 2 / 3);
   8416                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
   8417                 dst = _dst.getMat();
   8418 
   8419                 int srcstep = (int)src.step;
   8420                 const uchar* y = src.ptr();
   8421                 const uchar* uv = y + srcstep * dstSz.height;
   8422 
   8423                 switch(dcn*100 + bIdx * 10 + uIdx)
   8424                 {
   8425                     case 300: cvtYUV420sp2RGB<0, 0> (dst, srcstep, y, uv); break;
   8426                     case 301: cvtYUV420sp2RGB<0, 1> (dst, srcstep, y, uv); break;
   8427                     case 320: cvtYUV420sp2RGB<2, 0> (dst, srcstep, y, uv); break;
   8428                     case 321: cvtYUV420sp2RGB<2, 1> (dst, srcstep, y, uv); break;
   8429                     case 400: cvtYUV420sp2RGBA<0, 0>(dst, srcstep, y, uv); break;
   8430                     case 401: cvtYUV420sp2RGBA<0, 1>(dst, srcstep, y, uv); break;
   8431                     case 420: cvtYUV420sp2RGBA<2, 0>(dst, srcstep, y, uv); break;
   8432                     case 421: cvtYUV420sp2RGBA<2, 1>(dst, srcstep, y, uv); break;
   8433                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
   8434                 };
   8435             }
   8436             break;
   8437         case CV_YUV2BGR_YV12: case CV_YUV2RGB_YV12: case CV_YUV2BGRA_YV12: case CV_YUV2RGBA_YV12:
   8438         case CV_YUV2BGR_IYUV: case CV_YUV2RGB_IYUV: case CV_YUV2BGRA_IYUV: case CV_YUV2RGBA_IYUV:
   8439             {
   8440                 //http://www.fourcc.org/yuv.php#YV12 == yuv420p -> It comprises an NxM Y plane followed by (N/2)x(M/2) V and U planes.
   8441                 //http://www.fourcc.org/yuv.php#IYUV == I420 -> It comprises an NxN Y plane followed by (N/2)x(N/2) U and V planes
   8442 
   8443                 if (dcn <= 0) dcn = (code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12 || code==CV_YUV2RGBA_IYUV || code==CV_YUV2BGRA_IYUV) ? 4 : 3;
   8444                 const int bIdx = (code==CV_YUV2BGR_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2BGR_IYUV || code==CV_YUV2BGRA_IYUV) ? 0 : 2;
   8445                 const int uIdx  = (code==CV_YUV2BGR_YV12 || code==CV_YUV2RGB_YV12 || code==CV_YUV2BGRA_YV12 || code==CV_YUV2RGBA_YV12) ? 1 : 0;
   8446 
   8447                 CV_Assert( dcn == 3 || dcn == 4 );
   8448                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
   8449 
   8450                 Size dstSz(sz.width, sz.height * 2 / 3);
   8451                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
   8452                 dst = _dst.getMat();
   8453 
   8454                 int srcstep = (int)src.step;
   8455                 const uchar* y = src.ptr();
   8456                 const uchar* u = y + srcstep * dstSz.height;
   8457                 const uchar* v = y + srcstep * (dstSz.height + dstSz.height/4) + (dstSz.width/2) * ((dstSz.height % 4)/2);
   8458 
   8459                 int ustepIdx = 0;
   8460                 int vstepIdx = dstSz.height % 4 == 2 ? 1 : 0;
   8461 
   8462                 if(uIdx == 1) { std::swap(u ,v), std::swap(ustepIdx, vstepIdx); }
   8463 
   8464                 switch(dcn*10 + bIdx)
   8465                 {
   8466                     case 30: cvtYUV420p2RGB<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
   8467                     case 32: cvtYUV420p2RGB<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
   8468                     case 40: cvtYUV420p2RGBA<0>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
   8469                     case 42: cvtYUV420p2RGBA<2>(dst, srcstep, y, u, v, ustepIdx, vstepIdx); break;
   8470                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
   8471                 };
   8472             }
   8473             break;
   8474         case CV_YUV2GRAY_420:
   8475             {
   8476                 if (dcn <= 0) dcn = 1;
   8477 
   8478                 CV_Assert( dcn == 1 );
   8479                 CV_Assert( sz.width % 2 == 0 && sz.height % 3 == 0 && depth == CV_8U );
   8480 
   8481                 Size dstSz(sz.width, sz.height * 2 / 3);
   8482                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
   8483                 dst = _dst.getMat();
   8484 #if defined HAVE_IPP
   8485                 CV_IPP_CHECK()
   8486                 {
   8487                     if (ippStsNoErr == ippiCopy_8u_C1R(src.data, (int)src.step, dst.data, (int)dst.step,
   8488                             ippiSize(dstSz.width, dstSz.height)))
   8489                     {
   8490                         CV_IMPL_ADD(CV_IMPL_IPP);
   8491                         return;
   8492                     }
   8493                     setIppErrorStatus();
   8494                 }
   8495 #endif
   8496                 src(Range(0, dstSz.height), Range::all()).copyTo(dst);
   8497             }
   8498             break;
   8499         case CV_RGB2YUV_YV12: case CV_BGR2YUV_YV12: case CV_RGBA2YUV_YV12: case CV_BGRA2YUV_YV12:
   8500         case CV_RGB2YUV_IYUV: case CV_BGR2YUV_IYUV: case CV_RGBA2YUV_IYUV: case CV_BGRA2YUV_IYUV:
   8501             {
   8502                 if (dcn <= 0) dcn = 1;
   8503                 const int bIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_BGR2YUV_YV12 || code == CV_BGRA2YUV_YV12) ? 0 : 2;
   8504                 const int uIdx = (code == CV_BGR2YUV_IYUV || code == CV_BGRA2YUV_IYUV || code == CV_RGB2YUV_IYUV || code == CV_RGBA2YUV_IYUV) ? 1 : 2;
   8505 
   8506                 CV_Assert( (scn == 3 || scn == 4) && depth == CV_8U );
   8507                 CV_Assert( dcn == 1 );
   8508                 CV_Assert( sz.width % 2 == 0 && sz.height % 2 == 0 );
   8509 
   8510                 Size dstSz(sz.width, sz.height / 2 * 3);
   8511                 _dst.create(dstSz, CV_MAKETYPE(depth, dcn));
   8512                 dst = _dst.getMat();
   8513 
   8514                 switch(bIdx + uIdx*10)
   8515                 {
   8516                     case 10: cvtRGBtoYUV420p<0, 1>(src, dst); break;
   8517                     case 12: cvtRGBtoYUV420p<2, 1>(src, dst); break;
   8518                     case 20: cvtRGBtoYUV420p<0, 2>(src, dst); break;
   8519                     case 22: cvtRGBtoYUV420p<2, 2>(src, dst); break;
   8520                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
   8521                 };
   8522             }
   8523             break;
   8524         case CV_YUV2RGB_UYVY: case CV_YUV2BGR_UYVY: case CV_YUV2RGBA_UYVY: case CV_YUV2BGRA_UYVY:
   8525         case CV_YUV2RGB_YUY2: case CV_YUV2BGR_YUY2: case CV_YUV2RGB_YVYU: case CV_YUV2BGR_YVYU:
   8526         case CV_YUV2RGBA_YUY2: case CV_YUV2BGRA_YUY2: case CV_YUV2RGBA_YVYU: case CV_YUV2BGRA_YVYU:
   8527             {
   8528                 //http://www.fourcc.org/yuv.php#UYVY
   8529                 //http://www.fourcc.org/yuv.php#YUY2
   8530                 //http://www.fourcc.org/yuv.php#YVYU
   8531 
   8532                 if (dcn <= 0) dcn = (code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2RGBA_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 4 : 3;
   8533                 const int bIdx = (code==CV_YUV2BGR_UYVY || code==CV_YUV2BGRA_UYVY || code==CV_YUV2BGR_YUY2 || code==CV_YUV2BGRA_YUY2 || code==CV_YUV2BGR_YVYU || code==CV_YUV2BGRA_YVYU) ? 0 : 2;
   8534                 const int ycn  = (code==CV_YUV2RGB_UYVY || code==CV_YUV2BGR_UYVY || code==CV_YUV2RGBA_UYVY || code==CV_YUV2BGRA_UYVY) ? 1 : 0;
   8535                 const int uIdx = (code==CV_YUV2RGB_YVYU || code==CV_YUV2BGR_YVYU || code==CV_YUV2RGBA_YVYU || code==CV_YUV2BGRA_YVYU) ? 1 : 0;
   8536 
   8537                 CV_Assert( dcn == 3 || dcn == 4 );
   8538                 CV_Assert( scn == 2 && depth == CV_8U );
   8539 
   8540                 _dst.create(sz, CV_8UC(dcn));
   8541                 dst = _dst.getMat();
   8542 
   8543                 switch(dcn*1000 + bIdx*100 + uIdx*10 + ycn)
   8544                 {
   8545                     case 3000: cvtYUV422toRGB<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
   8546                     case 3001: cvtYUV422toRGB<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
   8547                     case 3010: cvtYUV422toRGB<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
   8548                     case 3011: cvtYUV422toRGB<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
   8549                     case 3200: cvtYUV422toRGB<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
   8550                     case 3201: cvtYUV422toRGB<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
   8551                     case 3210: cvtYUV422toRGB<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
   8552                     case 3211: cvtYUV422toRGB<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
   8553                     case 4000: cvtYUV422toRGBA<0,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
   8554                     case 4001: cvtYUV422toRGBA<0,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
   8555                     case 4010: cvtYUV422toRGBA<0,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
   8556                     case 4011: cvtYUV422toRGBA<0,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
   8557                     case 4200: cvtYUV422toRGBA<2,0,0>(dst, (int)src.step, src.ptr<uchar>()); break;
   8558                     case 4201: cvtYUV422toRGBA<2,0,1>(dst, (int)src.step, src.ptr<uchar>()); break;
   8559                     case 4210: cvtYUV422toRGBA<2,1,0>(dst, (int)src.step, src.ptr<uchar>()); break;
   8560                     case 4211: cvtYUV422toRGBA<2,1,1>(dst, (int)src.step, src.ptr<uchar>()); break;
   8561                     default: CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" ); break;
   8562                 };
   8563             }
   8564             break;
   8565         case CV_YUV2GRAY_UYVY: case CV_YUV2GRAY_YUY2:
   8566             {
   8567                 if (dcn <= 0) dcn = 1;
   8568 
   8569                 CV_Assert( dcn == 1 );
   8570                 CV_Assert( scn == 2 && depth == CV_8U );
   8571 
   8572                 extractChannel(_src, _dst, code == CV_YUV2GRAY_UYVY ? 1 : 0);
   8573             }
   8574             break;
   8575         case CV_RGBA2mRGBA:
   8576             {
   8577                 if (dcn <= 0) dcn = 4;
   8578                 CV_Assert( scn == 4 && dcn == 4 );
   8579 
   8580                 _dst.create(sz, CV_MAKETYPE(depth, dcn));
   8581                 dst = _dst.getMat();
   8582 
   8583                 if( depth == CV_8U )
   8584                 {
   8585 #if defined(HAVE_IPP)
   8586                     CV_IPP_CHECK()
   8587                     {
   8588                         if (CvtColorIPPLoop(src, dst, IPPGeneralFunctor((ippiGeneralFunc)ippiAlphaPremul_8u_AC4R)))
   8589                         {
   8590                             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   8591                             return;
   8592                         }
   8593                         setIppErrorStatus();
   8594                     }
   8595 #endif
   8596                     CvtColorLoop(src, dst, RGBA2mRGBA<uchar>());
   8597                 }
   8598                 else
   8599                 {
   8600                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
   8601                 }
   8602             }
   8603             break;
   8604         case CV_mRGBA2RGBA:
   8605             {
   8606                 if (dcn <= 0) dcn = 4;
   8607                 CV_Assert( scn == 4 && dcn == 4 );
   8608 
   8609                 _dst.create(sz, CV_MAKETYPE(depth, dcn));
   8610                 dst = _dst.getMat();
   8611 
   8612                 if( depth == CV_8U )
   8613                     CvtColorLoop(src, dst, mRGBA2RGBA<uchar>());
   8614                 else
   8615                 {
   8616                     CV_Error( CV_StsBadArg, "Unsupported image depth" );
   8617                 }
   8618             }
   8619             break;
   8620         default:
   8621             CV_Error( CV_StsBadFlag, "Unknown/unsupported color conversion code" );
   8622     }
   8623 }
   8624 
   8625 CV_IMPL void
   8626 cvCvtColor( const CvArr* srcarr, CvArr* dstarr, int code )
   8627 {
   8628     cv::Mat src = cv::cvarrToMat(srcarr), dst0 = cv::cvarrToMat(dstarr), dst = dst0;
   8629     CV_Assert( src.depth() == dst.depth() );
   8630 
   8631     cv::cvtColor(src, dst, code, dst.channels());
   8632     CV_Assert( dst.data == dst0.data );
   8633 }
   8634 
   8635 
   8636 /* End of file. */
   8637