Home | History | Annotate | Download | only in src
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
     15 // Copyright (C) 2014, Itseez Inc., all rights reserved.
     16 // Third party copyrights are property of their respective owners.
     17 //
     18 // Redistribution and use in source and binary forms, with or without modification,
     19 // are permitted provided that the following conditions are met:
     20 //
     21 //   * Redistribution's of source code must retain the above copyright notice,
     22 //     this list of conditions and the following disclaimer.
     23 //
     24 //   * Redistribution's in binary form must reproduce the above copyright notice,
     25 //     this list of conditions and the following disclaimer in the documentation
     26 //     and/or other materials provided with the distribution.
     27 //
     28 //   * The name of the copyright holders may not be used to endorse or promote products
     29 //     derived from this software without specific prior written permission.
     30 //
     31 // This software is provided by the copyright holders and contributors "as is" and
     32 // any express or implied warranties, including, but not limited to, the implied
     33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     34 // In no event shall the Intel Corporation or contributors be liable for any direct,
     35 // indirect, incidental, special, exemplary, or consequential damages
     36 // (including, but not limited to, procurement of substitute goods or services;
     37 // loss of use, data, or profits; or business interruption) however caused
     38 // and on any theory of liability, whether in contract, strict liability,
     39 // or tort (including negligence or otherwise) arising in any way out of
     40 // the use of this software, even if advised of the possibility of such damage.
     41 //
     42 //M*/
     43 
     44 #include "precomp.hpp"
     45 
     46 #include <limits>
     47 
     48 #define  CV_DESCALE(x,n)     (((x) + (1 << ((n)-1))) >> (n))
     49 
     50 namespace cv
     51 {
     52 
     53 
     54 //////////////////////////// Bayer Pattern -> RGB conversion /////////////////////////////
     55 
     56 template<typename T>
     57 class SIMDBayerStubInterpolator_
     58 {
     59 public:
     60     int bayer2Gray(const T*, int, T*, int, int, int, int) const
     61     {
     62         return 0;
     63     }
     64 
     65     int bayer2RGB(const T*, int, T*, int, int) const
     66     {
     67         return 0;
     68     }
     69 
     70     int bayer2RGBA(const T*, int, T*, int, int) const
     71     {
     72         return 0;
     73     }
     74 
     75     int bayer2RGB_EA(const T*, int, T*, int, int) const
     76     {
     77         return 0;
     78     }
     79 };
     80 
     81 #if CV_SSE2
     82 class SIMDBayerInterpolator_8u
     83 {
     84 public:
     85     SIMDBayerInterpolator_8u()
     86     {
     87         use_simd = checkHardwareSupport(CV_CPU_SSE2);
     88     }
     89 
     90     int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
     91                    int width, int bcoeff, int gcoeff, int rcoeff) const
     92     {
     93         if( !use_simd )
     94             return 0;
     95 
     96         __m128i _b2y = _mm_set1_epi16((short)(rcoeff*2));
     97         __m128i _g2y = _mm_set1_epi16((short)(gcoeff*2));
     98         __m128i _r2y = _mm_set1_epi16((short)(bcoeff*2));
     99         const uchar* bayer_end = bayer + width;
    100 
    101         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
    102         {
    103             __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
    104             __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
    105             __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
    106 
    107             __m128i b1 = _mm_add_epi16(_mm_srli_epi16(_mm_slli_epi16(r0, 8), 7),
    108                                        _mm_srli_epi16(_mm_slli_epi16(r2, 8), 7));
    109             __m128i b0 = _mm_add_epi16(b1, _mm_srli_si128(b1, 2));
    110             b1 = _mm_slli_epi16(_mm_srli_si128(b1, 2), 1);
    111 
    112             __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 7), _mm_srli_epi16(r2, 7));
    113             __m128i g1 = _mm_srli_epi16(_mm_slli_epi16(r1, 8), 7);
    114             g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
    115             g1 = _mm_slli_epi16(_mm_srli_si128(g1, 2), 2);
    116 
    117             r0 = _mm_srli_epi16(r1, 8);
    118             r1 = _mm_slli_epi16(_mm_add_epi16(r0, _mm_srli_si128(r0, 2)), 2);
    119             r0 = _mm_slli_epi16(r0, 3);
    120 
    121             g0 = _mm_add_epi16(_mm_mulhi_epi16(b0, _b2y), _mm_mulhi_epi16(g0, _g2y));
    122             g1 = _mm_add_epi16(_mm_mulhi_epi16(b1, _b2y), _mm_mulhi_epi16(g1, _g2y));
    123             g0 = _mm_add_epi16(g0, _mm_mulhi_epi16(r0, _r2y));
    124             g1 = _mm_add_epi16(g1, _mm_mulhi_epi16(r1, _r2y));
    125             g0 = _mm_srli_epi16(g0, 2);
    126             g1 = _mm_srli_epi16(g1, 2);
    127             g0 = _mm_packus_epi16(g0, g0);
    128             g1 = _mm_packus_epi16(g1, g1);
    129             g0 = _mm_unpacklo_epi8(g0, g1);
    130             _mm_storeu_si128((__m128i*)dst, g0);
    131         }
    132 
    133         return (int)(bayer - (bayer_end - width));
    134     }
    135 
    136     int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
    137     {
    138         if( !use_simd )
    139             return 0;
    140         /*
    141          B G B G | B G B G | B G B G | B G B G
    142          G R G R | G R G R | G R G R | G R G R
    143          B G B G | B G B G | B G B G | B G B G
    144          */
    145 
    146         __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
    147         __m128i mask = _mm_set1_epi16(blue < 0 ? -1 : 0), z = _mm_setzero_si128();
    148         __m128i masklo = _mm_set1_epi16(0x00ff);
    149         const uchar* bayer_end = bayer + width;
    150 
    151         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
    152         {
    153             __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
    154             __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
    155             __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
    156 
    157             __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklo), _mm_and_si128(r2, masklo));
    158             __m128i nextb1 = _mm_srli_si128(b1, 2);
    159             __m128i b0 = _mm_add_epi16(b1, nextb1);
    160             b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
    161             b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
    162             // b0 b2 ... b14 b1 b3 ... b15
    163             b0 = _mm_packus_epi16(b0, b1);
    164 
    165             __m128i g0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_srli_epi16(r2, 8));
    166             __m128i g1 = _mm_and_si128(r1, masklo);
    167             g0 = _mm_add_epi16(g0, _mm_add_epi16(g1, _mm_srli_si128(g1, 2)));
    168             g1 = _mm_srli_si128(g1, 2);
    169             g0 = _mm_srli_epi16(_mm_add_epi16(g0, delta2), 2);
    170             // g0 g2 ... g14 g1 g3 ... g15
    171             g0 = _mm_packus_epi16(g0, g1);
    172 
    173             r0 = _mm_srli_epi16(r1, 8);
    174             r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
    175             r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
    176             // r0 r2 ... r14 r1 r3 ... r15
    177             r0 = _mm_packus_epi16(r0, r1);
    178 
    179             b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
    180             b0 = _mm_xor_si128(b0, b1);
    181             r0 = _mm_xor_si128(r0, b1);
    182 
    183             // b1 g1 b3 g3 b5 g5...
    184             b1 = _mm_unpackhi_epi8(b0, g0);
    185             // b0 g0 b2 g2 b4 g4 ....
    186             b0 = _mm_unpacklo_epi8(b0, g0);
    187 
    188             // r1 0 r3 0 r5 0 ...
    189             r1 = _mm_unpackhi_epi8(r0, z);
    190             // r0 0 r2 0 r4 0 ...
    191             r0 = _mm_unpacklo_epi8(r0, z);
    192 
    193             // 0 b0 g0 r0 0 b2 g2 r2 ...
    194             g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
    195             // 0 b8 g8 r8 0 b10 g10 r10 ...
    196             g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
    197 
    198             // b1 g1 r1 0 b3 g3 r3 0 ...
    199             r0 = _mm_unpacklo_epi16(b1, r1);
    200             // b9 g9 r9 0 b11 g11 r11 0 ...
    201             r1 = _mm_unpackhi_epi16(b1, r1);
    202 
    203             // 0 b0 g0 r0 b1 g1 r1 0 ...
    204             b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
    205             // 0 b4 g4 r4 b5 g5 r5 0 ...
    206             b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
    207 
    208             _mm_storel_epi64((__m128i*)(dst-1+0), b0);
    209             _mm_storel_epi64((__m128i*)(dst-1+6*1), _mm_srli_si128(b0, 8));
    210             _mm_storel_epi64((__m128i*)(dst-1+6*2), b1);
    211             _mm_storel_epi64((__m128i*)(dst-1+6*3), _mm_srli_si128(b1, 8));
    212 
    213             // 0 b8 g8 r8 b9 g9 r9 0 ...
    214             g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
    215             // 0 b12 g12 r12 b13 g13 r13 0 ...
    216             g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
    217 
    218             _mm_storel_epi64((__m128i*)(dst-1+6*4), g0);
    219             _mm_storel_epi64((__m128i*)(dst-1+6*5), _mm_srli_si128(g0, 8));
    220 
    221             _mm_storel_epi64((__m128i*)(dst-1+6*6), g1);
    222         }
    223 
    224         return (int)(bayer - (bayer_end - width));
    225     }
    226 
    227     int bayer2RGBA(const uchar*, int, uchar*, int, int) const
    228     {
    229         return 0;
    230     }
    231 
    232     int bayer2RGB_EA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
    233     {
    234         if (!use_simd)
    235             return 0;
    236 
    237         const uchar* bayer_end = bayer + width;
    238         __m128i masklow = _mm_set1_epi16(0x00ff);
    239         __m128i delta1 = _mm_set1_epi16(1), delta2 = _mm_set1_epi16(2);
    240         __m128i full = _mm_set1_epi16(-1), z = _mm_setzero_si128();
    241         __m128i mask = _mm_set1_epi16(blue > 0 ? -1 : 0);
    242 
    243         for ( ; bayer <= bayer_end - 18; bayer += 14, dst += 42)
    244         {
    245             /*
    246              B G B G | B G B G | B G B G | B G B G
    247              G R G R | G R G R | G R G R | G R G R
    248              B G B G | B G B G | B G B G | B G B G
    249              */
    250 
    251             __m128i r0 = _mm_loadu_si128((const __m128i*)bayer);
    252             __m128i r1 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step));
    253             __m128i r2 = _mm_loadu_si128((const __m128i*)(bayer+bayer_step*2));
    254 
    255             __m128i b1 = _mm_add_epi16(_mm_and_si128(r0, masklow), _mm_and_si128(r2, masklow));
    256             __m128i nextb1 = _mm_srli_si128(b1, 2);
    257             __m128i b0 = _mm_add_epi16(b1, nextb1);
    258             b1 = _mm_srli_epi16(_mm_add_epi16(nextb1, delta1), 1);
    259             b0 = _mm_srli_epi16(_mm_add_epi16(b0, delta2), 2);
    260             // b0 b2 ... b14 b1 b3 ... b15
    261             b0 = _mm_packus_epi16(b0, b1);
    262 
    263             // vertical sum
    264             __m128i r0g = _mm_srli_epi16(r0, 8);
    265             __m128i r2g = _mm_srli_epi16(r2, 8);
    266             __m128i sumv = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(r0g, r2g), delta1), 1);
    267             // gorizontal sum
    268             __m128i g1 = _mm_and_si128(masklow, r1);
    269             __m128i nextg1 = _mm_srli_si128(g1, 2);
    270             __m128i sumg = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(g1, nextg1), delta1), 1);
    271 
    272             // gradients
    273             __m128i gradv = _mm_adds_epi16(_mm_subs_epu16(r0g, r2g), _mm_subs_epu16(r2g, r0g));
    274             __m128i gradg = _mm_adds_epi16(_mm_subs_epu16(nextg1, g1), _mm_subs_epu16(g1, nextg1));
    275             __m128i gmask = _mm_cmpgt_epi16(gradg, gradv);
    276 
    277             __m128i g0 = _mm_add_epi16(_mm_and_si128(gmask, sumv), _mm_and_si128(sumg, _mm_xor_si128(gmask, full)));
    278             // g0 g2 ... g14 g1 g3 ...
    279             g0 = _mm_packus_epi16(g0, nextg1);
    280 
    281             r0 = _mm_srli_epi16(r1, 8);
    282             r1 = _mm_add_epi16(r0, _mm_srli_si128(r0, 2));
    283             r1 = _mm_srli_epi16(_mm_add_epi16(r1, delta1), 1);
    284             // r0 r2 ... r14 r1 r3 ... r15
    285             r0 = _mm_packus_epi16(r0, r1);
    286 
    287             b1 = _mm_and_si128(_mm_xor_si128(b0, r0), mask);
    288             b0 = _mm_xor_si128(b0, b1);
    289             r0 = _mm_xor_si128(r0, b1);
    290 
    291             // b1 g1 b3 g3 b5 g5...
    292             b1 = _mm_unpackhi_epi8(b0, g0);
    293             // b0 g0 b2 g2 b4 g4 ....
    294             b0 = _mm_unpacklo_epi8(b0, g0);
    295 
    296             // r1 0 r3 0 r5 0 ...
    297             r1 = _mm_unpackhi_epi8(r0, z);
    298             // r0 0 r2 0 r4 0 ...
    299             r0 = _mm_unpacklo_epi8(r0, z);
    300 
    301             // 0 b0 g0 r0 0 b2 g2 r2 ...
    302             g0 = _mm_slli_si128(_mm_unpacklo_epi16(b0, r0), 1);
    303             // 0 b8 g8 r8 0 b10 g10 r10 ...
    304             g1 = _mm_slli_si128(_mm_unpackhi_epi16(b0, r0), 1);
    305 
    306             // b1 g1 r1 0 b3 g3 r3 0 ...
    307             r0 = _mm_unpacklo_epi16(b1, r1);
    308             // b9 g9 r9 0 b11 g11 r11 0 ...
    309             r1 = _mm_unpackhi_epi16(b1, r1);
    310 
    311             // 0 b0 g0 r0 b1 g1 r1 0 ...
    312             b0 = _mm_srli_si128(_mm_unpacklo_epi32(g0, r0), 1);
    313             // 0 b4 g4 r4 b5 g5 r5 0 ...
    314             b1 = _mm_srli_si128(_mm_unpackhi_epi32(g0, r0), 1);
    315 
    316             _mm_storel_epi64((__m128i*)(dst+0), b0);
    317             _mm_storel_epi64((__m128i*)(dst+6*1), _mm_srli_si128(b0, 8));
    318             _mm_storel_epi64((__m128i*)(dst+6*2), b1);
    319             _mm_storel_epi64((__m128i*)(dst+6*3), _mm_srli_si128(b1, 8));
    320 
    321             // 0 b8 g8 r8 b9 g9 r9 0 ...
    322             g0 = _mm_srli_si128(_mm_unpacklo_epi32(g1, r1), 1);
    323             // 0 b12 g12 r12 b13 g13 r13 0 ...
    324             g1 = _mm_srli_si128(_mm_unpackhi_epi32(g1, r1), 1);
    325 
    326             _mm_storel_epi64((__m128i*)(dst+6*4), g0);
    327             _mm_storel_epi64((__m128i*)(dst+6*5), _mm_srli_si128(g0, 8));
    328 
    329             _mm_storel_epi64((__m128i*)(dst+6*6), g1);
    330         }
    331 
    332         return int(bayer - (bayer_end - width));
    333     }
    334 
    335     bool use_simd;
    336 };
    337 #elif CV_NEON
    338 class SIMDBayerInterpolator_8u
    339 {
    340 public:
    341     SIMDBayerInterpolator_8u()
    342     {
    343     }
    344 
    345     int bayer2Gray(const uchar* bayer, int bayer_step, uchar* dst,
    346                    int width, int bcoeff, int gcoeff, int rcoeff) const
    347     {
    348         /*
    349          B G B G | B G B G | B G B G | B G B G
    350          G R G R | G R G R | G R G R | G R G R
    351          B G B G | B G B G | B G B G | B G B G
    352          */
    353 
    354         uint16x8_t masklo = vdupq_n_u16(255);
    355         const uchar* bayer_end = bayer + width;
    356 
    357         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 14 )
    358         {
    359             uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
    360             uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
    361             uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
    362 
    363             uint16x8_t b1_ = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
    364             uint16x8_t b1 = vextq_u16(b1_, b1_, 1);
    365             uint16x8_t b0 = vaddq_u16(b1_, b1);
    366             // b0 = b0 b2 b4 ...
    367             // b1 = b1 b3 b5 ...
    368 
    369             uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
    370             uint16x8_t g1 = vandq_u16(r1, masklo);
    371             g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
    372             uint16x8_t rot = vextq_u16(g1, g1, 1);
    373             g1 = vshlq_n_u16(rot, 2);
    374             // g0 = b0 b2 b4 ...
    375             // g1 = b1 b3 b5 ...
    376 
    377             r0 = vshrq_n_u16(r1, 8);
    378             r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
    379             r0 = vshlq_n_u16(r0, 2);
    380             // r0 = r0 r2 r4 ...
    381             // r1 = r1 r3 r5 ...
    382 
    383             b0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b0), (short)(rcoeff*2)));
    384             b1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(b1), (short)(rcoeff*4)));
    385 
    386             g0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g0), (short)(gcoeff*2)));
    387             g1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(g1), (short)(gcoeff*2)));
    388 
    389             r0 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r0), (short)(bcoeff*2)));
    390             r1 = vreinterpretq_u16_s16(vqdmulhq_n_s16(vreinterpretq_s16_u16(r1), (short)(bcoeff*4)));
    391 
    392             g0 = vaddq_u16(vaddq_u16(g0, b0), r0);
    393             g1 = vaddq_u16(vaddq_u16(g1, b1), r1);
    394 
    395             uint8x8x2_t p = vzip_u8(vrshrn_n_u16(g0, 2), vrshrn_n_u16(g1, 2));
    396             vst1_u8(dst, p.val[0]);
    397             vst1_u8(dst + 8, p.val[1]);
    398         }
    399 
    400         return (int)(bayer - (bayer_end - width));
    401     }
    402 
    403     int bayer2RGB(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
    404     {
    405         /*
    406          B G B G | B G B G | B G B G | B G B G
    407          G R G R | G R G R | G R G R | G R G R
    408          B G B G | B G B G | B G B G | B G B G
    409          */
    410         uint16x8_t masklo = vdupq_n_u16(255);
    411         uint8x16x3_t pix;
    412         const uchar* bayer_end = bayer + width;
    413 
    414         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 42 )
    415         {
    416             uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
    417             uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
    418             uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
    419 
    420             uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
    421             uint16x8_t nextb1 = vextq_u16(b1, b1, 1);
    422             uint16x8_t b0 = vaddq_u16(b1, nextb1);
    423             // b0 b1 b2 ...
    424             uint8x8x2_t bb = vzip_u8(vrshrn_n_u16(b0, 2), vrshrn_n_u16(nextb1, 1));
    425             pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]);
    426 
    427             uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
    428             uint16x8_t g1 = vandq_u16(r1, masklo);
    429             g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
    430             g1 = vextq_u16(g1, g1, 1);
    431             // g0 g1 g2 ...
    432             uint8x8x2_t gg = vzip_u8(vrshrn_n_u16(g0, 2), vmovn_u16(g1));
    433             pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]);
    434 
    435             r0 = vshrq_n_u16(r1, 8);
    436             r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
    437             // r0 r1 r2 ...
    438             uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vrshrn_n_u16(r1, 1));
    439             pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]);
    440 
    441             vst3q_u8(dst-1, pix);
    442         }
    443 
    444         return (int)(bayer - (bayer_end - width));
    445     }
    446 
    447     int bayer2RGBA(const uchar* bayer, int bayer_step, uchar* dst, int width, int blue) const
    448     {
    449         /*
    450          B G B G | B G B G | B G B G | B G B G
    451          G R G R | G R G R | G R G R | G R G R
    452          B G B G | B G B G | B G B G | B G B G
    453          */
    454         uint16x8_t masklo = vdupq_n_u16(255);
    455         uint8x16x4_t pix;
    456         const uchar* bayer_end = bayer + width;
    457         pix.val[3] = vdupq_n_u8(255);
    458 
    459         for( ; bayer <= bayer_end - 18; bayer += 14, dst += 56 )
    460         {
    461             uint16x8_t r0 = vld1q_u16((const ushort*)bayer);
    462             uint16x8_t r1 = vld1q_u16((const ushort*)(bayer + bayer_step));
    463             uint16x8_t r2 = vld1q_u16((const ushort*)(bayer + bayer_step*2));
    464 
    465             uint16x8_t b1 = vaddq_u16(vandq_u16(r0, masklo), vandq_u16(r2, masklo));
    466             uint16x8_t nextb1 = vextq_u16(b1, b1, 1);
    467             uint16x8_t b0 = vaddq_u16(b1, nextb1);
    468             // b0 b1 b2 ...
    469             uint8x8x2_t bb = vzip_u8(vrshrn_n_u16(b0, 2), vrshrn_n_u16(nextb1, 1));
    470             pix.val[1-blue] = vcombine_u8(bb.val[0], bb.val[1]);
    471 
    472             uint16x8_t g0 = vaddq_u16(vshrq_n_u16(r0, 8), vshrq_n_u16(r2, 8));
    473             uint16x8_t g1 = vandq_u16(r1, masklo);
    474             g0 = vaddq_u16(g0, vaddq_u16(g1, vextq_u16(g1, g1, 1)));
    475             g1 = vextq_u16(g1, g1, 1);
    476             // g0 g1 g2 ...
    477             uint8x8x2_t gg = vzip_u8(vrshrn_n_u16(g0, 2), vmovn_u16(g1));
    478             pix.val[1] = vcombine_u8(gg.val[0], gg.val[1]);
    479 
    480             r0 = vshrq_n_u16(r1, 8);
    481             r1 = vaddq_u16(r0, vextq_u16(r0, r0, 1));
    482             // r0 r1 r2 ...
    483             uint8x8x2_t rr = vzip_u8(vmovn_u16(r0), vrshrn_n_u16(r1, 1));
    484             pix.val[1+blue] = vcombine_u8(rr.val[0], rr.val[1]);
    485 
    486             vst4q_u8(dst-1, pix);
    487         }
    488 
    489         return (int)(bayer - (bayer_end - width));
    490     }
    491 
    492     int bayer2RGB_EA(const uchar*, int, uchar*, int, int) const
    493     {
    494         return 0;
    495     }
    496 };
    497 #else
    498 typedef SIMDBayerStubInterpolator_<uchar> SIMDBayerInterpolator_8u;
    499 #endif
    500 
    501 
    502 template<typename T, class SIMDInterpolator>
    503 class Bayer2Gray_Invoker :
    504     public ParallelLoopBody
    505 {
    506 public:
    507     Bayer2Gray_Invoker(const Mat& _srcmat, Mat& _dstmat, int _start_with_green, bool _brow,
    508         const Size& _size, int _bcoeff, int _rcoeff) :
    509         ParallelLoopBody(), srcmat(_srcmat), dstmat(_dstmat), Start_with_green(_start_with_green),
    510         Brow(_brow), size(_size), Bcoeff(_bcoeff), Rcoeff(_rcoeff)
    511     {
    512     }
    513 
    514     virtual void operator ()(const Range& range) const
    515     {
    516         SIMDInterpolator vecOp;
    517         const int G2Y = 9617;
    518         const int SHIFT = 14;
    519 
    520         const T* bayer0 = srcmat.ptr<T>();
    521         int bayer_step = (int)(srcmat.step/sizeof(T));
    522         T* dst0 = (T*)dstmat.data;
    523         int dst_step = (int)(dstmat.step/sizeof(T));
    524         int bcoeff = Bcoeff, rcoeff = Rcoeff;
    525         int start_with_green = Start_with_green;
    526         bool brow = Brow;
    527 
    528         dst0 += dst_step + 1;
    529 
    530         if (range.start % 2)
    531         {
    532             brow = !brow;
    533             std::swap(bcoeff, rcoeff);
    534             start_with_green = !start_with_green;
    535         }
    536 
    537         bayer0 += range.start * bayer_step;
    538         dst0 += range.start * dst_step;
    539 
    540         for(int i = range.start ; i < range.end; ++i, bayer0 += bayer_step, dst0 += dst_step )
    541         {
    542             unsigned t0, t1, t2;
    543             const T* bayer = bayer0;
    544             T* dst = dst0;
    545             const T* bayer_end = bayer + size.width;
    546 
    547             if( size.width <= 0 )
    548             {
    549                 dst[-1] = dst[size.width] = 0;
    550                 continue;
    551             }
    552 
    553             if( start_with_green )
    554             {
    555                 t0 = (bayer[1] + bayer[bayer_step*2+1])*rcoeff;
    556                 t1 = (bayer[bayer_step] + bayer[bayer_step+2])*bcoeff;
    557                 t2 = bayer[bayer_step+1]*(2*G2Y);
    558 
    559                 dst[0] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+1);
    560                 bayer++;
    561                 dst++;
    562             }
    563 
    564             int delta = vecOp.bayer2Gray(bayer, bayer_step, dst, size.width, bcoeff, G2Y, rcoeff);
    565             bayer += delta;
    566             dst += delta;
    567 
    568             for( ; bayer <= bayer_end - 2; bayer += 2, dst += 2 )
    569             {
    570                 t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] + bayer[bayer_step*2+2])*rcoeff;
    571                 t1 = (bayer[1] + bayer[bayer_step] + bayer[bayer_step+2] + bayer[bayer_step*2+1])*G2Y;
    572                 t2 = bayer[bayer_step+1]*(4*bcoeff);
    573                 dst[0] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+2);
    574 
    575                 t0 = (bayer[2] + bayer[bayer_step*2+2])*rcoeff;
    576                 t1 = (bayer[bayer_step+1] + bayer[bayer_step+3])*bcoeff;
    577                 t2 = bayer[bayer_step+2]*(2*G2Y);
    578                 dst[1] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+1);
    579             }
    580 
    581             if( bayer < bayer_end )
    582             {
    583                 t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] + bayer[bayer_step*2+2])*rcoeff;
    584                 t1 = (bayer[1] + bayer[bayer_step] + bayer[bayer_step+2] + bayer[bayer_step*2+1])*G2Y;
    585                 t2 = bayer[bayer_step+1]*(4*bcoeff);
    586                 dst[0] = (T)CV_DESCALE(t0 + t1 + t2, SHIFT+2);
    587                 bayer++;
    588                 dst++;
    589             }
    590 
    591             dst0[-1] = dst0[0];
    592             dst0[size.width] = dst0[size.width-1];
    593 
    594             brow = !brow;
    595             std::swap(bcoeff, rcoeff);
    596             start_with_green = !start_with_green;
    597         }
    598     }
    599 
    600 private:
    601     Mat srcmat;
    602     Mat dstmat;
    603     int Start_with_green;
    604     bool Brow;
    605     Size size;
    606     int Bcoeff, Rcoeff;
    607 };
    608 
    609 template<typename T, typename SIMDInterpolator>
    610 static void Bayer2Gray_( const Mat& srcmat, Mat& dstmat, int code )
    611 {
    612     const int R2Y = 4899;
    613     const int B2Y = 1868;
    614 
    615     Size size = srcmat.size();
    616     int bcoeff = B2Y, rcoeff = R2Y;
    617     int start_with_green = code == CV_BayerGB2GRAY || code == CV_BayerGR2GRAY;
    618     bool brow = true;
    619 
    620     if( code != CV_BayerBG2GRAY && code != CV_BayerGB2GRAY )
    621     {
    622         brow = false;
    623         std::swap(bcoeff, rcoeff);
    624     }
    625     size.height -= 2;
    626     size.width -= 2;
    627 
    628     if (size.height > 0)
    629     {
    630         Range range(0, size.height);
    631         Bayer2Gray_Invoker<T, SIMDInterpolator> invoker(srcmat, dstmat,
    632             start_with_green, brow, size, bcoeff, rcoeff);
    633         parallel_for_(range, invoker, dstmat.total()/static_cast<double>(1<<16));
    634     }
    635 
    636     size = dstmat.size();
    637     T* dst0 = dstmat.ptr<T>();
    638     int dst_step = (int)(dstmat.step/sizeof(T));
    639     if( size.height > 2 )
    640         for( int i = 0; i < size.width; i++ )
    641         {
    642             dst0[i] = dst0[i + dst_step];
    643             dst0[i + (size.height-1)*dst_step] = dst0[i + (size.height-2)*dst_step];
    644         }
    645     else
    646         for( int i = 0; i < size.width; i++ )
    647             dst0[i] = dst0[i + (size.height-1)*dst_step] = 0;
    648 }
    649 
    650 template <typename T>
    651 struct Alpha
    652 {
    653     static T value() { return std::numeric_limits<T>::max(); }
    654 };
    655 
    656 template <>
    657 struct Alpha<float>
    658 {
    659     static float value() { return 1.0f; }
    660 };
    661 
    662 template <typename T, typename SIMDInterpolator>
    663 class Bayer2RGB_Invoker :
    664     public ParallelLoopBody
    665 {
    666 public:
    667     Bayer2RGB_Invoker(const Mat& _srcmat, Mat& _dstmat, int _start_with_green, int _blue, const Size& _size) :
    668         ParallelLoopBody(),
    669         srcmat(_srcmat), dstmat(_dstmat), Start_with_green(_start_with_green), Blue(_blue), size(_size)
    670     {
    671     }
    672 
    673     virtual void operator() (const Range& range) const
    674     {
    675         SIMDInterpolator vecOp;
    676         T alpha = Alpha<T>::value();
    677         int dcn = dstmat.channels();
    678         int dcn2 = dcn << 1;
    679 
    680         int bayer_step = (int)(srcmat.step/sizeof(T));
    681         const T* bayer0 = srcmat.ptr<T>() + bayer_step * range.start;
    682 
    683         int dst_step = (int)(dstmat.step/sizeof(T));
    684         T* dst0 = reinterpret_cast<T*>(dstmat.data) + (range.start + 1) * dst_step + dcn + 1;
    685 
    686         int blue = Blue, start_with_green = Start_with_green;
    687         if (range.start % 2)
    688         {
    689             blue = -blue;
    690             start_with_green = !start_with_green;
    691         }
    692 
    693         for (int i = range.start; i < range.end; bayer0 += bayer_step, dst0 += dst_step, ++i )
    694         {
    695             int t0, t1;
    696             const T* bayer = bayer0;
    697             T* dst = dst0;
    698             const T* bayer_end = bayer + size.width;
    699 
    700             // in case of when size.width <= 2
    701             if( size.width <= 0 )
    702             {
    703                 if (dcn == 3)
    704                 {
    705                     dst[-4] = dst[-3] = dst[-2] = dst[size.width*dcn-1] =
    706                     dst[size.width*dcn] = dst[size.width*dcn+1] = 0;
    707                 }
    708                 else
    709                 {
    710                     dst[-5] = dst[-4] = dst[-3] = dst[size.width*dcn-1] =
    711                     dst[size.width*dcn] = dst[size.width*dcn+1] = 0;
    712                     dst[-2] = dst[size.width*dcn+2] = alpha;
    713                 }
    714                 continue;
    715             }
    716 
    717             if( start_with_green )
    718             {
    719                 t0 = (bayer[1] + bayer[bayer_step*2+1] + 1) >> 1;
    720                 t1 = (bayer[bayer_step] + bayer[bayer_step+2] + 1) >> 1;
    721 
    722                 dst[-blue] = (T)t0;
    723                 dst[0] = bayer[bayer_step+1];
    724                 dst[blue] = (T)t1;
    725                 if (dcn == 4)
    726                     dst[2] = alpha; // alpha channel
    727 
    728                 bayer++;
    729                 dst += dcn;
    730             }
    731 
    732             // simd optimization only for dcn == 3
    733             int delta = dcn == 4 ?
    734                 vecOp.bayer2RGBA(bayer, bayer_step, dst, size.width, blue) :
    735                 vecOp.bayer2RGB(bayer, bayer_step, dst, size.width, blue);
    736             bayer += delta;
    737             dst += delta*dcn;
    738 
    739             if (dcn == 3) // Bayer to BGR
    740             {
    741                 if( blue > 0 )
    742                 {
    743                     for( ; bayer <= bayer_end - 2; bayer += 2, dst += dcn2 )
    744                     {
    745                         t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
    746                               bayer[bayer_step*2+2] + 2) >> 2;
    747                         t1 = (bayer[1] + bayer[bayer_step] +
    748                               bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
    749                         dst[-1] = (T)t0;
    750                         dst[0] = (T)t1;
    751                         dst[1] = bayer[bayer_step+1];
    752 
    753                         t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;
    754                         t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;
    755                         dst[2] = (T)t0;
    756                         dst[3] = bayer[bayer_step+2];
    757                         dst[4] = (T)t1;
    758                     }
    759                 }
    760                 else
    761                 {
    762                     for( ; bayer <= bayer_end - 2; bayer += 2, dst += dcn2 )
    763                     {
    764                         t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
    765                               bayer[bayer_step*2+2] + 2) >> 2;
    766                         t1 = (bayer[1] + bayer[bayer_step] +
    767                               bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
    768                         dst[1] = (T)t0;
    769                         dst[0] = (T)t1;
    770                         dst[-1] = bayer[bayer_step+1];
    771 
    772                         t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;
    773                         t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;
    774                         dst[4] = (T)t0;
    775                         dst[3] = bayer[bayer_step+2];
    776                         dst[2] = (T)t1;
    777                     }
    778                 }
    779             }
    780             else // Bayer to BGRA
    781             {
    782                 // if current row does not contain Blue pixels
    783                 if( blue > 0 )
    784                 {
    785                     for( ; bayer <= bayer_end - 2; bayer += 2, dst += dcn2 )
    786                     {
    787                         t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
    788                               bayer[bayer_step*2+2] + 2) >> 2;
    789                         t1 = (bayer[1] + bayer[bayer_step] +
    790                               bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
    791                         dst[-1] = (T)t0;
    792                         dst[0] = (T)t1;
    793                         dst[1] = bayer[bayer_step+1];
    794                         dst[2] = alpha; // alpha channel
    795 
    796                         t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;
    797                         t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;
    798                         dst[3] = (T)t0;
    799                         dst[4] = bayer[bayer_step+2];
    800                         dst[5] = (T)t1;
    801                         dst[6] = alpha; // alpha channel
    802                     }
    803                 }
    804                 else // if current row contains Blue pixels
    805                 {
    806                     for( ; bayer <= bayer_end - 2; bayer += 2, dst += dcn2 )
    807                     {
    808                         t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
    809                               bayer[bayer_step*2+2] + 2) >> 2;
    810                         t1 = (bayer[1] + bayer[bayer_step] +
    811                               bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
    812                         dst[-1] = bayer[bayer_step+1];
    813                         dst[0] = (T)t1;
    814                         dst[1] = (T)t0;
    815                         dst[2] = alpha; // alpha channel
    816 
    817                         t0 = (bayer[2] + bayer[bayer_step*2+2] + 1) >> 1;
    818                         t1 = (bayer[bayer_step+1] + bayer[bayer_step+3] + 1) >> 1;
    819                         dst[3] = (T)t1;
    820                         dst[4] = bayer[bayer_step+2];
    821                         dst[5] = (T)t0;
    822                         dst[6] = alpha; // alpha channel
    823                     }
    824                 }
    825             }
    826 
    827             // if skip one pixel at the end of row
    828             if( bayer < bayer_end )
    829             {
    830                 t0 = (bayer[0] + bayer[2] + bayer[bayer_step*2] +
    831                       bayer[bayer_step*2+2] + 2) >> 2;
    832                 t1 = (bayer[1] + bayer[bayer_step] +
    833                       bayer[bayer_step+2] + bayer[bayer_step*2+1]+2) >> 2;
    834                 dst[-blue] = (T)t0;
    835                 dst[0] = (T)t1;
    836                 dst[blue] = bayer[bayer_step+1];
    837                 if (dcn == 4)
    838                     dst[2] = alpha; // alpha channel
    839                 bayer++;
    840                 dst += dcn;
    841             }
    842 
    843             // fill the last and the first pixels of row accordingly
    844             if (dcn == 3)
    845             {
    846                 dst0[-4] = dst0[-1];
    847                 dst0[-3] = dst0[0];
    848                 dst0[-2] = dst0[1];
    849                 dst0[size.width*dcn-1] = dst0[size.width*dcn-4];
    850                 dst0[size.width*dcn] = dst0[size.width*dcn-3];
    851                 dst0[size.width*dcn+1] = dst0[size.width*dcn-2];
    852             }
    853             else
    854             {
    855                 dst0[-5] = dst0[-1];
    856                 dst0[-4] = dst0[0];
    857                 dst0[-3] = dst0[1];
    858                 dst0[-2] = dst0[2]; // alpha channel
    859                 dst0[size.width*dcn-1] = dst0[size.width*dcn-5];
    860                 dst0[size.width*dcn] = dst0[size.width*dcn-4];
    861                 dst0[size.width*dcn+1] = dst0[size.width*dcn-3];
    862                 dst0[size.width*dcn+2] = dst0[size.width*dcn-2]; // alpha channel
    863             }
    864 
    865             blue = -blue;
    866             start_with_green = !start_with_green;
    867         }
    868     }
    869 
    870 private:
    871     Mat srcmat;
    872     Mat dstmat;
    873     int Start_with_green, Blue;
    874     Size size;
    875 };
    876 
    877 template<typename T, class SIMDInterpolator>
    878 static void Bayer2RGB_( const Mat& srcmat, Mat& dstmat, int code )
    879 {
    880     int dst_step = (int)(dstmat.step/sizeof(T));
    881     Size size = srcmat.size();
    882     int blue = code == CV_BayerBG2BGR || code == CV_BayerGB2BGR ? -1 : 1;
    883     int start_with_green = code == CV_BayerGB2BGR || code == CV_BayerGR2BGR;
    884 
    885     int dcn = dstmat.channels();
    886     size.height -= 2;
    887     size.width -= 2;
    888 
    889     if (size.height > 0)
    890     {
    891         Range range(0, size.height);
    892         Bayer2RGB_Invoker<T, SIMDInterpolator> invoker(srcmat, dstmat, start_with_green, blue, size);
    893         parallel_for_(range, invoker, dstmat.total()/static_cast<double>(1<<16));
    894     }
    895 
    896     // filling the first and the last rows
    897     size = dstmat.size();
    898     T* dst0 = dstmat.ptr<T>();
    899     if( size.height > 2 )
    900         for( int i = 0; i < size.width*dcn; i++ )
    901         {
    902             dst0[i] = dst0[i + dst_step];
    903             dst0[i + (size.height-1)*dst_step] = dst0[i + (size.height-2)*dst_step];
    904         }
    905     else
    906         for( int i = 0; i < size.width*dcn; i++ )
    907             dst0[i] = dst0[i + (size.height-1)*dst_step] = 0;
    908 }
    909 
    910 
    911 /////////////////// Demosaicing using Variable Number of Gradients ///////////////////////
    912 
    913 static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
    914 {
    915     const uchar* bayer = srcmat.ptr();
    916     int bstep = (int)srcmat.step;
    917     uchar* dst = dstmat.ptr();
    918     int dststep = (int)dstmat.step;
    919     Size size = srcmat.size();
    920 
    921     int blueIdx = code == CV_BayerBG2BGR_VNG || code == CV_BayerGB2BGR_VNG ? 0 : 2;
    922     bool greenCell0 = code != CV_BayerBG2BGR_VNG && code != CV_BayerRG2BGR_VNG;
    923 
    924     // for too small images use the simple interpolation algorithm
    925     if( MIN(size.width, size.height) < 8 )
    926     {
    927         Bayer2RGB_<uchar, SIMDBayerInterpolator_8u>( srcmat, dstmat, code );
    928         return;
    929     }
    930 
    931     const int brows = 3, bcn = 7;
    932     int N = size.width, N2 = N*2, N3 = N*3, N4 = N*4, N5 = N*5, N6 = N*6, N7 = N*7;
    933     int i, bufstep = N7*bcn;
    934     cv::AutoBuffer<ushort> _buf(bufstep*brows);
    935     ushort* buf = (ushort*)_buf;
    936 
    937     bayer += bstep*2;
    938 
    939 #if CV_SSE2
    940     bool haveSSE = cv::checkHardwareSupport(CV_CPU_SSE2);
    941     #define _mm_absdiff_epu16(a,b) _mm_adds_epu16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a))
    942 #endif
    943 
    944     for( int y = 2; y < size.height - 4; y++ )
    945     {
    946         uchar* dstrow = dst + dststep*y + 6;
    947         const uchar* srow;
    948 
    949         for( int dy = (y == 2 ? -1 : 1); dy <= 1; dy++ )
    950         {
    951             ushort* brow = buf + ((y + dy - 1)%brows)*bufstep + 1;
    952             srow = bayer + (y+dy)*bstep + 1;
    953 
    954             for( i = 0; i < bcn; i++ )
    955                 brow[N*i-1] = brow[(N-2) + N*i] = 0;
    956 
    957             i = 1;
    958 
    959 #if CV_SSE2
    960             if( haveSSE )
    961             {
    962                 __m128i z = _mm_setzero_si128();
    963                 for( ; i <= N-9; i += 8, srow += 8, brow += 8 )
    964                 {
    965                     __m128i s1, s2, s3, s4, s6, s7, s8, s9;
    966 
    967                     s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1-bstep)),z);
    968                     s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)),z);
    969                     s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1-bstep)),z);
    970 
    971                     s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)),z);
    972                     s6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)),z);
    973 
    974                     s7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1+bstep)),z);
    975                     s8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)),z);
    976                     s9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1+bstep)),z);
    977 
    978                     __m128i b0, b1, b2, b3, b4, b5, b6;
    979 
    980                     b0 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s2,s8),1),
    981                                         _mm_adds_epu16(_mm_absdiff_epu16(s1, s7),
    982                                                        _mm_absdiff_epu16(s3, s9)));
    983                     b1 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s4,s6),1),
    984                                         _mm_adds_epu16(_mm_absdiff_epu16(s1, s3),
    985                                                        _mm_absdiff_epu16(s7, s9)));
    986                     b2 = _mm_slli_epi16(_mm_absdiff_epu16(s3,s7),1);
    987                     b3 = _mm_slli_epi16(_mm_absdiff_epu16(s1,s9),1);
    988 
    989                     _mm_storeu_si128((__m128i*)brow, b0);
    990                     _mm_storeu_si128((__m128i*)(brow + N), b1);
    991                     _mm_storeu_si128((__m128i*)(brow + N2), b2);
    992                     _mm_storeu_si128((__m128i*)(brow + N3), b3);
    993 
    994                     b4 = _mm_adds_epu16(b2,_mm_adds_epu16(_mm_absdiff_epu16(s2, s4),
    995                                                           _mm_absdiff_epu16(s6, s8)));
    996                     b5 = _mm_adds_epu16(b3,_mm_adds_epu16(_mm_absdiff_epu16(s2, s6),
    997                                                           _mm_absdiff_epu16(s4, s8)));
    998                     b6 = _mm_adds_epu16(_mm_adds_epu16(s2, s4), _mm_adds_epu16(s6, s8));
    999                     b6 = _mm_srli_epi16(b6, 1);
   1000 
   1001                     _mm_storeu_si128((__m128i*)(brow + N4), b4);
   1002                     _mm_storeu_si128((__m128i*)(brow + N5), b5);
   1003                     _mm_storeu_si128((__m128i*)(brow + N6), b6);
   1004                 }
   1005             }
   1006 #endif
   1007 
   1008             for( ; i < N-1; i++, srow++, brow++ )
   1009             {
   1010                 brow[0] = (ushort)(std::abs(srow[-1-bstep] - srow[-1+bstep]) +
   1011                                    std::abs(srow[-bstep] - srow[+bstep])*2 +
   1012                                    std::abs(srow[1-bstep] - srow[1+bstep]));
   1013                 brow[N] = (ushort)(std::abs(srow[-1-bstep] - srow[1-bstep]) +
   1014                                    std::abs(srow[-1] - srow[1])*2 +
   1015                                    std::abs(srow[-1+bstep] - srow[1+bstep]));
   1016                 brow[N2] = (ushort)(std::abs(srow[+1-bstep] - srow[-1+bstep])*2);
   1017                 brow[N3] = (ushort)(std::abs(srow[-1-bstep] - srow[1+bstep])*2);
   1018                 brow[N4] = (ushort)(brow[N2] + std::abs(srow[-bstep] - srow[-1]) +
   1019                                     std::abs(srow[+bstep] - srow[1]));
   1020                 brow[N5] = (ushort)(brow[N3] + std::abs(srow[-bstep] - srow[1]) +
   1021                                     std::abs(srow[+bstep] - srow[-1]));
   1022                 brow[N6] = (ushort)((srow[-bstep] + srow[-1] + srow[1] + srow[+bstep])>>1);
   1023             }
   1024         }
   1025 
   1026         const ushort* brow0 = buf + ((y - 2) % brows)*bufstep + 2;
   1027         const ushort* brow1 = buf + ((y - 1) % brows)*bufstep + 2;
   1028         const ushort* brow2 = buf + (y % brows)*bufstep + 2;
   1029         static const float scale[] = { 0.f, 0.5f, 0.25f, 0.1666666666667f, 0.125f, 0.1f, 0.08333333333f, 0.0714286f, 0.0625f };
   1030         srow = bayer + y*bstep + 2;
   1031         bool greenCell = greenCell0;
   1032 
   1033         i = 2;
   1034 #if CV_SSE2
   1035         int limit = !haveSSE ? N-2 : greenCell ? std::min(3, N-2) : 2;
   1036 #else
   1037         int limit = N - 2;
   1038 #endif
   1039 
   1040         do
   1041         {
   1042             for( ; i < limit; i++, srow++, brow0++, brow1++, brow2++, dstrow += 3 )
   1043             {
   1044                 int gradN = brow0[0] + brow1[0];
   1045                 int gradS = brow1[0] + brow2[0];
   1046                 int gradW = brow1[N-1] + brow1[N];
   1047                 int gradE = brow1[N] + brow1[N+1];
   1048                 int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
   1049                 int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
   1050                 int R, G, B;
   1051 
   1052                 if( !greenCell )
   1053                 {
   1054                     int gradNE = brow0[N4+1] + brow1[N4];
   1055                     int gradSW = brow1[N4] + brow2[N4-1];
   1056                     int gradNW = brow0[N5-1] + brow1[N5];
   1057                     int gradSE = brow1[N5] + brow2[N5+1];
   1058 
   1059                     minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE);
   1060                     maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE);
   1061                     int T = minGrad + MAX(maxGrad/2, 1);
   1062 
   1063                     int Rs = 0, Gs = 0, Bs = 0, ng = 0;
   1064                     if( gradN < T )
   1065                     {
   1066                         Rs += srow[-bstep*2] + srow[0];
   1067                         Gs += srow[-bstep]*2;
   1068                         Bs += srow[-bstep-1] + srow[-bstep+1];
   1069                         ng++;
   1070                     }
   1071                     if( gradS < T )
   1072                     {
   1073                         Rs += srow[bstep*2] + srow[0];
   1074                         Gs += srow[bstep]*2;
   1075                         Bs += srow[bstep-1] + srow[bstep+1];
   1076                         ng++;
   1077                     }
   1078                     if( gradW < T )
   1079                     {
   1080                         Rs += srow[-2] + srow[0];
   1081                         Gs += srow[-1]*2;
   1082                         Bs += srow[-bstep-1] + srow[bstep-1];
   1083                         ng++;
   1084                     }
   1085                     if( gradE < T )
   1086                     {
   1087                         Rs += srow[2] + srow[0];
   1088                         Gs += srow[1]*2;
   1089                         Bs += srow[-bstep+1] + srow[bstep+1];
   1090                         ng++;
   1091                     }
   1092                     if( gradNE < T )
   1093                     {
   1094                         Rs += srow[-bstep*2+2] + srow[0];
   1095                         Gs += brow0[N6+1];
   1096                         Bs += srow[-bstep+1]*2;
   1097                         ng++;
   1098                     }
   1099                     if( gradSW < T )
   1100                     {
   1101                         Rs += srow[bstep*2-2] + srow[0];
   1102                         Gs += brow2[N6-1];
   1103                         Bs += srow[bstep-1]*2;
   1104                         ng++;
   1105                     }
   1106                     if( gradNW < T )
   1107                     {
   1108                         Rs += srow[-bstep*2-2] + srow[0];
   1109                         Gs += brow0[N6-1];
   1110                         Bs += srow[-bstep+1]*2;
   1111                         ng++;
   1112                     }
   1113                     if( gradSE < T )
   1114                     {
   1115                         Rs += srow[bstep*2+2] + srow[0];
   1116                         Gs += brow2[N6+1];
   1117                         Bs += srow[-bstep+1]*2;
   1118                         ng++;
   1119                     }
   1120                     R = srow[0];
   1121                     G = R + cvRound((Gs - Rs)*scale[ng]);
   1122                     B = R + cvRound((Bs - Rs)*scale[ng]);
   1123                 }
   1124                 else
   1125                 {
   1126                     int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
   1127                     int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
   1128                     int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
   1129                     int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
   1130 
   1131                     minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE);
   1132                     maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE);
   1133                     int T = minGrad + MAX(maxGrad/2, 1);
   1134 
   1135                     int Rs = 0, Gs = 0, Bs = 0, ng = 0;
   1136                     if( gradN < T )
   1137                     {
   1138                         Rs += srow[-bstep*2-1] + srow[-bstep*2+1];
   1139                         Gs += srow[-bstep*2] + srow[0];
   1140                         Bs += srow[-bstep]*2;
   1141                         ng++;
   1142                     }
   1143                     if( gradS < T )
   1144                     {
   1145                         Rs += srow[bstep*2-1] + srow[bstep*2+1];
   1146                         Gs += srow[bstep*2] + srow[0];
   1147                         Bs += srow[bstep]*2;
   1148                         ng++;
   1149                     }
   1150                     if( gradW < T )
   1151                     {
   1152                         Rs += srow[-1]*2;
   1153                         Gs += srow[-2] + srow[0];
   1154                         Bs += srow[-bstep-2]+srow[bstep-2];
   1155                         ng++;
   1156                     }
   1157                     if( gradE < T )
   1158                     {
   1159                         Rs += srow[1]*2;
   1160                         Gs += srow[2] + srow[0];
   1161                         Bs += srow[-bstep+2]+srow[bstep+2];
   1162                         ng++;
   1163                     }
   1164                     if( gradNE < T )
   1165                     {
   1166                         Rs += srow[-bstep*2+1] + srow[1];
   1167                         Gs += srow[-bstep+1]*2;
   1168                         Bs += srow[-bstep] + srow[-bstep+2];
   1169                         ng++;
   1170                     }
   1171                     if( gradSW < T )
   1172                     {
   1173                         Rs += srow[bstep*2-1] + srow[-1];
   1174                         Gs += srow[bstep-1]*2;
   1175                         Bs += srow[bstep] + srow[bstep-2];
   1176                         ng++;
   1177                     }
   1178                     if( gradNW < T )
   1179                     {
   1180                         Rs += srow[-bstep*2-1] + srow[-1];
   1181                         Gs += srow[-bstep-1]*2;
   1182                         Bs += srow[-bstep-2]+srow[-bstep];
   1183                         ng++;
   1184                     }
   1185                     if( gradSE < T )
   1186                     {
   1187                         Rs += srow[bstep*2+1] + srow[1];
   1188                         Gs += srow[bstep+1]*2;
   1189                         Bs += srow[bstep+2]+srow[bstep];
   1190                         ng++;
   1191                     }
   1192                     G = srow[0];
   1193                     R = G + cvRound((Rs - Gs)*scale[ng]);
   1194                     B = G + cvRound((Bs - Gs)*scale[ng]);
   1195                 }
   1196                 dstrow[blueIdx] = cv::saturate_cast<uchar>(B);
   1197                 dstrow[1] = cv::saturate_cast<uchar>(G);
   1198                 dstrow[blueIdx^2] = cv::saturate_cast<uchar>(R);
   1199                 greenCell = !greenCell;
   1200             }
   1201 
   1202 #if CV_SSE2
   1203             if( !haveSSE )
   1204                 break;
   1205 
   1206             __m128i emask    = _mm_set1_epi32(0x0000ffff),
   1207                     omask    = _mm_set1_epi32(0xffff0000),
   1208                     z        = _mm_setzero_si128(),
   1209                     one      = _mm_set1_epi16(1);
   1210             __m128 _0_5      = _mm_set1_ps(0.5f);
   1211 
   1212             #define _mm_merge_epi16(a, b) _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask)) //(aA_aA_aA_aA) * (bB_bB_bB_bB) => (bA_bA_bA_bA)
   1213             #define _mm_cvtloepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16))   //(1,2,3,4,5,6,7,8) => (1f,2f,3f,4f)
   1214             #define _mm_cvthiepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16))   //(1,2,3,4,5,6,7,8) => (5f,6f,7f,8f)
   1215             #define _mm_loadl_u8_s16(ptr, offset) _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)((ptr) + (offset))), z) //load 8 uchars to 8 shorts
   1216 
   1217             // process 8 pixels at once
   1218             for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )
   1219             {
   1220                 //int gradN = brow0[0] + brow1[0];
   1221                 __m128i gradN = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow0), _mm_loadu_si128((__m128i*)brow1));
   1222 
   1223                 //int gradS = brow1[0] + brow2[0];
   1224                 __m128i gradS = _mm_adds_epi16(_mm_loadu_si128((__m128i*)brow1), _mm_loadu_si128((__m128i*)brow2));
   1225 
   1226                 //int gradW = brow1[N-1] + brow1[N];
   1227                 __m128i gradW = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N-1)), _mm_loadu_si128((__m128i*)(brow1+N)));
   1228 
   1229                 //int gradE = brow1[N+1] + brow1[N];
   1230                 __m128i gradE = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N+1)), _mm_loadu_si128((__m128i*)(brow1+N)));
   1231 
   1232                 //int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
   1233                 //int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
   1234                 __m128i minGrad = _mm_min_epi16(_mm_min_epi16(gradN, gradS), _mm_min_epi16(gradW, gradE));
   1235                 __m128i maxGrad = _mm_max_epi16(_mm_max_epi16(gradN, gradS), _mm_max_epi16(gradW, gradE));
   1236 
   1237                 __m128i grad0, grad1;
   1238 
   1239                 //int gradNE = brow0[N4+1] + brow1[N4];
   1240                 //int gradNE = brow0[N2] + brow0[N2+1] + brow1[N2] + brow1[N2+1];
   1241                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N4+1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
   1242                 grad1 = _mm_adds_epi16( _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N2)), _mm_loadu_si128((__m128i*)(brow0+N2+1))),
   1243                                         _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2+1))));
   1244                 __m128i gradNE = _mm_merge_epi16(grad0, grad1);
   1245 
   1246                 //int gradSW = brow1[N4] + brow2[N4-1];
   1247                 //int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
   1248                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N4-1)), _mm_loadu_si128((__m128i*)(brow1+N4)));
   1249                 grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N2)), _mm_loadu_si128((__m128i*)(brow2+N2-1))),
   1250                                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N2)), _mm_loadu_si128((__m128i*)(brow1+N2-1))));
   1251                 __m128i gradSW = _mm_merge_epi16(grad0, grad1);
   1252 
   1253                 minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNE), gradSW);
   1254                 maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNE), gradSW);
   1255 
   1256                 //int gradNW = brow0[N5-1] + brow1[N5];
   1257                 //int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
   1258                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N5-1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
   1259                 grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow0+N3)), _mm_loadu_si128((__m128i*)(brow0+N3-1))),
   1260                                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3-1))));
   1261                 __m128i gradNW = _mm_merge_epi16(grad0, grad1);
   1262 
   1263                 //int gradSE = brow1[N5] + brow2[N5+1];
   1264                 //int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
   1265                 grad0 = _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N5+1)), _mm_loadu_si128((__m128i*)(brow1+N5)));
   1266                 grad1 = _mm_adds_epi16(_mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow2+N3)), _mm_loadu_si128((__m128i*)(brow2+N3+1))),
   1267                                        _mm_adds_epi16(_mm_loadu_si128((__m128i*)(brow1+N3)), _mm_loadu_si128((__m128i*)(brow1+N3+1))));
   1268                 __m128i gradSE = _mm_merge_epi16(grad0, grad1);
   1269 
   1270                 minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNW), gradSE);
   1271                 maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNW), gradSE);
   1272 
   1273                 //int T = minGrad + maxGrad/2;
   1274                 __m128i T = _mm_adds_epi16(_mm_max_epi16(_mm_srli_epi16(maxGrad, 1), one), minGrad);
   1275 
   1276                 __m128i RGs = z, GRs = z, Bs = z, ng = z;
   1277 
   1278                 __m128i x0  = _mm_loadl_u8_s16(srow, +0          );
   1279                 __m128i x1  = _mm_loadl_u8_s16(srow, -1 - bstep  );
   1280                 __m128i x2  = _mm_loadl_u8_s16(srow, -1 - bstep*2);
   1281                 __m128i x3  = _mm_loadl_u8_s16(srow,    - bstep  );
   1282                 __m128i x4  = _mm_loadl_u8_s16(srow, +1 - bstep*2);
   1283                 __m128i x5  = _mm_loadl_u8_s16(srow, +1 - bstep  );
   1284                 __m128i x6  = _mm_loadl_u8_s16(srow, +2 - bstep  );
   1285                 __m128i x7  = _mm_loadl_u8_s16(srow, +1          );
   1286                 __m128i x8  = _mm_loadl_u8_s16(srow, +2 + bstep  );
   1287                 __m128i x9  = _mm_loadl_u8_s16(srow, +1 + bstep  );
   1288                 __m128i x10 = _mm_loadl_u8_s16(srow, +1 + bstep*2);
   1289                 __m128i x11 = _mm_loadl_u8_s16(srow,    + bstep  );
   1290                 __m128i x12 = _mm_loadl_u8_s16(srow, -1 + bstep*2);
   1291                 __m128i x13 = _mm_loadl_u8_s16(srow, -1 + bstep  );
   1292                 __m128i x14 = _mm_loadl_u8_s16(srow, -2 + bstep  );
   1293                 __m128i x15 = _mm_loadl_u8_s16(srow, -1          );
   1294                 __m128i x16 = _mm_loadl_u8_s16(srow, -2 - bstep  );
   1295 
   1296                 __m128i t0, t1, mask;
   1297 
   1298                 // gradN ***********************************************
   1299                 mask = _mm_cmpgt_epi16(T, gradN); // mask = T>gradN
   1300                 ng = _mm_sub_epi16(ng, mask);     // ng += (T>gradN)
   1301 
   1302                 t0 = _mm_slli_epi16(x3, 1);                                 // srow[-bstep]*2
   1303                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2), x0);  // srow[-bstep*2] + srow[0]
   1304 
   1305                 // RGs += (srow[-bstep*2] + srow[0]) * (T>gradN)
   1306                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
   1307                 // GRs += {srow[-bstep]*2; (srow[-bstep*2-1] + srow[-bstep*2+1])} * (T>gradN)
   1308                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x2,x4)), mask));
   1309                 // Bs  += {(srow[-bstep-1]+srow[-bstep+1]); srow[-bstep]*2 } * (T>gradN)
   1310                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x5), t0), mask));
   1311 
   1312                 // gradNE **********************************************
   1313                 mask = _mm_cmpgt_epi16(T, gradNE); // mask = T>gradNE
   1314                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradNE)
   1315 
   1316                 t0 = _mm_slli_epi16(x5, 1);                                    // srow[-bstep+1]*2
   1317                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -bstep*2+2), x0);   // srow[-bstep*2+2] + srow[0]
   1318 
   1319                 // RGs += {(srow[-bstep*2+2] + srow[0]); srow[-bstep+1]*2} * (T>gradNE)
   1320                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
   1321                 // GRs += {brow0[N6+1]; (srow[-bstep*2+1] + srow[1])} * (T>gradNE)
   1322                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)), _mm_adds_epi16(x4,x7)), mask));
   1323                 // Bs  += {srow[-bstep+1]*2; (srow[-bstep] + srow[-bstep+2])}  * (T>gradNE)
   1324                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x3,x6)), mask));
   1325 
   1326                 // gradE ***********************************************
   1327                 mask = _mm_cmpgt_epi16(T, gradE);  // mask = T>gradE
   1328                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradE)
   1329 
   1330                 t0 = _mm_slli_epi16(x7, 1);                         // srow[1]*2
   1331                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, 2), x0); // srow[2] + srow[0]
   1332 
   1333                 // RGs += (srow[2] + srow[0]) * (T>gradE)
   1334                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
   1335                 // GRs += (srow[1]*2) * (T>gradE)
   1336                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
   1337                 // Bs  += {(srow[-bstep+1]+srow[bstep+1]); (srow[-bstep+2]+srow[bstep+2])} * (T>gradE)
   1338                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x5,x9), _mm_adds_epi16(x6,x8)), mask));
   1339 
   1340                 // gradSE **********************************************
   1341                 mask = _mm_cmpgt_epi16(T, gradSE);  // mask = T>gradSE
   1342                 ng = _mm_sub_epi16(ng, mask);       // ng += (T>gradSE)
   1343 
   1344                 t0 = _mm_slli_epi16(x9, 1);                                 // srow[bstep+1]*2
   1345                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2+2), x0); // srow[bstep*2+2] + srow[0]
   1346 
   1347                 // RGs += {(srow[bstep*2+2] + srow[0]); srow[bstep+1]*2} * (T>gradSE)
   1348                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
   1349                 // GRs += {brow2[N6+1]; (srow[1]+srow[bstep*2+1])} * (T>gradSE)
   1350                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)), _mm_adds_epi16(x7,x10)), mask));
   1351                 // Bs  += {srow[-bstep+1]*2; (srow[bstep+2]+srow[bstep])} * (T>gradSE)
   1352                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x5, 1), _mm_adds_epi16(x8,x11)), mask));
   1353 
   1354                 // gradS ***********************************************
   1355                 mask = _mm_cmpgt_epi16(T, gradS);  // mask = T>gradS
   1356                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradS)
   1357 
   1358                 t0 = _mm_slli_epi16(x11, 1);                             // srow[bstep]*2
   1359                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow,bstep*2), x0); // srow[bstep*2]+srow[0]
   1360 
   1361                 // RGs += (srow[bstep*2]+srow[0]) * (T>gradS)
   1362                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
   1363                 // GRs += {srow[bstep]*2; (srow[bstep*2+1]+srow[bstep*2-1])} * (T>gradS)
   1364                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epi16(x10,x12)), mask));
   1365                 // Bs  += {(srow[bstep+1]+srow[bstep-1]); srow[bstep]*2} * (T>gradS)
   1366                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x9,x13), t0), mask));
   1367 
   1368                 // gradSW **********************************************
   1369                 mask = _mm_cmpgt_epi16(T, gradSW);  // mask = T>gradSW
   1370                 ng = _mm_sub_epi16(ng, mask);       // ng += (T>gradSW)
   1371 
   1372                 t0 = _mm_slli_epi16(x13, 1);                                // srow[bstep-1]*2
   1373                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, bstep*2-2), x0); // srow[bstep*2-2]+srow[0]
   1374 
   1375                 // RGs += {(srow[bstep*2-2]+srow[0]); srow[bstep-1]*2} * (T>gradSW)
   1376                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
   1377                 // GRs += {brow2[N6-1]; (srow[bstep*2-1]+srow[-1])} * (T>gradSW)
   1378                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)), _mm_adds_epi16(x12,x15)), mask));
   1379                 // Bs  += {srow[bstep-1]*2; (srow[bstep]+srow[bstep-2])} * (T>gradSW)
   1380                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epi16(x11,x14)), mask));
   1381 
   1382                 // gradW ***********************************************
   1383                 mask = _mm_cmpgt_epi16(T, gradW);  // mask = T>gradW
   1384                 ng = _mm_sub_epi16(ng, mask);      // ng += (T>gradW)
   1385 
   1386                 t0 = _mm_slli_epi16(x15, 1);                         // srow[-1]*2
   1387                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow, -2), x0); // srow[-2]+srow[0]
   1388 
   1389                 // RGs += (srow[-2]+srow[0]) * (T>gradW)
   1390                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(t1, mask));
   1391                 // GRs += (srow[-1]*2) * (T>gradW)
   1392                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(t0, mask));
   1393                 // Bs  += {(srow[-bstep-1]+srow[bstep-1]); (srow[bstep-2]+srow[-bstep-2])} * (T>gradW)
   1394                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epi16(x1,x13), _mm_adds_epi16(x14,x16)), mask));
   1395 
   1396                 // gradNW **********************************************
   1397                 mask = _mm_cmpgt_epi16(T, gradNW);  // mask = T>gradNW
   1398                 ng = _mm_sub_epi16(ng, mask);       // ng += (T>gradNW)
   1399 
   1400                 t0 = _mm_slli_epi16(x1, 1);                                 // srow[-bstep-1]*2
   1401                 t1 = _mm_adds_epi16(_mm_loadl_u8_s16(srow,-bstep*2-2), x0); // srow[-bstep*2-2]+srow[0]
   1402 
   1403                 // RGs += {(srow[-bstep*2-2]+srow[0]); srow[-bstep-1]*2} * (T>gradNW)
   1404                 RGs = _mm_adds_epi16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
   1405                 // GRs += {brow0[N6-1]; (srow[-bstep*2-1]+srow[-1])} * (T>gradNW)
   1406                 GRs = _mm_adds_epi16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)), _mm_adds_epi16(x2,x15)), mask));
   1407                 // Bs  += {srow[-bstep-1]*2; (srow[-bstep]+srow[-bstep-2])} * (T>gradNW)
   1408                 Bs  = _mm_adds_epi16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_slli_epi16(x5, 1),_mm_adds_epi16(x3,x16)), mask));
   1409 
   1410                 __m128 ngf0 = _mm_div_ps(_0_5, _mm_cvtloepi16_ps(ng));
   1411                 __m128 ngf1 = _mm_div_ps(_0_5, _mm_cvthiepi16_ps(ng));
   1412 
   1413                 // now interpolate r, g & b
   1414                 t0 = _mm_subs_epi16(GRs, RGs);
   1415                 t1 = _mm_subs_epi16(Bs, RGs);
   1416 
   1417                 t0 = _mm_add_epi16(x0, _mm_packs_epi32(
   1418                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t0), ngf0)),
   1419                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t0), ngf1))));
   1420 
   1421                 t1 = _mm_add_epi16(x0, _mm_packs_epi32(
   1422                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t1), ngf0)),
   1423                                                        _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t1), ngf1))));
   1424 
   1425                 x1 = _mm_merge_epi16(x0, t0);
   1426                 x2 = _mm_merge_epi16(t0, x0);
   1427 
   1428                 uchar R[8], G[8], B[8];
   1429 
   1430                 _mm_storel_epi64(blueIdx ? (__m128i*)B : (__m128i*)R, _mm_packus_epi16(x1, z));
   1431                 _mm_storel_epi64((__m128i*)G, _mm_packus_epi16(x2, z));
   1432                 _mm_storel_epi64(blueIdx ? (__m128i*)R : (__m128i*)B, _mm_packus_epi16(t1, z));
   1433 
   1434                 for( int j = 0; j < 8; j++, dstrow += 3 )
   1435                 {
   1436                     dstrow[0] = B[j]; dstrow[1] = G[j]; dstrow[2] = R[j];
   1437                 }
   1438             }
   1439 #endif
   1440 
   1441             limit = N - 2;
   1442         }
   1443         while( i < N - 2 );
   1444 
   1445         for( i = 0; i < 6; i++ )
   1446         {
   1447             dst[dststep*y + 5 - i] = dst[dststep*y + 8 - i];
   1448             dst[dststep*y + (N - 2)*3 + i] = dst[dststep*y + (N - 3)*3 + i];
   1449         }
   1450 
   1451         greenCell0 = !greenCell0;
   1452         blueIdx ^= 2;
   1453     }
   1454 
   1455     for( i = 0; i < size.width*3; i++ )
   1456     {
   1457         dst[i] = dst[i + dststep] = dst[i + dststep*2];
   1458         dst[i + dststep*(size.height-4)] =
   1459         dst[i + dststep*(size.height-3)] =
   1460         dst[i + dststep*(size.height-2)] =
   1461         dst[i + dststep*(size.height-1)] = dst[i + dststep*(size.height-5)];
   1462     }
   1463 }
   1464 
   1465 //////////////////////////////// Edge-Aware Demosaicing //////////////////////////////////
   1466 
   1467 template <typename T, typename SIMDInterpolator>
   1468 class Bayer2RGB_EdgeAware_T_Invoker :
   1469     public cv::ParallelLoopBody
   1470 {
   1471 public:
   1472     Bayer2RGB_EdgeAware_T_Invoker(const Mat& _src, Mat& _dst, const Size& _size,
   1473         int _blue, int _start_with_green) :
   1474         ParallelLoopBody(),
   1475         src(_src), dst(_dst), size(_size), Blue(_blue), Start_with_green(_start_with_green)
   1476     {
   1477     }
   1478 
   1479     virtual void operator()(const Range& range) const
   1480     {
   1481         int dcn = dst.channels();
   1482         int dcn2 = dcn<<1;
   1483         int start_with_green = Start_with_green, blue = Blue;
   1484         int sstep = int(src.step / src.elemSize1()), dstep = int(dst.step / dst.elemSize1());
   1485         SIMDInterpolator vecOp;
   1486 
   1487         const T* S = src.ptr<T>(range.start + 1) + 1;
   1488         T* D = reinterpret_cast<T*>(dst.data + (range.start + 1) * dst.step) + dcn;
   1489 
   1490         if (range.start % 2)
   1491         {
   1492             start_with_green ^= 1;
   1493             blue ^= 1;
   1494         }
   1495 
   1496         // to BGR
   1497         for (int y = range.start; y < range.end; ++y)
   1498         {
   1499             int x = 1;
   1500             if (start_with_green)
   1501             {
   1502                 D[blue<<1] = (S[-sstep] + S[sstep]) >> 1;
   1503                 D[1] = S[0];
   1504                 D[2-(blue<<1)] = (S[-1] + S[1]) >> 1;
   1505                 D += dcn;
   1506                 ++S;
   1507                 ++x;
   1508             }
   1509 
   1510             int delta = vecOp.bayer2RGB_EA(S - sstep - 1, sstep, D, size.width, blue);
   1511             x += delta;
   1512             S += delta;
   1513             D += dcn * delta;
   1514 
   1515             if (blue)
   1516                 for (; x < size.width; x += 2, S += 2, D += dcn2)
   1517                 {
   1518                     D[0] = S[0];
   1519                     D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[sstep] - S[-sstep]) ? (S[sstep] + S[-sstep] + 1) : (S[-1] + S[1] + 1)) >> 1;
   1520                     D[2] = (S[-sstep-1] + S[-sstep+1] + S[sstep-1] + S[sstep+1]) >> 2;
   1521 
   1522                     D[3] = (S[0] + S[2] + 1) >> 1;
   1523                     D[4] = S[1];
   1524                     D[5] = (S[-sstep+1] + S[sstep+1] + 1) >> 1;
   1525                 }
   1526             else
   1527                 for (; x < size.width; x += 2, S += 2, D += dcn2)
   1528                 {
   1529                     D[0] = (S[-sstep-1] + S[-sstep+1] + S[sstep-1] + S[sstep+1] + 2) >> 2;
   1530                     D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[sstep] - S[-sstep]) ? (S[sstep] + S[-sstep] + 1) : (S[-1] + S[1] + 1)) >> 1;
   1531                     D[2] = S[0];
   1532 
   1533                     D[3] = (S[-sstep+1] + S[sstep+1] + 1) >> 1;
   1534                     D[4] = S[1];
   1535                     D[5] = (S[0] + S[2] + 1) >> 1;
   1536                 }
   1537 
   1538             if (x <= size.width)
   1539             {
   1540                 D[blue<<1] = (S[-sstep-1] + S[-sstep+1] + S[sstep-1] + S[sstep+1] + 2) >> 2;
   1541                 D[1] = (std::abs(S[-1] - S[1]) > std::abs(S[sstep] - S[-sstep]) ? (S[sstep] + S[-sstep] + 1) : (S[-1] + S[1] + 1)) >> 1;
   1542                 D[2-(blue<<1)] = S[0];
   1543                 D += dcn;
   1544                 ++S;
   1545             }
   1546 
   1547             for (int i = 0; i < dcn; ++i)
   1548             {
   1549                 D[i] = D[-dcn + i];
   1550                 D[-dstep+dcn+i] = D[-dstep+(dcn<<1)+i];
   1551             }
   1552 
   1553             start_with_green ^= 1;
   1554             blue ^= 1;
   1555             S += 2;
   1556             D += dcn2;
   1557         }
   1558     }
   1559 
   1560 private:
   1561     Mat src;
   1562     Mat dst;
   1563     Size size;
   1564     int Blue, Start_with_green;
   1565 };
   1566 
   1567 template <typename T, typename SIMDInterpolator>
   1568 static void Bayer2RGB_EdgeAware_T(const Mat& src, Mat& dst, int code)
   1569 {
   1570     Size size = src.size();
   1571 
   1572     // for small sizes
   1573     if (size.width <= 2 || size.height <= 2)
   1574     {
   1575         dst = Scalar::all(0);
   1576         return;
   1577     }
   1578 
   1579     size.width -= 2;
   1580     size.height -= 2;
   1581 
   1582     int start_with_green = code == CV_BayerGB2BGR_EA || code == CV_BayerGR2BGR_EA ? 1 : 0;
   1583     int blue = code == CV_BayerGB2BGR_EA || code == CV_BayerBG2BGR_EA ? 1 : 0;
   1584 
   1585     if (size.height > 0)
   1586     {
   1587         Bayer2RGB_EdgeAware_T_Invoker<T, SIMDInterpolator> invoker(src, dst, size, blue, start_with_green);
   1588         Range range(0, size.height);
   1589         parallel_for_(range, invoker, dst.total()/static_cast<double>(1<<16));
   1590     }
   1591     size = dst.size();
   1592     size.width *= dst.channels();
   1593     size_t dstep = dst.step / dst.elemSize1();
   1594     T* firstRow = dst.ptr<T>();
   1595     T* lastRow = dst.ptr<T>() + (size.height-1) * dstep;
   1596 
   1597     if (size.height > 2)
   1598     {
   1599         for (int x = 0; x < size.width; ++x)
   1600         {
   1601             firstRow[x] = (firstRow+dstep)[x];
   1602             lastRow[x] = (lastRow-dstep)[x];
   1603         }
   1604     }
   1605     else
   1606         for (int x = 0; x < size.width; ++x)
   1607             firstRow[x] = lastRow[x] = 0;
   1608 }
   1609 
   1610 } // end namespace cv
   1611 
   1612 //////////////////////////////////////////////////////////////////////////////////////////
   1613 //                           The main Demosaicing function                              //
   1614 //////////////////////////////////////////////////////////////////////////////////////////
   1615 
   1616 void cv::demosaicing(InputArray _src, OutputArray _dst, int code, int dcn)
   1617 {
   1618     Mat src = _src.getMat(), dst;
   1619     Size sz = src.size();
   1620     int scn = src.channels(), depth = src.depth();
   1621 
   1622     CV_Assert(depth == CV_8U || depth == CV_16U);
   1623     CV_Assert(!src.empty());
   1624 
   1625     switch (code)
   1626     {
   1627     case CV_BayerBG2GRAY: case CV_BayerGB2GRAY: case CV_BayerRG2GRAY: case CV_BayerGR2GRAY:
   1628         if (dcn <= 0)
   1629             dcn = 1;
   1630         CV_Assert( scn == 1 && dcn == 1 );
   1631 
   1632         _dst.create(sz, CV_MAKETYPE(depth, dcn));
   1633         dst = _dst.getMat();
   1634 
   1635         if( depth == CV_8U )
   1636             Bayer2Gray_<uchar, SIMDBayerInterpolator_8u>(src, dst, code);
   1637         else if( depth == CV_16U )
   1638             Bayer2Gray_<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst, code);
   1639         else
   1640             CV_Error(CV_StsUnsupportedFormat, "Bayer->Gray demosaicing only supports 8u and 16u types");
   1641         break;
   1642 
   1643     case CV_BayerBG2BGR: case CV_BayerGB2BGR: case CV_BayerRG2BGR: case CV_BayerGR2BGR:
   1644     case CV_BayerBG2BGR_VNG: case CV_BayerGB2BGR_VNG: case CV_BayerRG2BGR_VNG: case CV_BayerGR2BGR_VNG:
   1645         {
   1646             if (dcn <= 0)
   1647                 dcn = 3;
   1648             CV_Assert( scn == 1 && (dcn == 3 || dcn == 4) );
   1649 
   1650             _dst.create(sz, CV_MAKE_TYPE(depth, dcn));
   1651             Mat dst_ = _dst.getMat();
   1652 
   1653             if( code == CV_BayerBG2BGR || code == CV_BayerGB2BGR ||
   1654                 code == CV_BayerRG2BGR || code == CV_BayerGR2BGR )
   1655             {
   1656                 if( depth == CV_8U )
   1657                     Bayer2RGB_<uchar, SIMDBayerInterpolator_8u>(src, dst_, code);
   1658                 else if( depth == CV_16U )
   1659                     Bayer2RGB_<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst_, code);
   1660                 else
   1661                     CV_Error(CV_StsUnsupportedFormat, "Bayer->RGB demosaicing only supports 8u and 16u types");
   1662             }
   1663             else
   1664             {
   1665                 CV_Assert( depth == CV_8U );
   1666                 Bayer2RGB_VNG_8u(src, dst_, code);
   1667             }
   1668         }
   1669         break;
   1670 
   1671     case CV_BayerBG2BGR_EA: case CV_BayerGB2BGR_EA: case CV_BayerRG2BGR_EA: case CV_BayerGR2BGR_EA:
   1672         if (dcn <= 0)
   1673             dcn = 3;
   1674 
   1675         CV_Assert(scn == 1 && dcn == 3);
   1676         _dst.create(sz, CV_MAKETYPE(depth, dcn));
   1677         dst = _dst.getMat();
   1678 
   1679         if (depth == CV_8U)
   1680             Bayer2RGB_EdgeAware_T<uchar, SIMDBayerInterpolator_8u>(src, dst, code);
   1681         else if (depth == CV_16U)
   1682             Bayer2RGB_EdgeAware_T<ushort, SIMDBayerStubInterpolator_<ushort> >(src, dst, code);
   1683         else
   1684             CV_Error(CV_StsUnsupportedFormat, "Bayer->RGB Edge-Aware demosaicing only currently supports 8u and 16u types");
   1685 
   1686         break;
   1687 
   1688     default:
   1689         CV_Error( CV_StsBadFlag, "Unknown / unsupported color conversion code" );
   1690     }
   1691 }
   1692