/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
#define __OPENCV_CORE_SSE_UTILS_HPP__

#ifndef __cplusplus
#  error sse_utils.hpp header must be compiled as C++
#endif

// cvdef.h provides the CV_SSE2/CV_SSE4_1 feature macros and, when those are
// enabled, pulls in the SSE intrinsic headers used below.  It is included
// here so this header can be used on its own rather than relying on being
// included after the other opencv2/core headers.
#include "opencv2/core/cvdef.h"

#if CV_SSE2

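// The helpers below convert pixel data between interleaved order
// (RGRG..., RGBRGB..., RGBARGBA...) and planar order (all R, then all G, ...)
// entirely in SSE registers.  Each overload takes two registers per channel:
// on input the registers are expected to hold consecutive 16-byte chunks of
// the interleaved stream in parameter order, and on output each named pair
// (v_r0/v_r1, v_g0/v_g1, ...) holds one separated channel plane.  The
// _mm_deinterleave_* functions apply several rounds of unpacklo/unpackhi
// (an inverse "perfect shuffle" network); the _mm_interleave_* functions
// undo that with mask/shift-and-pack rounds.
//
// The three overloads below handle 2-, 3- and 4-channel 8-bit data; with
// 16 bytes per register each call processes 32 pixels.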
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
}

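// Example use of the two-channel _mm_deinterleave_epi8 above (illustrative
// sketch only; 'src', 'dst_r' and 'dst_g' are hypothetical uchar buffers of
// at least 64, 32 and 32 bytes respectively):
//
//     __m128i v0 = _mm_loadu_si128((const __m128i *)(src));       // r0 g0 ... r7 g7
//     __m128i v1 = _mm_loadu_si128((const __m128i *)(src + 16));  // r8 g8 ... r15 g15
//     __m128i v2 = _mm_loadu_si128((const __m128i *)(src + 32));
//     __m128i v3 = _mm_loadu_si128((const __m128i *)(src + 48));
//     _mm_deinterleave_epi8(v0, v1, v2, v3);  // v0,v1 -> r plane, v2,v3 -> g plane
//     _mm_storeu_si128((__m128i *)(dst_r), v0);
//     _mm_storeu_si128((__m128i *)(dst_r + 16), v1);
//     _mm_storeu_si128((__m128i *)(dst_g), v2);
//     _mm_storeu_si128((__m128i *)(dst_g + 16), v3);
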
inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}

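// The _mm_interleave_epi8 overloads below perform the inverse operation:
// planar channel pairs in, consecutive interleaved chunks out (same register
// convention as the deinterleave functions, just reversed).  Each round
// separates even and odd bytes with a mask / 8-bit shift and re-packs them
// with _mm_packus_epi16; because the operands are masked to 0..255 first,
// the saturating pack is exact.  The "layer" numbering runs backwards here
// because the rounds mirror the deinterleave network in reverse order.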
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}

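// 16-bit variants of the deinterleave helpers: same register convention as
// the 8-bit versions, but with 8 ushort elements per register (16 pixels per
// call) and one fewer unpack round.  Only SSE2 is required here; the matching
// _mm_interleave_epi16 functions live in the CV_SSE4_1 block further below
// because they need an SSE4.1 pack instruction.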
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}

#if CV_SSE4_1

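// The 16-bit interleave helpers mirror the 8-bit ones (mask / 16-bit shift,
// then pack), but the unsigned 32->16 pack they rely on, _mm_packus_epi32,
// is an SSE4.1 instruction, hence the CV_SSE4_1 guard around this block.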
inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}

#endif // CV_SSE4_1

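// Single-precision float variants: 4 floats per register, so two registers
// per channel cover 8 pixels per call.  Deinterleaving only needs
// _mm_unpacklo_ps/_mm_unpackhi_ps, so plain SSE2 support is enough here.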
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}

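// The float interleave helpers rebuild the interleaved order with
// _mm_shuffle_ps: mask_lo (_MM_SHUFFLE(2, 0, 2, 0)) gathers the even-indexed
// lanes of its two operands and mask_hi (_MM_SHUFFLE(3, 1, 3, 1)) the
// odd-indexed ones, which is the float analogue of the mask/shift-and-pack
// rounds used for the integer types.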
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}

#endif // CV_SSE2

#endif //__OPENCV_CORE_SSE_UTILS_HPP__