/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CORE_SSE_UTILS_HPP__
#define __OPENCV_CORE_SSE_UTILS_HPP__

#ifndef __cplusplus
#  error sse_utils.hpp header must be compiled as C++
#endif

#if CV_SSE2

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk3);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk2);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk3);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk3);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk2);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk2);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk3);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk3);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                  __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk5);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk3);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk4);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk5);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk5);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk3);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk3);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk4);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk4);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk5);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk5);
}

inline void _mm_deinterleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                  __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi8(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi8(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi8(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi8(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi8(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi8(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi8(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi8(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi8(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi8(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi8(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi8(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi8(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi8(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi8(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi8(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi8(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi8(layer2_chunk3, layer2_chunk7);

    __m128i layer4_chunk0 = _mm_unpacklo_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk1 = _mm_unpackhi_epi8(layer3_chunk0, layer3_chunk4);
    __m128i layer4_chunk2 = _mm_unpacklo_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk3 = _mm_unpackhi_epi8(layer3_chunk1, layer3_chunk5);
    __m128i layer4_chunk4 = _mm_unpacklo_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk5 = _mm_unpackhi_epi8(layer3_chunk2, layer3_chunk6);
    __m128i layer4_chunk6 = _mm_unpacklo_epi8(layer3_chunk3, layer3_chunk7);
    __m128i layer4_chunk7 = _mm_unpackhi_epi8(layer3_chunk3, layer3_chunk7);

    v_r0 = _mm_unpacklo_epi8(layer4_chunk0, layer4_chunk4);
    v_r1 = _mm_unpackhi_epi8(layer4_chunk0, layer4_chunk4);
    v_g0 = _mm_unpacklo_epi8(layer4_chunk1, layer4_chunk5);
    v_g1 = _mm_unpackhi_epi8(layer4_chunk1, layer4_chunk5);
    v_b0 = _mm_unpacklo_epi8(layer4_chunk2, layer4_chunk6);
    v_b1 = _mm_unpackhi_epi8(layer4_chunk2, layer4_chunk6);
    v_a0 = _mm_unpacklo_epi8(layer4_chunk3, layer4_chunk7);
    v_a1 = _mm_unpackhi_epi8(layer4_chunk3, layer4_chunk7);
}
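
// Note on the _mm_deinterleave_epi8 overloads above: each is a butterfly of
// _mm_unpacklo_epi8/_mm_unpackhi_epi8 perfect shuffles, and five layers fully
// separate registers loaded straight from interleaved memory into the planar
// channels named by the arguments. A minimal usage sketch for the two-channel
// overload; the name `src` and the buffer itself are illustrative, not part of
// this header, and SSE2 plus at least 64 readable bytes are assumed:
//
//     const unsigned char * src = ...;                           // c0 c1 c0 c1 ...
//     __m128i v_r0 = _mm_loadu_si128((const __m128i *)src);      // bytes  0..15
//     __m128i v_r1 = _mm_loadu_si128((const __m128i *)src + 1);  // bytes 16..31
//     __m128i v_g0 = _mm_loadu_si128((const __m128i *)src + 2);  // bytes 32..47
//     __m128i v_g1 = _mm_loadu_si128((const __m128i *)src + 3);  // bytes 48..63
//     _mm_deinterleave_epi8(v_r0, v_r1, v_g0, v_g1);
//     // v_r0:v_r1 now hold the 32 first-channel bytes, v_g0:v_g1 the 32
//     // second-channel bytes; the 3- and 4-channel overloads follow the same
//     // convention (consecutive loads into the arguments in declaration order).
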
inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
}

inline void _mm_interleave_epi8(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi16(0x00ff);

    __m128i layer4_chunk0 = _mm_packus_epi16(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer4_chunk4 = _mm_packus_epi16(_mm_srli_epi16(v_r0, 8), _mm_srli_epi16(v_r1, 8));
    __m128i layer4_chunk1 = _mm_packus_epi16(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer4_chunk5 = _mm_packus_epi16(_mm_srli_epi16(v_g0, 8), _mm_srli_epi16(v_g1, 8));
    __m128i layer4_chunk2 = _mm_packus_epi16(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer4_chunk6 = _mm_packus_epi16(_mm_srli_epi16(v_b0, 8), _mm_srli_epi16(v_b1, 8));
    __m128i layer4_chunk3 = _mm_packus_epi16(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer4_chunk7 = _mm_packus_epi16(_mm_srli_epi16(v_a0, 8), _mm_srli_epi16(v_a1, 8));

    __m128i layer3_chunk0 = _mm_packus_epi16(_mm_and_si128(layer4_chunk0, v_mask), _mm_and_si128(layer4_chunk1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk0, 8), _mm_srli_epi16(layer4_chunk1, 8));
    __m128i layer3_chunk1 = _mm_packus_epi16(_mm_and_si128(layer4_chunk2, v_mask), _mm_and_si128(layer4_chunk3, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk2, 8), _mm_srli_epi16(layer4_chunk3, 8));
    __m128i layer3_chunk2 = _mm_packus_epi16(_mm_and_si128(layer4_chunk4, v_mask), _mm_and_si128(layer4_chunk5, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk4, 8), _mm_srli_epi16(layer4_chunk5, 8));
    __m128i layer3_chunk3 = _mm_packus_epi16(_mm_and_si128(layer4_chunk6, v_mask), _mm_and_si128(layer4_chunk7, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer4_chunk6, 8), _mm_srli_epi16(layer4_chunk7, 8));

    __m128i layer2_chunk0 = _mm_packus_epi16(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk0, 8), _mm_srli_epi16(layer3_chunk1, 8));
    __m128i layer2_chunk1 = _mm_packus_epi16(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk2, 8), _mm_srli_epi16(layer3_chunk3, 8));
    __m128i layer2_chunk2 = _mm_packus_epi16(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk4, 8), _mm_srli_epi16(layer3_chunk5, 8));
    __m128i layer2_chunk3 = _mm_packus_epi16(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer3_chunk6, 8), _mm_srli_epi16(layer3_chunk7, 8));

    __m128i layer1_chunk0 = _mm_packus_epi16(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk0, 8), _mm_srli_epi16(layer2_chunk1, 8));
    __m128i layer1_chunk1 = _mm_packus_epi16(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk2, 8), _mm_srli_epi16(layer2_chunk3, 8));
    __m128i layer1_chunk2 = _mm_packus_epi16(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk4, 8), _mm_srli_epi16(layer2_chunk5, 8));
    __m128i layer1_chunk3 = _mm_packus_epi16(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi16(_mm_srli_epi16(layer2_chunk6, 8), _mm_srli_epi16(layer2_chunk7, 8));

    v_r0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk0, 8), _mm_srli_epi16(layer1_chunk1, 8));
    v_r1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk2, 8), _mm_srli_epi16(layer1_chunk3, 8));
    v_g0 = _mm_packus_epi16(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk4, 8), _mm_srli_epi16(layer1_chunk5, 8));
    v_g1 = _mm_packus_epi16(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi16(_mm_srli_epi16(layer1_chunk6, 8), _mm_srli_epi16(layer1_chunk7, 8));
}
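
// The _mm_interleave_epi8 overloads invert the deinterleave network with
// mask/shift plus _mm_packus_epi16 steps; the saturating pack is lossless here
// because every lane is pre-masked or pre-shifted into the 0..255 range. A
// minimal sketch for the two-channel overload (the name `dst` is illustrative):
// planar channels go in, and the arguments are stored in declaration order to
// produce interleaved memory.
//
//     _mm_interleave_epi8(v_r0, v_r1, v_g0, v_g1);
//     _mm_storeu_si128((__m128i *)dst,     v_r0);   // c0 c1 c0 c1 ... bytes  0..15
//     _mm_storeu_si128((__m128i *)dst + 1, v_r1);   // bytes 16..31
//     _mm_storeu_si128((__m128i *)dst + 2, v_g0);   // bytes 32..47
//     _mm_storeu_si128((__m128i *)dst + 3, v_g1);   // bytes 48..63
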
inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_g1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_g1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk2);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk3);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk3);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk2);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk3);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk3);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk2);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk2);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk3);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk3);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                   __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_g1);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_g1);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b0);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b0);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_b1);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_b1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk3);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk4);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk5);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk5);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk3);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk4);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk5);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk5);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk3);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk3);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk4);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk4);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk5);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk5);
}

inline void _mm_deinterleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                   __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i layer1_chunk0 = _mm_unpacklo_epi16(v_r0, v_b0);
    __m128i layer1_chunk1 = _mm_unpackhi_epi16(v_r0, v_b0);
    __m128i layer1_chunk2 = _mm_unpacklo_epi16(v_r1, v_b1);
    __m128i layer1_chunk3 = _mm_unpackhi_epi16(v_r1, v_b1);
    __m128i layer1_chunk4 = _mm_unpacklo_epi16(v_g0, v_a0);
    __m128i layer1_chunk5 = _mm_unpackhi_epi16(v_g0, v_a0);
    __m128i layer1_chunk6 = _mm_unpacklo_epi16(v_g1, v_a1);
    __m128i layer1_chunk7 = _mm_unpackhi_epi16(v_g1, v_a1);

    __m128i layer2_chunk0 = _mm_unpacklo_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk1 = _mm_unpackhi_epi16(layer1_chunk0, layer1_chunk4);
    __m128i layer2_chunk2 = _mm_unpacklo_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk3 = _mm_unpackhi_epi16(layer1_chunk1, layer1_chunk5);
    __m128i layer2_chunk4 = _mm_unpacklo_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk5 = _mm_unpackhi_epi16(layer1_chunk2, layer1_chunk6);
    __m128i layer2_chunk6 = _mm_unpacklo_epi16(layer1_chunk3, layer1_chunk7);
    __m128i layer2_chunk7 = _mm_unpackhi_epi16(layer1_chunk3, layer1_chunk7);

    __m128i layer3_chunk0 = _mm_unpacklo_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk1 = _mm_unpackhi_epi16(layer2_chunk0, layer2_chunk4);
    __m128i layer3_chunk2 = _mm_unpacklo_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk3 = _mm_unpackhi_epi16(layer2_chunk1, layer2_chunk5);
    __m128i layer3_chunk4 = _mm_unpacklo_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk5 = _mm_unpackhi_epi16(layer2_chunk2, layer2_chunk6);
    __m128i layer3_chunk6 = _mm_unpacklo_epi16(layer2_chunk3, layer2_chunk7);
    __m128i layer3_chunk7 = _mm_unpackhi_epi16(layer2_chunk3, layer2_chunk7);

    v_r0 = _mm_unpacklo_epi16(layer3_chunk0, layer3_chunk4);
    v_r1 = _mm_unpackhi_epi16(layer3_chunk0, layer3_chunk4);
    v_g0 = _mm_unpacklo_epi16(layer3_chunk1, layer3_chunk5);
    v_g1 = _mm_unpackhi_epi16(layer3_chunk1, layer3_chunk5);
    v_b0 = _mm_unpacklo_epi16(layer3_chunk2, layer3_chunk6);
    v_b1 = _mm_unpackhi_epi16(layer3_chunk2, layer3_chunk6);
    v_a0 = _mm_unpacklo_epi16(layer3_chunk3, layer3_chunk7);
    v_a1 = _mm_unpackhi_epi16(layer3_chunk3, layer3_chunk7);
}
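
// _mm_deinterleave_epi16 repeats the same idea on 16-bit lanes; with eight
// elements per register only four unpack layers are needed. Usage mirrors the
// epi8 overloads (consecutive loads into the arguments in declaration order;
// the name `src16` below is illustrative):
//
//     __m128i v_r0 = _mm_loadu_si128((const __m128i *)src16);      // ushorts 0..7
//     __m128i v_r1 = _mm_loadu_si128((const __m128i *)src16 + 1);  // ushorts 8..15
//     __m128i v_g0 = _mm_loadu_si128((const __m128i *)src16 + 2);
//     __m128i v_g1 = _mm_loadu_si128((const __m128i *)src16 + 3);
//     _mm_deinterleave_epi16(v_r0, v_r1, v_g0, v_g1);  // planar: 16 values per channel
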
#if CV_SSE4_1

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0,
                                 __m128i & v_g1, __m128i & v_b0, __m128i & v_b1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_g1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
}

inline void _mm_interleave_epi16(__m128i & v_r0, __m128i & v_r1, __m128i & v_g0, __m128i & v_g1,
                                 __m128i & v_b0, __m128i & v_b1, __m128i & v_a0, __m128i & v_a1)
{
    __m128i v_mask = _mm_set1_epi32(0x0000ffff);

    __m128i layer3_chunk0 = _mm_packus_epi32(_mm_and_si128(v_r0, v_mask), _mm_and_si128(v_r1, v_mask));
    __m128i layer3_chunk4 = _mm_packus_epi32(_mm_srli_epi32(v_r0, 16), _mm_srli_epi32(v_r1, 16));
    __m128i layer3_chunk1 = _mm_packus_epi32(_mm_and_si128(v_g0, v_mask), _mm_and_si128(v_g1, v_mask));
    __m128i layer3_chunk5 = _mm_packus_epi32(_mm_srli_epi32(v_g0, 16), _mm_srli_epi32(v_g1, 16));
    __m128i layer3_chunk2 = _mm_packus_epi32(_mm_and_si128(v_b0, v_mask), _mm_and_si128(v_b1, v_mask));
    __m128i layer3_chunk6 = _mm_packus_epi32(_mm_srli_epi32(v_b0, 16), _mm_srli_epi32(v_b1, 16));
    __m128i layer3_chunk3 = _mm_packus_epi32(_mm_and_si128(v_a0, v_mask), _mm_and_si128(v_a1, v_mask));
    __m128i layer3_chunk7 = _mm_packus_epi32(_mm_srli_epi32(v_a0, 16), _mm_srli_epi32(v_a1, 16));

    __m128i layer2_chunk0 = _mm_packus_epi32(_mm_and_si128(layer3_chunk0, v_mask), _mm_and_si128(layer3_chunk1, v_mask));
    __m128i layer2_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk0, 16), _mm_srli_epi32(layer3_chunk1, 16));
    __m128i layer2_chunk1 = _mm_packus_epi32(_mm_and_si128(layer3_chunk2, v_mask), _mm_and_si128(layer3_chunk3, v_mask));
    __m128i layer2_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk2, 16), _mm_srli_epi32(layer3_chunk3, 16));
    __m128i layer2_chunk2 = _mm_packus_epi32(_mm_and_si128(layer3_chunk4, v_mask), _mm_and_si128(layer3_chunk5, v_mask));
    __m128i layer2_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk4, 16), _mm_srli_epi32(layer3_chunk5, 16));
    __m128i layer2_chunk3 = _mm_packus_epi32(_mm_and_si128(layer3_chunk6, v_mask), _mm_and_si128(layer3_chunk7, v_mask));
    __m128i layer2_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer3_chunk6, 16), _mm_srli_epi32(layer3_chunk7, 16));

    __m128i layer1_chunk0 = _mm_packus_epi32(_mm_and_si128(layer2_chunk0, v_mask), _mm_and_si128(layer2_chunk1, v_mask));
    __m128i layer1_chunk4 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk0, 16), _mm_srli_epi32(layer2_chunk1, 16));
    __m128i layer1_chunk1 = _mm_packus_epi32(_mm_and_si128(layer2_chunk2, v_mask), _mm_and_si128(layer2_chunk3, v_mask));
    __m128i layer1_chunk5 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk2, 16), _mm_srli_epi32(layer2_chunk3, 16));
    __m128i layer1_chunk2 = _mm_packus_epi32(_mm_and_si128(layer2_chunk4, v_mask), _mm_and_si128(layer2_chunk5, v_mask));
    __m128i layer1_chunk6 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk4, 16), _mm_srli_epi32(layer2_chunk5, 16));
    __m128i layer1_chunk3 = _mm_packus_epi32(_mm_and_si128(layer2_chunk6, v_mask), _mm_and_si128(layer2_chunk7, v_mask));
    __m128i layer1_chunk7 = _mm_packus_epi32(_mm_srli_epi32(layer2_chunk6, 16), _mm_srli_epi32(layer2_chunk7, 16));

    v_r0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk0, v_mask), _mm_and_si128(layer1_chunk1, v_mask));
    v_b0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk0, 16), _mm_srli_epi32(layer1_chunk1, 16));
    v_r1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk2, v_mask), _mm_and_si128(layer1_chunk3, v_mask));
    v_b1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk2, 16), _mm_srli_epi32(layer1_chunk3, 16));
    v_g0 = _mm_packus_epi32(_mm_and_si128(layer1_chunk4, v_mask), _mm_and_si128(layer1_chunk5, v_mask));
    v_a0 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk4, 16), _mm_srli_epi32(layer1_chunk5, 16));
    v_g1 = _mm_packus_epi32(_mm_and_si128(layer1_chunk6, v_mask), _mm_and_si128(layer1_chunk7, v_mask));
    v_a1 = _mm_packus_epi32(_mm_srli_epi32(layer1_chunk6, 16), _mm_srli_epi32(layer1_chunk7, 16));
}

#endif // CV_SSE4_1
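
// The _mm_interleave_epi16 overloads above sit behind CV_SSE4_1 because
// _mm_packus_epi32 (unsigned saturating 32-to-16 pack) only exists from
// SSE4.1 on; masking/shifting keeps every lane within 0..65535, so the pack
// is again lossless. As with the epi8 variants, store the arguments in
// declaration order after the call to obtain interleaved memory.
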
inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_g1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_g1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk2);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk3);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk3);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk2);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk2);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk3);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk3);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                                __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_g1);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_g1);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b0);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b0);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_b1);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_b1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk3);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk4);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk5);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk5);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk3);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk3);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk4);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk4);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk5);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk5);
}

inline void _mm_deinterleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                                __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    __m128 layer1_chunk0 = _mm_unpacklo_ps(v_r0, v_b0);
    __m128 layer1_chunk1 = _mm_unpackhi_ps(v_r0, v_b0);
    __m128 layer1_chunk2 = _mm_unpacklo_ps(v_r1, v_b1);
    __m128 layer1_chunk3 = _mm_unpackhi_ps(v_r1, v_b1);
    __m128 layer1_chunk4 = _mm_unpacklo_ps(v_g0, v_a0);
    __m128 layer1_chunk5 = _mm_unpackhi_ps(v_g0, v_a0);
    __m128 layer1_chunk6 = _mm_unpacklo_ps(v_g1, v_a1);
    __m128 layer1_chunk7 = _mm_unpackhi_ps(v_g1, v_a1);

    __m128 layer2_chunk0 = _mm_unpacklo_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk1 = _mm_unpackhi_ps(layer1_chunk0, layer1_chunk4);
    __m128 layer2_chunk2 = _mm_unpacklo_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk3 = _mm_unpackhi_ps(layer1_chunk1, layer1_chunk5);
    __m128 layer2_chunk4 = _mm_unpacklo_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk5 = _mm_unpackhi_ps(layer1_chunk2, layer1_chunk6);
    __m128 layer2_chunk6 = _mm_unpacklo_ps(layer1_chunk3, layer1_chunk7);
    __m128 layer2_chunk7 = _mm_unpackhi_ps(layer1_chunk3, layer1_chunk7);

    v_r0 = _mm_unpacklo_ps(layer2_chunk0, layer2_chunk4);
    v_r1 = _mm_unpackhi_ps(layer2_chunk0, layer2_chunk4);
    v_g0 = _mm_unpacklo_ps(layer2_chunk1, layer2_chunk5);
    v_g1 = _mm_unpackhi_ps(layer2_chunk1, layer2_chunk5);
    v_b0 = _mm_unpacklo_ps(layer2_chunk2, layer2_chunk6);
    v_b1 = _mm_unpackhi_ps(layer2_chunk2, layer2_chunk6);
    v_a0 = _mm_unpacklo_ps(layer2_chunk3, layer2_chunk7);
    v_a1 = _mm_unpackhi_ps(layer2_chunk3, layer2_chunk7);
}
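
// _mm_deinterleave_ps is the float counterpart: with four lanes per register
// the two-channel case needs only three _mm_unpacklo_ps/_mm_unpackhi_ps
// layers. Usage follows the same pattern (the name `srcf` is illustrative):
//
//     __m128 v_r0 = _mm_loadu_ps(srcf);        // floats  0..3 of interleaved data
//     __m128 v_r1 = _mm_loadu_ps(srcf + 4);    // floats  4..7
//     __m128 v_g0 = _mm_loadu_ps(srcf + 8);    // floats  8..11
//     __m128 v_g1 = _mm_loadu_ps(srcf + 12);   // floats 12..15
//     _mm_deinterleave_ps(v_r0, v_r1, v_g0, v_g1);  // planar: 8 floats per channel
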
inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0,
                              __m128 & v_g1, __m128 & v_b0, __m128 & v_b1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_g1 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
}

inline void _mm_interleave_ps(__m128 & v_r0, __m128 & v_r1, __m128 & v_g0, __m128 & v_g1,
                              __m128 & v_b0, __m128 & v_b1, __m128 & v_a0, __m128 & v_a1)
{
    const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);

    __m128 layer2_chunk0 = _mm_shuffle_ps(v_r0, v_r1, mask_lo);
    __m128 layer2_chunk4 = _mm_shuffle_ps(v_r0, v_r1, mask_hi);
    __m128 layer2_chunk1 = _mm_shuffle_ps(v_g0, v_g1, mask_lo);
    __m128 layer2_chunk5 = _mm_shuffle_ps(v_g0, v_g1, mask_hi);
    __m128 layer2_chunk2 = _mm_shuffle_ps(v_b0, v_b1, mask_lo);
    __m128 layer2_chunk6 = _mm_shuffle_ps(v_b0, v_b1, mask_hi);
    __m128 layer2_chunk3 = _mm_shuffle_ps(v_a0, v_a1, mask_lo);
    __m128 layer2_chunk7 = _mm_shuffle_ps(v_a0, v_a1, mask_hi);

    __m128 layer1_chunk0 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_lo);
    __m128 layer1_chunk4 = _mm_shuffle_ps(layer2_chunk0, layer2_chunk1, mask_hi);
    __m128 layer1_chunk1 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_lo);
    __m128 layer1_chunk5 = _mm_shuffle_ps(layer2_chunk2, layer2_chunk3, mask_hi);
    __m128 layer1_chunk2 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_lo);
    __m128 layer1_chunk6 = _mm_shuffle_ps(layer2_chunk4, layer2_chunk5, mask_hi);
    __m128 layer1_chunk3 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_lo);
    __m128 layer1_chunk7 = _mm_shuffle_ps(layer2_chunk6, layer2_chunk7, mask_hi);

    v_r0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_lo);
    v_b0 = _mm_shuffle_ps(layer1_chunk0, layer1_chunk1, mask_hi);
    v_r1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_lo);
    v_b1 = _mm_shuffle_ps(layer1_chunk2, layer1_chunk3, mask_hi);
    v_g0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_lo);
    v_a0 = _mm_shuffle_ps(layer1_chunk4, layer1_chunk5, mask_hi);
    v_g1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_lo);
    v_a1 = _mm_shuffle_ps(layer1_chunk6, layer1_chunk7, mask_hi);
}
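
// _mm_interleave_ps inverts the float deinterleave with _mm_shuffle_ps:
// mask_lo = _MM_SHUFFLE(2, 0, 2, 0) gathers the even-indexed lanes of a
// register pair and mask_hi = _MM_SHUFFLE(3, 1, 3, 1) the odd-indexed ones,
// which is the inverse of an unpack step. A minimal sketch for the two-channel
// overload (the name `dstf` is illustrative); as with the integer variants,
// the arguments are stored in declaration order:
//
//     _mm_interleave_ps(v_r0, v_r1, v_g0, v_g1);
//     _mm_storeu_ps(dstf,      v_r0);   // c0 c1 c0 c1 ...
//     _mm_storeu_ps(dstf + 4,  v_r1);
//     _mm_storeu_ps(dstf + 8,  v_g0);
//     _mm_storeu_ps(dstf + 12, v_g1);
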
#endif // CV_SSE2

#endif //__OPENCV_CORE_SSE_UTILS_HPP__