1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_ 12 #define VPX_DSP_X86_TRANSPOSE_SSE2_H_ 13 14 #include <emmintrin.h> // SSE2 15 16 #include "./vpx_config.h" 17 18 static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { 19 // Unpack 16 bit elements. Goes from: 20 // in[0]: 00 01 02 03 21 // in[1]: 10 11 12 13 22 // in[2]: 20 21 22 23 23 // in[3]: 30 31 32 33 24 // to: 25 // a0: 00 10 01 11 02 12 03 13 26 // a1: 20 30 21 31 22 32 23 33 27 const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); 28 const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); 29 30 // Unpack 32 bit elements resulting in: 31 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 32 return _mm_unpacklo_epi16(a0, a1); 33 } 34 35 static INLINE void transpose_8bit_8x8(const __m128i *const in, 36 __m128i *const out) { 37 // Unpack 8 bit elements. Goes from: 38 // in[0]: 00 01 02 03 04 05 06 07 39 // in[1]: 10 11 12 13 14 15 16 17 40 // in[2]: 20 21 22 23 24 25 26 27 41 // in[3]: 30 31 32 33 34 35 36 37 42 // in[4]: 40 41 42 43 44 45 46 47 43 // in[5]: 50 51 52 53 54 55 56 57 44 // in[6]: 60 61 62 63 64 65 66 67 45 // in[7]: 70 71 72 73 74 75 76 77 46 // to: 47 // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 48 // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 49 // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 50 // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 51 const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); 52 const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); 53 const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); 54 const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); 55 56 // Unpack 16 bit elements resulting in: 57 // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 58 // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 59 // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 60 // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 61 const __m128i b0 = _mm_unpacklo_epi16(a0, a1); 62 const __m128i b1 = _mm_unpackhi_epi16(a0, a1); 63 const __m128i b2 = _mm_unpacklo_epi16(a2, a3); 64 const __m128i b3 = _mm_unpackhi_epi16(a2, a3); 65 66 // Unpack 32 bit elements resulting in: 67 // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 68 // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 69 // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 70 // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 71 const __m128i c0 = _mm_unpacklo_epi32(b0, b2); 72 const __m128i c1 = _mm_unpackhi_epi32(b0, b2); 73 const __m128i c2 = _mm_unpacklo_epi32(b1, b3); 74 const __m128i c3 = _mm_unpackhi_epi32(b1, b3); 75 76 // Unpack 64 bit elements resulting in: 77 // out[0]: 00 10 20 30 40 50 60 70 78 // out[1]: 01 11 21 31 41 51 61 71 79 // out[2]: 02 12 22 32 42 52 62 72 80 // out[3]: 03 13 23 33 43 53 63 73 81 // out[4]: 04 14 24 34 44 54 64 74 82 // out[5]: 05 15 25 35 45 55 65 75 83 // out[6]: 06 16 26 36 46 56 66 76 84 // out[7]: 07 17 27 37 47 57 67 77 85 out[0] = _mm_unpacklo_epi64(c0, c0); 86 out[1] = _mm_unpackhi_epi64(c0, c0); 87 out[2] = _mm_unpacklo_epi64(c1, c1); 88 out[3] = _mm_unpackhi_epi64(c1, c1); 89 out[4] = _mm_unpacklo_epi64(c2, c2); 90 out[5] = _mm_unpackhi_epi64(c2, c2); 91 out[6] = _mm_unpacklo_epi64(c3, c3); 92 out[7] = _mm_unpackhi_epi64(c3, c3); 93 } 94 95 static INLINE void transpose_16bit_4x4(const __m128i *const in, 96 __m128i *const out) { 97 // Unpack 16 bit elements. Goes from: 98 // in[0]: 00 01 02 03 XX XX XX XX 99 // in[1]: 10 11 12 13 XX XX XX XX 100 // in[2]: 20 21 22 23 XX XX XX XX 101 // in[3]: 30 31 32 33 XX XX XX XX 102 // to: 103 // a0: 00 10 01 11 02 12 03 13 104 // a1: 20 30 21 31 22 32 23 33 105 const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); 106 const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); 107 108 // Unpack 32 bit elements resulting in: 109 // out[0]: 00 10 20 30 01 11 21 31 110 // out[1]: 02 12 22 32 03 13 23 33 111 out[0] = _mm_unpacklo_epi32(a0, a1); 112 out[1] = _mm_unpackhi_epi32(a0, a1); 113 } 114 115 static INLINE void transpose_16bit_4x8(const __m128i *const in, 116 __m128i *const out) { 117 // Unpack 16 bit elements. Goes from: 118 // in[0]: 00 01 02 03 XX XX XX XX 119 // in[1]: 10 11 12 13 XX XX XX XX 120 // in[2]: 20 21 22 23 XX XX XX XX 121 // in[3]: 30 31 32 33 XX XX XX XX 122 // in[4]: 40 41 42 43 XX XX XX XX 123 // in[5]: 50 51 52 53 XX XX XX XX 124 // in[6]: 60 61 62 63 XX XX XX XX 125 // in[7]: 70 71 72 73 XX XX XX XX 126 // to: 127 // a0: 00 10 01 11 02 12 03 13 128 // a1: 20 30 21 31 22 32 23 33 129 // a2: 40 50 41 51 42 52 43 53 130 // a3: 60 70 61 71 62 72 63 73 131 const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); 132 const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); 133 const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); 134 const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); 135 136 // Unpack 32 bit elements resulting in: 137 // b0: 00 10 20 30 01 11 21 31 138 // b1: 40 50 60 70 41 51 61 71 139 // b2: 02 12 22 32 03 13 23 33 140 // b3: 42 52 62 72 43 53 63 73 141 const __m128i b0 = _mm_unpacklo_epi32(a0, a1); 142 const __m128i b1 = _mm_unpacklo_epi32(a2, a3); 143 const __m128i b2 = _mm_unpackhi_epi32(a0, a1); 144 const __m128i b3 = _mm_unpackhi_epi32(a2, a3); 145 146 // Unpack 64 bit elements resulting in: 147 // out[0]: 00 10 20 30 40 50 60 70 148 // out[1]: 01 11 21 31 41 51 61 71 149 // out[2]: 02 12 22 32 42 52 62 72 150 // out[3]: 03 13 23 33 43 53 63 73 151 out[0] = _mm_unpacklo_epi64(b0, b1); 152 out[1] = _mm_unpackhi_epi64(b0, b1); 153 out[2] = _mm_unpacklo_epi64(b2, b3); 154 out[3] = _mm_unpackhi_epi64(b2, b3); 155 } 156 157 static INLINE void transpose_16bit_8x8(const __m128i *const in, 158 __m128i *const out) { 159 // Unpack 16 bit elements. Goes from: 160 // in[0]: 00 01 02 03 04 05 06 07 161 // in[1]: 10 11 12 13 14 15 16 17 162 // in[2]: 20 21 22 23 24 25 26 27 163 // in[3]: 30 31 32 33 34 35 36 37 164 // in[4]: 40 41 42 43 44 45 46 47 165 // in[5]: 50 51 52 53 54 55 56 57 166 // in[6]: 60 61 62 63 64 65 66 67 167 // in[7]: 70 71 72 73 74 75 76 77 168 // to: 169 // a0: 00 10 01 11 02 12 03 13 170 // a1: 20 30 21 31 22 32 23 33 171 // a2: 40 50 41 51 42 52 43 53 172 // a3: 60 70 61 71 62 72 63 73 173 // a4: 04 14 05 15 06 16 07 17 174 // a5: 24 34 25 35 26 36 27 37 175 // a6: 44 54 45 55 46 56 47 57 176 // a7: 64 74 65 75 66 76 67 77 177 const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); 178 const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); 179 const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); 180 const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); 181 const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); 182 const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); 183 const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); 184 const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); 185 186 // Unpack 32 bit elements resulting in: 187 // b0: 00 10 20 30 01 11 21 31 188 // b1: 40 50 60 70 41 51 61 71 189 // b2: 04 14 24 34 05 15 25 35 190 // b3: 44 54 64 74 45 55 65 75 191 // b4: 02 12 22 32 03 13 23 33 192 // b5: 42 52 62 72 43 53 63 73 193 // b6: 06 16 26 36 07 17 27 37 194 // b7: 46 56 66 76 47 57 67 77 195 const __m128i b0 = _mm_unpacklo_epi32(a0, a1); 196 const __m128i b1 = _mm_unpacklo_epi32(a2, a3); 197 const __m128i b2 = _mm_unpacklo_epi32(a4, a5); 198 const __m128i b3 = _mm_unpacklo_epi32(a6, a7); 199 const __m128i b4 = _mm_unpackhi_epi32(a0, a1); 200 const __m128i b5 = _mm_unpackhi_epi32(a2, a3); 201 const __m128i b6 = _mm_unpackhi_epi32(a4, a5); 202 const __m128i b7 = _mm_unpackhi_epi32(a6, a7); 203 204 // Unpack 64 bit elements resulting in: 205 // out[0]: 00 10 20 30 40 50 60 70 206 // out[1]: 01 11 21 31 41 51 61 71 207 // out[2]: 02 12 22 32 42 52 62 72 208 // out[3]: 03 13 23 33 43 53 63 73 209 // out[4]: 04 14 24 34 44 54 64 74 210 // out[5]: 05 15 25 35 45 55 65 75 211 // out[6]: 06 16 26 36 46 56 66 76 212 // out[7]: 07 17 27 37 47 57 67 77 213 out[0] = _mm_unpacklo_epi64(b0, b1); 214 out[1] = _mm_unpackhi_epi64(b0, b1); 215 out[2] = _mm_unpacklo_epi64(b4, b5); 216 out[3] = _mm_unpackhi_epi64(b4, b5); 217 out[4] = _mm_unpacklo_epi64(b2, b3); 218 out[5] = _mm_unpackhi_epi64(b2, b3); 219 out[6] = _mm_unpacklo_epi64(b6, b7); 220 out[7] = _mm_unpackhi_epi64(b6, b7); 221 } 222 223 // Transpose in-place 224 static INLINE void transpose_16bit_16x16(__m128i *const left, 225 __m128i *const right) { 226 __m128i tbuf[8]; 227 transpose_16bit_8x8(left, left); 228 transpose_16bit_8x8(right, tbuf); 229 transpose_16bit_8x8(left + 8, right); 230 transpose_16bit_8x8(right + 8, right + 8); 231 232 left[8] = tbuf[0]; 233 left[9] = tbuf[1]; 234 left[10] = tbuf[2]; 235 left[11] = tbuf[3]; 236 left[12] = tbuf[4]; 237 left[13] = tbuf[5]; 238 left[14] = tbuf[6]; 239 left[15] = tbuf[7]; 240 } 241 242 static INLINE void transpose_32bit_4x4(const __m128i *const in, 243 __m128i *const out) { 244 // Unpack 32 bit elements. Goes from: 245 // in[0]: 00 01 02 03 246 // in[1]: 10 11 12 13 247 // in[2]: 20 21 22 23 248 // in[3]: 30 31 32 33 249 // to: 250 // a0: 00 10 01 11 251 // a1: 20 30 21 31 252 // a2: 02 12 03 13 253 // a3: 22 32 23 33 254 255 const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); 256 const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); 257 const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); 258 const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); 259 260 // Unpack 64 bit elements resulting in: 261 // out[0]: 00 10 20 30 262 // out[1]: 01 11 21 31 263 // out[2]: 02 12 22 32 264 // out[3]: 03 13 23 33 265 out[0] = _mm_unpacklo_epi64(a0, a1); 266 out[1] = _mm_unpackhi_epi64(a0, a1); 267 out[2] = _mm_unpacklo_epi64(a2, a3); 268 out[3] = _mm_unpackhi_epi64(a2, a3); 269 } 270 271 static INLINE void transpose_32bit_4x4x2(const __m128i *const in, 272 __m128i *const out) { 273 // Unpack 32 bit elements. Goes from: 274 // in[0]: 00 01 02 03 275 // in[1]: 10 11 12 13 276 // in[2]: 20 21 22 23 277 // in[3]: 30 31 32 33 278 // in[4]: 04 05 06 07 279 // in[5]: 14 15 16 17 280 // in[6]: 24 25 26 27 281 // in[7]: 34 35 36 37 282 // to: 283 // a0: 00 10 01 11 284 // a1: 20 30 21 31 285 // a2: 02 12 03 13 286 // a3: 22 32 23 33 287 // a4: 04 14 05 15 288 // a5: 24 34 25 35 289 // a6: 06 16 07 17 290 // a7: 26 36 27 37 291 const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); 292 const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); 293 const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); 294 const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); 295 const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); 296 const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); 297 const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); 298 const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); 299 300 // Unpack 64 bit elements resulting in: 301 // out[0]: 00 10 20 30 302 // out[1]: 01 11 21 31 303 // out[2]: 02 12 22 32 304 // out[3]: 03 13 23 33 305 // out[4]: 04 14 24 34 306 // out[5]: 05 15 25 35 307 // out[6]: 06 16 26 36 308 // out[7]: 07 17 27 37 309 out[0] = _mm_unpacklo_epi64(a0, a1); 310 out[1] = _mm_unpackhi_epi64(a0, a1); 311 out[2] = _mm_unpacklo_epi64(a2, a3); 312 out[3] = _mm_unpackhi_epi64(a2, a3); 313 out[4] = _mm_unpacklo_epi64(a4, a5); 314 out[5] = _mm_unpackhi_epi64(a4, a5); 315 out[6] = _mm_unpacklo_epi64(a6, a7); 316 out[7] = _mm_unpackhi_epi64(a6, a7); 317 } 318 319 static INLINE void transpose_32bit_8x4(const __m128i *const in, 320 __m128i *const out) { 321 // Unpack 32 bit elements. Goes from: 322 // in[0]: 00 01 02 03 323 // in[1]: 04 05 06 07 324 // in[2]: 10 11 12 13 325 // in[3]: 14 15 16 17 326 // in[4]: 20 21 22 23 327 // in[5]: 24 25 26 27 328 // in[6]: 30 31 32 33 329 // in[7]: 34 35 36 37 330 // to: 331 // a0: 00 10 01 11 332 // a1: 20 30 21 31 333 // a2: 02 12 03 13 334 // a3: 22 32 23 33 335 // a4: 04 14 05 15 336 // a5: 24 34 25 35 337 // a6: 06 16 07 17 338 // a7: 26 36 27 37 339 const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); 340 const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); 341 const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); 342 const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); 343 const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); 344 const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); 345 const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); 346 const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); 347 348 // Unpack 64 bit elements resulting in: 349 // out[0]: 00 10 20 30 350 // out[1]: 01 11 21 31 351 // out[2]: 02 12 22 32 352 // out[3]: 03 13 23 33 353 // out[4]: 04 14 24 34 354 // out[5]: 05 15 25 35 355 // out[6]: 06 16 26 36 356 // out[7]: 07 17 27 37 357 out[0] = _mm_unpacklo_epi64(a0, a1); 358 out[1] = _mm_unpackhi_epi64(a0, a1); 359 out[2] = _mm_unpacklo_epi64(a2, a3); 360 out[3] = _mm_unpackhi_epi64(a2, a3); 361 out[4] = _mm_unpacklo_epi64(a4, a5); 362 out[5] = _mm_unpackhi_epi64(a4, a5); 363 out[6] = _mm_unpacklo_epi64(a6, a7); 364 out[7] = _mm_unpackhi_epi64(a6, a7); 365 } 366 367 #endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_ 368