/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_ARM_TRANSPOSE_NEON_H_
#define VPX_DSP_ARM_TRANSPOSE_NEON_H_

#include <arm_neon.h>

#include "./vpx_config.h"

// Transpose 64 bit elements as follows:
// a0: 00 01 02 03 04 05 06 07
// a1: 16 17 18 19 20 21 22 23
//
// b0.val[0]: 00 01 02 03 16 17 18 19
// b0.val[1]: 04 05 06 07 20 21 22 23
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t b0;
  b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
                           vreinterpret_s16_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
                           vreinterpret_s16_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  int32x4x2_t b0;
  b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
  b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
  return b0;
}

static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
  int64x2x2_t b0;
  b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)),
                           vreinterpret_s64_s32(vget_low_s32(a1)));
  b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)),
                           vreinterpret_s64_s32(vget_high_s32(a1)));
  return b0;
}

static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
  uint8x16x2_t b0;
  b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)),
                          vreinterpret_u8_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)),
                          vreinterpret_u8_u32(vget_high_u32(a1)));
  return b0;
}

static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t b0;
  b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
                           vreinterpret_u16_u32(vget_low_u32(a1)));
  b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
                           vreinterpret_u16_u32(vget_high_u32(a1)));
  return b0;
}

static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint16x4x2_t b0 =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
                                   vreinterpret_u32_u16(b0.val[1]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

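// Usage sketch for transpose_u8_4x4(). The example_* helper below is purely
// illustrative (it is not part of the original API) and assumes the four
// 4-byte rows may be fetched with 32-bit lane loads.
static INLINE void example_transpose_u8_4x4(const uint8_t *src,
                                            const int src_stride,
                                            uint8x8_t *cols02,
                                            uint8x8_t *cols13) {
  uint32x2_t r01 = vdup_n_u32(0);
  uint32x2_t r23 = vdup_n_u32(0);
  // Each row is 4 bytes, so rows 0-1 pack into one 64-bit register and rows
  // 2-3 into the other, matching the input layout documented above.
  r01 = vld1_lane_u32((const uint32_t *)(src + 0 * src_stride), r01, 0);
  r01 = vld1_lane_u32((const uint32_t *)(src + 1 * src_stride), r01, 1);
  r23 = vld1_lane_u32((const uint32_t *)(src + 2 * src_stride), r23, 0);
  r23 = vld1_lane_u32((const uint32_t *)(src + 3 * src_stride), r23, 1);
  *cols02 = vreinterpret_u8_u32(r01);
  *cols13 = vreinterpret_u8_u32(r23);
  transpose_u8_4x4(cols02, cols13);
  // Per the comments above, cols02 now holds columns 0 and 2, and cols13
  // holds columns 1 and 3.
}
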
static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
  const int16x4x2_t b1 = vtrn_s16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));

  *a0 = vreinterpret_s16_s32(c0.val[0]);
  *a1 = vreinterpret_s16_s32(c1.val[0]);
  *a2 = vreinterpret_s16_s32(c0.val[1]);
  *a3 = vreinterpret_s16_s32(c1.val[1]);
}

static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const int32x4x2_t b0 =
      vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1));

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const int32x4_t c0 =
      vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1]));
  const int32x4_t c1 =
      vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1]));

  // Swap 16 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const int16x8x2_t d0 =
      vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 10 11 12 13
  // a1: 20 21 22 23 30 31 32 33
  // to:
  // b0.val[0]: 00 01 20 21 10 11 30 31
  // b0.val[1]: 02 03 22 23 12 13 32 33

  const uint32x4x2_t b0 =
      vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1));

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 02 03 22 23
  // c0.val[1]: 10 11 30 31 12 13 32 33

  const uint32x4_t c0 =
      vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1]));
  const uint32x4_t c1 =
      vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1]));

  // Swap 16 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 02 12 22 32
  // d0.val[1]: 01 11 21 31 03 13 23 33

  const uint16x8x2_t d0 =
      vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
}

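// Usage sketch for transpose_s16_4x4d(); the example_* helper is hypothetical
// and not part of this header. It transposes a contiguous, row-major 4x4
// block of 16-bit coefficients in place.
static INLINE void example_transpose_s16_4x4_inplace(int16_t *buf) {
  int16x4_t r0 = vld1_s16(buf + 0);
  int16x4_t r1 = vld1_s16(buf + 4);
  int16x4_t r2 = vld1_s16(buf + 8);
  int16x4_t r3 = vld1_s16(buf + 12);
  transpose_s16_4x4d(&r0, &r1, &r2, &r3);
  // r0..r3 are now the columns of the original block.
  vst1_s16(buf + 0, r0);
  vst1_s16(buf + 4, r1);
  vst1_s16(buf + 8, r2);
  vst1_s16(buf + 12, r3);
}
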
static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 XX XX XX XX
  // a1: 10 11 12 13 XX XX XX XX
  // a2: 20 21 22 23 XX XX XX XX
  // a3: 30 31 32 33 XX XX XX XX
  // a4: 40 41 42 43 XX XX XX XX
  // a5: 50 51 52 53 XX XX XX XX
  // a6: 60 61 62 63 XX XX XX XX
  // a7: 70 71 72 73 XX XX XX XX
  // to:
  // b0.val[0]: 00 01 02 03 40 41 42 43
  // b1.val[0]: 10 11 12 13 50 51 52 53
  // b2.val[0]: 20 21 22 23 60 61 62 63
  // b3.val[0]: 30 31 32 33 70 71 72 73

  const uint32x2x2_t b0 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t b1 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t b2 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t b3 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 01 20 21 40 41 60 61
  // c0.val[1]: 02 03 22 23 42 43 62 63
  // c1.val[0]: 10 11 30 31 50 51 70 71
  // c1.val[1]: 12 13 32 33 52 53 72 73

  const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
                                   vreinterpret_u16_u32(b2.val[0]));
  const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
                                   vreinterpret_u16_u32(b3.val[0]));

  // Swap 8 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 01 11 21 31 41 51 61 71
  // d1.val[0]: 02 12 22 32 42 52 62 72
  // d1.val[1]: 03 13 23 33 43 53 63 73

  const uint8x8x2_t d0 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
  const uint8x8x2_t d1 =
      vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));

  *a0 = d0.val[0];
  *a1 = d0.val[1];
  *a2 = d1.val[0];
  *a3 = d1.val[1];
}

static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33

  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);

  *a0 = c0.val[0];
  *a1 = c1.val[0];
  *a2 = c0.val[1];
  *a3 = c1.val[1];
}

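// The same in-place idiom for the 32-bit transpose above; again the
// example_* name is illustrative, not part of the original header.
static INLINE void example_transpose_s32_4x4_inplace(int32_t *buf) {
  int32x4_t r0 = vld1q_s32(buf + 0);
  int32x4_t r1 = vld1q_s32(buf + 4);
  int32x4_t r2 = vld1q_s32(buf + 8);
  int32x4_t r3 = vld1q_s32(buf + 12);
  transpose_s32_4x4(&r0, &r1, &r2, &r3);
  vst1q_s32(buf + 0, r0);
  vst1q_s32(buf + 4, r1);
  vst1q_s32(buf + 8, r2);
  vst1q_s32(buf + 12, r3);
}
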
static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
                                     const int16x4_t a2, const int16x4_t a3,
                                     const int16x4_t a4, const int16x4_t a5,
                                     const int16x4_t a6, const int16x4_t a7,
                                     int16x8_t *const o0, int16x8_t *const o1,
                                     int16x8_t *const o2, int16x8_t *const o3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  const int16x4x2_t b0 = vtrn_s16(a0, a1);
  const int16x4x2_t b1 = vtrn_s16(a2, a3);
  const int16x4x2_t b2 = vtrn_s16(a4, a5);
  const int16x4x2_t b3 = vtrn_s16(a6, a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
                                  vreinterpret_s32_s16(b1.val[0]));
  const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
                                  vreinterpret_s32_s16(b1.val[1]));
  const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]),
                                  vreinterpret_s32_s16(b3.val[0]));
  const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]),
                                  vreinterpret_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // o0: 00 10 20 30 40 50 60 70
  // o1: 01 11 21 31 41 51 61 71
  // o2: 02 12 22 32 42 52 62 72
  // o3: 03 13 23 33 43 53 63 73

  *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]),
                     vreinterpret_s16_s32(c2.val[0]));
  *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]),
                     vreinterpret_s16_s32(c3.val[0]));
  *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]),
                     vreinterpret_s16_s32(c2.val[1]));
  *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]),
                     vreinterpret_s16_s32(c3.val[1]));
}

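// Usage sketch for transpose_s16_4x8(): eight 4-lane rows become four 8-lane
// columns, a shape change that suits a 4-wide first transform pass feeding an
// 8-point second pass. The example_* helper and its contiguous row-major
// input layout are assumptions for illustration.
static INLINE void example_transpose_s16_4x8(
    const int16_t *buf /* 8 rows of 4 */, int16x8_t *c0, int16x8_t *c1,
    int16x8_t *c2, int16x8_t *c3) {
  const int16x4_t r0 = vld1_s16(buf + 0);
  const int16x4_t r1 = vld1_s16(buf + 4);
  const int16x4_t r2 = vld1_s16(buf + 8);
  const int16x4_t r3 = vld1_s16(buf + 12);
  const int16x4_t r4 = vld1_s16(buf + 16);
  const int16x4_t r5 = vld1_s16(buf + 20);
  const int16x4_t r6 = vld1_s16(buf + 24);
  const int16x4_t r7 = vld1_s16(buf + 28);
  transpose_s16_4x8(r0, r1, r2, r3, r4, r5, r6, r7, c0, c1, c2, c3);
}
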
static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 10 11 12 13
  // a2: 20 21 22 23
  // a3: 30 31 32 33
  // a4: 40 41 42 43
  // a5: 50 51 52 53
  // a6: 60 61 62 63
  // a7: 70 71 72 73
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 20 30 22 32
  // b1.val[1]: 21 31 23 33
  // b2.val[0]: 40 50 42 52
  // b2.val[1]: 41 51 43 53
  // b3.val[0]: 60 70 62 72
  // b3.val[1]: 61 71 63 73

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
  const int32x4x2_t b2 = vtrnq_s32(*a4, *a5);
  const int32x4x2_t b3 = vtrnq_s32(*a6, *a7);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 40 50 60 70
  // c2.val[1]: 42 52 62 72
  // c3.val[0]: 41 51 61 71
  // c3.val[1]: 43 53 63 73

  const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]);
  const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]);
  const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]);
  const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]);

  *a0 = vreinterpretq_s32_s64(c0.val[0]);
  *a1 = vreinterpretq_s32_s64(c2.val[0]);
  *a2 = vreinterpretq_s32_s64(c1.val[0]);
  *a3 = vreinterpretq_s32_s64(c3.val[0]);
  *a4 = vreinterpretq_s32_s64(c0.val[1]);
  *a5 = vreinterpretq_s32_s64(c2.val[1]);
  *a6 = vreinterpretq_s32_s64(c1.val[1]);
  *a7 = vreinterpretq_s32_s64(c3.val[1]);
}

static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint16x4x2_t c0 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
  const uint16x4x2_t c1 =
      vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));

  *a0 = vreinterpret_u8_u16(c0.val[0]);
  *a1 = vreinterpret_u8_u16(c1.val[0]);
  *a2 = vreinterpret_u8_u16(c0.val[1]);
  *a3 = vreinterpret_u8_u16(c1.val[1]);
}

static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));

  *a0 = vreinterpretq_u16_u32(c0.val[0]);
  *a1 = vreinterpretq_u16_u32(c1.val[0]);
  *a2 = vreinterpretq_u16_u32(c0.val[1]);
  *a3 = vreinterpretq_u16_u32(c1.val[1]);
}

static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03
  // a1: 04 05 06 07
  // a2: 10 11 12 13
  // a3: 14 15 16 17
  // a4: 20 21 22 23
  // a5: 24 25 26 27
  // a6: 30 31 32 33
  // a7: 34 35 36 37
  // to:
  // b0.val[0]: 00 10 02 12
  // b0.val[1]: 01 11 03 13
  // b1.val[0]: 04 14 06 16
  // b1.val[1]: 05 15 07 17
  // b2.val[0]: 20 30 22 32
  // b2.val[1]: 21 31 23 33
  // b3.val[0]: 24 34 26 36
  // b3.val[1]: 25 35 27 37

  const int32x4x2_t b0 = vtrnq_s32(*a0, *a2);
  const int32x4x2_t b1 = vtrnq_s32(*a1, *a3);
  const int32x4x2_t b2 = vtrnq_s32(*a4, *a6);
  const int32x4x2_t b3 = vtrnq_s32(*a5, *a7);

  // Swap 64 bit elements resulting in:
  // c0.val[0]: 00 10 20 30
  // c0.val[1]: 02 12 22 32
  // c1.val[0]: 01 11 21 31
  // c1.val[1]: 03 13 23 33
  // c2.val[0]: 04 14 24 34
  // c2.val[1]: 06 16 26 36
  // c3.val[0]: 05 15 25 35
  // c3.val[1]: 07 17 27 37

  const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]);
  const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]);
  const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]);
  const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]);

  *a0 = vreinterpretq_s32_s64(c0.val[0]);
  *a1 = vreinterpretq_s32_s64(c1.val[0]);
  *a2 = vreinterpretq_s32_s64(c0.val[1]);
  *a3 = vreinterpretq_s32_s64(c1.val[1]);
  *a4 = vreinterpretq_s32_s64(c2.val[0]);
  *a5 = vreinterpretq_s32_s64(c3.val[0]);
  *a6 = vreinterpretq_s32_s64(c2.val[1]);
  *a7 = vreinterpretq_s32_s64(c3.val[1]);
}

// Note: Using 'd' registers or 'q' registers has almost identical speed. We
// use 'q' registers here to save some instructions.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Swap 8 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
  // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
  // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
  // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77

  const uint8x16x2_t b0 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t b1 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
  // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
  // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
  // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77

  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));

  // Unzip 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c1.val[0]));
  const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c1.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}

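// Usage sketch for transpose_u8_8x8(): copy an 8x8 pixel block while
// transposing it. The example_* helper is illustrative only; the
// load_and_transpose_u8_8x8() and transpose_and_store_u8_8x8() helpers near
// the end of this file wrap the same load/store pattern.
static INLINE void example_copy_transpose_u8_8x8(const uint8_t *src,
                                                 const int src_stride,
                                                 uint8_t *dst,
                                                 const int dst_stride) {
  uint8x8_t r0 = vld1_u8(src + 0 * src_stride);
  uint8x8_t r1 = vld1_u8(src + 1 * src_stride);
  uint8x8_t r2 = vld1_u8(src + 2 * src_stride);
  uint8x8_t r3 = vld1_u8(src + 3 * src_stride);
  uint8x8_t r4 = vld1_u8(src + 4 * src_stride);
  uint8x8_t r5 = vld1_u8(src + 5 * src_stride);
  uint8x8_t r6 = vld1_u8(src + 6 * src_stride);
  uint8x8_t r7 = vld1_u8(src + 7 * src_stride);
  transpose_u8_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
  vst1_u8(dst + 0 * dst_stride, r0);
  vst1_u8(dst + 1 * dst_stride, r1);
  vst1_u8(dst + 2 * dst_stride, r2);
  vst1_u8(dst + 3 * dst_stride, r3);
  vst1_u8(dst + 4 * dst_stride, r4);
  vst1_u8(dst + 5 * dst_stride, r5);
  vst1_u8(dst + 6 * dst_stride, r6);
  vst1_u8(dst + 7 * dst_stride, r7);
}
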
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

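// Usage sketch for transpose_s16_8x8(), the usual step between the row and
// column passes of an 8x8 transform. The example_* helper is illustrative and
// assumes a contiguous, row-major 8x8 coefficient block.
static INLINE void example_transpose_s16_8x8_inplace(int16_t *coeffs) {
  int16x8_t r0 = vld1q_s16(coeffs + 0 * 8);
  int16x8_t r1 = vld1q_s16(coeffs + 1 * 8);
  int16x8_t r2 = vld1q_s16(coeffs + 2 * 8);
  int16x8_t r3 = vld1q_s16(coeffs + 3 * 8);
  int16x8_t r4 = vld1q_s16(coeffs + 4 * 8);
  int16x8_t r5 = vld1q_s16(coeffs + 5 * 8);
  int16x8_t r6 = vld1q_s16(coeffs + 6 * 8);
  int16x8_t r7 = vld1q_s16(coeffs + 7 * 8);
  transpose_s16_8x8(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7);
  vst1q_s16(coeffs + 0 * 8, r0);
  vst1q_s16(coeffs + 1 * 8, r1);
  vst1q_s16(coeffs + 2 * 8, r2);
  vst1q_s16(coeffs + 3 * 8, r3);
  vst1q_s16(coeffs + 4 * 8, r4);
  vst1q_s16(coeffs + 5 * 8, r5);
  vst1q_s16(coeffs + 6 * 8, r6);
  vst1q_s16(coeffs + 7 * 8, r7);
}
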
static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
                                    vreinterpretq_u32_u16(b1.val[0]));
  const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
                                    vreinterpretq_u32_u16(b1.val[1]));
  const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
                                    vreinterpretq_u32_u16(b3.val[0]));
  const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
                                    vreinterpretq_u32_u16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77
  const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
  const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
  const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
  const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);

  *a0 = d0.val[0];
  *a1 = d1.val[0];
  *a2 = d2.val[0];
  *a3 = d3.val[0];
  *a4 = d0.val[1];
  *a5 = d1.val[1];
  *a6 = d2.val[1];
  *a7 = d3.val[1];
}

static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
                                     int32x4x2_t *a2, int32x4x2_t *a3,
                                     int32x4x2_t *a4, int32x4x2_t *a5,
                                     int32x4x2_t *a6, int32x4x2_t *a7) {
  // Swap 32 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0: 00 10 02 12 01 11 03 13
  // b1: 20 30 22 32 21 31 23 33
  // b2: 40 50 42 52 41 51 43 53
  // b3: 60 70 62 72 61 71 63 73
  // b4: 04 14 06 16 05 15 07 17
  // b5: 24 34 26 36 25 35 27 37
  // b6: 44 54 46 56 45 55 47 57
  // b7: 64 74 66 76 65 75 67 77

  const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]);
  const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]);
  const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]);
  const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]);
  const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]);
  const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]);
  const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]);
  const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]);

  // Swap 64 bit elements resulting in:
  // c0: 00 10 20 30 02 12 22 32
  // c1: 01 11 21 31 03 13 23 33
  // c2: 40 50 60 70 42 52 62 72
  // c3: 41 51 61 71 43 53 63 73
  // c4: 04 14 24 34 06 16 26 36
  // c5: 05 15 25 35 07 17 27 37
  // c6: 44 54 64 74 46 56 66 76
  // c7: 45 55 65 75 47 57 67 77
  const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
  const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
  const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]);
  const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]);
  const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]);
  const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]);
  const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]);
  const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]);

  // Swap 128 bit elements resulting in:
  // a0: 00 10 20 30 40 50 60 70
  // a1: 01 11 21 31 41 51 61 71
  // a2: 02 12 22 32 42 52 62 72
  // a3: 03 13 23 33 43 53 63 73
  // a4: 04 14 24 34 44 54 64 74
  // a5: 05 15 25 35 45 55 65 75
  // a6: 06 16 26 36 46 56 66 76
  // a7: 07 17 27 37 47 57 67 77
  a0->val[0] = c0.val[0];
  a0->val[1] = c2.val[0];
  a1->val[0] = c1.val[0];
  a1->val[1] = c3.val[0];
  a2->val[0] = c0.val[1];
  a2->val[1] = c2.val[1];
  a3->val[0] = c1.val[1];
  a3->val[1] = c3.val[1];
  a4->val[0] = c4.val[0];
  a4->val[1] = c6.val[0];
  a5->val[0] = c5.val[0];
  a5->val[1] = c7.val[0];
  a6->val[0] = c4.val[1];
  a6->val[1] = c6.val[1];
  a7->val[0] = c5.val[1];
  a7->val[1] = c7.val[1];
}

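// For the 8x8 32-bit transpose above, each row is an int32x4x2_t pair: lanes
// 0-3 in val[0] and lanes 4-7 in val[1]. The hypothetical example_* loader
// below builds one such row from contiguous memory, the same pattern
// load_and_transpose_s32_8x8() uses at the end of this file.
static INLINE int32x4x2_t example_load_s32_8(const int32_t *a) {
  int32x4x2_t v;
  v.val[0] = vld1q_s32(a);
  v.val[1] = vld1q_s32(a + 4);
  return v;
}
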
static INLINE void transpose_u8_16x8(
    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
    const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11,
    uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) {
  // Swap 8 bit elements. Goes from:
  // i0: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
  // i1: 10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
  // i2: 20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
  // i3: 30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
  // i4: 40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
  // i5: 50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
  // i6: 60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
  // i7: 70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
  // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
  // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
  // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
  // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
  const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
  const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
  const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
  const uint8x16x2_t b3 = vtrnq_u8(i6, i7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
  // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
  // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
  // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
  // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
  // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
  // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
  // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
  // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
  // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
  // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
  // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
  // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
  // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
  // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));

  // Output:
  // o0 : 00 10 20 30 40 50 60 70
  // o1 : 01 11 21 31 41 51 61 71
  // o2 : 02 12 22 32 42 52 62 72
  // o3 : 03 13 23 33 43 53 63 73
  // o4 : 04 14 24 34 44 54 64 74
  // o5 : 05 15 25 35 45 55 65 75
  // o6 : 06 16 26 36 46 56 66 76
  // o7 : 07 17 27 37 47 57 67 77
  // o8 : 08 18 28 38 48 58 68 78
  // o9 : 09 19 29 39 49 59 69 79
  // o10: 0A 1A 2A 3A 4A 5A 6A 7A
  // o11: 0B 1B 2B 3B 4B 5B 6B 7B
  // o12: 0C 1C 2C 3C 4C 5C 6C 7C
  // o13: 0D 1D 2D 3D 4D 5D 6D 7D
  // o14: 0E 1E 2E 3E 4E 5E 6E 7E
  // o15: 0F 1F 2F 3F 4F 5F 6F 7F
  *o0 = vget_low_u8(vreinterpretq_u8_u32(d0.val[0]));
  *o1 = vget_low_u8(vreinterpretq_u8_u32(d2.val[0]));
  *o2 = vget_low_u8(vreinterpretq_u8_u32(d1.val[0]));
  *o3 = vget_low_u8(vreinterpretq_u8_u32(d3.val[0]));
  *o4 = vget_low_u8(vreinterpretq_u8_u32(d0.val[1]));
  *o5 = vget_low_u8(vreinterpretq_u8_u32(d2.val[1]));
  *o6 = vget_low_u8(vreinterpretq_u8_u32(d1.val[1]));
  *o7 = vget_low_u8(vreinterpretq_u8_u32(d3.val[1]));
  *o8 = vget_high_u8(vreinterpretq_u8_u32(d0.val[0]));
  *o9 = vget_high_u8(vreinterpretq_u8_u32(d2.val[0]));
  *o10 = vget_high_u8(vreinterpretq_u8_u32(d1.val[0]));
  *o11 = vget_high_u8(vreinterpretq_u8_u32(d3.val[0]));
  *o12 = vget_high_u8(vreinterpretq_u8_u32(d0.val[1]));
  *o13 = vget_high_u8(vreinterpretq_u8_u32(d2.val[1]));
  *o14 = vget_high_u8(vreinterpretq_u8_u32(d1.val[1]));
  *o15 = vget_high_u8(vreinterpretq_u8_u32(d3.val[1]));
}

static INLINE void transpose_u8_8x16(
    const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2,
    const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5,
    const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8,
    const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11,
    const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14,
    const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
    uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
    uint8x16_t *o7) {
  // Combine 8 bit elements. Goes from:
  // i0 : 00 01 02 03 04 05 06 07
  // i1 : 10 11 12 13 14 15 16 17
  // i2 : 20 21 22 23 24 25 26 27
  // i3 : 30 31 32 33 34 35 36 37
  // i4 : 40 41 42 43 44 45 46 47
  // i5 : 50 51 52 53 54 55 56 57
  // i6 : 60 61 62 63 64 65 66 67
  // i7 : 70 71 72 73 74 75 76 77
  // i8 : 80 81 82 83 84 85 86 87
  // i9 : 90 91 92 93 94 95 96 97
  // i10: A0 A1 A2 A3 A4 A5 A6 A7
  // i11: B0 B1 B2 B3 B4 B5 B6 B7
  // i12: C0 C1 C2 C3 C4 C5 C6 C7
  // i13: D0 D1 D2 D3 D4 D5 D6 D7
  // i14: E0 E1 E2 E3 E4 E5 E6 E7
  // i15: F0 F1 F2 F3 F4 F5 F6 F7
  // to:
  // a0: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
  // a1: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
  // a2: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7
  // a3: 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7
  // a4: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7
  // a5: 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7
  // a6: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7
  // a7: 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7
  const uint8x16_t a0 = vcombine_u8(i0, i8);
  const uint8x16_t a1 = vcombine_u8(i1, i9);
  const uint8x16_t a2 = vcombine_u8(i2, i10);
  const uint8x16_t a3 = vcombine_u8(i3, i11);
  const uint8x16_t a4 = vcombine_u8(i4, i12);
  const uint8x16_t a5 = vcombine_u8(i5, i13);
  const uint8x16_t a6 = vcombine_u8(i6, i14);
  const uint8x16_t a7 = vcombine_u8(i7, i15);

  // Swap 8 bit elements resulting in:
  // b0.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
  // b0.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
  // b1.val[0]: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
  // b1.val[1]: 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
  // b2.val[0]: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
  // b2.val[1]: 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
  // b3.val[0]: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
  // b3.val[1]: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
  const uint8x16x2_t b0 = vtrnq_u8(a0, a1);
  const uint8x16x2_t b1 = vtrnq_u8(a2, a3);
  const uint8x16x2_t b2 = vtrnq_u8(a4, a5);
  const uint8x16x2_t b3 = vtrnq_u8(a6, a7);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
  // c0.val[1]: 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
  // c1.val[0]: 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
  // c1.val[1]: 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
  // c2.val[0]: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
  // c2.val[1]: 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
  // c3.val[0]: 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
  // c3.val[1]: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // d0.val[1]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // d1.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // d1.val[1]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // d2.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // d2.val[1]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // d3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // d3.val[1]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));

  // Output:
  // o0: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // o1: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // o2: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // o3: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // o4: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // o5: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // o6: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // o7: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  *o0 = vreinterpretq_u8_u32(d0.val[0]);
  *o1 = vreinterpretq_u8_u32(d2.val[0]);
  *o2 = vreinterpretq_u8_u32(d1.val[0]);
  *o3 = vreinterpretq_u8_u32(d3.val[0]);
  *o4 = vreinterpretq_u8_u32(d0.val[1]);
  *o5 = vreinterpretq_u8_u32(d2.val[1]);
  *o6 = vreinterpretq_u8_u32(d1.val[1]);
  *o7 = vreinterpretq_u8_u32(d3.val[1]);
}

static INLINE void transpose_u8_16x16(
    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
    const uint8x16_t i6, const uint8x16_t i7, const uint8x16_t i8,
    const uint8x16_t i9, const uint8x16_t i10, const uint8x16_t i11,
    const uint8x16_t i12, const uint8x16_t i13, const uint8x16_t i14,
    const uint8x16_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
    uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
    uint8x16_t *o7, uint8x16_t *o8, uint8x16_t *o9, uint8x16_t *o10,
    uint8x16_t *o11, uint8x16_t *o12, uint8x16_t *o13, uint8x16_t *o14,
    uint8x16_t *o15) {
  // Swap 8 bit elements. Goes from:
  // i0:  00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
  // i1:  10 11 12 13 14 15 16 17 18 19 1A 1B 1C 1D 1E 1F
  // i2:  20 21 22 23 24 25 26 27 28 29 2A 2B 2C 2D 2E 2F
  // i3:  30 31 32 33 34 35 36 37 38 39 3A 3B 3C 3D 3E 3F
  // i4:  40 41 42 43 44 45 46 47 48 49 4A 4B 4C 4D 4E 4F
  // i5:  50 51 52 53 54 55 56 57 58 59 5A 5B 5C 5D 5E 5F
  // i6:  60 61 62 63 64 65 66 67 68 69 6A 6B 6C 6D 6E 6F
  // i7:  70 71 72 73 74 75 76 77 78 79 7A 7B 7C 7D 7E 7F
  // i8:  80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F
  // i9:  90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F
  // i10: A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF
  // i11: B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF
  // i12: C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF
  // i13: D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF
  // i14: E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF
  // i15: F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
  // b2.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
  // b2.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
  // b3.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
  // b3.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
  // b4.val[0]: 80 90 82 92 84 94 86 96 88 98 8A 9A 8C 9C 8E 9E
  // b4.val[1]: 81 91 83 93 85 95 87 97 89 99 8B 9B 8D 9D 8F 9F
  // b5.val[0]: A0 B0 A2 B2 A4 B4 A6 B6 A8 B8 AA BA AC BC AE BE
  // b5.val[1]: A1 B1 A3 B3 A5 B5 A7 B7 A9 B9 AB BB AD BD AF BF
  // b6.val[0]: C0 D0 C2 D2 C4 D4 C6 D6 C8 D8 CA DA CC DC CE DE
  // b6.val[1]: C1 D1 C3 D3 C5 D5 C7 D7 C9 D9 CB DB CD DD CF DF
  // b7.val[0]: E0 F0 E2 F2 E4 F4 E6 F6 E8 F8 EA FA EC FC EE FE
  // b7.val[1]: E1 F1 E3 F3 E5 F5 E7 F7 E9 F9 EB FB ED FD EF FF
  const uint8x16x2_t b0 = vtrnq_u8(i0, i1);
  const uint8x16x2_t b1 = vtrnq_u8(i2, i3);
  const uint8x16x2_t b2 = vtrnq_u8(i4, i5);
  const uint8x16x2_t b3 = vtrnq_u8(i6, i7);
  const uint8x16x2_t b4 = vtrnq_u8(i8, i9);
  const uint8x16x2_t b5 = vtrnq_u8(i10, i11);
  const uint8x16x2_t b6 = vtrnq_u8(i12, i13);
  const uint8x16x2_t b7 = vtrnq_u8(i14, i15);

  // Swap 16 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
  // c0.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
  // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
  // c1.val[1]: 03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
  // c2.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
  // c2.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
  // c3.val[0]: 41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
  // c3.val[1]: 43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
  // c4.val[0]: 80 90 A0 B0 84 94 A4 B4 88 98 A8 B8 8C 9C AC BC
  // c4.val[1]: 82 92 A2 B2 86 96 A6 B6 8A 9A AA BA 8E 9E AE BE
  // c5.val[0]: 81 91 A1 B1 85 95 A5 B5 89 99 A9 B9 8D 9D AD BD
  // c5.val[1]: 83 93 A3 B3 87 97 A7 B7 8B 9B AB BB 8F 9F AF BF
  // c6.val[0]: C0 D0 E0 F0 C4 D4 E4 F4 C8 D8 E8 F8 CC DC EC FC
  // c6.val[1]: C2 D2 E2 F2 C6 D6 E6 F6 CA DA EA FA CE DE EE FE
  // c7.val[0]: C1 D1 E1 F1 C5 D5 E5 F5 C9 D9 E9 F9 CD DD ED FD
  // c7.val[1]: C3 D3 E3 F3 C7 D7 E7 F7 CB DB EB FB CF DF EF FF
  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
                                    vreinterpretq_u16_u8(b1.val[0]));
  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
                                    vreinterpretq_u16_u8(b1.val[1]));
  const uint16x8x2_t c2 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[0]),
                                    vreinterpretq_u16_u8(b3.val[0]));
  const uint16x8x2_t c3 = vtrnq_u16(vreinterpretq_u16_u8(b2.val[1]),
                                    vreinterpretq_u16_u8(b3.val[1]));
  const uint16x8x2_t c4 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[0]),
                                    vreinterpretq_u16_u8(b5.val[0]));
  const uint16x8x2_t c5 = vtrnq_u16(vreinterpretq_u16_u8(b4.val[1]),
                                    vreinterpretq_u16_u8(b5.val[1]));
  const uint16x8x2_t c6 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[0]),
                                    vreinterpretq_u16_u8(b7.val[0]));
  const uint16x8x2_t c7 = vtrnq_u16(vreinterpretq_u16_u8(b6.val[1]),
                                    vreinterpretq_u16_u8(b7.val[1]));

  // Swap 32 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
  // d0.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
  // d1.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
  // d1.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
  // d2.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
  // d2.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
  // d3.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
  // d3.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
  // d4.val[0]: 80 90 A0 B0 C0 D0 E0 F0 88 98 A8 B8 C8 D8 E8 F8
  // d4.val[1]: 84 94 A4 B4 C4 D4 E4 F4 8C 9C AC BC CC DC EC FC
  // d5.val[0]: 82 92 A2 B2 C2 D2 E2 F2 8A 9A AA BA CA DA EA FA
  // d5.val[1]: 86 96 A6 B6 C6 D6 E6 F6 8E 9E AE BE CE DE EE FE
  // d6.val[0]: 81 91 A1 B1 C1 D1 E1 F1 89 99 A9 B9 C9 D9 E9 F9
  // d6.val[1]: 85 95 A5 B5 C5 D5 E5 F5 8D 9D AD BD CD DD ED FD
  // d7.val[0]: 83 93 A3 B3 C3 D3 E3 F3 8B 9B AB BB CB DB EB FB
  // d7.val[1]: 87 97 A7 B7 C7 D7 E7 F7 8F 9F AF BF CF DF EF FF
  const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
                                    vreinterpretq_u32_u16(c2.val[0]));
  const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
                                    vreinterpretq_u32_u16(c2.val[1]));
  const uint32x4x2_t d2 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[0]),
                                    vreinterpretq_u32_u16(c3.val[0]));
  const uint32x4x2_t d3 = vtrnq_u32(vreinterpretq_u32_u16(c1.val[1]),
                                    vreinterpretq_u32_u16(c3.val[1]));
  const uint32x4x2_t d4 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[0]),
                                    vreinterpretq_u32_u16(c6.val[0]));
  const uint32x4x2_t d5 = vtrnq_u32(vreinterpretq_u32_u16(c4.val[1]),
                                    vreinterpretq_u32_u16(c6.val[1]));
  const uint32x4x2_t d6 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[0]),
                                    vreinterpretq_u32_u16(c7.val[0]));
  const uint32x4x2_t d7 = vtrnq_u32(vreinterpretq_u32_u16(c5.val[1]),
                                    vreinterpretq_u32_u16(c7.val[1]));

  // Swap 64 bit elements resulting in:
  // e0.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // e0.val[1]: 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
  // e1.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // e1.val[1]: 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
  // e2.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // e2.val[1]: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
  // e3.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // e3.val[1]: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
  // e4.val[0]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // e4.val[1]: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
  // e5.val[0]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // e5.val[1]: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
  // e6.val[0]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
  // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
  const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]);
  const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]);
  const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]);
  const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]);
  const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]);
  const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]);
  const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]);
  const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]);

  // Output:
  // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // o1 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // o2 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // o3 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // o4 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // o5 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // o6 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // o7 : 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  // o8 : 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
  // o9 : 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
  // o10: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
  // o11: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
  // o12: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
  // o13: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
  // o14: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
  // o15: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
  *o0 = e0.val[0];
  *o1 = e1.val[0];
  *o2 = e2.val[0];
  *o3 = e3.val[0];
  *o4 = e4.val[0];
  *o5 = e5.val[0];
  *o6 = e6.val[0];
  *o7 = e7.val[0];
  *o8 = e0.val[1];
  *o9 = e1.val[1];
  *o10 = e2.val[1];
  *o11 = e3.val[1];
  *o12 = e4.val[1];
  *o13 = e5.val[1];
  *o14 = e6.val[1];
  *o15 = e7.val[1];
}

static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3) {
  uint8x8_t a4, a5, a6, a7;
  *a0 = vld1_u8(a);
  a += a_stride;
  *a1 = vld1_u8(a);
  a += a_stride;
  *a2 = vld1_u8(a);
  a += a_stride;
  *a3 = vld1_u8(a);
  a += a_stride;
  a4 = vld1_u8(a);
  a += a_stride;
  a5 = vld1_u8(a);
  a += a_stride;
  a6 = vld1_u8(a);
  a += a_stride;
  a7 = vld1_u8(a);

  transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

static INLINE void load_and_transpose_u8_8x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3, uint8x8_t *a4,
                                             uint8x8_t *a5, uint8x8_t *a6,
                                             uint8x8_t *a7) {
  *a0 = vld1_u8(a);
  a += a_stride;
  *a1 = vld1_u8(a);
  a += a_stride;
  *a2 = vld1_u8(a);
  a += a_stride;
  *a3 = vld1_u8(a);
  a += a_stride;
  *a4 = vld1_u8(a);
  a += a_stride;
  *a5 = vld1_u8(a);
  a += a_stride;
  *a6 = vld1_u8(a);
  a += a_stride;
  *a7 = vld1_u8(a);

  transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

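// Usage sketch for load_and_transpose_u8_8x8(): after the call, register n
// holds column n of the source block. The example_* helper below keeps only
// column 0, i.e. bytes a[0], a[a_stride], ..., a[7 * a_stride]; it is
// illustrative and not part of the original API.
static INLINE uint8x8_t example_first_column_u8_8x8(const uint8_t *a,
                                                    const int a_stride) {
  uint8x8_t c0, c1, c2, c3, c4, c5, c6, c7;
  load_and_transpose_u8_8x8(a, a_stride, &c0, &c1, &c2, &c3, &c4, &c5, &c6,
                            &c7);
  // Only column 0 is needed here; silence unused-variable warnings.
  (void)c1;
  (void)c2;
  (void)c3;
  (void)c4;
  (void)c5;
  (void)c6;
  (void)c7;
  return c0;
}
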
static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride,
                                              uint8x8_t a0, uint8x8_t a1,
                                              uint8x8_t a2, uint8x8_t a3,
                                              uint8x8_t a4, uint8x8_t a5,
                                              uint8x8_t a6, uint8x8_t a7) {
  transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  vst1_u8(a, a0);
  a += a_stride;
  vst1_u8(a, a1);
  a += a_stride;
  vst1_u8(a, a2);
  a += a_stride;
  vst1_u8(a, a3);
  a += a_stride;
  vst1_u8(a, a4);
  a += a_stride;
  vst1_u8(a, a5);
  a += a_stride;
  vst1_u8(a, a6);
  a += a_stride;
  vst1_u8(a, a7);
}

static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
                                              const int a_stride, int16x8_t *a0,
                                              int16x8_t *a1, int16x8_t *a2,
                                              int16x8_t *a3, int16x8_t *a4,
                                              int16x8_t *a5, int16x8_t *a6,
                                              int16x8_t *a7) {
  *a0 = vld1q_s16(a);
  a += a_stride;
  *a1 = vld1q_s16(a);
  a += a_stride;
  *a2 = vld1q_s16(a);
  a += a_stride;
  *a3 = vld1q_s16(a);
  a += a_stride;
  *a4 = vld1q_s16(a);
  a += a_stride;
  *a5 = vld1q_s16(a);
  a += a_stride;
  *a6 = vld1q_s16(a);
  a += a_stride;
  *a7 = vld1q_s16(a);

  transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

static INLINE void load_and_transpose_s32_8x8(
    const int32_t *a, const int a_stride, int32x4x2_t *const a0,
    int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3,
    int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6,
    int32x4x2_t *const a7) {
  a0->val[0] = vld1q_s32(a);
  a0->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a1->val[0] = vld1q_s32(a);
  a1->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a2->val[0] = vld1q_s32(a);
  a2->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a3->val[0] = vld1q_s32(a);
  a3->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a4->val[0] = vld1q_s32(a);
  a4->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a5->val[0] = vld1q_s32(a);
  a5->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a6->val[0] = vld1q_s32(a);
  a6->val[1] = vld1q_s32(a + 4);
  a += a_stride;
  a7->val[0] = vld1q_s32(a);
  a7->val[1] = vld1q_s32(a + 4);

  transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}

#endif  // VPX_DSP_ARM_TRANSPOSE_NEON_H_