/*
 * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
#define AOM_AV1_COMMON_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>

static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                     const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define load_u8_4x1(s, s0, lane)                                           \
  do {                                                                     \
    *(s0) = vreinterpret_u8_u32(                                           \
        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
  } while (0)

static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}
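/* Usage sketch (illustrative, not part of the original header): gather four
   rows of eight pixels with load_u8_8x4() and sum them vertically, widening
   to 16 bits to avoid overflow. The function name sum_4rows_example is
   hypothetical. */
static INLINE uint16x8_t sum_4rows_example(const uint8_t *src,
                                           ptrdiff_t stride) {
  uint8x8_t r0, r1, r2, r3;
  load_u8_8x4(src, stride, &r0, &r1, &r2, &r3);
  /* Widening add of the first two rows, then accumulate the rest. */
  uint16x8_t sum = vaddl_u8(r0, r1);
  sum = vaddw_u8(sum, r2);
  sum = vaddw_u8(sum, r3);
  return sum;
}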
/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_u8_4x1(s, s0, lane)                                  \
  do {                                                             \
    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
  } while (0)

static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}
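/* Usage sketch (illustrative, not part of the original header): copy an 8x8
   block of pixels by pairing load_u8_8x8() with store_u8_8x8(). The function
   name copy_8x8_example is hypothetical. */
static INLINE void copy_8x8_example(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride) {
  uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7;
  load_u8_8x8(src, src_stride, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7);
  store_u8_8x8(dst, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
}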
static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}

static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1,
                                         uint32x2_t *tu2, uint32x2_t *tu3) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu3 = vset_lane_u32(a, *tu3, 0);
  memcpy(&a, buf, 4);
  *tu3 = vset_lane_u32(a, *tu3, 1);
}

static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 0);
  memcpy(&a, buf, 4);
  *tu1 = vset_lane_u32(a, *tu1, 1);
}

static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
}

static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
}
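/* Usage sketch (illustrative, not part of the original header): average the
   rows of a possibly unaligned 4x4 block. The destination vectors are
   zero-initialised first because load_unaligned_u8_4x4() only overwrites
   them lane by lane. The function name avg_4x4_rows_example is
   hypothetical. */
static INLINE uint8x8_t avg_4x4_rows_example(const uint8_t *src, int stride) {
  uint32x2_t t0 = vdup_n_u32(0);
  uint32x2_t t1 = vdup_n_u32(0);
  load_unaligned_u8_4x4(src, stride, &t0, &t1);
  /* Rows 0/1 sit in t0 and rows 2/3 in t1; take the rounding average. */
  return vrhadd_u8(vreinterpret_u8_u32(t0), vreinterpret_u8_u32(t1));
}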
/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_unaligned_u8_4x1(dst, src, lane)         \
  do {                                                 \
    uint32_t a;                                        \
    a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                \
  } while (0)

static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
                                         uint16x4_t *tu0) {
  uint16_t a;

  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vset_lane_u16(a, *tu0, 0);
  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vset_lane_u16(a, *tu0, 1);
}

static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint64x2_t *tu0, uint64x2_t *tu1) {
  uint64_t a;

  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 1);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu1 = vsetq_lane_u64(a, *tu1, 0);
  memcpy(&a, buf, 8);
  *tu1 = vsetq_lane_u64(a, *tu1, 1);
}

static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

#endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_