/*
 *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
#define AOM_AV1_COMMON_ARM_MEM_NEON_H_

#include <arm_neon.h>
#include <string.h>

static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
                                     const uint8x8_t s1) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define load_u8_4x1(s, s0, lane)                                           \
  do {                                                                     \
    *(s0) = vreinterpret_u8_u32(                                           \
        vld1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(*(s0)), lane)); \
  } while (0)
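
/* Usage sketch for load_u8_4x1() (illustrative only; 'src' is a hypothetical
   byte buffer with at least 4 readable bytes per call). The lane argument
   must be a literal 0 or 1, since vld1_lane_u32() requires a compile-time
   constant:

     uint8x8_t d = vdup_n_u8(0);
     load_u8_4x1(src, &d, 0);      // src[0..3] -> lanes 0..3 of d
     load_u8_4x1(src + 4, &d, 1);  // src[4..7] -> lanes 4..7 of d
*/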

static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3,
                               uint8x8_t *const s4, uint8x8_t *const s5,
                               uint8x8_t *const s6, uint8x8_t *const s7) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
  s += p;
  *s4 = vld1_u8(s);
  s += p;
  *s5 = vld1_u8(s);
  s += p;
  *s6 = vld1_u8(s);
  s += p;
  *s7 = vld1_u8(s);
}

static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
                               uint8x8_t *const s0, uint8x8_t *const s1,
                               uint8x8_t *const s2, uint8x8_t *const s3) {
  *s0 = vld1_u8(s);
  s += p;
  *s1 = vld1_u8(s);
  s += p;
  *s2 = vld1_u8(s);
  s += p;
  *s3 = vld1_u8(s);
}

static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x4_t *const s0, uint16x4_t *const s1,
                                uint16x4_t *const s2, uint16x4_t *const s3) {
  *s0 = vld1_u16(s);
  s += p;
  *s1 = vld1_u16(s);
  s += p;
  *s2 = vld1_u16(s);
  s += p;
  *s3 = vld1_u16(s);
  s += p;
}

static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
                                uint16x8_t *const s0, uint16x8_t *const s1,
                                uint16x8_t *const s2, uint16x8_t *const s3) {
  *s0 = vld1q_u16(s);
  s += p;
  *s1 = vld1q_u16(s);
  s += p;
  *s2 = vld1q_u16(s);
  s += p;
  *s3 = vld1q_u16(s);
  s += p;
}

static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3,
                                int16x4_t *const s4, int16x4_t *const s5,
                                int16x4_t *const s6, int16x4_t *const s7) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
  s += p;
  *s4 = vld1_s16(s);
  s += p;
  *s5 = vld1_s16(s);
  s += p;
  *s6 = vld1_s16(s);
  s += p;
  *s7 = vld1_s16(s);
}

static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
                                int16x4_t *const s0, int16x4_t *const s1,
                                int16x4_t *const s2, int16x4_t *const s3) {
  *s0 = vld1_s16(s);
  s += p;
  *s1 = vld1_s16(s);
  s += p;
  *s2 = vld1_s16(s);
  s += p;
  *s3 = vld1_s16(s);
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_u8_4x1(s, s0, lane)                                  \
  do {                                                             \
    vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
  } while (0)
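
/* Usage sketch for store_u8_4x1() (illustrative only; 'dst' is a hypothetical
   buffer with room for 4 bytes). As with load_u8_4x1(), the lane index must
   be a compile-time constant:

     uint8x8_t d = vdup_n_u8(0x80);
     store_u8_4x1(dst, d, 0);  // lanes 0..3 of d -> dst[0..3]
*/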

static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3, const uint8x8_t s4,
                                const uint8x8_t s5, const uint8x8_t s6,
                                const uint8x8_t s7) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
  s += p;
  vst1_u8(s, s4);
  s += p;
  vst1_u8(s, s5);
  s += p;
  vst1_u8(s, s6);
  s += p;
  vst1_u8(s, s7);
}

static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
                                const uint8x8_t s1, const uint8x8_t s2,
                                const uint8x8_t s3) {
  vst1_u8(s, s0);
  s += p;
  vst1_u8(s, s1);
  s += p;
  vst1_u8(s, s2);
  s += p;
  vst1_u8(s, s3);
}

static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
                                 const uint8x16_t s1, const uint8x16_t s2,
                                 const uint8x16_t s3) {
  vst1q_u8(s, s0);
  s += p;
  vst1q_u8(s, s1);
  s += p;
  vst1q_u8(s, s2);
  s += p;
  vst1q_u8(s, s3);
}

static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3,
                                 const uint16x8_t s4, const uint16x8_t s5,
                                 const uint16x8_t s6, const uint16x8_t s7) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
  s += dst_stride;
  vst1q_u16(s, s4);
  s += dst_stride;
  vst1q_u16(s, s5);
  s += dst_stride;
  vst1q_u16(s, s6);
  s += dst_stride;
  vst1q_u16(s, s7);
}

static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x4_t s0, const uint16x4_t s1,
                                 const uint16x4_t s2, const uint16x4_t s3) {
  vst1_u16(s, s0);
  s += dst_stride;
  vst1_u16(s, s1);
  s += dst_stride;
  vst1_u16(s, s2);
  s += dst_stride;
  vst1_u16(s, s3);
}

static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
                                 const uint16x8_t s0, const uint16x8_t s1,
                                 const uint16x8_t s2, const uint16x8_t s3) {
  vst1q_u16(s, s0);
  s += dst_stride;
  vst1q_u16(s, s1);
  s += dst_stride;
  vst1q_u16(s, s2);
  s += dst_stride;
  vst1q_u16(s, s3);
}

static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3,
                                 const int16x8_t s4, const int16x8_t s5,
                                 const int16x8_t s6, const int16x8_t s7) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
  s += dst_stride;
  vst1q_s16(s, s4);
  s += dst_stride;
  vst1q_s16(s, s5);
  s += dst_stride;
  vst1q_s16(s, s6);
  s += dst_stride;
  vst1q_s16(s, s7);
}

static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x4_t s0, const int16x4_t s1,
                                 const int16x4_t s2, const int16x4_t s3) {
  vst1_s16(s, s0);
  s += dst_stride;
  vst1_s16(s, s1);
  s += dst_stride;
  vst1_s16(s, s2);
  s += dst_stride;
  vst1_s16(s, s3);
}

static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                 const int16x8_t s0, const int16x8_t s1,
                                 const int16x8_t s2, const int16x8_t s3) {
  vst1q_s16(s, s0);
  s += dst_stride;
  vst1q_s16(s, s1);
  s += dst_stride;
  vst1q_s16(s, s2);
  s += dst_stride;
  vst1q_s16(s, s3);
}

static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3,
                                int16x8_t *const s4, int16x8_t *const s5,
                                int16x8_t *const s6, int16x8_t *const s7) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
  s += p;
  *s4 = vld1q_s16(s);
  s += p;
  *s5 = vld1q_s16(s);
  s += p;
  *s6 = vld1q_s16(s);
  s += p;
  *s7 = vld1q_s16(s);
}

static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
                                int16x8_t *const s0, int16x8_t *const s1,
                                int16x8_t *const s2, int16x8_t *const s3) {
  *s0 = vld1q_s16(s);
  s += p;
  *s1 = vld1q_s16(s);
  s += p;
  *s2 = vld1q_s16(s);
  s += p;
  *s3 = vld1q_s16(s);
}
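
/* The load_unaligned_* helpers below gather rows that are narrower than a
   full vector and may start at arbitrary addresses. Each row is copied into
   a temporary with memcpy() rather than by casting the byte pointer to a
   wider type, which avoids alignment and strict-aliasing problems; compilers
   typically lower these small fixed-size copies to single scalar loads. */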
static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1,
                                         uint32x2_t *tu2, uint32x2_t *tu3) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu2 = vset_lane_u32(a, *tu2, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu3 = vset_lane_u32(a, *tu3, 0);
  memcpy(&a, buf, 4);
  *tu3 = vset_lane_u32(a, *tu3, 1);
}

static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0, uint32x2_t *tu1) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu1 = vset_lane_u32(a, *tu1, 0);
  memcpy(&a, buf, 4);
  *tu1 = vset_lane_u32(a, *tu1, 1);
}

static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
}

static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
                                         uint32x2_t *tu0) {
  uint32_t a;

  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 0);
  memcpy(&a, buf, 4);
  buf += stride;
  *tu0 = vset_lane_u32(a, *tu0, 1);
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define store_unaligned_u8_4x1(dst, src, lane)         \
  do {                                                 \
    uint32_t a;                                        \
    a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
    memcpy(dst, &a, 4);                                \
  } while (0)
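
/* Usage sketch for store_unaligned_u8_4x1() (illustrative only; 'dst' and
   'stride' are hypothetical). The lane index must be a compile-time constant:

     uint8x8_t d = vdup_n_u8(0);
     store_unaligned_u8_4x1(dst, d, 0);           // lanes 0..3 -> dst[0..3]
     store_unaligned_u8_4x1(dst + stride, d, 1);  // lanes 4..7 -> next row
*/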

static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
                                         uint16x4_t *tu0) {
  uint16_t a;

  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vset_lane_u16(a, *tu0, 0);
  memcpy(&a, buf, 2);
  buf += stride;
  *tu0 = vset_lane_u16(a, *tu0, 1);
}

static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3,
                                uint8x16_t *const s4, uint8x16_t *const s5,
                                uint8x16_t *const s6, uint8x16_t *const s7) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
  s += p;
  *s4 = vld1q_u8(s);
  s += p;
  *s5 = vld1q_u8(s);
  s += p;
  *s6 = vld1q_u8(s);
  s += p;
  *s7 = vld1q_u8(s);
}

static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
                                uint8x16_t *const s0, uint8x16_t *const s1,
                                uint8x16_t *const s2, uint8x16_t *const s3) {
  *s0 = vld1q_u8(s);
  s += p;
  *s1 = vld1q_u8(s);
  s += p;
  *s2 = vld1q_u8(s);
  s += p;
  *s3 = vld1q_u8(s);
}

static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
                                          uint64x2_t *tu0, uint64x2_t *tu1) {
  uint64_t a;

  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 0);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu0 = vsetq_lane_u64(a, *tu0, 1);
  memcpy(&a, buf, 8);
  buf += stride;
  *tu1 = vsetq_lane_u64(a, *tu1, 0);
  memcpy(&a, buf, 8);
  *tu1 = vsetq_lane_u64(a, *tu1, 1);
}

static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
  *s1 = vld1q_s32(s);
  s += p;
  *s2 = vld1q_s32(s);
  s += p;
  *s3 = vld1q_s32(s);
  s += p;
  *s4 = vld1q_s32(s);
}

static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
  vst1q_s32(s, s1);
  s += p;
  vst1q_s32(s, s2);
  s += p;
  vst1q_s32(s, s3);
  s += p;
  vst1q_s32(s, s4);
}

static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
                                uint32x4_t *s2, uint32x4_t *s3,
                                uint32x4_t *s4) {
  *s1 = vld1q_u32(s);
  s += p;
  *s2 = vld1q_u32(s);
  s += p;
  *s3 = vld1q_u32(s);
  s += p;
  *s4 = vld1q_u32(s);
}

static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
  vst1q_u32(s, s1);
  s += p;
  vst1q_u32(s, s2);
  s += p;
  vst1q_u32(s, s3);
  s += p;
  vst1q_u32(s, s4);
}

#endif  // AOM_AV1_COMMON_ARM_MEM_NEON_H_