      1 // Copyright 2012 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // ARM NEON version of dsp functions and loop filtering.
     11 //
// Authors: Somnath Banerjee (somnath@google.com)
//          Johann Koenig (johannkoenig@google.com)
     14 
     15 #include "./dsp.h"
     16 
     17 #if defined(WEBP_USE_NEON)
     18 
     19 #include "./neon.h"
     20 #include "../dec/vp8i_dec.h"
     21 
     22 //------------------------------------------------------------------------------
     23 // NxM Loading functions
     24 
     25 // Load/Store vertical edge
     26 #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
     27   "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
     28   "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
     29   "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
     30   "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
     31   "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
     32   "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
     33   "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
     34   "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"
     35 
     36 #define STORE8x2(c1, c2, p, stride)                                            \
     37   "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
     38   "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
     39   "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
     40   "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
     41   "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
     42   "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
     43   "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
     44   "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"
     45 
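// The two macros above use vld4.8 / vst2.8 lane accesses: reading a 4-byte-
// wide vertical edge from 8 consecutive rows de-interleaves the four columns
// into four d-registers, i.e. the load transposes on the fly. Below is a
// scalar sketch of the net effect of LOAD8x4 (illustration only, not used by
// the decoder; the two base pointers and doubled stride of the real macro are
// folded into a single row walk here).
static WEBP_INLINE void Load8x4_C_Sketch(const uint8_t* src, int stride,
                                         uint8_t col[4][8]) {
  int row, i;
  for (row = 0; row < 8; ++row) {
    for (i = 0; i < 4; ++i) col[i][row] = src[row * stride + i];
  }
}
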
     46 #if !defined(WORK_AROUND_GCC)
     47 
     48 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
     49 // (register alloc, probably). The variants somewhat mitigate the problem, but
     50 // not quite. HFilter16i() remains problematic.
     51 static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
     52   const uint8x8_t zero = vdup_n_u8(0);
     53   uint8x8x4_t out;
     54   INIT_VECTOR4(out, zero, zero, zero, zero);
     55   out = vld4_lane_u8(src + 0 * stride, out, 0);
     56   out = vld4_lane_u8(src + 1 * stride, out, 1);
     57   out = vld4_lane_u8(src + 2 * stride, out, 2);
     58   out = vld4_lane_u8(src + 3 * stride, out, 3);
     59   out = vld4_lane_u8(src + 4 * stride, out, 4);
     60   out = vld4_lane_u8(src + 5 * stride, out, 5);
     61   out = vld4_lane_u8(src + 6 * stride, out, 6);
     62   out = vld4_lane_u8(src + 7 * stride, out, 7);
     63   return out;
     64 }
     65 
     66 static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
     67                                  uint8x16_t* const p1, uint8x16_t* const p0,
     68                                  uint8x16_t* const q0, uint8x16_t* const q1) {
     69   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
     70   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
     71   const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
     72   const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
     73   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
     74   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
     75   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
     76   *q1 = vcombine_u8(row0.val[3], row8.val[3]);
     77 }
     78 
     79 #else  // WORK_AROUND_GCC
     80 
     81 #define LOADQ_LANE_32b(VALUE, LANE) do {                             \
     82   (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
     83   src += stride;                                                     \
     84 } while (0)
     85 
     86 static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
     87                                  uint8x16_t* const p1, uint8x16_t* const p0,
     88                                  uint8x16_t* const q0, uint8x16_t* const q1) {
     89   const uint32x4_t zero = vdupq_n_u32(0);
     90   uint32x4x4_t in;
     91   INIT_VECTOR4(in, zero, zero, zero, zero);
     92   src -= 2;
     93   LOADQ_LANE_32b(in.val[0], 0);
     94   LOADQ_LANE_32b(in.val[1], 0);
     95   LOADQ_LANE_32b(in.val[2], 0);
     96   LOADQ_LANE_32b(in.val[3], 0);
     97   LOADQ_LANE_32b(in.val[0], 1);
     98   LOADQ_LANE_32b(in.val[1], 1);
     99   LOADQ_LANE_32b(in.val[2], 1);
    100   LOADQ_LANE_32b(in.val[3], 1);
    101   LOADQ_LANE_32b(in.val[0], 2);
    102   LOADQ_LANE_32b(in.val[1], 2);
    103   LOADQ_LANE_32b(in.val[2], 2);
    104   LOADQ_LANE_32b(in.val[3], 2);
    105   LOADQ_LANE_32b(in.val[0], 3);
    106   LOADQ_LANE_32b(in.val[1], 3);
    107   LOADQ_LANE_32b(in.val[2], 3);
    108   LOADQ_LANE_32b(in.val[3], 3);
    109   // Transpose four 4x4 parts:
    110   {
    111     const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
    112                                         vreinterpretq_u8_u32(in.val[1]));
    113     const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
    114                                         vreinterpretq_u8_u32(in.val[3]));
    115     const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
    116                                          vreinterpretq_u16_u8(row23.val[0]));
    117     const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
    118                                          vreinterpretq_u16_u8(row23.val[1]));
    119     *p1 = vreinterpretq_u8_u16(row02.val[0]);
    120     *p0 = vreinterpretq_u8_u16(row13.val[0]);
    121     *q0 = vreinterpretq_u8_u16(row02.val[1]);
    122     *q1 = vreinterpretq_u8_u16(row13.val[1]);
    123   }
    124 }
    125 #undef LOADQ_LANE_32b
    126 
    127 #endif  // !WORK_AROUND_GCC
    128 
    129 static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
    130                                  uint8x16_t* const p3, uint8x16_t* const p2,
    131                                  uint8x16_t* const p1, uint8x16_t* const p0,
    132                                  uint8x16_t* const q0, uint8x16_t* const q1,
    133                                  uint8x16_t* const q2, uint8x16_t* const q3) {
    134   Load4x16(src - 2, stride, p3, p2, p1, p0);
    135   Load4x16(src + 2, stride, q0, q1, q2, q3);
    136 }
    137 
    138 static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
    139                                  uint8x16_t* const p1, uint8x16_t* const p0,
    140                                  uint8x16_t* const q0, uint8x16_t* const q1) {
    141   *p1 = vld1q_u8(src - 2 * stride);
    142   *p0 = vld1q_u8(src - 1 * stride);
    143   *q0 = vld1q_u8(src + 0 * stride);
    144   *q1 = vld1q_u8(src + 1 * stride);
    145 }
    146 
    147 static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
    148                                  uint8x16_t* const p3, uint8x16_t* const p2,
    149                                  uint8x16_t* const p1, uint8x16_t* const p0,
    150                                  uint8x16_t* const q0, uint8x16_t* const q1,
    151                                  uint8x16_t* const q2, uint8x16_t* const q3) {
    152   Load16x4(src - 2  * stride, stride, p3, p2, p1, p0);
    153   Load16x4(src + 2  * stride, stride, q0, q1, q2, q3);
    154 }
    155 
    156 static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
    157                                   const uint8_t* const v,
    158                                   int stride,
    159                                   uint8x16_t* const p3, uint8x16_t* const p2,
    160                                   uint8x16_t* const p1, uint8x16_t* const p0,
    161                                   uint8x16_t* const q0, uint8x16_t* const q1,
    162                                   uint8x16_t* const q2, uint8x16_t* const q3) {
    163   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples in the upper half.
    165   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
    166   *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
    167   *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
    168   *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
    169   *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
    170   *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
    171   *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
    172   *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
    173 }
    174 
    175 #if !defined(WORK_AROUND_GCC)
    176 
    177 #define LOAD_UV_8(ROW) \
    178   vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
    179 
    180 static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
    181                                    const uint8_t* const v,
    182                                    int stride,
    183                                    uint8x16_t* const p3, uint8x16_t* const p2,
    184                                    uint8x16_t* const p1, uint8x16_t* const p0,
    185                                    uint8x16_t* const q0, uint8x16_t* const q1,
    186                                    uint8x16_t* const q2, uint8x16_t* const q3) {
    187   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
  // and the v-samples in the upper half.
    189   const uint8x16_t row0 = LOAD_UV_8(0);
    190   const uint8x16_t row1 = LOAD_UV_8(1);
    191   const uint8x16_t row2 = LOAD_UV_8(2);
    192   const uint8x16_t row3 = LOAD_UV_8(3);
    193   const uint8x16_t row4 = LOAD_UV_8(4);
    194   const uint8x16_t row5 = LOAD_UV_8(5);
    195   const uint8x16_t row6 = LOAD_UV_8(6);
    196   const uint8x16_t row7 = LOAD_UV_8(7);
    197   // Perform two side-by-side 8x8 transposes
    198   // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
    199   // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
    200   // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
    201   // u30 u31 u32 u33 u34 u35 u36 u37 | ...
    202   // u40 u41 u42 u43 u44 u45 u46 u47 | ...
    203   // u50 u51 u52 u53 u54 u55 u56 u57 | ...
    204   // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
    205   // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
    206   const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
    207                                                     // u01 u11 u03 u13 ...
    208   const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
    209                                                     // u21 u31 u23 u33 ...
    210   const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
    211   const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
    212   const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
    213                                        vreinterpretq_u16_u8(row23.val[0]));
    214   const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
    215                                        vreinterpretq_u16_u8(row23.val[1]));
    216   const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
    217                                        vreinterpretq_u16_u8(row67.val[0]));
    218   const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
    219                                        vreinterpretq_u16_u8(row67.val[1]));
    220   const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
    221                                        vreinterpretq_u32_u16(row46.val[0]));
    222   const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
    223                                        vreinterpretq_u32_u16(row46.val[1]));
    224   const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
    225                                        vreinterpretq_u32_u16(row57.val[0]));
    226   const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
    227                                        vreinterpretq_u32_u16(row57.val[1]));
    228   *p3 = vreinterpretq_u8_u32(row04.val[0]);
    229   *p2 = vreinterpretq_u8_u32(row15.val[0]);
    230   *p1 = vreinterpretq_u8_u32(row26.val[0]);
    231   *p0 = vreinterpretq_u8_u32(row37.val[0]);
    232   *q0 = vreinterpretq_u8_u32(row04.val[1]);
    233   *q1 = vreinterpretq_u8_u32(row15.val[1]);
    234   *q2 = vreinterpretq_u8_u32(row26.val[1]);
    235   *q3 = vreinterpretq_u8_u32(row37.val[1]);
    236 }
    237 #undef LOAD_UV_8
    238 
    239 #endif  // !WORK_AROUND_GCC
    240 
    241 static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
    242                                  uint8_t* const dst, int stride) {
    243   vst2_lane_u8(dst + 0 * stride, v, 0);
    244   vst2_lane_u8(dst + 1 * stride, v, 1);
    245   vst2_lane_u8(dst + 2 * stride, v, 2);
    246   vst2_lane_u8(dst + 3 * stride, v, 3);
    247   vst2_lane_u8(dst + 4 * stride, v, 4);
    248   vst2_lane_u8(dst + 5 * stride, v, 5);
    249   vst2_lane_u8(dst + 6 * stride, v, 6);
    250   vst2_lane_u8(dst + 7 * stride, v, 7);
    251 }
    252 
    253 static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
    254                                   uint8_t* const dst, int stride) {
    255   uint8x8x2_t lo, hi;
    256   lo.val[0] = vget_low_u8(p0);
    257   lo.val[1] = vget_low_u8(q0);
    258   hi.val[0] = vget_high_u8(p0);
    259   hi.val[1] = vget_high_u8(q0);
    260   Store2x8(lo, dst - 1 + 0 * stride, stride);
    261   Store2x8(hi, dst - 1 + 8 * stride, stride);
    262 }
    263 
    264 #if !defined(WORK_AROUND_GCC)
    265 static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
    266                                  uint8_t* const dst, int stride) {
    267   vst4_lane_u8(dst + 0 * stride, v, 0);
    268   vst4_lane_u8(dst + 1 * stride, v, 1);
    269   vst4_lane_u8(dst + 2 * stride, v, 2);
    270   vst4_lane_u8(dst + 3 * stride, v, 3);
    271   vst4_lane_u8(dst + 4 * stride, v, 4);
    272   vst4_lane_u8(dst + 5 * stride, v, 5);
    273   vst4_lane_u8(dst + 6 * stride, v, 6);
    274   vst4_lane_u8(dst + 7 * stride, v, 7);
    275 }
    276 
    277 static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
    278                                   const uint8x16_t q0, const uint8x16_t q1,
    279                                   uint8_t* const dst, int stride) {
    280   uint8x8x4_t lo, hi;
    281   INIT_VECTOR4(lo,
    282                vget_low_u8(p1), vget_low_u8(p0),
    283                vget_low_u8(q0), vget_low_u8(q1));
    284   INIT_VECTOR4(hi,
    285                vget_high_u8(p1), vget_high_u8(p0),
    286                vget_high_u8(q0), vget_high_u8(q1));
    287   Store4x8(lo, dst - 2 + 0 * stride, stride);
    288   Store4x8(hi, dst - 2 + 8 * stride, stride);
    289 }
    290 #endif  // !WORK_AROUND_GCC
    291 
    292 static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
    293                                   uint8_t* const dst, int stride) {
    294   vst1q_u8(dst - stride, p0);
    295   vst1q_u8(dst, q0);
    296 }
    297 
    298 static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
    299                                   const uint8x16_t q0, const uint8x16_t q1,
    300                                   uint8_t* const dst, int stride) {
    301   Store16x2(p1, p0, dst - stride, stride);
    302   Store16x2(q0, q1, dst + stride, stride);
    303 }
    304 
    305 static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
    306                                    uint8_t* const u, uint8_t* const v,
    307                                    int stride) {
    308   // p0 and q0 contain the u+v samples packed in low/high halves.
    309   vst1_u8(u - stride, vget_low_u8(p0));
    310   vst1_u8(u,          vget_low_u8(q0));
    311   vst1_u8(v - stride, vget_high_u8(p0));
    312   vst1_u8(v,          vget_high_u8(q0));
    313 }
    314 
    315 static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
    316                                    const uint8x16_t q0, const uint8x16_t q1,
    317                                    uint8_t* const u, uint8_t* const v,
    318                                    int stride) {
    319   // The p1...q1 registers contain the u+v samples packed in low/high halves.
    320   Store8x2x2(p1, p0, u - stride, v - stride, stride);
    321   Store8x2x2(q0, q1, u + stride, v + stride, stride);
    322 }
    323 
    324 #if !defined(WORK_AROUND_GCC)
    325 
    326 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
    327   vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
    328   vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
    329   (DST) += stride;                                \
    330 } while (0)
    331 
    332 static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
    333                                    const uint8x16_t p0, const uint8x16_t q0,
    334                                    const uint8x16_t q1, const uint8x16_t q2,
    335                                    uint8_t* u, uint8_t* v,
    336                                    int stride) {
    337   uint8x8x3_t u0, u1, v0, v1;
    338   INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
    339   INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
    340   INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
    341   INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
    342   STORE6_LANE(u, u0, u1, 0);
    343   STORE6_LANE(u, u0, u1, 1);
    344   STORE6_LANE(u, u0, u1, 2);
    345   STORE6_LANE(u, u0, u1, 3);
    346   STORE6_LANE(u, u0, u1, 4);
    347   STORE6_LANE(u, u0, u1, 5);
    348   STORE6_LANE(u, u0, u1, 6);
    349   STORE6_LANE(u, u0, u1, 7);
    350   STORE6_LANE(v, v0, v1, 0);
    351   STORE6_LANE(v, v0, v1, 1);
    352   STORE6_LANE(v, v0, v1, 2);
    353   STORE6_LANE(v, v0, v1, 3);
    354   STORE6_LANE(v, v0, v1, 4);
    355   STORE6_LANE(v, v0, v1, 5);
    356   STORE6_LANE(v, v0, v1, 6);
    357   STORE6_LANE(v, v0, v1, 7);
    358 }
    359 #undef STORE6_LANE
    360 
    361 static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
    362                                    const uint8x16_t q0, const uint8x16_t q1,
    363                                    uint8_t* const u, uint8_t* const v,
    364                                    int stride) {
    365   uint8x8x4_t u0, v0;
    366   INIT_VECTOR4(u0,
    367                vget_low_u8(p1), vget_low_u8(p0),
    368                vget_low_u8(q0), vget_low_u8(q1));
    369   INIT_VECTOR4(v0,
    370                vget_high_u8(p1), vget_high_u8(p0),
    371                vget_high_u8(q0), vget_high_u8(q1));
    372   vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
    373   vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
    374   vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
    375   vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
    376   vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
    377   vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
    378   vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
    379   vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
    380   vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
    381   vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
    382   vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
    383   vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
    384   vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
    385   vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
    386   vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
    387   vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
    388 }
    389 
    390 #endif  // !WORK_AROUND_GCC
    391 
    392 // Zero extend 'v' to an int16x8_t.
    393 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
    394   return vreinterpretq_s16_u16(vmovl_u8(v));
    395 }
    396 
    397 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
    398 // to the corresponding rows of 'dst'.
    399 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
    400                                             const int16x8_t dst01,
    401                                             const int16x8_t dst23) {
    402   // Unsigned saturate to 8b.
    403   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
    404   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
    405 
    406   // Store the results.
    407   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
    408   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
    409   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
    410   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
    411 }
    412 
    413 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
    414                                uint8_t* const dst) {
    415   uint32x2_t dst01 = vdup_n_u32(0);
    416   uint32x2_t dst23 = vdup_n_u32(0);
    417 
    418   // Load the source pixels.
    419   dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
    420   dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
    421   dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
    422   dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
    423 
    424   {
    425     // Convert to 16b.
    426     const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
    427     const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));
    428 
    429     // Descale with rounding.
    430     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    431     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    432     // Add the inverse transform.
    433     SaturateAndStore4x4(dst, out01, out23);
    434   }
    435 }
    436 
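// Scalar sketch of the reconstruction step above (illustration only, not used
// by the decoder): each residual is descaled by (v + 4) >> 3, which is what
// vrsraq_n_s16(dst, row, 3) accumulates, and the sum is clamped to [0, 255].
// 'res' is assumed to hold the 4x4 residuals in row-major order.
static WEBP_INLINE void Add4x4_C_Sketch(const int16_t res[16],
                                        uint8_t* const dst) {
  int i, j;
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) {
      const int v = dst[i + j * BPS] + ((res[i + j * 4] + 4) >> 3);
      dst[i + j * BPS] = (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
  }
}
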
    437 //-----------------------------------------------------------------------------
    438 // Simple In-loop filtering (Paragraph 15.2)
    439 
    440 static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
    441                               const uint8x16_t q0, const uint8x16_t q1,
    442                               int thresh) {
    443   const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
    444   const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
    445   const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
    446   const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
    447   const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
    448   const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
    449   const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
    450   return mask;
    451 }
    452 
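// Scalar sketch of the same test (illustration only, not used by the decoder
// and ignoring the 8-bit saturation of the vector code): the edge is filtered
// when 2 * |p0 - q0| + |p1 - q1| / 2 does not exceed 'thresh'. Abs_C_Sketch
// is a local helper, not a libwebp API.
static WEBP_INLINE int Abs_C_Sketch(int v) { return (v < 0) ? -v : v; }
static WEBP_INLINE int NeedsFilter_C_Sketch(int p1, int p0, int q0, int q1,
                                            int thresh) {
  return (2 * Abs_C_Sketch(p0 - q0) + Abs_C_Sketch(p1 - q1) / 2) <= thresh;
}
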
    453 static int8x16_t FlipSign(const uint8x16_t v) {
    454   const uint8x16_t sign_bit = vdupq_n_u8(0x80);
    455   return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
    456 }
    457 
    458 static uint8x16_t FlipSignBack(const int8x16_t v) {
    459   const int8x16_t sign_bit = vdupq_n_s8(0x80);
    460   return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
    461 }
    462 
    463 static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
    464                               const int8x16_t q0, const int8x16_t q1) {
    465   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
    466   const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
    467   const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
    468   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
    469   const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
    470   return s3;
    471 }
    472 
    473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
    474   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
    475   const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
    476   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
    477   return s2;
    478 }
    479 
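// Scalar sketch of the "base delta" (illustration only, not used by the
// decoder): a = clamp(p1 - q1) + 3 * (q0 - p0), where the inputs are the
// sign-flipped values in [-128, 127] and every intermediate sum saturates to
// that range, exactly like the vqadd/vqsub instructions above.
// ClampS8_C_Sketch is a local helper, not a libwebp API.
static WEBP_INLINE int ClampS8_C_Sketch(int v) {
  return (v < -128) ? -128 : (v > 127) ? 127 : v;
}
static WEBP_INLINE int GetBaseDelta_C_Sketch(int p1, int p0, int q0, int q1) {
  const int q0_p0 = ClampS8_C_Sketch(q0 - p0);
  const int p1_q1 = ClampS8_C_Sketch(p1 - q1);
  const int s1 = ClampS8_C_Sketch(p1_q1 + q0_p0);
  const int s2 = ClampS8_C_Sketch(s1 + q0_p0);
  return ClampS8_C_Sketch(s2 + q0_p0);
}
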
    480 //------------------------------------------------------------------------------
    481 
    482 static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
    483                                const int8x16_t delta,
    484                                int8x16_t* const op0, int8x16_t* const oq0) {
    485   const int8x16_t kCst3 = vdupq_n_s8(0x03);
    486   const int8x16_t kCst4 = vdupq_n_s8(0x04);
    487   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
    488   const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
    489   const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
    490   const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
    491   *op0 = vqaddq_s8(p0s, delta3);
    492   *oq0 = vqsubq_s8(q0s, delta4);
    493 }
    494 
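// Scalar sketch of the update above (illustration only, not used by the
// decoder): p0 moves by (delta + 3) >> 3 and q0 by (delta + 4) >> 3, computed
// and applied with the same [-128, 127] saturation as the vqadd/vqsub/vshr
// sequence (the shifts below are arithmetic, as with vshrq_n_s8).
static WEBP_INLINE void ApplyFilter2_C_Sketch(int* const p0, int* const q0,
                                              int delta) {
  const int a3 = ClampS8_C_Sketch(delta + 3) >> 3;
  const int a4 = ClampS8_C_Sketch(delta + 4) >> 3;
  *p0 = ClampS8_C_Sketch(*p0 + a3);
  *q0 = ClampS8_C_Sketch(*q0 - a4);
}
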
    495 #if defined(WEBP_USE_INTRINSICS)
    496 
    497 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
    498                          const int8x16_t delta,
    499                          uint8x16_t* const op0, uint8x16_t* const oq0) {
    500   const int8x16_t kCst3 = vdupq_n_s8(0x03);
    501   const int8x16_t kCst4 = vdupq_n_s8(0x04);
    502   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
    503   const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
    504   const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
    505   const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
    506   const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
    507   const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
    508   *op0 = FlipSignBack(sp0);
    509   *oq0 = FlipSignBack(sq0);
    510 }
    511 
    512 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
    513                       const uint8x16_t q0, const uint8x16_t q1,
    514                       const uint8x16_t mask,
    515                       uint8x16_t* const op0, uint8x16_t* const oq0) {
    516   const int8x16_t p1s = FlipSign(p1);
    517   const int8x16_t p0s = FlipSign(p0);
    518   const int8x16_t q0s = FlipSign(q0);
    519   const int8x16_t q1s = FlipSign(q1);
    520   const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
    521   const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
    522   ApplyFilter2(p0s, q0s, delta1, op0, oq0);
    523 }
    524 
    525 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
    526   uint8x16_t p1, p0, q0, q1, op0, oq0;
    527   Load16x4(p, stride, &p1, &p0, &q0, &q1);
    528   {
    529     const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
    530     DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
    531   }
    532   Store16x2(op0, oq0, p, stride);
    533 }
    534 
    535 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
    536   uint8x16_t p1, p0, q0, q1, oq0, op0;
    537   Load4x16(p, stride, &p1, &p0, &q0, &q1);
    538   {
    539     const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
    540     DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
    541   }
    542   Store2x16(op0, oq0, p, stride);
    543 }
    544 
    545 #else
    546 
    547 #define QRegs "q0", "q1", "q2", "q3",                                          \
    548               "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
    549 
    550 #define FLIP_SIGN_BIT2(a, b, s)                                                \
    551   "veor     " #a "," #a "," #s "               \n"                             \
    552   "veor     " #b "," #b "," #s "               \n"                             \
    553 
    554 #define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
    555   FLIP_SIGN_BIT2(a, b, s)                                                      \
    556   FLIP_SIGN_BIT2(c, d, s)                                                      \
    557 
    558 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
    559   "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
    560   "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
    561   "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
    562   "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
    563   "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
    564   "vdup.8     q14, " #thresh "            \n"                                  \
    565   "vcge.u8   " #mask ", q14, q15          \n"  /* mask <= thresh */
    566 
    567 #define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
    568   "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
    569   "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
    570   "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (p0 - q0) */ \
    571   "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (p0 - q0) */ \
    572   "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (p0 - q0) */
    573 
    574 #define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
    575   "vmov.i8    q15, #0x03                  \n"                                  \
    576   "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
    577   "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
    578   "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
    579                                                                                \
    580   "vmov.i8    q15, #0x04                  \n"                                  \
    581   "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 4 */      \
    582   "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
    583   "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */
    584 
    585 // Applies filter on 2 pixels (p0 and q0)
    586 #define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
    587   NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
    588   "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
    589   FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
    590   GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
    591   "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
    592   DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
    593   FLIP_SIGN_BIT2(p0, q0, q10)
    594 
    595 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
    596   __asm__ volatile (
    597     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
    598 
    599     "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    600     "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    601     "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
    602     "vld1.u8    {q12}, [%[p]]                  \n"  // q1
    603 
    604     DO_FILTER2(q1, q2, q3, q12, %[thresh])
    605 
    606     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
    607 
    608     "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
    609     "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
    610     : [p] "+r"(p)
    611     : [stride] "r"(stride), [thresh] "r"(thresh)
    612     : "memory", QRegs
    613   );
    614 }
    615 
    616 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
    617   __asm__ volatile (
    618     "sub        r4, %[p], #2                   \n"  // base1 = p - 2
    619     "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
    620     "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
    621 
    622     LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    623     LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    624     "vswp       d3, d24                        \n"  // p1:q1 p0:q3
    625     "vswp       d5, d26                        \n"  // q0:q2 q1:q4
    626     "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
    627 
    628     DO_FILTER2(q1, q2, q12, q13, %[thresh])
    629 
    630     "sub        %[p], %[p], #1                 \n"  // p - 1
    631 
    632     "vswp        d5, d24                       \n"
    633     STORE8x2(d4, d5, [%[p]], %[stride])
    634     STORE8x2(d24, d25, [%[p]], %[stride])
    635 
    636     : [p] "+r"(p)
    637     : [stride] "r"(stride), [thresh] "r"(thresh)
    638     : "memory", "r4", "r5", "r6", QRegs
    639   );
    640 }
    641 
    642 #endif    // WEBP_USE_INTRINSICS
    643 
    644 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
    645   uint32_t k;
    646   for (k = 3; k != 0; --k) {
    647     p += 4 * stride;
    648     SimpleVFilter16(p, stride, thresh);
    649   }
    650 }
    651 
    652 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
    653   uint32_t k;
    654   for (k = 3; k != 0; --k) {
    655     p += 4;
    656     SimpleHFilter16(p, stride, thresh);
    657   }
    658 }
    659 
    660 //------------------------------------------------------------------------------
    661 // Complex In-loop filtering (Paragraph 15.3)
    662 
    663 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
    664                            const uint8x16_t q0, const uint8x16_t q1,
    665                            int hev_thresh) {
    666   const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
    667   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
    668   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
    669   const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
    670   const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
    671   return mask;
    672 }
    673 
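// Scalar sketch (illustration only, not used by the decoder): an edge has
// "high edge variance" when max(|p1 - p0|, |q1 - q0|) > hev_thresh.
static WEBP_INLINE int NeedsHev_C_Sketch(int p1, int p0, int q0, int q1,
                                         int hev_thresh) {
  const int a = Abs_C_Sketch(p1 - p0);
  const int b = Abs_C_Sketch(q1 - q0);
  return ((a > b) ? a : b) > hev_thresh;
}
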
    674 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
    675                                const uint8x16_t p1, const uint8x16_t p0,
    676                                const uint8x16_t q0, const uint8x16_t q1,
    677                                const uint8x16_t q2, const uint8x16_t q3,
    678                                int ithresh, int thresh) {
    679   const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
    680   const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
    681   const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
    682   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
    683   const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
    684   const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
    685   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
    686   const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
    687   const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
    688   const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
    689   const uint8x16_t max12 = vmaxq_u8(max1, max2);
    690   const uint8x16_t max123 = vmaxq_u8(max12, max3);
    691   const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
    692   const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
    693   const uint8x16_t mask = vandq_u8(mask1, mask2);
    694   return mask;
    695 }
    696 
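// Scalar sketch of the combined test (illustration only, not used by the
// decoder): the edge is filtered when every neighbouring difference stays
// within 'ithresh' and the simple NeedsFilter() criterion holds as well.
// Relies on the Abs_C_Sketch / NeedsFilter_C_Sketch helpers above.
static WEBP_INLINE int NeedsFilter2_C_Sketch(
    int p3, int p2, int p1, int p0, int q0, int q1, int q2, int q3,
    int ithresh, int thresh) {
  const int inner_ok =
      Abs_C_Sketch(p3 - p2) <= ithresh && Abs_C_Sketch(p2 - p1) <= ithresh &&
      Abs_C_Sketch(p1 - p0) <= ithresh && Abs_C_Sketch(q3 - q2) <= ithresh &&
      Abs_C_Sketch(q2 - q1) <= ithresh && Abs_C_Sketch(q1 - q0) <= ithresh;
  return inner_ok && NeedsFilter_C_Sketch(p1, p0, q0, q1, thresh);
}
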
//  4-point filter
    698 
    699 static void ApplyFilter4(
    700     const int8x16_t p1, const int8x16_t p0,
    701     const int8x16_t q0, const int8x16_t q1,
    702     const int8x16_t delta0,
    703     uint8x16_t* const op1, uint8x16_t* const op0,
    704     uint8x16_t* const oq0, uint8x16_t* const oq1) {
    705   const int8x16_t kCst3 = vdupq_n_s8(0x03);
    706   const int8x16_t kCst4 = vdupq_n_s8(0x04);
    707   const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
    708   const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
    709   const int8x16_t a1 = vshrq_n_s8(delta1, 3);
    710   const int8x16_t a2 = vshrq_n_s8(delta2, 3);
    711   const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
    712   *op0 = FlipSignBack(vqaddq_s8(p0, a2));  // clip(p0 + a2)
    713   *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
    714   *op1 = FlipSignBack(vqaddq_s8(p1, a3));  // clip(p1 + a3)
    715   *oq1 = FlipSignBack(vqsubq_s8(q1, a3));  // clip(q1 - a3)
    716 }
    717 
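// Scalar sketch of the 4-point update (illustration only, not used by the
// decoder): a1 = clamp(delta + 4) >> 3 is taken off q0, a2 = clamp(delta + 3)
// >> 3 is added to p0, and the rounded half a3 = (a1 + 1) >> 1 adjusts the
// outer pair p1/q1. All values are the sign-flipped signed samples.
static WEBP_INLINE void ApplyFilter4_C_Sketch(int* const p1, int* const p0,
                                              int* const q0, int* const q1,
                                              int delta) {
  const int a1 = ClampS8_C_Sketch(delta + 4) >> 3;
  const int a2 = ClampS8_C_Sketch(delta + 3) >> 3;
  const int a3 = (a1 + 1) >> 1;
  *p0 = ClampS8_C_Sketch(*p0 + a2);
  *q0 = ClampS8_C_Sketch(*q0 - a1);
  *p1 = ClampS8_C_Sketch(*p1 + a3);
  *q1 = ClampS8_C_Sketch(*q1 - a3);
}
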
    718 static void DoFilter4(
    719     const uint8x16_t p1, const uint8x16_t p0,
    720     const uint8x16_t q0, const uint8x16_t q1,
    721     const uint8x16_t mask, const uint8x16_t hev_mask,
    722     uint8x16_t* const op1, uint8x16_t* const op0,
    723     uint8x16_t* const oq0, uint8x16_t* const oq1) {
    724   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
    725   const int8x16_t p1s = FlipSign(p1);
    726   int8x16_t p0s = FlipSign(p0);
    727   int8x16_t q0s = FlipSign(q0);
    728   const int8x16_t q1s = FlipSign(q1);
    729   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
    730 
    731   // do_filter2 part (simple loopfilter on pixels with hev)
    732   {
    733     const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
    734     const int8x16_t simple_lf_delta =
    735         vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
    736     ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
    737   }
    738 
    739   // do_filter4 part (complex loopfilter on pixels without hev)
    740   {
    741     const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
    742     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    743     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    744     const int8x16_t complex_lf_delta =
    745         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    746     ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
    747   }
    748 }
    749 
    750 //  6-points filter
    751 
    752 static void ApplyFilter6(
    753     const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    754     const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    755     const int8x16_t delta,
    756     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    757     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
    758   // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
    759   // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
    760   // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
    761   //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
    762   const int8x8_t delta_lo = vget_low_s8(delta);
    763   const int8x8_t delta_hi = vget_high_s8(delta);
    764   const int8x8_t kCst9 = vdup_n_s8(9);
    765   const int16x8_t kCstm1 = vdupq_n_s16(-1);
    766   const int8x8_t kCst18 = vdup_n_s8(18);
    767   const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
    768   const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
    769   const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
    770   const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
    771   const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
    772   const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
    773   const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
    774   const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
    775   const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
    776   const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
    777   const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
    778   const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
    779   const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
    780 
    781   *op0 = FlipSignBack(vqaddq_s8(p0, a1));  // clip(p0 + a1)
  *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
    783   *oq1 = FlipSignBack(vqsubq_s8(q1, a2));  // clip(q1 - a2)
    784   *op1 = FlipSignBack(vqaddq_s8(p1, a2));  // clip(p1 + a2)
    785   *oq2 = FlipSignBack(vqsubq_s8(q2, a3));  // clip(q2 - a3)
    786   *op2 = FlipSignBack(vqaddq_s8(p2, a3));  // clip(p2 + a3)
    787 }
    788 
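// Worked check of the identities used in ApplyFilter6(), with S = 9 * a - 1
// and vqrshrn_n_s16(x, n) computing the rounded shift
// (x + (1 << (n - 1))) >> n:
//   a3 = (S + 64) >> 7 = (9 * a + 63) >> 7
//   a2 = (S + 32) >> 6 = (9 * a + 31) >> 6  (same value as (18 * a + 63) >> 7)
//   a1 = (Z + 64) >> 7 = (27 * a + 63) >> 7, with Z = S + 18 * a.
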
    789 static void DoFilter6(
    790     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    791     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    792     const uint8x16_t mask, const uint8x16_t hev_mask,
    793     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    794     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
    795   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
    796   const int8x16_t p2s = FlipSign(p2);
    797   const int8x16_t p1s = FlipSign(p1);
    798   int8x16_t p0s = FlipSign(p0);
    799   int8x16_t q0s = FlipSign(q0);
    800   const int8x16_t q1s = FlipSign(q1);
    801   const int8x16_t q2s = FlipSign(q2);
    802   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
    803   const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
    804 
    805   // do_filter2 part (simple loopfilter on pixels with hev)
    806   {
    807     const int8x16_t simple_lf_delta =
    808         vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    809     ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
    810   }
    811 
    812   // do_filter6 part (complex loopfilter on pixels without hev)
    813   {
    814     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    815     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    816     const int8x16_t complex_lf_delta =
    817         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    818     ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
    819                  op2, op1, op0, oq0, oq1, oq2);
    820   }
    821 }
    822 
    823 // on macroblock edges
    824 
    825 static void VFilter16(uint8_t* p, int stride,
    826                       int thresh, int ithresh, int hev_thresh) {
    827   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    828   Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    829   {
    830     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    831                                          ithresh, thresh);
    832     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    833     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    834     DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    835               &op2, &op1, &op0, &oq0, &oq1, &oq2);
    836     Store16x2(op2, op1, p - 2 * stride, stride);
    837     Store16x2(op0, oq0, p + 0 * stride, stride);
    838     Store16x2(oq1, oq2, p + 2 * stride, stride);
    839   }
    840 }
    841 
    842 static void HFilter16(uint8_t* p, int stride,
    843                       int thresh, int ithresh, int hev_thresh) {
    844   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    845   Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    846   {
    847     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    848                                          ithresh, thresh);
    849     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    850     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    851     DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    852               &op2, &op1, &op0, &oq0, &oq1, &oq2);
    853     Store2x16(op2, op1, p - 2, stride);
    854     Store2x16(op0, oq0, p + 0, stride);
    855     Store2x16(oq1, oq2, p + 2, stride);
    856   }
    857 }
    858 
    859 // on three inner edges
    860 static void VFilter16i(uint8_t* p, int stride,
    861                        int thresh, int ithresh, int hev_thresh) {
    862   uint32_t k;
    863   uint8x16_t p3, p2, p1, p0;
    864   Load16x4(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
    865   for (k = 3; k != 0; --k) {
    866     uint8x16_t q0, q1, q2, q3;
    867     p += 4 * stride;
    868     Load16x4(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
    869     {
    870       const uint8x16_t mask =
    871           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
    872       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    873       // p3 and p2 are not just temporary variables here: they will be
      // re-used for the next span. And q2/q3 will become p1/p0 accordingly.
    875       DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
    876       Store16x4(p1, p0, p3, p2, p, stride);
    877       p1 = q2;
    878       p0 = q3;
    879     }
    880   }
    881 }
    882 
    883 #if !defined(WORK_AROUND_GCC)
    884 static void HFilter16i(uint8_t* p, int stride,
    885                        int thresh, int ithresh, int hev_thresh) {
    886   uint32_t k;
    887   uint8x16_t p3, p2, p1, p0;
    888   Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
    889   for (k = 3; k != 0; --k) {
    890     uint8x16_t q0, q1, q2, q3;
    891     p += 4;
    892     Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
    893     {
    894       const uint8x16_t mask =
    895           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
    896       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    897       DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
    898       Store4x16(p1, p0, p3, p2, p, stride);
    899       p1 = q2;
    900       p0 = q3;
    901     }
    902   }
    903 }
    904 #endif  // !WORK_AROUND_GCC
    905 
    906 // 8-pixels wide variant, for chroma filtering
    907 static void VFilter8(uint8_t* u, uint8_t* v, int stride,
    908                      int thresh, int ithresh, int hev_thresh) {
    909   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    910   Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    911   {
    912     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    913                                          ithresh, thresh);
    914     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    915     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    916     DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    917               &op2, &op1, &op0, &oq0, &oq1, &oq2);
    918     Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    919     Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    920     Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
    921   }
    922 }
    923 static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
    924                       int thresh, int ithresh, int hev_thresh) {
    925   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    926   u += 4 * stride;
    927   v += 4 * stride;
    928   Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    929   {
    930     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    931                                          ithresh, thresh);
    932     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    933     uint8x16_t op1, op0, oq0, oq1;
    934     DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    935     Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
    936   }
    937 }
    938 
    939 #if !defined(WORK_AROUND_GCC)
    940 static void HFilter8(uint8_t* u, uint8_t* v, int stride,
    941                      int thresh, int ithresh, int hev_thresh) {
    942   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    943   Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    944   {
    945     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    946                                          ithresh, thresh);
    947     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    948     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    949     DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    950               &op2, &op1, &op0, &oq0, &oq1, &oq2);
    951     Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
    952   }
    953 }
    954 
    955 static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
    956                       int thresh, int ithresh, int hev_thresh) {
    957   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    958   u += 4;
    959   v += 4;
    960   Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    961   {
    962     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    963                                          ithresh, thresh);
    964     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    965     uint8x16_t op1, op0, oq0, oq1;
    966     DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    967     Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
    968   }
    969 }
    970 #endif  // !WORK_AROUND_GCC
    971 
    972 //-----------------------------------------------------------------------------
    973 // Inverse transforms (Paragraph 14.4)
    974 
    975 // Technically these are unsigned but vqdmulh is only available in signed.
    976 // vqdmulh returns high half (effectively >> 16) but also doubles the value,
    977 // changing the >> 16 to >> 15 and requiring an additional >> 1.
    978 // We use this to our advantage with kC2. The canonical value is 35468.
    979 // However, the high bit is set so treating it as signed will give incorrect
    980 // results. We avoid this by down shifting by 1 here to clear the highest bit.
    981 // Combined with the doubling effect of vqdmulh we get >> 16.
// This cannot be applied to kC1 because the lowest bit is set. Down shifting
    983 // the constant would reduce precision.
    984 
    985 // libwebp uses a trick to avoid some extra addition that libvpx does.
    986 // Instead of:
    987 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
    988 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
    989 // same issue with kC1 and vqdmulh that we work around by down shifting kC2
    990 
    991 static const int16_t kC1 = 20091;
    992 static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
    993 
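// Worked example of the scaling trick (numbers only):
//   vqdmulhq_n_s16(x, c) computes (2 * x * c) >> 16, saturated.
//   kC1: with c = 20091 this is (x * 20091) >> 15, so the extra ">> 1"
//        (vsraq_n_s16(..., 1) below, or "vshr.s16 #1" in the asm path)
//        restores (x * 20091) >> 16, and the accumulate adds back the "+ x"
//        that the C code folds into kC1 as "+ (1 << 16)".
//   kC2: the canonical 35468 is stored halved (17734), so
//        (2 * x * 17734) >> 16 == (x * 35468) >> 16 directly.
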
    994 #if defined(WEBP_USE_INTRINSICS)
    995 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
    996                                      int16x8x2_t* const out) {
    997   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
    998   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
    999   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
   1000                                                   // b0 d0 b1 d1 b2 d2 ...
   1001   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
   1002 }
   1003 
   1004 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
   1005   // {rows} = in0 | in4
   1006   //          in8 | in12
   1007   // B1 = in4 | in12
   1008   const int16x8_t B1 =
   1009       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
   1010   // C0 = kC1 * in4 | kC1 * in12
   1011   // C1 = kC2 * in4 | kC2 * in12
   1012   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
   1013   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
   1014   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
   1015                                 vget_low_s16(rows->val[1]));   // in0 + in8
   1016   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
   1017                                 vget_low_s16(rows->val[1]));   // in0 - in8
   1018   // c = kC2 * in4 - kC1 * in12
   1019   // d = kC1 * in4 + kC2 * in12
   1020   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
   1021   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
   1022   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
   1023   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
   1024   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
   1025   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
   1026   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
   1027   Transpose8x2(E0, E1, rows);
   1028 }
   1029 
   1030 static void TransformOne(const int16_t* in, uint8_t* dst) {
   1031   int16x8x2_t rows;
   1032   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   1033   TransformPass(&rows);
   1034   TransformPass(&rows);
   1035   Add4x4(rows.val[0], rows.val[1], dst);
   1036 }
   1037 
   1038 #else
   1039 
   1040 static void TransformOne(const int16_t* in, uint8_t* dst) {
   1041   const int kBPS = BPS;
   1042   // kC1, kC2. Padded because vld1.16 loads 8 bytes
   1043   const int16_t constants[4] = { kC1, kC2, 0, 0 };
   1044   /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
   1045   __asm__ volatile (
   1046     "vld1.16         {q1, q2}, [%[in]]           \n"
   1047     "vld1.16         {d0}, [%[constants]]        \n"
   1048 
   1049     /* d2: in[0]
   1050      * d3: in[8]
   1051      * d4: in[4]
   1052      * d5: in[12]
   1053      */
   1054     "vswp            d3, d4                      \n"
   1055 
   1056     /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
   1057      * q9 = {in[4], in[12]} * kC2 >> 16
   1058      */
   1059     "vqdmulh.s16     q8, q2, d0[0]               \n"
   1060     "vqdmulh.s16     q9, q2, d0[1]               \n"
   1061 
   1062     /* d22 = a = in[0] + in[8]
   1063      * d23 = b = in[0] - in[8]
   1064      */
   1065     "vqadd.s16       d22, d2, d3                 \n"
   1066     "vqsub.s16       d23, d2, d3                 \n"
   1067 
   1068     /* The multiplication should be x * kC1 >> 16
   1069      * However, with vqdmulh we get x * kC1 * 2 >> 16
   1070      * (multiply, double, return high half)
   1071      * We avoided this in kC2 by pre-shifting the constant.
   1072      * q8 = in[4]/[12] * kC1 >> 16
   1073      */
   1074     "vshr.s16        q8, q8, #1                  \n"
   1075 
   1076     /* Add {in[4], in[12]} back after the multiplication. This is handled by
   1077      * adding 1 << 16 to kC1 in the libwebp C code.
   1078      */
   1079     "vqadd.s16       q8, q2, q8                  \n"
   1080 
   1081     /* d20 = c = in[4]*kC2 - in[12]*kC1
   1082      * d21 = d = in[4]*kC1 + in[12]*kC2
   1083      */
   1084     "vqsub.s16       d20, d18, d17               \n"
   1085     "vqadd.s16       d21, d19, d16               \n"
   1086 
   1087     /* d2 = tmp[0] = a + d
   1088      * d3 = tmp[1] = b + c
   1089      * d4 = tmp[2] = b - c
   1090      * d5 = tmp[3] = a - d
   1091      */
   1092     "vqadd.s16       d2, d22, d21                \n"
   1093     "vqadd.s16       d3, d23, d20                \n"
   1094     "vqsub.s16       d4, d23, d20                \n"
   1095     "vqsub.s16       d5, d22, d21                \n"
   1096 
   1097     "vzip.16         q1, q2                      \n"
   1098     "vzip.16         q1, q2                      \n"
   1099 
   1100     "vswp            d3, d4                      \n"
   1101 
   1102     /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
   1103      * q9 = {tmp[4], tmp[12]} * kC2 >> 16
   1104      */
   1105     "vqdmulh.s16     q8, q2, d0[0]               \n"
   1106     "vqdmulh.s16     q9, q2, d0[1]               \n"
   1107 
   1108     /* d22 = a = tmp[0] + tmp[8]
   1109      * d23 = b = tmp[0] - tmp[8]
   1110      */
   1111     "vqadd.s16       d22, d2, d3                 \n"
   1112     "vqsub.s16       d23, d2, d3                 \n"
   1113 
    1114     /* See the long-winded explanation in the first pass above. */
   1115     "vshr.s16        q8, q8, #1                  \n"
   1116     "vqadd.s16       q8, q2, q8                  \n"
   1117 
    1118     /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
    1119      * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
   1120      */
   1121     "vqsub.s16       d20, d18, d17               \n"
   1122     "vqadd.s16       d21, d19, d16               \n"
   1123 
   1124     /* d2 = tmp[0] = a + d
   1125      * d3 = tmp[1] = b + c
   1126      * d4 = tmp[2] = b - c
   1127      * d5 = tmp[3] = a - d
   1128      */
   1129     "vqadd.s16       d2, d22, d21                \n"
   1130     "vqadd.s16       d3, d23, d20                \n"
   1131     "vqsub.s16       d4, d23, d20                \n"
   1132     "vqsub.s16       d5, d22, d21                \n"
   1133 
   1134     "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
   1135     "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
   1136     "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
   1137     "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
   1138 
   1139     "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
   1140 
    1141     /* (val + 4) >> 3 */
   1142     "vrshr.s16       d2, d2, #3                  \n"
   1143     "vrshr.s16       d3, d3, #3                  \n"
   1144     "vrshr.s16       d4, d4, #3                  \n"
   1145     "vrshr.s16       d5, d5, #3                  \n"
   1146 
   1147     "vzip.16         q1, q2                      \n"
   1148     "vzip.16         q1, q2                      \n"
   1149 
   1150     /* Must accumulate before saturating */
   1151     "vmovl.u8        q8, d6                      \n"
   1152     "vmovl.u8        q9, d7                      \n"
   1153 
   1154     "vqadd.s16       q1, q1, q8                  \n"
   1155     "vqadd.s16       q2, q2, q9                  \n"
   1156 
   1157     "vqmovun.s16     d0, q1                      \n"
   1158     "vqmovun.s16     d1, q2                      \n"
   1159 
   1160     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
   1161     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
   1162     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
   1163     "vst1.32         d1[1], [%[dst]]             \n"
   1164 
   1165     : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
   1166     : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
   1167     : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
   1168   );
   1169 }
   1170 
   1171 #endif    // WEBP_USE_INTRINSICS
   1172 
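// A scalar sketch of the pass implemented by both TransformOne() variants
// above (for reference only, compiled out). MUL1() matches the
// vqdmulh/vshr/vqadd sequence on kC1 and MUL2() the single vqdmulh on the
// pre-halved kC2; the second pass repeats the same math on the transposed
// result before the (x + 4) >> 3 descaling and the saturated add to dst.
// Note the NEON code performs the same arithmetic with saturating 16-bit ops.
#if 0
static WEBP_INLINE int MUL1(int a) { return a + ((a * 20091) >> 16); }
static WEBP_INLINE int MUL2(int a) { return (a * 35468) >> 16; }

static void TransformOnePassSketch(const int16_t* in, int* tmp) {
  int i;
  for (i = 0; i < 4; ++i) {   // process one column per iteration
    const int a = in[i + 0] + in[i + 8];
    const int b = in[i + 0] - in[i + 8];
    const int c = MUL2(in[i + 4]) - MUL1(in[i + 12]);
    const int d = MUL1(in[i + 4]) + MUL2(in[i + 12]);
    tmp[i * 4 + 0] = a + d;   // the output is stored transposed
    tmp[i * 4 + 1] = b + c;
    tmp[i * 4 + 2] = b - c;
    tmp[i * 4 + 3] = a - d;
  }
}
#endif  // reference sketch
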
   1173 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
   1174   TransformOne(in, dst);
   1175   if (do_two) {
   1176     TransformOne(in + 16, dst + 4);
   1177   }
   1178 }
   1179 
   1180 static void TransformDC(const int16_t* in, uint8_t* dst) {
   1181   const int16x8_t DC = vdupq_n_s16(in[0]);
   1182   Add4x4(DC, DC, dst);
   1183 }
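
// Note (for reference): with only the DC coefficient present, all 16 output
// pixels receive the same delta, so the above is equivalent to
//   dst[x + y * BPS] = clip_8bit(dst[x + y * BPS] + ((in[0] + 4) >> 3))
// for x, y in [0, 3]; Add4x4() performs the rounding shift and saturation.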
   1184 
   1185 //------------------------------------------------------------------------------
   1186 
   1187 #define STORE_WHT(dst, col, rows) do {                  \
   1188   *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
   1189   *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
   1190   *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
   1191   *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
   1192 } while (0)
   1193 
   1194 static void TransformWHT(const int16_t* in, int16_t* out) {
   1195   int32x4x4_t tmp;
   1196 
   1197   {
   1198     // Load the source.
   1199     const int16x4_t in00_03 = vld1_s16(in + 0);
   1200     const int16x4_t in04_07 = vld1_s16(in + 4);
   1201     const int16x4_t in08_11 = vld1_s16(in + 8);
   1202     const int16x4_t in12_15 = vld1_s16(in + 12);
   1203     const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
   1204     const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
   1205     const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
   1206     const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
   1207     tmp.val[0] = vaddq_s32(a0, a1);
   1208     tmp.val[1] = vaddq_s32(a3, a2);
   1209     tmp.val[2] = vsubq_s32(a0, a1);
   1210     tmp.val[3] = vsubq_s32(a3, a2);
   1211     // Arrange the temporary results column-wise.
   1212     tmp = Transpose4x4(tmp);
   1213   }
   1214 
   1215   {
   1216     const int32x4_t kCst3 = vdupq_n_s32(3);
   1217     const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
   1218     const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
   1219     const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
   1220     const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
   1221     const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
   1222 
   1223     tmp.val[0] = vaddq_s32(a0, a1);
   1224     tmp.val[1] = vaddq_s32(a3, a2);
   1225     tmp.val[2] = vsubq_s32(a0, a1);
   1226     tmp.val[3] = vsubq_s32(a3, a2);
   1227 
    1228     // Right-shift the results by 3.
   1229     tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
   1230     tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
   1231     tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
   1232     tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
   1233 
   1234     STORE_WHT(out, 0, tmp);
   1235     STORE_WHT(out, 1, tmp);
   1236     STORE_WHT(out, 2, tmp);
   1237     STORE_WHT(out, 3, tmp);
   1238   }
   1239 }
   1240 
   1241 #undef STORE_WHT
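
// A plain-C sketch of the same Walsh-Hadamard transform (for reference only,
// compiled out). Each of the 16 outputs becomes the DC coefficient of one
// luma 4x4 block, hence the stride of 16 used by STORE_WHT() above.
#if 0
static void TransformWHT_Sketch(const int16_t* in, int16_t* out) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {     // vertical pass
    const int a0 = in[0 + i] + in[12 + i];
    const int a1 = in[4 + i] + in[ 8 + i];
    const int a2 = in[4 + i] - in[ 8 + i];
    const int a3 = in[0 + i] - in[12 + i];
    tmp[ 0 + i] = a0 + a1;
    tmp[ 4 + i] = a3 + a2;
    tmp[ 8 + i] = a0 - a1;
    tmp[12 + i] = a3 - a2;
  }
  for (i = 0; i < 4; ++i) {     // horizontal pass, with rounder and >> 3
    const int dc = tmp[0 + i * 4] + 3;
    const int a0 = dc             + tmp[3 + i * 4];
    const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
    const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
    const int a3 = dc             - tmp[3 + i * 4];
    out[ 0] = (a0 + a1) >> 3;
    out[16] = (a3 + a2) >> 3;
    out[32] = (a0 - a1) >> 3;
    out[48] = (a3 - a2) >> 3;
    out += 64;                  // next source column -> next group of blocks
  }
}
#endif  // reference sketch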
   1242 
   1243 //------------------------------------------------------------------------------
   1244 
   1245 #define MUL(a, b) (((a) * (b)) >> 16)
   1246 static void TransformAC3(const int16_t* in, uint8_t* dst) {
   1247   static const int kC1_full = 20091 + (1 << 16);
   1248   static const int kC2_full = 35468;
   1249   const int16x4_t A = vld1_dup_s16(in);
   1250   const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
   1251   const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
   1252   const int c1 = MUL(in[1], kC2_full);
   1253   const int d1 = MUL(in[1], kC1_full);
   1254   const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
   1255                       (uint64_t)( c1 & 0xffff) << 16 |
   1256                       (uint64_t)(-c1 & 0xffff) << 32 |
   1257                       (uint64_t)(-d1 & 0xffff) << 48;
   1258   const int16x4_t CD = vcreate_s16(cd);
   1259   const int16x4_t B = vqadd_s16(A, CD);
   1260   const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
   1261   const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
   1262   Add4x4(m0_m1, m2_m3, dst);
   1263 }
   1264 #undef MUL
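
// Note (for reference): TransformAC3() covers the case where only in[0],
// in[1] and in[4] are non-zero.  The full inverse transform then reduces,
// per pixel, to
//   dst[x + y * BPS] += (in[0] + CD[x] + CD4[y] + 4) >> 3   (saturated)
// with CD  = { d1, c1, -c1, -d1 } derived from in[1] (horizontal term) and
//      CD4 = { d4, c4, -c4, -d4 } derived from in[4] (vertical term).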
   1265 
   1266 //------------------------------------------------------------------------------
   1267 // 4x4
   1268 
   1269 static void DC4(uint8_t* dst) {    // DC
   1270   const uint8x8_t A = vld1_u8(dst - BPS);  // top row
   1271   const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
   1272   const uint16x4_t p1 = vpadd_u16(p0, p0);
   1273   const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
   1274   const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
   1275   const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
   1276   const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
   1277   const uint16x8_t s0 = vaddq_u16(L0, L1);
   1278   const uint16x8_t s1 = vaddq_u16(L2, L3);
   1279   const uint16x8_t s01 = vaddq_u16(s0, s1);
   1280   const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
   1281   const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
   1282   const uint8x8_t dc = vdup_lane_u8(dc0, 0);
   1283   int i;
   1284   for (i = 0; i < 4; ++i) {
   1285     vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
   1286   }
   1287 }
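
// Note (for reference): the result is
//   dc = (T[0] + T[1] + T[2] + T[3] + L[0] + L[1] + L[2] + L[3] + 4) >> 3
// over the 4 top and 4 left neighbours, replicated to all 16 pixels; only
// lane 0 of each widened left-edge load above contributes to the final sum.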
   1288 
   1289 // TrueMotion (4x4 + 8x8)
   1290 static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
   1291   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   1292   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
   1293   const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
   1294   int y;
   1295   for (y = 0; y < size; y += 4) {
   1296     // left edge
   1297     const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
   1298     const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
   1299     const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
   1300     const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
   1301     const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
   1302     const int16x8_t r1 = vaddq_s16(L1, d);
   1303     const int16x8_t r2 = vaddq_s16(L2, d);
   1304     const int16x8_t r3 = vaddq_s16(L3, d);
   1305     // Saturate and store the result.
   1306     const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
   1307     const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
   1308     const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
   1309     const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
   1310     if (size == 4) {
   1311       vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
   1312       vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
   1313       vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
   1314       vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
   1315     } else {
   1316       vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
   1317       vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
   1318       vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
   1319       vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
   1320     }
   1321     dst += 4 * BPS;
   1322   }
   1323 }
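
// A scalar sketch of the TrueMotion predictor above (for reference only,
// compiled out): with T[] the row above, L[] the column to the left and TL
// the corner pixel, every output is clip(L[y] + T[x] - TL).
#if 0
static void TrueMotionSketch(uint8_t* dst, int size) {
  int x, y;
  for (y = 0; y < size; ++y) {
    for (x = 0; x < size; ++x) {
      const int v = dst[-1 + y * BPS] + dst[x - BPS] - dst[-1 - BPS];
      dst[x + y * BPS] = (v < 0) ? 0 : (v > 255) ? 255 : v;
    }
  }
}
#endif  // reference sketch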
   1324 
   1325 static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
   1326 
   1327 static void VE4(uint8_t* dst) {    // vertical
   1328   // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
   1329   const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
   1330   const uint64x1_t A1 = vshr_n_u64(A0, 8);
   1331   const uint64x1_t A2 = vshr_n_u64(A0, 16);
   1332   const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
   1333   const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
   1334   const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
   1335   const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
   1336   const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
   1337   int i;
   1338   for (i = 0; i < 4; ++i) {
   1339     vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
   1340   }
   1341 }
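
// Note (for reference): the vhadd_u8/vrhadd_u8 pair above evaluates the usual
// 3-tap smoothing filter (p[-1] + 2 * p[0] + p[1] + 2) >> 2 over the top row,
// and the 4 filtered pixels are copied unchanged into each of the 4 rows.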
   1342 
   1343 static void RD4(uint8_t* dst) {   // Down-right
   1344   const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
   1345   const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
   1346   const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
   1347   const uint32_t I = dst[-1 + 0 * BPS];
   1348   const uint32_t J = dst[-1 + 1 * BPS];
   1349   const uint32_t K = dst[-1 + 2 * BPS];
   1350   const uint32_t L = dst[-1 + 3 * BPS];
   1351   const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
   1352   const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
   1353   const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
   1354   const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
   1355   const uint8_t D = vget_lane_u8(XABCD_u8, 4);
   1356   const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
   1357   const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
   1358   const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
   1359   const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
   1360   const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
   1361   const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
   1362   const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
   1363   const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
   1364   const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
   1365   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
   1366   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
   1367   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
   1368   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
   1369 }
   1370 
   1371 static void LD4(uint8_t* dst) {    // Down-left
    1372   // Note: using the same shift trick as VE4() is slower here.
   1373   const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
   1374   const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
   1375   const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
   1376   const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
   1377   const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
   1378   const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
   1379   const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
   1380   const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
   1381   const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
   1382   const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
   1383   const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
   1384   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
   1385   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
   1386   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
   1387   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
   1388 }
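
// Note (for reference): RD4() and LD4() apply the same
// (a + 2 * b + c + 2) >> 2 filter as VE4(), but over the diagonal
// neighbourhood (left column, corner and top row for RD4; the top row and its
// right extension for LD4), and each successive output row re-uses the
// filtered vector shifted by one byte so the pattern propagates diagonally.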
   1389 
   1390 //------------------------------------------------------------------------------
   1391 // Chroma
   1392 
   1393 static void VE8uv(uint8_t* dst) {    // vertical
   1394   const uint8x8_t top = vld1_u8(dst - BPS);
   1395   int j;
   1396   for (j = 0; j < 8; ++j) {
   1397     vst1_u8(dst + j * BPS, top);
   1398   }
   1399 }
   1400 
   1401 static void HE8uv(uint8_t* dst) {    // horizontal
   1402   int j;
   1403   for (j = 0; j < 8; ++j) {
   1404     const uint8x8_t left = vld1_dup_u8(dst - 1);
   1405     vst1_u8(dst, left);
   1406     dst += BPS;
   1407   }
   1408 }
   1409 
   1410 static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
   1411   uint16x8_t sum_top;
   1412   uint16x8_t sum_left;
   1413   uint8x8_t dc0;
   1414 
   1415   if (do_top) {
   1416     const uint8x8_t A = vld1_u8(dst - BPS);  // top row
   1417     const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
   1418     const uint16x4_t p1 = vpadd_u16(p0, p0);
   1419     const uint16x4_t p2 = vpadd_u16(p1, p1);
   1420     sum_top = vcombine_u16(p2, p2);
   1421   }
   1422 
   1423   if (do_left) {
   1424     const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
   1425     const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
   1426     const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
   1427     const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
   1428     const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
   1429     const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
   1430     const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
   1431     const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
   1432     const uint16x8_t s0 = vaddq_u16(L0, L1);
   1433     const uint16x8_t s1 = vaddq_u16(L2, L3);
   1434     const uint16x8_t s2 = vaddq_u16(L4, L5);
   1435     const uint16x8_t s3 = vaddq_u16(L6, L7);
   1436     const uint16x8_t s01 = vaddq_u16(s0, s1);
   1437     const uint16x8_t s23 = vaddq_u16(s2, s3);
   1438     sum_left = vaddq_u16(s01, s23);
   1439   }
   1440 
   1441   if (do_top && do_left) {
   1442     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
   1443     dc0 = vrshrn_n_u16(sum, 4);
   1444   } else if (do_top) {
   1445     dc0 = vrshrn_n_u16(sum_top, 3);
   1446   } else if (do_left) {
   1447     dc0 = vrshrn_n_u16(sum_left, 3);
   1448   } else {
   1449     dc0 = vdup_n_u8(0x80);
   1450   }
   1451 
   1452   {
   1453     const uint8x8_t dc = vdup_lane_u8(dc0, 0);
   1454     int i;
   1455     for (i = 0; i < 8; ++i) {
   1456       vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
   1457     }
   1458   }
   1459 }
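
// Note (for reference): DC8() averages whichever edges are available:
//   top + left -> dc = (sum_top + sum_left + 8) >> 4
//   one edge   -> dc = (sum + 4) >> 3
//   neither    -> dc = 0x80
// and the resulting value fills the whole 8x8 chroma block.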
   1460 
   1461 static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
   1462 static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
   1463 static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
   1464 static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }
   1465 
   1466 static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
   1467 
   1468 //------------------------------------------------------------------------------
   1469 // 16x16
   1470 
   1471 static void VE16(uint8_t* dst) {     // vertical
   1472   const uint8x16_t top = vld1q_u8(dst - BPS);
   1473   int j;
   1474   for (j = 0; j < 16; ++j) {
   1475     vst1q_u8(dst + j * BPS, top);
   1476   }
   1477 }
   1478 
   1479 static void HE16(uint8_t* dst) {     // horizontal
   1480   int j;
   1481   for (j = 0; j < 16; ++j) {
   1482     const uint8x16_t left = vld1q_dup_u8(dst - 1);
   1483     vst1q_u8(dst, left);
   1484     dst += BPS;
   1485   }
   1486 }
   1487 
   1488 static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
   1489   uint16x8_t sum_top;
   1490   uint16x8_t sum_left;
   1491   uint8x8_t dc0;
   1492 
   1493   if (do_top) {
   1494     const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
   1495     const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
   1496     const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
   1497     const uint16x4_t p2 = vpadd_u16(p1, p1);
   1498     const uint16x4_t p3 = vpadd_u16(p2, p2);
   1499     sum_top = vcombine_u16(p3, p3);
   1500   }
   1501 
   1502   if (do_left) {
   1503     int i;
   1504     sum_left = vdupq_n_u16(0);
   1505     for (i = 0; i < 16; i += 8) {
   1506       const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
   1507       const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
   1508       const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
   1509       const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
   1510       const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
   1511       const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
   1512       const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
   1513       const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
   1514       const uint16x8_t s0 = vaddq_u16(L0, L1);
   1515       const uint16x8_t s1 = vaddq_u16(L2, L3);
   1516       const uint16x8_t s2 = vaddq_u16(L4, L5);
   1517       const uint16x8_t s3 = vaddq_u16(L6, L7);
   1518       const uint16x8_t s01 = vaddq_u16(s0, s1);
   1519       const uint16x8_t s23 = vaddq_u16(s2, s3);
   1520       const uint16x8_t sum = vaddq_u16(s01, s23);
   1521       sum_left = vaddq_u16(sum_left, sum);
   1522     }
   1523   }
   1524 
   1525   if (do_top && do_left) {
   1526     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
   1527     dc0 = vrshrn_n_u16(sum, 5);
   1528   } else if (do_top) {
   1529     dc0 = vrshrn_n_u16(sum_top, 4);
   1530   } else if (do_left) {
   1531     dc0 = vrshrn_n_u16(sum_left, 4);
   1532   } else {
   1533     dc0 = vdup_n_u8(0x80);
   1534   }
   1535 
   1536   {
   1537     const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
   1538     int i;
   1539     for (i = 0; i < 16; ++i) {
   1540       vst1q_u8(dst + i * BPS, dc);
   1541     }
   1542   }
   1543 }
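
// Note (for reference): DC16() follows the same scheme as DC8() with 16
// samples per edge: (sum_top + sum_left + 16) >> 5 when both edges are
// available, (sum + 8) >> 4 for a single edge, and 0x80 otherwise.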
   1544 
   1545 static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
   1546 static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
   1547 static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
   1548 static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }
   1549 
   1550 static void TM16(uint8_t* dst) {
   1551   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   1552   const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
   1553   // A[c] - A[-1]
   1554   const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
   1555   const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
   1556   int y;
   1557   for (y = 0; y < 16; y += 4) {
   1558     // left edge
   1559     const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
   1560     const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
   1561     const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
   1562     const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
   1563     const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
   1564     const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
   1565     const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
   1566     const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
   1567     const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
   1568     const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
   1569     const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
   1570     const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
   1571     // Saturate and store the result.
   1572     const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
   1573     const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
   1574     const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
   1575     const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
   1576     vst1q_u8(dst + 0 * BPS, row0);
   1577     vst1q_u8(dst + 1 * BPS, row1);
   1578     vst1q_u8(dst + 2 * BPS, row2);
   1579     vst1q_u8(dst + 3 * BPS, row3);
   1580     dst += 4 * BPS;
   1581   }
   1582 }
   1583 
   1584 //------------------------------------------------------------------------------
   1585 // Entry point
   1586 
   1587 extern void VP8DspInitNEON(void);
   1588 
   1589 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
   1590   VP8Transform = TransformTwo;
   1591   VP8TransformAC3 = TransformAC3;
   1592   VP8TransformDC = TransformDC;
   1593   VP8TransformWHT = TransformWHT;
   1594 
   1595   VP8VFilter16 = VFilter16;
   1596   VP8VFilter16i = VFilter16i;
   1597   VP8HFilter16 = HFilter16;
   1598 #if !defined(WORK_AROUND_GCC)
   1599   VP8HFilter16i = HFilter16i;
   1600 #endif
   1601   VP8VFilter8 = VFilter8;
   1602   VP8VFilter8i = VFilter8i;
   1603 #if !defined(WORK_AROUND_GCC)
   1604   VP8HFilter8 = HFilter8;
   1605   VP8HFilter8i = HFilter8i;
   1606 #endif
   1607   VP8SimpleVFilter16 = SimpleVFilter16;
   1608   VP8SimpleHFilter16 = SimpleHFilter16;
   1609   VP8SimpleVFilter16i = SimpleVFilter16i;
   1610   VP8SimpleHFilter16i = SimpleHFilter16i;
   1611 
   1612   VP8PredLuma4[0] = DC4;
   1613   VP8PredLuma4[1] = TM4;
   1614   VP8PredLuma4[2] = VE4;
   1615   VP8PredLuma4[4] = RD4;
   1616   VP8PredLuma4[6] = LD4;
   1617 
   1618   VP8PredLuma16[0] = DC16TopLeft;
   1619   VP8PredLuma16[1] = TM16;
   1620   VP8PredLuma16[2] = VE16;
   1621   VP8PredLuma16[3] = HE16;
   1622   VP8PredLuma16[4] = DC16NoTop;
   1623   VP8PredLuma16[5] = DC16NoLeft;
   1624   VP8PredLuma16[6] = DC16NoTopLeft;
   1625 
   1626   VP8PredChroma8[0] = DC8uv;
   1627   VP8PredChroma8[1] = TM8uv;
   1628   VP8PredChroma8[2] = VE8uv;
   1629   VP8PredChroma8[3] = HE8uv;
   1630   VP8PredChroma8[4] = DC8uvNoTop;
   1631   VP8PredChroma8[5] = DC8uvNoLeft;
   1632   VP8PredChroma8[6] = DC8uvNoTopLeft;
   1633 }
   1634 
   1635 #else  // !WEBP_USE_NEON
   1636 
   1637 WEBP_DSP_INIT_STUB(VP8DspInitNEON)
   1638 
   1639 #endif  // WEBP_USE_NEON
   1640