Home | History | Annotate | Download | only in dsp
      1 // Copyright 2012 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // ARM NEON version of dsp functions and loop filtering.
     11 //
     12 // Authors: Somnath Banerjee (somnath (at) google.com)
     13 //          Johann Koenig (johannkoenig (at) google.com)
     14 
     15 #include "src/dsp/dsp.h"
     16 
     17 #if defined(WEBP_USE_NEON)
     18 
     19 #include "src/dsp/neon.h"
     20 #include "src/dec/vp8i_dec.h"
     21 
     22 //------------------------------------------------------------------------------
     23 // NxM Loading functions
     24 
     25 #if !defined(WORK_AROUND_GCC)
     26 
     27 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
     28 // (register alloc, probably). The variants somewhat mitigate the problem, but
     29 // not quite. HFilter16i() remains problematic.
     30 static WEBP_INLINE uint8x8x4_t Load4x8_NEON(const uint8_t* const src,
     31                                             int stride) {
     32   const uint8x8_t zero = vdup_n_u8(0);
     33   uint8x8x4_t out;
     34   INIT_VECTOR4(out, zero, zero, zero, zero);
     35   out = vld4_lane_u8(src + 0 * stride, out, 0);
     36   out = vld4_lane_u8(src + 1 * stride, out, 1);
     37   out = vld4_lane_u8(src + 2 * stride, out, 2);
     38   out = vld4_lane_u8(src + 3 * stride, out, 3);
     39   out = vld4_lane_u8(src + 4 * stride, out, 4);
     40   out = vld4_lane_u8(src + 5 * stride, out, 5);
     41   out = vld4_lane_u8(src + 6 * stride, out, 6);
     42   out = vld4_lane_u8(src + 7 * stride, out, 7);
     43   return out;
     44 }
     45 
     46 static WEBP_INLINE void Load4x16_NEON(const uint8_t* const src, int stride,
     47                                       uint8x16_t* const p1,
     48                                       uint8x16_t* const p0,
     49                                       uint8x16_t* const q0,
     50                                       uint8x16_t* const q1) {
     51   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
     52   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
     53   const uint8x8x4_t row0 = Load4x8_NEON(src - 2 + 0 * stride, stride);
     54   const uint8x8x4_t row8 = Load4x8_NEON(src - 2 + 8 * stride, stride);
     55   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
     56   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
     57   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
     58   *q1 = vcombine_u8(row0.val[3], row8.val[3]);
     59 }
     60 
     61 #else  // WORK_AROUND_GCC
     62 
     63 #define LOADQ_LANE_32b(VALUE, LANE) do {                             \
     64   (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
     65   src += stride;                                                     \
     66 } while (0)
     67 
     68 static WEBP_INLINE void Load4x16_NEON(const uint8_t* src, int stride,
     69                                       uint8x16_t* const p1,
     70                                       uint8x16_t* const p0,
     71                                       uint8x16_t* const q0,
     72                                       uint8x16_t* const q1) {
     73   const uint32x4_t zero = vdupq_n_u32(0);
     74   uint32x4x4_t in;
     75   INIT_VECTOR4(in, zero, zero, zero, zero);
     76   src -= 2;
     77   LOADQ_LANE_32b(in.val[0], 0);
     78   LOADQ_LANE_32b(in.val[1], 0);
     79   LOADQ_LANE_32b(in.val[2], 0);
     80   LOADQ_LANE_32b(in.val[3], 0);
     81   LOADQ_LANE_32b(in.val[0], 1);
     82   LOADQ_LANE_32b(in.val[1], 1);
     83   LOADQ_LANE_32b(in.val[2], 1);
     84   LOADQ_LANE_32b(in.val[3], 1);
     85   LOADQ_LANE_32b(in.val[0], 2);
     86   LOADQ_LANE_32b(in.val[1], 2);
     87   LOADQ_LANE_32b(in.val[2], 2);
     88   LOADQ_LANE_32b(in.val[3], 2);
     89   LOADQ_LANE_32b(in.val[0], 3);
     90   LOADQ_LANE_32b(in.val[1], 3);
     91   LOADQ_LANE_32b(in.val[2], 3);
     92   LOADQ_LANE_32b(in.val[3], 3);
     93   // Transpose four 4x4 parts:
     94   {
     95     const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
     96                                         vreinterpretq_u8_u32(in.val[1]));
     97     const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
     98                                         vreinterpretq_u8_u32(in.val[3]));
     99     const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
    100                                          vreinterpretq_u16_u8(row23.val[0]));
    101     const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
    102                                          vreinterpretq_u16_u8(row23.val[1]));
    103     *p1 = vreinterpretq_u8_u16(row02.val[0]);
    104     *p0 = vreinterpretq_u8_u16(row13.val[0]);
    105     *q0 = vreinterpretq_u8_u16(row02.val[1]);
    106     *q1 = vreinterpretq_u8_u16(row13.val[1]);
    107   }
    108 }
    109 #undef LOADQ_LANE_32b
    110 
    111 #endif  // !WORK_AROUND_GCC
    112 
    113 static WEBP_INLINE void Load8x16_NEON(
    114     const uint8_t* const src, int stride,
    115     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    116     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    117     uint8x16_t* const q2, uint8x16_t* const q3) {
    118   Load4x16_NEON(src - 2, stride, p3, p2, p1, p0);
    119   Load4x16_NEON(src + 2, stride, q0, q1, q2, q3);
    120 }
    121 
    122 static WEBP_INLINE void Load16x4_NEON(const uint8_t* const src, int stride,
    123                                       uint8x16_t* const p1,
    124                                       uint8x16_t* const p0,
    125                                       uint8x16_t* const q0,
    126                                       uint8x16_t* const q1) {
    127   *p1 = vld1q_u8(src - 2 * stride);
    128   *p0 = vld1q_u8(src - 1 * stride);
    129   *q0 = vld1q_u8(src + 0 * stride);
    130   *q1 = vld1q_u8(src + 1 * stride);
    131 }
    132 
    133 static WEBP_INLINE void Load16x8_NEON(
    134     const uint8_t* const src, int stride,
    135     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    136     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    137     uint8x16_t* const q2, uint8x16_t* const q3) {
    138   Load16x4_NEON(src - 2  * stride, stride, p3, p2, p1, p0);
    139   Load16x4_NEON(src + 2  * stride, stride, q0, q1, q2, q3);
    140 }
    141 
    142 static WEBP_INLINE void Load8x8x2_NEON(
    143     const uint8_t* const u, const uint8_t* const v, int stride,
    144     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    145     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    146     uint8x16_t* const q2, uint8x16_t* const q3) {
    147   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
    148   // and the v-samples on the higher half.
    149   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
    150   *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
    151   *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
    152   *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
    153   *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
    154   *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
    155   *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
    156   *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
    157 }
    158 
    159 #if !defined(WORK_AROUND_GCC)
    160 
    161 #define LOAD_UV_8(ROW) \
    162   vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
    163 
    164 static WEBP_INLINE void Load8x8x2T_NEON(
    165     const uint8_t* const u, const uint8_t* const v, int stride,
    166     uint8x16_t* const p3, uint8x16_t* const p2, uint8x16_t* const p1,
    167     uint8x16_t* const p0, uint8x16_t* const q0, uint8x16_t* const q1,
    168     uint8x16_t* const q2, uint8x16_t* const q3) {
    169   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
    170   // and the v-samples on the higher half.
    171   const uint8x16_t row0 = LOAD_UV_8(0);
    172   const uint8x16_t row1 = LOAD_UV_8(1);
    173   const uint8x16_t row2 = LOAD_UV_8(2);
    174   const uint8x16_t row3 = LOAD_UV_8(3);
    175   const uint8x16_t row4 = LOAD_UV_8(4);
    176   const uint8x16_t row5 = LOAD_UV_8(5);
    177   const uint8x16_t row6 = LOAD_UV_8(6);
    178   const uint8x16_t row7 = LOAD_UV_8(7);
    179   // Perform two side-by-side 8x8 transposes
    180   // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
    181   // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
    182   // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
    183   // u30 u31 u32 u33 u34 u35 u36 u37 | ...
    184   // u40 u41 u42 u43 u44 u45 u46 u47 | ...
    185   // u50 u51 u52 u53 u54 u55 u56 u57 | ...
    186   // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
    187   // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
    188   const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
    189                                                     // u01 u11 u03 u13 ...
    190   const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
    191                                                     // u21 u31 u23 u33 ...
    192   const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
    193   const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
    194   const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
    195                                        vreinterpretq_u16_u8(row23.val[0]));
    196   const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
    197                                        vreinterpretq_u16_u8(row23.val[1]));
    198   const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
    199                                        vreinterpretq_u16_u8(row67.val[0]));
    200   const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
    201                                        vreinterpretq_u16_u8(row67.val[1]));
    202   const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
    203                                        vreinterpretq_u32_u16(row46.val[0]));
    204   const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
    205                                        vreinterpretq_u32_u16(row46.val[1]));
    206   const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
    207                                        vreinterpretq_u32_u16(row57.val[0]));
    208   const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
    209                                        vreinterpretq_u32_u16(row57.val[1]));
    210   *p3 = vreinterpretq_u8_u32(row04.val[0]);
    211   *p2 = vreinterpretq_u8_u32(row15.val[0]);
    212   *p1 = vreinterpretq_u8_u32(row26.val[0]);
    213   *p0 = vreinterpretq_u8_u32(row37.val[0]);
    214   *q0 = vreinterpretq_u8_u32(row04.val[1]);
    215   *q1 = vreinterpretq_u8_u32(row15.val[1]);
    216   *q2 = vreinterpretq_u8_u32(row26.val[1]);
    217   *q3 = vreinterpretq_u8_u32(row37.val[1]);
    218 }
    219 #undef LOAD_UV_8
    220 
    221 #endif  // !WORK_AROUND_GCC
    222 
    223 static WEBP_INLINE void Store2x8_NEON(const uint8x8x2_t v,
    224                                       uint8_t* const dst, int stride) {
    225   vst2_lane_u8(dst + 0 * stride, v, 0);
    226   vst2_lane_u8(dst + 1 * stride, v, 1);
    227   vst2_lane_u8(dst + 2 * stride, v, 2);
    228   vst2_lane_u8(dst + 3 * stride, v, 3);
    229   vst2_lane_u8(dst + 4 * stride, v, 4);
    230   vst2_lane_u8(dst + 5 * stride, v, 5);
    231   vst2_lane_u8(dst + 6 * stride, v, 6);
    232   vst2_lane_u8(dst + 7 * stride, v, 7);
    233 }
    234 
    235 static WEBP_INLINE void Store2x16_NEON(const uint8x16_t p0, const uint8x16_t q0,
    236                                        uint8_t* const dst, int stride) {
    237   uint8x8x2_t lo, hi;
    238   lo.val[0] = vget_low_u8(p0);
    239   lo.val[1] = vget_low_u8(q0);
    240   hi.val[0] = vget_high_u8(p0);
    241   hi.val[1] = vget_high_u8(q0);
    242   Store2x8_NEON(lo, dst - 1 + 0 * stride, stride);
    243   Store2x8_NEON(hi, dst - 1 + 8 * stride, stride);
    244 }
    245 
    246 #if !defined(WORK_AROUND_GCC)
    247 static WEBP_INLINE void Store4x8_NEON(const uint8x8x4_t v,
    248                                       uint8_t* const dst, int stride) {
    249   vst4_lane_u8(dst + 0 * stride, v, 0);
    250   vst4_lane_u8(dst + 1 * stride, v, 1);
    251   vst4_lane_u8(dst + 2 * stride, v, 2);
    252   vst4_lane_u8(dst + 3 * stride, v, 3);
    253   vst4_lane_u8(dst + 4 * stride, v, 4);
    254   vst4_lane_u8(dst + 5 * stride, v, 5);
    255   vst4_lane_u8(dst + 6 * stride, v, 6);
    256   vst4_lane_u8(dst + 7 * stride, v, 7);
    257 }
    258 
    259 static WEBP_INLINE void Store4x16_NEON(const uint8x16_t p1, const uint8x16_t p0,
    260                                        const uint8x16_t q0, const uint8x16_t q1,
    261                                        uint8_t* const dst, int stride) {
    262   uint8x8x4_t lo, hi;
    263   INIT_VECTOR4(lo,
    264                vget_low_u8(p1), vget_low_u8(p0),
    265                vget_low_u8(q0), vget_low_u8(q1));
    266   INIT_VECTOR4(hi,
    267                vget_high_u8(p1), vget_high_u8(p0),
    268                vget_high_u8(q0), vget_high_u8(q1));
    269   Store4x8_NEON(lo, dst - 2 + 0 * stride, stride);
    270   Store4x8_NEON(hi, dst - 2 + 8 * stride, stride);
    271 }
    272 #endif  // !WORK_AROUND_GCC
    273 
    274 static WEBP_INLINE void Store16x2_NEON(const uint8x16_t p0, const uint8x16_t q0,
    275                                        uint8_t* const dst, int stride) {
    276   vst1q_u8(dst - stride, p0);
    277   vst1q_u8(dst, q0);
    278 }
    279 
    280 static WEBP_INLINE void Store16x4_NEON(const uint8x16_t p1, const uint8x16_t p0,
    281                                        const uint8x16_t q0, const uint8x16_t q1,
    282                                        uint8_t* const dst, int stride) {
    283   Store16x2_NEON(p1, p0, dst - stride, stride);
    284   Store16x2_NEON(q0, q1, dst + stride, stride);
    285 }
    286 
    287 static WEBP_INLINE void Store8x2x2_NEON(const uint8x16_t p0,
    288                                         const uint8x16_t q0,
    289                                         uint8_t* const u, uint8_t* const v,
    290                                         int stride) {
    291   // p0 and q0 contain the u+v samples packed in low/high halves.
    292   vst1_u8(u - stride, vget_low_u8(p0));
    293   vst1_u8(u,          vget_low_u8(q0));
    294   vst1_u8(v - stride, vget_high_u8(p0));
    295   vst1_u8(v,          vget_high_u8(q0));
    296 }
    297 
    298 static WEBP_INLINE void Store8x4x2_NEON(const uint8x16_t p1,
    299                                         const uint8x16_t p0,
    300                                         const uint8x16_t q0,
    301                                         const uint8x16_t q1,
    302                                         uint8_t* const u, uint8_t* const v,
    303                                         int stride) {
    304   // The p1...q1 registers contain the u+v samples packed in low/high halves.
    305   Store8x2x2_NEON(p1, p0, u - stride, v - stride, stride);
    306   Store8x2x2_NEON(q0, q1, u + stride, v + stride, stride);
    307 }
    308 
    309 #if !defined(WORK_AROUND_GCC)
    310 
    311 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
    312   vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
    313   vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
    314   (DST) += stride;                                \
    315 } while (0)
    316 
    317 static WEBP_INLINE void Store6x8x2_NEON(
    318     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    319     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    320     uint8_t* u, uint8_t* v, int stride) {
    321   uint8x8x3_t u0, u1, v0, v1;
    322   INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
    323   INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
    324   INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
    325   INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
    326   STORE6_LANE(u, u0, u1, 0);
    327   STORE6_LANE(u, u0, u1, 1);
    328   STORE6_LANE(u, u0, u1, 2);
    329   STORE6_LANE(u, u0, u1, 3);
    330   STORE6_LANE(u, u0, u1, 4);
    331   STORE6_LANE(u, u0, u1, 5);
    332   STORE6_LANE(u, u0, u1, 6);
    333   STORE6_LANE(u, u0, u1, 7);
    334   STORE6_LANE(v, v0, v1, 0);
    335   STORE6_LANE(v, v0, v1, 1);
    336   STORE6_LANE(v, v0, v1, 2);
    337   STORE6_LANE(v, v0, v1, 3);
    338   STORE6_LANE(v, v0, v1, 4);
    339   STORE6_LANE(v, v0, v1, 5);
    340   STORE6_LANE(v, v0, v1, 6);
    341   STORE6_LANE(v, v0, v1, 7);
    342 }
    343 #undef STORE6_LANE
    344 
    345 static WEBP_INLINE void Store4x8x2_NEON(const uint8x16_t p1,
    346                                         const uint8x16_t p0,
    347                                         const uint8x16_t q0,
    348                                         const uint8x16_t q1,
    349                                         uint8_t* const u, uint8_t* const v,
    350                                         int stride) {
    351   uint8x8x4_t u0, v0;
    352   INIT_VECTOR4(u0,
    353                vget_low_u8(p1), vget_low_u8(p0),
    354                vget_low_u8(q0), vget_low_u8(q1));
    355   INIT_VECTOR4(v0,
    356                vget_high_u8(p1), vget_high_u8(p0),
    357                vget_high_u8(q0), vget_high_u8(q1));
    358   vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
    359   vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
    360   vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
    361   vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
    362   vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
    363   vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
    364   vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
    365   vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
    366   vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
    367   vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
    368   vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
    369   vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
    370   vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
    371   vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
    372   vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
    373   vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
    374 }
    375 
    376 #endif  // !WORK_AROUND_GCC
    377 
    378 // Zero extend 'v' to an int16x8_t.
    379 static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint8x8_t v) {
    380   return vreinterpretq_s16_u16(vmovl_u8(v));
    381 }
    382 
    383 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
    384 // to the corresponding rows of 'dst'.
    385 static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
    386                                                  const int16x8_t dst01,
    387                                                  const int16x8_t dst23) {
    388   // Unsigned saturate to 8b.
    389   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
    390   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
    391 
    392   // Store the results.
    393   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
    394   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
    395   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
    396   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
    397 }
    398 
    399 static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
    400                                     const int16x8_t row23,
    401                                     uint8_t* const dst) {
    402   uint32x2_t dst01 = vdup_n_u32(0);
    403   uint32x2_t dst23 = vdup_n_u32(0);
    404 
    405   // Load the source pixels.
    406   dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
    407   dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
    408   dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
    409   dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
    410 
    411   {
    412     // Convert to 16b.
    413     const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst01));
    414     const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(vreinterpret_u8_u32(dst23));
    415 
    416     // Descale with rounding.
    417     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    418     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    419     // Add the inverse transform.
    420     SaturateAndStore4x4_NEON(dst, out01, out23);
    421   }
    422 }
    423 
    424 //-----------------------------------------------------------------------------
    425 // Simple In-loop filtering (Paragraph 15.2)
    426 
    427 static uint8x16_t NeedsFilter_NEON(const uint8x16_t p1, const uint8x16_t p0,
    428                                    const uint8x16_t q0, const uint8x16_t q1,
    429                                    int thresh) {
    430   const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
    431   const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
    432   const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
    433   const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
    434   const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
    435   const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
    436   const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
    437   return mask;
    438 }
    439 
    440 static int8x16_t FlipSign_NEON(const uint8x16_t v) {
    441   const uint8x16_t sign_bit = vdupq_n_u8(0x80);
    442   return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
    443 }
    444 
    445 static uint8x16_t FlipSignBack_NEON(const int8x16_t v) {
    446   const int8x16_t sign_bit = vdupq_n_s8(0x80);
    447   return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
    448 }
    449 
    450 static int8x16_t GetBaseDelta_NEON(const int8x16_t p1, const int8x16_t p0,
    451                                    const int8x16_t q0, const int8x16_t q1) {
    452   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
    453   const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
    454   const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
    455   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
    456   const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
    457   return s3;
    458 }
    459 
    460 static int8x16_t GetBaseDelta0_NEON(const int8x16_t p0, const int8x16_t q0) {
    461   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
    462   const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
    463   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
    464   return s2;
    465 }
    466 
    467 //------------------------------------------------------------------------------
    468 
    469 static void ApplyFilter2NoFlip_NEON(const int8x16_t p0s, const int8x16_t q0s,
    470                                     const int8x16_t delta,
    471                                     int8x16_t* const op0,
    472                                     int8x16_t* const oq0) {
    473   const int8x16_t kCst3 = vdupq_n_s8(0x03);
    474   const int8x16_t kCst4 = vdupq_n_s8(0x04);
    475   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
    476   const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
    477   const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
    478   const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
    479   *op0 = vqaddq_s8(p0s, delta3);
    480   *oq0 = vqsubq_s8(q0s, delta4);
    481 }
    482 
    483 #if defined(WEBP_USE_INTRINSICS)
    484 
    485 static void ApplyFilter2_NEON(const int8x16_t p0s, const int8x16_t q0s,
    486                               const int8x16_t delta,
    487                               uint8x16_t* const op0, uint8x16_t* const oq0) {
    488   const int8x16_t kCst3 = vdupq_n_s8(0x03);
    489   const int8x16_t kCst4 = vdupq_n_s8(0x04);
    490   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
    491   const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
    492   const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
    493   const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
    494   const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
    495   const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
    496   *op0 = FlipSignBack_NEON(sp0);
    497   *oq0 = FlipSignBack_NEON(sq0);
    498 }
    499 
    500 static void DoFilter2_NEON(const uint8x16_t p1, const uint8x16_t p0,
    501                            const uint8x16_t q0, const uint8x16_t q1,
    502                            const uint8x16_t mask,
    503                            uint8x16_t* const op0, uint8x16_t* const oq0) {
    504   const int8x16_t p1s = FlipSign_NEON(p1);
    505   const int8x16_t p0s = FlipSign_NEON(p0);
    506   const int8x16_t q0s = FlipSign_NEON(q0);
    507   const int8x16_t q1s = FlipSign_NEON(q1);
    508   const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
    509   const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
    510   ApplyFilter2_NEON(p0s, q0s, delta1, op0, oq0);
    511 }
    512 
    513 static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
    514   uint8x16_t p1, p0, q0, q1, op0, oq0;
    515   Load16x4_NEON(p, stride, &p1, &p0, &q0, &q1);
    516   {
    517     const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    518     DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
    519   }
    520   Store16x2_NEON(op0, oq0, p, stride);
    521 }
    522 
    523 static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
    524   uint8x16_t p1, p0, q0, q1, oq0, op0;
    525   Load4x16_NEON(p, stride, &p1, &p0, &q0, &q1);
    526   {
    527     const uint8x16_t mask = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    528     DoFilter2_NEON(p1, p0, q0, q1, mask, &op0, &oq0);
    529   }
    530   Store2x16_NEON(op0, oq0, p, stride);
    531 }
    532 
    533 #else
    534 
// Inline-assembly building blocks. Each macro expands to a sequence of string
// literals meant to be pasted inside an __asm__ statement; register names are
// passed as bare tokens and stringified.

// Load/Store vertical edge
// LOAD8x4: fills lanes 0..7 of the four registers c1..c4 with one vld4.8 per
// lane. Even lanes use address operand 'b1', odd lanes use 'b2'; 'stride' is
// appended after the address operand of every load.
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8 {" #c1 "[0]," #c2 "[0]," #c3 "[0]," #c4 "[0]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[1]," #c2 "[1]," #c3 "[1]," #c4 "[1]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[2]," #c2 "[2]," #c3 "[2]," #c4 "[2]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[3]," #c2 "[3]," #c3 "[3]," #c4 "[3]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[4]," #c2 "[4]," #c3 "[4]," #c4 "[4]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[5]," #c2 "[5]," #c3 "[5]," #c4 "[5]}," #b2 "," #stride "\n" \
  "vld4.8 {" #c1 "[6]," #c2 "[6]," #c3 "[6]," #c4 "[6]}," #b1 "," #stride "\n" \
  "vld4.8 {" #c1 "[7]," #c2 "[7]," #c3 "[7]," #c4 "[7]}," #b2 "," #stride "\n"

// STORE8x2: writes lanes 0..7 of registers c1 and c2 with one vst2.8 per lane,
// all through the same address operand 'p' followed by 'stride'.
#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1 "[0], " #c2 "[0]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[1], " #c2 "[1]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[2], " #c2 "[2]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[3], " #c2 "[3]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[4], " #c2 "[4]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[5], " #c2 "[5]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[6], " #c2 "[6]}," #p "," #stride " \n"                    \
  "vst2.8   {" #c1 "[7], " #c2 "[7]}," #p "," #stride " \n"

// Clobber list of the q registers named by the asm blocks using these macros.
#define QRegs "q0", "q1", "q2", "q3",                                          \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

// XORs registers 'a' and 'b' in place with 's' (the sign-bit constant).
#define FLIP_SIGN_BIT2(a, b, s)                                                \
  "veor     " #a "," #a "," #s "               \n"                             \
  "veor     " #b "," #b "," #s "               \n"                             \

// Same as FLIP_SIGN_BIT2 for four registers.
#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
  FLIP_SIGN_BIT2(a, b, s)                                                      \
  FLIP_SIGN_BIT2(c, d, s)                                                      \

// Sets 'mask' to all ones in the lanes where
// 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh. Overwrites q14 and q15.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8     q14, " #thresh "            \n"                                  \
  "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (thresh >= sum) */

// Leaves (p1 - q1) + 3 * (q0 - p0), computed with saturating ops, in 'o'.
// Overwrites q15 with (q0 - p0).
#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */

// Updates p0 and q0 in place from the filter value 'fl':
// p0 += (fl + 3) >> 3 and q0 -= (fl + 4) >> 3. Uses q15 as scratch.
#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */

// Applies filter on 2 pixels (p0 and q0)
// Besides the scratch registers of the macros it expands, this sequence uses
// q9 (mask, then masked delta), q10 (sign bit) and q11 (base delta). p1 and q1
// are left sign-flipped; p0 and q0 are flipped back after filtering.
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
  FLIP_SIGN_BIT2(p0, q0, q10)
    603 
// Simple loop filter across the horizontal edge lying just above row 'p',
// 16 pixels wide. Reads the four rows p - 2 * stride .. p + stride
// (p1, p0, q0, q1) and writes back the two filtered middle rows:
// p0 at p - stride and q0 at p.
static void SimpleVFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
    "vld1.u8    {q12}, [%[p]]                  \n"  // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    // The three post-incremented loads left p on the q1 row; step back two
    // rows to reach the p0 row.
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}
    624 
// Simple loop filter across the vertical edge lying just left of column 'p'.
// Pixels are gathered starting at p - 2 using two row pointers (r4 and
// r5 = r4 + stride) that advance by 2 * stride (r6), rearranged so that
// p1/p0/q0/q1 each occupy one q register, filtered with DO_FILTER2, and the
// two filtered columns are written back starting at p - 1.
// NOTE(review): the number of rows handled per LOAD8x4 / STORE8x2 call and
// their exact lane order are defined by those macros, outside this block --
// confirm there.
// r4, r5 and r6 are scratch registers and are listed as clobbers.
static void SimpleHFilter16_NEON(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        r4, %[p], #2                   \n"  // base1 = p - 2
    "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
    "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride

    LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4

    DO_FILTER2(q1, q2, q12, q13, %[thresh])

    "sub        %[p], %[p], #1                 \n"  // p - 1

    // Regroup the filtered p0 (q2) and q0 (q12) halves before storing.
    "vswp        d5, d24                       \n"
    STORE8x2(d4, d5, [%[p]], %[stride])
    STORE8x2(d24, d25, [%[p]], %[stride])

    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", "r4", "r5", "r6", QRegs
  );
}
    650 
    651 #undef LOAD8x4
    652 #undef STORE8x2
    653 
    654 #endif    // WEBP_USE_INTRINSICS
    655 
    656 static void SimpleVFilter16i_NEON(uint8_t* p, int stride, int thresh) {
    657   uint32_t k;
    658   for (k = 3; k != 0; --k) {
    659     p += 4 * stride;
    660     SimpleVFilter16_NEON(p, stride, thresh);
    661   }
    662 }
    663 
    664 static void SimpleHFilter16i_NEON(uint8_t* p, int stride, int thresh) {
    665   uint32_t k;
    666   for (k = 3; k != 0; --k) {
    667     p += 4;
    668     SimpleHFilter16_NEON(p, stride, thresh);
    669   }
    670 }
    671 
    672 //------------------------------------------------------------------------------
    673 // Complex In-loop filtering (Paragraph 15.3)
    674 
    675 static uint8x16_t NeedsHev_NEON(const uint8x16_t p1, const uint8x16_t p0,
    676                                 const uint8x16_t q0, const uint8x16_t q1,
    677                                 int hev_thresh) {
    678   const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
    679   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
    680   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
    681   const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
    682   const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
    683   return mask;
    684 }
    685 
    686 static uint8x16_t NeedsFilter2_NEON(const uint8x16_t p3, const uint8x16_t p2,
    687                                     const uint8x16_t p1, const uint8x16_t p0,
    688                                     const uint8x16_t q0, const uint8x16_t q1,
    689                                     const uint8x16_t q2, const uint8x16_t q3,
    690                                     int ithresh, int thresh) {
    691   const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
    692   const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
    693   const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
    694   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
    695   const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
    696   const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
    697   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
    698   const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
    699   const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
    700   const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
    701   const uint8x16_t max12 = vmaxq_u8(max1, max2);
    702   const uint8x16_t max123 = vmaxq_u8(max12, max3);
    703   const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
    704   const uint8x16_t mask1 = NeedsFilter_NEON(p1, p0, q0, q1, thresh);
    705   const uint8x16_t mask = vandq_u8(mask1, mask2);
    706   return mask;
    707 }
    708 
    709 //  4-points filter
    710 
    711 static void ApplyFilter4_NEON(
    712     const int8x16_t p1, const int8x16_t p0,
    713     const int8x16_t q0, const int8x16_t q1,
    714     const int8x16_t delta0,
    715     uint8x16_t* const op1, uint8x16_t* const op0,
    716     uint8x16_t* const oq0, uint8x16_t* const oq1) {
    717   const int8x16_t kCst3 = vdupq_n_s8(0x03);
    718   const int8x16_t kCst4 = vdupq_n_s8(0x04);
    719   const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
    720   const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
    721   const int8x16_t a1 = vshrq_n_s8(delta1, 3);
    722   const int8x16_t a2 = vshrq_n_s8(delta2, 3);
    723   const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
    724   *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a2));  // clip(p0 + a2)
    725   *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - a1)
    726   *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a3));  // clip(p1 + a3)
    727   *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a3));  // clip(q1 - a3)
    728 }
    729 
    730 static void DoFilter4_NEON(
    731     const uint8x16_t p1, const uint8x16_t p0,
    732     const uint8x16_t q0, const uint8x16_t q1,
    733     const uint8x16_t mask, const uint8x16_t hev_mask,
    734     uint8x16_t* const op1, uint8x16_t* const op0,
    735     uint8x16_t* const oq0, uint8x16_t* const oq1) {
    736   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
    737   const int8x16_t p1s = FlipSign_NEON(p1);
    738   int8x16_t p0s = FlipSign_NEON(p0);
    739   int8x16_t q0s = FlipSign_NEON(q0);
    740   const int8x16_t q1s = FlipSign_NEON(q1);
    741   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
    742 
    743   // do_filter2 part (simple loopfilter on pixels with hev)
    744   {
    745     const int8x16_t delta = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
    746     const int8x16_t simple_lf_delta =
    747         vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
    748     ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
    749   }
    750 
    751   // do_filter4 part (complex loopfilter on pixels without hev)
    752   {
    753     const int8x16_t delta0 = GetBaseDelta0_NEON(p0s, q0s);
    754     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    755     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    756     const int8x16_t complex_lf_delta =
    757         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    758     ApplyFilter4_NEON(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
    759   }
    760 }
    761 
    762 //  6-points filter
    763 
    764 static void ApplyFilter6_NEON(
    765     const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    766     const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    767     const int8x16_t delta,
    768     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    769     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
    770   // We have to compute: X = (9*a+63) >> 7, Y = (18*a+63)>>7, Z = (27*a+63) >> 7
    771   // Turns out, there's a common sub-expression S=9 * a - 1 that can be used
    772   // with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
    773   //   X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
    774   const int8x8_t delta_lo = vget_low_s8(delta);
    775   const int8x8_t delta_hi = vget_high_s8(delta);
    776   const int8x8_t kCst9 = vdup_n_s8(9);
    777   const int16x8_t kCstm1 = vdupq_n_s16(-1);
    778   const int8x8_t kCst18 = vdup_n_s8(18);
    779   const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo);  // S = 9 * a - 1
    780   const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
    781   const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo);   // S + 18 * a
    782   const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
    783   const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7);   // (9 * a + 63) >> 7
    784   const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
    785   const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6);   // (9 * a + 31) >> 6
    786   const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
    787   const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7);   // (27 * a + 63) >> 7
    788   const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
    789   const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
    790   const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
    791   const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
    792 
    793   *op0 = FlipSignBack_NEON(vqaddq_s8(p0, a1));  // clip(p0 + a1)
    794   *oq0 = FlipSignBack_NEON(vqsubq_s8(q0, a1));  // clip(q0 - q1)
    795   *oq1 = FlipSignBack_NEON(vqsubq_s8(q1, a2));  // clip(q1 - a2)
    796   *op1 = FlipSignBack_NEON(vqaddq_s8(p1, a2));  // clip(p1 + a2)
    797   *oq2 = FlipSignBack_NEON(vqsubq_s8(q2, a3));  // clip(q2 - a3)
    798   *op2 = FlipSignBack_NEON(vqaddq_s8(p2, a3));  // clip(p2 + a3)
    799 }
    800 
    801 static void DoFilter6_NEON(
    802     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    803     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    804     const uint8x16_t mask, const uint8x16_t hev_mask,
    805     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    806     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
    807   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
    808   const int8x16_t p2s = FlipSign_NEON(p2);
    809   const int8x16_t p1s = FlipSign_NEON(p1);
    810   int8x16_t p0s = FlipSign_NEON(p0);
    811   int8x16_t q0s = FlipSign_NEON(q0);
    812   const int8x16_t q1s = FlipSign_NEON(q1);
    813   const int8x16_t q2s = FlipSign_NEON(q2);
    814   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
    815   const int8x16_t delta0 = GetBaseDelta_NEON(p1s, p0s, q0s, q1s);
    816 
    817   // do_filter2 part (simple loopfilter on pixels with hev)
    818   {
    819     const int8x16_t simple_lf_delta =
    820         vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    821     ApplyFilter2NoFlip_NEON(p0s, q0s, simple_lf_delta, &p0s, &q0s);
    822   }
    823 
    824   // do_filter6 part (complex loopfilter on pixels without hev)
    825   {
    826     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    827     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    828     const int8x16_t complex_lf_delta =
    829         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    830     ApplyFilter6_NEON(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
    831                       op2, op1, op0, oq0, oq1, oq2);
    832   }
    833 }
    834 
    835 // on macroblock edges
    836 
    837 static void VFilter16_NEON(uint8_t* p, int stride,
    838                            int thresh, int ithresh, int hev_thresh) {
    839   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    840   Load16x8_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    841   {
    842     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    843                                               ithresh, thresh);
    844     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    845     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    846     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    847                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
    848     Store16x2_NEON(op2, op1, p - 2 * stride, stride);
    849     Store16x2_NEON(op0, oq0, p + 0 * stride, stride);
    850     Store16x2_NEON(oq1, oq2, p + 2 * stride, stride);
    851   }
    852 }
    853 
    854 static void HFilter16_NEON(uint8_t* p, int stride,
    855                            int thresh, int ithresh, int hev_thresh) {
    856   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    857   Load8x16_NEON(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    858   {
    859     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    860                                               ithresh, thresh);
    861     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    862     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    863     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    864                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
    865     Store2x16_NEON(op2, op1, p - 2, stride);
    866     Store2x16_NEON(op0, oq0, p + 0, stride);
    867     Store2x16_NEON(oq1, oq2, p + 2, stride);
    868   }
    869 }
    870 
    871 // on three inner edges
    872 static void VFilter16i_NEON(uint8_t* p, int stride,
    873                             int thresh, int ithresh, int hev_thresh) {
    874   uint32_t k;
    875   uint8x16_t p3, p2, p1, p0;
    876   Load16x4_NEON(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
    877   for (k = 3; k != 0; --k) {
    878     uint8x16_t q0, q1, q2, q3;
    879     p += 4 * stride;
    880     Load16x4_NEON(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
    881     {
    882       const uint8x16_t mask =
    883           NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
    884       const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    885       // p3 and p2 are not just temporary variables here: they will be
    886       // re-used for next span. And q2/q3 will become p1/p0 accordingly.
    887       DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
    888       Store16x4_NEON(p1, p0, p3, p2, p, stride);
    889       p1 = q2;
    890       p0 = q3;
    891     }
    892   }
    893 }
    894 
    895 #if !defined(WORK_AROUND_GCC)
    896 static void HFilter16i_NEON(uint8_t* p, int stride,
    897                             int thresh, int ithresh, int hev_thresh) {
    898   uint32_t k;
    899   uint8x16_t p3, p2, p1, p0;
    900   Load4x16_NEON(p + 2, stride, &p3, &p2, &p1, &p0);
    901   for (k = 3; k != 0; --k) {
    902     uint8x16_t q0, q1, q2, q3;
    903     p += 4;
    904     Load4x16_NEON(p + 2, stride, &q0, &q1, &q2, &q3);
    905     {
    906       const uint8x16_t mask =
    907           NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
    908       const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    909       DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
    910       Store4x16_NEON(p1, p0, p3, p2, p, stride);
    911       p1 = q2;
    912       p0 = q3;
    913     }
    914   }
    915 }
    916 #endif  // !WORK_AROUND_GCC
    917 
    918 // 8-pixels wide variant, for chroma filtering
    919 static void VFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
    920                           int thresh, int ithresh, int hev_thresh) {
    921   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    922   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    923   {
    924     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    925                                               ithresh, thresh);
    926     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    927     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    928     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    929                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
    930     Store8x2x2_NEON(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    931     Store8x2x2_NEON(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    932     Store8x2x2_NEON(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
    933   }
    934 }
    935 static void VFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
    936                            int thresh, int ithresh, int hev_thresh) {
    937   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    938   u += 4 * stride;
    939   v += 4 * stride;
    940   Load8x8x2_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    941   {
    942     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    943                                               ithresh, thresh);
    944     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    945     uint8x16_t op1, op0, oq0, oq1;
    946     DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    947     Store8x4x2_NEON(op1, op0, oq0, oq1, u, v, stride);
    948   }
    949 }
    950 
    951 #if !defined(WORK_AROUND_GCC)
    952 static void HFilter8_NEON(uint8_t* u, uint8_t* v, int stride,
    953                           int thresh, int ithresh, int hev_thresh) {
    954   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    955   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    956   {
    957     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    958                                               ithresh, thresh);
    959     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    960     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    961     DoFilter6_NEON(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    962                    &op2, &op1, &op0, &oq0, &oq1, &oq2);
    963     Store6x8x2_NEON(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
    964   }
    965 }
    966 
    967 static void HFilter8i_NEON(uint8_t* u, uint8_t* v, int stride,
    968                            int thresh, int ithresh, int hev_thresh) {
    969   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    970   u += 4;
    971   v += 4;
    972   Load8x8x2T_NEON(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    973   {
    974     const uint8x16_t mask = NeedsFilter2_NEON(p3, p2, p1, p0, q0, q1, q2, q3,
    975                                               ithresh, thresh);
    976     const uint8x16_t hev_mask = NeedsHev_NEON(p1, p0, q0, q1, hev_thresh);
    977     uint8x16_t op1, op0, oq0, oq1;
    978     DoFilter4_NEON(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    979     Store4x8x2_NEON(op1, op0, oq0, oq1, u, v, stride);
    980   }
    981 }
    982 #endif  // !WORK_AROUND_GCC
    983 
    984 //-----------------------------------------------------------------------------
    985 // Inverse transforms (Paragraph 14.4)
    986 
// Fixed-point multipliers of the inverse transform.
//
// Technically these are unsigned, but vqdmulh is only available in signed
// form. vqdmulh returns the high half of the doubled product, i.e. it computes
// (2 * x * c) >> 16, which is (x * c) >> 15. Getting (x * c) >> 16 therefore
// needs an additional >> 1.
//
// kC2: the canonical value is 35468. Its highest bit is set, so treating it as
// a signed 16-bit value would give incorrect results. We store it shifted down
// by 1 instead (17734), which clears the highest bit; combined with the
// doubling done by vqdmulh this yields exactly >> 16.
//
// kC1: the same pre-shift can not be used because the lowest bit of 20091 is
// set, so shifting the constant down would lose precision. The extra >> 1 is
// applied to the product instead.
//
// libwebp's C code uses a trick to avoid an extra addition that libvpx does.
// Instead of:
//   temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
// it adds 1 << 16 to cospi8sqrt2minus1 (kC1). That sum no longer fits in a
// signed 16-bit lane, so here kC1 stays at 20091 and the input is added back
// separately after the multiplication.

static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734;  // 35468 >> 1, pre-halved for vqdmulh.
   1005 
   1006 #if defined(WEBP_USE_INTRINSICS)
   1007 static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
   1008                                           const int16x8_t in1,
   1009                                           int16x8x2_t* const out) {
   1010   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
   1011   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
   1012   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
   1013                                                   // b0 d0 b1 d1 b2 d2 ...
   1014   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
   1015 }
   1016 
   1017 static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
   1018   // {rows} = in0 | in4
   1019   //          in8 | in12
   1020   // B1 = in4 | in12
   1021   const int16x8_t B1 =
   1022       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
   1023   // C0 = kC1 * in4 | kC1 * in12
   1024   // C1 = kC2 * in4 | kC2 * in12
   1025   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
   1026   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
   1027   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
   1028                                 vget_low_s16(rows->val[1]));   // in0 + in8
   1029   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
   1030                                 vget_low_s16(rows->val[1]));   // in0 - in8
   1031   // c = kC2 * in4 - kC1 * in12
   1032   // d = kC1 * in4 + kC2 * in12
   1033   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
   1034   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
   1035   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
   1036   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
   1037   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
   1038   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
   1039   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
   1040   Transpose8x2_NEON(E0, E1, rows);
   1041 }
   1042 
   1043 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
   1044   int16x8x2_t rows;
   1045   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   1046   TransformPass_NEON(&rows);
   1047   TransformPass_NEON(&rows);
   1048   Add4x4_NEON(rows.val[0], rows.val[1], dst);
   1049 }
   1050 
   1051 #else
   1052 
   1053 static void TransformOne_NEON(const int16_t* in, uint8_t* dst) {
   1054   const int kBPS = BPS;
   1055   // kC1, kC2. Padded because vld1.16 loads 8 bytes
   1056   const int16_t constants[4] = { kC1, kC2, 0, 0 };
   1057   /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
   1058   __asm__ volatile (
   1059     "vld1.16         {q1, q2}, [%[in]]           \n"
   1060     "vld1.16         {d0}, [%[constants]]        \n"
   1061 
   1062     /* d2: in[0]
   1063      * d3: in[8]
   1064      * d4: in[4]
   1065      * d5: in[12]
   1066      */
   1067     "vswp            d3, d4                      \n"
   1068 
   1069     /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
   1070      * q9 = {in[4], in[12]} * kC2 >> 16
   1071      */
   1072     "vqdmulh.s16     q8, q2, d0[0]               \n"
   1073     "vqdmulh.s16     q9, q2, d0[1]               \n"
   1074 
   1075     /* d22 = a = in[0] + in[8]
   1076      * d23 = b = in[0] - in[8]
   1077      */
   1078     "vqadd.s16       d22, d2, d3                 \n"
   1079     "vqsub.s16       d23, d2, d3                 \n"
   1080 
   1081     /* The multiplication should be x * kC1 >> 16
   1082      * However, with vqdmulh we get x * kC1 * 2 >> 16
   1083      * (multiply, double, return high half)
   1084      * We avoided this in kC2 by pre-shifting the constant.
   1085      * q8 = in[4]/[12] * kC1 >> 16
   1086      */
   1087     "vshr.s16        q8, q8, #1                  \n"
   1088 
   1089     /* Add {in[4], in[12]} back after the multiplication. This is handled by
   1090      * adding 1 << 16 to kC1 in the libwebp C code.
   1091      */
   1092     "vqadd.s16       q8, q2, q8                  \n"
   1093 
   1094     /* d20 = c = in[4]*kC2 - in[12]*kC1
   1095      * d21 = d = in[4]*kC1 + in[12]*kC2
   1096      */
   1097     "vqsub.s16       d20, d18, d17               \n"
   1098     "vqadd.s16       d21, d19, d16               \n"
   1099 
   1100     /* d2 = tmp[0] = a + d
   1101      * d3 = tmp[1] = b + c
   1102      * d4 = tmp[2] = b - c
   1103      * d5 = tmp[3] = a - d
   1104      */
   1105     "vqadd.s16       d2, d22, d21                \n"
   1106     "vqadd.s16       d3, d23, d20                \n"
   1107     "vqsub.s16       d4, d23, d20                \n"
   1108     "vqsub.s16       d5, d22, d21                \n"
   1109 
   1110     "vzip.16         q1, q2                      \n"
   1111     "vzip.16         q1, q2                      \n"
   1112 
   1113     "vswp            d3, d4                      \n"
   1114 
   1115     /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
   1116      * q9 = {tmp[4], tmp[12]} * kC2 >> 16
   1117      */
   1118     "vqdmulh.s16     q8, q2, d0[0]               \n"
   1119     "vqdmulh.s16     q9, q2, d0[1]               \n"
   1120 
   1121     /* d22 = a = tmp[0] + tmp[8]
   1122      * d23 = b = tmp[0] - tmp[8]
   1123      */
   1124     "vqadd.s16       d22, d2, d3                 \n"
   1125     "vqsub.s16       d23, d2, d3                 \n"
   1126 
   1127     /* See long winded explanations prior */
   1128     "vshr.s16        q8, q8, #1                  \n"
   1129     "vqadd.s16       q8, q2, q8                  \n"
   1130 
   1131     /* d20 = c = in[4]*kC2 - in[12]*kC1
   1132      * d21 = d = in[4]*kC1 + in[12]*kC2
   1133      */
   1134     "vqsub.s16       d20, d18, d17               \n"
   1135     "vqadd.s16       d21, d19, d16               \n"
   1136 
   1137     /* d2 = tmp[0] = a + d
   1138      * d3 = tmp[1] = b + c
   1139      * d4 = tmp[2] = b - c
   1140      * d5 = tmp[3] = a - d
   1141      */
   1142     "vqadd.s16       d2, d22, d21                \n"
   1143     "vqadd.s16       d3, d23, d20                \n"
   1144     "vqsub.s16       d4, d23, d20                \n"
   1145     "vqsub.s16       d5, d22, d21                \n"
   1146 
   1147     "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
   1148     "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
   1149     "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
   1150     "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
   1151 
   1152     "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
   1153 
   1154     /* (val) + 4 >> 3 */
   1155     "vrshr.s16       d2, d2, #3                  \n"
   1156     "vrshr.s16       d3, d3, #3                  \n"
   1157     "vrshr.s16       d4, d4, #3                  \n"
   1158     "vrshr.s16       d5, d5, #3                  \n"
   1159 
   1160     "vzip.16         q1, q2                      \n"
   1161     "vzip.16         q1, q2                      \n"
   1162 
   1163     /* Must accumulate before saturating */
   1164     "vmovl.u8        q8, d6                      \n"
   1165     "vmovl.u8        q9, d7                      \n"
   1166 
   1167     "vqadd.s16       q1, q1, q8                  \n"
   1168     "vqadd.s16       q2, q2, q9                  \n"
   1169 
   1170     "vqmovun.s16     d0, q1                      \n"
   1171     "vqmovun.s16     d1, q2                      \n"
   1172 
   1173     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
   1174     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
   1175     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
   1176     "vst1.32         d1[1], [%[dst]]             \n"
   1177 
   1178     : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
   1179     : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
   1180     : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
   1181   );
   1182 }
   1183 
   1184 #endif    // WEBP_USE_INTRINSICS
   1185 
   1186 static void TransformTwo_NEON(const int16_t* in, uint8_t* dst, int do_two) {
   1187   TransformOne_NEON(in, dst);
   1188   if (do_two) {
   1189     TransformOne_NEON(in + 16, dst + 4);
   1190   }
   1191 }
   1192 
   1193 static void TransformDC_NEON(const int16_t* in, uint8_t* dst) {
   1194   const int16x8_t DC = vdupq_n_s16(in[0]);
   1195   Add4x4_NEON(DC, DC, dst);
   1196 }
   1197 
   1198 //------------------------------------------------------------------------------
   1199 
   1200 #define STORE_WHT(dst, col, rows) do {                  \
   1201   *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
   1202   *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
   1203   *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
   1204   *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
   1205 } while (0)
   1206 
   1207 static void TransformWHT_NEON(const int16_t* in, int16_t* out) {
   1208   int32x4x4_t tmp;
   1209 
   1210   {
   1211     // Load the source.
   1212     const int16x4_t in00_03 = vld1_s16(in + 0);
   1213     const int16x4_t in04_07 = vld1_s16(in + 4);
   1214     const int16x4_t in08_11 = vld1_s16(in + 8);
   1215     const int16x4_t in12_15 = vld1_s16(in + 12);
   1216     const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
   1217     const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
   1218     const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
   1219     const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
   1220     tmp.val[0] = vaddq_s32(a0, a1);
   1221     tmp.val[1] = vaddq_s32(a3, a2);
   1222     tmp.val[2] = vsubq_s32(a0, a1);
   1223     tmp.val[3] = vsubq_s32(a3, a2);
   1224     // Arrange the temporary results column-wise.
   1225     tmp = Transpose4x4_NEON(tmp);
   1226   }
   1227 
   1228   {
   1229     const int32x4_t kCst3 = vdupq_n_s32(3);
   1230     const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
   1231     const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
   1232     const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
   1233     const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
   1234     const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
   1235 
   1236     tmp.val[0] = vaddq_s32(a0, a1);
   1237     tmp.val[1] = vaddq_s32(a3, a2);
   1238     tmp.val[2] = vsubq_s32(a0, a1);
   1239     tmp.val[3] = vsubq_s32(a3, a2);
   1240 
   1241     // right shift the results by 3.
   1242     tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
   1243     tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
   1244     tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
   1245     tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
   1246 
   1247     STORE_WHT(out, 0, tmp);
   1248     STORE_WHT(out, 1, tmp);
   1249     STORE_WHT(out, 2, tmp);
   1250     STORE_WHT(out, 3, tmp);
   1251   }
   1252 }
   1253 
   1254 #undef STORE_WHT
   1255 
   1256 //------------------------------------------------------------------------------
   1257 
   1258 #define MUL(a, b) (((a) * (b)) >> 16)
   1259 static void TransformAC3_NEON(const int16_t* in, uint8_t* dst) {
   1260   static const int kC1_full = 20091 + (1 << 16);
   1261   static const int kC2_full = 35468;
   1262   const int16x4_t A = vld1_dup_s16(in);
   1263   const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
   1264   const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
   1265   const int c1 = MUL(in[1], kC2_full);
   1266   const int d1 = MUL(in[1], kC1_full);
   1267   const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
   1268                       (uint64_t)( c1 & 0xffff) << 16 |
   1269                       (uint64_t)(-c1 & 0xffff) << 32 |
   1270                       (uint64_t)(-d1 & 0xffff) << 48;
   1271   const int16x4_t CD = vcreate_s16(cd);
   1272   const int16x4_t B = vqadd_s16(A, CD);
   1273   const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
   1274   const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
   1275   Add4x4_NEON(m0_m1, m2_m3, dst);
   1276 }
   1277 #undef MUL
   1278 
   1279 //------------------------------------------------------------------------------
   1280 // 4x4
   1281 
   1282 static void DC4_NEON(uint8_t* dst) {    // DC
   1283   const uint8x8_t A = vld1_u8(dst - BPS);  // top row
   1284   const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
   1285   const uint16x4_t p1 = vpadd_u16(p0, p0);
   1286   const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
   1287   const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
   1288   const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
   1289   const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
   1290   const uint16x8_t s0 = vaddq_u16(L0, L1);
   1291   const uint16x8_t s1 = vaddq_u16(L2, L3);
   1292   const uint16x8_t s01 = vaddq_u16(s0, s1);
   1293   const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
   1294   const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);  // (sum + 4) >> 3
   1295   const uint8x8_t dc = vdup_lane_u8(dc0, 0);
   1296   int i;
   1297   for (i = 0; i < 4; ++i) {
   1298     vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
   1299   }
   1300 }
   1301 
   1302 // TrueMotion (4x4 + 8x8)
   1303 static WEBP_INLINE void TrueMotion_NEON(uint8_t* dst, int size) {
   1304   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   1305   const uint8x8_t T = vld1_u8(dst - BPS);  // top row 'A[0..3]'
   1306   const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL));  // A[c] - A[-1]
   1307   int y;
   1308   for (y = 0; y < size; y += 4) {
   1309     // left edge
   1310     const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
   1311     const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
   1312     const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
   1313     const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
   1314     const int16x8_t r0 = vaddq_s16(L0, d);  // L[r] + A[c] - A[-1]
   1315     const int16x8_t r1 = vaddq_s16(L1, d);
   1316     const int16x8_t r2 = vaddq_s16(L2, d);
   1317     const int16x8_t r3 = vaddq_s16(L3, d);
   1318     // Saturate and store the result.
   1319     const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
   1320     const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
   1321     const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
   1322     const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
   1323     if (size == 4) {
   1324       vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
   1325       vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
   1326       vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
   1327       vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
   1328     } else {
   1329       vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
   1330       vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
   1331       vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
   1332       vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
   1333     }
   1334     dst += 4 * BPS;
   1335   }
   1336 }
   1337 
   1338 static void TM4_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 4); }
   1339 
   1340 static void VE4_NEON(uint8_t* dst) {    // vertical
   1341   // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
   1342   const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1));  // top row
   1343   const uint64x1_t A1 = vshr_n_u64(A0, 8);
   1344   const uint64x1_t A2 = vshr_n_u64(A0, 16);
   1345   const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
   1346   const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
   1347   const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
   1348   const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
   1349   const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
   1350   int i;
   1351   for (i = 0; i < 4; ++i) {
   1352     vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
   1353   }
   1354 }
   1355 
   1356 static void RD4_NEON(uint8_t* dst) {   // Down-right
   1357   const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
   1358   const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
   1359   const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
   1360   const uint32_t I = dst[-1 + 0 * BPS];
   1361   const uint32_t J = dst[-1 + 1 * BPS];
   1362   const uint32_t K = dst[-1 + 2 * BPS];
   1363   const uint32_t L = dst[-1 + 3 * BPS];
   1364   const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
   1365   const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
   1366   const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
   1367   const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
   1368   const uint8_t D = vget_lane_u8(XABCD_u8, 4);
   1369   const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
   1370   const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
   1371   const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
   1372   const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
   1373   const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
   1374   const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
   1375   const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
   1376   const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
   1377   const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
   1378   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
   1379   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
   1380   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
   1381   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
   1382 }
   1383 
   1384 static void LD4_NEON(uint8_t* dst) {    // Down-left
   1385   // Note using the same shift trick as VE4() is slower here.
   1386   const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
   1387   const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
   1388   const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
   1389   const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
   1390   const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
   1391   const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
   1392   const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
   1393   const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
   1394   const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
   1395   const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
   1396   const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
   1397   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
   1398   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
   1399   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
   1400   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
   1401 }
   1402 
   1403 //------------------------------------------------------------------------------
   1404 // Chroma
   1405 
   1406 static void VE8uv_NEON(uint8_t* dst) {    // vertical
   1407   const uint8x8_t top = vld1_u8(dst - BPS);
   1408   int j;
   1409   for (j = 0; j < 8; ++j) {
   1410     vst1_u8(dst + j * BPS, top);
   1411   }
   1412 }
   1413 
   1414 static void HE8uv_NEON(uint8_t* dst) {    // horizontal
   1415   int j;
   1416   for (j = 0; j < 8; ++j) {
   1417     const uint8x8_t left = vld1_dup_u8(dst - 1);
   1418     vst1_u8(dst, left);
   1419     dst += BPS;
   1420   }
   1421 }
   1422 
   1423 static WEBP_INLINE void DC8_NEON(uint8_t* dst, int do_top, int do_left) {
   1424   uint16x8_t sum_top;
   1425   uint16x8_t sum_left;
   1426   uint8x8_t dc0;
   1427 
   1428   if (do_top) {
   1429     const uint8x8_t A = vld1_u8(dst - BPS);  // top row
   1430     const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
   1431     const uint16x4_t p1 = vpadd_u16(p0, p0);
   1432     const uint16x4_t p2 = vpadd_u16(p1, p1);
   1433     sum_top = vcombine_u16(p2, p2);
   1434   }
   1435 
   1436   if (do_left) {
   1437     const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
   1438     const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
   1439     const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
   1440     const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
   1441     const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
   1442     const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
   1443     const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
   1444     const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
   1445     const uint16x8_t s0 = vaddq_u16(L0, L1);
   1446     const uint16x8_t s1 = vaddq_u16(L2, L3);
   1447     const uint16x8_t s2 = vaddq_u16(L4, L5);
   1448     const uint16x8_t s3 = vaddq_u16(L6, L7);
   1449     const uint16x8_t s01 = vaddq_u16(s0, s1);
   1450     const uint16x8_t s23 = vaddq_u16(s2, s3);
   1451     sum_left = vaddq_u16(s01, s23);
   1452   }
   1453 
   1454   if (do_top && do_left) {
   1455     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
   1456     dc0 = vrshrn_n_u16(sum, 4);
   1457   } else if (do_top) {
   1458     dc0 = vrshrn_n_u16(sum_top, 3);
   1459   } else if (do_left) {
   1460     dc0 = vrshrn_n_u16(sum_left, 3);
   1461   } else {
   1462     dc0 = vdup_n_u8(0x80);
   1463   }
   1464 
   1465   {
   1466     const uint8x8_t dc = vdup_lane_u8(dc0, 0);
   1467     int i;
   1468     for (i = 0; i < 8; ++i) {
   1469       vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
   1470     }
   1471   }
   1472 }
   1473 
   1474 static void DC8uv_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 1); }
   1475 static void DC8uvNoTop_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 1); }
   1476 static void DC8uvNoLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 1, 0); }
   1477 static void DC8uvNoTopLeft_NEON(uint8_t* dst) { DC8_NEON(dst, 0, 0); }
   1478 
   1479 static void TM8uv_NEON(uint8_t* dst) { TrueMotion_NEON(dst, 8); }
   1480 
   1481 //------------------------------------------------------------------------------
   1482 // 16x16
   1483 
   1484 static void VE16_NEON(uint8_t* dst) {     // vertical
   1485   const uint8x16_t top = vld1q_u8(dst - BPS);
   1486   int j;
   1487   for (j = 0; j < 16; ++j) {
   1488     vst1q_u8(dst + j * BPS, top);
   1489   }
   1490 }
   1491 
   1492 static void HE16_NEON(uint8_t* dst) {     // horizontal
   1493   int j;
   1494   for (j = 0; j < 16; ++j) {
   1495     const uint8x16_t left = vld1q_dup_u8(dst - 1);
   1496     vst1q_u8(dst, left);
   1497     dst += BPS;
   1498   }
   1499 }
   1500 
   1501 static WEBP_INLINE void DC16_NEON(uint8_t* dst, int do_top, int do_left) {
   1502   uint16x8_t sum_top;
   1503   uint16x8_t sum_left;
   1504   uint8x8_t dc0;
   1505 
   1506   if (do_top) {
   1507     const uint8x16_t A = vld1q_u8(dst - BPS);  // top row
   1508     const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
   1509     const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
   1510     const uint16x4_t p2 = vpadd_u16(p1, p1);
   1511     const uint16x4_t p3 = vpadd_u16(p2, p2);
   1512     sum_top = vcombine_u16(p3, p3);
   1513   }
   1514 
   1515   if (do_left) {
   1516     int i;
   1517     sum_left = vdupq_n_u16(0);
   1518     for (i = 0; i < 16; i += 8) {
   1519       const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
   1520       const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
   1521       const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
   1522       const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
   1523       const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
   1524       const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
   1525       const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
   1526       const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
   1527       const uint16x8_t s0 = vaddq_u16(L0, L1);
   1528       const uint16x8_t s1 = vaddq_u16(L2, L3);
   1529       const uint16x8_t s2 = vaddq_u16(L4, L5);
   1530       const uint16x8_t s3 = vaddq_u16(L6, L7);
   1531       const uint16x8_t s01 = vaddq_u16(s0, s1);
   1532       const uint16x8_t s23 = vaddq_u16(s2, s3);
   1533       const uint16x8_t sum = vaddq_u16(s01, s23);
   1534       sum_left = vaddq_u16(sum_left, sum);
   1535     }
   1536   }
   1537 
   1538   if (do_top && do_left) {
   1539     const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
   1540     dc0 = vrshrn_n_u16(sum, 5);
   1541   } else if (do_top) {
   1542     dc0 = vrshrn_n_u16(sum_top, 4);
   1543   } else if (do_left) {
   1544     dc0 = vrshrn_n_u16(sum_left, 4);
   1545   } else {
   1546     dc0 = vdup_n_u8(0x80);
   1547   }
   1548 
   1549   {
   1550     const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
   1551     int i;
   1552     for (i = 0; i < 16; ++i) {
   1553       vst1q_u8(dst + i * BPS, dc);
   1554     }
   1555   }
   1556 }
   1557 
   1558 static void DC16TopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 1); }
   1559 static void DC16NoTop_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 1); }
   1560 static void DC16NoLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 1, 0); }
   1561 static void DC16NoTopLeft_NEON(uint8_t* dst) { DC16_NEON(dst, 0, 0); }
   1562 
   1563 static void TM16_NEON(uint8_t* dst) {
   1564   const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1);  // top-left pixel 'A[-1]'
   1565   const uint8x16_t T = vld1q_u8(dst - BPS);  // top row 'A[0..15]'
   1566   // A[c] - A[-1]
   1567   const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
   1568   const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
   1569   int y;
   1570   for (y = 0; y < 16; y += 4) {
   1571     // left edge
   1572     const int16x8_t L0 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 0 * BPS - 1));
   1573     const int16x8_t L1 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 1 * BPS - 1));
   1574     const int16x8_t L2 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 2 * BPS - 1));
   1575     const int16x8_t L3 = ConvertU8ToS16_NEON(vld1_dup_u8(dst + 3 * BPS - 1));
   1576     const int16x8_t r0_lo = vaddq_s16(L0, d_lo);  // L[r] + A[c] - A[-1]
   1577     const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
   1578     const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
   1579     const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
   1580     const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
   1581     const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
   1582     const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
   1583     const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
   1584     // Saturate and store the result.
   1585     const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
   1586     const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
   1587     const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
   1588     const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
   1589     vst1q_u8(dst + 0 * BPS, row0);
   1590     vst1q_u8(dst + 1 * BPS, row1);
   1591     vst1q_u8(dst + 2 * BPS, row2);
   1592     vst1q_u8(dst + 3 * BPS, row3);
   1593     dst += 4 * BPS;
   1594   }
   1595 }
   1596 
   1597 //------------------------------------------------------------------------------
   1598 // Entry point
   1599 
   1600 extern void VP8DspInitNEON(void);
   1601 
   1602 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
   1603   VP8Transform = TransformTwo_NEON;
   1604   VP8TransformAC3 = TransformAC3_NEON;
   1605   VP8TransformDC = TransformDC_NEON;
   1606   VP8TransformWHT = TransformWHT_NEON;
   1607 
   1608   VP8VFilter16 = VFilter16_NEON;
   1609   VP8VFilter16i = VFilter16i_NEON;
   1610   VP8HFilter16 = HFilter16_NEON;
   1611 #if !defined(WORK_AROUND_GCC)
   1612   VP8HFilter16i = HFilter16i_NEON;
   1613 #endif
   1614   VP8VFilter8 = VFilter8_NEON;
   1615   VP8VFilter8i = VFilter8i_NEON;
   1616 #if !defined(WORK_AROUND_GCC)
   1617   VP8HFilter8 = HFilter8_NEON;
   1618   VP8HFilter8i = HFilter8i_NEON;
   1619 #endif
   1620   VP8SimpleVFilter16 = SimpleVFilter16_NEON;
   1621   VP8SimpleHFilter16 = SimpleHFilter16_NEON;
   1622   VP8SimpleVFilter16i = SimpleVFilter16i_NEON;
   1623   VP8SimpleHFilter16i = SimpleHFilter16i_NEON;
   1624 
   1625   VP8PredLuma4[0] = DC4_NEON;
   1626   VP8PredLuma4[1] = TM4_NEON;
   1627   VP8PredLuma4[2] = VE4_NEON;
   1628   VP8PredLuma4[4] = RD4_NEON;
   1629   VP8PredLuma4[6] = LD4_NEON;
   1630 
   1631   VP8PredLuma16[0] = DC16TopLeft_NEON;
   1632   VP8PredLuma16[1] = TM16_NEON;
   1633   VP8PredLuma16[2] = VE16_NEON;
   1634   VP8PredLuma16[3] = HE16_NEON;
   1635   VP8PredLuma16[4] = DC16NoTop_NEON;
   1636   VP8PredLuma16[5] = DC16NoLeft_NEON;
   1637   VP8PredLuma16[6] = DC16NoTopLeft_NEON;
   1638 
   1639   VP8PredChroma8[0] = DC8uv_NEON;
   1640   VP8PredChroma8[1] = TM8uv_NEON;
   1641   VP8PredChroma8[2] = VE8uv_NEON;
   1642   VP8PredChroma8[3] = HE8uv_NEON;
   1643   VP8PredChroma8[4] = DC8uvNoTop_NEON;
   1644   VP8PredChroma8[5] = DC8uvNoLeft_NEON;
   1645   VP8PredChroma8[6] = DC8uvNoTopLeft_NEON;
   1646 }
   1647 
   1648 #else  // !WEBP_USE_NEON
   1649 
   1650 WEBP_DSP_INIT_STUB(VP8DspInitNEON)
   1651 
   1652 #endif  // WEBP_USE_NEON
   1653