Home | History | Annotate | Download | only in dsp
      1 // Copyright 2012 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // ARM NEON version of dsp functions and loop filtering.
     11 //
     12 // Authors: Somnath Banerjee (somnath (at) google.com)
     13 //          Johann Koenig (johannkoenig (at) google.com)
     14 
     15 #include "./dsp.h"
     16 
     17 #if defined(WEBP_USE_NEON)
     18 
     19 #include "./neon.h"
     20 #include "../dec/vp8i.h"
     21 
     22 //------------------------------------------------------------------------------
     23 // NxM Loading functions
     24 
// Load/Store vertical edge

// Emits eight single-lane 'vld4.8' instructions. The instruction for lane k
// reads four consecutive bytes and de-interleaves them into lane k of the
// registers c1, c2, c3 and c4. Even lanes use address operand b1, odd lanes
// use b2, and every load post-increments its address register by 'stride'.
// The address operands are passed already bracketed (e.g. [r4]).
#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                                \
  "vld4.8   {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \
  "vld4.8   {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \
  "vld4.8   {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n"
     35 
// Emits eight single-lane 'vst2.8' instructions. The instruction for lane k
// writes lane k of c1 followed by lane k of c2 (two adjacent bytes) at address
// operand 'p', which is then post-incremented by 'stride'. 'p' is passed
// already bracketed (e.g. [%[p]]).
#define STORE8x2(c1, c2, p, stride)                                            \
  "vst2.8   {" #c1"[0], " #c2"[0]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[1], " #c2"[1]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[2], " #c2"[2]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[3], " #c2"[3]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[4], " #c2"[4]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[5], " #c2"[5]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[6], " #c2"[6]}," #p "," #stride " \n"                      \
  "vst2.8   {" #c1"[7], " #c2"[7]}," #p "," #stride " \n"
     45 
     46 #if !defined(WORK_AROUND_GCC)
     47 
     48 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
     49 // (register alloc, probably). The variants somewhat mitigate the problem, but
     50 // not quite. HFilter16i() remains problematic.
     51 static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
     52   const uint8x8_t zero = vdup_n_u8(0);
     53   uint8x8x4_t out;
     54   INIT_VECTOR4(out, zero, zero, zero, zero);
     55   out = vld4_lane_u8(src + 0 * stride, out, 0);
     56   out = vld4_lane_u8(src + 1 * stride, out, 1);
     57   out = vld4_lane_u8(src + 2 * stride, out, 2);
     58   out = vld4_lane_u8(src + 3 * stride, out, 3);
     59   out = vld4_lane_u8(src + 4 * stride, out, 4);
     60   out = vld4_lane_u8(src + 5 * stride, out, 5);
     61   out = vld4_lane_u8(src + 6 * stride, out, 6);
     62   out = vld4_lane_u8(src + 7 * stride, out, 7);
     63   return out;
     64 }
     65 
     66 static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
     67                                  uint8x16_t* const p1, uint8x16_t* const p0,
     68                                  uint8x16_t* const q0, uint8x16_t* const q1) {
     69   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
     70   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
     71   const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
     72   const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
     73   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
     74   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
     75   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
     76   *q1 = vcombine_u8(row0.val[3], row8.val[3]);
     77 }
     78 
     79 #else  // WORK_AROUND_GCC
     80 
// Loads the four bytes at 'src' as one 32-bit value into lane LANE of VALUE,
// then advances 'src' by 'stride' (i.e. to the next row). Relies on the
// 'src' and 'stride' names of the enclosing function.
#define LOADQ_LANE_32b(VALUE, LANE) do {                             \
  (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
  src += stride;                                                     \
} while (0)

// WORK_AROUND_GCC variant of Load4x16(): reads a strip of 16 rows by 4 bytes
// starting two bytes to the left of 'src' and returns it transposed, so that
// byte k of *p1, *p0, *q0 and *q1 is respectively byte 0, 1, 2 and 3 of
// row k. It uses 32-bit lane loads followed by explicit transposes rather
// than vld4_lane_u8().
static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
                                 uint8x16_t* const p1, uint8x16_t* const p0,
                                 uint8x16_t* const q0, uint8x16_t* const q1) {
  const uint32x4_t zero = vdupq_n_u32(0);
  uint32x4x4_t in;
  INIT_VECTOR4(in, zero, zero, zero, zero);
  src -= 2;
  // Row r (0..15) is stored in 32-bit lane r / 4 of in.val[r % 4], so each
  // lane index selects one group of four consecutive rows.
  LOADQ_LANE_32b(in.val[0], 0);
  LOADQ_LANE_32b(in.val[1], 0);
  LOADQ_LANE_32b(in.val[2], 0);
  LOADQ_LANE_32b(in.val[3], 0);
  LOADQ_LANE_32b(in.val[0], 1);
  LOADQ_LANE_32b(in.val[1], 1);
  LOADQ_LANE_32b(in.val[2], 1);
  LOADQ_LANE_32b(in.val[3], 1);
  LOADQ_LANE_32b(in.val[0], 2);
  LOADQ_LANE_32b(in.val[1], 2);
  LOADQ_LANE_32b(in.val[2], 2);
  LOADQ_LANE_32b(in.val[3], 2);
  LOADQ_LANE_32b(in.val[0], 3);
  LOADQ_LANE_32b(in.val[1], 3);
  LOADQ_LANE_32b(in.val[2], 3);
  LOADQ_LANE_32b(in.val[3], 3);
  // Transpose four 4x4 parts:
  // the 8-bit transposes interleave neighbouring rows, the 16-bit transposes
  // then gather each byte column of a four-row group into four adjacent bytes.
  {
    const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
                                        vreinterpretq_u8_u32(in.val[1]));
    const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
                                        vreinterpretq_u8_u32(in.val[3]));
    const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
                                         vreinterpretq_u16_u8(row23.val[0]));
    const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
                                         vreinterpretq_u16_u8(row23.val[1]));
    // row02 holds byte columns 0 and 2, row13 holds byte columns 1 and 3.
    *p1 = vreinterpretq_u8_u16(row02.val[0]);
    *p0 = vreinterpretq_u8_u16(row13.val[0]);
    *q0 = vreinterpretq_u8_u16(row02.val[1]);
    *q1 = vreinterpretq_u8_u16(row13.val[1]);
  }
}
#undef LOADQ_LANE_32b
    126 
    127 #endif  // !WORK_AROUND_GCC
    128 
    129 static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
    130                                  uint8x16_t* const p3, uint8x16_t* const p2,
    131                                  uint8x16_t* const p1, uint8x16_t* const p0,
    132                                  uint8x16_t* const q0, uint8x16_t* const q1,
    133                                  uint8x16_t* const q2, uint8x16_t* const q3) {
    134   Load4x16(src - 2, stride, p3, p2, p1, p0);
    135   Load4x16(src + 2, stride, q0, q1, q2, q3);
    136 }
    137 
    138 static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
    139                                  uint8x16_t* const p1, uint8x16_t* const p0,
    140                                  uint8x16_t* const q0, uint8x16_t* const q1) {
    141   *p1 = vld1q_u8(src - 2 * stride);
    142   *p0 = vld1q_u8(src - 1 * stride);
    143   *q0 = vld1q_u8(src + 0 * stride);
    144   *q1 = vld1q_u8(src + 1 * stride);
    145 }
    146 
    147 static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
    148                                  uint8x16_t* const p3, uint8x16_t* const p2,
    149                                  uint8x16_t* const p1, uint8x16_t* const p0,
    150                                  uint8x16_t* const q0, uint8x16_t* const q1,
    151                                  uint8x16_t* const q2, uint8x16_t* const q3) {
    152   Load16x4(src - 2  * stride, stride, p3, p2, p1, p0);
    153   Load16x4(src + 2  * stride, stride, q0, q1, q2, q3);
    154 }
    155 
    156 static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
    157                                   const uint8_t* const v,
    158                                   int stride,
    159                                   uint8x16_t* const p3, uint8x16_t* const p2,
    160                                   uint8x16_t* const p1, uint8x16_t* const p0,
    161                                   uint8x16_t* const q0, uint8x16_t* const q1,
    162                                   uint8x16_t* const q2, uint8x16_t* const q3) {
    163   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
    164   // and the v-samples on the higher half.
    165   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
    166   *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
    167   *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
    168   *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
    169   *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
    170   *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
    171   *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
    172   *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
    173 }
    174 
    175 #if !defined(WORK_AROUND_GCC)
    176 
    177 #define LOAD_UV_8(ROW) \
    178   vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
    179 
    180 static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
    181                                    const uint8_t* const v,
    182                                    int stride,
    183                                    uint8x16_t* const p3, uint8x16_t* const p2,
    184                                    uint8x16_t* const p1, uint8x16_t* const p0,
    185                                    uint8x16_t* const q0, uint8x16_t* const q1,
    186                                    uint8x16_t* const q2, uint8x16_t* const q3) {
    187   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
    188   // and the v-samples on the higher half.
    189   const uint8x16_t row0 = LOAD_UV_8(0);
    190   const uint8x16_t row1 = LOAD_UV_8(1);
    191   const uint8x16_t row2 = LOAD_UV_8(2);
    192   const uint8x16_t row3 = LOAD_UV_8(3);
    193   const uint8x16_t row4 = LOAD_UV_8(4);
    194   const uint8x16_t row5 = LOAD_UV_8(5);
    195   const uint8x16_t row6 = LOAD_UV_8(6);
    196   const uint8x16_t row7 = LOAD_UV_8(7);
    197   // Perform two side-by-side 8x8 transposes
    198   // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
    199   // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
    200   // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
    201   // u30 u31 u32 u33 u34 u35 u36 u37 | ...
    202   // u40 u41 u42 u43 u44 u45 u46 u47 | ...
    203   // u50 u51 u52 u53 u54 u55 u56 u57 | ...
    204   // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
    205   // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
    206   const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
    207                                                     // u01 u11 u03 u13 ...
    208   const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
    209                                                     // u21 u31 u23 u33 ...
    210   const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
    211   const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
    212   const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
    213                                        vreinterpretq_u16_u8(row23.val[0]));
    214   const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
    215                                        vreinterpretq_u16_u8(row23.val[1]));
    216   const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
    217                                        vreinterpretq_u16_u8(row67.val[0]));
    218   const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
    219                                        vreinterpretq_u16_u8(row67.val[1]));
    220   const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
    221                                        vreinterpretq_u32_u16(row46.val[0]));
    222   const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
    223                                        vreinterpretq_u32_u16(row46.val[1]));
    224   const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
    225                                        vreinterpretq_u32_u16(row57.val[0]));
    226   const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
    227                                        vreinterpretq_u32_u16(row57.val[1]));
    228   *p3 = vreinterpretq_u8_u32(row04.val[0]);
    229   *p2 = vreinterpretq_u8_u32(row15.val[0]);
    230   *p1 = vreinterpretq_u8_u32(row26.val[0]);
    231   *p0 = vreinterpretq_u8_u32(row37.val[0]);
    232   *q0 = vreinterpretq_u8_u32(row04.val[1]);
    233   *q1 = vreinterpretq_u8_u32(row15.val[1]);
    234   *q2 = vreinterpretq_u8_u32(row26.val[1]);
    235   *q3 = vreinterpretq_u8_u32(row37.val[1]);
    236 }
    237 #undef LOAD_UV_8
    238 
    239 #endif  // !WORK_AROUND_GCC
    240 
    241 static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
    242                                  uint8_t* const dst, int stride) {
    243   vst2_lane_u8(dst + 0 * stride, v, 0);
    244   vst2_lane_u8(dst + 1 * stride, v, 1);
    245   vst2_lane_u8(dst + 2 * stride, v, 2);
    246   vst2_lane_u8(dst + 3 * stride, v, 3);
    247   vst2_lane_u8(dst + 4 * stride, v, 4);
    248   vst2_lane_u8(dst + 5 * stride, v, 5);
    249   vst2_lane_u8(dst + 6 * stride, v, 6);
    250   vst2_lane_u8(dst + 7 * stride, v, 7);
    251 }
    252 
    253 static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
    254                                   uint8_t* const dst, int stride) {
    255   uint8x8x2_t lo, hi;
    256   lo.val[0] = vget_low_u8(p0);
    257   lo.val[1] = vget_low_u8(q0);
    258   hi.val[0] = vget_high_u8(p0);
    259   hi.val[1] = vget_high_u8(q0);
    260   Store2x8(lo, dst - 1 + 0 * stride, stride);
    261   Store2x8(hi, dst - 1 + 8 * stride, stride);
    262 }
    263 
    264 #if !defined(WORK_AROUND_GCC)
    265 static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
    266                                  uint8_t* const dst, int stride) {
    267   vst4_lane_u8(dst + 0 * stride, v, 0);
    268   vst4_lane_u8(dst + 1 * stride, v, 1);
    269   vst4_lane_u8(dst + 2 * stride, v, 2);
    270   vst4_lane_u8(dst + 3 * stride, v, 3);
    271   vst4_lane_u8(dst + 4 * stride, v, 4);
    272   vst4_lane_u8(dst + 5 * stride, v, 5);
    273   vst4_lane_u8(dst + 6 * stride, v, 6);
    274   vst4_lane_u8(dst + 7 * stride, v, 7);
    275 }
    276 
    277 static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
    278                                   const uint8x16_t q0, const uint8x16_t q1,
    279                                   uint8_t* const dst, int stride) {
    280   uint8x8x4_t lo, hi;
    281   INIT_VECTOR4(lo,
    282                vget_low_u8(p1), vget_low_u8(p0),
    283                vget_low_u8(q0), vget_low_u8(q1));
    284   INIT_VECTOR4(hi,
    285                vget_high_u8(p1), vget_high_u8(p0),
    286                vget_high_u8(q0), vget_high_u8(q1));
    287   Store4x8(lo, dst - 2 + 0 * stride, stride);
    288   Store4x8(hi, dst - 2 + 8 * stride, stride);
    289 }
    290 #endif  // !WORK_AROUND_GCC
    291 
    292 static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
    293                                   uint8_t* const dst, int stride) {
    294   vst1q_u8(dst - stride, p0);
    295   vst1q_u8(dst, q0);
    296 }
    297 
    298 static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
    299                                   const uint8x16_t q0, const uint8x16_t q1,
    300                                   uint8_t* const dst, int stride) {
    301   Store16x2(p1, p0, dst - stride, stride);
    302   Store16x2(q0, q1, dst + stride, stride);
    303 }
    304 
    305 static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
    306                                    uint8_t* const u, uint8_t* const v,
    307                                    int stride) {
    308   // p0 and q0 contain the u+v samples packed in low/high halves.
    309   vst1_u8(u - stride, vget_low_u8(p0));
    310   vst1_u8(u,          vget_low_u8(q0));
    311   vst1_u8(v - stride, vget_high_u8(p0));
    312   vst1_u8(v,          vget_high_u8(q0));
    313 }
    314 
    315 static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
    316                                    const uint8x16_t q0, const uint8x16_t q1,
    317                                    uint8_t* const u, uint8_t* const v,
    318                                    int stride) {
    319   // The p1...q1 registers contain the u+v samples packed in low/high halves.
    320   Store8x2x2(p1, p0, u - stride, v - stride, stride);
    321   Store8x2x2(q0, q1, u + stride, v + stride, stride);
    322 }
    323 
    324 #if !defined(WORK_AROUND_GCC)
    325 
    326 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
    327   vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
    328   vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
    329   (DST) += stride;                                \
    330 } while (0)
    331 
    332 static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
    333                                    const uint8x16_t p0, const uint8x16_t q0,
    334                                    const uint8x16_t q1, const uint8x16_t q2,
    335                                    uint8_t* u, uint8_t* v,
    336                                    int stride) {
    337   uint8x8x3_t u0, u1, v0, v1;
    338   INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
    339   INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
    340   INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
    341   INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
    342   STORE6_LANE(u, u0, u1, 0);
    343   STORE6_LANE(u, u0, u1, 1);
    344   STORE6_LANE(u, u0, u1, 2);
    345   STORE6_LANE(u, u0, u1, 3);
    346   STORE6_LANE(u, u0, u1, 4);
    347   STORE6_LANE(u, u0, u1, 5);
    348   STORE6_LANE(u, u0, u1, 6);
    349   STORE6_LANE(u, u0, u1, 7);
    350   STORE6_LANE(v, v0, v1, 0);
    351   STORE6_LANE(v, v0, v1, 1);
    352   STORE6_LANE(v, v0, v1, 2);
    353   STORE6_LANE(v, v0, v1, 3);
    354   STORE6_LANE(v, v0, v1, 4);
    355   STORE6_LANE(v, v0, v1, 5);
    356   STORE6_LANE(v, v0, v1, 6);
    357   STORE6_LANE(v, v0, v1, 7);
    358 }
    359 #undef STORE6_LANE
    360 
    361 static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
    362                                    const uint8x16_t q0, const uint8x16_t q1,
    363                                    uint8_t* const u, uint8_t* const v,
    364                                    int stride) {
    365   uint8x8x4_t u0, v0;
    366   INIT_VECTOR4(u0,
    367                vget_low_u8(p1), vget_low_u8(p0),
    368                vget_low_u8(q0), vget_low_u8(q1));
    369   INIT_VECTOR4(v0,
    370                vget_high_u8(p1), vget_high_u8(p0),
    371                vget_high_u8(q0), vget_high_u8(q1));
    372   vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
    373   vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
    374   vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
    375   vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
    376   vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
    377   vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
    378   vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
    379   vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
    380   vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
    381   vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
    382   vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
    383   vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
    384   vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
    385   vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
    386   vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
    387   vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
    388 }
    389 
    390 #endif  // !WORK_AROUND_GCC
    391 
    392 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
    393 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
    394   return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
    395 }
    396 
    397 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
    398 // to the corresponding rows of 'dst'.
    399 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
    400                                             const int16x8_t dst01,
    401                                             const int16x8_t dst23) {
    402   // Unsigned saturate to 8b.
    403   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
    404   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
    405 
    406   // Store the results.
    407   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
    408   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
    409   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
    410   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
    411 }
    412 
    413 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
    414                                uint8_t* const dst) {
    415   uint32x2_t dst01 = vdup_n_u32(0);
    416   uint32x2_t dst23 = vdup_n_u32(0);
    417 
    418   // Load the source pixels.
    419   dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
    420   dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
    421   dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
    422   dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
    423 
    424   {
    425     // Convert to 16b.
    426     const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
    427     const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
    428 
    429     // Descale with rounding.
    430     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
    431     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
    432     // Add the inverse transform.
    433     SaturateAndStore4x4(dst, out01, out23);
    434   }
    435 }
    436 
    437 //-----------------------------------------------------------------------------
    438 // Simple In-loop filtering (Paragraph 15.2)
    439 
    440 static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
    441                               const uint8x16_t q0, const uint8x16_t q1,
    442                               int thresh) {
    443   const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
    444   const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
    445   const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
    446   const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
    447   const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
    448   const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
    449   const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
    450   return mask;
    451 }
    452 
    453 static int8x16_t FlipSign(const uint8x16_t v) {
    454   const uint8x16_t sign_bit = vdupq_n_u8(0x80);
    455   return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
    456 }
    457 
    458 static uint8x16_t FlipSignBack(const int8x16_t v) {
    459   const int8x16_t sign_bit = vdupq_n_s8(0x80);
    460   return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
    461 }
    462 
    463 static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
    464                               const int8x16_t q0, const int8x16_t q1) {
    465   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
    466   const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
    467   const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
    468   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
    469   const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
    470   return s3;
    471 }
    472 
    473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
    474   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
    475   const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
    476   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
    477   return s2;
    478 }
    479 
    480 //------------------------------------------------------------------------------
    481 
    482 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
    483                          const int8x16_t delta,
    484                          uint8x16_t* const op0, uint8x16_t* const oq0) {
    485   const int8x16_t kCst3 = vdupq_n_s8(0x03);
    486   const int8x16_t kCst4 = vdupq_n_s8(0x04);
    487   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
    488   const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
    489   const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
    490   const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
    491   const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
    492   const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
    493   *op0 = FlipSignBack(sp0);
    494   *oq0 = FlipSignBack(sq0);
    495 }
    496 
    497 #if defined(USE_INTRINSICS)
    498 
    499 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
    500                       const uint8x16_t q0, const uint8x16_t q1,
    501                       const uint8x16_t mask,
    502                       uint8x16_t* const op0, uint8x16_t* const oq0) {
    503   const int8x16_t p1s = FlipSign(p1);
    504   const int8x16_t p0s = FlipSign(p0);
    505   const int8x16_t q0s = FlipSign(q0);
    506   const int8x16_t q1s = FlipSign(q1);
    507   const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
    508   const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
    509   ApplyFilter2(p0s, q0s, delta1, op0, oq0);
    510 }
    511 
    512 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
    513   uint8x16_t p1, p0, q0, q1, op0, oq0;
    514   Load16x4(p, stride, &p1, &p0, &q0, &q1);
    515   {
    516     const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
    517     DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
    518   }
    519   Store16x2(op0, oq0, p, stride);
    520 }
    521 
    522 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
    523   uint8x16_t p1, p0, q0, q1, oq0, op0;
    524   Load4x16(p, stride, &p1, &p0, &q0, &q1);
    525   {
    526     const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
    527     DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
    528   }
    529   Store2x16(op0, oq0, p, stride);
    530 }
    531 
    532 #else
    533 
// NEON quad registers declared as clobbered by the inline assembly below.
#define QRegs "q0", "q1", "q2", "q3",                                          \
              "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

// XORs registers 'a' and 'b' in place with register 's'. DO_FILTER2 passes a
// register filled with 0x80, which toggles the sign bit of every byte.
#define FLIP_SIGN_BIT2(a, b, s)                                                \
  "veor     " #a "," #a "," #s "               \n"                             \
  "veor     " #b "," #b "," #s "               \n"                             \

// Same as FLIP_SIGN_BIT2, applied to the four registers a, b, c and d.
#define FLIP_SIGN_BIT4(a, b, c, d, s)                                          \
  FLIP_SIGN_BIT2(a, b, s)                                                      \
  FLIP_SIGN_BIT2(c, d, s)                                                      \

// Sets every byte of 'mask' to all ones where
//   2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh
// and to zero elsewhere. 'thresh' is a core register whose low byte is
// broadcast. Uses q14 and q15 as scratch registers.
#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                             \
  "vabd.u8    q15," #p0 "," #q0 "         \n"  /* abs(p0 - q0) */              \
  "vabd.u8    q14," #p1 "," #q1 "         \n"  /* abs(p1 - q1) */              \
  "vqadd.u8   q15, q15, q15               \n"  /* abs(p0 - q0) * 2 */          \
  "vshr.u8    q14, q14, #1                \n"  /* abs(p1 - q1) / 2 */          \
  "vqadd.u8   q15, q15, q14     \n"  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
  "vdup.8     q14, " #thresh "            \n"                                  \
  "vcge.u8   " #mask ", q14, q15          \n"  /* mask = (sum <= thresh) */

// Computes o = (p1 - q1) + 3 * (q0 - p0) on signed bytes with saturating
// arithmetic. Uses q15 as a scratch register.
#define GET_BASE_DELTA(p1, p0, q0, q1, o)                                      \
  "vqsub.s8   q15," #q0 "," #p0 "         \n"  /* (q0 - p0) */                 \
  "vqsub.s8  " #o "," #p1 "," #q1 "       \n"  /* (p1 - q1) */                 \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 1 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 2 * (q0 - p0) */ \
  "vqadd.s8  " #o "," #o ", q15           \n"  /* (p1 - q1) + 3 * (q0 - p0) */

// Adds (fl + 3) >> 3 to p0 and subtracts (fl + 4) >> 3 from q0, on signed
// bytes with saturating arithmetic. Uses q15 as a scratch register.
#define DO_SIMPLE_FILTER(p0, q0, fl)                                           \
  "vmov.i8    q15, #0x03                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter1 = filter + 3 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter1 >> 3 */              \
  "vqadd.s8  " #p0 "," #p0 ", q15         \n"  /* p0 += filter1 */             \
                                                                               \
  "vmov.i8    q15, #0x04                  \n"                                  \
  "vqadd.s8   q15, q15, " #fl "           \n"  /* filter2 = filter + 4 */      \
  "vshr.s8    q15, q15, #3                \n"  /* filter2 >> 3 */              \
  "vqsub.s8  " #q0 "," #q0 ", q15         \n"  /* q0 -= filter2 */

// Applies filter on 2 pixels (p0 and q0)
// All four pixel registers are sign-flipped in place; only p0 and q0 are
// flipped back at the end, so p1 and q1 are left in signed form. Besides the
// scratch registers of the helper macros, q9, q10 and q11 are overwritten.
#define DO_FILTER2(p1, p0, q0, q1, thresh)                                     \
  NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)     /* filter mask in q9 */         \
  "vmov.i8    q10, #0x80                  \n"  /* sign bit */                  \
  FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)          /* convert to signed value */   \
  GET_BASE_DELTA(p1, p0, q0, q1, q11)          /* get filter level  */         \
  "vand       q9, q9, q11                 \n"  /* apply filter mask */         \
  DO_SIMPLE_FILTER(p0, q0, q9)                 /* apply filter */              \
  FLIP_SIGN_BIT2(p0, q0, q10)
    581 
// Inline-assembly version of the simple filter across a horizontal edge,
// 16 pixels wide. Loads the two rows above 'p' and the two rows from 'p'
// down, runs DO_FILTER2 on them and writes back the filtered row above 'p'
// and the filtered row at 'p'.
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  __asm__ volatile (
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    // Each load except the last post-increments p by one row.
    "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
    "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
    "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
    "vld1.u8    {q12}, [%[p]]                  \n"  // q1

    DO_FILTER2(q1, q2, q3, q12, %[thresh])

    // p points at the q1 row here; step back to the p0 row.
    "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride

    "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
    "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
    : [p] "+r"(p)
    : [stride] "r"(stride), [thresh] "r"(thresh)
    : "memory", QRegs
  );
}
    602 
    603 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
        // Simple loop filter across one vertical edge. The two LOAD8x4
        // expansions gather 4 bytes (starting at p - 2) from each row: r4 walks
        // the even rows and r5 the odd rows, both advancing by r6 = 2 * stride.
        // The vswp sequence regroups the loaded d-registers into the q-registers
        // handed to DO_FILTER2 (p1 = q1, p0 = q2, q0 = q12, q1 = q13).
        // r4, r5 and r6 are hard-coded scratch registers and are therefore
        // listed as clobbered.
    604   __asm__ volatile (
    605     "sub        r4, %[p], #2                   \n"  // base1 = p - 2
    606     "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
    607     "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
    608
    609     LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
    610     LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
    611     "vswp       d3, d24                        \n"  // p1:q1 p0:q3
    612     "vswp       d5, d26                        \n"  // q0:q2 q1:q4
    613     "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
    614
    615     DO_FILTER2(q1, q2, q12, q13, %[thresh])
    616
            // Only p0 and q0 are written back, starting one byte left of the edge.
    617     "sub        %[p], %[p], #1                 \n"  // p - 1
    618
            // NOTE(review): STORE8x2 is defined outside this view; presumably it
            // stores one (p0, q0) byte pair per row for 8 rows -- confirm.
    619     "vswp        d5, d24                       \n"
    620     STORE8x2(d4, d5, [%[p]], %[stride])
    621     STORE8x2(d24, d25, [%[p]], %[stride])
    622
    623     : [p] "+r"(p)
    624     : [stride] "r"(stride), [thresh] "r"(thresh)
    625     : "memory", "r4", "r5", "r6", QRegs
    626   );
    627 }
    628 
    629 #endif    // USE_INTRINSICS
    630 
    // Runs the simple vertical filter on three successive horizontal edges,
    // located 4, 8 and 12 rows below 'p'.
    static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
      const int edge_step = 4 * stride;
      int edge;
      for (edge = 0; edge < 3; ++edge) {
        p += edge_step;
        SimpleVFilter16(p, stride, thresh);
      }
    }
    638 
    // Runs the simple horizontal filter on three successive vertical edges,
    // located 4, 8 and 12 bytes to the right of 'p'.
    static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
      int edge;
      for (edge = 0; edge < 3; ++edge) {
        p += 4;
        SimpleHFilter16(p, stride, thresh);
      }
    }
    646 
    647 //------------------------------------------------------------------------------
    648 // Complex In-loop filtering (Paragraph 15.3)
    649 
    650 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
    651                            const uint8x16_t q0, const uint8x16_t q1,
    652                            int hev_thresh) {
    653   const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
    654   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
    655   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
    656   const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v);
    657   const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v);
    658   const uint8x16_t mask = vorrq_u8(mask1, mask2);
    659   return mask;
    660 }
    661 
    662 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
    663                                const uint8x16_t p1, const uint8x16_t p0,
    664                                const uint8x16_t q0, const uint8x16_t q1,
    665                                const uint8x16_t q2, const uint8x16_t q3,
    666                                int ithresh, int thresh) {
    667   const uint8x16_t ithresh_v = vdupq_n_u8((uint8_t)ithresh);
    668   const uint8x16_t a_p3_p2 = vabdq_u8(p3, p2);  // abs(p3 - p2)
    669   const uint8x16_t a_p2_p1 = vabdq_u8(p2, p1);  // abs(p2 - p1)
    670   const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0);  // abs(p1 - p0)
    671   const uint8x16_t a_q3_q2 = vabdq_u8(q3, q2);  // abs(q3 - q2)
    672   const uint8x16_t a_q2_q1 = vabdq_u8(q2, q1);  // abs(q2 - q1)
    673   const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0);  // abs(q1 - q0)
    674   const uint8x16_t max1 = vmaxq_u8(a_p3_p2, a_p2_p1);
    675   const uint8x16_t max2 = vmaxq_u8(a_p1_p0, a_q3_q2);
    676   const uint8x16_t max3 = vmaxq_u8(a_q2_q1, a_q1_q0);
    677   const uint8x16_t max12 = vmaxq_u8(max1, max2);
    678   const uint8x16_t max123 = vmaxq_u8(max12, max3);
    679   const uint8x16_t mask2 = vcgeq_u8(ithresh_v, max123);
    680   const uint8x16_t mask1 = NeedsFilter(p1, p0, q0, q1, thresh);
    681   const uint8x16_t mask = vandq_u8(mask1, mask2);
    682   return mask;
    683 }
    684 
    685 //  4-points filter
    686 
    687 static void ApplyFilter4(
    688     const int8x16_t p1, const int8x16_t p0,
    689     const int8x16_t q0, const int8x16_t q1,
    690     const int8x16_t delta0,
    691     uint8x16_t* const op1, uint8x16_t* const op0,
    692     uint8x16_t* const oq0, uint8x16_t* const oq1) {
    693   const int8x16_t kCst3 = vdupq_n_s8(0x03);
    694   const int8x16_t kCst4 = vdupq_n_s8(0x04);
    695   const int8x16_t delta1 = vqaddq_s8(delta0, kCst4);
    696   const int8x16_t delta2 = vqaddq_s8(delta0, kCst3);
    697   const int8x16_t a1 = vshrq_n_s8(delta1, 3);
    698   const int8x16_t a2 = vshrq_n_s8(delta2, 3);
    699   const int8x16_t a3 = vrshrq_n_s8(a1, 1);   // a3 = (a1 + 1) >> 1
    700   *op0 = FlipSignBack(vqaddq_s8(p0, a2));  // clip(p0 + a2)
    701   *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - a1)
    702   *op1 = FlipSignBack(vqaddq_s8(p1, a3));  // clip(p1 + a3)
    703   *oq1 = FlipSignBack(vqsubq_s8(q1, a3));  // clip(q1 - a3)
    704 }
    705 
    706 static void DoFilter4(
    707     const uint8x16_t p1, const uint8x16_t p0,
    708     const uint8x16_t q0, const uint8x16_t q1,
    709     const uint8x16_t mask, const uint8x16_t hev_mask,
    710     uint8x16_t* const op1, uint8x16_t* const op0,
    711     uint8x16_t* const oq0, uint8x16_t* const oq1) {
    712   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
    713   const int8x16_t p1s = FlipSign(p1);
    714   int8x16_t p0s = FlipSign(p0);
    715   int8x16_t q0s = FlipSign(q0);
    716   const int8x16_t q1s = FlipSign(q1);
    717   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
    718 
    719   // do_filter2 part (simple loopfilter on pixels with hev)
    720   {
    721     const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
    722     const int8x16_t simple_lf_delta =
    723         vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
    724     uint8x16_t tmp_p0, tmp_q0;
    725     ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
    726     // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
    727     p0s = FlipSign(tmp_p0);
    728     q0s = FlipSign(tmp_q0);
    729   }
    730 
    731   // do_filter4 part (complex loopfilter on pixels without hev)
    732   {
    733     const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
    734     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    735     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    736     const int8x16_t complex_lf_delta =
    737         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    738     ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
    739   }
    740 }
    741 
    742 //  6-points filter
    743 
    744 static void ApplyFilter6(
    745     const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
    746     const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
    747     const int8x16_t delta,
    748     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    749     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
    750   const int16x8_t kCst63 = vdupq_n_s16(63);
    751   const int8x8_t kCst27 = vdup_n_s8(27);
    752   const int8x8_t kCst18 = vdup_n_s8(18);
    753   const int8x8_t kCst9 = vdup_n_s8(9);
    754   const int8x8_t delta_lo = vget_low_s8(delta);
    755   const int8x8_t delta_hi = vget_high_s8(delta);
    756   const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo);  // 63 + 27 * a
    757   const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi);  // 63 + 27 * a
    758   const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo);  // 63 + 18 * a
    759   const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi);  // 63 + 18 * a
    760   const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo);   // 63 + 9 * a
    761   const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi);   // 63 + 9 * a
    762   const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7);
    763   const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7);
    764   const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7);
    765   const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7);
    766   const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7);
    767   const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7);
    768   const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
    769   const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
    770   const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
    771 
    772   *op0 = FlipSignBack(vqaddq_s8(p0, a1));  // clip(p0 + a1)
    773   *oq0 = FlipSignBack(vqsubq_s8(q0, a1));  // clip(q0 - q1)
    774   *oq1 = FlipSignBack(vqsubq_s8(q1, a2));  // clip(q1 - a2)
    775   *op1 = FlipSignBack(vqaddq_s8(p1, a2));  // clip(p1 + a2)
    776   *oq2 = FlipSignBack(vqsubq_s8(q2, a3));  // clip(q2 - a3)
    777   *op2 = FlipSignBack(vqaddq_s8(p2, a3));  // clip(p2 + a3)
    778 }
    779 
    780 static void DoFilter6(
    781     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
    782     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
    783     const uint8x16_t mask, const uint8x16_t hev_mask,
    784     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
    785     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
    786   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
    787   const int8x16_t p2s = FlipSign(p2);
    788   const int8x16_t p1s = FlipSign(p1);
    789   int8x16_t p0s = FlipSign(p0);
    790   int8x16_t q0s = FlipSign(q0);
    791   const int8x16_t q1s = FlipSign(q1);
    792   const int8x16_t q2s = FlipSign(q2);
    793   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
    794   const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
    795 
    796   // do_filter2 part (simple loopfilter on pixels with hev)
    797   {
    798     const int8x16_t simple_lf_delta =
    799         vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
    800     uint8x16_t tmp_p0, tmp_q0;
    801     ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
    802     // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
    803     p0s = FlipSign(tmp_p0);
    804     q0s = FlipSign(tmp_q0);
    805   }
    806 
    807   // do_filter6 part (complex loopfilter on pixels without hev)
    808   {
    809     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
    810     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
    811     const int8x16_t complex_lf_delta =
    812         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
    813     ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
    814                  op2, op1, op0, oq0, oq1, oq2);
    815   }
    816 }
    817 
    818 // on macroblock edges
    819 
    820 static void VFilter16(uint8_t* p, int stride,
    821                       int thresh, int ithresh, int hev_thresh) {
    822   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    823   Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    824   {
    825     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    826                                          ithresh, thresh);
    827     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    828     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    829     DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    830               &op2, &op1, &op0, &oq0, &oq1, &oq2);
    831     Store16x2(op2, op1, p - 2 * stride, stride);
    832     Store16x2(op0, oq0, p + 0 * stride, stride);
    833     Store16x2(oq1, oq2, p + 2 * stride, stride);
    834   }
    835 }
    836 
    837 static void HFilter16(uint8_t* p, int stride,
    838                       int thresh, int ithresh, int hev_thresh) {
    839   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    840   Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    841   {
    842     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    843                                          ithresh, thresh);
    844     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    845     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    846     DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    847               &op2, &op1, &op0, &oq0, &oq1, &oq2);
    848     Store2x16(op2, op1, p - 2, stride);
    849     Store2x16(op0, oq0, p + 0, stride);
    850     Store2x16(oq1, oq2, p + 2, stride);
    851   }
    852 }
    853 
    854 // on three inner edges
    855 static void VFilter16i(uint8_t* p, int stride,
    856                        int thresh, int ithresh, int hev_thresh) {
    857   uint32_t k;
    858   uint8x16_t p3, p2, p1, p0;
    859   Load16x4(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
    860   for (k = 3; k != 0; --k) {
    861     uint8x16_t q0, q1, q2, q3;
    862     p += 4 * stride;
    863     Load16x4(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
    864     {
    865       const uint8x16_t mask =
    866           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
    867       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    868       // p3 and p2 are not just temporary variables here: they will be
    869       // re-used for next span. And q2/q3 will become p1/p0 accordingly.
    870       DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
    871       Store16x4(p1, p0, p3, p2, p, stride);
    872       p1 = q2;
    873       p0 = q3;
    874     }
    875   }
    876 }
    877 
    878 #if !defined(WORK_AROUND_GCC)
    879 static void HFilter16i(uint8_t* p, int stride,
    880                        int thresh, int ithresh, int hev_thresh) {
    881   uint32_t k;
    882   uint8x16_t p3, p2, p1, p0;
    883   Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
    884   for (k = 3; k != 0; --k) {
    885     uint8x16_t q0, q1, q2, q3;
    886     p += 4;
    887     Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
    888     {
    889       const uint8x16_t mask =
    890           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
    891       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    892       DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
    893       Store4x16(p1, p0, p3, p2, p, stride);
    894       p1 = q2;
    895       p0 = q3;
    896     }
    897   }
    898 }
    899 #endif  // !WORK_AROUND_GCC
    900 
    901 // 8-pixels wide variant, for chroma filtering
    902 static void VFilter8(uint8_t* u, uint8_t* v, int stride,
    903                      int thresh, int ithresh, int hev_thresh) {
    904   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    905   Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    906   {
    907     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    908                                          ithresh, thresh);
    909     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    910     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    911     DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    912               &op2, &op1, &op0, &oq0, &oq1, &oq2);
    913     Store8x2x2(op2, op1, u - 2 * stride, v - 2 * stride, stride);
    914     Store8x2x2(op0, oq0, u + 0 * stride, v + 0 * stride, stride);
    915     Store8x2x2(oq1, oq2, u + 2 * stride, v + 2 * stride, stride);
    916   }
    917 }
    918 static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
    919                       int thresh, int ithresh, int hev_thresh) {
    920   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    921   u += 4 * stride;
    922   v += 4 * stride;
    923   Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    924   {
    925     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    926                                          ithresh, thresh);
    927     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    928     uint8x16_t op1, op0, oq0, oq1;
    929     DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    930     Store8x4x2(op1, op0, oq0, oq1, u, v, stride);
    931   }
    932 }
    933 
    934 #if !defined(WORK_AROUND_GCC)
    935 static void HFilter8(uint8_t* u, uint8_t* v, int stride,
    936                      int thresh, int ithresh, int hev_thresh) {
    937   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    938   Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    939   {
    940     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    941                                          ithresh, thresh);
    942     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    943     uint8x16_t op2, op1, op0, oq0, oq1, oq2;
    944     DoFilter6(p2, p1, p0, q0, q1, q2, mask, hev_mask,
    945               &op2, &op1, &op0, &oq0, &oq1, &oq2);
    946     Store6x8x2(op2, op1, op0, oq0, oq1, oq2, u, v, stride);
    947   }
    948 }
    949 
    950 static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
    951                       int thresh, int ithresh, int hev_thresh) {
    952   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
    953   u += 4;
    954   v += 4;
    955   Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
    956   {
    957     const uint8x16_t mask = NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3,
    958                                          ithresh, thresh);
    959     const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
    960     uint8x16_t op1, op0, oq0, oq1;
    961     DoFilter4(p1, p0, q0, q1, mask, hev_mask, &op1, &op0, &oq0, &oq1);
    962     Store4x8x2(op1, op0, oq0, oq1, u, v, stride);
    963   }
    964 }
    965 #endif  // !WORK_AROUND_GCC
    966 
    967 //-----------------------------------------------------------------------------
    968 // Inverse transforms (Paragraph 14.4)
    969 
    970 // Technically these are unsigned but vqdmulh is only available in signed.
    971 // vqdmulh returns high half (effectively >> 16) but also doubles the value,
    972 // changing the >> 16 to >> 15 and requiring an additional >> 1.
    973 // We use this to our advantage with kC2. The canonical value is 35468.
    974 // However, the high bit is set so treating it as signed will give incorrect
    975 // results. We avoid this by down shifting by 1 here to clear the highest bit.
    976 // Combined with the doubling effect of vqdmulh we get >> 16.
    977 // This can not be applied to kC1 because the lowest bit is set. Down shifting
    978 // the constant would reduce precision.
    979 
    980 // libwebp uses a trick to avoid some extra addition that libvpx does.
    981 // Instead of:
    982 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
    983 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
    984 // same issue with kC1 and vqdmulh that we work around by down shifting kC2
    985 
        // kC1 is stored without the (1 << 16) term; the NEON code adds the input
        // back after the multiplication instead (see comment above).
    986 static const int16_t kC1 = 20091;
    987 static const int16_t kC2 = 17734;  // half of the canonical 35468. See comment above.
    988 
    989 #if defined(USE_INTRINSICS)
    990 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
    991                                      int16x8x2_t* const out) {
    992   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
    993   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
    994   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
    995                                                   // b0 d0 b1 d1 b2 d2 ...
    996   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
    997 }
    998 
    999 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
   1000   // {rows} = in0 | in4
   1001   //          in8 | in12
   1002   // B1 = in4 | in12
   1003   const int16x8_t B1 =
   1004       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
   1005   // C0 = kC1 * in4 | kC1 * in12
   1006   // C1 = kC2 * in4 | kC2 * in12
   1007   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
   1008   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
   1009   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
   1010                                 vget_low_s16(rows->val[1]));   // in0 + in8
   1011   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
   1012                                 vget_low_s16(rows->val[1]));   // in0 - in8
   1013   // c = kC2 * in4 - kC1 * in12
   1014   // d = kC1 * in4 + kC2 * in12
   1015   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
   1016   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
   1017   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
   1018   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
   1019   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
   1020   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
   1021   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
   1022   Transpose8x2(E0, E1, rows);
   1023 }
   1024 
   1025 static void TransformOne(const int16_t* in, uint8_t* dst) {
   1026   int16x8x2_t rows;
   1027   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
   1028   TransformPass(&rows);
   1029   TransformPass(&rows);
   1030   Add4x4(rows.val[0], rows.val[1], dst);
   1031 }
   1032 
   1033 #else
   1034 
   1035 static void TransformOne(const int16_t* in, uint8_t* dst) {
   1036   const int kBPS = BPS;
   1037   // kC1, kC2. Padded because vld1.16 loads 8 bytes
   1038   const int16_t constants[4] = { kC1, kC2, 0, 0 };
   1039   /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
   1040   __asm__ volatile (
   1041     "vld1.16         {q1, q2}, [%[in]]           \n"
   1042     "vld1.16         {d0}, [%[constants]]        \n"
   1043 
   1044     /* d2: in[0]
   1045      * d3: in[8]
   1046      * d4: in[4]
   1047      * d5: in[12]
   1048      */
   1049     "vswp            d3, d4                      \n"
   1050 
   1051     /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
   1052      * q9 = {in[4], in[12]} * kC2 >> 16
   1053      */
   1054     "vqdmulh.s16     q8, q2, d0[0]               \n"
   1055     "vqdmulh.s16     q9, q2, d0[1]               \n"
   1056 
   1057     /* d22 = a = in[0] + in[8]
   1058      * d23 = b = in[0] - in[8]
   1059      */
   1060     "vqadd.s16       d22, d2, d3                 \n"
   1061     "vqsub.s16       d23, d2, d3                 \n"
   1062 
   1063     /* The multiplication should be x * kC1 >> 16
   1064      * However, with vqdmulh we get x * kC1 * 2 >> 16
   1065      * (multiply, double, return high half)
   1066      * We avoided this in kC2 by pre-shifting the constant.
   1067      * q8 = in[4]/[12] * kC1 >> 16
   1068      */
   1069     "vshr.s16        q8, q8, #1                  \n"
   1070 
   1071     /* Add {in[4], in[12]} back after the multiplication. This is handled by
   1072      * adding 1 << 16 to kC1 in the libwebp C code.
   1073      */
   1074     "vqadd.s16       q8, q2, q8                  \n"
   1075 
   1076     /* d20 = c = in[4]*kC2 - in[12]*kC1
   1077      * d21 = d = in[4]*kC1 + in[12]*kC2
   1078      */
   1079     "vqsub.s16       d20, d18, d17               \n"
   1080     "vqadd.s16       d21, d19, d16               \n"
   1081 
   1082     /* d2 = tmp[0] = a + d
   1083      * d3 = tmp[1] = b + c
   1084      * d4 = tmp[2] = b - c
   1085      * d5 = tmp[3] = a - d
   1086      */
   1087     "vqadd.s16       d2, d22, d21                \n"
   1088     "vqadd.s16       d3, d23, d20                \n"
   1089     "vqsub.s16       d4, d23, d20                \n"
   1090     "vqsub.s16       d5, d22, d21                \n"
   1091 
   1092     "vzip.16         q1, q2                      \n"
   1093     "vzip.16         q1, q2                      \n"
   1094 
   1095     "vswp            d3, d4                      \n"
   1096 
   1097     /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
   1098      * q9 = {tmp[4], tmp[12]} * kC2 >> 16
   1099      */
   1100     "vqdmulh.s16     q8, q2, d0[0]               \n"
   1101     "vqdmulh.s16     q9, q2, d0[1]               \n"
   1102 
   1103     /* d22 = a = tmp[0] + tmp[8]
   1104      * d23 = b = tmp[0] - tmp[8]
   1105      */
   1106     "vqadd.s16       d22, d2, d3                 \n"
   1107     "vqsub.s16       d23, d2, d3                 \n"
   1108 
   1109     /* See long winded explanations prior */
   1110     "vshr.s16        q8, q8, #1                  \n"
   1111     "vqadd.s16       q8, q2, q8                  \n"
   1112 
   1113     /* d20 = c = in[4]*kC2 - in[12]*kC1
   1114      * d21 = d = in[4]*kC1 + in[12]*kC2
   1115      */
   1116     "vqsub.s16       d20, d18, d17               \n"
   1117     "vqadd.s16       d21, d19, d16               \n"
   1118 
   1119     /* d2 = tmp[0] = a + d
   1120      * d3 = tmp[1] = b + c
   1121      * d4 = tmp[2] = b - c
   1122      * d5 = tmp[3] = a - d
   1123      */
   1124     "vqadd.s16       d2, d22, d21                \n"
   1125     "vqadd.s16       d3, d23, d20                \n"
   1126     "vqsub.s16       d4, d23, d20                \n"
   1127     "vqsub.s16       d5, d22, d21                \n"
   1128 
   1129     "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
   1130     "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
   1131     "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
   1132     "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
   1133 
   1134     "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
   1135 
   1136     /* (val) + 4 >> 3 */
   1137     "vrshr.s16       d2, d2, #3                  \n"
   1138     "vrshr.s16       d3, d3, #3                  \n"
   1139     "vrshr.s16       d4, d4, #3                  \n"
   1140     "vrshr.s16       d5, d5, #3                  \n"
   1141 
   1142     "vzip.16         q1, q2                      \n"
   1143     "vzip.16         q1, q2                      \n"
   1144 
   1145     /* Must accumulate before saturating */
   1146     "vmovl.u8        q8, d6                      \n"
   1147     "vmovl.u8        q9, d7                      \n"
   1148 
   1149     "vqadd.s16       q1, q1, q8                  \n"
   1150     "vqadd.s16       q2, q2, q9                  \n"
   1151 
   1152     "vqmovun.s16     d0, q1                      \n"
   1153     "vqmovun.s16     d1, q2                      \n"
   1154 
   1155     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
   1156     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
   1157     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
   1158     "vst1.32         d1[1], [%[dst]]             \n"
   1159 
   1160     : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
   1161     : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
   1162     : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  /* clobbered */
   1163   );
   1164 }
   1165 
   1166 #endif    // USE_INTRINSICS
   1167 
   // Inverse-transforms one block, or two adjacent ones when 'do_two' is
   // non-zero. The second block reads from in + 16 and writes to dst + 4.
   static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
     const int num_blocks = do_two ? 2 : 1;
     int i;
     for (i = 0; i < num_blocks; ++i) {
       TransformOne(in + 16 * i, dst + 4 * i);
     }
   }
   1174 
   1175 static void TransformDC(const int16_t* in, uint8_t* dst) {
   1176   const int16x8_t DC = vdupq_n_s16(in[0]);
   1177   Add4x4(DC, DC, dst);
   1178 }
   1179 
   1180 //------------------------------------------------------------------------------
   1181 
   1182 #define STORE_WHT(dst, col, rows) do {                  \
   1183   *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
   1184   *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
   1185   *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
   1186   *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
   1187 } while (0)
   1188 
   1189 static void TransformWHT(const int16_t* in, int16_t* out) {
   1190   int32x4x4_t tmp;
   1191 
   1192   {
   1193     // Load the source.
   1194     const int16x4_t in00_03 = vld1_s16(in + 0);
   1195     const int16x4_t in04_07 = vld1_s16(in + 4);
   1196     const int16x4_t in08_11 = vld1_s16(in + 8);
   1197     const int16x4_t in12_15 = vld1_s16(in + 12);
   1198     const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
   1199     const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
   1200     const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
   1201     const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
   1202     tmp.val[0] = vaddq_s32(a0, a1);
   1203     tmp.val[1] = vaddq_s32(a3, a2);
   1204     tmp.val[2] = vsubq_s32(a0, a1);
   1205     tmp.val[3] = vsubq_s32(a3, a2);
   1206     // Arrange the temporary results column-wise.
   1207     tmp = Transpose4x4(tmp);
   1208   }
   1209 
   1210   {
   1211     const int32x4_t kCst3 = vdupq_n_s32(3);
   1212     const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
   1213     const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
   1214     const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
   1215     const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
   1216     const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
   1217 
   1218     tmp.val[0] = vaddq_s32(a0, a1);
   1219     tmp.val[1] = vaddq_s32(a3, a2);
   1220     tmp.val[2] = vsubq_s32(a0, a1);
   1221     tmp.val[3] = vsubq_s32(a3, a2);
   1222 
   1223     // right shift the results by 3.
   1224     tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
   1225     tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
   1226     tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
   1227     tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
   1228 
   1229     STORE_WHT(out, 0, tmp);
   1230     STORE_WHT(out, 1, tmp);
   1231     STORE_WHT(out, 2, tmp);
   1232     STORE_WHT(out, 3, tmp);
   1233   }
   1234 }
   1235 
   1236 #undef STORE_WHT
   1237 
   1238 //------------------------------------------------------------------------------
   1239 
   1240 #define MUL(a, b) (((a) * (b)) >> 16)
   1241 static void TransformAC3(const int16_t* in, uint8_t* dst) {
   1242   static const int kC1_full = 20091 + (1 << 16);
   1243   static const int kC2_full = 35468;
   1244   const int16x4_t A = vdup_n_s16(in[0]);
   1245   const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
   1246   const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
   1247   const int c1 = MUL(in[1], kC2_full);
   1248   const int d1 = MUL(in[1], kC1_full);
   1249   const uint64_t cd = (uint64_t)( d1 & 0xffff) <<  0 |
   1250                       (uint64_t)( c1 & 0xffff) << 16 |
   1251                       (uint64_t)(-c1 & 0xffff) << 32 |
   1252                       (uint64_t)(-d1 & 0xffff) << 48;
   1253   const int16x4_t CD = vcreate_s16(cd);
   1254   const int16x4_t B = vqadd_s16(A, CD);
   1255   const int16x8_t m0_m1 = vcombine_s16(vqadd_s16(B, d4), vqadd_s16(B, c4));
   1256   const int16x8_t m2_m3 = vcombine_s16(vqsub_s16(B, c4), vqsub_s16(B, d4));
   1257   Add4x4(m0_m1, m2_m3, dst);
   1258 }
   1259 #undef MUL
   1260 
   1261 #endif   // WEBP_USE_NEON
   1262 
   1263 //------------------------------------------------------------------------------
   1264 // Entry point
   1265 
   extern void VP8DspInitNEON(void);

   // Points the decoder's DSP function pointers at the NEON implementations
   // from this file. Does nothing when WEBP_USE_NEON is not defined.
   void VP8DspInitNEON(void) {
   #if defined(WEBP_USE_NEON)
     // Inverse transforms.
     VP8Transform = TransformTwo;
     VP8TransformAC3 = TransformAC3;
     VP8TransformDC = TransformDC;
     VP8TransformWHT = TransformWHT;

     // Complex loop filters.
     VP8VFilter16 = VFilter16;
     VP8VFilter16i = VFilter16i;
     VP8HFilter16 = HFilter16;
     VP8VFilter8 = VFilter8;
     VP8VFilter8i = VFilter8i;
   #if !defined(WORK_AROUND_GCC)
     // These three are only compiled when WORK_AROUND_GCC is not defined.
     VP8HFilter16i = HFilter16i;
     VP8HFilter8 = HFilter8;
     VP8HFilter8i = HFilter8i;
   #endif

     // Simple loop filters.
     VP8SimpleVFilter16 = SimpleVFilter16;
     VP8SimpleHFilter16 = SimpleHFilter16;
     VP8SimpleVFilter16i = SimpleVFilter16i;
     VP8SimpleHFilter16i = SimpleHFilter16i;
   #endif   // WEBP_USE_NEON
   }
   1293