Home | History | Annotate | Download | only in dsp
      1 // Copyright 2012 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // ARM NEON version of speed-critical encoding functions.
     11 //
     12 // adapted from libvpx (http://www.webmproject.org/code/)
     13 
     14 #include "./dsp.h"
     15 
     16 #if defined(WEBP_USE_NEON)
     17 
     18 #include <assert.h>
     19 
     20 #include "./neon.h"
     21 #include "../enc/vp8enci.h"
     22 
     23 //------------------------------------------------------------------------------
     24 // Transforms (Paragraph 14.4)
     25 
     26 // Inverse transform.
     27 // This code is pretty much the same as TransformOne in the dec_neon.c, except
     28 // for subtraction to *ref. See the comments there for algorithmic explanations.
     29 
     30 static const int16_t kC1 = 20091;  // applied below as x + ((x * kC1) >> 16).
     31 static const int16_t kC2 = 17734;  // half of kC2, actually: vqdmulh doubles the product.
     32 
     33 // With gcc-4.6 this intrinsics code works but is *slower* than the
     34 // inlined-asm version below, so it is only compiled when
     35 // WEBP_USE_INTRINSICS is defined.
     36 // With gcc-4.8, it is a little faster than the inlined assembly.
     37 #if defined(WEBP_USE_INTRINSICS)
     38 
     39 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
     40 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
     41   return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
     42 }
     43 
     44 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
     45 // to the corresponding rows of 'dst'.
     46 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
     47                                             const int16x8_t dst01,
     48                                             const int16x8_t dst23) {
     49   // Unsigned saturate to 8b.
     50   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
     51   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
     52 
     53   // Store the results.
     54   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
     55   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
     56   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
     57   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
     58 }
     59 
     60 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
     61                                const uint8_t* const ref, uint8_t* const dst) {
     62   uint32x2_t dst01 = vdup_n_u32(0);
     63   uint32x2_t dst23 = vdup_n_u32(0);
     64 
     65   // Load the source pixels.
     66   dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
     67   dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
     68   dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
     69   dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
     70 
     71   {
     72     // Convert to 16b.
     73     const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
     74     const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
     75 
     76     // Descale with rounding.
     77     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
     78     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
     79     // Add the inverse transform.
     80     SaturateAndStore4x4(dst, out01, out23);
     81   }
     82 }
     83 
     84 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
     85                                      int16x8x2_t* const out) {
     86   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
     87   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
     88   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
     89                                                   // b0 d0 b1 d1 b2 d2 ...
     90   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
     91 }
     92 
     93 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
     94   // {rows} = in0 | in4
     95   //          in8 | in12
     96   // B1 = in4 | in12
     97   const int16x8_t B1 =
     98       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
     99   // C0 = kC1 * in4 | kC1 * in12
    100   // C1 = kC2 * in4 | kC2 * in12
    101   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
    102   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
    103   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
    104                                 vget_low_s16(rows->val[1]));   // in0 + in8
    105   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
    106                                 vget_low_s16(rows->val[1]));   // in0 - in8
    107   // c = kC2 * in4 - kC1 * in12
    108   // d = kC1 * in4 + kC2 * in12
    109   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
    110   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
    111   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
    112   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
    113   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
    114   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
    115   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
    116   Transpose8x2(E0, E1, rows);
    117 }
    118 
    119 static void ITransformOne(const uint8_t* ref,
    120                           const int16_t* in, uint8_t* dst) {
    121   int16x8x2_t rows;
    122   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
    123   TransformPass(&rows);
    124   TransformPass(&rows);
    125   Add4x4(rows.val[0], rows.val[1], ref, dst);
    126 }
    127 
    128 #else
    129 
    130 static void ITransformOne(const uint8_t* ref,
    131                           const int16_t* in, uint8_t* dst) {
    132   const int kBPS = BPS;
    133   const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
    134 
    135   __asm__ volatile (
    136     "vld1.16         {q1, q2}, [%[in]]           \n"
    137     "vld1.16         {d0}, [%[kC1C2]]            \n"
    138 
    139     // d2: in[0]
    140     // d3: in[8]
    141     // d4: in[4]
    142     // d5: in[12]
    143     "vswp            d3, d4                      \n"
    144 
    145     // q8 = {in[4], in[12]} * kC1 * 2 >> 16
    146     // q9 = {in[4], in[12]} * kC2 >> 16
    147     "vqdmulh.s16     q8, q2, d0[0]               \n"
    148     "vqdmulh.s16     q9, q2, d0[1]               \n"
    149 
    150     // d22 = a = in[0] + in[8]
    151     // d23 = b = in[0] - in[8]
    152     "vqadd.s16       d22, d2, d3                 \n"
    153     "vqsub.s16       d23, d2, d3                 \n"
    154 
    155     //  q8 = in[4]/[12] * kC1 >> 16
    156     "vshr.s16        q8, q8, #1                  \n"
    157 
    158     // Add {in[4], in[12]} back after the multiplication.
    159     "vqadd.s16       q8, q2, q8                  \n"
    160 
    161     // d20 = c = in[4]*kC2 - in[12]*kC1
    162     // d21 = d = in[4]*kC1 + in[12]*kC2
    163     "vqsub.s16       d20, d18, d17               \n"
    164     "vqadd.s16       d21, d19, d16               \n"
    165 
    166     // d2 = tmp[0] = a + d
    167     // d3 = tmp[1] = b + c
    168     // d4 = tmp[2] = b - c
    169     // d5 = tmp[3] = a - d
    170     "vqadd.s16       d2, d22, d21                \n"
    171     "vqadd.s16       d3, d23, d20                \n"
    172     "vqsub.s16       d4, d23, d20                \n"
    173     "vqsub.s16       d5, d22, d21                \n"
    174 
    175     "vzip.16         q1, q2                      \n"
    176     "vzip.16         q1, q2                      \n"
    177 
    178     "vswp            d3, d4                      \n"
    179 
    180     // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
    181     // q9 = {tmp[4], tmp[12]} * kC2 >> 16
    182     "vqdmulh.s16     q8, q2, d0[0]               \n"
    183     "vqdmulh.s16     q9, q2, d0[1]               \n"
    184 
    185     // d22 = a = tmp[0] + tmp[8]
    186     // d23 = b = tmp[0] - tmp[8]
    187     "vqadd.s16       d22, d2, d3                 \n"
    188     "vqsub.s16       d23, d2, d3                 \n"
    189 
    190     "vshr.s16        q8, q8, #1                  \n"
    191     "vqadd.s16       q8, q2, q8                  \n"
    192 
    193     // d20 = c = in[4]*kC2 - in[12]*kC1
    194     // d21 = d = in[4]*kC1 + in[12]*kC2
    195     "vqsub.s16       d20, d18, d17               \n"
    196     "vqadd.s16       d21, d19, d16               \n"
    197 
    198     // d2 = tmp[0] = a + d
    199     // d3 = tmp[1] = b + c
    200     // d4 = tmp[2] = b - c
    201     // d5 = tmp[3] = a - d
    202     "vqadd.s16       d2, d22, d21                \n"
    203     "vqadd.s16       d3, d23, d20                \n"
    204     "vqsub.s16       d4, d23, d20                \n"
    205     "vqsub.s16       d5, d22, d21                \n"
    206 
    207     "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
    208     "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
    209     "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
    210     "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
    211 
    212     "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
    213 
    214     // (val) + 4 >> 3
    215     "vrshr.s16       d2, d2, #3                  \n"
    216     "vrshr.s16       d3, d3, #3                  \n"
    217     "vrshr.s16       d4, d4, #3                  \n"
    218     "vrshr.s16       d5, d5, #3                  \n"
    219 
    220     "vzip.16         q1, q2                      \n"
    221     "vzip.16         q1, q2                      \n"
    222 
    223     // Must accumulate before saturating
    224     "vmovl.u8        q8, d6                      \n"
    225     "vmovl.u8        q9, d7                      \n"
    226 
    227     "vqadd.s16       q1, q1, q8                  \n"
    228     "vqadd.s16       q2, q2, q9                  \n"
    229 
    230     "vqmovun.s16     d0, q1                      \n"
    231     "vqmovun.s16     d1, q2                      \n"
    232 
    233     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
    234     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
    235     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
    236     "vst1.32         d1[1], [%[dst]]             \n"
    237 
    238     : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
    239     : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
    240     : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
    241   );
    242 }
    243 
    244 #endif    // WEBP_USE_INTRINSICS
    245 
    // Applies ITransformOne() to the block at (ref, in, dst) and, when 'do_two'
    // is non-zero, also to the next block: 4 pixels to the right in 'ref' and
    // 'dst', and 16 coefficients further in 'in'.
    static void ITransform(const uint8_t* ref,
                           const int16_t* in, uint8_t* dst, int do_two) {
      const int num_blocks = do_two ? 2 : 1;
      int n;
      for (n = 0; n < num_blocks; ++n) {
        ITransformOne(ref + 4 * n, in + 16 * n, dst + 4 * n);
      }
    }
    253 
    254 // Load all 4x4 pixels into a single uint8x16_t variable.
    255 static uint8x16_t Load4x4(const uint8_t* src) {
    256   uint32x4_t out = vdupq_n_u32(0);
    257   out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
    258   out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
    259   out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
    260   out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
    261   return vreinterpretq_u8_u32(out);
    262 }
    263 
    264 // Forward transform.
    265 
    266 #if defined(WEBP_USE_INTRINSICS)
    267 
    268 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
    269                                          const int16x4_t C, const int16x4_t D,
    270                                          int16x8_t* const out01,
    271                                          int16x8_t* const out32) {
    272   const int16x4x2_t AB = vtrn_s16(A, B);
    273   const int16x4x2_t CD = vtrn_s16(C, D);
    274   const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
    275                                      vreinterpret_s32_s16(CD.val[0]));
    276   const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
    277                                      vreinterpret_s32_s16(CD.val[1]));
    278   *out01 = vreinterpretq_s16_s64(
    279       vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
    280                    vreinterpret_s64_s32(tmp13.val[0])));
    281   *out32 = vreinterpretq_s16_s64(
    282       vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
    283                    vreinterpret_s64_s32(tmp02.val[1])));
    284 }
    285 
    286 static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
    287                                          const uint8x8_t b) {
    288   return vreinterpretq_s16_u16(vsubl_u8(a, b));
    289 }
    290 
    291 static void FTransform(const uint8_t* src, const uint8_t* ref,
    292                        int16_t* out) {
    293   int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
    294   {
    295     const uint8x16_t S0 = Load4x4(src);
    296     const uint8x16_t R0 = Load4x4(ref);
    297     const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
    298     const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
    299     const int16x4_t D0 = vget_low_s16(D0D1);
    300     const int16x4_t D1 = vget_high_s16(D0D1);
    301     const int16x4_t D2 = vget_low_s16(D2D3);
    302     const int16x4_t D3 = vget_high_s16(D2D3);
    303     Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
    304   }
    305   {    // 1rst pass
    306     const int32x4_t kCst937 = vdupq_n_s32(937);
    307     const int32x4_t kCst1812 = vdupq_n_s32(1812);
    308     const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    309     const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    310     const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
    311     const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
    312                                     vget_high_s16(a0a1_2));
    313     const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
    314                                     vget_high_s16(a0a1_2));
    315     const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    316     const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    317     const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    318     const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    319     const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
    320     const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
    321     Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
    322   }
    323   {    // 2nd pass
    324     // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
    325     const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
    326     const int32x4_t kCst51000 = vdupq_n_s32(51000);
    327     const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
    328     const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
    329     const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
    330     const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
    331     const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
    332     const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
    333     const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
    334     const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
    335     const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
    336     const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
    337     const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
    338     const int16x4_t a3_eq_0 =
    339         vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
    340     const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
    341     vst1_s16(out +  0, out0);
    342     vst1_s16(out +  4, out1);
    343     vst1_s16(out +  8, out2);
    344     vst1_s16(out + 12, out3);
    345   }
    346 }
    347 
    348 #else
    349 
    350 // adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
    351 static const int16_t kCoeff16[] = {
    352   5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
    353 };
    354 static const int32_t kCoeff32[] = {
    355    1812,  1812,  1812,  1812,
    356     937,   937,   937,   937,
    357   12000, 12000, 12000, 12000,
    358   51000, 51000, 51000, 51000
    359 };
    360 
    361 static void FTransform(const uint8_t* src, const uint8_t* ref,
    362                        int16_t* out) {
    363   const int kBPS = BPS;
    364   const uint8_t* src_ptr = src;
    365   const uint8_t* ref_ptr = ref;
    366   const int16_t* coeff16 = kCoeff16;
    367   const int32_t* coeff32 = kCoeff32;
    368 
    369   __asm__ volatile (
    370     // load src into q4, q5 in high half
    371     "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
    372     "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
    373     "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
    374     "vld1.8 {d11}, [%[src_ptr]]               \n"
    375 
    376     // load ref into q6, q7 in high half
    377     "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
    378     "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
    379     "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
    380     "vld1.8 {d15}, [%[ref_ptr]]               \n"
    381 
    382     // Pack the high values in to q4 and q6
    383     "vtrn.32     q4, q5                       \n"
    384     "vtrn.32     q6, q7                       \n"
    385 
    386     // d[0-3] = src - ref
    387     "vsubl.u8    q0, d8, d12                  \n"
    388     "vsubl.u8    q1, d9, d13                  \n"
    389 
    390     // load coeff16 into q8(d16=5352, d17=2217)
    391     "vld1.16     {q8}, [%[coeff16]]           \n"
    392 
    393     // load coeff32 high half into q9 = 1812, q10 = 937
    394     "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
    395 
    396     // load coeff32 low half into q11=12000, q12=51000
    397     "vld1.32     {q11,q12}, [%[coeff32]]      \n"
    398 
    399     // part 1
    400     // Transpose. Register dN is the same as dN in C
    401     "vtrn.32         d0, d2                   \n"
    402     "vtrn.32         d1, d3                   \n"
    403     "vtrn.16         d0, d1                   \n"
    404     "vtrn.16         d2, d3                   \n"
    405 
    406     "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
    407     "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
    408     "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
    409     "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
    410 
    411     "vadd.s16        d0, d4, d5               \n" // a0 + a1
    412     "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
    413     "vsub.s16        d2, d4, d5               \n" // a0 - a1
    414     "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3
    415 
    416     "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
    417     "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
    418     "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
    419     "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
    420 
    421     // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
    422     // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
    423     "vshrn.s32       d1, q9, #9               \n"
    424     "vshrn.s32       d3, q10, #9              \n"
    425 
    426     // part 2
    427     // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
    428     "vtrn.32         d0, d2                   \n"
    429     "vtrn.32         d1, d3                   \n"
    430     "vtrn.16         d0, d1                   \n"
    431     "vtrn.16         d2, d3                   \n"
    432 
    433     "vmov.s16        d26, #7                  \n"
    434 
    435     "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
    436     "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
    437     "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
    438     "vadd.s16        d4, d4, d26              \n" // a1 + 7
    439     "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
    440 
    441     "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
    442     "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
    443 
    444     "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
    445     "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
    446 
    447     "vceq.s16        d4, d7, #0               \n"
    448 
    449     "vshr.s16        d0, d0, #4               \n"
    450     "vshr.s16        d2, d2, #4               \n"
    451 
    452     "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
    453     "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
    454 
    455     "vmvn            d4, d4                   \n" // !(d1 == 0)
    456     // op[4] = (c1*2217 + d1*5352 + 12000)>>16
    457     "vshrn.s32       d1, q11, #16             \n"
    458     // op[4] += (d1!=0)
    459     "vsub.s16        d1, d1, d4               \n"
    460     // op[12]= (d1*2217 - c1*5352 + 51000)>>16
    461     "vshrn.s32       d3, q12, #16             \n"
    462 
    463     // set result to out array
    464     "vst1.16         {q0, q1}, [%[out]]   \n"
    465     : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
    466       [coeff32] "+r"(coeff32)          // modified registers
    467     : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
    468       [out] "r"(out)                   // constants
    469     : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
    470       "q10", "q11", "q12", "q13"       // clobbered
    471   );
    472 }
    473 
    474 #endif
    475 
    476 #define LOAD_LANE_16b(VALUE, LANE) do {             \
    477   (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
    478   src += stride;                                    \
    479 } while (0)
    480 
    481 static void FTransformWHT(const int16_t* src, int16_t* out) {
    482   const int stride = 16;
    483   const int16x4_t zero = vdup_n_s16(0);
    484   int32x4x4_t tmp0;
    485   int16x4x4_t in;
    486   INIT_VECTOR4(in, zero, zero, zero, zero);
    487   LOAD_LANE_16b(in.val[0], 0);
    488   LOAD_LANE_16b(in.val[1], 0);
    489   LOAD_LANE_16b(in.val[2], 0);
    490   LOAD_LANE_16b(in.val[3], 0);
    491   LOAD_LANE_16b(in.val[0], 1);
    492   LOAD_LANE_16b(in.val[1], 1);
    493   LOAD_LANE_16b(in.val[2], 1);
    494   LOAD_LANE_16b(in.val[3], 1);
    495   LOAD_LANE_16b(in.val[0], 2);
    496   LOAD_LANE_16b(in.val[1], 2);
    497   LOAD_LANE_16b(in.val[2], 2);
    498   LOAD_LANE_16b(in.val[3], 2);
    499   LOAD_LANE_16b(in.val[0], 3);
    500   LOAD_LANE_16b(in.val[1], 3);
    501   LOAD_LANE_16b(in.val[2], 3);
    502   LOAD_LANE_16b(in.val[3], 3);
    503 
    504   {
    505     // a0 = in[0 * 16] + in[2 * 16]
    506     // a1 = in[1 * 16] + in[3 * 16]
    507     // a2 = in[1 * 16] - in[3 * 16]
    508     // a3 = in[0 * 16] - in[2 * 16]
    509     const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
    510     const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
    511     const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
    512     const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
    513     tmp0.val[0] = vaddq_s32(a0, a1);
    514     tmp0.val[1] = vaddq_s32(a3, a2);
    515     tmp0.val[2] = vsubq_s32(a3, a2);
    516     tmp0.val[3] = vsubq_s32(a0, a1);
    517   }
    518   {
    519     const int32x4x4_t tmp1 = Transpose4x4(tmp0);
    520     // a0 = tmp[0 + i] + tmp[ 8 + i]
    521     // a1 = tmp[4 + i] + tmp[12 + i]
    522     // a2 = tmp[4 + i] - tmp[12 + i]
    523     // a3 = tmp[0 + i] - tmp[ 8 + i]
    524     const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
    525     const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
    526     const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
    527     const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
    528     const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
    529     const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
    530     const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
    531     const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
    532     const int16x4_t out0 = vmovn_s32(b0);
    533     const int16x4_t out1 = vmovn_s32(b1);
    534     const int16x4_t out2 = vmovn_s32(b2);
    535     const int16x4_t out3 = vmovn_s32(b3);
    536 
    537     vst1_s16(out +  0, out0);
    538     vst1_s16(out +  4, out1);
    539     vst1_s16(out +  8, out2);
    540     vst1_s16(out + 12, out3);
    541   }
    542 }
    543 #undef LOAD_LANE_16b
    544 
    545 //------------------------------------------------------------------------------
    546 // Texture distortion
    547 //
    548 // We try to match the spectral content (weighted) between source and
    549 // reconstructed samples.
    550 
    551 // a 0123, b 0123
    552 // a 4567, b 4567
    553 // a 89ab, b 89ab
    554 // a cdef, b cdef
    555 //
    556 // transpose
    557 //
    558 // a 048c, b 048c
    559 // a 159d, b 159d
    560 // a 26ae, b 26ae
    561 // a 37bf, b 37bf
    562 //
    563 static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) {
    564   const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]);
    565   const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]);
    566   const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]),
    567                                         vreinterpret_u16_u8(d2_tmp1.val[0]));
    568   const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]),
    569                                         vreinterpret_u16_u8(d2_tmp1.val[1]));
    570 
    571   d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]);
    572   d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]);
    573   d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]);
    574   d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]);
    575   return d4_in;
    576 }
    577 
    578 static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
    579   const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
    580   const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
    581   const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
    582                                         vreinterpretq_s32_s16(q2_tmp1.val[0]));
    583   const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
    584                                         vreinterpretq_s32_s16(q2_tmp1.val[1]));
    585   q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
    586   q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
    587   q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
    588   q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
    589   return q4_in;
    590 }
    591 
    592 static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) {
    593   // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
    594   // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
    595   const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0],
    596                                                         d4_in.val[2]));
    597   const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1],
    598                                                         d4_in.val[3]));
    599   const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0],
    600                                                         d4_in.val[2]));
    601   const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1],
    602                                                         d4_in.val[3]));
    603   int16x8x4_t q4_out;
    604   // tmp[0] = a0 + a1
    605   // tmp[1] = a3 + a2
    606   // tmp[2] = a3 - a2
    607   // tmp[3] = a0 - a1
    608   INIT_VECTOR4(q4_out,
    609                vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
    610                vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
    611   return q4_out;
    612 }
    613 
    614 static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) {
    615   const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
    616   const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
    617   const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
    618   const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
    619 
    620   q4_in.val[0] = vaddq_s16(q_a0, q_a1);
    621   q4_in.val[1] = vaddq_s16(q_a3, q_a2);
    622   q4_in.val[2] = vabdq_s16(q_a3, q_a2);
    623   q4_in.val[3] = vabdq_s16(q_a0, q_a1);
    624   q4_in.val[0] = vabsq_s16(q4_in.val[0]);
    625   q4_in.val[1] = vabsq_s16(q4_in.val[1]);
    626   return q4_in;
    627 }
    628 
    629 static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
    630   const uint16x8_t q_w07 = vld1q_u16(&w[0]);
    631   const uint16x8_t q_w8f = vld1q_u16(&w[8]);
    632   int16x4x4_t d4_w;
    633   INIT_VECTOR4(d4_w,
    634                vget_low_s16(vreinterpretq_s16_u16(q_w07)),
    635                vget_high_s16(vreinterpretq_s16_u16(q_w07)),
    636                vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
    637                vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
    638   return d4_w;
    639 }
    640 
    641 static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
    642                                       const int16x4x4_t d4_w) {
    643   int32x2_t d_sum;
    644   // sum += w[ 0] * abs(b0);
    645   // sum += w[ 4] * abs(b1);
    646   // sum += w[ 8] * abs(b2);
    647   // sum += w[12] * abs(b3);
    648   int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
    649   int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
    650   int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
    651   int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
    652   q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
    653   q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
    654   q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
    655   q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
    656 
    657   q_sum0 = vaddq_s32(q_sum0, q_sum1);
    658   q_sum2 = vaddq_s32(q_sum2, q_sum3);
    659   q_sum2 = vaddq_s32(q_sum0, q_sum2);
    660   d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
    661   d_sum = vpadd_s32(d_sum, d_sum);
    662   return d_sum;
    663 }
    664 
    665 #define LOAD_LANE_32b(src, VALUE, LANE) \
    666     (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
    667 
    668 // Hadamard transform
    669 // Returns the weighted sum of the absolute value of transformed coefficients.
    670 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
    671                     const uint16_t* const w) {
    672   uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
    673   uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
    674   uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
    675   uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
    676   uint8x8x4_t d4_in;
    677 
    678   // load data a, b
    679   LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
    680   LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
    681   LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
    682   LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
    683   LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
    684   LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
    685   LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
    686   LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
    687   INIT_VECTOR4(d4_in,
    688                vreinterpret_u8_u32(d_in_ab_0123),
    689                vreinterpret_u8_u32(d_in_ab_4567),
    690                vreinterpret_u8_u32(d_in_ab_89ab),
    691                vreinterpret_u8_u32(d_in_ab_cdef));
    692 
    693   {
    694     // horizontal pass
    695     const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in);
    696     const int16x8x4_t q4_h = DistoHorizontalPass(d4_t);
    697     const int16x4x4_t d4_w = DistoLoadW(w);
    698     // vertical pass
    699     const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h);
    700     const int16x8x4_t q4_v = DistoVerticalPass(q4_t);
    701     int32x2_t d_sum = DistoSum(q4_v, d4_w);
    702 
    703     // abs(sum2 - sum1) >> 5
    704     d_sum = vabs_s32(d_sum);
    705     d_sum  = vshr_n_s32(d_sum, 5);
    706     return vget_lane_s32(d_sum, 0);
    707   }
    708 }
    709 #undef LOAD_LANE_32b
    710 
    711 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
    712                       const uint16_t* const w) {
    713   int D = 0;
    714   int x, y;
    715   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
    716     for (x = 0; x < 16; x += 4) {
    717       D += Disto4x4(a + x + y, b + x + y, w);
    718     }
    719   }
    720   return D;
    721 }
    722 
    723 //------------------------------------------------------------------------------
    724 
    725 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
    726                              int start_block, int end_block,
    727                              VP8Histogram* const histo) {
    728   const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
    729   int j;
    730   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
    731   for (j = start_block; j < end_block; ++j) {
    732     int16_t out[16];
    733     FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
    734     {
    735       int k;
    736       const int16x8_t a0 = vld1q_s16(out + 0);
    737       const int16x8_t b0 = vld1q_s16(out + 8);
    738       const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
    739       const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
    740       const uint16x8_t a2 = vshrq_n_u16(a1, 3);
    741       const uint16x8_t b2 = vshrq_n_u16(b1, 3);
    742       const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
    743       const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
    744       vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
    745       vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
    746       // Convert coefficients to bin.
    747       for (k = 0; k < 16; ++k) {
    748         ++distribution[out[k]];
    749       }
    750     }
    751   }
    752   VP8SetHistogramData(distribution, histo);
    753 }
    754 
    755 //------------------------------------------------------------------------------
    756 
    757 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
    758                                         const uint8_t* const b,
    759                                         uint32x4_t* const sum) {
    760   const uint8x16_t a0 = vld1q_u8(a);
    761   const uint8x16_t b0 = vld1q_u8(b);
    762   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
    763   uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
    764   prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
    765   *sum = vpadalq_u16(*sum, prod);      // pair-wise add and accumulate
    766 }
    767 
    768 // Horizontal sum of all four uint32_t values in 'sum'.
    769 static int SumToInt(uint32x4_t sum) {
    770   const uint64x2_t sum2 = vpaddlq_u32(sum);
    771   const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
    772   return (int)sum3;
    773 }
    774 
    775 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
    776   uint32x4_t sum = vdupq_n_u32(0);
    777   int y;
    778   for (y = 0; y < 16; ++y) {
    779     AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
    780   }
    781   return SumToInt(sum);
    782 }
    783 
    784 static int SSE16x8(const uint8_t* a, const uint8_t* b) {
    785   uint32x4_t sum = vdupq_n_u32(0);
    786   int y;
    787   for (y = 0; y < 8; ++y) {
    788     AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
    789   }
    790   return SumToInt(sum);
    791 }
    792 
    793 static int SSE8x8(const uint8_t* a, const uint8_t* b) {
    794   uint32x4_t sum = vdupq_n_u32(0);
    795   int y;
    796   for (y = 0; y < 8; ++y) {
    797     const uint8x8_t a0 = vld1_u8(a + y * BPS);
    798     const uint8x8_t b0 = vld1_u8(b + y * BPS);
    799     const uint8x8_t abs_diff = vabd_u8(a0, b0);
    800     const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
    801     sum = vpadalq_u16(sum, prod);
    802   }
    803   return SumToInt(sum);
    804 }
    805 
    806 static int SSE4x4(const uint8_t* a, const uint8_t* b) {
    807   const uint8x16_t a0 = Load4x4(a);
    808   const uint8x16_t b0 = Load4x4(b);
    809   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
    810   uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
    811   prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
    812   return SumToInt(vpaddlq_u16(prod));
    813 }
    814 
    815 //------------------------------------------------------------------------------
    816 
    817 // Compilation with gcc-4.6.x is problematic for now.
    818 #if !defined(WORK_AROUND_GCC)
    819 
    820 static int16x8_t Quantize(int16_t* const in,
    821                           const VP8Matrix* const mtx, int offset) {
    822   const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
    823   const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
    824   const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
    825   const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
    826   const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
    827 
    828   const int16x8_t a = vld1q_s16(in + offset);                // in
    829   const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
    830   const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
    831   const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
    832   const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
    833   const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
    834   const uint32x4_t m2 = vhaddq_u32(m0, bias0);
    835   const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
    836   const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
    837                                      vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
    838   const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
    839   const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
    840   const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
    841   const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
    842   vst1q_s16(in + offset, c4);
    843   assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
    844   return c3;
    845 }
    846 
    847 static const uint8_t kShuffles[4][8] = {
    848   { 0,   1,  2,  3,  8,  9, 16, 17 },
    849   { 10, 11,  4,  5,  6,  7, 12, 13 },
    850   { 18, 19, 24, 25, 26, 27, 20, 21 },
    851   { 14, 15, 22, 23, 28, 29, 30, 31 }
    852 };
    853 
    854 static int QuantizeBlock(int16_t in[16], int16_t out[16],
    855                          const VP8Matrix* const mtx) {
    856   const int16x8_t out0 = Quantize(in, mtx, 0);
    857   const int16x8_t out1 = Quantize(in, mtx, 8);
    858   uint8x8x4_t shuffles;
    859   // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
    860   // non-standard versions there.
    861 #if defined(__APPLE__) && defined(__aarch64__) && \
    862     defined(__apple_build_version__) && (__apple_build_version__< 6020037)
    863   uint8x16x2_t all_out;
    864   INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
    865   INIT_VECTOR4(shuffles,
    866                vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
    867                vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
    868                vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
    869                vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
    870 #else
    871   uint8x8x4_t all_out;
    872   INIT_VECTOR4(all_out,
    873                vreinterpret_u8_s16(vget_low_s16(out0)),
    874                vreinterpret_u8_s16(vget_high_s16(out0)),
    875                vreinterpret_u8_s16(vget_low_s16(out1)),
    876                vreinterpret_u8_s16(vget_high_s16(out1)));
    877   INIT_VECTOR4(shuffles,
    878                vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
    879                vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
    880                vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
    881                vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
    882 #endif
    883   // Zigzag reordering
    884   vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
    885   vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
    886   vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
    887   vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
    888   // test zeros
    889   if (*(uint64_t*)(out +  0) != 0) return 1;
    890   if (*(uint64_t*)(out +  4) != 0) return 1;
    891   if (*(uint64_t*)(out +  8) != 0) return 1;
    892   if (*(uint64_t*)(out + 12) != 0) return 1;
    893   return 0;
    894 }
    895 
    896 static int Quantize2Blocks(int16_t in[32], int16_t out[32],
    897                            const VP8Matrix* const mtx) {
    898   int nz;
    899   nz  = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
    900   nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
    901   return nz;
    902 }
    903 
    904 #endif   // !WORK_AROUND_GCC
    905 
    906 //------------------------------------------------------------------------------
    907 // Entry point
    908 
    909 extern void VP8EncDspInitNEON(void);
    910 
    911 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
    912   VP8ITransform = ITransform;
    913   VP8FTransform = FTransform;
    914 
    915   VP8FTransformWHT = FTransformWHT;
    916 
    917   VP8TDisto4x4 = Disto4x4;
    918   VP8TDisto16x16 = Disto16x16;
    919   VP8CollectHistogram = CollectHistogram;
    920   VP8SSE16x16 = SSE16x16;
    921   VP8SSE16x8 = SSE16x8;
    922   VP8SSE8x8 = SSE8x8;
    923   VP8SSE4x4 = SSE4x4;
    924 #if !defined(WORK_AROUND_GCC)
    925   VP8EncQuantizeBlock = QuantizeBlock;
    926   VP8EncQuantize2Blocks = Quantize2Blocks;
    927 #endif
    928 }
    929 
    930 #else  // !WEBP_USE_NEON
    931 
    932 WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
    933 
    934 #endif  // WEBP_USE_NEON
    935