/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vpx_config.h"
#include "vpx_ports/arm.h"

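/*
 * Core VP8 "normal" loop filter, applied to 16 pixels along one edge at a
 * time. q3..q10 carry the eight pixel positions around the edge (p3..q3, one
 * vector per row or transposed column); the filtered p1, p0, q0 and q1 are
 * returned through q5r..q8r.
 */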
static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit,  // flimit
                                        uint8x16_t qlimit,   // limit
                                        uint8x16_t qthresh,  // thresh
                                        uint8x16_t q3,       // p3
                                        uint8x16_t q4,       // p2
                                        uint8x16_t q5,       // p1
                                        uint8x16_t q6,       // p0
                                        uint8x16_t q7,       // q0
                                        uint8x16_t q8,       // q1
                                        uint8x16_t q9,       // q2
                                        uint8x16_t q10,      // q3
                                        uint8x16_t *q5r,     // p1
                                        uint8x16_t *q6r,     // p0
                                        uint8x16_t *q7r,     // q0
                                        uint8x16_t *q8r) {   // q1
  uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8;
  int16x8_t q2s16, q11s16;
  uint16x8_t q4u16;
  int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8;
  int8x8_t d2s8, d3s8;

  q11u8 = vabdq_u8(q3, q4);
  q12u8 = vabdq_u8(q4, q5);
  q13u8 = vabdq_u8(q5, q6);
  q14u8 = vabdq_u8(q8, q7);
  q3 = vabdq_u8(q9, q8);
  q4 = vabdq_u8(q10, q9);

  q11u8 = vmaxq_u8(q11u8, q12u8);
  q12u8 = vmaxq_u8(q13u8, q14u8);
  q3 = vmaxq_u8(q3, q4);
  q15u8 = vmaxq_u8(q11u8, q12u8);

  q9 = vabdq_u8(q6, q7);

  // vp8_hevmask
  q13u8 = vcgtq_u8(q13u8, qthresh);
  q14u8 = vcgtq_u8(q14u8, qthresh);
  q15u8 = vmaxq_u8(q15u8, q3);

  q2u8 = vabdq_u8(q5, q8);
  q9 = vqaddq_u8(q9, q9);

  q15u8 = vcgeq_u8(qlimit, q15u8);

  // vp8_filter() function
  // convert to signed
  q10 = vdupq_n_u8(0x80);
  q8 = veorq_u8(q8, q10);
  q7 = veorq_u8(q7, q10);
  q6 = veorq_u8(q6, q10);
  q5 = veorq_u8(q5, q10);

  q2u8 = vshrq_n_u8(q2u8, 1);
  q9 = vqaddq_u8(q9, q2u8);

  q10 = vdupq_n_u8(3);

  q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)),
                   vget_low_s8(vreinterpretq_s8_u8(q6)));
  q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)),
                    vget_high_s8(vreinterpretq_s8_u8(q6)));

  q9 = vcgeq_u8(qblimit, q9);

  q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), vreinterpretq_s8_u8(q8));

  q14u8 = vorrq_u8(q13u8, q14u8);

  q4u16 = vmovl_u8(vget_low_u8(q10));
  q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16));
  q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16));

  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8);
  q15u8 = vandq_u8(q15u8, q9);

  q1s8 = vreinterpretq_s8_u8(q1u8);
  q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8));
  q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8));

  q9 = vdupq_n_u8(4);
  // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
  d2s8 = vqmovn_s16(q2s16);
  d3s8 = vqmovn_s16(q11s16);
  q1s8 = vcombine_s8(d2s8, d3s8);
  q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8);
  q1s8 = vreinterpretq_s8_u8(q1u8);

  q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10));
  q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9));
  q2s8 = vshrq_n_s8(q2s8, 3);
  q1s8 = vshrq_n_s8(q1s8, 3);

  q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8);
  q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8);

  q1s8 = vrshrq_n_s8(q1s8, 1);
  q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8));

  q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8);
  q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8);

  q0u8 = vdupq_n_u8(0x80);
  *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8);
  *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8);
  *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8);
  *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8);
  return;
}

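/*
 * For reference, a scalar sketch of the per-pixel arithmetic that
 * vp8_loop_filter_neon() performs on all 16 lanes at once (the VP8 "normal"
 * filter). Illustrative only: these helpers are not part of the original
 * file, are never called by it, and the lf_sketch_* names are hypothetical.
 */
static INLINE int8_t lf_sketch_clamp(int v) {
  return (int8_t)(v > 127 ? 127 : v < -128 ? -128 : v);
}

static INLINE int lf_sketch_absdiff(int a, int b) {
  return a > b ? a - b : b - a;
}

static INLINE void lf_sketch_filter(uint8_t blimit, uint8_t limit,
                                    uint8_t thresh, uint8_t p3, uint8_t p2,
                                    uint8_t *p1, uint8_t *p0, uint8_t *q0,
                                    uint8_t *q1, uint8_t q2, uint8_t q3) {
  /* filter mask: every neighbouring difference within limit, and the edge
   * difference 2*|p0-q0| + |p1-q1|/2 within blimit */
  const int mask = lf_sketch_absdiff(p3, p2) <= limit &&
                   lf_sketch_absdiff(p2, *p1) <= limit &&
                   lf_sketch_absdiff(*p1, *p0) <= limit &&
                   lf_sketch_absdiff(q3, q2) <= limit &&
                   lf_sketch_absdiff(q2, *q1) <= limit &&
                   lf_sketch_absdiff(*q1, *q0) <= limit &&
                   lf_sketch_absdiff(*p0, *q0) * 2 +
                           lf_sketch_absdiff(*p1, *q1) / 2 <= blimit;
  /* high edge variance: |p1-p0| or |q1-q0| above thresh */
  const int hev = lf_sketch_absdiff(*p1, *p0) > thresh ||
                  lf_sketch_absdiff(*q1, *q0) > thresh;
  /* convert to signed by removing the 0x80 bias */
  int8_t ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
  int8_t qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);
  int8_t filter, filter1, filter2;

  if (!mask) return;

  /* the outer tap only contributes when hev is set */
  filter = hev ? lf_sketch_clamp(ps1 - qs1) : 0;
  /* vp8_filter = clamp(vp8_filter + 3 * (qs0 - ps0)) */
  filter = lf_sketch_clamp(filter + 3 * (qs0 - ps0));

  /* inner-pixel adjustments; >> behaves as an arithmetic shift here */
  filter1 = (int8_t)(lf_sketch_clamp(filter + 4) >> 3);
  filter2 = (int8_t)(lf_sketch_clamp(filter + 3) >> 3);
  qs0 = lf_sketch_clamp(qs0 - filter1);
  ps0 = lf_sketch_clamp(ps0 + filter2);

  /* outer-pixel adjustment, rounded and suppressed when hev is set */
  filter = hev ? 0 : (int8_t)((filter1 + 1) >> 1);
  qs1 = lf_sketch_clamp(qs1 - filter);
  ps1 = lf_sketch_clamp(ps1 + filter);

  *p1 = (uint8_t)(ps1 ^ 0x80);
  *p0 = (uint8_t)(ps0 ^ 0x80);
  *q0 = (uint8_t)(qs0 ^ 0x80);
  *q1 = (uint8_t)(qs1 ^ 0x80);
}
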
void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
                                            unsigned char blimit,
                                            unsigned char limit,
                                            unsigned char thresh) {
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);
  src -= (pitch << 2);

  q3 = vld1q_u8(src);
  src += pitch;
  q4 = vld1q_u8(src);
  src += pitch;
  q5 = vld1q_u8(src);
  src += pitch;
  q6 = vld1q_u8(src);
  src += pitch;
  q7 = vld1q_u8(src);
  src += pitch;
  q8 = vld1q_u8(src);
  src += pitch;
  q9 = vld1q_u8(src);
  src += pitch;
  q10 = vld1q_u8(src);

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

  src -= (pitch * 5);
  vst1q_u8(src, q5);
  src += pitch;
  vst1q_u8(src, q6);
  src += pitch;
  vst1q_u8(src, q7);
  src += pitch;
  vst1q_u8(src, q8);
  return;
}

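/*
 * Hypothetical usage sketch, not part of this file: the VP8 loop-filter
 * driver applies this "normal" filter to the three inner horizontal edges of
 * a 16x16 luma macroblock, four rows apart. The helper name and offsets are
 * illustrative only.
 */
static INLINE void lf_sketch_filter_inner_rows(unsigned char *y_ptr,
                                               int y_stride,
                                               unsigned char blim,
                                               unsigned char lim,
                                               unsigned char hev_thr) {
  vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim,
                                         lim, hev_thr);
  vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim,
                                         lim, hev_thr);
  vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim,
                                         lim, hev_thr);
}
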
void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
                                             unsigned char blimit,
                                             unsigned char limit,
                                             unsigned char thresh,
                                             unsigned char *v) {
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  u -= (pitch << 2);
  v -= (pitch << 2);

  d6 = vld1_u8(u);
  u += pitch;
  d7 = vld1_u8(v);
  v += pitch;
  d8 = vld1_u8(u);
  u += pitch;
  d9 = vld1_u8(v);
  v += pitch;
  d10 = vld1_u8(u);
  u += pitch;
  d11 = vld1_u8(v);
  v += pitch;
  d12 = vld1_u8(u);
  u += pitch;
  d13 = vld1_u8(v);
  v += pitch;
  d14 = vld1_u8(u);
  u += pitch;
  d15 = vld1_u8(v);
  v += pitch;
  d16 = vld1_u8(u);
  u += pitch;
  d17 = vld1_u8(v);
  v += pitch;
  d18 = vld1_u8(u);
  u += pitch;
  d19 = vld1_u8(v);
  v += pitch;
  d20 = vld1_u8(u);
  d21 = vld1_u8(v);

  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

  u -= (pitch * 5);
  vst1_u8(u, vget_low_u8(q5));
  u += pitch;
  vst1_u8(u, vget_low_u8(q6));
  u += pitch;
  vst1_u8(u, vget_low_u8(q7));
  u += pitch;
  vst1_u8(u, vget_low_u8(q8));

  v -= (pitch * 5);
  vst1_u8(v, vget_high_u8(q5));
  v += pitch;
  vst1_u8(v, vget_high_u8(q6));
  v += pitch;
  vst1_u8(v, vget_high_u8(q7));
  v += pitch;
  vst1_u8(v, vget_high_u8(q8));
  return;
}

static INLINE void write_4x8(unsigned char *dst, int pitch,
                             const uint8x8x4_t result) {
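  /*
   * result.val[0..3] hold the filtered p1, p0, q0 and q1 for eight rows;
   * each store below writes one row's four bytes starting at dst, which the
   * callers point two pixels to the left of the edge (src - 2).
   */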
#ifdef VPX_INCOMPATIBLE_GCC
  /*
   * uint8x8x4_t result
  00 01 02 03 | 04 05 06 07
  10 11 12 13 | 14 15 16 17
  20 21 22 23 | 24 25 26 27
  30 31 32 33 | 34 35 36 37
  ---
  * after vtrn_u16
  00 01 20 21 | 04 05 24 25
  02 03 22 23 | 06 07 26 27
  10 11 30 31 | 14 15 34 35
  12 13 32 33 | 16 17 36 37
  ---
  * after vtrn_u8
  00 10 20 30 | 04 14 24 34
  01 11 21 31 | 05 15 25 35
  02 12 22 32 | 06 16 26 36
  03 13 23 33 | 07 17 27 37
  */
  const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[0]),
                                        vreinterpret_u16_u8(result.val[2]));
  const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u8(result.val[1]),
                                        vreinterpret_u16_u8(result.val[3]));
  const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]),
                                     vreinterpret_u8_u16(r13_u16.val[0]));
  const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]),
                                     vreinterpret_u8_u16(r13_u16.val[1]));
  const uint32x2_t x_0_4 = vreinterpret_u32_u8(r01_u8.val[0]);
  const uint32x2_t x_1_5 = vreinterpret_u32_u8(r01_u8.val[1]);
  const uint32x2_t x_2_6 = vreinterpret_u32_u8(r23_u8.val[0]);
  const uint32x2_t x_3_7 = vreinterpret_u32_u8(r23_u8.val[1]);
  vst1_lane_u32((uint32_t *)dst, x_0_4, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_1_5, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_2_6, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_3_7, 0);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_0_4, 1);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_1_5, 1);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_2_6, 1);
  dst += pitch;
  vst1_lane_u32((uint32_t *)dst, x_3_7, 1);
#else
  vst4_lane_u8(dst, result, 0);
  dst += pitch;
  vst4_lane_u8(dst, result, 1);
  dst += pitch;
  vst4_lane_u8(dst, result, 2);
  dst += pitch;
  vst4_lane_u8(dst, result, 3);
  dst += pitch;
  vst4_lane_u8(dst, result, 4);
  dst += pitch;
  vst4_lane_u8(dst, result, 5);
  dst += pitch;
  vst4_lane_u8(dst, result, 6);
  dst += pitch;
  vst4_lane_u8(dst, result, 7);
#endif  // VPX_INCOMPATIBLE_GCC
}

void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
                                          unsigned char blimit,
                                          unsigned char limit,
                                          unsigned char thresh) {
  unsigned char *s, *d;
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;
  uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
  uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
  uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
  uint8x8x4_t q4ResultH, q4ResultL;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  s = src - 4;
  d6 = vld1_u8(s);
  s += pitch;
  d8 = vld1_u8(s);
  s += pitch;
  d10 = vld1_u8(s);
  s += pitch;
  d12 = vld1_u8(s);
  s += pitch;
  d14 = vld1_u8(s);
  s += pitch;
  d16 = vld1_u8(s);
  s += pitch;
  d18 = vld1_u8(s);
  s += pitch;
  d20 = vld1_u8(s);
  s += pitch;
  d7 = vld1_u8(s);
  s += pitch;
  d9 = vld1_u8(s);
  s += pitch;
  d11 = vld1_u8(s);
  s += pitch;
  d13 = vld1_u8(s);
  s += pitch;
  d15 = vld1_u8(s);
  s += pitch;
  d17 = vld1_u8(s);
  s += pitch;
  d19 = vld1_u8(s);
  s += pitch;
  d21 = vld1_u8(s);

  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

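  /*
   * Transpose the 16x8 block. After the vcombine step each q register holds
   * two rows (row i in the low half, row i + 8 in the high half); the
   * vtrnq_u32/u16/u8 cascade below rearranges them so that q3..q10 each hold
   * one column (p3, p2, p1, p0, q0, q1, q2, q3) for all 16 rows, the layout
   * vp8_loop_filter_neon() expects.
   */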
  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

  q4ResultL.val[0] = vget_low_u8(q5);   // d10
  q4ResultL.val[1] = vget_low_u8(q6);   // d12
  q4ResultL.val[2] = vget_low_u8(q7);   // d14
  q4ResultL.val[3] = vget_low_u8(q8);   // d16
  q4ResultH.val[0] = vget_high_u8(q5);  // d11
  q4ResultH.val[1] = vget_high_u8(q6);  // d13
  q4ResultH.val[2] = vget_high_u8(q7);  // d15
  q4ResultH.val[3] = vget_high_u8(q8);  // d17

  d = src - 2;
  write_4x8(d, pitch, q4ResultL);
  d += pitch * 8;
  write_4x8(d, pitch, q4ResultH);
}

void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
                                           unsigned char blimit,
                                           unsigned char limit,
                                           unsigned char thresh,
                                           unsigned char *v) {
  unsigned char *us, *ud;
  unsigned char *vs, *vd;
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;
  uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
  uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
  uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;
  uint8x8x4_t q4ResultH, q4ResultL;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  us = u - 4;
  d6 = vld1_u8(us);
  us += pitch;
  d8 = vld1_u8(us);
  us += pitch;
  d10 = vld1_u8(us);
  us += pitch;
  d12 = vld1_u8(us);
  us += pitch;
  d14 = vld1_u8(us);
  us += pitch;
  d16 = vld1_u8(us);
  us += pitch;
  d18 = vld1_u8(us);
  us += pitch;
  d20 = vld1_u8(us);

  vs = v - 4;
  d7 = vld1_u8(vs);
  vs += pitch;
  d9 = vld1_u8(vs);
  vs += pitch;
  d11 = vld1_u8(vs);
  vs += pitch;
  d13 = vld1_u8(vs);
  vs += pitch;
  d15 = vld1_u8(vs);
  vs += pitch;
  d17 = vld1_u8(vs);
  vs += pitch;
  d19 = vld1_u8(vs);
  vs += pitch;
  d21 = vld1_u8(vs);

  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

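  /*
   * Same 16x8 transpose as in the luma vertical path: q3..q10 become the
   * columns p3..q3, with the eight U rows in the low halves and the eight V
   * rows in the high halves.
   */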
  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                       q10, &q5, &q6, &q7, &q8);

  q4ResultL.val[0] = vget_low_u8(q5);  // d10
  q4ResultL.val[1] = vget_low_u8(q6);  // d12
  q4ResultL.val[2] = vget_low_u8(q7);  // d14
  q4ResultL.val[3] = vget_low_u8(q8);  // d16
  ud = u - 2;
  write_4x8(ud, pitch, q4ResultL);

  q4ResultH.val[0] = vget_high_u8(q5);  // d11
  q4ResultH.val[1] = vget_high_u8(q6);  // d13
  q4ResultH.val[2] = vget_high_u8(q7);  // d15
  q4ResultH.val[3] = vget_high_u8(q8);  // d17
  vd = v - 2;
  write_4x8(vd, pitch, q4ResultH);
}