/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_

#include <stdlib.h>

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_onyxc_int.h"

#if HAVE_DSPR2
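/* All filters in this file process four pixels per call: each uint32_t
 * argument packs four 8-bit samples.  Pixels are converted to a signed
 * representation by XOR-ing with 0x80808080 and then split into two
 * halfword-pair registers ("_l" keeps bytes 3 and 1, "_r" keeps bytes 2
 * and 0, each moved into the high byte of its halfword).  With the pixel
 * value in the high byte, the saturating .ph instructions behave like the
 * signed-char clamps of the scalar loop filter; the two halves are then
 * recombined and XOR-ed with 0x80808080 again on output. */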
/* inputs & outputs are quad-byte vectors */
static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev,
                                    uint32_t *ps1, uint32_t *ps0,
                                    uint32_t *qs0, uint32_t *qs1) {
  int32_t   vp9_filter_l, vp9_filter_r;
  int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
  int32_t   subr_r, subr_l;
  uint32_t  t1, t2, HWM, t3;
  uint32_t  hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
  int32_t   vps1, vps0, vqs0, vqs1;
  int32_t   vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
  uint32_t  N128;

  N128 = 0x80808080;
  t1  = 0x03000300;
  t2  = 0x04000400;
  t3  = 0x01000100;
  HWM = 0xFF00FF00;

  vps0 = (*ps0) ^ N128;
  vps1 = (*ps1) ^ N128;
  vqs0 = (*qs0) ^ N128;
  vqs1 = (*qs1) ^ N128;

  /* use halfword pairs instead of quad-bytes for accuracy */
  vps0_l = vps0 & HWM;
  vps0_r = vps0 << 8;
  vps0_r = vps0_r & HWM;

  vps1_l = vps1 & HWM;
  vps1_r = vps1 << 8;
  vps1_r = vps1_r & HWM;

  vqs0_l = vqs0 & HWM;
  vqs0_r = vqs0 << 8;
  vqs0_r = vqs0_r & HWM;

  vqs1_l = vqs1 & HWM;
  vqs1_r = vqs1 << 8;
  vqs1_r = vqs1_r & HWM;

  mask_l = mask & HWM;
  mask_r = mask << 8;
  mask_r = mask_r & HWM;

  hev_l = hev & HWM;
  hev_r = hev << 8;
  hev_r = hev_r & HWM;

  __asm__ __volatile__ (
      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"

      /* qs0 - ps0 */
      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"

      /* vp9_filter &= hev; */
      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"

      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"

      /* vp9_filter &= mask; */
      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"

      : [vp9_filter_l] "=&r" (vp9_filter_l),
        [vp9_filter_r] "=&r" (vp9_filter_r),
        [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
        [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
      : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
        [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
        [HWM] "r" (HWM)
  );

  /* round one side with +4 and the other with +3 before the shift by 3 */
  __asm__ __volatile__ (
      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >> 3; */
      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"

      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >> 3; */
      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"

      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"

      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"

      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"

      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"

      : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
        [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
      : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
  );

  __asm__ __volatile__ (
      /* (vp9_filter += 1) >>= 1 */
      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"

      /* vp9_filter &= ~hev; */
      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"

      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"

      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"

      : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
  );

  /* Create quad-bytes from halfword pairs */
  vqs0_l = vqs0_l & HWM;
  vqs1_l = vqs1_l & HWM;
  vps0_l = vps0_l & HWM;
  vps1_l = vps1_l & HWM;

  __asm__ __volatile__ (
      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"

      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
      :
  );

  vqs0 = vqs0_l | vqs0_r;
  vqs1 = vqs1_l | vqs1_r;
  vps0 = vps0_l | vps0_r;
  vps1 = vps1_l | vps1_r;

  *ps0 = vps0 ^ N128;
  *ps1 = vps1 ^ N128;
  *qs0 = vqs0 ^ N128;
  *qs1 = vqs1 ^ N128;
}

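/* A minimal scalar sketch of the 4-tap filter implemented above, shown for a
 * single pixel position (illustrative only; not called by the DSPR2 paths,
 * and the helper names below are ours).  mask and hev are 0 or -1 for the
 * pixel, and the pixel values are already biased to the signed range, i.e.
 * (value ^ 0x80). */
static INLINE int filter_clamp_sketch(int t) {
  return t < -128 ? -128 : (t > 127 ? 127 : t);
}

static INLINE void filter4_scalar_sketch(int mask, int hev,
                                         int *ps1, int *ps0,
                                         int *qs0, int *qs1) {
  int filter, Filter1, Filter2;

  filter = filter_clamp_sketch(*ps1 - *qs1) & hev;
  filter = filter_clamp_sketch(filter + 3 * (*qs0 - *ps0)) & mask;

  /* round one side with +4 and the other with +3 */
  Filter1 = filter_clamp_sketch(filter + 4) >> 3;
  Filter2 = filter_clamp_sketch(filter + 3) >> 3;
  *qs0 = filter_clamp_sketch(*qs0 - Filter1);
  *ps0 = filter_clamp_sketch(*ps0 + Filter2);

  /* outer taps get half the adjustment, and only where hev is not set */
  filter = ((Filter1 + 1) >> 1) & ~hev;
  *ps1 = filter_clamp_sketch(*ps1 + filter);
  *qs1 = filter_clamp_sketch(*qs1 - filter);
}

/* vp9_filter1_dspr2() below is the same 4-tap filter as vp9_filter_dspr2(),
 * but the packed pixels are passed by value and the filtered results are
 * written to the separate *_f0 outputs. */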
static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev,
                                     uint32_t ps1, uint32_t ps0,
                                     uint32_t qs0, uint32_t qs1,
                                     uint32_t *p1_f0, uint32_t *p0_f0,
                                     uint32_t *q0_f0, uint32_t *q1_f0) {
  int32_t   vp9_filter_l, vp9_filter_r;
  int32_t   Filter1_l, Filter1_r, Filter2_l, Filter2_r;
  int32_t   subr_r, subr_l;
  uint32_t  t1, t2, HWM, t3;
  uint32_t  hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
  int32_t   vps1, vps0, vqs0, vqs1;
  int32_t   vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
  uint32_t  N128;

  N128 = 0x80808080;
  t1  = 0x03000300;
  t2  = 0x04000400;
  t3  = 0x01000100;
  HWM = 0xFF00FF00;

  vps0 = (ps0) ^ N128;
  vps1 = (ps1) ^ N128;
  vqs0 = (qs0) ^ N128;
  vqs1 = (qs1) ^ N128;

  /* use halfword pairs instead of quad-bytes for accuracy */
  vps0_l = vps0 & HWM;
  vps0_r = vps0 << 8;
  vps0_r = vps0_r & HWM;

  vps1_l = vps1 & HWM;
  vps1_r = vps1 << 8;
  vps1_r = vps1_r & HWM;

  vqs0_l = vqs0 & HWM;
  vqs0_r = vqs0 << 8;
  vqs0_r = vqs0_r & HWM;

  vqs1_l = vqs1 & HWM;
  vqs1_r = vqs1 << 8;
  vqs1_r = vqs1_r & HWM;

  mask_l = mask & HWM;
  mask_r = mask << 8;
  mask_r = mask_r & HWM;

  hev_l = hev & HWM;
  hev_r = hev << 8;
  hev_r = hev_r & HWM;

  __asm__ __volatile__ (
      /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
      "subq_s.ph    %[vp9_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
      "subq_s.ph    %[vp9_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"

      /* qs0 - ps0 */
      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"

      /* vp9_filter &= hev; */
      "and          %[vp9_filter_l], %[vp9_filter_l], %[hev_l]        \n\t"
      "and          %[vp9_filter_r], %[vp9_filter_r], %[hev_r]        \n\t"

      /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"
      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
      "addq_s.ph    %[vp9_filter_l], %[vp9_filter_l], %[subr_l]       \n\t"
      "addq_s.ph    %[vp9_filter_r], %[vp9_filter_r], %[subr_r]       \n\t"

      /* vp9_filter &= mask; */
      "and          %[vp9_filter_l], %[vp9_filter_l], %[mask_l]       \n\t"
      "and          %[vp9_filter_r], %[vp9_filter_r], %[mask_r]       \n\t"

      : [vp9_filter_l] "=&r" (vp9_filter_l),
        [vp9_filter_r] "=&r" (vp9_filter_r),
        [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
        [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
      : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
        [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
        [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
        [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
        [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
  );

  /* round one side with +4 and the other with +3 before the shift by 3 */
  __asm__ __volatile__ (
      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >> 3; */
      "addq_s.ph    %[Filter1_l],    %[vp9_filter_l], %[t2]           \n\t"
      "addq_s.ph    %[Filter1_r],    %[vp9_filter_r], %[t2]           \n\t"

      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >> 3; */
      "addq_s.ph    %[Filter2_l],    %[vp9_filter_l], %[t1]           \n\t"
      "addq_s.ph    %[Filter2_r],    %[vp9_filter_r], %[t1]           \n\t"
      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"

      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"

      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"

      /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"

      /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"

      : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
        [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
        [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
        [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
      : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
        [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
  );

  __asm__ __volatile__ (
      /* (vp9_filter += 1) >>= 1 */
      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"

      /* vp9_filter &= ~hev; */
      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"

      /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"

      /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"

      : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
        [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
        [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
      : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
  );

  /* Create quad-bytes from halfword pairs */
  vqs0_l = vqs0_l & HWM;
  vqs1_l = vqs1_l & HWM;
  vps0_l = vps0_l & HWM;
  vps1_l = vps1_l & HWM;

  __asm__ __volatile__ (
      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"

      : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
        [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
      :
  );

  vqs0 = vqs0_l | vqs0_r;
  vqs1 = vqs1_l | vqs1_r;
  vps0 = vps0_l | vps0_r;
  vps1 = vps1_l | vps1_r;

  *p0_f0 = vps0 ^ N128;
  *p1_f0 = vps1 ^ N128;
  *q0_f0 = vqs0 ^ N128;
  *q1_f0 = vqs1 ^ N128;
}

static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
                                      uint32_t *op1, uint32_t *op0,
                                      uint32_t *oq0, uint32_t *oq1,
                                      uint32_t *oq2, uint32_t *oq3) {
  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
  uint32_t       res_op2, res_op1, res_op0;
  uint32_t       res_oq0, res_oq1, res_oq2;
  uint32_t       tmp;
  uint32_t       add_p210_q012;
  uint32_t       u32Four = 0x00040004;

  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)  1 */
  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)  2 */
  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)  3 */
  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)  4 */
  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)  5 */
  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)  6 */

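  /* add_p210_q012 = p2 + p1 + p0 + q0 + q1 + q2 + 4 is common to all six
   * outputs; each result below adds its remaining taps, subtracts the taps
   * outside its window and shifts right by 3. */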
  __asm__ __volatile__ (
      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]            \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]       \n\t"

      "shll.ph    %[tmp],            %[p3],             1                \n\t"
      "addu.ph    %[res_op2],        %[tmp],            %[p3]            \n\t"
      "addu.ph    %[res_op1],        %[p3],             %[p3]            \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[p2]            \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[p1]            \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012] \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q1]            \n\t"
      "subu.ph    %[res_op1],        %[res_op1],        %[q2]            \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q2]            \n\t"
      "shrl.ph    %[res_op1],        %[res_op1],        3                \n\t"
      "shrl.ph    %[res_op2],        %[res_op2],        3                \n\t"
      "addu.ph    %[res_op0],        %[p3],             %[p0]            \n\t"
      "addu.ph    %[res_oq0],        %[q0],             %[q3]            \n\t"
      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq1],        %[q3],             %[q3]            \n\t"
      "shll.ph    %[tmp],            %[q3],             1                \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]            \n\t"
      "addu.ph    %[res_oq2],        %[tmp],            %[q3]            \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012] \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012] \n\t"
      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]            \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]            \n\t"
      "shrl.ph    %[res_oq1],        %[res_oq1],        3                \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]            \n\t"
      "shrl.ph    %[res_oq0],        %[res_oq0],        3                \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]            \n\t"
      "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
      "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"

      : [add_p210_q012] "=&r" (add_p210_q012),
        [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2),
        [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0),
        [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1),
        [res_oq2] "=&r" (res_oq2)
      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
        [u32Four] "r" (u32Four)
  );

  *op2 = res_op2;
  *op1 = res_op1;
  *op0 = res_op0;
  *oq0 = res_oq0;
  *oq1 = res_oq1;
  *oq2 = res_oq2;
}

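/* Scalar sketch of the first output of the flat-segment filter above, in the
 * ROUND_POWER_OF_TWO() form given in the comments (illustrative only; it
 * works on one unpacked pixel position, the helper name is ours, and the
 * remaining five outputs follow the same pattern). */
static INLINE uint32_t mbfilter_op2_scalar_sketch(uint32_t p3, uint32_t p2,
                                                  uint32_t p1, uint32_t p0,
                                                  uint32_t q0) {
  /* ROUND_POWER_OF_TWO(x, 3) == (x + 4) >> 3 */
  return (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3;
}

/* vp9_mbfilter1_dspr2() below computes the same six outputs as
 * vp9_mbfilter_dspr2(), with the pixels passed by value and the results
 * written to the *_f1 outputs. */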
static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2,
                                       uint32_t p1, uint32_t p0,
                                       uint32_t q0, uint32_t q1,
                                       uint32_t q2, uint32_t q3,
                                       uint32_t *op2_f1,
                                       uint32_t *op1_f1, uint32_t *op0_f1,
                                       uint32_t *oq0_f1, uint32_t *oq1_f1,
                                       uint32_t *oq2_f1) {
  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
  uint32_t  res_op2, res_op1, res_op0;
  uint32_t  res_oq0, res_oq1, res_oq2;
  uint32_t  tmp;
  uint32_t  add_p210_q012;
  uint32_t  u32Four = 0x00040004;

  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)   1 */
  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)   2 */
  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)   3 */
  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)   4 */
  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)   5 */
  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)   6 */

  __asm__ __volatile__ (
      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]             \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]             \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]             \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]             \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]             \n\t"
      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]        \n\t"

      "shll.ph    %[tmp],            %[p3],             1                 \n\t"
      "addu.ph    %[res_op2],        %[tmp],            %[p3]             \n\t"
      "addu.ph    %[res_op1],        %[p3],             %[p3]             \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[p2]             \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[p1]             \n\t"
      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012]  \n\t"
      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012]  \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q1]             \n\t"
      "subu.ph    %[res_op1],        %[res_op1],        %[q2]             \n\t"
      "subu.ph    %[res_op2],        %[res_op2],        %[q2]             \n\t"
      "shrl.ph    %[res_op1],        %[res_op1],        3                 \n\t"
      "shrl.ph    %[res_op2],        %[res_op2],        3                 \n\t"
      "addu.ph    %[res_op0],        %[p3],             %[p0]             \n\t"
      "addu.ph    %[res_oq0],        %[q0],             %[q3]             \n\t"
      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012]  \n\t"
      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012]  \n\t"
      "addu.ph    %[res_oq1],        %[q3],             %[q3]             \n\t"
      "shll.ph    %[tmp],            %[q3],             1                 \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]             \n\t"
      "addu.ph    %[res_oq2],        %[tmp],            %[q3]             \n\t"
      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012]  \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012]  \n\t"
      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]             \n\t"
      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]             \n\t"
      "shrl.ph    %[res_oq1],        %[res_oq1],        3                 \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]             \n\t"
      "shrl.ph    %[res_oq0],        %[res_oq0],        3                 \n\t"
      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]             \n\t"
      "shrl.ph    %[res_op0],        %[res_op0],        3                 \n\t"
      "shrl.ph    %[res_oq2],        %[res_oq2],        3                 \n\t"

      : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp),
        [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
        [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0),
        [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2)
      : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
        [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
        [u32Four] "r" (u32Four)
  );

  *op2_f1 = res_op2;
  *op1_f1 = res_op1;
  *op0_f1 = res_op0;
  *oq0_f1 = res_oq0;
  *oq1_f1 = res_oq1;
  *oq2_f1 = res_oq2;
}

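/* Wide (15-tap) flat filter applied across the 16-pixel-wide flat segment.
 * Each output below is ROUND_POWER_OF_TWO() of the 16-weight sum spelled out
 * in the per-output comments; p7 and q7 carry the replicated edge taps, which
 * is why they appear with weights of up to 7. */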
static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
                                           uint32_t *op5, uint32_t *op4,
                                           uint32_t *op3, uint32_t *op2,
                                           uint32_t *op1, uint32_t *op0,
                                           uint32_t *oq0, uint32_t *oq1,
                                           uint32_t *oq2, uint32_t *oq3,
                                           uint32_t *oq4, uint32_t *oq5,
                                           uint32_t *oq6, uint32_t *oq7) {
  const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
  const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
  uint32_t       res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
  uint32_t       res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
  uint32_t       tmp;
  uint32_t       add_p6toq6;
  uint32_t       u32Eight = 0x00080008;

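  /* add_p6toq6 caches p6 + p5 + ... + q6 plus the rounding constant 8.  Every
   * output reuses it, e.g. *op6 = (p7 * 7 + p6 + add_p6toq6
   * - (q1 + q2 + q3 + q4 + q5 + q6)) >> 4, which matches the formula quoted
   * above that block. */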
  __asm__ __volatile__ (
      /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
         which is used most of the time */
      "addu.ph      %[add_p6toq6],     %[p6],              %[p5]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p4]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p3]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p2]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p1]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p0]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q0]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q1]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q2]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q3]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q4]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q5]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q6]         \n\t"
      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[u32Eight]   \n\t"

      : [add_p6toq6] "=&r" (add_p6toq6)
      : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
        [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3),
        [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
        [u32Eight] "r" (u32Eight)
  );

  __asm__ __volatile__ (
      /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
                                   p3 + p2 + p1 + p0 + q0, 4) */
      "shll.ph       %[tmp],            %[p7],            3               \n\t"
      "subu.ph       %[res_op6],        %[tmp],           %[p7]           \n\t"
      "addu.ph       %[res_op6],        %[res_op6],       %[p6]           \n\t"
      "addu.ph       %[res_op6],        %[res_op6],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_op6],        %[res_op6],       %[q1]           \n\t"
      "subu.ph       %[res_op6],        %[res_op6],       %[q2]           \n\t"
      "subu.ph       %[res_op6],        %[res_op6],       %[q3]           \n\t"
      "subu.ph       %[res_op6],        %[res_op6],       %[q4]           \n\t"
      "subu.ph       %[res_op6],        %[res_op6],       %[q5]           \n\t"
      "subu.ph       %[res_op6],        %[res_op6],       %[q6]           \n\t"
      "shrl.ph       %[res_op6],        %[res_op6],       4               \n\t"

      /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
                                   p2 + p1 + p0 + q0 + q1, 4) */
      "shll.ph       %[tmp],            %[p7],            2               \n\t"
      "addu.ph       %[res_op5],        %[tmp],           %[p7]           \n\t"
      "addu.ph       %[res_op5],        %[res_op5],       %[p7]           \n\t"
      "addu.ph       %[res_op5],        %[res_op5],       %[p5]           \n\t"
      "addu.ph       %[res_op5],        %[res_op5],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_op5],        %[res_op5],       %[q2]           \n\t"
      "subu.ph       %[res_op5],        %[res_op5],       %[q3]           \n\t"
      "subu.ph       %[res_op5],        %[res_op5],       %[q4]           \n\t"
      "subu.ph       %[res_op5],        %[res_op5],       %[q5]           \n\t"
      "subu.ph       %[res_op5],        %[res_op5],       %[q6]           \n\t"
      "shrl.ph       %[res_op5],        %[res_op5],       4               \n\t"

      /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
                                   p1 + p0 + q0 + q1 + q2, 4) */
      "shll.ph       %[tmp],            %[p7],            2               \n\t"
      "addu.ph       %[res_op4],        %[tmp],           %[p7]           \n\t"
      "addu.ph       %[res_op4],        %[res_op4],       %[p4]           \n\t"
      "addu.ph       %[res_op4],        %[res_op4],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_op4],        %[res_op4],       %[q3]           \n\t"
      "subu.ph       %[res_op4],        %[res_op4],       %[q4]           \n\t"
      "subu.ph       %[res_op4],        %[res_op4],       %[q5]           \n\t"
      "subu.ph       %[res_op4],        %[res_op4],       %[q6]           \n\t"
      "shrl.ph       %[res_op4],        %[res_op4],       4               \n\t"

      /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
                                   p1 + p0 + q0 + q1 + q2 + q3, 4) */
      "shll.ph       %[tmp],            %[p7],            2               \n\t"
      "addu.ph       %[res_op3],        %[tmp],           %[p3]           \n\t"
      "addu.ph       %[res_op3],        %[res_op3],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_op3],        %[res_op3],       %[q4]           \n\t"
      "subu.ph       %[res_op3],        %[res_op3],       %[q5]           \n\t"
      "subu.ph       %[res_op3],        %[res_op3],       %[q6]           \n\t"
      "shrl.ph       %[res_op3],        %[res_op3],       4               \n\t"

      /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
                                   p0 + q0 + q1 + q2 + q3 + q4, 4) */
      "shll.ph       %[tmp],            %[p7],            1               \n\t"
      "addu.ph       %[res_op2],        %[tmp],           %[p7]           \n\t"
      "addu.ph       %[res_op2],        %[res_op2],       %[p2]           \n\t"
      "addu.ph       %[res_op2],        %[res_op2],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_op2],        %[res_op2],       %[q5]           \n\t"
      "subu.ph       %[res_op2],        %[res_op2],       %[q6]           \n\t"
      "shrl.ph       %[res_op2],        %[res_op2],       4               \n\t"

      /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
                                   p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
      "shll.ph       %[tmp],            %[p7],            1               \n\t"
      "addu.ph       %[res_op1],        %[tmp],           %[p1]           \n\t"
      "addu.ph       %[res_op1],        %[res_op1],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_op1],        %[res_op1],       %[q6]           \n\t"
      "shrl.ph       %[res_op1],        %[res_op1],       4               \n\t"

      /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
                                  q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
      "addu.ph       %[res_op0],        %[p7],            %[p0]           \n\t"
      "addu.ph       %[res_op0],        %[res_op0],       %[add_p6toq6]   \n\t"
      "shrl.ph       %[res_op0],        %[res_op0],       4               \n\t"

      : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5),
        [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3),
        [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
        [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp)
      : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
        [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
        [q2] "r" (q2), [q1] "r" (q1),
        [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
        [add_p6toq6] "r" (add_p6toq6)
  );

  *op6 = res_op6;
  *op5 = res_op5;
  *op4 = res_op4;
  *op3 = res_op3;
  *op2 = res_op2;
  *op1 = res_op1;
  *op0 = res_op0;

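  /* The q-side outputs below mirror the construction above: each adds its
   * multiple of q7 and subtracts the p taps that fall outside its window. */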
  __asm__ __volatile__ (
      /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
                                   q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
      "addu.ph       %[res_oq0],        %[q7],            %[q0]           \n\t"
      "addu.ph       %[res_oq0],        %[res_oq0],       %[add_p6toq6]   \n\t"
      "shrl.ph       %[res_oq0],        %[res_oq0],       4               \n\t"

      /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
                                   q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
      "shll.ph       %[tmp],            %[q7],            1               \n\t"
      "addu.ph       %[res_oq1],        %[tmp],           %[q1]           \n\t"
      "addu.ph       %[res_oq1],        %[res_oq1],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_oq1],        %[res_oq1],       %[p6]           \n\t"
      "shrl.ph       %[res_oq1],        %[res_oq1],       4               \n\t"

      /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
                                   q3 + q4 + q5 + q6 + q7 * 3, 4) */
      "shll.ph       %[tmp],            %[q7],            1               \n\t"
      "addu.ph       %[res_oq2],        %[tmp],           %[q7]           \n\t"
      "addu.ph       %[res_oq2],        %[res_oq2],       %[q2]           \n\t"
      "addu.ph       %[res_oq2],        %[res_oq2],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_oq2],        %[res_oq2],       %[p5]           \n\t"
      "subu.ph       %[res_oq2],        %[res_oq2],       %[p6]           \n\t"
      "shrl.ph       %[res_oq2],        %[res_oq2],       4               \n\t"

      /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
                                   q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
      "shll.ph       %[tmp],            %[q7],            2               \n\t"
      "addu.ph       %[res_oq3],        %[tmp],           %[q3]           \n\t"
      "addu.ph       %[res_oq3],        %[res_oq3],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_oq3],        %[res_oq3],       %[p4]           \n\t"
      "subu.ph       %[res_oq3],        %[res_oq3],       %[p5]           \n\t"
      "subu.ph       %[res_oq3],        %[res_oq3],       %[p6]           \n\t"
      "shrl.ph       %[res_oq3],        %[res_oq3],       4               \n\t"

      /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
                                   q4 * 2 + q5 + q6 + q7 * 5, 4) */
      "shll.ph       %[tmp],            %[q7],            2               \n\t"
      "addu.ph       %[res_oq4],        %[tmp],           %[q7]           \n\t"
      "addu.ph       %[res_oq4],        %[res_oq4],       %[q4]           \n\t"
      "addu.ph       %[res_oq4],        %[res_oq4],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_oq4],        %[res_oq4],       %[p3]           \n\t"
      "subu.ph       %[res_oq4],        %[res_oq4],       %[p4]           \n\t"
      "subu.ph       %[res_oq4],        %[res_oq4],       %[p5]           \n\t"
      "subu.ph       %[res_oq4],        %[res_oq4],       %[p6]           \n\t"
      "shrl.ph       %[res_oq4],        %[res_oq4],       4               \n\t"

      /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
                                   q5 * 2 + q6 + q7 * 6, 4) */
      "shll.ph       %[tmp],            %[q7],            2               \n\t"
      "addu.ph       %[res_oq5],        %[tmp],           %[q7]           \n\t"
      "addu.ph       %[res_oq5],        %[res_oq5],       %[q7]           \n\t"
      "addu.ph       %[res_oq5],        %[res_oq5],       %[q5]           \n\t"
      "addu.ph       %[res_oq5],        %[res_oq5],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_oq5],        %[res_oq5],       %[p2]           \n\t"
      "subu.ph       %[res_oq5],        %[res_oq5],       %[p3]           \n\t"
      "subu.ph       %[res_oq5],        %[res_oq5],       %[p4]           \n\t"
      "subu.ph       %[res_oq5],        %[res_oq5],       %[p5]           \n\t"
      "subu.ph       %[res_oq5],        %[res_oq5],       %[p6]           \n\t"
      "shrl.ph       %[res_oq5],        %[res_oq5],       4               \n\t"

      /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
                                   q4 + q5 + q6 * 2 + q7 * 7, 4) */
      "shll.ph       %[tmp],            %[q7],            3               \n\t"
      "subu.ph       %[res_oq6],        %[tmp],           %[q7]           \n\t"
      "addu.ph       %[res_oq6],        %[res_oq6],       %[q6]           \n\t"
      "addu.ph       %[res_oq6],        %[res_oq6],       %[add_p6toq6]   \n\t"
      "subu.ph       %[res_oq6],        %[res_oq6],       %[p1]           \n\t"
      "subu.ph       %[res_oq6],        %[res_oq6],       %[p2]           \n\t"
      "subu.ph       %[res_oq6],        %[res_oq6],       %[p3]           \n\t"
      "subu.ph       %[res_oq6],        %[res_oq6],       %[p4]           \n\t"
      "subu.ph       %[res_oq6],        %[res_oq6],       %[p5]           \n\t"
      "subu.ph       %[res_oq6],        %[res_oq6],       %[p6]           \n\t"
      "shrl.ph       %[res_oq6],        %[res_oq6],       4               \n\t"

      : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5),
        [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3),
        [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1),
        [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp)
      : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4),
        [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
        [p1] "r" (p1), [p2] "r" (p2),
        [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6),
        [add_p6toq6] "r" (add_p6toq6)
  );

  *oq0 = res_oq0;
  *oq1 = res_oq1;
  *oq2 = res_oq2;
  *oq3 = res_oq3;
  *oq4 = res_oq4;
  *oq5 = res_oq5;
  *oq6 = res_oq6;
}
#endif  // #if HAVE_DSPR2
#endif  // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_