      1 /*
      2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <stdlib.h>
     12 #include "vp8_rtcd.h"
     13 #include "vp8/common/onyxc_int.h"
     14 
     15 #if HAVE_DSPR2
     16 typedef unsigned char uc;
     17 
     18 /* prefetch data for load */
     19 inline void prefetch_load_lf(unsigned char *src) {
     20   __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
     21 }
     22 
     23 /* prefetch data for store */
     24 inline void prefetch_store_lf(unsigned char *dst) {
     25   __asm__ __volatile__("pref   1,  0(%[dst])   \n\t" : : [dst] "r"(dst));
     26 }
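
/* The pref hints used above are advisory: hint 0 marks data that is about to
 * be loaded and hint 1 data that is about to be stored, so omitting either
 * prefetch affects only speed, never the filtered output.
 */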
     27 
     28 /* processing 4 pixels at the same time
     29  * compute hev and mask in the same function
     30  */
     31 static __inline void vp8_filter_mask_vec_mips(
     32     uint32_t limit, uint32_t flimit, uint32_t p1, uint32_t p0, uint32_t p3,
     33     uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2, uint32_t q3,
     34     uint32_t thresh, uint32_t *hev, uint32_t *mask) {
     35   uint32_t c, r, r3, r_k;
     36   uint32_t s1, s2, s3;
     37   uint32_t ones = 0xFFFFFFFF;
     38   uint32_t hev1;
     39 
     40   __asm__ __volatile__(
     41       /* mask |= (abs(p3 - p2) > limit) */
     42       "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
     43       "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
     44       "or             %[r_k], %[r_k],    %[c]         \n\t"
     45       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
     46       "or             %[r],   $0,        %[c]         \n\t"
     47 
     48       /* mask |= (abs(p2 - p1) > limit) */
     49       "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
     50       "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
     51       "or             %[r_k], %[r_k],    %[c]         \n\t"
     52       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
     53       "or             %[r],   %[r],      %[c]         \n\t"
     54 
     55       /* mask |= (abs(p1 - p0) > limit)
     56        * hev  |= (abs(p1 - p0) > thresh)
     57        */
     58       "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
     59       "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
     60       "or             %[r_k], %[r_k],    %[c]         \n\t"
     61       "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
     62       "or             %[r3],  $0,        %[c]         \n\t"
     63       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
     64       "or             %[r],   %[r],      %[c]         \n\t"
     65 
     66       /* mask |= (abs(q1 - q0) > limit)
     67        * hev  |= (abs(q1 - q0) > thresh)
     68        */
     69       "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
     70       "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
     71       "or             %[r_k], %[r_k],    %[c]         \n\t"
     72       "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
     73       "or             %[r3],  %[r3],     %[c]         \n\t"
     74       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
     75       "or             %[r],   %[r],      %[c]         \n\t"
     76 
     77       /* mask |= (abs(q2 - q1) > limit) */
     78       "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
     79       "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
     80       "or             %[r_k], %[r_k],    %[c]         \n\t"
     81       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
     82       "or             %[r],   %[r],      %[c]         \n\t"
     83       "sll            %[r3],    %[r3],    24          \n\t"
     84 
     85       /* mask |= (abs(q3 - q2) > limit) */
     86       "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
     87       "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
     88       "or             %[r_k], %[r_k],    %[c]         \n\t"
     89       "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
     90       "or             %[r],   %[r],      %[c]         \n\t"
     91 
     92       : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
     93       : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
     94         [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
     95         [thresh] "r"(thresh));
     96 
     97   __asm__ __volatile__(
     98       /* abs(p0 - q0) */
     99       "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
    100       "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
    101       "wrdsp          %[r3]                           \n\t"
    102       "or             %[s1],  %[r_k],    %[c]         \n\t"
    103 
    104       /* abs(p1 - q1) */
    105       "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
    106       "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
    107       "pick.qb        %[hev1], %[ones],  $0           \n\t"
    108       "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
    109       "or             %[s2],   %[r_k],   %[c]         \n\t"
    110 
    111       /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
    112       "shrl.qb        %[s2],   %[s2],     1           \n\t"
    113       "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
    114       "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
    115       "or             %[r],    %[r],      %[c]        \n\t"
    116       "sll            %[r],    %[r],      24          \n\t"
    117 
    118       "wrdsp          %[r]                            \n\t"
    119       "pick.qb        %[s2],  $0,         %[ones]     \n\t"
    120 
    121       : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
    122         [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
    123       : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
    124         [ones] "r"(ones), [flimit] "r"(flimit));
    125 
    126   *hev = hev1;
    127   *mask = s2;
    128 }
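
/* A reference sketch (not part of the build) of the per-pixel logic the two
 * asm blocks above implement, using this function's own argument names.
 * Each uint32_t packs four pixels; wrdsp loads the per-lane comparison
 * results into the DSP condition-code bits and pick.qb expands them to
 * 0x00 / 0xFF bytes:
 *
 *   over = |p3-p2| > limit || |p2-p1| > limit || |p1-p0| > limit ||
 *          |q1-q0| > limit || |q2-q1| > limit || |q3-q2| > limit ||
 *          2 * |p0-q0| + (|p1-q1| >> 1) > flimit;   (8-bit saturating adds)
 *   mask byte = over ? 0x00 : 0xFF;
 *   hev  byte = (|p1-p0| > thresh || |q1-q0| > thresh) ? 0xFF : 0x00;
 */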
    129 
    130 /* inputs & outputs are quad-byte vectors */
    131 static __inline void vp8_filter_mips(uint32_t mask, uint32_t hev, uint32_t *ps1,
    132                                      uint32_t *ps0, uint32_t *qs0,
    133                                      uint32_t *qs1) {
    134   int32_t vp8_filter_l, vp8_filter_r;
    135   int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
    136   int32_t subr_r, subr_l;
    137   uint32_t t1, t2, HWM, t3;
    138   uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
    139 
    140   int32_t vps1, vps0, vqs0, vqs1;
    141   int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
    142   uint32_t N128;
    143 
    144   N128 = 0x80808080;
    145   t1 = 0x03000300;
    146   t2 = 0x04000400;
    147   t3 = 0x01000100;
    148   HWM = 0xFF00FF00;
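
  /* N128 toggles the sign bit of every byte, moving pixels between the
   * unsigned and signed domains used by the filter; t1, t2 and t3 hold +3,
   * +4 and +1 in the high byte of each halfword (the lane the pixel data
   * occupies below); HWM keeps only that high byte of each halfword.
   */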
    149 
    150   vps0 = (*ps0) ^ N128;
    151   vps1 = (*ps1) ^ N128;
    152   vqs0 = (*qs0) ^ N128;
    153   vqs1 = (*qs1) ^ N128;
    154 
    155   /* use halfword pairs instead of quad-bytes for better accuracy */
    156   vps0_l = vps0 & HWM;
    157   vps0_r = vps0 << 8;
    158   vps0_r = vps0_r & HWM;
    159 
    160   vps1_l = vps1 & HWM;
    161   vps1_r = vps1 << 8;
    162   vps1_r = vps1_r & HWM;
    163 
    164   vqs0_l = vqs0 & HWM;
    165   vqs0_r = vqs0 << 8;
    166   vqs0_r = vqs0_r & HWM;
    167 
    168   vqs1_l = vqs1 & HWM;
    169   vqs1_r = vqs1 << 8;
    170   vqs1_r = vqs1_r & HWM;
    171 
    172   mask_l = mask & HWM;
    173   mask_r = mask << 8;
    174   mask_r = mask_r & HWM;
    175 
    176   hev_l = hev & HWM;
    177   hev_r = hev << 8;
    178   hev_r = hev_r & HWM;
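
  /* Each packed word is split into two halfword-pair words: the "_l" copy
   * keeps bytes 3 and 1 and the "_r" copy bytes 2 and 0, each promoted to
   * the high byte of its halfword (e.g. 0xAABBCCDD -> 0xAA00CC00 and
   * 0xBB00DD00). Saturating halfword arithmetic on this layout then behaves
   * like saturating signed-byte arithmetic, with the low byte acting as
   * extra precision.
   */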
    179 
    180   __asm__ __volatile__(
    181       /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
    182       "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
    183       "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
    184 
    185       /* qs0 - ps0 */
    186       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
    187       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
    188 
    189       /* vp8_filter &= hev; */
    190       "and          %[vp8_filter_l], %[vp8_filter_l], %[hev_l]        \n\t"
    191       "and          %[vp8_filter_r], %[vp8_filter_r], %[hev_r]        \n\t"
    192 
    193       /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
    194       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
    195       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
    196       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
    197       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
    198       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
    199       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
    200       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
    201       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
    202 
    203       /* vp8_filter &= mask; */
    204       "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
    205       "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
    206 
    207       : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=&r"(vp8_filter_r),
    208         [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
    209         [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
    210 
    211       : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
    212         [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
    213         [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
    214         [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
    215         [HWM] "r"(HWM));
    216 
    217   /* save bottom 3 bits so that we round one side +4 and the other +3 */
    218   __asm__ __volatile__(
    219       /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4); Filter1 >>= 3; */
    220       "addq_s.ph    %[Filter1_l],    %[vp8_filter_l], %[t2]           \n\t"
    221       "addq_s.ph    %[Filter1_r],    %[vp8_filter_r], %[t2]           \n\t"
    222 
    223       /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3); Filter2 >>= 3; */
    224       "addq_s.ph    %[Filter2_l],    %[vp8_filter_l], %[t1]           \n\t"
    225       "addq_s.ph    %[Filter2_r],    %[vp8_filter_r], %[t1]           \n\t"
    226       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
    227       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
    228 
    229       "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
    230       "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
    231 
    232       "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
    233       "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
    234 
    235       /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
    236       "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
    237       "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
    238 
    239       /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
    240       "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
    241       "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
    242 
    243       : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
    244         [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
    245         [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
    246         [vqs0_r] "+r"(vqs0_r)
    247 
    248       : [t1] "r"(t1), [t2] "r"(t2), [vp8_filter_l] "r"(vp8_filter_l),
    249         [vp8_filter_r] "r"(vp8_filter_r), [HWM] "r"(HWM));
    250 
    251   __asm__ __volatile__(
    252       /* vp8_filter = (Filter1 + 1) >> 1 */
    253       "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
    254       "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
    255 
    256       /* vp8_filter &= ~hev; */
    257       "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
    258       "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
    259 
    260       /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
    261       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
    262       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
    263 
    264       /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
    265       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
    266       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
    267 
    268       : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
    269         [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
    270         [vqs1_r] "+r"(vqs1_r)
    271 
    272       : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
    273 
    274   /* Create quad-bytes from halfword pairs */
    275   vqs0_l = vqs0_l & HWM;
    276   vqs1_l = vqs1_l & HWM;
    277   vps0_l = vps0_l & HWM;
    278   vps1_l = vps1_l & HWM;
    279 
    280   __asm__ __volatile__(
    281       "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
    282       "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
    283       "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
    284       "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
    285 
    286       : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
    287         [vqs0_r] "+r"(vqs0_r)
    288       :);
    289 
    290   vqs0 = vqs0_l | vqs0_r;
    291   vqs1 = vqs1_l | vqs1_r;
    292   vps0 = vps0_l | vps0_r;
    293   vps1 = vps1_l | vps1_r;
    294 
    295   *ps0 = vps0 ^ N128;
    296   *ps1 = vps1 ^ N128;
    297   *qs0 = vqs0 ^ N128;
    298   *qs1 = vqs1 ^ N128;
    299 }
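
/* A scalar sketch, kept out of the build, of the per-pixel update the
 * halfword code above performs; it simply restates the step-by-step comments
 * in the asm blocks. The helper names are illustrative only: sclamp8()
 * stands in for vp8_signed_char_clamp(), mask/hev are the 0x00/0xFF per-pixel
 * bytes produced above, and all values are in the signed (^ 0x80) domain.
 */
#if 0
static int8_t sclamp8(int v) { return v < -128 ? -128 : (v > 127 ? 127 : v); }

static void vp8_filter_ref(int8_t mask, int8_t hev, int8_t *ps1, int8_t *ps0,
                           int8_t *qs0, int8_t *qs1) {
  int f = sclamp8(*ps1 - *qs1) & hev;
  int Filter1, Filter2;

  f = sclamp8(f + 3 * (*qs0 - *ps0)) & mask;
  Filter1 = sclamp8(f + 4) >> 3; /* round one side +4 ...             */
  Filter2 = sclamp8(f + 3) >> 3; /* ... and the other +3              */
  *qs0 = sclamp8(*qs0 - Filter1);
  *ps0 = sclamp8(*ps0 + Filter2);
  f = ((Filter1 + 1) >> 1) & ~hev; /* outer taps only where hev == 0  */
  *qs1 = sclamp8(*qs1 - f);
  *ps1 = sclamp8(*ps1 + f);
}
#endif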
    300 
    301 void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p,
    302                                           unsigned int flimit,
    303                                           unsigned int limit,
    304                                           unsigned int thresh, int count) {
    305   uint32_t mask;
    306   uint32_t hev;
    307   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    308   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
    309   (void)count;
    310 
    311   mask = 0;
    312   hev = 0;
    313   p1 = 0;
    314   p2 = 0;
    315   p3 = 0;
    316   p4 = 0;
    317 
    318   /* prefetch data for store */
    319   prefetch_store_lf(s);
    320 
    321   /* the loop filter is designed to work on chars so that we can make
    322    * maximum use of the 8-bit SIMD instructions.
    323    */
    324 
    325   sm1 = s - (p << 2);
    326   s0 = s - p - p - p;
    327   s1 = s - p - p;
    328   s2 = s - p;
    329   s3 = s;
    330   s4 = s + p;
    331   s5 = s + p + p;
    332   s6 = s + p + p + p;
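
  /* The eight pointers cover four rows on each side of the horizontal edge
   * at s (pitch p): sm1 = s - 4*p, s0 = s - 3*p, s1 = s - 2*p, s2 = s - p
   * above the edge, and s3 = s, s4 = s + p, s5 = s + 2*p, s6 = s + 3*p below
   * it. Only the two rows on either side of the edge (s1, s2, s3, s4) are
   * written back.
   */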
    333 
    334   /* load quad-byte vectors
    335    * memory is 4 byte aligned
    336    */
    337   p1 = *((uint32_t *)(s1));
    338   p2 = *((uint32_t *)(s2));
    339   p3 = *((uint32_t *)(s3));
    340   p4 = *((uint32_t *)(s4));
    341 
    342   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    343    * mask will be zero and filtering is not needed
    344    */
    345   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    346     pm1 = *((uint32_t *)(sm1));
    347     p0 = *((uint32_t *)(s0));
    348     p5 = *((uint32_t *)(s5));
    349     p6 = *((uint32_t *)(s6));
    350 
    351     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    352                              thresh, &hev, &mask);
    353 
    354     /* if mask == 0, filtering is not needed */
    355     if (mask) {
    356       /* filtering */
    357       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    358 
    359       /* unpack processed 4x4 neighborhood */
    360       *((uint32_t *)s1) = p1;
    361       *((uint32_t *)s2) = p2;
    362       *((uint32_t *)s3) = p3;
    363       *((uint32_t *)s4) = p4;
    364     }
    365   }
    366 
    367   sm1 += 4;
    368   s0 += 4;
    369   s1 += 4;
    370   s2 += 4;
    371   s3 += 4;
    372   s4 += 4;
    373   s5 += 4;
    374   s6 += 4;
    375 
    376   /* load quad-byte vectors
    377    * memory is 4 byte aligned
    378    */
    379   p1 = *((uint32_t *)(s1));
    380   p2 = *((uint32_t *)(s2));
    381   p3 = *((uint32_t *)(s3));
    382   p4 = *((uint32_t *)(s4));
    383 
    384   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    385    * mask will be zero and filtering is not needed
    386    */
    387   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    388     pm1 = *((uint32_t *)(sm1));
    389     p0 = *((uint32_t *)(s0));
    390     p5 = *((uint32_t *)(s5));
    391     p6 = *((uint32_t *)(s6));
    392 
    393     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    394                              thresh, &hev, &mask);
    395 
    396     /* if mask == 0, filtering is not needed */
    397     if (mask) {
    398       /* filtering */
    399       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    400 
    401       /* unpack processed 4x4 neighborhood */
    402       *((uint32_t *)s1) = p1;
    403       *((uint32_t *)s2) = p2;
    404       *((uint32_t *)s3) = p3;
    405       *((uint32_t *)s4) = p4;
    406     }
    407   }
    408 
    409   sm1 += 4;
    410   s0 += 4;
    411   s1 += 4;
    412   s2 += 4;
    413   s3 += 4;
    414   s4 += 4;
    415   s5 += 4;
    416   s6 += 4;
    417 
    418   /* load quad-byte vectors
    419    * memory is 4 byte aligned
    420    */
    421   p1 = *((uint32_t *)(s1));
    422   p2 = *((uint32_t *)(s2));
    423   p3 = *((uint32_t *)(s3));
    424   p4 = *((uint32_t *)(s4));
    425 
    426   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    427    * mask will be zero and filtering is not needed
    428    */
    429   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    430     pm1 = *((uint32_t *)(sm1));
    431     p0 = *((uint32_t *)(s0));
    432     p5 = *((uint32_t *)(s5));
    433     p6 = *((uint32_t *)(s6));
    434 
    435     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    436                              thresh, &hev, &mask);
    437 
    438     /* if mask == 0, filtering is not needed */
    439     if (mask) {
    440       /* filtering */
    441       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    442 
    443       /* unpack processed 4x4 neighborhood */
    444       *((uint32_t *)s1) = p1;
    445       *((uint32_t *)s2) = p2;
    446       *((uint32_t *)s3) = p3;
    447       *((uint32_t *)s4) = p4;
    448     }
    449   }
    450 
    451   sm1 += 4;
    452   s0 += 4;
    453   s1 += 4;
    454   s2 += 4;
    455   s3 += 4;
    456   s4 += 4;
    457   s5 += 4;
    458   s6 += 4;
    459 
    460   /* load quad-byte vectors
    461    * memory is 4 byte aligned
    462    */
    463   p1 = *((uint32_t *)(s1));
    464   p2 = *((uint32_t *)(s2));
    465   p3 = *((uint32_t *)(s3));
    466   p4 = *((uint32_t *)(s4));
    467 
    468   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    469    * mask will be zero and filtering is not needed
    470    */
    471   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    472     pm1 = *((uint32_t *)(sm1));
    473     p0 = *((uint32_t *)(s0));
    474     p5 = *((uint32_t *)(s5));
    475     p6 = *((uint32_t *)(s6));
    476 
    477     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    478                              thresh, &hev, &mask);
    479 
    480     /* if mask == 0, filtering is not needed */
    481     if (mask) {
    482       /* filtering */
    483       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    484 
    485       /* unpack processed 4x4 neighborhood */
    486       *((uint32_t *)s1) = p1;
    487       *((uint32_t *)s2) = p2;
    488       *((uint32_t *)s3) = p3;
    489       *((uint32_t *)s4) = p4;
    490     }
    491   }
    492 }
    493 
    494 void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
    495                                             unsigned int flimit,
    496                                             unsigned int limit,
    497                                             unsigned int thresh, int count) {
    498   uint32_t mask;
    499   uint32_t hev;
    500   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    501   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
    502   (void)count;
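
  /* count is unused: the chroma edge handled here is 8 pixels wide and is
   * processed below as two independent 4-pixel groups.
   */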
    503 
    504   mask = 0;
    505   hev = 0;
    506   p1 = 0;
    507   p2 = 0;
    508   p3 = 0;
    509   p4 = 0;
    510 
    511   /* the loop filter is designed to work on chars so that we can make
    512    * maximum use of the 8-bit SIMD instructions.
    513    */
    514 
    515   sm1 = s - (p << 2);
    516   s0 = s - p - p - p;
    517   s1 = s - p - p;
    518   s2 = s - p;
    519   s3 = s;
    520   s4 = s + p;
    521   s5 = s + p + p;
    522   s6 = s + p + p + p;
    523 
    524   /* load quad-byte vectors
    525    * memory is 4 byte aligned
    526    */
    527   p1 = *((uint32_t *)(s1));
    528   p2 = *((uint32_t *)(s2));
    529   p3 = *((uint32_t *)(s3));
    530   p4 = *((uint32_t *)(s4));
    531 
    532   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    533    * mask will be zero and filtering is not needed
    534    */
    535   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    536     pm1 = *((uint32_t *)(sm1));
    537     p0 = *((uint32_t *)(s0));
    538     p5 = *((uint32_t *)(s5));
    539     p6 = *((uint32_t *)(s6));
    540 
    541     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    542                              thresh, &hev, &mask);
    543 
    544     /* if mask == 0, filtering is not needed */
    545     if (mask) {
    546       /* filtering */
    547       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    548 
    549       /* unpack processed 4x4 neighborhood */
    550       *((uint32_t *)s1) = p1;
    551       *((uint32_t *)s2) = p2;
    552       *((uint32_t *)s3) = p3;
    553       *((uint32_t *)s4) = p4;
    554     }
    555   }
    556 
    557   sm1 += 4;
    558   s0 += 4;
    559   s1 += 4;
    560   s2 += 4;
    561   s3 += 4;
    562   s4 += 4;
    563   s5 += 4;
    564   s6 += 4;
    565 
    566   /* load quad-byte vectors
    567    * memory is 4 byte aligned
    568    */
    569   p1 = *((uint32_t *)(s1));
    570   p2 = *((uint32_t *)(s2));
    571   p3 = *((uint32_t *)(s3));
    572   p4 = *((uint32_t *)(s4));
    573 
    574   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    575    * mask will be zero and filtering is not needed
    576    */
    577   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    578     pm1 = *((uint32_t *)(sm1));
    579     p0 = *((uint32_t *)(s0));
    580     p5 = *((uint32_t *)(s5));
    581     p6 = *((uint32_t *)(s6));
    582 
    583     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    584                              thresh, &hev, &mask);
    585 
    586     /* if mask == 0, filtering is not needed */
    587     if (mask) {
    588       /* filtering */
    589       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    590 
    591       /* unpack processed 4x4 neighborhood */
    592       *((uint32_t *)s1) = p1;
    593       *((uint32_t *)s2) = p2;
    594       *((uint32_t *)s3) = p3;
    595       *((uint32_t *)s4) = p4;
    596     }
    597   }
    598 }
    599 
    600 void vp8_loop_filter_vertical_edge_mips(unsigned char *s, int p,
    601                                         const unsigned int flimit,
    602                                         const unsigned int limit,
    603                                         const unsigned int thresh, int count) {
    604   int i;
    605   uint32_t mask, hev;
    606   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    607   unsigned char *s1, *s2, *s3, *s4;
    608   uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
    609 
    610   hev = 0;
    611   mask = 0;
    612   i = 0;
    613   pm1 = 0;
    614   p0 = 0;
    615   p1 = 0;
    616   p2 = 0;
    617   p3 = 0;
    618   p4 = 0;
    619   p5 = 0;
    620   p6 = 0;
    621 
    622   /* the loop filter is designed to work on chars so that we can make
    623    * maximum use of the 8-bit SIMD instructions.
    624    */
    625 
    626   /* apply the filter to 4 pixels at a time */
    627   do {
    628     /* prefetch data for store */
    629     prefetch_store_lf(s + p);
    630 
    631     s1 = s;
    632     s2 = s + p;
    633     s3 = s2 + p;
    634     s4 = s3 + p;
    635     s = s4 + p;
    636 
    637     /* load quad-byte vectors
    638      * memory is 4 byte aligned
    639      */
    640     p2 = *((uint32_t *)(s1 - 4));
    641     p6 = *((uint32_t *)(s1));
    642     p1 = *((uint32_t *)(s2 - 4));
    643     p5 = *((uint32_t *)(s2));
    644     p0 = *((uint32_t *)(s3 - 4));
    645     p4 = *((uint32_t *)(s3));
    646     pm1 = *((uint32_t *)(s4 - 4));
    647     p3 = *((uint32_t *)(s4));
    648 
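    /* The two transpose blocks below rearrange the four row loads on each
     * side of the edge into four column words (a 4x4 byte transpose each),
     * so the quad-byte filter written for horizontal edges can be reused
     * across this vertical edge. Conceptually, endianness aside:
     *
     *   in:  row0 = a0 a1 a2 a3      out: col0 = a0 b0 c0 d0
     *        row1 = b0 b1 b2 b3           col1 = a1 b1 c1 d1
     *        row2 = c0 c1 c2 c3           col2 = a2 b2 c2 d2
     *        row3 = d0 d1 d2 d3           col3 = a3 b3 c3 d3
     */
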
    649     /* transpose pm1, p0, p1, p2 */
    650     __asm__ __volatile__(
    651         "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
    652         "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
    653         "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
    654         "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
    655 
    656         "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
    657         "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
    658         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
    659         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
    660 
    661         "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
    662         "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
    663         "append         %[p1],      %[sec3],    16          \n\t"
    664         "append         %[pm1],     %[sec4],    16          \n\t"
    665 
    666         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
    667           [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
    668           [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
    669         :);
    670 
    671     /* transpose p3, p4, p5, p6 */
    672     __asm__ __volatile__(
    673         "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
    674         "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
    675         "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
    676         "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
    677 
    678         "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
    679         "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
    680         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
    681         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
    682 
    683         "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
    684         "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
    685         "append         %[p5],      %[sec3],    16          \n\t"
    686         "append         %[p3],      %[sec4],    16          \n\t"
    687 
    688         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
    689           [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
    690           [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
    691         :);
    692 
    693     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    694      * mask will be zero and filtering is not needed
    695      */
    696     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    697       vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    698                                thresh, &hev, &mask);
    699 
    700       /* if mask == 0, filtering is not needed */
    701       if (mask) {
    702         /* filtering */
    703         vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    704 
    705         /* unpack processed 4x4 neighborhood
    706          * don't use transpose on output data
    707          * because memory isn't aligned
    708          */
    709         __asm__ __volatile__(
    710             "sb         %[p4],  1(%[s4])    \n\t"
    711             "sb         %[p3],  0(%[s4])    \n\t"
    712             "sb         %[p2], -1(%[s4])    \n\t"
    713             "sb         %[p1], -2(%[s4])    \n\t"
    714             :
    715             : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2),
    716               [p1] "r"(p1));
    717 
    718         __asm__ __volatile__(
    719             "srl        %[p4], %[p4], 8     \n\t"
    720             "srl        %[p3], %[p3], 8     \n\t"
    721             "srl        %[p2], %[p2], 8     \n\t"
    722             "srl        %[p1], %[p1], 8     \n\t"
    723             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
    724             :);
    725 
    726         __asm__ __volatile__(
    727             "sb         %[p4],  1(%[s3])    \n\t"
    728             "sb         %[p3],  0(%[s3])    \n\t"
    729             "sb         %[p2], -1(%[s3])    \n\t"
    730             "sb         %[p1], -2(%[s3])    \n\t"
    731             : [p1] "+r"(p1)
    732             : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
    733 
    734         __asm__ __volatile__(
    735             "srl        %[p4], %[p4], 8     \n\t"
    736             "srl        %[p3], %[p3], 8     \n\t"
    737             "srl        %[p2], %[p2], 8     \n\t"
    738             "srl        %[p1], %[p1], 8     \n\t"
    739             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
    740             :);
    741 
    742         __asm__ __volatile__(
    743             "sb         %[p4],  1(%[s2])    \n\t"
    744             "sb         %[p3],  0(%[s2])    \n\t"
    745             "sb         %[p2], -1(%[s2])    \n\t"
    746             "sb         %[p1], -2(%[s2])    \n\t"
    747             :
    748             : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2),
    749               [p1] "r"(p1));
    750 
    751         __asm__ __volatile__(
    752             "srl        %[p4], %[p4], 8     \n\t"
    753             "srl        %[p3], %[p3], 8     \n\t"
    754             "srl        %[p2], %[p2], 8     \n\t"
    755             "srl        %[p1], %[p1], 8     \n\t"
    756             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
    757             :);
    758 
    759         __asm__ __volatile__(
    760             "sb         %[p4],  1(%[s1])    \n\t"
    761             "sb         %[p3],  0(%[s1])    \n\t"
    762             "sb         %[p2], -1(%[s1])    \n\t"
    763             "sb         %[p1], -2(%[s1])    \n\t"
    764             :
    765             : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2),
    766               [p1] "r"(p1));
    767       }
    768     }
    769 
    770     s1 = s;
    771     s2 = s + p;
    772     s3 = s2 + p;
    773     s4 = s3 + p;
    774     s = s4 + p;
    775 
    776     /* load quad-byte vectors
    777      * memory is 4 byte aligned
    778      */
    779     p2 = *((uint32_t *)(s1 - 4));
    780     p6 = *((uint32_t *)(s1));
    781     p1 = *((uint32_t *)(s2 - 4));
    782     p5 = *((uint32_t *)(s2));
    783     p0 = *((uint32_t *)(s3 - 4));
    784     p4 = *((uint32_t *)(s3));
    785     pm1 = *((uint32_t *)(s4 - 4));
    786     p3 = *((uint32_t *)(s4));
    787 
    788     /* transpose pm1, p0, p1, p2 */
    789     __asm__ __volatile__(
    790         "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
    791         "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
    792         "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
    793         "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
    794 
    795         "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
    796         "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
    797         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
    798         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
    799 
    800         "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
    801         "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
    802         "append         %[p1],      %[sec3],    16          \n\t"
    803         "append         %[pm1],     %[sec4],    16          \n\t"
    804 
    805         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
    806           [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
    807           [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
    808         :);
    809 
    810     /* transpose p3, p4, p5, p6 */
    811     __asm__ __volatile__(
    812         "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
    813         "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
    814         "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
    815         "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
    816 
    817         "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
    818         "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
    819         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
    820         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
    821 
    822         "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
    823         "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
    824         "append         %[p5],      %[sec3],    16          \n\t"
    825         "append         %[p3],      %[sec4],    16          \n\t"
    826 
    827         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
    828           [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
    829           [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
    830         :);
    831 
    832     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    833      * mask will be zero and filtering is not needed
    834      */
    835     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    836       vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    837                                thresh, &hev, &mask);
    838 
    839       /* if mask == 0, filtering is not needed */
    840       if (mask) {
    841         /* filtering */
    842         vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    843 
    844         /* unpack processed 4x4 neighborhood
    845          * don't use transpose on output data
    846          * because memory isn't aligned
    847          */
    848         __asm__ __volatile__(
    849             "sb         %[p4],  1(%[s4])    \n\t"
    850             "sb         %[p3],  0(%[s4])    \n\t"
    851             "sb         %[p2], -1(%[s4])    \n\t"
    852             "sb         %[p1], -2(%[s4])    \n\t"
    853             :
    854             : [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2),
    855               [p1] "r"(p1));
    856 
    857         __asm__ __volatile__(
    858             "srl        %[p4], %[p4], 8     \n\t"
    859             "srl        %[p3], %[p3], 8     \n\t"
    860             "srl        %[p2], %[p2], 8     \n\t"
    861             "srl        %[p1], %[p1], 8     \n\t"
    862             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
    863             :);
    864 
    865         __asm__ __volatile__(
    866             "sb         %[p4],  1(%[s3])    \n\t"
    867             "sb         %[p3],  0(%[s3])    \n\t"
    868             "sb         %[p2], -1(%[s3])    \n\t"
    869             "sb         %[p1], -2(%[s3])    \n\t"
    870             : [p1] "+r"(p1)
    871             : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
    872 
    873         __asm__ __volatile__(
    874             "srl        %[p4], %[p4], 8     \n\t"
    875             "srl        %[p3], %[p3], 8     \n\t"
    876             "srl        %[p2], %[p2], 8     \n\t"
    877             "srl        %[p1], %[p1], 8     \n\t"
    878             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
    879             :);
    880 
    881         __asm__ __volatile__(
    882             "sb         %[p4],  1(%[s2])    \n\t"
    883             "sb         %[p3],  0(%[s2])    \n\t"
    884             "sb         %[p2], -1(%[s2])    \n\t"
    885             "sb         %[p1], -2(%[s2])    \n\t"
    886             :
    887             : [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2),
    888               [p1] "r"(p1));
    889 
    890         __asm__ __volatile__(
    891             "srl        %[p4], %[p4], 8     \n\t"
    892             "srl        %[p3], %[p3], 8     \n\t"
    893             "srl        %[p2], %[p2], 8     \n\t"
    894             "srl        %[p1], %[p1], 8     \n\t"
    895             : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
    896             :);
    897 
    898         __asm__ __volatile__(
    899             "sb         %[p4],  1(%[s1])    \n\t"
    900             "sb         %[p3],  0(%[s1])    \n\t"
    901             "sb         %[p2], -1(%[s1])    \n\t"
    902             "sb         %[p1], -2(%[s1])    \n\t"
    903             :
    904             : [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2),
    905               [p1] "r"(p1));
    906       }
    907     }
    908 
    909     i += 8;
    910   } while (i < count);
    913 }
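
/* Each pass of the loop above filters two 4-row groups (8 rows), so the
 * do/while runs until i, advanced by 8 per pass, reaches count.
 */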
    914 
    915 void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p,
    916                                           unsigned int flimit,
    917                                           unsigned int limit,
    918                                           unsigned int thresh, int count) {
    919   uint32_t mask, hev;
    920   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    921   unsigned char *s1, *s2, *s3, *s4;
    922   uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
    923   (void)count;
    924 
    925   /* the loop filter is designed to work on chars so that we can make
    926    * maximum use of the 8-bit SIMD instructions.
    927    */
    928 
    929   /* apply the filter to 4 pixels at a time */
    930 
    931   s1 = s;
    932   s2 = s + p;
    933   s3 = s2 + p;
    934   s4 = s3 + p;
    935 
    936   /* load quad-byte vectors
    937    * memory is 4 byte aligned
    938    */
    939   p2 = *((uint32_t *)(s1 - 4));
    940   p6 = *((uint32_t *)(s1));
    941   p1 = *((uint32_t *)(s2 - 4));
    942   p5 = *((uint32_t *)(s2));
    943   p0 = *((uint32_t *)(s3 - 4));
    944   p4 = *((uint32_t *)(s3));
    945   pm1 = *((uint32_t *)(s4 - 4));
    946   p3 = *((uint32_t *)(s4));
    947 
    948   /* transpose pm1, p0, p1, p2 */
    949   __asm__ __volatile__(
    950       "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
    951       "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
    952       "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
    953       "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
    954 
    955       "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
    956       "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
    957       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
    958       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
    959 
    960       "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
    961       "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
    962       "append         %[p1],      %[sec3],    16          \n\t"
    963       "append         %[pm1],     %[sec4],    16          \n\t"
    964 
    965       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
    966         [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
    967         [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
    968       :);
    969 
    970   /* transpose p3, p4, p5, p6 */
    971   __asm__ __volatile__(
    972       "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
    973       "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
    974       "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
    975       "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
    976 
    977       "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
    978       "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
    979       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
    980       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
    981 
    982       "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
    983       "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
    984       "append         %[p5],      %[sec3],    16          \n\t"
    985       "append         %[p3],      %[sec4],    16          \n\t"
    986 
    987       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
    988         [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
    989         [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
    990       :);
    991 
    992   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    993    * mask will be zero and filtering is not needed
    994    */
    995   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
    996     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    997                              thresh, &hev, &mask);
    998 
    999     /* if mask == 0, filtering is not needed */
   1000     if (mask) {
   1001       /* filtering */
   1002       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
   1003 
   1004       /* unpack processed 4x4 neighborhood
   1005        * don't use transpose on output data
   1006        * because memory isn't aligned
   1007        */
   1008       __asm__ __volatile__(
   1009           "sb         %[p4],  1(%[s4])    \n\t"
   1010           "sb         %[p3],  0(%[s4])    \n\t"
   1011           "sb         %[p2], -1(%[s4])    \n\t"
   1012           "sb         %[p1], -2(%[s4])    \n\t"
   1013           :
   1014           :
   1015           [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1));
   1016 
   1017       __asm__ __volatile__(
   1018           "srl        %[p4], %[p4], 8     \n\t"
   1019           "srl        %[p3], %[p3], 8     \n\t"
   1020           "srl        %[p2], %[p2], 8     \n\t"
   1021           "srl        %[p1], %[p1], 8     \n\t"
   1022           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
   1023           :);
   1024 
   1025       __asm__ __volatile__(
   1026           "sb         %[p4],  1(%[s3])    \n\t"
   1027           "sb         %[p3],  0(%[s3])    \n\t"
   1028           "sb         %[p2], -1(%[s3])    \n\t"
   1029           "sb         %[p1], -2(%[s3])    \n\t"
   1030           : [p1] "+r"(p1)
   1031           : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
   1032 
   1033       __asm__ __volatile__(
   1034           "srl        %[p4], %[p4], 8     \n\t"
   1035           "srl        %[p3], %[p3], 8     \n\t"
   1036           "srl        %[p2], %[p2], 8     \n\t"
   1037           "srl        %[p1], %[p1], 8     \n\t"
   1038           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
   1039           :);
   1040 
   1041       __asm__ __volatile__(
   1042           "sb         %[p4],  1(%[s2])    \n\t"
   1043           "sb         %[p3],  0(%[s2])    \n\t"
   1044           "sb         %[p2], -1(%[s2])    \n\t"
   1045           "sb         %[p1], -2(%[s2])    \n\t"
   1046           :
   1047           :
   1048           [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1));
   1049 
   1050       __asm__ __volatile__(
   1051           "srl        %[p4], %[p4], 8     \n\t"
   1052           "srl        %[p3], %[p3], 8     \n\t"
   1053           "srl        %[p2], %[p2], 8     \n\t"
   1054           "srl        %[p1], %[p1], 8     \n\t"
   1055           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
   1056           :);
   1057 
   1058       __asm__ __volatile__(
   1059           "sb         %[p4],  1(%[s1])    \n\t"
   1060           "sb         %[p3],  0(%[s1])    \n\t"
   1061           "sb         %[p2], -1(%[s1])    \n\t"
   1062           "sb         %[p1], -2(%[s1])    \n\t"
   1063           :
   1064           :
   1065           [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1));
   1066     }
   1067   }
   1068 
   1069   s1 = s4 + p;
   1070   s2 = s1 + p;
   1071   s3 = s2 + p;
   1072   s4 = s3 + p;
   1073 
   1074   /* load quad-byte vectors
   1075    * memory is 4 byte aligned
   1076    */
   1077   p2 = *((uint32_t *)(s1 - 4));
   1078   p6 = *((uint32_t *)(s1));
   1079   p1 = *((uint32_t *)(s2 - 4));
   1080   p5 = *((uint32_t *)(s2));
   1081   p0 = *((uint32_t *)(s3 - 4));
   1082   p4 = *((uint32_t *)(s3));
   1083   pm1 = *((uint32_t *)(s4 - 4));
   1084   p3 = *((uint32_t *)(s4));
   1085 
   1086   /* transpose pm1, p0, p1, p2 */
   1087   __asm__ __volatile__(
   1088       "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
   1089       "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
   1090       "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
   1091       "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
   1092 
   1093       "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
   1094       "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
   1095       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1096       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1097 
   1098       "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
   1099       "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
   1100       "append         %[p1],      %[sec3],    16          \n\t"
   1101       "append         %[pm1],     %[sec4],    16          \n\t"
   1102 
   1103       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
   1104         [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
   1105         [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
   1106       :);
   1107 
   1108   /* transpose p3, p4, p5, p6 */
   1109   __asm__ __volatile__(
   1110       "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
   1111       "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
   1112       "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
   1113       "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
   1114 
   1115       "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
   1116       "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
   1117       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1118       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1119 
   1120       "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
   1121       "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
   1122       "append         %[p5],      %[sec3],    16          \n\t"
   1123       "append         %[p3],      %[sec4],    16          \n\t"
   1124 
   1125       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
   1126         [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
   1127         [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
   1128       :);
   1129 
   1130   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   1131    * mask will be zero and filtering is not needed
   1132    */
   1133   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
   1134     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1135                              thresh, &hev, &mask);
   1136 
   1137     /* if mask == 0, filtering is not needed */
   1138     if (mask) {
   1139       /* filtering */
   1140       vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
   1141 
   1142       /* unpack processed 4x4 neighborhood
   1143        * don't use transpose on output data
   1144        * because memory isn't aligned
   1145        */
   1146       __asm__ __volatile__(
   1147           "sb         %[p4],  1(%[s4])    \n\t"
   1148           "sb         %[p3],  0(%[s4])    \n\t"
   1149           "sb         %[p2], -1(%[s4])    \n\t"
   1150           "sb         %[p1], -2(%[s4])    \n\t"
   1151           :
   1152           :
   1153           [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4), [p2] "r"(p2), [p1] "r"(p1));
   1154 
   1155       __asm__ __volatile__(
   1156           "srl        %[p4], %[p4], 8     \n\t"
   1157           "srl        %[p3], %[p3], 8     \n\t"
   1158           "srl        %[p2], %[p2], 8     \n\t"
   1159           "srl        %[p1], %[p1], 8     \n\t"
   1160           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
   1161           :);
   1162 
   1163       __asm__ __volatile__(
   1164           "sb         %[p4],  1(%[s3])    \n\t"
   1165           "sb         %[p3],  0(%[s3])    \n\t"
   1166           "sb         %[p2], -1(%[s3])    \n\t"
   1167           "sb         %[p1], -2(%[s3])    \n\t"
   1168           : [p1] "+r"(p1)
   1169           : [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3), [p2] "r"(p2));
   1170 
   1171       __asm__ __volatile__(
   1172           "srl        %[p4], %[p4], 8     \n\t"
   1173           "srl        %[p3], %[p3], 8     \n\t"
   1174           "srl        %[p2], %[p2], 8     \n\t"
   1175           "srl        %[p1], %[p1], 8     \n\t"
   1176           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
   1177           :);
   1178 
   1179       __asm__ __volatile__(
   1180           "sb         %[p4],  1(%[s2])    \n\t"
   1181           "sb         %[p3],  0(%[s2])    \n\t"
   1182           "sb         %[p2], -1(%[s2])    \n\t"
   1183           "sb         %[p1], -2(%[s2])    \n\t"
   1184           :
   1185           :
   1186           [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2), [p2] "r"(p2), [p1] "r"(p1));
   1187 
   1188       __asm__ __volatile__(
   1189           "srl        %[p4], %[p4], 8     \n\t"
   1190           "srl        %[p3], %[p3], 8     \n\t"
   1191           "srl        %[p2], %[p2], 8     \n\t"
   1192           "srl        %[p1], %[p1], 8     \n\t"
   1193           : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
   1194           :);
   1195 
   1196       __asm__ __volatile__(
   1197           "sb         %[p4],  1(%[s1])    \n\t"
   1198           "sb         %[p3],  0(%[s1])    \n\t"
   1199           "sb         %[p2], -1(%[s1])    \n\t"
   1200           "sb         %[p1], -2(%[s1])    \n\t"
   1201           :
   1202           :
   1203           [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1), [p2] "r"(p2), [p1] "r"(p1));
   1204     }
   1205   }
   1206 }
   1207 
   1208 /* inputs & outputs are quad-byte vectors */
   1209 static __inline void vp8_mbfilter_mips(uint32_t mask, uint32_t hev,
   1210                                        uint32_t *ps2, uint32_t *ps1,
   1211                                        uint32_t *ps0, uint32_t *qs0,
   1212                                        uint32_t *qs1, uint32_t *qs2) {
   1213   int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
   1214   int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
   1215   int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
   1216   uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r,
   1217       subr_r, subr_l;
   1218   uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l,
   1219       invhev_r;
   1220   uint32_t N128, R63;
   1221   uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
   1222 
   1223   R63 = 0x003F003F;
   1224   HWM = 0xFF00FF00;
   1225   N128 = 0x80808080;
   1226   t1 = 0x03000300;
   1227   t2 = 0x04000400;
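
  /* Same packed constants as in vp8_filter_mips; R63 additionally holds 63
   * in each halfword, the rounding term of the wide-filter taps (in the VP8
   * reference these are (27*F + 63) >> 7, (18*F + 63) >> 7 and
   * (9*F + 63) >> 7).
   */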
   1228 
   1229   vps0 = (*ps0) ^ N128;
   1230   vps1 = (*ps1) ^ N128;
   1231   vps2 = (*ps2) ^ N128;
   1232   vqs0 = (*qs0) ^ N128;
   1233   vqs1 = (*qs1) ^ N128;
   1234   vqs2 = (*qs2) ^ N128;
   1235 
   1236   /* use halfword pairs instead of quad-bytes for better accuracy */
   1237   vps0_l = vps0 & HWM;
   1238   vps0_r = vps0 << 8;
   1239   vps0_r = vps0_r & HWM;
   1240 
   1241   vqs0_l = vqs0 & HWM;
   1242   vqs0_r = vqs0 << 8;
   1243   vqs0_r = vqs0_r & HWM;
   1244 
   1245   vps1_l = vps1 & HWM;
   1246   vps1_r = vps1 << 8;
   1247   vps1_r = vps1_r & HWM;
   1248 
   1249   vqs1_l = vqs1 & HWM;
   1250   vqs1_r = vqs1 << 8;
   1251   vqs1_r = vqs1_r & HWM;
   1252 
   1253   vqs2_l = vqs2 & HWM;
   1254   vqs2_r = vqs2 << 8;
   1255   vqs2_r = vqs2_r & HWM;
   1256 
   1257   __asm__ __volatile__(
   1258       /* qs0 - ps0 */
   1259       "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
   1260       "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
   1261 
   1262       /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
   1263       "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
   1264       "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
   1265 
   1266       : [vp8_filter_l] "=&r"(vp8_filter_l), [vp8_filter_r] "=r"(vp8_filter_r),
   1267         [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r)
   1268       : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
   1269         [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
   1270         [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r));
   1271 
   1272   vps2_l = vps2 & HWM;
   1273   vps2_r = vps2 << 8;
   1274   vps2_r = vps2_r & HWM;
   1275 
   1276   /* add outer taps if we have high edge variance */
   1277   __asm__ __volatile__(
   1278       /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
   1279       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
   1280       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
   1281       "and          %[mask_l],       %[HWM],          %[mask]         \n\t"
   1282       "sll          %[mask_r],       %[mask],         8               \n\t"
   1283       "and          %[mask_r],       %[HWM],          %[mask_r]       \n\t"
   1284       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
   1285       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
   1286       "and          %[hev_l],        %[HWM],          %[hev]          \n\t"
   1287       "sll          %[hev_r],        %[hev],          8               \n\t"
   1288       "and          %[hev_r],        %[HWM],          %[hev_r]        \n\t"
   1289       "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
   1290       "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
   1291 
   1292       /* vp8_filter &= mask; */
   1293       "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
   1294       "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
   1295 
   1296       /* Filter2 = vp8_filter & hev; */
   1297       "and          %[Filter2_l],    %[vp8_filter_l], %[hev_l]        \n\t"
   1298       "and          %[Filter2_r],    %[vp8_filter_r], %[hev_r]        \n\t"
   1299 
   1300       : [vp8_filter_l] "+r"(vp8_filter_l), [vp8_filter_r] "+r"(vp8_filter_r),
   1301         [hev_l] "=&r"(hev_l), [hev_r] "=&r"(hev_r), [mask_l] "=&r"(mask_l),
   1302         [mask_r] "=&r"(mask_r), [Filter2_l] "=&r"(Filter2_l),
   1303         [Filter2_r] "=&r"(Filter2_r)
   1304       : [subr_l] "r"(subr_l), [subr_r] "r"(subr_r), [HWM] "r"(HWM),
   1305         [hev] "r"(hev), [mask] "r"(mask));
   1306 
   1307   /* save bottom 3 bits so that we round one side +4 and the other +3 */
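           /* (Filter2 + 4) >> 3 and (Filter2 + 3) >> 3 differ by at most one,
            * so q0 and p0 move by (nearly) equal amounts in opposite directions.
            */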
   1308   __asm__ __volatile__(
   1309       /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >>= 3; */
   1310       "addq_s.ph    %[Filter1_l],    %[Filter2_l],    %[t2]           \n\t"
   1311       "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
   1312       "addq_s.ph    %[Filter1_r],    %[Filter2_r],    %[t2]           \n\t"
   1313 
   1314       /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >>= 3; */
   1315       "addq_s.ph    %[Filter2_l],    %[Filter2_l],    %[t1]           \n\t"
   1316       "addq_s.ph    %[Filter2_r],    %[Filter2_r],    %[t1]           \n\t"
   1317 
   1318       "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
   1319       "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
   1320 
   1321       "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
   1322       "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
   1323       "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
   1324       "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
   1325       "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
   1326 
   1327       /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
   1328       "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
   1329       "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
   1330 
   1331       /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
   1332       "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
   1333       "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
   1334 
   1335       : [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r),
   1336         [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
   1337         [Filter2_l] "+r"(Filter2_l), [Filter2_r] "+r"(Filter2_r),
   1338         [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
   1339         [vqs0_r] "+r"(vqs0_r)
   1340       : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM), [hev_l] "r"(hev_l),
   1341         [hev_r] "r"(hev_r));
   1342 
   1343   /* only apply wider filter if not high edge variance */
   1344   __asm__ __volatile__(
   1345       /* vp8_filter &= ~hev; */
   1346       "and          %[Filter2_l],    %[vp8_filter_l], %[invhev_l]     \n\t"
   1347       "and          %[Filter2_r],    %[vp8_filter_r], %[invhev_r]     \n\t"
   1348 
   1349       "shra.ph      %[Filter2_l],    %[Filter2_l],    8               \n\t"
   1350       "shra.ph      %[Filter2_r],    %[Filter2_r],    8               \n\t"
   1351 
   1352       : [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r)
   1353       : [vp8_filter_l] "r"(vp8_filter_l), [vp8_filter_r] "r"(vp8_filter_r),
   1354         [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
   1355 
   1356   /* roughly 3/7th difference across boundary */
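           /* the wide-filter taps are built with shifts and adds:
            *   u3 = 9 * Filter2   ((Filter2 << 3) + Filter2)
            *   u2 = 18 * Filter2  (u3 << 1)
            *   u1 = 27 * Filter2  (u3 + u2)
            * each is rounded with R63 (63 in every halfword) before the >> 7.
            */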
   1357   __asm__ __volatile__(
   1358       "shll.ph      %[u3_l],         %[Filter2_l],    3               \n\t"
   1359       "shll.ph      %[u3_r],         %[Filter2_r],    3               \n\t"
   1360 
   1361       "addq.ph      %[u3_l],         %[u3_l],         %[Filter2_l]    \n\t"
   1362       "addq.ph      %[u3_r],         %[u3_r],         %[Filter2_r]    \n\t"
   1363 
   1364       "shll.ph      %[u2_l],         %[u3_l],         1               \n\t"
   1365       "shll.ph      %[u2_r],         %[u3_r],         1               \n\t"
   1366 
   1367       "addq.ph      %[u1_l],         %[u3_l],         %[u2_l]         \n\t"
   1368       "addq.ph      %[u1_r],         %[u3_r],         %[u2_r]         \n\t"
   1369 
   1370       "addq.ph      %[u2_l],         %[u2_l],         %[R63]          \n\t"
   1371       "addq.ph      %[u2_r],         %[u2_r],         %[R63]          \n\t"
   1372 
   1373       "addq.ph      %[u3_l],         %[u3_l],         %[R63]          \n\t"
   1374       "addq.ph      %[u3_r],         %[u3_r],         %[R63]          \n\t"
   1375 
   1376       /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
   1377        * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
   1378        */
   1379       "addq.ph      %[u1_l],         %[u1_l],         %[R63]          \n\t"
   1380       "addq.ph      %[u1_r],         %[u1_r],         %[R63]          \n\t"
   1381       "shra.ph      %[u1_l],         %[u1_l],         7               \n\t"
   1382       "shra.ph      %[u1_r],         %[u1_r],         7               \n\t"
   1383       "shra.ph      %[u2_l],         %[u2_l],         7               \n\t"
   1384       "shra.ph      %[u2_r],         %[u2_r],         7               \n\t"
   1385       "shll.ph      %[u1_l],         %[u1_l],         8               \n\t"
   1386       "shll.ph      %[u1_r],         %[u1_r],         8               \n\t"
   1387       "shll.ph      %[u2_l],         %[u2_l],         8               \n\t"
   1388       "shll.ph      %[u2_r],         %[u2_r],         8               \n\t"
   1389 
   1390       /* vqs0 = vp8_signed_char_clamp(qs0 - u); */
   1391       "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[u1_l]         \n\t"
   1392       "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[u1_r]         \n\t"
   1393 
   1394       /* vps0 = vp8_signed_char_clamp(ps0 + u); */
   1395       "addq_s.ph    %[vps0_l],       %[vps0_l],       %[u1_l]         \n\t"
   1396       "addq_s.ph    %[vps0_r],       %[vps0_r],       %[u1_r]         \n\t"
   1397 
   1398       : [u1_l] "=&r"(u1_l), [u1_r] "=&r"(u1_r), [u2_l] "=&r"(u2_l),
   1399         [u2_r] "=&r"(u2_r), [u3_l] "=&r"(u3_l), [u3_r] "=&r"(u3_r),
   1400         [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
   1401         [vqs0_r] "+r"(vqs0_r)
   1402       : [R63] "r"(R63), [Filter2_l] "r"(Filter2_l), [Filter2_r] "r"(Filter2_r));
   1403 
   1404   __asm__ __volatile__(
   1405       /* vqs1 = vp8_signed_char_clamp(qs1 - u); */
   1406       "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[u2_l]         \n\t"
   1407       "addq_s.ph    %[vps1_l],       %[vps1_l],       %[u2_l]         \n\t"
   1408 
   1409       /* vps1 = vp8_signed_char_clamp(ps1 + u); */
   1410       "addq_s.ph    %[vps1_r],       %[vps1_r],       %[u2_r]         \n\t"
   1411       "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[u2_r]         \n\t"
   1412 
   1413       : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
   1414         [vqs1_r] "+r"(vqs1_r)
   1415       : [u2_l] "r"(u2_l), [u2_r] "r"(u2_r));
   1416 
   1417   /* roughly 1/7th difference across boundary */
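           /* u3 already holds 9 * Filter2 + 63 from the block above; only the
            * shift by 7 and the byte repositioning remain.
            */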
   1418   __asm__ __volatile__(
   1419       /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */
   1420       "shra.ph      %[u3_l],         %[u3_l],         7               \n\t"
   1421       "shra.ph      %[u3_r],         %[u3_r],         7               \n\t"
   1422       "shll.ph      %[u3_l],         %[u3_l],         8               \n\t"
   1423       "shll.ph      %[u3_r],         %[u3_r],         8               \n\t"
   1424 
   1425       /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
   1426       "subq_s.ph    %[vqs2_l],       %[vqs2_l],       %[u3_l]         \n\t"
   1427       "subq_s.ph    %[vqs2_r],       %[vqs2_r],       %[u3_r]         \n\t"
   1428 
   1429       /* vps2 = vp8_signed_char_clamp(ps2 + u); */
   1430       "addq_s.ph    %[vps2_l],       %[vps2_l],       %[u3_l]         \n\t"
   1431       "addq_s.ph    %[vps2_r],       %[vps2_r],       %[u3_r]         \n\t"
   1432 
   1433       : [u3_l] "+r"(u3_l), [u3_r] "+r"(u3_r), [vps2_l] "+r"(vps2_l),
   1434         [vps2_r] "+r"(vps2_r), [vqs2_l] "+r"(vqs2_l), [vqs2_r] "+r"(vqs2_r)
   1435       :);
   1436 
   1437   /* Create quad-bytes from halfword pairs */
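           /* inverse of the earlier split: keep the upper bytes of the _l
            * halves, move the _r halves back down by 8, and OR them into a
            * single quad-byte vector.
            */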
   1438   __asm__ __volatile__(
   1439       "and          %[vqs0_l],       %[vqs0_l],       %[HWM]          \n\t"
   1440       "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
   1441 
   1442       "and          %[vps0_l],       %[vps0_l],       %[HWM]          \n\t"
   1443       "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
   1444 
   1445       "and          %[vqs1_l],       %[vqs1_l],       %[HWM]          \n\t"
   1446       "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
   1447 
   1448       "and          %[vps1_l],       %[vps1_l],       %[HWM]          \n\t"
   1449       "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
   1450 
   1451       "and          %[vqs2_l],       %[vqs2_l],       %[HWM]          \n\t"
   1452       "shrl.ph      %[vqs2_r],       %[vqs2_r],       8               \n\t"
   1453 
   1454       "and          %[vps2_l],       %[vps2_l],       %[HWM]          \n\t"
   1455       "shrl.ph      %[vps2_r],       %[vps2_r],       8               \n\t"
   1456 
   1457       "or           %[vqs0_r],       %[vqs0_l],       %[vqs0_r]       \n\t"
   1458       "or           %[vps0_r],       %[vps0_l],       %[vps0_r]       \n\t"
   1459       "or           %[vqs1_r],       %[vqs1_l],       %[vqs1_r]       \n\t"
   1460       "or           %[vps1_r],       %[vps1_l],       %[vps1_r]       \n\t"
   1461       "or           %[vqs2_r],       %[vqs2_l],       %[vqs2_r]       \n\t"
   1462       "or           %[vps2_r],       %[vps2_l],       %[vps2_r]       \n\t"
   1463 
   1464       : [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
   1465         [vqs1_r] "+r"(vqs1_r), [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r),
   1466         [vqs0_l] "+r"(vqs0_l), [vqs0_r] "+r"(vqs0_r), [vqs2_l] "+r"(vqs2_l),
   1467         [vqs2_r] "+r"(vqs2_r), [vps2_r] "+r"(vps2_r), [vps2_l] "+r"(vps2_l)
   1468       : [HWM] "r"(HWM));
   1469 
   1470   *ps0 = vps0_r ^ N128;
   1471   *ps1 = vps1_r ^ N128;
   1472   *ps2 = vps2_r ^ N128;
   1473   *qs0 = vqs0_r ^ N128;
   1474   *qs1 = vqs1_r ^ N128;
   1475   *qs2 = vqs2_r ^ N128;
   1476 }
   1477 
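         /*
          * Reference sketch only (not called from this file): a scalar view of
          * the per-pixel arithmetic that the DSPR2 code above vectorizes.  It
          * assumes the pixels have already been offset into signed chars (the
          * ^ N128 step) and that mask/hev are 0 or ~0.  The helper names below
          * are illustrative, not part of the library API.
          */
         static signed char lf_clamp_s8_sketch(int t) {
           return (signed char)(t < -128 ? -128 : (t > 127 ? 127 : t));
         }

         static void lf_mbfilter_scalar_sketch(int mask, int hev, signed char *ps2,
                                               signed char *ps1, signed char *ps0,
                                               signed char *qs0, signed char *qs1,
                                               signed char *qs2) {
           signed char w, Filter1, Filter2, u;

           /* w = clamp(ps1 - qs1) + 3 * (qs0 - ps0), gated by the filter mask */
           w = lf_clamp_s8_sketch(*ps1 - *qs1);
           w = lf_clamp_s8_sketch(w + 3 * (*qs0 - *ps0));
           w &= mask;

           /* narrow filter where edge variance is high: +4/+3 rounding, >> 3 */
           Filter2 = w & hev;
           Filter1 = lf_clamp_s8_sketch(Filter2 + 4) >> 3;
           Filter2 = lf_clamp_s8_sketch(Filter2 + 3) >> 3;
           *qs0 = lf_clamp_s8_sketch(*qs0 - Filter1);
           *ps0 = lf_clamp_s8_sketch(*ps0 + Filter2);

           /* wide filter elsewhere: the rounded 27, 18 and 9 taps (>> 7) */
           w &= ~hev;
           u = lf_clamp_s8_sketch((63 + w * 27) >> 7);
           *qs0 = lf_clamp_s8_sketch(*qs0 - u);
           *ps0 = lf_clamp_s8_sketch(*ps0 + u);
           u = lf_clamp_s8_sketch((63 + w * 18) >> 7);
           *qs1 = lf_clamp_s8_sketch(*qs1 - u);
           *ps1 = lf_clamp_s8_sketch(*ps1 + u);
           u = lf_clamp_s8_sketch((63 + w * 9) >> 7);
           *qs2 = lf_clamp_s8_sketch(*qs2 - u);
           *ps2 = lf_clamp_s8_sketch(*ps2 + u);
         }
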
   1478 void vp8_mbloop_filter_horizontal_edge_mips(unsigned char *s, int p,
   1479                                             unsigned int flimit,
   1480                                             unsigned int limit,
   1481                                             unsigned int thresh, int count) {
   1482   int i;
   1483   uint32_t mask, hev;
   1484   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   1485   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
   1486 
   1487   mask = 0;
   1488   hev = 0;
   1489   i = 0;
   1490   p1 = 0;
   1491   p2 = 0;
   1492   p3 = 0;
   1493   p4 = 0;
   1494 
    1495   /* The loop filter is designed to work on chars so that we can make maximum
    1496    * use of the 8-bit SIMD instructions.
   1497    */
   1498 
   1499   sm1 = s - (p << 2);
   1500   s0 = s - p - p - p;
   1501   s1 = s - p - p;
   1502   s2 = s - p;
   1503   s3 = s;
   1504   s4 = s + p;
   1505   s5 = s + p + p;
   1506   s6 = s + p + p + p;
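           /* sm1, s0, s1, s2 walk the four rows above the edge (p3, p2, p1, p0
            * in loop-filter terms) and s3..s6 the four rows below it (q0..q3).
            */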
   1507 
   1508   /* prefetch data for load */
   1509   prefetch_load_lf(s + p);
   1510 
    1511   /* apply filter on 4 pixels at the same time */
   1512   do {
   1513     /* load quad-byte vectors
   1514      * memory is 4 byte aligned
   1515      */
   1516     p1 = *((uint32_t *)(s1));
   1517     p2 = *((uint32_t *)(s2));
   1518     p3 = *((uint32_t *)(s3));
   1519     p4 = *((uint32_t *)(s4));
   1520 
   1521     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   1522      * mask will be zero and filtering is not needed
   1523      */
   1524     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
   1525       pm1 = *((uint32_t *)(sm1));
   1526       p0 = *((uint32_t *)(s0));
   1527       p5 = *((uint32_t *)(s5));
   1528       p6 = *((uint32_t *)(s6));
   1529 
   1530       vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1531                                thresh, &hev, &mask);
   1532 
    1533       /* if mask == 0, filtering is not needed */
   1534       if (mask) {
   1535         /* filtering */
   1536         vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   1537 
    1538         /* write back the filtered pixels
   1539          * memory is 4 byte aligned
   1540          */
   1541         *((uint32_t *)s0) = p0;
   1542         *((uint32_t *)s1) = p1;
   1543         *((uint32_t *)s2) = p2;
   1544         *((uint32_t *)s3) = p3;
   1545         *((uint32_t *)s4) = p4;
   1546         *((uint32_t *)s5) = p5;
   1547       }
   1548     }
   1549 
   1550     sm1 += 4;
   1551     s0 += 4;
   1552     s1 += 4;
   1553     s2 += 4;
   1554     s3 += 4;
   1555     s4 += 4;
   1556     s5 += 4;
   1557     s6 += 4;
   1558 
   1559     /* load quad-byte vectors
   1560      * memory is 4 byte aligned
   1561      */
   1562     p1 = *((uint32_t *)(s1));
   1563     p2 = *((uint32_t *)(s2));
   1564     p3 = *((uint32_t *)(s3));
   1565     p4 = *((uint32_t *)(s4));
   1566 
   1567     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   1568      * mask will be zero and filtering is not needed
   1569      */
   1570     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
   1571       pm1 = *((uint32_t *)(sm1));
   1572       p0 = *((uint32_t *)(s0));
   1573       p5 = *((uint32_t *)(s5));
   1574       p6 = *((uint32_t *)(s6));
   1575 
   1576       vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1577                                thresh, &hev, &mask);
   1578 
    1579       /* if mask == 0, filtering is not needed */
   1580       if (mask) {
   1581         /* filtering */
   1582         vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   1583 
    1584         /* write back the filtered pixels
   1585          * memory is 4 byte aligned
   1586          */
   1587         *((uint32_t *)s0) = p0;
   1588         *((uint32_t *)s1) = p1;
   1589         *((uint32_t *)s2) = p2;
   1590         *((uint32_t *)s3) = p3;
   1591         *((uint32_t *)s4) = p4;
   1592         *((uint32_t *)s5) = p5;
   1593       }
   1594     }
   1595 
   1596     sm1 += 4;
   1597     s0 += 4;
   1598     s1 += 4;
   1599     s2 += 4;
   1600     s3 += 4;
   1601     s4 += 4;
   1602     s5 += 4;
   1603     s6 += 4;
   1604 
   1605     i += 8;
    1606   } while (i < count);
   1609 }
   1610 
   1611 void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p,
   1612                                               unsigned int flimit,
   1613                                               unsigned int limit,
   1614                                               unsigned int thresh, int count) {
   1615   uint32_t mask, hev;
   1616   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   1617   unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
   1618   (void)count;
   1619 
   1620   mask = 0;
   1621   hev = 0;
   1622   p1 = 0;
   1623   p2 = 0;
   1624   p3 = 0;
   1625   p4 = 0;
   1626 
    1627   /* The loop filter is designed to work on chars so that we can make maximum
    1628    * use of the 8-bit SIMD instructions.
   1629    */
   1630 
   1631   sm1 = s - (p << 2);
   1632   s0 = s - p - p - p;
   1633   s1 = s - p - p;
   1634   s2 = s - p;
   1635   s3 = s;
   1636   s4 = s + p;
   1637   s5 = s + p + p;
   1638   s6 = s + p + p + p;
   1639 
   1640   /* load quad-byte vectors
   1641    * memory is 4 byte aligned
   1642    */
   1643   p1 = *((uint32_t *)(s1));
   1644   p2 = *((uint32_t *)(s2));
   1645   p3 = *((uint32_t *)(s3));
   1646   p4 = *((uint32_t *)(s4));
   1647 
   1648   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   1649    * mask will be zero and filtering is not needed
   1650    */
   1651   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
   1652     pm1 = *((uint32_t *)(sm1));
   1653     p0 = *((uint32_t *)(s0));
   1654     p5 = *((uint32_t *)(s5));
   1655     p6 = *((uint32_t *)(s6));
   1656 
    1657     /* if mask == 0, filtering is not needed */
   1658     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1659                              thresh, &hev, &mask);
   1660 
   1661     if (mask) {
   1662       /* filtering */
   1663       vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   1664 
    1665       /* write back the filtered pixels
   1666        * memory is 4 byte aligned
   1667        */
   1668       *((uint32_t *)s0) = p0;
   1669       *((uint32_t *)s1) = p1;
   1670       *((uint32_t *)s2) = p2;
   1671       *((uint32_t *)s3) = p3;
   1672       *((uint32_t *)s4) = p4;
   1673       *((uint32_t *)s5) = p5;
   1674     }
   1675   }
   1676 
   1677   sm1 += 4;
   1678   s0 += 4;
   1679   s1 += 4;
   1680   s2 += 4;
   1681   s3 += 4;
   1682   s4 += 4;
   1683   s5 += 4;
   1684   s6 += 4;
   1685 
   1686   /* load quad-byte vectors
   1687    * memory is 4 byte aligned
   1688    */
   1689   p1 = *((uint32_t *)(s1));
   1690   p2 = *((uint32_t *)(s2));
   1691   p3 = *((uint32_t *)(s3));
   1692   p4 = *((uint32_t *)(s4));
   1693 
   1694   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   1695    * mask will be zero and filtering is not needed
   1696    */
   1697   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
   1698     pm1 = *((uint32_t *)(sm1));
   1699     p0 = *((uint32_t *)(s0));
   1700     p5 = *((uint32_t *)(s5));
   1701     p6 = *((uint32_t *)(s6));
   1702 
   1703     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1704                              thresh, &hev, &mask);
   1705 
    1706     /* if mask == 0, filtering is not needed */
   1707     if (mask) {
   1708       /* filtering */
   1709       vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   1710 
    1711       /* write back the filtered pixels
   1712        * memory is 4 byte aligned
   1713        */
   1714       *((uint32_t *)s0) = p0;
   1715       *((uint32_t *)s1) = p1;
   1716       *((uint32_t *)s2) = p2;
   1717       *((uint32_t *)s3) = p3;
   1718       *((uint32_t *)s4) = p4;
   1719       *((uint32_t *)s5) = p5;
   1720     }
   1721   }
   1722 }
   1723 
   1724 void vp8_mbloop_filter_vertical_edge_mips(unsigned char *s, int p,
   1725                                           unsigned int flimit,
   1726                                           unsigned int limit,
   1727                                           unsigned int thresh, int count) {
   1728   int i;
   1729   uint32_t mask, hev;
   1730   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   1731   unsigned char *s1, *s2, *s3, *s4;
   1732   uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
   1733 
   1734   mask = 0;
   1735   hev = 0;
   1736   i = 0;
   1737   pm1 = 0;
   1738   p0 = 0;
   1739   p1 = 0;
   1740   p2 = 0;
   1741   p3 = 0;
   1742   p4 = 0;
   1743   p5 = 0;
   1744   p6 = 0;
   1745 
    1746   /* The loop filter is designed to work on chars so that we can make maximum
    1747    * use of the 8-bit SIMD instructions.
   1748    */
   1749 
    1750   /* apply filter on 4 pixels at the same time */
   1751   do {
   1752     s1 = s;
   1753     s2 = s + p;
   1754     s3 = s2 + p;
   1755     s4 = s3 + p;
   1756     s = s4 + p;
   1757 
   1758     /* load quad-byte vectors
   1759      * memory is 4 byte aligned
   1760      */
   1761     p2 = *((uint32_t *)(s1 - 4));
   1762     p6 = *((uint32_t *)(s1));
   1763     p1 = *((uint32_t *)(s2 - 4));
   1764     p5 = *((uint32_t *)(s2));
   1765     p0 = *((uint32_t *)(s3 - 4));
   1766     p4 = *((uint32_t *)(s3));
   1767     pm1 = *((uint32_t *)(s4 - 4));
   1768     p3 = *((uint32_t *)(s4));
   1769 
   1770     /* transpose pm1, p0, p1, p2 */
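             /* the precrq/precr byte picks plus the append steps amount to a
              * 4x4 byte transpose: each of p2, p1, p0, pm1 ends up holding a
              * column of four vertically adjacent pixels instead of row data.
              */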
   1771     __asm__ __volatile__(
   1772         "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
   1773         "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
   1774         "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
   1775         "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
   1776 
   1777         "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
   1778         "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
   1779         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1780         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1781 
   1782         "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
   1783         "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
   1784         "append         %[p1],      %[sec3],    16          \n\t"
   1785         "append         %[pm1],     %[sec4],    16          \n\t"
   1786 
   1787         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
   1788           [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
   1789           [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
   1790         :);
   1791 
   1792     /* transpose p3, p4, p5, p6 */
   1793     __asm__ __volatile__(
   1794         "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
   1795         "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
   1796         "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
   1797         "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
   1798 
   1799         "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
   1800         "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
   1801         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1802         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1803 
   1804         "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
   1805         "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
   1806         "append         %[p5],      %[sec3],    16          \n\t"
   1807         "append         %[p3],      %[sec4],    16          \n\t"
   1808 
   1809         : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
   1810           [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
   1811           [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
   1812         :);
   1813 
   1814     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   1815      * mask will be zero and filtering is not needed
   1816      */
   1817     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
   1818       vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1819                                thresh, &hev, &mask);
   1820 
    1821       /* if mask == 0, filtering is not needed */
   1822       if (mask) {
   1823         /* filtering */
   1824         vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   1825 
   1826         /* don't use transpose on output data
   1827          * because memory isn't aligned
   1828          */
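                 /* each pass stores the least significant byte of the six
                  * filtered registers to one row, then shifts every register
                  * right by 8 to expose the bytes for the next row up.
                  */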
   1829         __asm__ __volatile__(
   1830             "sb         %[p5],  2(%[s4])        \n\t"
   1831             "sb         %[p4],  1(%[s4])        \n\t"
   1832             "sb         %[p3],  0(%[s4])        \n\t"
   1833             "sb         %[p2], -1(%[s4])        \n\t"
   1834             "sb         %[p1], -2(%[s4])        \n\t"
   1835             "sb         %[p0], -3(%[s4])        \n\t"
   1836             :
   1837             : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
   1838               [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   1839 
   1840         __asm__ __volatile__(
   1841             "srl        %[p5], %[p5], 8         \n\t"
   1842             "srl        %[p4], %[p4], 8         \n\t"
   1843             "srl        %[p3], %[p3], 8         \n\t"
   1844             "srl        %[p2], %[p2], 8         \n\t"
   1845             "srl        %[p1], %[p1], 8         \n\t"
   1846             "srl        %[p0], %[p0], 8         \n\t"
   1847             : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
   1848               [p1] "+r"(p1), [p0] "+r"(p0)
   1849             :);
   1850 
   1851         __asm__ __volatile__(
   1852             "sb         %[p5],  2(%[s3])        \n\t"
   1853             "sb         %[p4],  1(%[s3])        \n\t"
   1854             "sb         %[p3],  0(%[s3])        \n\t"
   1855             "sb         %[p2], -1(%[s3])        \n\t"
   1856             "sb         %[p1], -2(%[s3])        \n\t"
   1857             "sb         %[p0], -3(%[s3])        \n\t"
   1858             :
   1859             : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
   1860               [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   1861 
   1862         __asm__ __volatile__(
   1863             "srl        %[p5], %[p5], 8         \n\t"
   1864             "srl        %[p4], %[p4], 8         \n\t"
   1865             "srl        %[p3], %[p3], 8         \n\t"
   1866             "srl        %[p2], %[p2], 8         \n\t"
   1867             "srl        %[p1], %[p1], 8         \n\t"
   1868             "srl        %[p0], %[p0], 8         \n\t"
   1869             : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
   1870               [p1] "+r"(p1), [p0] "+r"(p0)
   1871             :);
   1872 
   1873         __asm__ __volatile__(
   1874             "sb         %[p5],  2(%[s2])        \n\t"
   1875             "sb         %[p4],  1(%[s2])        \n\t"
   1876             "sb         %[p3],  0(%[s2])        \n\t"
   1877             "sb         %[p2], -1(%[s2])        \n\t"
   1878             "sb         %[p1], -2(%[s2])        \n\t"
   1879             "sb         %[p0], -3(%[s2])        \n\t"
   1880             :
   1881             : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
   1882               [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   1883 
   1884         __asm__ __volatile__(
   1885             "srl        %[p5], %[p5], 8         \n\t"
   1886             "srl        %[p4], %[p4], 8         \n\t"
   1887             "srl        %[p3], %[p3], 8         \n\t"
   1888             "srl        %[p2], %[p2], 8         \n\t"
   1889             "srl        %[p1], %[p1], 8         \n\t"
   1890             "srl        %[p0], %[p0], 8         \n\t"
   1891             : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
   1892               [p1] "+r"(p1), [p0] "+r"(p0)
   1893             :);
   1894 
   1895         __asm__ __volatile__(
   1896             "sb         %[p5],  2(%[s1])        \n\t"
   1897             "sb         %[p4],  1(%[s1])        \n\t"
   1898             "sb         %[p3],  0(%[s1])        \n\t"
   1899             "sb         %[p2], -1(%[s1])        \n\t"
   1900             "sb         %[p1], -2(%[s1])        \n\t"
   1901             "sb         %[p0], -3(%[s1])        \n\t"
   1902             :
   1903             : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
   1904               [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   1905       }
   1906     }
   1907 
   1908     i += 4;
    1909   } while (i < count);
   1912 }
   1913 
   1914 void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p,
   1915                                             unsigned int flimit,
   1916                                             unsigned int limit,
   1917                                             unsigned int thresh, int count) {
   1918   uint32_t mask, hev;
   1919   uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   1920   unsigned char *s1, *s2, *s3, *s4;
   1921   uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
   1922   (void)count;
   1923 
   1924   mask = 0;
   1925   hev = 0;
   1926   pm1 = 0;
   1927   p0 = 0;
   1928   p1 = 0;
   1929   p2 = 0;
   1930   p3 = 0;
   1931   p4 = 0;
   1932   p5 = 0;
   1933   p6 = 0;
   1934 
    1935   /* The loop filter is designed to work on chars so that we can make maximum
    1936    * use of the 8-bit SIMD instructions.
   1937    */
   1938 
    1939   /* apply filter on 4 pixels at the same time */
   1940 
   1941   s1 = s;
   1942   s2 = s + p;
   1943   s3 = s2 + p;
   1944   s4 = s3 + p;
   1945 
   1946   /* prefetch data for load */
   1947   prefetch_load_lf(s + 2 * p);
   1948 
   1949   /* load quad-byte vectors
   1950    * memory is 4 byte aligned
   1951    */
   1952   p2 = *((uint32_t *)(s1 - 4));
   1953   p6 = *((uint32_t *)(s1));
   1954   p1 = *((uint32_t *)(s2 - 4));
   1955   p5 = *((uint32_t *)(s2));
   1956   p0 = *((uint32_t *)(s3 - 4));
   1957   p4 = *((uint32_t *)(s3));
   1958   pm1 = *((uint32_t *)(s4 - 4));
   1959   p3 = *((uint32_t *)(s4));
   1960 
   1961   /* transpose pm1, p0, p1, p2 */
   1962   __asm__ __volatile__(
   1963       "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
   1964       "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
   1965       "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
   1966       "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
   1967 
   1968       "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
   1969       "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
   1970       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1971       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1972 
   1973       "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
   1974       "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
   1975       "append         %[p1],      %[sec3],    16          \n\t"
   1976       "append         %[pm1],     %[sec4],    16          \n\t"
   1977 
   1978       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
   1979         [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
   1980         [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
   1981       :);
   1982 
   1983   /* transpose p3, p4, p5, p6 */
   1984   __asm__ __volatile__(
   1985       "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
   1986       "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
   1987       "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
   1988       "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
   1989 
   1990       "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
   1991       "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
   1992       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1993       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1994 
   1995       "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
   1996       "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
   1997       "append         %[p5],      %[sec3],    16          \n\t"
   1998       "append         %[p3],      %[sec4],    16          \n\t"
   1999 
   2000       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
   2001         [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
   2002         [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
   2003       :);
   2004 
   2005   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   2006    * mask will be zero and filtering is not needed
   2007    */
   2008   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
   2009     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   2010                              thresh, &hev, &mask);
   2011 
    2012     /* if mask == 0, filtering is not needed */
   2013     if (mask) {
   2014       /* filtering */
   2015       vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   2016 
   2017       /* don't use transpose on output data
   2018        * because memory isn't aligned
   2019        */
   2020       __asm__ __volatile__(
   2021           "sb         %[p5],  2(%[s4])        \n\t"
   2022           "sb         %[p4],  1(%[s4])        \n\t"
   2023           "sb         %[p3],  0(%[s4])        \n\t"
   2024           "sb         %[p2], -1(%[s4])        \n\t"
   2025           "sb         %[p1], -2(%[s4])        \n\t"
   2026           "sb         %[p0], -3(%[s4])        \n\t"
   2027           :
   2028           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
   2029             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   2030 
   2031       __asm__ __volatile__(
   2032           "srl        %[p5], %[p5], 8         \n\t"
   2033           "srl        %[p4], %[p4], 8         \n\t"
   2034           "srl        %[p3], %[p3], 8         \n\t"
   2035           "srl        %[p2], %[p2], 8         \n\t"
   2036           "srl        %[p1], %[p1], 8         \n\t"
   2037           "srl        %[p0], %[p0], 8         \n\t"
   2038           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
   2039             [p1] "+r"(p1), [p0] "+r"(p0)
   2040           :);
   2041 
   2042       __asm__ __volatile__(
   2043           "sb         %[p5],  2(%[s3])        \n\t"
   2044           "sb         %[p4],  1(%[s3])        \n\t"
   2045           "sb         %[p3],  0(%[s3])        \n\t"
   2046           "sb         %[p2], -1(%[s3])        \n\t"
   2047           "sb         %[p1], -2(%[s3])        \n\t"
   2048           "sb         %[p0], -3(%[s3])        \n\t"
   2049           :
   2050           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
   2051             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   2052 
   2053       __asm__ __volatile__(
   2054           "srl        %[p5], %[p5], 8         \n\t"
   2055           "srl        %[p4], %[p4], 8         \n\t"
   2056           "srl        %[p3], %[p3], 8         \n\t"
   2057           "srl        %[p2], %[p2], 8         \n\t"
   2058           "srl        %[p1], %[p1], 8         \n\t"
   2059           "srl        %[p0], %[p0], 8         \n\t"
   2060           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
   2061             [p1] "+r"(p1), [p0] "+r"(p0)
   2062           :);
   2063 
   2064       __asm__ __volatile__(
   2065           "sb         %[p5],  2(%[s2])        \n\t"
   2066           "sb         %[p4],  1(%[s2])        \n\t"
   2067           "sb         %[p3],  0(%[s2])        \n\t"
   2068           "sb         %[p2], -1(%[s2])        \n\t"
   2069           "sb         %[p1], -2(%[s2])        \n\t"
   2070           "sb         %[p0], -3(%[s2])        \n\t"
   2071           :
   2072           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
   2073             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   2074 
   2075       __asm__ __volatile__(
   2076           "srl        %[p5], %[p5], 8         \n\t"
   2077           "srl        %[p4], %[p4], 8         \n\t"
   2078           "srl        %[p3], %[p3], 8         \n\t"
   2079           "srl        %[p2], %[p2], 8         \n\t"
   2080           "srl        %[p1], %[p1], 8         \n\t"
   2081           "srl        %[p0], %[p0], 8         \n\t"
   2082           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
   2083             [p1] "+r"(p1), [p0] "+r"(p0)
   2084           :);
   2085 
   2086       __asm__ __volatile__(
   2087           "sb         %[p5],  2(%[s1])        \n\t"
   2088           "sb         %[p4],  1(%[s1])        \n\t"
   2089           "sb         %[p3],  0(%[s1])        \n\t"
   2090           "sb         %[p2], -1(%[s1])        \n\t"
   2091           "sb         %[p1], -2(%[s1])        \n\t"
   2092           "sb         %[p0], -3(%[s1])        \n\t"
   2093           :
   2094           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
   2095             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   2096     }
   2097   }
   2098 
   2099   s1 = s4 + p;
   2100   s2 = s1 + p;
   2101   s3 = s2 + p;
   2102   s4 = s3 + p;
   2103 
   2104   /* load quad-byte vectors
    2105    * memory is 4 byte aligned
    2106    */
   2107   p2 = *((uint32_t *)(s1 - 4));
   2108   p6 = *((uint32_t *)(s1));
   2109   p1 = *((uint32_t *)(s2 - 4));
   2110   p5 = *((uint32_t *)(s2));
   2111   p0 = *((uint32_t *)(s3 - 4));
   2112   p4 = *((uint32_t *)(s3));
   2113   pm1 = *((uint32_t *)(s4 - 4));
   2114   p3 = *((uint32_t *)(s4));
   2115 
   2116   /* transpose pm1, p0, p1, p2 */
   2117   __asm__ __volatile__(
   2118       "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
   2119       "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
   2120       "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
   2121       "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
   2122 
   2123       "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
   2124       "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
   2125       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   2126       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   2127 
   2128       "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
   2129       "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
   2130       "append         %[p1],      %[sec3],    16          \n\t"
   2131       "append         %[pm1],     %[sec4],    16          \n\t"
   2132 
   2133       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
   2134         [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
   2135         [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
   2136       :);
   2137 
   2138   /* transpose p3, p4, p5, p6 */
   2139   __asm__ __volatile__(
   2140       "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
   2141       "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
   2142       "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
   2143       "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
   2144 
   2145       "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
   2146       "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
   2147       "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   2148       "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   2149 
   2150       "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
   2151       "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
   2152       "append         %[p5],      %[sec3],    16          \n\t"
   2153       "append         %[p3],      %[sec4],    16          \n\t"
   2154 
   2155       : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
   2156         [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
   2157         [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
   2158       :);
   2159 
   2160   /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   2161    * mask will be zero and filtering is not needed
   2162    */
   2163   if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
   2164     vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   2165                              thresh, &hev, &mask);
   2166 
    2167     /* if mask == 0, filtering is not needed */
   2168     if (mask) {
   2169       /* filtering */
   2170       vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   2171 
   2172       /* don't use transpose on output data
   2173        * because memory isn't aligned
   2174        */
   2175       __asm__ __volatile__(
   2176           "sb         %[p5],  2(%[s4])        \n\t"
   2177           "sb         %[p4],  1(%[s4])        \n\t"
   2178           "sb         %[p3],  0(%[s4])        \n\t"
   2179           "sb         %[p2], -1(%[s4])        \n\t"
   2180           "sb         %[p1], -2(%[s4])        \n\t"
   2181           "sb         %[p0], -3(%[s4])        \n\t"
   2182           :
   2183           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s4] "r"(s4),
   2184             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   2185 
   2186       __asm__ __volatile__(
   2187           "srl        %[p5], %[p5], 8         \n\t"
   2188           "srl        %[p4], %[p4], 8         \n\t"
   2189           "srl        %[p3], %[p3], 8         \n\t"
   2190           "srl        %[p2], %[p2], 8         \n\t"
   2191           "srl        %[p1], %[p1], 8         \n\t"
   2192           "srl        %[p0], %[p0], 8         \n\t"
   2193           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
   2194             [p1] "+r"(p1), [p0] "+r"(p0)
   2195           :);
   2196 
   2197       __asm__ __volatile__(
   2198           "sb         %[p5],  2(%[s3])        \n\t"
   2199           "sb         %[p4],  1(%[s3])        \n\t"
   2200           "sb         %[p3],  0(%[s3])        \n\t"
   2201           "sb         %[p2], -1(%[s3])        \n\t"
   2202           "sb         %[p1], -2(%[s3])        \n\t"
   2203           "sb         %[p0], -3(%[s3])        \n\t"
   2204           :
   2205           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s3] "r"(s3),
   2206             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   2207 
   2208       __asm__ __volatile__(
   2209           "srl        %[p5], %[p5], 8         \n\t"
   2210           "srl        %[p4], %[p4], 8         \n\t"
   2211           "srl        %[p3], %[p3], 8         \n\t"
   2212           "srl        %[p2], %[p2], 8         \n\t"
   2213           "srl        %[p1], %[p1], 8         \n\t"
   2214           "srl        %[p0], %[p0], 8         \n\t"
   2215           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
   2216             [p1] "+r"(p1), [p0] "+r"(p0)
   2217           :);
   2218 
   2219       __asm__ __volatile__(
   2220           "sb         %[p5],  2(%[s2])        \n\t"
   2221           "sb         %[p4],  1(%[s2])        \n\t"
   2222           "sb         %[p3],  0(%[s2])        \n\t"
   2223           "sb         %[p2], -1(%[s2])        \n\t"
   2224           "sb         %[p1], -2(%[s2])        \n\t"
   2225           "sb         %[p0], -3(%[s2])        \n\t"
   2226           :
   2227           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s2] "r"(s2),
   2228             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   2229 
   2230       __asm__ __volatile__(
   2231           "srl        %[p5], %[p5], 8         \n\t"
   2232           "srl        %[p4], %[p4], 8         \n\t"
   2233           "srl        %[p3], %[p3], 8         \n\t"
   2234           "srl        %[p2], %[p2], 8         \n\t"
   2235           "srl        %[p1], %[p1], 8         \n\t"
   2236           "srl        %[p0], %[p0], 8         \n\t"
   2237           : [p5] "+r"(p5), [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2),
   2238             [p1] "+r"(p1), [p0] "+r"(p0)
   2239           :);
   2240 
   2241       __asm__ __volatile__(
   2242           "sb         %[p5],  2(%[s1])        \n\t"
   2243           "sb         %[p4],  1(%[s1])        \n\t"
   2244           "sb         %[p3],  0(%[s1])        \n\t"
   2245           "sb         %[p2], -1(%[s1])        \n\t"
   2246           "sb         %[p1], -2(%[s1])        \n\t"
   2247           "sb         %[p0], -3(%[s1])        \n\t"
   2248           :
   2249           : [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [s1] "r"(s1),
   2250             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0));
   2251     }
   2252   }
   2253 }
   2254 
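         /*
          * Illustration only (not used below): the wrapper functions rely on
          * the DSPR2 replv.qb instruction to broadcast the 8-bit thresh,
          * flimit and limit values into all four byte lanes of a 32-bit
          * vector.  A portable-C sketch of that broadcast (the helper name is
          * hypothetical) would be:
          */
         static unsigned int lf_replicate_byte_sketch(unsigned char v) {
           return (unsigned int)v * 0x01010101u; /* e.g. 0x28 -> 0x28282828 */
         }
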
   2255 /* Horizontal MB filtering */
   2256 void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
   2257                                unsigned char *v_ptr, int y_stride,
   2258                                int uv_stride, loop_filter_info *lfi) {
   2259   unsigned int thresh_vec, flimit_vec, limit_vec;
   2260   unsigned char thresh, flimit, limit, flimit_temp;
   2261 
    2262   /* use direct values instead of pointers */
   2263   limit = *(lfi->lim);
   2264   flimit_temp = *(lfi->mblim);
   2265   thresh = *(lfi->hev_thr);
   2266   flimit = flimit_temp;
   2267 
   2268   /* create quad-byte */
   2269   __asm__ __volatile__(
   2270       "replv.qb       %[thresh_vec], %[thresh]    \n\t"
   2271       "replv.qb       %[flimit_vec], %[flimit]    \n\t"
   2272       "replv.qb       %[limit_vec],  %[limit]     \n\t"
   2273       : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
   2274         [limit_vec] "=r"(limit_vec)
   2275       : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
   2276 
   2277   vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
   2278                                          thresh_vec, 16);
   2279 
   2280   if (u_ptr) {
   2281     vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec,
   2282                                              limit_vec, thresh_vec, 0);
   2283   }
   2284 
   2285   if (v_ptr) {
   2286     vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec,
   2287                                              limit_vec, thresh_vec, 0);
   2288   }
   2289 }
   2290 
   2291 /* Vertical MB Filtering */
   2292 void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
   2293                                unsigned char *v_ptr, int y_stride,
   2294                                int uv_stride, loop_filter_info *lfi) {
   2295   unsigned int thresh_vec, flimit_vec, limit_vec;
   2296   unsigned char thresh, flimit, limit, flimit_temp;
   2297 
    2298   /* use direct values instead of pointers */
   2299   limit = *(lfi->lim);
   2300   flimit_temp = *(lfi->mblim);
   2301   thresh = *(lfi->hev_thr);
   2302   flimit = flimit_temp;
   2303 
   2304   /* create quad-byte */
   2305   __asm__ __volatile__(
   2306       "replv.qb       %[thresh_vec], %[thresh]    \n\t"
   2307       "replv.qb       %[flimit_vec], %[flimit]    \n\t"
   2308       "replv.qb       %[limit_vec],  %[limit]     \n\t"
   2309       : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
   2310         [limit_vec] "=r"(limit_vec)
   2311       : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
   2312 
   2313   vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec,
   2314                                        thresh_vec, 16);
   2315 
   2316   if (u_ptr)
   2317     vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec,
   2318                                            limit_vec, thresh_vec, 0);
   2319 
   2320   if (v_ptr)
   2321     vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec,
   2322                                            limit_vec, thresh_vec, 0);
   2323 }
   2324 
   2325 /* Horizontal B Filtering */
   2326 void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
   2327                               unsigned char *v_ptr, int y_stride, int uv_stride,
   2328                               loop_filter_info *lfi) {
   2329   unsigned int thresh_vec, flimit_vec, limit_vec;
   2330   unsigned char thresh, flimit, limit, flimit_temp;
   2331 
    2332   /* use direct values instead of pointers */
   2333   limit = *(lfi->lim);
   2334   flimit_temp = *(lfi->blim);
   2335   thresh = *(lfi->hev_thr);
   2336   flimit = flimit_temp;
   2337 
   2338   /* create quad-byte */
   2339   __asm__ __volatile__(
   2340       "replv.qb       %[thresh_vec], %[thresh]    \n\t"
   2341       "replv.qb       %[flimit_vec], %[flimit]    \n\t"
   2342       "replv.qb       %[limit_vec],  %[limit]     \n\t"
   2343       : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
   2344         [limit_vec] "=r"(limit_vec)
   2345       : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
   2346 
   2347   vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride,
   2348                                        flimit_vec, limit_vec, thresh_vec, 16);
   2349   vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride,
   2350                                        flimit_vec, limit_vec, thresh_vec, 16);
   2351   vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride,
   2352                                        flimit_vec, limit_vec, thresh_vec, 16);
   2353 
   2354   if (u_ptr)
   2355     vp8_loop_filter_uvhorizontal_edge_mips(
   2356         u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
   2357 
   2358   if (v_ptr)
   2359     vp8_loop_filter_uvhorizontal_edge_mips(
   2360         v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
   2361 }
   2362 
   2363 /* Vertical B Filtering */
   2364 void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr,
   2365                               unsigned char *v_ptr, int y_stride, int uv_stride,
   2366                               loop_filter_info *lfi) {
   2367   unsigned int thresh_vec, flimit_vec, limit_vec;
   2368   unsigned char thresh, flimit, limit, flimit_temp;
   2369 
    2370   /* use direct values instead of pointers */
   2371   limit = *(lfi->lim);
   2372   flimit_temp = *(lfi->blim);
   2373   thresh = *(lfi->hev_thr);
   2374   flimit = flimit_temp;
   2375 
   2376   /* create quad-byte */
   2377   __asm__ __volatile__(
   2378       "replv.qb       %[thresh_vec], %[thresh]    \n\t"
   2379       "replv.qb       %[flimit_vec], %[flimit]    \n\t"
   2380       "replv.qb       %[limit_vec],  %[limit]     \n\t"
   2381       : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
   2382         [limit_vec] "=r"(limit_vec)
   2383       : [thresh] "r"(thresh), [flimit] "r"(flimit), [limit] "r"(limit));
   2384 
   2385   vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec,
   2386                                      thresh_vec, 16);
   2387   vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec,
   2388                                      thresh_vec, 16);
   2389   vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec,
   2390                                      limit_vec, thresh_vec, 16);
   2391 
   2392   if (u_ptr)
   2393     vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec,
   2394                                          limit_vec, thresh_vec, 0);
   2395 
   2396   if (v_ptr)
   2397     vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec,
   2398                                          limit_vec, thresh_vec, 0);
   2399 }
   2400 
   2401 #endif
   2402