      1 /*
      2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 #include <stdlib.h>
     13 #include "vp8_rtcd.h"
     14 #include "vp8/common/onyxc_int.h"
     15 
     16 #if HAVE_DSPR2
     17 typedef unsigned char uc;
     18 
     19 /* prefetch data for load */
     20 inline void prefetch_load_lf(unsigned char *src)
     21 {
     22     __asm__ __volatile__ (
     23         "pref   0,  0(%[src])   \n\t"
     24         :
     25         : [src] "r" (src)
     26     );
     27 }
     28 
     29 
     30 /* prefetch data for store */
     31 inline void prefetch_store_lf(unsigned char *dst)
     32 {
     33     __asm__ __volatile__ (
     34         "pref   1,  0(%[dst])   \n\t"
     35         :
     36         : [dst] "r" (dst)
     37     );
     38 }
     39 
     40 /* processing 4 pixels at the same time
     41  * compute hev and mask in the same function
     42  */
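/* For reference, the per-byte-lane logic implemented below is roughly the
 * following scalar sketch (illustrative only; abs_diff() is local to this
 * comment and not part of the build).  A "set" output lane is 0xFF:
 *
 *     int abs_diff(int a, int b) { return a > b ? a - b : b - a; }
 *
 *     need_filter = abs_diff(p3, p2) <= limit && abs_diff(p2, p1) <= limit &&
 *                   abs_diff(p1, p0) <= limit && abs_diff(q1, q0) <= limit &&
 *                   abs_diff(q2, q1) <= limit && abs_diff(q3, q2) <= limit &&
 *                   abs_diff(p0, q0) * 2 + abs_diff(p1, q1) / 2 <= flimit;
 *     *mask = need_filter ? 0xFF : 0x00;
 *     *hev  = (abs_diff(p1, p0) > thresh || abs_diff(q1, q0) > thresh) ? 0xFF : 0;
 *
 * flimit is assumed to arrive pre-combined as the edge limit; compare the
 * "flimit * 2 + limit" comment inside the second asm block.
 */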
     43 static __inline void vp8_filter_mask_vec_mips
     44 (
     45     uint32_t limit,
     46     uint32_t flimit,
     47     uint32_t p1,
     48     uint32_t p0,
     49     uint32_t p3,
     50     uint32_t p2,
     51     uint32_t q0,
     52     uint32_t q1,
     53     uint32_t q2,
     54     uint32_t q3,
     55     uint32_t thresh,
     56     uint32_t *hev,
     57     uint32_t *mask
     58 )
     59 {
     60     uint32_t c, r, r3, r_k;
     61     uint32_t s1, s2, s3;
     62     uint32_t ones = 0xFFFFFFFF;
     63     uint32_t hev1;
     64 
     65     __asm__ __volatile__ (
     66         /* mask |= (abs(p3 - p2) > limit) */
     67         "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
     68         "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
     69         "or             %[r_k], %[r_k],    %[c]         \n\t"
     70         "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
     71         "or             %[r],   $0,        %[c]         \n\t"
     72 
     73         /* mask |= (abs(p2 - p1) > limit) */
     74         "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
     75         "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
     76         "or             %[r_k], %[r_k],    %[c]         \n\t"
     77         "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
     78         "or             %[r],   %[r],      %[c]         \n\t"
     79 
     80         /* mask |= (abs(p1 - p0) > limit)
     81          * hev  |= (abs(p1 - p0) > thresh)
     82          */
     83         "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
     84         "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
     85         "or             %[r_k], %[r_k],    %[c]         \n\t"
     86         "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
     87         "or             %[r3],  $0,        %[c]         \n\t"
     88         "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
     89         "or             %[r],   %[r],      %[c]         \n\t"
     90 
     91         /* mask |= (abs(q1 - q0) > limit)
     92          * hev  |= (abs(q1 - q0) > thresh)
     93          */
     94         "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
     95         "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
     96         "or             %[r_k], %[r_k],    %[c]         \n\t"
     97         "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
     98         "or             %[r3],  %[r3],     %[c]         \n\t"
     99         "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
    100         "or             %[r],   %[r],      %[c]         \n\t"
    101 
    102         /* mask |= (abs(q2 - q1) > limit) */
    103         "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
    104         "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
    105         "or             %[r_k], %[r_k],    %[c]         \n\t"
    106         "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
    107         "or             %[r],   %[r],      %[c]         \n\t"
    108         "sll            %[r3],    %[r3],    24          \n\t"
    109 
    110         /* mask |= (abs(q3 - q2) > limit) */
    111         "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
    112         "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
    113         "or             %[r_k], %[r_k],    %[c]         \n\t"
    114         "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
    115         "or             %[r],   %[r],      %[c]         \n\t"
    116 
    117         : [c] "=&r" (c), [r_k] "=&r" (r_k),
    118           [r] "=&r" (r), [r3] "=&r" (r3)
    119         : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
    120           [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
    121           [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
    122     );
    123 
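    /* In the block below the accumulated per-byte comparison bits (r3 for hev,
     * r for mask) are moved into the DSPControl condition-code (ccond) field
     * via sll/wrdsp; pick.qb then widens each condition bit into a full
     * 0x00/0xFF byte.  Note the operand order: hev becomes 0xFF where the
     * threshold was exceeded, while mask becomes 0xFF where all limits were
     * respected, i.e. where filtering is allowed.
     */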
    124     __asm__ __volatile__ (
    125         /* abs(p0 - q0) */
    126         "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
    127         "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
    128         "wrdsp          %[r3]                           \n\t"
    129         "or             %[s1],  %[r_k],    %[c]         \n\t"
    130 
    131         /* abs(p1 - q1) */
    132         "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
    133         "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
    134         "pick.qb        %[hev1], %[ones],  $0           \n\t"
    135         "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
    136         "or             %[s2],   %[r_k],   %[c]         \n\t"
    137 
    138         /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > flimit * 2 + limit */
    139         "shrl.qb        %[s2],   %[s2],     1           \n\t"
    140         "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
    141         "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
    142         "or             %[r],    %[r],      %[c]        \n\t"
    143         "sll            %[r],    %[r],      24          \n\t"
    144 
    145         "wrdsp          %[r]                            \n\t"
    146         "pick.qb        %[s2],  $0,         %[ones]     \n\t"
    147 
    148         : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
    149           [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
    150         : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
    151           [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
    152     );
    153 
    154     *hev = hev1;
    155     *mask = s2;
    156 }
    157 
    158 
    159 /* inputs & outputs are quad-byte vectors */
    160 static __inline void vp8_filter_mips
    161 (
    162     uint32_t mask,
    163     uint32_t hev,
    164     uint32_t *ps1,
    165     uint32_t *ps0,
    166     uint32_t *qs0,
    167     uint32_t *qs1
    168 )
    169 {
    170     int32_t vp8_filter_l, vp8_filter_r;
    171     int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
    172     int32_t subr_r, subr_l;
    173     uint32_t t1, t2, HWM, t3;
    174     uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
    175 
    176     int32_t vps1, vps0, vqs0, vqs1;
    177     int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
    178     uint32_t N128;
    179 
    180     N128 = 0x80808080;
    181     t1  = 0x03000300;
    182     t2  = 0x04000400;
    183     t3  = 0x01000100;
    184     HWM = 0xFF00FF00;
    185 
    186     vps0 = (*ps0) ^ N128;
    187     vps1 = (*ps1) ^ N128;
    188     vqs0 = (*qs0) ^ N128;
    189     vqs1 = (*qs1) ^ N128;
    190 
     191     /* use halfword pairs instead of quad-bytes for better accuracy */
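    /* Illustrative example: a packed vector 0xAABBCCDD splits into the "left"
     * pair 0xAA00CC00 (bytes 3 and 1) and, after the shift and mask, the
     * "right" pair 0xBB00DD00 (bytes 2 and 0).  Each signed byte now sits in
     * the top of a 16-bit lane, so the saturating halfword instructions below
     * keep extra intermediate precision.
     */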
    192     vps0_l = vps0 & HWM;
    193     vps0_r = vps0 << 8;
    194     vps0_r = vps0_r & HWM;
    195 
    196     vps1_l = vps1 & HWM;
    197     vps1_r = vps1 << 8;
    198     vps1_r = vps1_r & HWM;
    199 
    200     vqs0_l = vqs0 & HWM;
    201     vqs0_r = vqs0 << 8;
    202     vqs0_r = vqs0_r & HWM;
    203 
    204     vqs1_l = vqs1 & HWM;
    205     vqs1_r = vqs1 << 8;
    206     vqs1_r = vqs1_r & HWM;
    207 
    208     mask_l = mask & HWM;
    209     mask_r = mask << 8;
    210     mask_r = mask_r & HWM;
    211 
    212     hev_l = hev & HWM;
    213     hev_r = hev << 8;
    214     hev_r = hev_r & HWM;
    215 
    216     __asm__ __volatile__ (
    217         /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
    218         "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
    219         "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
    220 
    221         /* qs0 - ps0 */
    222         "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
    223         "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
    224 
    225         /* vp8_filter &= hev; */
    226         "and          %[vp8_filter_l], %[vp8_filter_l], %[hev_l]        \n\t"
    227         "and          %[vp8_filter_r], %[vp8_filter_r], %[hev_r]        \n\t"
    228 
    229         /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
    230         "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
    231         "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
    232         "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
    233         "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
    234         "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
    235         "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
    236         "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
    237         "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
    238 
    239         /* vp8_filter &= mask; */
    240         "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
    241         "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
    242 
    243         : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=&r" (vp8_filter_r),
    244           [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
    245           [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
    246 
    247         : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
    248           [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
    249           [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
    250           [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
    251           [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
    252           [HWM] "r" (HWM)
    253     );
    254 
    255     /* save bottom 3 bits so that we round one side +4 and the other +3 */
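    /* Scalar reference for the two rounding paths (illustrative; clamp()
     * abbreviates vp8_signed_char_clamp()):
     *
     *     Filter1 = clamp(vp8_filter + 4) >> 3;   qs0 = clamp(qs0 - Filter1);
     *     Filter2 = clamp(vp8_filter + 3) >> 3;   ps0 = clamp(ps0 + Filter2);
     *
     * and, in the following asm block, the outer taps:
     *
     *     u = (Filter1 + 1) >> 1;   u &= ~hev;
     *     ps1 = clamp(ps1 + u);     qs1 = clamp(qs1 - u);
     */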
    256     __asm__ __volatile__ (
     257         /* Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >> 3; */
    258         "addq_s.ph    %[Filter1_l],    %[vp8_filter_l], %[t2]           \n\t"
    259         "addq_s.ph    %[Filter1_r],    %[vp8_filter_r], %[t2]           \n\t"
    260 
     261         /* Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >> 3; */
    262         "addq_s.ph    %[Filter2_l],    %[vp8_filter_l], %[t1]           \n\t"
    263         "addq_s.ph    %[Filter2_r],    %[vp8_filter_r], %[t1]           \n\t"
    264         "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
    265         "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
    266 
    267         "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
    268         "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
    269 
    270         "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
    271         "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
    272 
    273         /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
    274         "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
    275         "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
    276 
    277         /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
    278         "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
    279         "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
    280 
    281         : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
    282           [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
    283           [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
    284           [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
    285 
    286         : [t1] "r" (t1), [t2] "r" (t2),
    287           [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
    288           [HWM] "r" (HWM)
    289     );
    290 
    291     __asm__ __volatile__ (
    292         /* (vp8_filter += 1) >>= 1 */
    293         "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
    294         "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
    295 
    296         /* vp8_filter &= ~hev; */
    297         "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
    298         "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
    299 
    300         /* vps1 = vp8_signed_char_clamp(ps1 + vp8_filter); */
    301         "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
    302         "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
    303 
    304         /* vqs1 = vp8_signed_char_clamp(qs1 - vp8_filter); */
    305         "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
    306         "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
    307 
    308         : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
    309           [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
    310           [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
    311 
    312         : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
    313     );
    314 
    315     /* Create quad-bytes from halfword pairs */
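    /* Continuing the earlier layout example (illustrative): the "left" pair
     * keeps its high bytes (0xAA00CC00), the "right" pair is shifted back down
     * by 8 (0xBB00DD00 -> 0x00BB00DD), and OR-ing the two restores a packed
     * quad-byte vector (0xAABBCCDD) before the sign-bit flip below.
     */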
    316     vqs0_l = vqs0_l & HWM;
    317     vqs1_l = vqs1_l & HWM;
    318     vps0_l = vps0_l & HWM;
    319     vps1_l = vps1_l & HWM;
    320 
    321     __asm__ __volatile__ (
    322         "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
    323         "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
    324         "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
    325         "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
    326 
    327         : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
    328           [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
    329         :
    330     );
    331 
    332     vqs0 = vqs0_l | vqs0_r;
    333     vqs1 = vqs1_l | vqs1_r;
    334     vps0 = vps0_l | vps0_r;
    335     vps1 = vps1_l | vps1_r;
    336 
    337     *ps0 = vps0 ^ N128;
    338     *ps1 = vps1 ^ N128;
    339     *qs0 = vqs0 ^ N128;
    340     *qs1 = vqs1 ^ N128;
    341 }
    342 
    343 void vp8_loop_filter_horizontal_edge_mips
    344 (
    345     unsigned char *s,
    346     int p,
    347     unsigned int flimit,
    348     unsigned int limit,
    349     unsigned int thresh,
    350     int count
    351 )
    352 {
    353     uint32_t mask;
    354     uint32_t hev;
    355     uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    356     unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
    357 
    358     mask = 0;
    359     hev = 0;
    360     p1 = 0;
    361     p2 = 0;
    362     p3 = 0;
    363     p4 = 0;
    364 
    365     /* prefetch data for store */
    366     prefetch_store_lf(s);
    367 
    368     /* loop filter designed to work using chars so that we can make maximum use
    369      * of 8 bit simd instructions.
    370      */
    371 
    372     sm1 = s - (p << 2);
    373     s0 = s - p - p - p;
    374     s1 = s - p - p ;
    375     s2 = s - p;
    376     s3 = s;
    377     s4 = s + p;
    378     s5 = s + p + p;
    379     s6 = s + p + p + p;
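    /* sm1..s6 now point at rows -4..+3 relative to the horizontal edge, which
     * lies between s2 (row -1) and s3 (row 0); p1..p4 below hold the two rows
     * on either side of that edge.  The body that follows is unrolled four
     * times, stepping 4 pixels to the right each time, to cover the
     * 16-pixel-wide macroblock edge.
     */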
    380 
    381     /* load quad-byte vectors
    382      * memory is 4 byte aligned
    383      */
    384     p1 = *((uint32_t *)(s1));
    385     p2 = *((uint32_t *)(s2));
    386     p3 = *((uint32_t *)(s3));
    387     p4 = *((uint32_t *)(s4));
    388 
    389     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     390      * the filter cannot change any pixel, so filtering is not needed
    391      */
    392     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    393     {
    394 
    395         pm1 = *((uint32_t *)(sm1));
    396         p0  = *((uint32_t *)(s0));
    397         p5  = *((uint32_t *)(s5));
    398         p6  = *((uint32_t *)(s6));
    399 
    400         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    401                                  thresh, &hev, &mask);
    402 
     403         /* if mask == 0, filtering is not needed */
    404         if (mask)
    405         {
    406             /* filtering */
    407             vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    408 
    409             /* unpack processed 4x4 neighborhood */
    410             *((uint32_t *)s1) = p1;
    411             *((uint32_t *)s2) = p2;
    412             *((uint32_t *)s3) = p3;
    413             *((uint32_t *)s4) = p4;
    414         }
    415     }
    416 
    417     sm1 += 4;
    418     s0  += 4;
    419     s1  += 4;
    420     s2  += 4;
    421     s3  += 4;
    422     s4  += 4;
    423     s5  += 4;
    424     s6  += 4;
    425 
    426     /* load quad-byte vectors
    427      * memory is 4 byte aligned
    428      */
    429     p1 = *((uint32_t *)(s1));
    430     p2 = *((uint32_t *)(s2));
    431     p3 = *((uint32_t *)(s3));
    432     p4 = *((uint32_t *)(s4));
    433 
    434     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     435      * the filter cannot change any pixel, so filtering is not needed
    436      */
    437     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    438     {
    439 
    440         pm1 = *((uint32_t *)(sm1));
    441         p0  = *((uint32_t *)(s0));
    442         p5  = *((uint32_t *)(s5));
    443         p6  = *((uint32_t *)(s6));
    444 
    445         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    446                                  thresh, &hev, &mask);
    447 
     448         /* if mask == 0, filtering is not needed */
    449         if (mask)
    450         {
    451             /* filtering */
    452             vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    453 
    454             /* unpack processed 4x4 neighborhood */
    455             *((uint32_t *)s1) = p1;
    456             *((uint32_t *)s2) = p2;
    457             *((uint32_t *)s3) = p3;
    458             *((uint32_t *)s4) = p4;
    459         }
    460     }
    461 
    462     sm1 += 4;
    463     s0  += 4;
    464     s1  += 4;
    465     s2  += 4;
    466     s3  += 4;
    467     s4  += 4;
    468     s5  += 4;
    469     s6  += 4;
    470 
    471     /* load quad-byte vectors
    472      * memory is 4 byte aligned
    473      */
    474     p1 = *((uint32_t *)(s1));
    475     p2 = *((uint32_t *)(s2));
    476     p3 = *((uint32_t *)(s3));
    477     p4 = *((uint32_t *)(s4));
    478 
    479     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     480      * the filter cannot change any pixel, so filtering is not needed
    481      */
    482     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    483     {
    484 
    485         pm1 = *((uint32_t *)(sm1));
    486         p0  = *((uint32_t *)(s0));
    487         p5  = *((uint32_t *)(s5));
    488         p6  = *((uint32_t *)(s6));
    489 
    490         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    491                                  thresh, &hev, &mask);
    492 
     493         /* if mask == 0, filtering is not needed */
    494         if (mask)
    495         {
    496             /* filtering */
    497             vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    498 
    499             /* unpack processed 4x4 neighborhood */
    500             *((uint32_t *)s1) = p1;
    501             *((uint32_t *)s2) = p2;
    502             *((uint32_t *)s3) = p3;
    503             *((uint32_t *)s4) = p4;
    504         }
    505     }
    506 
    507     sm1 += 4;
    508     s0  += 4;
    509     s1  += 4;
    510     s2  += 4;
    511     s3  += 4;
    512     s4  += 4;
    513     s5  += 4;
    514     s6  += 4;
    515 
    516     /* load quad-byte vectors
    517      * memory is 4 byte aligned
    518      */
    519     p1 = *((uint32_t *)(s1));
    520     p2 = *((uint32_t *)(s2));
    521     p3 = *((uint32_t *)(s3));
    522     p4 = *((uint32_t *)(s4));
    523 
    524     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     525      * the filter cannot change any pixel, so filtering is not needed
    526      */
    527     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    528     {
    529 
    530         pm1 = *((uint32_t *)(sm1));
    531         p0  = *((uint32_t *)(s0));
    532         p5  = *((uint32_t *)(s5));
    533         p6  = *((uint32_t *)(s6));
    534 
    535         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    536                                  thresh, &hev, &mask);
    537 
     538         /* if mask == 0, filtering is not needed */
    539         if (mask)
    540         {
    541             /* filtering */
    542             vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    543 
    544             /* unpack processed 4x4 neighborhood */
    545             *((uint32_t *)s1) = p1;
    546             *((uint32_t *)s2) = p2;
    547             *((uint32_t *)s3) = p3;
    548             *((uint32_t *)s4) = p4;
    549         }
    550     }
    551 }
    552 
    553 void vp8_loop_filter_uvhorizontal_edge_mips
    554 (
    555     unsigned char *s,
    556     int p,
    557     unsigned int flimit,
    558     unsigned int limit,
    559     unsigned int thresh,
    560     int count
    561 )
    562 {
    563     uint32_t mask;
    564     uint32_t hev;
    565     uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    566     unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
    567 
    568     mask = 0;
    569     hev = 0;
    570     p1 = 0;
    571     p2 = 0;
    572     p3 = 0;
    573     p4 = 0;
    574 
    575     /* loop filter designed to work using chars so that we can make maximum use
    576      * of 8 bit simd instructions.
    577      */
    578 
    579     sm1 = s - (p << 2);
    580     s0  = s - p - p - p;
    581     s1  = s - p - p ;
    582     s2  = s - p;
    583     s3  = s;
    584     s4  = s + p;
    585     s5  = s + p + p;
    586     s6  = s + p + p + p;
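    /* Same row layout as vp8_loop_filter_horizontal_edge_mips above, but the
     * body is unrolled only twice (2 x 4 pixels) for the 8-pixel-wide chroma
     * edge.
     */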
    587 
    588     /* load quad-byte vectors
    589      * memory is 4 byte aligned
    590      */
    591     p1 = *((uint32_t *)(s1));
    592     p2 = *((uint32_t *)(s2));
    593     p3 = *((uint32_t *)(s3));
    594     p4 = *((uint32_t *)(s4));
    595 
    596     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     597      * the filter cannot change any pixel, so filtering is not needed
    598      */
    599     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    600     {
    601 
    602         pm1 = *((uint32_t *)(sm1));
    603         p0  = *((uint32_t *)(s0));
    604         p5  = *((uint32_t *)(s5));
    605         p6  = *((uint32_t *)(s6));
    606 
    607         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    608                                  thresh, &hev, &mask);
    609 
     610         /* if mask == 0, filtering is not needed */
    611         if (mask)
    612         {
    613             /* filtering */
    614             vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    615 
    616             /* unpack processed 4x4 neighborhood */
    617             *((uint32_t *)s1) = p1;
    618             *((uint32_t *)s2) = p2;
    619             *((uint32_t *)s3) = p3;
    620             *((uint32_t *)s4) = p4;
    621         }
    622     }
    623 
    624     sm1 += 4;
    625     s0  += 4;
    626     s1  += 4;
    627     s2  += 4;
    628     s3  += 4;
    629     s4  += 4;
    630     s5  += 4;
    631     s6  += 4;
    632 
    633     /* load quad-byte vectors
    634      * memory is 4 byte aligned
    635      */
    636     p1 = *((uint32_t *)(s1));
    637     p2 = *((uint32_t *)(s2));
    638     p3 = *((uint32_t *)(s3));
    639     p4 = *((uint32_t *)(s4));
    640 
    641     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     642      * the filter cannot change any pixel, so filtering is not needed
    643      */
    644     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    645     {
    646 
    647         pm1 = *((uint32_t *)(sm1));
    648         p0  = *((uint32_t *)(s0));
    649         p5  = *((uint32_t *)(s5));
    650         p6  = *((uint32_t *)(s6));
    651 
    652         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    653                                  thresh, &hev, &mask);
    654 
     655         /* if mask == 0, filtering is not needed */
    656         if (mask)
    657         {
    658             /* filtering */
    659             vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    660 
    661             /* unpack processed 4x4 neighborhood */
    662             *((uint32_t *)s1) = p1;
    663             *((uint32_t *)s2) = p2;
    664             *((uint32_t *)s3) = p3;
    665             *((uint32_t *)s4) = p4;
    666         }
    667     }
    668 }
    669 
    670 void vp8_loop_filter_vertical_edge_mips
    671 (
    672     unsigned char *s,
    673     int p,
    674     const unsigned int flimit,
    675     const unsigned int limit,
    676     const unsigned int thresh,
    677     int count
    678 )
    679 {
    680     int i;
    681     uint32_t mask, hev;
    682     uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
    683     unsigned char *s1, *s2, *s3, *s4;
    684     uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
    685 
    686     hev = 0;
    687     mask = 0;
    688     i = 0;
    689     pm1 = 0;
    690     p0 = 0;
    691     p1 = 0;
    692     p2 = 0;
    693     p3 = 0;
    694     p4 = 0;
    695     p5 = 0;
    696     p6 = 0;
    697 
    698     /* loop filter designed to work using chars so that we can make maximum use
    699      * of 8 bit simd instructions.
    700      */
    701 
     702     /* apply filter on 4 pixels at the same time */
    703     do
    704     {
    705 
    706         /* prefetch data for store */
    707         prefetch_store_lf(s + p);
    708 
    709         s1 = s;
    710         s2 = s + p;
    711         s3 = s2 + p;
    712         s4 = s3 + p;
    713         s  = s4 + p;
    714 
    715         /* load quad-byte vectors
    716          * memory is 4 byte aligned
    717          */
    718         p2  = *((uint32_t *)(s1 - 4));
    719         p6  = *((uint32_t *)(s1));
    720         p1  = *((uint32_t *)(s2 - 4));
    721         p5  = *((uint32_t *)(s2));
    722         p0  = *((uint32_t *)(s3 - 4));
    723         p4  = *((uint32_t *)(s3));
    724         pm1 = *((uint32_t *)(s4 - 4));
    725         p3  = *((uint32_t *)(s4));
    726 
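        /* The two asm blocks below transpose the 4x4 byte tiles on each side
         * of the vertical edge so that pm1..p6 end up holding columns -4..+3
         * (4 vertically adjacent pixels each), which lets the same quad-byte
         * mask/filter code used for horizontal edges be reused here.
         */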
    727         /* transpose pm1, p0, p1, p2 */
    728         __asm__ __volatile__ (
    729             "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
    730             "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
    731             "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
    732             "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
    733 
    734             "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
    735             "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
    736             "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
    737             "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
    738 
    739             "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
    740             "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
    741             "append         %[p1],      %[sec3],    16          \n\t"
    742             "append         %[pm1],     %[sec4],    16          \n\t"
    743 
    744             : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
    745               [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
    746               [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
    747               [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
    748             :
    749         );
    750 
    751         /* transpose p3, p4, p5, p6 */
    752         __asm__ __volatile__ (
    753             "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
    754             "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
    755             "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
    756             "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
    757 
    758             "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
    759             "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
    760             "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
    761             "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
    762 
    763             "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
    764             "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
    765             "append         %[p5],      %[sec3],    16          \n\t"
    766             "append         %[p3],      %[sec4],    16          \n\t"
    767 
    768             : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
    769               [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
    770               [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
    771               [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
    772             :
    773         );
    774 
    775         /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     776          * the filter cannot change any pixel, so filtering is not needed
    777          */
    778         if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    779         {
    780 
    781             vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    782                                      thresh, &hev, &mask);
    783 
     784             /* if mask == 0, filtering is not needed */
    785             if (mask)
    786             {
    787                 /* filtering */
    788                 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    789 
    790                 /* unpack processed 4x4 neighborhood
    791                  * don't use transpose on output data
    792                  * because memory isn't aligned
    793                  */
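                /* p1..p4 hold the filtered columns -2..+1 as packed bytes; the
                 * lowest byte of each belongs to row s4, so it is stored first
                 * and the registers are shifted right by 8 before moving on to
                 * rows s3, s2 and s1.
                 */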
    794                 __asm__ __volatile__ (
    795                     "sb         %[p4],  1(%[s4])    \n\t"
    796                     "sb         %[p3],  0(%[s4])    \n\t"
    797                     "sb         %[p2], -1(%[s4])    \n\t"
    798                     "sb         %[p1], -2(%[s4])    \n\t"
    799                     :
    800                     : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
    801                       [p2] "r" (p2), [p1] "r" (p1)
    802                 );
    803 
    804                 __asm__ __volatile__ (
    805                     "srl        %[p4], %[p4], 8     \n\t"
    806                     "srl        %[p3], %[p3], 8     \n\t"
    807                     "srl        %[p2], %[p2], 8     \n\t"
    808                     "srl        %[p1], %[p1], 8     \n\t"
    809                     : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
    810                     :
    811                 );
    812 
    813                 __asm__ __volatile__ (
    814                     "sb         %[p4],  1(%[s3])    \n\t"
    815                     "sb         %[p3],  0(%[s3])    \n\t"
    816                     "sb         %[p2], -1(%[s3])    \n\t"
    817                     "sb         %[p1], -2(%[s3])    \n\t"
    818                     : [p1] "+r" (p1)
    819                     : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
    820                 );
    821 
    822                 __asm__ __volatile__ (
    823                     "srl        %[p4], %[p4], 8     \n\t"
    824                     "srl        %[p3], %[p3], 8     \n\t"
    825                     "srl        %[p2], %[p2], 8     \n\t"
    826                     "srl        %[p1], %[p1], 8     \n\t"
    827                     : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
    828                     :
    829                 );
    830 
    831                 __asm__ __volatile__ (
    832                     "sb         %[p4],  1(%[s2])    \n\t"
    833                     "sb         %[p3],  0(%[s2])    \n\t"
    834                     "sb         %[p2], -1(%[s2])    \n\t"
    835                     "sb         %[p1], -2(%[s2])    \n\t"
    836                     :
    837                     : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
    838                       [p2] "r" (p2), [p1] "r" (p1)
    839                 );
    840 
    841                 __asm__ __volatile__ (
    842                     "srl        %[p4], %[p4], 8     \n\t"
    843                     "srl        %[p3], %[p3], 8     \n\t"
    844                     "srl        %[p2], %[p2], 8     \n\t"
    845                     "srl        %[p1], %[p1], 8     \n\t"
    846                     : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
    847                     :
    848                 );
    849 
    850                 __asm__ __volatile__ (
    851                     "sb         %[p4],  1(%[s1])    \n\t"
    852                     "sb         %[p3],  0(%[s1])    \n\t"
    853                     "sb         %[p2], -1(%[s1])    \n\t"
    854                     "sb         %[p1], -2(%[s1])    \n\t"
    855                     :
    856                     : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
    857                       [p2] "r" (p2), [p1] "r" (p1)
    858                 );
    859             }
    860         }
    861 
    862         s1 = s;
    863         s2 = s + p;
    864         s3 = s2 + p;
    865         s4 = s3 + p;
    866         s  = s4 + p;
    867 
    868         /* load quad-byte vectors
    869          * memory is 4 byte aligned
    870          */
    871         p2  = *((uint32_t *)(s1 - 4));
    872         p6  = *((uint32_t *)(s1));
    873         p1  = *((uint32_t *)(s2 - 4));
    874         p5  = *((uint32_t *)(s2));
    875         p0  = *((uint32_t *)(s3 - 4));
    876         p4  = *((uint32_t *)(s3));
    877         pm1 = *((uint32_t *)(s4 - 4));
    878         p3  = *((uint32_t *)(s4));
    879 
    880         /* transpose pm1, p0, p1, p2 */
    881         __asm__ __volatile__ (
    882             "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
    883             "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
    884             "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
    885             "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
    886 
    887             "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
    888             "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
    889             "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
    890             "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
    891 
    892             "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
    893             "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
    894             "append         %[p1],      %[sec3],    16          \n\t"
    895             "append         %[pm1],     %[sec4],    16          \n\t"
    896 
    897             : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
    898               [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
    899               [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
    900               [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
    901             :
    902         );
    903 
    904         /* transpose p3, p4, p5, p6 */
    905         __asm__ __volatile__ (
    906             "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
    907             "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
    908             "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
    909             "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
    910 
    911             "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
    912             "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
    913             "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
    914             "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
    915 
    916             "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
    917             "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
    918             "append         %[p5],      %[sec3],    16          \n\t"
    919             "append         %[p3],      %[sec4],    16          \n\t"
    920 
    921             : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
    922               [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
    923               [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
    924               [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
    925             :
    926         );
    927 
    928         /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     929          * the filter cannot change any pixel, so filtering is not needed
    930          */
    931         if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
    932         {
    933 
    934             vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
    935                                      thresh, &hev, &mask);
    936 
     937             /* if mask == 0, filtering is not needed */
    938             if (mask)
    939             {
    940                 /* filtering */
    941                 vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
    942 
    943                 /* unpack processed 4x4 neighborhood
    944                  * don't use transpose on output data
    945                  * because memory isn't aligned
    946                  */
    947                 __asm__ __volatile__ (
    948                     "sb         %[p4],  1(%[s4])    \n\t"
    949                     "sb         %[p3],  0(%[s4])    \n\t"
    950                     "sb         %[p2], -1(%[s4])    \n\t"
    951                     "sb         %[p1], -2(%[s4])    \n\t"
    952                     :
    953                     : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
    954                       [p2] "r" (p2), [p1] "r" (p1)
    955                 );
    956 
    957                 __asm__ __volatile__ (
    958                     "srl        %[p4], %[p4], 8     \n\t"
    959                     "srl        %[p3], %[p3], 8     \n\t"
    960                     "srl        %[p2], %[p2], 8     \n\t"
    961                     "srl        %[p1], %[p1], 8     \n\t"
    962                     : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
    963                     :
    964                 );
    965 
    966                 __asm__ __volatile__ (
    967                     "sb         %[p4],  1(%[s3])    \n\t"
    968                     "sb         %[p3],  0(%[s3])    \n\t"
    969                     "sb         %[p2], -1(%[s3])    \n\t"
    970                     "sb         %[p1], -2(%[s3])    \n\t"
    971                     : [p1] "+r" (p1)
    972                     : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
    973                 );
    974 
    975                 __asm__ __volatile__ (
    976                     "srl        %[p4], %[p4], 8     \n\t"
    977                     "srl        %[p3], %[p3], 8     \n\t"
    978                     "srl        %[p2], %[p2], 8     \n\t"
    979                     "srl        %[p1], %[p1], 8     \n\t"
    980                     : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
    981                     :
    982                 );
    983 
    984                 __asm__ __volatile__ (
    985                     "sb         %[p4],  1(%[s2])    \n\t"
    986                     "sb         %[p3],  0(%[s2])    \n\t"
    987                     "sb         %[p2], -1(%[s2])    \n\t"
    988                     "sb         %[p1], -2(%[s2])    \n\t"
    989                     :
    990                     : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
    991                       [p2] "r" (p2), [p1] "r" (p1)
    992                 );
    993 
    994                 __asm__ __volatile__ (
    995                     "srl        %[p4], %[p4], 8     \n\t"
    996                     "srl        %[p3], %[p3], 8     \n\t"
    997                     "srl        %[p2], %[p2], 8     \n\t"
    998                     "srl        %[p1], %[p1], 8     \n\t"
    999                     : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
   1000                     :
   1001                 );
   1002 
   1003                 __asm__ __volatile__ (
   1004                     "sb         %[p4],  1(%[s1])    \n\t"
   1005                     "sb         %[p3],  0(%[s1])    \n\t"
   1006                     "sb         %[p2], -1(%[s1])    \n\t"
   1007                     "sb         %[p1], -2(%[s1])    \n\t"
   1008                     :
   1009                     : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
   1010                       [p2] "r" (p2), [p1] "r" (p1)
   1011                 );
   1012             }
   1013         }
   1014 
   1015         i += 8;
   1016     }
   1017 
   1018     while (i < count);
   1019 }
   1020 
   1021 void vp8_loop_filter_uvvertical_edge_mips
   1022 (
   1023     unsigned char *s,
   1024     int p,
   1025     unsigned int flimit,
   1026     unsigned int limit,
   1027     unsigned int thresh,
   1028     int count
   1029 )
   1030 {
   1031     uint32_t mask, hev;
   1032     uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   1033     unsigned char *s1, *s2, *s3, *s4;
   1034     uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
   1035 
   1036     /* loop filter designed to work using chars so that we can make maximum use
   1037      * of 8 bit simd instructions.
   1038      */
   1039 
    1040     /* apply filter on 4 pixels at the same time */
   1041 
   1042     s1 = s;
   1043     s2 = s + p;
   1044     s3 = s2 + p;
   1045     s4 = s3 + p;
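    /* Chroma version: the 8 rows of the vertical edge are handled as two
     * straight-line 4-row groups below, rather than with the do/while loop
     * used for luma.
     */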
   1046 
   1047     /* load quad-byte vectors
   1048     * memory is 4 byte aligned
   1049     */
   1050     p2  = *((uint32_t *)(s1 - 4));
   1051     p6  = *((uint32_t *)(s1));
   1052     p1  = *((uint32_t *)(s2 - 4));
   1053     p5  = *((uint32_t *)(s2));
   1054     p0  = *((uint32_t *)(s3 - 4));
   1055     p4  = *((uint32_t *)(s3));
   1056     pm1 = *((uint32_t *)(s4 - 4));
   1057     p3  = *((uint32_t *)(s4));
   1058 
   1059     /* transpose pm1, p0, p1, p2 */
   1060     __asm__ __volatile__ (
   1061         "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
   1062         "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
   1063         "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
   1064         "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
   1065 
   1066         "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
   1067         "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
   1068         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1069         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1070 
   1071         "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
   1072         "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
   1073         "append         %[p1],      %[sec3],    16          \n\t"
   1074         "append         %[pm1],     %[sec4],    16          \n\t"
   1075 
   1076         : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
   1077           [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
   1078           [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
   1079           [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
   1080         :
   1081     );
   1082 
   1083     /* transpose p3, p4, p5, p6 */
   1084     __asm__ __volatile__ (
   1085         "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
   1086         "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
   1087         "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
   1088         "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
   1089 
   1090         "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
   1091         "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
   1092         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1093         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1094 
   1095         "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
   1096         "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
   1097         "append         %[p5],      %[sec3],    16          \n\t"
   1098         "append         %[p3],      %[sec4],    16          \n\t"
   1099 
   1100         : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
   1101           [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
   1102           [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   1103           [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
   1104         :
   1105     );
   1106 
   1107     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    1108     * the filter cannot change any pixel, so filtering is not needed
   1109     */
   1110     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
   1111     {
   1112 
   1113         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1114                                  thresh, &hev, &mask);
   1115 
    1116         /* if mask == 0, filtering is not needed */
   1117         if (mask)
   1118         {
   1119             /* filtering */
   1120             vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
   1121 
   1122             /* unpack processed 4x4 neighborhood
   1123              * don't use transpose on output data
   1124              * because memory isn't aligned
   1125              */
   1126             __asm__ __volatile__ (
   1127                 "sb         %[p4],  1(%[s4])    \n\t"
   1128                 "sb         %[p3],  0(%[s4])    \n\t"
   1129                 "sb         %[p2], -1(%[s4])    \n\t"
   1130                 "sb         %[p1], -2(%[s4])    \n\t"
   1131                 :
   1132                 : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
   1133                   [p2] "r" (p2), [p1] "r" (p1)
   1134             );
   1135 
   1136             __asm__ __volatile__ (
   1137                 "srl        %[p4], %[p4], 8     \n\t"
   1138                 "srl        %[p3], %[p3], 8     \n\t"
   1139                 "srl        %[p2], %[p2], 8     \n\t"
   1140                 "srl        %[p1], %[p1], 8     \n\t"
   1141                 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
   1142                 :
   1143             );
   1144 
   1145             __asm__ __volatile__ (
   1146                 "sb         %[p4],  1(%[s3])    \n\t"
   1147                 "sb         %[p3],  0(%[s3])    \n\t"
   1148                 "sb         %[p2], -1(%[s3])    \n\t"
   1149                 "sb         %[p1], -2(%[s3])    \n\t"
   1150                 : [p1] "+r" (p1)
   1151                 : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
   1152             );
   1153 
   1154             __asm__ __volatile__ (
   1155                 "srl        %[p4], %[p4], 8     \n\t"
   1156                 "srl        %[p3], %[p3], 8     \n\t"
   1157                 "srl        %[p2], %[p2], 8     \n\t"
   1158                 "srl        %[p1], %[p1], 8     \n\t"
   1159                 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
   1160                 :
   1161             );
   1162 
   1163             __asm__ __volatile__ (
   1164                 "sb         %[p4],  1(%[s2])    \n\t"
   1165                 "sb         %[p3],  0(%[s2])    \n\t"
   1166                 "sb         %[p2], -1(%[s2])    \n\t"
   1167                 "sb         %[p1], -2(%[s2])    \n\t"
   1168                 :
   1169                 : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
   1170                   [p2] "r" (p2), [p1] "r" (p1)
   1171             );
   1172 
   1173             __asm__ __volatile__ (
   1174                 "srl        %[p4], %[p4], 8     \n\t"
   1175                 "srl        %[p3], %[p3], 8     \n\t"
   1176                 "srl        %[p2], %[p2], 8     \n\t"
   1177                 "srl        %[p1], %[p1], 8     \n\t"
   1178                 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
   1179                 :
   1180             );
   1181 
   1182             __asm__ __volatile__ (
   1183                 "sb         %[p4],  1(%[s1])    \n\t"
   1184                 "sb         %[p3],  0(%[s1])    \n\t"
   1185                 "sb         %[p2], -1(%[s1])    \n\t"
   1186                 "sb         %[p1], -2(%[s1])    \n\t"
   1187                 :
   1188                 : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1), [p2] "r" (p2), [p1] "r" (p1)
   1189             );
   1190         }
   1191     }
   1192 
   1193     s1 = s4 + p;
   1194     s2 = s1 + p;
   1195     s3 = s2 + p;
   1196     s4 = s3 + p;
   1197 
   1198     /* load quad-byte vectors
   1199      * memory is 4 byte aligned
   1200      */
   1201     p2  = *((uint32_t *)(s1 - 4));
   1202     p6  = *((uint32_t *)(s1));
   1203     p1  = *((uint32_t *)(s2 - 4));
   1204     p5  = *((uint32_t *)(s2));
   1205     p0  = *((uint32_t *)(s3 - 4));
   1206     p4  = *((uint32_t *)(s3));
   1207     pm1 = *((uint32_t *)(s4 - 4));
   1208     p3  = *((uint32_t *)(s4));
   1209 
   1210     /* transpose pm1, p0, p1, p2 */
   1211     __asm__ __volatile__ (
   1212         "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
   1213         "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
   1214         "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
   1215         "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
   1216 
   1217         "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
   1218         "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
   1219         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1220         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1221 
   1222         "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
   1223         "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
   1224         "append         %[p1],      %[sec3],    16          \n\t"
   1225         "append         %[pm1],     %[sec4],    16          \n\t"
   1226 
   1227         : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
   1228           [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
   1229           [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
   1230           [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
   1231         :
   1232     );
   1233 
   1234     /* transpose p3, p4, p5, p6 */
   1235     __asm__ __volatile__ (
   1236         "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
   1237         "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
   1238         "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
   1239         "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
   1240 
   1241         "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
   1242         "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
   1243         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1244         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1245 
   1246         "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
   1247         "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
   1248         "append         %[p5],      %[sec3],    16          \n\t"
   1249         "append         %[p3],      %[sec4],    16          \n\t"
   1250 
   1251         : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
   1252           [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
   1253           [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   1254           [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
   1255         :
   1256     );
   1257 
   1258     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
    1259      * the filter cannot change any pixel, so filtering is not needed
   1260      */
   1261     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
   1262     {
   1263 
   1264         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1265                                  thresh, &hev, &mask);
   1266 
    1267         /* if mask == 0, filtering is not needed */
   1268         if (mask)
   1269         {
   1270             /* filtering */
   1271             vp8_filter_mips(mask, hev, &p1, &p2, &p3, &p4);
   1272 
   1273             /* unpack processed 4x4 neighborhood
   1274              * don't use transpose on output data
   1275              * because memory isn't aligned
   1276              */
   1277             __asm__ __volatile__ (
   1278                 "sb         %[p4],  1(%[s4])    \n\t"
   1279                 "sb         %[p3],  0(%[s4])    \n\t"
   1280                 "sb         %[p2], -1(%[s4])    \n\t"
   1281                 "sb         %[p1], -2(%[s4])    \n\t"
   1282                 :
   1283                 : [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
   1284                   [p2] "r" (p2), [p1] "r" (p1)
   1285             );
   1286 
   1287             __asm__ __volatile__ (
   1288                 "srl        %[p4], %[p4], 8     \n\t"
   1289                 "srl        %[p3], %[p3], 8     \n\t"
   1290                 "srl        %[p2], %[p2], 8     \n\t"
   1291                 "srl        %[p1], %[p1], 8     \n\t"
   1292                 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
   1293                 :
   1294             );
   1295 
   1296             __asm__ __volatile__ (
   1297                 "sb         %[p4],  1(%[s3])    \n\t"
   1298                 "sb         %[p3],  0(%[s3])    \n\t"
   1299                 "sb         %[p2], -1(%[s3])    \n\t"
   1300                 "sb         %[p1], -2(%[s3])    \n\t"
   1301                 : [p1] "+r" (p1)
   1302                 : [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3), [p2] "r" (p2)
   1303             );
   1304 
   1305             __asm__ __volatile__ (
   1306                 "srl        %[p4], %[p4], 8     \n\t"
   1307                 "srl        %[p3], %[p3], 8     \n\t"
   1308                 "srl        %[p2], %[p2], 8     \n\t"
   1309                 "srl        %[p1], %[p1], 8     \n\t"
   1310                 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
   1311                 :
   1312             );
   1313 
   1314             __asm__ __volatile__ (
   1315                 "sb         %[p4],  1(%[s2])    \n\t"
   1316                 "sb         %[p3],  0(%[s2])    \n\t"
   1317                 "sb         %[p2], -1(%[s2])    \n\t"
   1318                 "sb         %[p1], -2(%[s2])    \n\t"
   1319                 :
   1320                 : [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
   1321                   [p2] "r" (p2), [p1] "r" (p1)
   1322             );
   1323 
   1324             __asm__ __volatile__ (
   1325                 "srl        %[p4], %[p4], 8     \n\t"
   1326                 "srl        %[p3], %[p3], 8     \n\t"
   1327                 "srl        %[p2], %[p2], 8     \n\t"
   1328                 "srl        %[p1], %[p1], 8     \n\t"
   1329                 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
   1330                 :
   1331             );
   1332 
   1333             __asm__ __volatile__ (
   1334                 "sb         %[p4],  1(%[s1])    \n\t"
   1335                 "sb         %[p3],  0(%[s1])    \n\t"
   1336                 "sb         %[p2], -1(%[s1])    \n\t"
   1337                 "sb         %[p1], -2(%[s1])    \n\t"
   1338                 :
   1339                 : [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
   1340                   [p2] "r" (p2), [p1] "r" (p1)
   1341             );
   1342         }
   1343     }
   1344 }
   1345 
   1346 /* inputs & outputs are quad-byte vectors */
   1347 static __inline void vp8_mbfilter_mips
   1348 (
   1349     uint32_t mask,
   1350     uint32_t hev,
   1351     uint32_t *ps2,
   1352     uint32_t *ps1,
   1353     uint32_t *ps0,
   1354     uint32_t *qs0,
   1355     uint32_t *qs1,
   1356     uint32_t *qs2
   1357 )
   1358 {
   1359     int32_t vps2, vps1, vps0, vqs0, vqs1, vqs2;
   1360     int32_t vps2_l, vps1_l, vps0_l, vqs0_l, vqs1_l, vqs2_l;
   1361     int32_t vps2_r, vps1_r, vps0_r, vqs0_r, vqs1_r, vqs2_r;
   1362     uint32_t HWM, vp8_filter_l, vp8_filter_r, mask_l, mask_r, hev_l, hev_r, subr_r, subr_l;
   1363     uint32_t Filter2_l, Filter2_r, t1, t2, Filter1_l, Filter1_r, invhev_l, invhev_r;
   1364     uint32_t N128, R63;
   1365     uint32_t u1_l, u1_r, u2_l, u2_r, u3_l, u3_r;
   1366 
   1367     R63  = 0x003F003F;
   1368     HWM  = 0xFF00FF00;
   1369     N128 = 0x80808080;
   1370     t1   = 0x03000300;
   1371     t2   = 0x04000400;
   1372 
   1373     vps0 = (*ps0) ^ N128;
   1374     vps1 = (*ps1) ^ N128;
   1375     vps2 = (*ps2) ^ N128;
   1376     vqs0 = (*qs0) ^ N128;
   1377     vqs1 = (*qs1) ^ N128;
   1378     vqs2 = (*qs2) ^ N128;
   1379 
   1380     /* use halfword pairs instead of quad-bytes for accuracy */
   1381     vps0_l = vps0 & HWM;
   1382     vps0_r = vps0 << 8;
   1383     vps0_r = vps0_r & HWM;
   1384 
   1385     vqs0_l = vqs0 & HWM;
   1386     vqs0_r = vqs0 << 8;
   1387     vqs0_r = vqs0_r & HWM;
   1388 
   1389     vps1_l = vps1 & HWM;
   1390     vps1_r = vps1 << 8;
   1391     vps1_r = vps1_r & HWM;
   1392 
   1393     vqs1_l = vqs1 & HWM;
   1394     vqs1_r = vqs1 << 8;
   1395     vqs1_r = vqs1_r & HWM;
   1396 
   1397     vqs2_l = vqs2 & HWM;
   1398     vqs2_r = vqs2 << 8;
   1399     vqs2_r = vqs2_r & HWM;
   1400 
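            /* For example, with vps0 = 0xAABBCCDD and HWM = 0xFF00FF00 the split
             * above gives
             *     vps0_l = vps0 & HWM        = 0xAA00CC00
             *     vps0_r = (vps0 << 8) & HWM = 0xBB00DD00
             * so each pixel sits in the high byte of a signed halfword, leaving
             * the saturating .ph instructions below 8 bits of headroom for the
             * intermediate filter arithmetic.
             */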
   1401     __asm__ __volatile__ (
   1402         /* qs0 - ps0 */
   1403         "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
   1404         "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
   1405 
   1406         /* vp8_filter = vp8_signed_char_clamp(ps1 - qs1); */
   1407         "subq_s.ph    %[vp8_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
   1408         "subq_s.ph    %[vp8_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
   1409 
   1410         : [vp8_filter_l] "=&r" (vp8_filter_l), [vp8_filter_r] "=r" (vp8_filter_r),
   1411           [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r)
   1412         : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
   1413           [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
   1414           [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r)
   1415     );
   1416 
   1417     vps2_l = vps2 & HWM;
   1418     vps2_r = vps2 << 8;
   1419     vps2_r = vps2_r & HWM;
   1420 
   1421     /* add outer taps if we have high edge variance */
   1422     __asm__ __volatile__ (
   1423         /* vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); */
   1424         "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
   1425         "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
   1426         "and          %[mask_l],       %[HWM],          %[mask]         \n\t"
   1427         "sll          %[mask_r],       %[mask],         8               \n\t"
   1428         "and          %[mask_r],       %[HWM],          %[mask_r]       \n\t"
   1429         "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
   1430         "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
   1431         "and          %[hev_l],        %[HWM],          %[hev]          \n\t"
   1432         "sll          %[hev_r],        %[hev],          8               \n\t"
   1433         "and          %[hev_r],        %[HWM],          %[hev_r]        \n\t"
   1434         "addq_s.ph    %[vp8_filter_l], %[vp8_filter_l], %[subr_l]       \n\t"
   1435         "addq_s.ph    %[vp8_filter_r], %[vp8_filter_r], %[subr_r]       \n\t"
   1436 
   1437         /* vp8_filter &= mask; */
   1438         "and          %[vp8_filter_l], %[vp8_filter_l], %[mask_l]       \n\t"
   1439         "and          %[vp8_filter_r], %[vp8_filter_r], %[mask_r]       \n\t"
   1440 
   1441         /* Filter2 = vp8_filter & hev; */
   1442         "and          %[Filter2_l],    %[vp8_filter_l], %[hev_l]        \n\t"
   1443         "and          %[Filter2_r],    %[vp8_filter_r], %[hev_r]        \n\t"
   1444 
   1445         : [vp8_filter_l] "+r" (vp8_filter_l), [vp8_filter_r] "+r" (vp8_filter_r),
   1446           [hev_l] "=&r" (hev_l), [hev_r] "=&r" (hev_r),
   1447           [mask_l] "=&r" (mask_l), [mask_r] "=&r" (mask_r),
   1448           [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
   1449         : [subr_l] "r" (subr_l), [subr_r] "r" (subr_r),
   1450           [HWM] "r" (HWM), [hev]  "r" (hev), [mask] "r" (mask)
   1451     );
   1452 
   1453     /* save bottom 3 bits so that we round one side +4 and the other +3 */
   1454     __asm__ __volatile__ (
   1455         /* Filter1 = vp8_signed_char_clamp(Filter2 + 4) >> 3; */
   1456         "addq_s.ph    %[Filter1_l],    %[Filter2_l],    %[t2]           \n\t"
   1457         "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
   1458         "addq_s.ph    %[Filter1_r],    %[Filter2_r],    %[t2]           \n\t"
   1459 
   1460         /* Filter2 = vp8_signed_char_clamp(Filter2 + 3) >> 3; */
   1461         "addq_s.ph    %[Filter2_l],    %[Filter2_l],    %[t1]           \n\t"
   1462         "addq_s.ph    %[Filter2_r],    %[Filter2_r],    %[t1]           \n\t"
   1463 
   1464         "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
   1465         "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
   1466 
   1467         "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
   1468         "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
   1469         "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
   1470         "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
   1471         "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
   1472 
   1473         /* qs0 = vp8_signed_char_clamp(qs0 - Filter1); */
   1474         "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
   1475         "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
   1476 
   1477         /* ps0 = vp8_signed_char_clamp(ps0 + Filter2); */
   1478         "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
   1479         "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
   1480 
   1481         : [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r),
   1482           [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
   1483           [Filter2_l] "+r" (Filter2_l), [Filter2_r] "+r" (Filter2_r),
   1484           [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
   1485           [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
   1486         : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
   1487           [hev_l] "r" (hev_l), [hev_r] "r" (hev_r)
   1488     );
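            /* Per-pixel sketch of the block above in plain C, with sc_clamp()
             * standing in for vp8_signed_char_clamp():
             *
             *     Filter1 = sc_clamp(Filter2 + 4) >> 3;
             *     Filter2 = sc_clamp(Filter2 + 3) >> 3;
             *     qs0     = sc_clamp(qs0 - Filter1);
             *     ps0     = sc_clamp(ps0 + Filter2);
             *
             * In the halfword form the pixel lives in the high byte, so t2/t1
             * (0x04000400/0x03000300) add +4/+3 in pixel units, shra.ph shifts
             * both halves right by 3, and the "and ... %[HWM]" on Filter1 clears
             * the fractional bits left below the pixel byte.
             */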
   1489 
   1490     /* only apply wider filter if not high edge variance */
   1491     __asm__ __volatile__ (
   1492         /* vp8_filter &= ~hev; */
   1493         "and          %[Filter2_l],    %[vp8_filter_l], %[invhev_l]     \n\t"
   1494         "and          %[Filter2_r],    %[vp8_filter_r], %[invhev_r]     \n\t"
   1495 
   1496         "shra.ph      %[Filter2_l],    %[Filter2_l],    8               \n\t"
   1497         "shra.ph      %[Filter2_r],    %[Filter2_r],    8               \n\t"
   1498 
   1499         : [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r)
   1500         : [vp8_filter_l] "r" (vp8_filter_l), [vp8_filter_r] "r" (vp8_filter_r),
   1501           [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
   1502     );
   1503 
   1504     /* roughly 3/7th difference across boundary */
   1505     __asm__ __volatile__ (
   1506         "shll.ph      %[u3_l],         %[Filter2_l],    3               \n\t"
   1507         "shll.ph      %[u3_r],         %[Filter2_r],    3               \n\t"
   1508 
   1509         "addq.ph      %[u3_l],         %[u3_l],         %[Filter2_l]    \n\t"
   1510         "addq.ph      %[u3_r],         %[u3_r],         %[Filter2_r]    \n\t"
   1511 
   1512         "shll.ph      %[u2_l],         %[u3_l],         1               \n\t"
   1513         "shll.ph      %[u2_r],         %[u3_r],         1               \n\t"
   1514 
   1515         "addq.ph      %[u1_l],         %[u3_l],         %[u2_l]         \n\t"
   1516         "addq.ph      %[u1_r],         %[u3_r],         %[u2_r]         \n\t"
   1517 
   1518         "addq.ph      %[u2_l],         %[u2_l],         %[R63]          \n\t"
   1519         "addq.ph      %[u2_r],         %[u2_r],         %[R63]          \n\t"
   1520 
   1521         "addq.ph      %[u3_l],         %[u3_l],         %[R63]          \n\t"
   1522         "addq.ph      %[u3_r],         %[u3_r],         %[R63]          \n\t"
   1523 
   1524         /* vp8_signed_char_clamp((63 + Filter2 * 27) >> 7)
   1525          * vp8_signed_char_clamp((63 + Filter2 * 18) >> 7)
   1526          */
   1527         "addq.ph      %[u1_l],         %[u1_l],         %[R63]          \n\t"
   1528         "addq.ph      %[u1_r],         %[u1_r],         %[R63]          \n\t"
   1529         "shra.ph      %[u1_l],         %[u1_l],         7               \n\t"
   1530         "shra.ph      %[u1_r],         %[u1_r],         7               \n\t"
   1531         "shra.ph      %[u2_l],         %[u2_l],         7               \n\t"
   1532         "shra.ph      %[u2_r],         %[u2_r],         7               \n\t"
   1533         "shll.ph      %[u1_l],         %[u1_l],         8               \n\t"
   1534         "shll.ph      %[u1_r],         %[u1_r],         8               \n\t"
   1535         "shll.ph      %[u2_l],         %[u2_l],         8               \n\t"
   1536         "shll.ph      %[u2_r],         %[u2_r],         8               \n\t"
   1537 
   1538         /* vqs0 = vp8_signed_char_clamp(qs0 - u); */
   1539         "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[u1_l]         \n\t"
   1540         "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[u1_r]         \n\t"
   1541 
   1542         /* vps0 = vp8_signed_char_clamp(ps0 + u); */
   1543         "addq_s.ph    %[vps0_l],       %[vps0_l],       %[u1_l]         \n\t"
   1544         "addq_s.ph    %[vps0_r],       %[vps0_r],       %[u1_r]         \n\t"
   1545 
   1546         : [u1_l] "=&r" (u1_l), [u1_r] "=&r" (u1_r), [u2_l] "=&r" (u2_l),
   1547           [u2_r] "=&r" (u2_r), [u3_l] "=&r" (u3_l), [u3_r] "=&r" (u3_r),
   1548           [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
   1549           [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
   1550         : [R63]  "r" (R63),
   1551           [Filter2_l] "r" (Filter2_l), [Filter2_r] "r" (Filter2_r)
   1552     );
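            /* Per-pixel sketch of the 27-tap applied above, in plain C (sc_clamp()
             * again stands in for vp8_signed_char_clamp()):
             *
             *     u   = sc_clamp((63 + Filter2 * 27) >> 7);
             *     qs0 = sc_clamp(qs0 - u);
             *     ps0 = sc_clamp(ps0 + u);
             *
             * The multiples are built from shifts and adds: u3 = (F << 3) + F = 9*F,
             * u2 = 2*u3 = 18*F and u1 = u3 + u2 = 27*F, so u1, u2 and u3 carry the
             * 27-, 18- and 9-tap adjustments for the q0/p0, q1/p1 and q2/p2 pairs;
             * each shll.ph by 8 moves a rounded result back into the high byte
             * where the pixel data lives.
             */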
   1553 
   1554     __asm__ __volatile__ (
   1555         /* vqs1 = vp8_signed_char_clamp(qs1 - u); */
   1556         "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[u2_l]         \n\t"
   1557         "addq_s.ph    %[vps1_l],       %[vps1_l],       %[u2_l]         \n\t"
   1558 
   1559         /* vps1 = vp8_signed_char_clamp(ps1 + u); */
   1560         "addq_s.ph    %[vps1_r],       %[vps1_r],       %[u2_r]         \n\t"
   1561         "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[u2_r]         \n\t"
   1562 
   1563         : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
   1564           [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
   1565         : [u2_l] "r" (u2_l), [u2_r] "r" (u2_r)
   1566     );
   1567 
   1568     /* roughly 1/7th difference across boundary */
   1569     __asm__ __volatile__ (
   1570         /* u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); */
   1571         "shra.ph      %[u3_l],         %[u3_l],         7               \n\t"
   1572         "shra.ph      %[u3_r],         %[u3_r],         7               \n\t"
   1573         "shll.ph      %[u3_l],         %[u3_l],         8               \n\t"
   1574         "shll.ph      %[u3_r],         %[u3_r],         8               \n\t"
   1575 
   1576         /* vqs2 = vp8_signed_char_clamp(qs2 - u); */
   1577         "subq_s.ph    %[vqs2_l],       %[vqs2_l],       %[u3_l]         \n\t"
   1578         "subq_s.ph    %[vqs2_r],       %[vqs2_r],       %[u3_r]         \n\t"
   1579 
   1580         /* vps2 = vp8_signed_char_clamp(ps2 + u); */
   1581         "addq_s.ph    %[vps2_l],       %[vps2_l],       %[u3_l]         \n\t"
   1582         "addq_s.ph    %[vps2_r],       %[vps2_r],       %[u3_r]         \n\t"
   1583 
   1584         : [u3_l] "+r" (u3_l), [u3_r] "+r" (u3_r), [vps2_l] "+r" (vps2_l),
   1585           [vps2_r] "+r" (vps2_r), [vqs2_l] "+r" (vqs2_l), [vqs2_r] "+r" (vqs2_r)
   1586         :
   1587     );
   1588 
   1589     /* Create quad-bytes from halfword pairs */
   1590     __asm__ __volatile__ (
   1591         "and          %[vqs0_l],       %[vqs0_l],       %[HWM]          \n\t"
   1592         "shrl.ph      %[vqs0_r],       %[vqs0_r],       8               \n\t"
   1593 
   1594         "and          %[vps0_l],       %[vps0_l],       %[HWM]          \n\t"
   1595         "shrl.ph      %[vps0_r],       %[vps0_r],       8               \n\t"
   1596 
   1597         "and          %[vqs1_l],       %[vqs1_l],       %[HWM]          \n\t"
   1598         "shrl.ph      %[vqs1_r],       %[vqs1_r],       8               \n\t"
   1599 
   1600         "and          %[vps1_l],       %[vps1_l],       %[HWM]          \n\t"
   1601         "shrl.ph      %[vps1_r],       %[vps1_r],       8               \n\t"
   1602 
   1603         "and          %[vqs2_l],       %[vqs2_l],       %[HWM]          \n\t"
   1604         "shrl.ph      %[vqs2_r],       %[vqs2_r],       8               \n\t"
   1605 
   1606         "and          %[vps2_l],       %[vps2_l],       %[HWM]          \n\t"
   1607         "shrl.ph      %[vps2_r],       %[vps2_r],       8               \n\t"
   1608 
   1609         "or           %[vqs0_r],       %[vqs0_l],       %[vqs0_r]       \n\t"
   1610         "or           %[vps0_r],       %[vps0_l],       %[vps0_r]       \n\t"
   1611         "or           %[vqs1_r],       %[vqs1_l],       %[vqs1_r]       \n\t"
   1612         "or           %[vps1_r],       %[vps1_l],       %[vps1_r]       \n\t"
   1613         "or           %[vqs2_r],       %[vqs2_l],       %[vqs2_r]       \n\t"
   1614         "or           %[vps2_r],       %[vps2_l],       %[vps2_r]       \n\t"
   1615 
   1616         : [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r), [vqs1_l] "+r" (vqs1_l),
   1617           [vqs1_r] "+r" (vqs1_r), [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
   1618           [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r), [vqs2_l] "+r" (vqs2_l),
   1619           [vqs2_r] "+r" (vqs2_r), [vps2_r] "+r" (vps2_r), [vps2_l] "+r" (vps2_l)
   1620         : [HWM] "r" (HWM)
   1621     );
   1622 
   1623     *ps0 = vps0_r ^ N128;
   1624     *ps1 = vps1_r ^ N128;
   1625     *ps2 = vps2_r ^ N128;
   1626     *qs0 = vqs0_r ^ N128;
   1627     *qs1 = vqs1_r ^ N128;
   1628     *qs2 = vqs2_r ^ N128;
   1629 }
   1630 
   1631 void vp8_mbloop_filter_horizontal_edge_mips
   1632 (
   1633     unsigned char *s,
   1634     int p,
   1635     unsigned int flimit,
   1636     unsigned int limit,
   1637     unsigned int thresh,
   1638     int count
   1639 )
   1640 {
   1641     int i;
   1642     uint32_t mask, hev;
   1643     uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   1644     unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
   1645 
   1646     mask = 0;
   1647     hev = 0;
   1648     i = 0;
   1649     p1 = 0;
   1650     p2 = 0;
   1651     p3 = 0;
   1652     p4 = 0;
   1653 
   1654     /* loop filter designed to work using chars so that we can make maximum use
   1655      * of 8 bit simd instructions.
   1656      */
   1657 
   1658     sm1 = s - (p << 2);
   1659     s0  = s - p - p - p;
   1660     s1  = s - p - p;
   1661     s2  = s - p;
   1662     s3  = s;
   1663     s4  = s + p;
   1664     s5  = s + p + p;
   1665     s6  = s + p + p + p;
   1666 
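            /* sm1..s2 are the four rows above the macroblock edge (s - 4*p .. s - p)
             * and s3..s6 the four rows below it (s .. s + 3*p), so the edge being
             * filtered lies between s2 and s3; the quick p1/p4 and p2/p3 test below
             * looks at the two rows on either side of that edge before any further
             * work is done.
             */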
   1667     /* prefetch data for load */
   1668     prefetch_load_lf(s + p);
   1669 
   1670     /* apply filter on 4 pixels at the same time */
   1671     do
   1672     {
   1673         /* load quad-byte vectors
   1674          * memory is 4 byte aligned
   1675          */
   1676         p1 = *((uint32_t *)(s1));
   1677         p2 = *((uint32_t *)(s2));
   1678         p3 = *((uint32_t *)(s3));
   1679         p4 = *((uint32_t *)(s4));
   1680 
   1681         /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   1682          * mask will be zero and filtering is not needed
   1683          */
   1684         if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
   1685         {
   1686 
   1687             pm1 = *((uint32_t *)(sm1));
   1688             p0  = *((uint32_t *)(s0));
   1689             p5  = *((uint32_t *)(s5));
   1690             p6  = *((uint32_t *)(s6));
   1691 
   1692             vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1693                                      thresh, &hev, &mask);
   1694 
   1695             /* if mask == 0, filtering is not needed */
   1696             if (mask)
   1697             {
   1698                 /* filtering */
   1699                 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   1700 
   1701                 /* unpack processed 4x4 neighborhood
   1702                  * memory is 4 byte aligned
   1703                  */
   1704                 *((uint32_t *)s0) = p0;
   1705                 *((uint32_t *)s1) = p1;
   1706                 *((uint32_t *)s2) = p2;
   1707                 *((uint32_t *)s3) = p3;
   1708                 *((uint32_t *)s4) = p4;
   1709                 *((uint32_t *)s5) = p5;
   1710             }
   1711         }
   1712 
   1713         sm1 += 4;
   1714         s0  += 4;
   1715         s1  += 4;
   1716         s2  += 4;
   1717         s3  += 4;
   1718         s4  += 4;
   1719         s5  += 4;
   1720         s6  += 4;
   1721 
   1722         /* load quad-byte vectors
   1723          * memory is 4 byte aligned
   1724          */
   1725         p1 = *((uint32_t *)(s1));
   1726         p2 = *((uint32_t *)(s2));
   1727         p3 = *((uint32_t *)(s3));
   1728         p4 = *((uint32_t *)(s4));
   1729 
   1730         /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   1731          * mask will be zero and filtering is not needed
   1732          */
   1733         if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
   1734         {
   1735 
   1736             pm1 = *((uint32_t *)(sm1));
   1737             p0  = *((uint32_t *)(s0));
   1738             p5  = *((uint32_t *)(s5));
   1739             p6  = *((uint32_t *)(s6));
   1740 
   1741             vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1742                                      thresh, &hev, &mask);
   1743 
   1744             /* if mask == 0, filtering is not needed */
   1745             if (mask)
   1746             {
   1747                 /* filtering */
   1748                 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   1749 
   1750                 /* unpack processed 4x4 neighborhood
   1751                  * memory is 4 byte aligned
   1752                  */
   1753                 *((uint32_t *)s0) = p0;
   1754                 *((uint32_t *)s1) = p1;
   1755                 *((uint32_t *)s2) = p2;
   1756                 *((uint32_t *)s3) = p3;
   1757                 *((uint32_t *)s4) = p4;
   1758                 *((uint32_t *)s5) = p5;
   1759             }
   1760         }
   1761 
   1762         sm1 += 4;
   1763         s0  += 4;
   1764         s1  += 4;
   1765         s2  += 4;
   1766         s3  += 4;
   1767         s4  += 4;
   1768         s5  += 4;
   1769         s6  += 4;
   1770 
   1771         i += 8;
   1772     }
   1773 
   1774     while (i < count);
   1775 }
   1776 
   1777 void vp8_mbloop_filter_uvhorizontal_edge_mips
   1778 (
   1779     unsigned char *s,
   1780     int p,
   1781     unsigned int flimit,
   1782     unsigned int limit,
   1783     unsigned int thresh,
   1784     int count
   1785 )
   1786 {
   1787     uint32_t mask, hev;
   1788     uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   1789     unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
   1790 
   1791     mask = 0;
   1792     hev = 0;
   1793     p1 = 0;
   1794     p2 = 0;
   1795     p3 = 0;
   1796     p4 = 0;
   1797 
   1798     /* loop filter designed to work using chars so that we can make maximum use
   1799      * of 8 bit simd instructions.
   1800      */
   1801 
   1802     sm1 = s - (p << 2);
   1803     s0  = s - p - p - p;
   1804     s1  = s - p - p;
   1805     s2  = s - p;
   1806     s3  = s;
   1807     s4  = s + p;
   1808     s5  = s + p + p;
   1809     s6  = s + p + p + p;
   1810 
   1811     /* load quad-byte vectors
   1812      * memory is 4 byte aligned
   1813      */
   1814     p1 = *((uint32_t *)(s1));
   1815     p2 = *((uint32_t *)(s2));
   1816     p3 = *((uint32_t *)(s3));
   1817     p4 = *((uint32_t *)(s4));
   1818 
   1819     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   1820      * mask will be zero and filtering is not needed
   1821      */
   1822     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
   1823     {
   1824 
   1825         pm1 = *((uint32_t *)(sm1));
   1826         p0  = *((uint32_t *)(s0));
   1827         p5  = *((uint32_t *)(s5));
   1828         p6  = *((uint32_t *)(s6));
   1829 
   1830         /* if mask == 0, filtering is not needed */
   1831         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1832                                  thresh, &hev, &mask);
   1833 
   1834         if (mask)
   1835         {
   1836             /* filtering */
   1837             vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   1838 
   1839             /* unpack processed 4x4 neighborhood
   1840              * memory is 4 byte aligned
   1841              */
   1842             *((uint32_t *)s0) = p0;
   1843             *((uint32_t *)s1) = p1;
   1844             *((uint32_t *)s2) = p2;
   1845             *((uint32_t *)s3) = p3;
   1846             *((uint32_t *)s4) = p4;
   1847             *((uint32_t *)s5) = p5;
   1848         }
   1849     }
   1850 
   1851     sm1 += 4;
   1852     s0  += 4;
   1853     s1  += 4;
   1854     s2  += 4;
   1855     s3  += 4;
   1856     s4  += 4;
   1857     s5  += 4;
   1858     s6  += 4;
   1859 
   1860     /* load quad-byte vectors
   1861      * memory is 4 byte aligned
   1862      */
   1863     p1 = *((uint32_t *)(s1));
   1864     p2 = *((uint32_t *)(s2));
   1865     p3 = *((uint32_t *)(s3));
   1866     p4 = *((uint32_t *)(s4));
   1867 
   1868     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   1869      * mask will be zero and filtering is not needed
   1870      */
   1871     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
   1872     {
   1873 
   1874         pm1 = *((uint32_t *)(sm1));
   1875         p0  = *((uint32_t *)(s0));
   1876         p5  = *((uint32_t *)(s5));
   1877         p6  = *((uint32_t *)(s6));
   1878 
   1879         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   1880                                  thresh, &hev, &mask);
   1881 
   1882         /* if mask == 0, filtering is not needed */
   1883         if (mask)
   1884         {
   1885             /* filtering */
   1886             vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   1887 
   1888             /* unpack processed 4x4 neighborhood
   1889              * memory is 4 byte aligned
   1890              */
   1891             *((uint32_t *)s0) = p0;
   1892             *((uint32_t *)s1) = p1;
   1893             *((uint32_t *)s2) = p2;
   1894             *((uint32_t *)s3) = p3;
   1895             *((uint32_t *)s4) = p4;
   1896             *((uint32_t *)s5) = p5;
   1897         }
   1898     }
   1899 }
   1900 
   1901 
   1902 void vp8_mbloop_filter_vertical_edge_mips
   1903 (
   1904     unsigned char *s,
   1905     int p,
   1906     unsigned int flimit,
   1907     unsigned int limit,
   1908     unsigned int thresh,
   1909     int count
   1910 )
   1911 {
   1912 
   1913     int i;
   1914     uint32_t mask, hev;
   1915     uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   1916     unsigned char *s1, *s2, *s3, *s4;
   1917     uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
   1918 
   1919     mask = 0;
   1920     hev = 0;
   1921     i = 0;
   1922     pm1 = 0;
   1923     p0 = 0;
   1924     p1 = 0;
   1925     p2 = 0;
   1926     p3 = 0;
   1927     p4 = 0;
   1928     p5 = 0;
   1929     p6 = 0;
   1930 
   1931     /* loop filter designed to work using chars so that we can make maximum use
   1932      * of 8 bit simd instructions.
   1933      */
   1934 
   1935     /* apply filter on 4 pixels at the same time */
   1936     do
   1937     {
   1938         s1 = s;
   1939         s2 = s + p;
   1940         s3 = s2 + p;
   1941         s4 = s3 + p;
   1942         s  = s4 + p;
   1943 
   1944         /* load quad-byte vectors
   1945          * memory is 4 byte aligned
   1946          */
   1947         p2  = *((uint32_t *)(s1 - 4));
   1948         p6  = *((uint32_t *)(s1));
   1949         p1  = *((uint32_t *)(s2 - 4));
   1950         p5  = *((uint32_t *)(s2));
   1951         p0  = *((uint32_t *)(s3 - 4));
   1952         p4  = *((uint32_t *)(s3));
   1953         pm1 = *((uint32_t *)(s4 - 4));
   1954         p3  = *((uint32_t *)(s4));
   1955 
   1956         /* transpose pm1, p0, p1, p2 */
   1957         __asm__ __volatile__ (
   1958             "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
   1959             "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
   1960             "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
   1961             "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
   1962 
   1963             "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
   1964             "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
   1965             "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1966             "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1967 
   1968             "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
   1969             "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
   1970             "append         %[p1],      %[sec3],    16          \n\t"
   1971             "append         %[pm1],     %[sec4],    16          \n\t"
   1972 
   1973             : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
   1974               [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
   1975               [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
   1976               [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
   1977             :
   1978         );
   1979 
   1980         /* transpose p3, p4, p5, p6 */
   1981         __asm__ __volatile__ (
   1982             "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
   1983             "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
   1984             "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
   1985             "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
   1986 
   1987             "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
   1988             "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
   1989             "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   1990             "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   1991 
   1992             "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
   1993             "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
   1994             "append         %[p5],      %[sec3],    16          \n\t"
   1995             "append         %[p3],      %[sec4],    16          \n\t"
   1996 
   1997             : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
   1998               [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
   1999               [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2000               [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
   2001             :
   2002         );
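                /* After the two transposes above pm1..p6 no longer hold rows: each
                 * now holds one column of four pixels, pm1..p2 being the four
                 * columns to the left of the vertical edge and p3..p6 the four to
                 * its right, with one byte lane per row s1..s4.  This lets the
                 * same quad-byte mask and filter code used for horizontal edges be
                 * reused unchanged; only the byte-by-byte stores below differ.
                 */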
   2003 
   2004         /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   2005          * mask will be zero and filtering is not needed
   2006          */
   2007         if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
   2008         {
   2009 
   2010             vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   2011                                      thresh, &hev, &mask);
   2012 
   2013             /* if mask == 0, filtering is not needed */
   2014             if (mask)
   2015             {
   2016                 /* filtering */
   2017                 vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   2018 
   2019                 /* don't use transpose on output data
   2020                  * because memory isn't aligned
   2021                  */
   2022                 __asm__ __volatile__ (
   2023                     "sb         %[p5],  2(%[s4])        \n\t"
   2024                     "sb         %[p4],  1(%[s4])        \n\t"
   2025                     "sb         %[p3],  0(%[s4])        \n\t"
   2026                     "sb         %[p2], -1(%[s4])        \n\t"
   2027                     "sb         %[p1], -2(%[s4])        \n\t"
   2028                     "sb         %[p0], -3(%[s4])        \n\t"
   2029                     :
   2030                     : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
   2031                       [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2032                 );
   2033 
   2034                 __asm__ __volatile__ (
   2035                     "srl        %[p5], %[p5], 8         \n\t"
   2036                     "srl        %[p4], %[p4], 8         \n\t"
   2037                     "srl        %[p3], %[p3], 8         \n\t"
   2038                     "srl        %[p2], %[p2], 8         \n\t"
   2039                     "srl        %[p1], %[p1], 8         \n\t"
   2040                     "srl        %[p0], %[p0], 8         \n\t"
   2041                     : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2042                       [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
   2043                     :
   2044                 );
   2045 
   2046                 __asm__ __volatile__ (
   2047                     "sb         %[p5],  2(%[s3])        \n\t"
   2048                     "sb         %[p4],  1(%[s3])        \n\t"
   2049                     "sb         %[p3],  0(%[s3])        \n\t"
   2050                     "sb         %[p2], -1(%[s3])        \n\t"
   2051                     "sb         %[p1], -2(%[s3])        \n\t"
   2052                     "sb         %[p0], -3(%[s3])        \n\t"
   2053                     :
   2054                     : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
   2055                       [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2056                 );
   2057 
   2058                 __asm__ __volatile__ (
   2059                     "srl        %[p5], %[p5], 8         \n\t"
   2060                     "srl        %[p4], %[p4], 8         \n\t"
   2061                     "srl        %[p3], %[p3], 8         \n\t"
   2062                     "srl        %[p2], %[p2], 8         \n\t"
   2063                     "srl        %[p1], %[p1], 8         \n\t"
   2064                     "srl        %[p0], %[p0], 8         \n\t"
   2065                     : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2066                       [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
   2067                     :
   2068                 );
   2069 
   2070                 __asm__ __volatile__ (
   2071                     "sb         %[p5],  2(%[s2])        \n\t"
   2072                     "sb         %[p4],  1(%[s2])        \n\t"
   2073                     "sb         %[p3],  0(%[s2])        \n\t"
   2074                     "sb         %[p2], -1(%[s2])        \n\t"
   2075                     "sb         %[p1], -2(%[s2])        \n\t"
   2076                     "sb         %[p0], -3(%[s2])        \n\t"
   2077                     :
   2078                     : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
   2079                       [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2080                 );
   2081 
   2082                 __asm__ __volatile__ (
   2083                     "srl        %[p5], %[p5], 8         \n\t"
   2084                     "srl        %[p4], %[p4], 8         \n\t"
   2085                     "srl        %[p3], %[p3], 8         \n\t"
   2086                     "srl        %[p2], %[p2], 8         \n\t"
   2087                     "srl        %[p1], %[p1], 8         \n\t"
   2088                     "srl        %[p0], %[p0], 8         \n\t"
   2089                     : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2090                       [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
   2091                     :
   2092                 );
   2093 
   2094                 __asm__ __volatile__ (
   2095                     "sb         %[p5],  2(%[s1])        \n\t"
   2096                     "sb         %[p4],  1(%[s1])        \n\t"
   2097                     "sb         %[p3],  0(%[s1])        \n\t"
   2098                     "sb         %[p2], -1(%[s1])        \n\t"
   2099                     "sb         %[p1], -2(%[s1])        \n\t"
   2100                     "sb         %[p0], -3(%[s1])        \n\t"
   2101                     :
   2102                     : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
   2103                       [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2104                 );
   2105             }
   2106         }
   2107 
   2108         i += 4;
   2109     }
   2110 
   2111     while (i < count);
   2112 }
   2113 
   2114 void vp8_mbloop_filter_uvvertical_edge_mips
   2115 (
   2116     unsigned char *s,
   2117     int p,
   2118     unsigned int flimit,
   2119     unsigned int limit,
   2120     unsigned int thresh,
   2121     int count
   2122 )
   2123 {
   2124     uint32_t mask, hev;
   2125     uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
   2126     unsigned char *s1, *s2, *s3, *s4;
   2127     uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
   2128 
   2129     mask = 0;
   2130     hev = 0;
   2131     pm1 = 0;
   2132     p0 = 0;
   2133     p1 = 0;
   2134     p2 = 0;
   2135     p3 = 0;
   2136     p4 = 0;
   2137     p5 = 0;
   2138     p6 = 0;
   2139 
   2140     /* loop filter designed to work using chars so that we can make maximum use
   2141      * of 8 bit simd instructions.
   2142      */
   2143 
   2144     /* apply filter on 4 pixels at the same time */
   2145 
   2146     s1 = s;
   2147     s2 = s + p;
   2148     s3 = s2 + p;
   2149     s4 = s3 + p;
   2150 
   2151     /* prefetch data for load */
   2152     prefetch_load_lf(s + 2 * p);
   2153 
   2154     /* load quad-byte vectors
   2155      * memory is 4 byte aligned
   2156      */
   2157     p2  = *((uint32_t *)(s1 - 4));
   2158     p6  = *((uint32_t *)(s1));
   2159     p1  = *((uint32_t *)(s2 - 4));
   2160     p5  = *((uint32_t *)(s2));
   2161     p0  = *((uint32_t *)(s3 - 4));
   2162     p4  = *((uint32_t *)(s3));
   2163     pm1 = *((uint32_t *)(s4 - 4));
   2164     p3  = *((uint32_t *)(s4));
   2165 
   2166     /* transpose pm1, p0, p1, p2 */
   2167     __asm__ __volatile__ (
   2168         "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
   2169         "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
   2170         "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
   2171         "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
   2172 
   2173         "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
   2174         "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
   2175         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   2176         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   2177 
   2178         "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
   2179         "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
   2180         "append         %[p1],      %[sec3],    16          \n\t"
   2181         "append         %[pm1],     %[sec4],    16          \n\t"
   2182 
   2183         : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
   2184           [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
   2185           [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
   2186           [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
   2187         :
   2188     );
   2189 
   2190     /* transpose p3, p4, p5, p6 */
   2191     __asm__ __volatile__ (
   2192         "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
   2193         "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
   2194         "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
   2195         "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
   2196 
   2197         "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
   2198         "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
   2199         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   2200         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   2201 
   2202         "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
   2203         "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
   2204         "append         %[p5],      %[sec3],    16          \n\t"
   2205         "append         %[p3],      %[sec4],    16          \n\t"
   2206 
   2207         : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
   2208           [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
   2209           [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2210           [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
   2211         :
   2212     );
   2213 
   2214     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   2215      * mask will be zero and filtering is not needed
   2216      */
   2217     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
   2218     {
   2219 
   2220         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6,
   2221                                  thresh, &hev, &mask);
   2222 
   2223         /* if mask == 0, filtering is not needed */
   2224         if (mask)
   2225         {
   2226             /* filtering */
   2227             vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   2228 
   2229             /* don't use transpose on output data
   2230              * because memory isn't aligned
   2231              */
   2232             __asm__ __volatile__ (
   2233                 "sb         %[p5],  2(%[s4])        \n\t"
   2234                 "sb         %[p4],  1(%[s4])        \n\t"
   2235                 "sb         %[p3],  0(%[s4])        \n\t"
   2236                 "sb         %[p2], -1(%[s4])        \n\t"
   2237                 "sb         %[p1], -2(%[s4])        \n\t"
   2238                 "sb         %[p0], -3(%[s4])        \n\t"
   2239                 :
   2240                 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
   2241                   [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2242             );
   2243 
   2244             __asm__ __volatile__ (
   2245                 "srl        %[p5], %[p5], 8         \n\t"
   2246                 "srl        %[p4], %[p4], 8         \n\t"
   2247                 "srl        %[p3], %[p3], 8         \n\t"
   2248                 "srl        %[p2], %[p2], 8         \n\t"
   2249                 "srl        %[p1], %[p1], 8         \n\t"
   2250                 "srl        %[p0], %[p0], 8         \n\t"
   2251                 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2252                   [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
   2253                 :
   2254             );
   2255 
   2256             __asm__ __volatile__ (
   2257                 "sb         %[p5],  2(%[s3])        \n\t"
   2258                 "sb         %[p4],  1(%[s3])        \n\t"
   2259                 "sb         %[p3],  0(%[s3])        \n\t"
   2260                 "sb         %[p2], -1(%[s3])        \n\t"
   2261                 "sb         %[p1], -2(%[s3])        \n\t"
   2262                 "sb         %[p0], -3(%[s3])        \n\t"
   2263                 :
   2264                 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
   2265                   [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2266             );
   2267 
   2268             __asm__ __volatile__ (
   2269                 "srl        %[p5], %[p5], 8         \n\t"
   2270                 "srl        %[p4], %[p4], 8         \n\t"
   2271                 "srl        %[p3], %[p3], 8         \n\t"
   2272                 "srl        %[p2], %[p2], 8         \n\t"
   2273                 "srl        %[p1], %[p1], 8         \n\t"
   2274                 "srl        %[p0], %[p0], 8         \n\t"
   2275                 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2276                   [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
   2277                 :
   2278             );
   2279 
   2280             __asm__ __volatile__ (
   2281                 "sb         %[p5],  2(%[s2])        \n\t"
   2282                 "sb         %[p4],  1(%[s2])        \n\t"
   2283                 "sb         %[p3],  0(%[s2])        \n\t"
   2284                 "sb         %[p2], -1(%[s2])        \n\t"
   2285                 "sb         %[p1], -2(%[s2])        \n\t"
   2286                 "sb         %[p0], -3(%[s2])        \n\t"
   2287                 :
   2288                 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
   2289                   [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2290             );
   2291 
   2292             __asm__ __volatile__ (
   2293                 "srl        %[p5], %[p5], 8         \n\t"
   2294                 "srl        %[p4], %[p4], 8         \n\t"
   2295                 "srl        %[p3], %[p3], 8         \n\t"
   2296                 "srl        %[p2], %[p2], 8         \n\t"
   2297                 "srl        %[p1], %[p1], 8         \n\t"
   2298                 "srl        %[p0], %[p0], 8         \n\t"
   2299                 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2300                   [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
   2301                 :
   2302             );
   2303 
   2304             __asm__ __volatile__ (
   2305                 "sb         %[p5],  2(%[s1])        \n\t"
   2306                 "sb         %[p4],  1(%[s1])        \n\t"
   2307                 "sb         %[p3],  0(%[s1])        \n\t"
   2308                 "sb         %[p2], -1(%[s1])        \n\t"
   2309                 "sb         %[p1], -2(%[s1])        \n\t"
   2310                 "sb         %[p0], -3(%[s1])        \n\t"
   2311                 :
   2312                 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
   2313                   [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2314             );
   2315         }
   2316     }
   2317 
   2318     s1 = s4 + p;
   2319     s2 = s1 + p;
   2320     s3 = s2 + p;
   2321     s4 = s3 + p;
   2322 
   2323     /* load quad-byte vectors
   2324      * memory is 4 byte aligned
   2325      */
   2326     p2  = *((uint32_t *)(s1 - 4));
   2327     p6  = *((uint32_t *)(s1));
   2328     p1  = *((uint32_t *)(s2 - 4));
   2329     p5  = *((uint32_t *)(s2));
   2330     p0  = *((uint32_t *)(s3 - 4));
   2331     p4  = *((uint32_t *)(s3));
   2332     pm1 = *((uint32_t *)(s4 - 4));
   2333     p3  = *((uint32_t *)(s4));
   2334 
   2335     /* transpose pm1, p0, p1, p2 */
   2336     __asm__ __volatile__ (
   2337         "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
   2338         "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
   2339         "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
   2340         "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
   2341 
   2342         "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
   2343         "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
   2344         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   2345         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   2346 
   2347         "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
   2348         "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
   2349         "append         %[p1],      %[sec3],    16          \n\t"
   2350         "append         %[pm1],     %[sec4],    16          \n\t"
   2351 
   2352         : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
   2353           [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
   2354           [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
   2355           [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
   2356         :
   2357     );
   2358 
   2359     /* transpose p3, p4, p5, p6 */
   2360     __asm__ __volatile__ (
   2361         "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
   2362         "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
   2363         "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
   2364         "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
   2365 
   2366         "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
   2367         "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
   2368         "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
   2369         "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
   2370 
   2371         "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
   2372         "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
   2373         "append         %[p5],      %[sec3],    16          \n\t"
   2374         "append         %[p3],      %[sec4],    16          \n\t"
   2375 
   2376         : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
   2377           [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
   2378           [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2379           [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
   2380         :
   2381     );
   2382 
   2383     /* if (p1 - p4 == 0) and (p2 - p3 == 0)
   2384      * mask will be zero and filtering is not needed
   2385      */
   2386     if (!(((p1 - p4) == 0) && ((p2 - p3) == 0)))
   2387     {
   2388 
   2389         vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, thresh, &hev, &mask);
   2390 
   2391         /* if mask == 0, filtering is not needed */
   2392         if (mask)
   2393         {
   2394             /* filtering */
   2395             vp8_mbfilter_mips(mask, hev, &p0, &p1, &p2, &p3, &p4, &p5);
   2396 
   2397             /* don't use transpose on output data
   2398              * because memory isn't aligned
   2399              */
   2400             __asm__ __volatile__ (
   2401                 "sb         %[p5],  2(%[s4])        \n\t"
   2402                 "sb         %[p4],  1(%[s4])        \n\t"
   2403                 "sb         %[p3],  0(%[s4])        \n\t"
   2404                 "sb         %[p2], -1(%[s4])        \n\t"
   2405                 "sb         %[p1], -2(%[s4])        \n\t"
   2406                 "sb         %[p0], -3(%[s4])        \n\t"
   2407                 :
   2408                 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s4] "r" (s4),
   2409                   [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2410             );
   2411 
   2412             __asm__ __volatile__ (
   2413                 "srl        %[p5], %[p5], 8         \n\t"
   2414                 "srl        %[p4], %[p4], 8         \n\t"
   2415                 "srl        %[p3], %[p3], 8         \n\t"
   2416                 "srl        %[p2], %[p2], 8         \n\t"
   2417                 "srl        %[p1], %[p1], 8         \n\t"
   2418                 "srl        %[p0], %[p0], 8         \n\t"
   2419                 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2420                   [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
   2421                 :
   2422             );
   2423 
   2424             __asm__ __volatile__ (
   2425                 "sb         %[p5],  2(%[s3])        \n\t"
   2426                 "sb         %[p4],  1(%[s3])        \n\t"
   2427                 "sb         %[p3],  0(%[s3])        \n\t"
   2428                 "sb         %[p2], -1(%[s3])        \n\t"
   2429                 "sb         %[p1], -2(%[s3])        \n\t"
   2430                 "sb         %[p0], -3(%[s3])        \n\t"
   2431                 :
   2432                 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s3] "r" (s3),
   2433                   [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2434             );
   2435 
   2436             __asm__ __volatile__ (
   2437                 "srl        %[p5], %[p5], 8         \n\t"
   2438                 "srl        %[p4], %[p4], 8         \n\t"
   2439                 "srl        %[p3], %[p3], 8         \n\t"
   2440                 "srl        %[p2], %[p2], 8         \n\t"
   2441                 "srl        %[p1], %[p1], 8         \n\t"
   2442                 "srl        %[p0], %[p0], 8         \n\t"
   2443                 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2444                   [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
   2445                 :
   2446             );
   2447 
   2448             __asm__ __volatile__ (
   2449                 "sb         %[p5],  2(%[s2])        \n\t"
   2450                 "sb         %[p4],  1(%[s2])        \n\t"
   2451                 "sb         %[p3],  0(%[s2])        \n\t"
   2452                 "sb         %[p2], -1(%[s2])        \n\t"
   2453                 "sb         %[p1], -2(%[s2])        \n\t"
   2454                 "sb         %[p0], -3(%[s2])        \n\t"
   2455                 :
   2456                 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s2] "r" (s2),
   2457                   [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2458             );
   2459 
   2460             __asm__ __volatile__ (
   2461                 "srl        %[p5], %[p5], 8         \n\t"
   2462                 "srl        %[p4], %[p4], 8         \n\t"
   2463                 "srl        %[p3], %[p3], 8         \n\t"
   2464                 "srl        %[p2], %[p2], 8         \n\t"
   2465                 "srl        %[p1], %[p1], 8         \n\t"
   2466                 "srl        %[p0], %[p0], 8         \n\t"
   2467                 : [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
   2468                   [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0)
   2469                 :
   2470             );
   2471 
   2472             __asm__ __volatile__ (
   2473                 "sb         %[p5],  2(%[s1])        \n\t"
   2474                 "sb         %[p4],  1(%[s1])        \n\t"
   2475                 "sb         %[p3],  0(%[s1])        \n\t"
   2476                 "sb         %[p2], -1(%[s1])        \n\t"
   2477                 "sb         %[p1], -2(%[s1])        \n\t"
   2478                 "sb         %[p0], -3(%[s1])        \n\t"
   2479                 :
   2480                 : [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3), [s1] "r" (s1),
   2481                   [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0)
   2482             );
   2483         }
   2484     }
   2485 }
   2486 
   2487 /* Horizontal MB filtering */
   2488 void vp8_loop_filter_mbh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
   2489                                int y_stride, int uv_stride, loop_filter_info *lfi)
   2490 {
   2491     unsigned int thresh_vec, flimit_vec, limit_vec;
   2492     unsigned char thresh, flimit, limit, flimit_temp;
   2493 
   2494     /* use direct values instead of pointers */
   2495     limit = *(lfi->lim);
   2496     flimit_temp = *(lfi->mblim);
   2497     thresh = *(lfi->hev_thr);
   2498     flimit = flimit_temp;
   2499 
   2500     /* create quad-byte */
   2501     __asm__ __volatile__ (
   2502         "replv.qb       %[thresh_vec], %[thresh]    \n\t"
   2503         "replv.qb       %[flimit_vec], %[flimit]    \n\t"
   2504         "replv.qb       %[limit_vec],  %[limit]     \n\t"
   2505         : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
   2506         : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
   2507     );
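            /* replv.qb copies the low byte of its source into all four byte lanes,
             * e.g. thresh = 0x28 would give thresh_vec = 0x28282828, so a single
             * threshold, limit or flimit byte is compared against four pixels at a
             * time inside the edge-filter routines.
             */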
   2508 
   2509     vp8_mbloop_filter_horizontal_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
   2510 
   2511     if (u_ptr)
   2512     {
   2513         vp8_mbloop_filter_uvhorizontal_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
   2514     }
   2515 
   2516     if (v_ptr)
   2517     {
   2518         vp8_mbloop_filter_uvhorizontal_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
   2519     }
   2520 }
   2521 
   2522 
   2523 /* Vertical MB Filtering */
   2524 void vp8_loop_filter_mbv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
   2525                                int y_stride, int uv_stride, loop_filter_info *lfi)
   2526 {
   2527     unsigned int thresh_vec, flimit_vec, limit_vec;
   2528     unsigned char thresh, flimit, limit, flimit_temp;
   2529 
   2530     /* use direct values instead of pointers */
   2531     limit = *(lfi->lim);
   2532     flimit_temp = *(lfi->mblim);
   2533     thresh = *(lfi->hev_thr);
   2534     flimit = flimit_temp;
   2535 
   2536     /* create quad-byte */
   2537     __asm__ __volatile__ (
   2538         "replv.qb       %[thresh_vec], %[thresh]    \n\t"
   2539         "replv.qb       %[flimit_vec], %[flimit]    \n\t"
   2540         "replv.qb       %[limit_vec],  %[limit]     \n\t"
   2541         : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
   2542         : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
   2543     );
   2544 
   2545     vp8_mbloop_filter_vertical_edge_mips(y_ptr, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
   2546 
   2547     if (u_ptr)
   2548         vp8_mbloop_filter_uvvertical_edge_mips(u_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
   2549 
   2550     if (v_ptr)
   2551         vp8_mbloop_filter_uvvertical_edge_mips(v_ptr, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
   2552 }
   2553 
   2554 
   2555 /* Horizontal B Filtering */
   2556 void vp8_loop_filter_bh_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
   2557                               int y_stride, int uv_stride, loop_filter_info *lfi)
   2558 {
   2559     unsigned int thresh_vec, flimit_vec, limit_vec;
   2560     unsigned char thresh, flimit, limit, flimit_temp;
   2561 
   2562     /* use direct values instead of pointers */
   2563     limit = *(lfi->lim);
   2564     flimit_temp = *(lfi->blim);
   2565     thresh = *(lfi->hev_thr);
   2566     flimit = flimit_temp;
   2567 
   2568     /* create quad-byte */
   2569     __asm__ __volatile__ (
   2570         "replv.qb       %[thresh_vec], %[thresh]    \n\t"
   2571         "replv.qb       %[flimit_vec], %[flimit]    \n\t"
   2572         "replv.qb       %[limit_vec],  %[limit]     \n\t"
   2573         : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
   2574         : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
   2575     );
   2576 
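            /* The three calls below start 4, 8 and 12 rows into the macroblock,
             * i.e. they filter the three interior 4x4 block edges; the outer
             * macroblock edge itself is handled by vp8_loop_filter_mbh_dspr2
             * above.  vp8_loop_filter_bv_dspr2 below does the same along columns
             * (offsets +4, +8 and +12).
             */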
   2577     vp8_loop_filter_horizontal_edge_mips(y_ptr + 4 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
   2578     vp8_loop_filter_horizontal_edge_mips(y_ptr + 8 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
   2579     vp8_loop_filter_horizontal_edge_mips(y_ptr + 12 * y_stride, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
   2580 
   2581     if (u_ptr)
   2582         vp8_loop_filter_uvhorizontal_edge_mips(u_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
   2583 
   2584     if (v_ptr)
   2585         vp8_loop_filter_uvhorizontal_edge_mips(v_ptr + 4 * uv_stride, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
   2586 }
   2587 
   2588 
   2589 /* Vertical B Filtering */
   2590 void vp8_loop_filter_bv_dspr2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
   2591                               int y_stride, int uv_stride, loop_filter_info *lfi)
   2592 {
   2593     unsigned int thresh_vec, flimit_vec, limit_vec;
   2594     unsigned char thresh, flimit, limit, flimit_temp;
   2595 
   2596     /* use direct values instead of pointers */
   2597     limit = *(lfi->lim);
   2598     flimit_temp = *(lfi->blim);
   2599     thresh = *(lfi->hev_thr);
   2600     flimit = flimit_temp;
   2601 
   2602     /* create quad-byte */
   2603     __asm__ __volatile__ (
   2604         "replv.qb       %[thresh_vec], %[thresh]    \n\t"
   2605         "replv.qb       %[flimit_vec], %[flimit]    \n\t"
   2606         "replv.qb       %[limit_vec],  %[limit]     \n\t"
   2607         : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), [limit_vec] "=r" (limit_vec)
   2608         : [thresh] "r" (thresh), [flimit] "r" (flimit), [limit] "r" (limit)
   2609     );
   2610 
   2611     vp8_loop_filter_vertical_edge_mips(y_ptr + 4, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
   2612     vp8_loop_filter_vertical_edge_mips(y_ptr + 8, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
   2613     vp8_loop_filter_vertical_edge_mips(y_ptr + 12, y_stride, flimit_vec, limit_vec, thresh_vec, 16);
   2614 
   2615     if (u_ptr)
   2616         vp8_loop_filter_uvvertical_edge_mips(u_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
   2617 
   2618     if (v_ptr)
   2619         vp8_loop_filter_uvvertical_edge_mips(v_ptr + 4, uv_stride, flimit_vec, limit_vec, thresh_vec, 0);
   2620 }
   2621 
   2622 #endif
   2623