Home | History | Annotate | Download | only in mips
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <stdlib.h>
     12 
     13 #include "./vpx_dsp_rtcd.h"
     14 #include "vpx/vpx_integer.h"
     15 #include "vpx_dsp/mips/common_dspr2.h"
     16 #include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
     17 #include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
     18 #include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
     19 #include "vpx_mem/vpx_mem.h"
     20 
     21 #if HAVE_DSPR2
     22 static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
     23                                    const uint8_t *blimit, const uint8_t *limit,
     24                                    const uint8_t *thresh, int count) {
     25   uint32_t mask;
     26   uint32_t hev, flat, flat2;
     27   uint8_t i;
     28   uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
     29   uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
     30   uint32_t thresh_vec, flimit_vec, limit_vec;
     31   uint32_t uflimit, ulimit, uthresh;
     32   uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
     33   uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
     34   uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
     35   uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
     36   uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
     37   uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
     38   uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
     39   uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
     40 
     41   uflimit = *blimit;
     42   ulimit = *limit;
     43   uthresh = *thresh;
     44 
     45   /* create quad-byte */
     46   __asm__ __volatile__(
     47       "replv.qb       %[thresh_vec],    %[uthresh]      \n\t"
     48       "replv.qb       %[flimit_vec],    %[uflimit]      \n\t"
     49       "replv.qb       %[limit_vec],     %[ulimit]       \n\t"
     50 
     51       : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
     52         [limit_vec] "=r"(limit_vec)
     53       : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
     54 
     55   /* prefetch data for store */
     56   prefetch_store(s);
     57 
     58   for (i = 0; i < (2 * count); i++) {
     59     sp7 = s - (pitch << 3);
     60     sp6 = sp7 + pitch;
     61     sp5 = sp6 + pitch;
     62     sp4 = sp5 + pitch;
     63     sp3 = sp4 + pitch;
     64     sp2 = sp3 + pitch;
     65     sp1 = sp2 + pitch;
     66     sp0 = sp1 + pitch;
     67     sq0 = s;
     68     sq1 = s + pitch;
     69     sq2 = sq1 + pitch;
     70     sq3 = sq2 + pitch;
     71     sq4 = sq3 + pitch;
     72     sq5 = sq4 + pitch;
     73     sq6 = sq5 + pitch;
     74     sq7 = sq6 + pitch;
     75 
     76     __asm__ __volatile__(
     77         "lw     %[p7],      (%[sp7])            \n\t"
     78         "lw     %[p6],      (%[sp6])            \n\t"
     79         "lw     %[p5],      (%[sp5])            \n\t"
     80         "lw     %[p4],      (%[sp4])            \n\t"
     81         "lw     %[p3],      (%[sp3])            \n\t"
     82         "lw     %[p2],      (%[sp2])            \n\t"
     83         "lw     %[p1],      (%[sp1])            \n\t"
     84         "lw     %[p0],      (%[sp0])            \n\t"
     85 
     86         : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
     87           [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
     88         : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
     89           [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));
     90 
     91     __asm__ __volatile__(
     92         "lw     %[q0],      (%[sq0])            \n\t"
     93         "lw     %[q1],      (%[sq1])            \n\t"
     94         "lw     %[q2],      (%[sq2])            \n\t"
     95         "lw     %[q3],      (%[sq3])            \n\t"
     96         "lw     %[q4],      (%[sq4])            \n\t"
     97         "lw     %[q5],      (%[sq5])            \n\t"
     98         "lw     %[q6],      (%[sq6])            \n\t"
     99         "lw     %[q7],      (%[sq7])            \n\t"
    100 
    101         : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
    102           [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
    103         : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
    104           [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));
    105 
    106     filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
    107                                     p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
    108 
    109     flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
    110 
    111     /* f0 */
    112     if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
    113         ((flat2 != 0) && (flat == 0) && (mask != 0))) {
    114       filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
    115 
    116       __asm__ __volatile__(
    117           "sw       %[p1_f0],   (%[sp1])            \n\t"
    118           "sw       %[p0_f0],   (%[sp0])            \n\t"
    119           "sw       %[q0_f0],   (%[sq0])            \n\t"
    120           "sw       %[q1_f0],   (%[sq1])            \n\t"
    121 
    122           :
    123           : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
    124             [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
    125             [sq1] "r"(sq1));
    126     } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
    127                (mask == 0xFFFFFFFF)) {
    128       /* f2 */
    129       PACK_LEFT_0TO3()
    130       PACK_LEFT_4TO7()
    131       wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
    132                           &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
    133                           &q6_l, &q7_l);
    134 
    135       PACK_RIGHT_0TO3()
    136       PACK_RIGHT_4TO7()
    137       wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
    138                           &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
    139                           &q6_r, &q7_r);
    140 
    141       COMBINE_LEFT_RIGHT_0TO2()
    142       COMBINE_LEFT_RIGHT_3TO6()
    143 
    144       __asm__ __volatile__(
    145           "sw         %[p6], (%[sp6])    \n\t"
    146           "sw         %[p5], (%[sp5])    \n\t"
    147           "sw         %[p4], (%[sp4])    \n\t"
    148           "sw         %[p3], (%[sp3])    \n\t"
    149           "sw         %[p2], (%[sp2])    \n\t"
    150           "sw         %[p1], (%[sp1])    \n\t"
    151           "sw         %[p0], (%[sp0])    \n\t"
    152 
    153           :
    154           : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
    155             [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
    156             [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
    157             [sp1] "r"(sp1), [sp0] "r"(sp0));
    158 
    159       __asm__ __volatile__(
    160           "sw         %[q6], (%[sq6])    \n\t"
    161           "sw         %[q5], (%[sq5])    \n\t"
    162           "sw         %[q4], (%[sq4])    \n\t"
    163           "sw         %[q3], (%[sq3])    \n\t"
    164           "sw         %[q2], (%[sq2])    \n\t"
    165           "sw         %[q1], (%[sq1])    \n\t"
    166           "sw         %[q0], (%[sq0])    \n\t"
    167 
    168           :
    169           : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
    170             [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
    171             [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
    172             [sq1] "r"(sq1), [sq0] "r"(sq0));
    173     } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
    174       /* f1 */
    175       /* left 2 element operation */
    176       PACK_LEFT_0TO3()
    177       mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
    178 
    179       /* right 2 element operation */
    180       PACK_RIGHT_0TO3()
    181       mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
    182 
    183       COMBINE_LEFT_RIGHT_0TO2()
    184 
    185       __asm__ __volatile__(
    186           "sw         %[p2], (%[sp2])    \n\t"
    187           "sw         %[p1], (%[sp1])    \n\t"
    188           "sw         %[p0], (%[sp0])    \n\t"
    189           "sw         %[q0], (%[sq0])    \n\t"
    190           "sw         %[q1], (%[sq1])    \n\t"
    191           "sw         %[q2], (%[sq2])    \n\t"
    192 
    193           :
    194           : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
    195             [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
    196             [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
    197     } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
    198       /* f0+f1 */
    199       filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
    200 
    201       /* left 2 element operation */
    202       PACK_LEFT_0TO3()
    203       mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
    204 
    205       /* right 2 element operation */
    206       PACK_RIGHT_0TO3()
    207       mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
    208 
    209       if (mask & flat & 0x000000FF) {
    210         __asm__ __volatile__(
    211             "sb         %[p2_r],  (%[sp2])    \n\t"
    212             "sb         %[p1_r],  (%[sp1])    \n\t"
    213             "sb         %[p0_r],  (%[sp0])    \n\t"
    214             "sb         %[q0_r],  (%[sq0])    \n\t"
    215             "sb         %[q1_r],  (%[sq1])    \n\t"
    216             "sb         %[q2_r],  (%[sq2])    \n\t"
    217 
    218             :
    219             : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
    220               [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
    221               [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
    222               [sq1] "r"(sq1), [sq2] "r"(sq2));
    223       } else if (mask & 0x000000FF) {
    224         __asm__ __volatile__(
    225             "sb         %[p1_f0],  (%[sp1])    \n\t"
    226             "sb         %[p0_f0],  (%[sp0])    \n\t"
    227             "sb         %[q0_f0],  (%[sq0])    \n\t"
    228             "sb         %[q1_f0],  (%[sq1])    \n\t"
    229 
    230             :
    231             : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
    232               [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
    233               [sq0] "r"(sq0), [sq1] "r"(sq1));
    234       }
    235 
    236       __asm__ __volatile__(
    237           "srl      %[p2_r],    %[p2_r],    16      \n\t"
    238           "srl      %[p1_r],    %[p1_r],    16      \n\t"
    239           "srl      %[p0_r],    %[p0_r],    16      \n\t"
    240           "srl      %[q0_r],    %[q0_r],    16      \n\t"
    241           "srl      %[q1_r],    %[q1_r],    16      \n\t"
    242           "srl      %[q2_r],    %[q2_r],    16      \n\t"
    243           "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
    244           "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
    245           "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
    246           "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
    247 
    248           : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
    249             [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
    250             [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
    251             [q1_f0] "+r"(q1_f0)
    252           :);
    253 
    254       if (mask & flat & 0x0000FF00) {
    255         __asm__ __volatile__(
    256             "sb         %[p2_r],  +1(%[sp2])    \n\t"
    257             "sb         %[p1_r],  +1(%[sp1])    \n\t"
    258             "sb         %[p0_r],  +1(%[sp0])    \n\t"
    259             "sb         %[q0_r],  +1(%[sq0])    \n\t"
    260             "sb         %[q1_r],  +1(%[sq1])    \n\t"
    261             "sb         %[q2_r],  +1(%[sq2])    \n\t"
    262 
    263             :
    264             : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
    265               [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
    266               [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
    267               [sq1] "r"(sq1), [sq2] "r"(sq2));
    268       } else if (mask & 0x0000FF00) {
    269         __asm__ __volatile__(
    270             "sb         %[p1_f0],  +1(%[sp1])    \n\t"
    271             "sb         %[p0_f0],  +1(%[sp0])    \n\t"
    272             "sb         %[q0_f0],  +1(%[sq0])    \n\t"
    273             "sb         %[q1_f0],  +1(%[sq1])    \n\t"
    274 
    275             :
    276             : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
    277               [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
    278               [sq0] "r"(sq0), [sq1] "r"(sq1));
    279       }
    280 
    281       __asm__ __volatile__(
    282           "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
    283           "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
    284           "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
    285           "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
    286 
    287           : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
    288             [q1_f0] "+r"(q1_f0)
    289           :);
    290 
    291       if (mask & flat & 0x00FF0000) {
    292         __asm__ __volatile__(
    293             "sb         %[p2_l],  +2(%[sp2])    \n\t"
    294             "sb         %[p1_l],  +2(%[sp1])    \n\t"
    295             "sb         %[p0_l],  +2(%[sp0])    \n\t"
    296             "sb         %[q0_l],  +2(%[sq0])    \n\t"
    297             "sb         %[q1_l],  +2(%[sq1])    \n\t"
    298             "sb         %[q2_l],  +2(%[sq2])    \n\t"
    299 
    300             :
    301             : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
    302               [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
    303               [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
    304               [sq1] "r"(sq1), [sq2] "r"(sq2));
    305       } else if (mask & 0x00FF0000) {
    306         __asm__ __volatile__(
    307             "sb         %[p1_f0],  +2(%[sp1])    \n\t"
    308             "sb         %[p0_f0],  +2(%[sp0])    \n\t"
    309             "sb         %[q0_f0],  +2(%[sq0])    \n\t"
    310             "sb         %[q1_f0],  +2(%[sq1])    \n\t"
    311 
    312             :
    313             : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
    314               [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
    315               [sq0] "r"(sq0), [sq1] "r"(sq1));
    316       }
    317 
    318       __asm__ __volatile__(
    319           "srl      %[p2_l],    %[p2_l],    16      \n\t"
    320           "srl      %[p1_l],    %[p1_l],    16      \n\t"
    321           "srl      %[p0_l],    %[p0_l],    16      \n\t"
    322           "srl      %[q0_l],    %[q0_l],    16      \n\t"
    323           "srl      %[q1_l],    %[q1_l],    16      \n\t"
    324           "srl      %[q2_l],    %[q2_l],    16      \n\t"
    325           "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
    326           "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
    327           "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
    328           "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
    329 
    330           : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
    331             [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
    332             [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
    333             [q1_f0] "+r"(q1_f0)
    334           :);
    335 
    336       if (mask & flat & 0xFF000000) {
    337         __asm__ __volatile__(
    338             "sb         %[p2_l],  +3(%[sp2])    \n\t"
    339             "sb         %[p1_l],  +3(%[sp1])    \n\t"
    340             "sb         %[p0_l],  +3(%[sp0])    \n\t"
    341             "sb         %[q0_l],  +3(%[sq0])    \n\t"
    342             "sb         %[q1_l],  +3(%[sq1])    \n\t"
    343             "sb         %[q2_l],  +3(%[sq2])    \n\t"
    344 
    345             :
    346             : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
    347               [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
    348               [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
    349               [sq1] "r"(sq1), [sq2] "r"(sq2));
    350       } else if (mask & 0xFF000000) {
    351         __asm__ __volatile__(
    352             "sb         %[p1_f0],  +3(%[sp1])    \n\t"
    353             "sb         %[p0_f0],  +3(%[sp0])    \n\t"
    354             "sb         %[q0_f0],  +3(%[sq0])    \n\t"
    355             "sb         %[q1_f0],  +3(%[sq1])    \n\t"
    356 
    357             :
    358             : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
    359               [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
    360               [sq0] "r"(sq0), [sq1] "r"(sq1));
    361       }
    362     } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
    363       /* f0 + f1 + f2 */
    364       /* f0  function */
    365       filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
    366 
    367       /* f1  function */
    368       /* left 2 element operation */
    369       PACK_LEFT_0TO3()
    370       mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
    371                       &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
    372 
    373       /* right 2 element operation */
    374       PACK_RIGHT_0TO3()
    375       mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
    376                       &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
    377 
    378       /* f2  function */
    379       PACK_LEFT_4TO7()
    380       wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
    381                           &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
    382                           &q6_l, &q7_l);
    383 
    384       PACK_RIGHT_4TO7()
    385       wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
    386                           &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
    387                           &q6_r, &q7_r);
    388 
    389       if (mask & flat & flat2 & 0x000000FF) {
    390         __asm__ __volatile__(
    391             "sb         %[p6_r],  (%[sp6])    \n\t"
    392             "sb         %[p5_r],  (%[sp5])    \n\t"
    393             "sb         %[p4_r],  (%[sp4])    \n\t"
    394             "sb         %[p3_r],  (%[sp3])    \n\t"
    395             "sb         %[p2_r],  (%[sp2])    \n\t"
    396             "sb         %[p1_r],  (%[sp1])    \n\t"
    397             "sb         %[p0_r],  (%[sp0])    \n\t"
    398 
    399             :
    400             : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
    401               [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
    402               [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
    403               [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));
    404 
    405         __asm__ __volatile__(
    406             "sb         %[q0_r],  (%[sq0])    \n\t"
    407             "sb         %[q1_r],  (%[sq1])    \n\t"
    408             "sb         %[q2_r],  (%[sq2])    \n\t"
    409             "sb         %[q3_r],  (%[sq3])    \n\t"
    410             "sb         %[q4_r],  (%[sq4])    \n\t"
    411             "sb         %[q5_r],  (%[sq5])    \n\t"
    412             "sb         %[q6_r],  (%[sq6])    \n\t"
    413 
    414             :
    415             : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
    416               [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
    417               [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
    418               [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
    419       } else if (mask & flat & 0x000000FF) {
    420         __asm__ __volatile__(
    421             "sb         %[p2_r_f1],  (%[sp2])    \n\t"
    422             "sb         %[p1_r_f1],  (%[sp1])    \n\t"
    423             "sb         %[p0_r_f1],  (%[sp0])    \n\t"
    424             "sb         %[q0_r_f1],  (%[sq0])    \n\t"
    425             "sb         %[q1_r_f1],  (%[sq1])    \n\t"
    426             "sb         %[q2_r_f1],  (%[sq2])    \n\t"
    427 
    428             :
    429             : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
    430               [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
    431               [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
    432               [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
    433               [sq2] "r"(sq2));
    434       } else if (mask & 0x000000FF) {
    435         __asm__ __volatile__(
    436             "sb         %[p1_f0],  (%[sp1])    \n\t"
    437             "sb         %[p0_f0],  (%[sp0])    \n\t"
    438             "sb         %[q0_f0],  (%[sq0])    \n\t"
    439             "sb         %[q1_f0],  (%[sq1])    \n\t"
    440 
    441             :
    442             : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
    443               [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
    444               [sq0] "r"(sq0), [sq1] "r"(sq1));
    445       }
    446 
    447       __asm__ __volatile__(
    448           "srl        %[p6_r], %[p6_r], 16     \n\t"
    449           "srl        %[p5_r], %[p5_r], 16     \n\t"
    450           "srl        %[p4_r], %[p4_r], 16     \n\t"
    451           "srl        %[p3_r], %[p3_r], 16     \n\t"
    452           "srl        %[p2_r], %[p2_r], 16     \n\t"
    453           "srl        %[p1_r], %[p1_r], 16     \n\t"
    454           "srl        %[p0_r], %[p0_r], 16     \n\t"
    455           "srl        %[q0_r], %[q0_r], 16     \n\t"
    456           "srl        %[q1_r], %[q1_r], 16     \n\t"
    457           "srl        %[q2_r], %[q2_r], 16     \n\t"
    458           "srl        %[q3_r], %[q3_r], 16     \n\t"
    459           "srl        %[q4_r], %[q4_r], 16     \n\t"
    460           "srl        %[q5_r], %[q5_r], 16     \n\t"
    461           "srl        %[q6_r], %[q6_r], 16     \n\t"
    462 
    463           : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
    464             [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
    465             [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
    466             [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
    467             [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
    468           :);
    469 
    470       __asm__ __volatile__(
    471           "srl        %[p2_r_f1], %[p2_r_f1], 16     \n\t"
    472           "srl        %[p1_r_f1], %[p1_r_f1], 16     \n\t"
    473           "srl        %[p0_r_f1], %[p0_r_f1], 16     \n\t"
    474           "srl        %[q0_r_f1], %[q0_r_f1], 16     \n\t"
    475           "srl        %[q1_r_f1], %[q1_r_f1], 16     \n\t"
    476           "srl        %[q2_r_f1], %[q2_r_f1], 16     \n\t"
    477           "srl        %[p1_f0],   %[p1_f0],   8      \n\t"
    478           "srl        %[p0_f0],   %[p0_f0],   8      \n\t"
    479           "srl        %[q0_f0],   %[q0_f0],   8      \n\t"
    480           "srl        %[q1_f0],   %[q1_f0],   8      \n\t"
    481 
    482           : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
    483             [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
    484             [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
    485             [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
    486             [q1_f0] "+r"(q1_f0)
    487           :);
    488 
    489       if (mask & flat & flat2 & 0x0000FF00) {
    490         __asm__ __volatile__(
    491             "sb         %[p6_r],  +1(%[sp6])    \n\t"
    492             "sb         %[p5_r],  +1(%[sp5])    \n\t"
    493             "sb         %[p4_r],  +1(%[sp4])    \n\t"
    494             "sb         %[p3_r],  +1(%[sp3])    \n\t"
    495             "sb         %[p2_r],  +1(%[sp2])    \n\t"
    496             "sb         %[p1_r],  +1(%[sp1])    \n\t"
    497             "sb         %[p0_r],  +1(%[sp0])    \n\t"
    498 
    499             :
    500             : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
    501               [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
    502               [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
    503               [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
    504 
    505         __asm__ __volatile__(
    506             "sb         %[q0_r],  +1(%[sq0])    \n\t"
    507             "sb         %[q1_r],  +1(%[sq1])    \n\t"
    508             "sb         %[q2_r],  +1(%[sq2])    \n\t"
    509             "sb         %[q3_r],  +1(%[sq3])    \n\t"
    510             "sb         %[q4_r],  +1(%[sq4])    \n\t"
    511             "sb         %[q5_r],  +1(%[sq5])    \n\t"
    512             "sb         %[q6_r],  +1(%[sq6])    \n\t"
    513 
    514             :
    515             : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
    516               [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
    517               [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
    518               [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
    519       } else if (mask & flat & 0x0000FF00) {
    520         __asm__ __volatile__(
    521             "sb         %[p2_r_f1],  +1(%[sp2])    \n\t"
    522             "sb         %[p1_r_f1],  +1(%[sp1])    \n\t"
    523             "sb         %[p0_r_f1],  +1(%[sp0])    \n\t"
    524             "sb         %[q0_r_f1],  +1(%[sq0])    \n\t"
    525             "sb         %[q1_r_f1],  +1(%[sq1])    \n\t"
    526             "sb         %[q2_r_f1],  +1(%[sq2])    \n\t"
    527 
    528             :
    529             : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
    530               [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
    531               [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
    532               [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
    533               [sq2] "r"(sq2));
    534       } else if (mask & 0x0000FF00) {
    535         __asm__ __volatile__(
    536             "sb         %[p1_f0],  +1(%[sp1])    \n\t"
    537             "sb         %[p0_f0],  +1(%[sp0])    \n\t"
    538             "sb         %[q0_f0],  +1(%[sq0])    \n\t"
    539             "sb         %[q1_f0],  +1(%[sq1])    \n\t"
    540 
    541             :
    542             : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
    543               [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
    544               [sq0] "r"(sq0), [sq1] "r"(sq1));
    545       }
    546 
    547       __asm__ __volatile__(
    548           "srl        %[p1_f0], %[p1_f0], 8     \n\t"
    549           "srl        %[p0_f0], %[p0_f0], 8     \n\t"
    550           "srl        %[q0_f0], %[q0_f0], 8     \n\t"
    551           "srl        %[q1_f0], %[q1_f0], 8     \n\t"
    552 
    553           : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
    554             [q1_f0] "+r"(q1_f0)
    555           :);
    556 
    557       if (mask & flat & flat2 & 0x00FF0000) {
    558         __asm__ __volatile__(
    559             "sb         %[p6_l],  +2(%[sp6])    \n\t"
    560             "sb         %[p5_l],  +2(%[sp5])    \n\t"
    561             "sb         %[p4_l],  +2(%[sp4])    \n\t"
    562             "sb         %[p3_l],  +2(%[sp3])    \n\t"
    563             "sb         %[p2_l],  +2(%[sp2])    \n\t"
    564             "sb         %[p1_l],  +2(%[sp1])    \n\t"
    565             "sb         %[p0_l],  +2(%[sp0])    \n\t"
    566 
    567             :
    568             : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
    569               [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
    570               [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
    571               [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
    572 
    573         __asm__ __volatile__(
    574             "sb         %[q0_l],  +2(%[sq0])    \n\t"
    575             "sb         %[q1_l],  +2(%[sq1])    \n\t"
    576             "sb         %[q2_l],  +2(%[sq2])    \n\t"
    577             "sb         %[q3_l],  +2(%[sq3])    \n\t"
    578             "sb         %[q4_l],  +2(%[sq4])    \n\t"
    579             "sb         %[q5_l],  +2(%[sq5])    \n\t"
    580             "sb         %[q6_l],  +2(%[sq6])    \n\t"
    581 
    582             :
    583             : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
    584               [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
    585               [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
    586               [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
    587       } else if (mask & flat & 0x00FF0000) {
    588         __asm__ __volatile__(
    589             "sb         %[p2_l_f1],  +2(%[sp2])    \n\t"
    590             "sb         %[p1_l_f1],  +2(%[sp1])    \n\t"
    591             "sb         %[p0_l_f1],  +2(%[sp0])    \n\t"
    592             "sb         %[q0_l_f1],  +2(%[sq0])    \n\t"
    593             "sb         %[q1_l_f1],  +2(%[sq1])    \n\t"
    594             "sb         %[q2_l_f1],  +2(%[sq2])    \n\t"
    595 
    596             :
    597             : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
    598               [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
    599               [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
    600               [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
    601               [sq2] "r"(sq2));
    602       } else if (mask & 0x00FF0000) {
    603         __asm__ __volatile__(
    604             "sb         %[p1_f0],  +2(%[sp1])    \n\t"
    605             "sb         %[p0_f0],  +2(%[sp0])    \n\t"
    606             "sb         %[q0_f0],  +2(%[sq0])    \n\t"
    607             "sb         %[q1_f0],  +2(%[sq1])    \n\t"
    608 
    609             :
    610             : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
    611               [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
    612               [sq0] "r"(sq0), [sq1] "r"(sq1));
    613       }
    614 
    615       __asm__ __volatile__(
    616           "srl      %[p6_l],    %[p6_l],    16   \n\t"
    617           "srl      %[p5_l],    %[p5_l],    16   \n\t"
    618           "srl      %[p4_l],    %[p4_l],    16   \n\t"
    619           "srl      %[p3_l],    %[p3_l],    16   \n\t"
    620           "srl      %[p2_l],    %[p2_l],    16   \n\t"
    621           "srl      %[p1_l],    %[p1_l],    16   \n\t"
    622           "srl      %[p0_l],    %[p0_l],    16   \n\t"
    623           "srl      %[q0_l],    %[q0_l],    16   \n\t"
    624           "srl      %[q1_l],    %[q1_l],    16   \n\t"
    625           "srl      %[q2_l],    %[q2_l],    16   \n\t"
    626           "srl      %[q3_l],    %[q3_l],    16   \n\t"
    627           "srl      %[q4_l],    %[q4_l],    16   \n\t"
    628           "srl      %[q5_l],    %[q5_l],    16   \n\t"
    629           "srl      %[q6_l],    %[q6_l],    16   \n\t"
    630 
    631           : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
    632             [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
    633             [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
    634             [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
    635             [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
    636           :);
    637 
    638       __asm__ __volatile__(
    639           "srl      %[p2_l_f1],   %[p2_l_f1],   16   \n\t"
    640           "srl      %[p1_l_f1],   %[p1_l_f1],   16   \n\t"
    641           "srl      %[p0_l_f1],   %[p0_l_f1],   16   \n\t"
    642           "srl      %[q0_l_f1],   %[q0_l_f1],   16   \n\t"
    643           "srl      %[q1_l_f1],   %[q1_l_f1],   16   \n\t"
    644           "srl      %[q2_l_f1],   %[q2_l_f1],   16   \n\t"
    645           "srl      %[p1_f0],     %[p1_f0],     8    \n\t"
    646           "srl      %[p0_f0],     %[p0_f0],     8    \n\t"
    647           "srl      %[q0_f0],     %[q0_f0],     8    \n\t"
    648           "srl      %[q1_f0],     %[q1_f0],     8    \n\t"
    649 
    650           : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
    651             [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
    652             [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
    653             [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
    654             [q1_f0] "+r"(q1_f0)
    655           :);
    656 
    657       if (mask & flat & flat2 & 0xFF000000) {
    658         __asm__ __volatile__(
    659             "sb     %[p6_l],    +3(%[sp6])    \n\t"
    660             "sb     %[p5_l],    +3(%[sp5])    \n\t"
    661             "sb     %[p4_l],    +3(%[sp4])    \n\t"
    662             "sb     %[p3_l],    +3(%[sp3])    \n\t"
    663             "sb     %[p2_l],    +3(%[sp2])    \n\t"
    664             "sb     %[p1_l],    +3(%[sp1])    \n\t"
    665             "sb     %[p0_l],    +3(%[sp0])    \n\t"
    666 
    667             :
    668             : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
    669               [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
    670               [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
    671               [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
    672 
    673         __asm__ __volatile__(
    674             "sb     %[q0_l],    +3(%[sq0])    \n\t"
    675             "sb     %[q1_l],    +3(%[sq1])    \n\t"
    676             "sb     %[q2_l],    +3(%[sq2])    \n\t"
    677             "sb     %[q3_l],    +3(%[sq3])    \n\t"
    678             "sb     %[q4_l],    +3(%[sq4])    \n\t"
    679             "sb     %[q5_l],    +3(%[sq5])    \n\t"
    680             "sb     %[q6_l],    +3(%[sq6])    \n\t"
    681 
    682             :
    683             : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
    684               [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
    685               [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
    686               [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
    687       } else if (mask & flat & 0xFF000000) {
    688         __asm__ __volatile__(
    689             "sb     %[p2_l_f1],     +3(%[sp2])    \n\t"
    690             "sb     %[p1_l_f1],     +3(%[sp1])    \n\t"
    691             "sb     %[p0_l_f1],     +3(%[sp0])    \n\t"
    692             "sb     %[q0_l_f1],     +3(%[sq0])    \n\t"
    693             "sb     %[q1_l_f1],     +3(%[sq1])    \n\t"
    694             "sb     %[q2_l_f1],     +3(%[sq2])    \n\t"
    695 
    696             :
    697             : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
    698               [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
    699               [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
    700               [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
    701               [sq2] "r"(sq2));
    702       } else if (mask & 0xFF000000) {
    703         __asm__ __volatile__(
    704             "sb     %[p1_f0],   +3(%[sp1])    \n\t"
    705             "sb     %[p0_f0],   +3(%[sp0])    \n\t"
    706             "sb     %[q0_f0],   +3(%[sq0])    \n\t"
    707             "sb     %[q1_f0],   +3(%[sq1])    \n\t"
    708 
    709             :
    710             : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
    711               [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
    712               [sq0] "r"(sq0), [sq1] "r"(sq1));
    713       }
    714     }
    715 
    716     s = s + 4;
    717   }
    718 }
    719 
/* Apply the 16-wide (wide/"mb") horizontal loop filter to one 8-pixel-wide
 * block edge using the DSPr2 path.  Thin wrapper that forwards to
 * mb_lpf_horizontal_edge() with count = 1 (a single 8-pixel unit).
 *
 * s      - pointer to the first pixel of the row below the edge
 * pitch  - stride in bytes between successive image rows
 * blimit - pointer to the blimit threshold value
 * limit  - pointer to the limit threshold value
 * thresh - pointer to the high-edge-variance threshold value
 */
void vpx_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
                                 const uint8_t *blimit, const uint8_t *limit,
                                 const uint8_t *thresh) {
  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
}
    725 
/* Dual variant of the 16-wide horizontal loop filter: filters two adjacent
 * 8-pixel units in one call by forwarding to mb_lpf_horizontal_edge() with
 * count = 2.  All threshold parameters apply to both units.
 *
 * s      - pointer to the first pixel of the row below the edge
 * pitch  - stride in bytes between successive image rows
 * blimit - pointer to the blimit threshold value
 * limit  - pointer to the limit threshold value
 * thresh - pointer to the high-edge-variance threshold value
 */
void vpx_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
                                      const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh) {
  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);
}
    732 #endif  // #if HAVE_DSPR2
    733