Home | History | Annotate | Download | only in mips
      1 /*
      2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  */
     11 
     12 #include <assert.h>
     13 #include <stdio.h>
     14 
     15 #include "config/aom_dsp_rtcd.h"
     16 
     17 #include "aom_dsp/mips/convolve_common_dspr2.h"
     18 #include "aom_dsp/aom_dsp_common.h"
     19 #include "aom_ports/mem.h"
     20 
     21 #if HAVE_DSPR2
     22 static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
     23                                       uint8_t *dst, int32_t dst_stride,
     24                                       const int16_t *filter_x0, int32_t h) {
     25   int32_t y;
     26   uint8_t *cm = aom_ff_cropTbl;
     27   int32_t Temp1, Temp2, Temp3, Temp4;
     28   uint32_t vector4a = 64;
     29   uint32_t tp1, tp2;
     30   uint32_t p1, p2;
     31   const int16_t *filter = &filter_x0[3];
     32   uint32_t filter45;
     33 
     34   filter45 = ((const int32_t *)filter)[0];
     35 
     36   for (y = h; y--;) {
     37     /* prefetch data to cache memory */
     38     prefetch_load(src + src_stride);
     39     prefetch_load(src + src_stride + 32);
     40     prefetch_store(dst + dst_stride);
     41 
     42     __asm__ __volatile__(
     43         "ulw              %[tp1],      0(%[src])                      \n\t"
     44         "ulw              %[tp2],      4(%[src])                      \n\t"
     45 
     46         /* even 1. pixel */
     47         "mtlo             %[vector4a], $ac3                           \n\t"
     48         "mthi             $zero,       $ac3                           \n\t"
     49         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
     50         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
     51         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
     52         "extp             %[Temp1],    $ac3,           31             \n\t"
     53 
     54         /* even 2. pixel */
     55         "mtlo             %[vector4a], $ac2                           \n\t"
     56         "mthi             $zero,       $ac2                           \n\t"
     57         "balign           %[tp2],      %[tp1],         3              \n\t"
     58         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
     59         "extp             %[Temp3],    $ac2,           31             \n\t"
     60 
     61         /* odd 1. pixel */
     62         "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
     63         "mtlo             %[vector4a], $ac3                           \n\t"
     64         "mthi             $zero,       $ac3                           \n\t"
     65         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
     66         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
     67         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
     68         "extp             %[Temp2],    $ac3,           31             \n\t"
     69 
     70         /* odd 2. pixel */
     71         "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
     72         "mtlo             %[vector4a], $ac2                           \n\t"
     73         "mthi             $zero,       $ac2                           \n\t"
     74         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
     75         "extp             %[Temp4],    $ac2,           31             \n\t"
     76 
     77         /* clamp */
     78         "lbux             %[p1],       %[Temp2](%[cm])                \n\t"
     79         "lbux             %[p2],       %[Temp4](%[cm])                \n\t"
     80 
     81         /* store bytes */
     82         "sb               %[tp1],      0(%[dst])                      \n\t"
     83         "sb               %[p1],       1(%[dst])                      \n\t"
     84         "sb               %[tp2],      2(%[dst])                      \n\t"
     85         "sb               %[p2],       3(%[dst])                      \n\t"
     86 
     87         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
     88           [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
     89           [Temp4] "=&r"(Temp4)
     90         : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
     91           [dst] "r"(dst), [src] "r"(src));
     92 
     93     /* Next row... */
     94     src += src_stride;
     95     dst += dst_stride;
     96   }
     97 }
     98 
     99 static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
    100                                       uint8_t *dst, int32_t dst_stride,
    101                                       const int16_t *filter_x0, int32_t h) {
    102   int32_t y;
    103   uint8_t *cm = aom_ff_cropTbl;
    104   uint32_t vector4a = 64;
    105   int32_t Temp1, Temp2, Temp3;
    106   uint32_t tp1, tp2, tp3;
    107   uint32_t p1, p2, p3, p4;
    108   uint32_t st0, st1;
    109   const int16_t *filter = &filter_x0[3];
    110   uint32_t filter45;
    111 
    112   filter45 = ((const int32_t *)filter)[0];
    113 
    114   for (y = h; y--;) {
    115     /* prefetch data to cache memory */
    116     prefetch_load(src + src_stride);
    117     prefetch_load(src + src_stride + 32);
    118     prefetch_store(dst + dst_stride);
    119 
    120     __asm__ __volatile__(
    121         "ulw              %[tp1],      0(%[src])                      \n\t"
    122         "ulw              %[tp2],      4(%[src])                      \n\t"
    123 
    124         /* even 1. pixel */
    125         "mtlo             %[vector4a], $ac3                           \n\t"
    126         "mthi             $zero,       $ac3                           \n\t"
    127         "mtlo             %[vector4a], $ac2                           \n\t"
    128         "mthi             $zero,       $ac2                           \n\t"
    129         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    130         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    131         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    132         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
    133         "ulw              %[tp3],      8(%[src])                      \n\t"
    134         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
    135         "extp             %[Temp1],    $ac3,           31             \n\t"
    136 
    137         /* even 2. pixel */
    138         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
    139         "extp             %[Temp3],    $ac2,           31             \n\t"
    140 
    141         /* even 3. pixel */
    142         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
    143         "mtlo             %[vector4a], $ac1                           \n\t"
    144         "mthi             $zero,       $ac1                           \n\t"
    145         "dpa.w.ph         $ac1,        %[p3],          %[filter45]    \n\t"
    146         "extp             %[Temp1],    $ac1,           31             \n\t"
    147 
    148         /* even 4. pixel */
    149         "mtlo             %[vector4a], $ac2                           \n\t"
    150         "mthi             $zero,       $ac2                           \n\t"
    151         "mtlo             %[vector4a], $ac3                           \n\t"
    152         "mthi             $zero,       $ac3                           \n\t"
    153         "sb               %[st0],      0(%[dst])                      \n\t"
    154         "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
    155 
    156         "balign           %[tp3],      %[tp2],         3              \n\t"
    157         "balign           %[tp2],      %[tp1],         3              \n\t"
    158 
    159         "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
    160         "extp             %[Temp3],    $ac2,           31             \n\t"
    161 
    162         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
    163 
    164         /* odd 1. pixel */
    165         "mtlo             %[vector4a], $ac1                           \n\t"
    166         "mthi             $zero,       $ac1                           \n\t"
    167         "sb               %[st1],      2(%[dst])                      \n\t"
    168         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
    169         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
    170         "preceu.ph.qbr    %[p3],       %[tp3]                         \n\t"
    171         "preceu.ph.qbl    %[p4],       %[tp3]                         \n\t"
    172         "sb               %[st0],      4(%[dst])                      \n\t"
    173         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
    174         "extp             %[Temp2],    $ac3,           31             \n\t"
    175 
    176         /* odd 2. pixel */
    177         "mtlo             %[vector4a], $ac3                           \n\t"
    178         "mthi             $zero,       $ac3                           \n\t"
    179         "mtlo             %[vector4a], $ac2                           \n\t"
    180         "mthi             $zero,       $ac2                           \n\t"
    181         "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
    182         "dpa.w.ph         $ac1,        %[p2],          %[filter45]    \n\t"
    183         "extp             %[Temp3],    $ac1,           31             \n\t"
    184 
    185         /* odd 3. pixel */
    186         "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
    187         "dpa.w.ph         $ac3,        %[p3],          %[filter45]    \n\t"
    188         "extp             %[Temp2],    $ac3,           31             \n\t"
    189 
    190         /* odd 4. pixel */
    191         "sb               %[st1],      1(%[dst])                      \n\t"
    192         "sb               %[st0],      6(%[dst])                      \n\t"
    193         "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
    194         "extp             %[Temp1],    $ac2,           31             \n\t"
    195 
    196         /* clamp */
    197         "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
    198         "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
    199         "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
    200 
    201         /* store bytes */
    202         "sb               %[p4],       3(%[dst])                      \n\t"
    203         "sb               %[p2],       5(%[dst])                      \n\t"
    204         "sb               %[p1],       7(%[dst])                      \n\t"
    205 
    206         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
    207           [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
    208           [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
    209           [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
    210         : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
    211           [dst] "r"(dst), [src] "r"(src));
    212 
    213     /* Next row... */
    214     src += src_stride;
    215     dst += dst_stride;
    216   }
    217 }
    218 
    219 static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
    220                                        int32_t src_stride, uint8_t *dst_ptr,
    221                                        int32_t dst_stride,
    222                                        const int16_t *filter_x0, int32_t h,
    223                                        int32_t count) {
    224   int32_t y, c;
    225   const uint8_t *src;
    226   uint8_t *dst;
    227   uint8_t *cm = aom_ff_cropTbl;
    228   uint32_t vector_64 = 64;
    229   int32_t Temp1, Temp2, Temp3;
    230   uint32_t qload1, qload2, qload3;
    231   uint32_t p1, p2, p3, p4, p5;
    232   uint32_t st1, st2, st3;
    233   const int16_t *filter = &filter_x0[3];
    234   uint32_t filter45;
    235 
    236   filter45 = ((const int32_t *)filter)[0];
    237 
    238   for (y = h; y--;) {
    239     src = src_ptr;
    240     dst = dst_ptr;
    241 
    242     /* prefetch data to cache memory */
    243     prefetch_load(src_ptr + src_stride);
    244     prefetch_load(src_ptr + src_stride + 32);
    245     prefetch_store(dst_ptr + dst_stride);
    246 
    247     for (c = 0; c < count; c++) {
    248       __asm__ __volatile__(
    249           "ulw              %[qload1],    0(%[src])                    \n\t"
    250           "ulw              %[qload2],    4(%[src])                    \n\t"
    251 
    252           /* even 1. pixel */
    253           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    254           "mthi             $zero,        $ac1                         \n\t"
    255           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    256           "mthi             $zero,        $ac2                         \n\t"
    257           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    258           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    259           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    260           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    261           "ulw              %[qload3],    8(%[src])                    \n\t"
    262           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
    263           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    264 
    265           /* even 2. pixel */
    266           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    267           "mthi             $zero,        $ac3                         \n\t"
    268           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    269           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    270           "ulw              %[qload1],    12(%[src])                   \n\t"
    271           "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
    272           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
    273           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    274 
    275           /* even 3. pixel */
    276           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    277           "mthi             $zero,        $ac1                         \n\t"
    278           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    279           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
    280           "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
    281           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    282           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
    283 
    284           /* even 4. pixel */
    285           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    286           "mthi             $zero,        $ac2                         \n\t"
    287           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    288           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
    289           "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
    290           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    291           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    292 
    293           /* even 5. pixel */
    294           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    295           "mthi             $zero,        $ac3                         \n\t"
    296           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
    297           "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
    298           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    299           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    300 
    301           /* even 6. pixel */
    302           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    303           "mthi             $zero,        $ac1                         \n\t"
    304           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
    305           "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
    306           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    307           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    308 
    309           /* even 7. pixel */
    310           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    311           "mthi             $zero,        $ac2                         \n\t"
    312           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
    313           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
    314           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    315           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    316 
    317           /* even 8. pixel */
    318           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    319           "mthi             $zero,        $ac3                         \n\t"
    320           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
    321           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
    322           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    323           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    324 
    325           /* ODD pixels */
    326           "ulw              %[qload1],    1(%[src])                    \n\t"
    327           "ulw              %[qload2],    5(%[src])                    \n\t"
    328 
    329           /* odd 1. pixel */
    330           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    331           "mthi             $zero,        $ac1                         \n\t"
    332           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    333           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    334           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    335           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    336           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
    337           "ulw              %[qload3],    9(%[src])                    \n\t"
    338           "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
    339           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    340           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    341 
    342           /* odd 2. pixel */
    343           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    344           "mthi             $zero,        $ac2                         \n\t"
    345           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    346           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    347           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
    348           "ulw              %[qload1],    13(%[src])                   \n\t"
    349           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
    350           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    351           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    352 
    353           /* odd 3. pixel */
    354           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    355           "mthi             $zero,        $ac3                         \n\t"
    356           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    357           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
    358           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
    359           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    360           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    361 
    362           /* odd 4. pixel */
    363           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    364           "mthi             $zero,        $ac1                         \n\t"
    365           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    366           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
    367           "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
    368           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    369           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    370 
    371           /* odd 5. pixel */
    372           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    373           "mthi             $zero,        $ac2                         \n\t"
    374           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
    375           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
    376           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    377           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    378 
    379           /* odd 6. pixel */
    380           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    381           "mthi             $zero,        $ac3                         \n\t"
    382           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
    383           "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
    384           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    385           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    386 
    387           /* odd 7. pixel */
    388           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    389           "mthi             $zero,        $ac1                         \n\t"
    390           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
    391           "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
    392           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    393 
    394           /* odd 8. pixel */
    395           "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
    396           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    397 
    398           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    399           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    400           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    401 
    402           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
    403           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
    404           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
    405 
    406           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
    407             [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
    408             [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
    409             [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
    410             [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
    411           : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
    412             [dst] "r"(dst), [src] "r"(src));
    413 
    414       src += 16;
    415       dst += 16;
    416     }
    417 
    418     /* Next row... */
    419     src_ptr += src_stride;
    420     dst_ptr += dst_stride;
    421   }
    422 }
    423 
    424 static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
    425                                        int32_t src_stride, uint8_t *dst_ptr,
    426                                        int32_t dst_stride,
    427                                        const int16_t *filter_x0, int32_t h) {
    428   int32_t y, c;
    429   const uint8_t *src;
    430   uint8_t *dst;
    431   uint8_t *cm = aom_ff_cropTbl;
    432   uint32_t vector_64 = 64;
    433   int32_t Temp1, Temp2, Temp3;
    434   uint32_t qload1, qload2, qload3;
    435   uint32_t p1, p2, p3, p4, p5;
    436   uint32_t st1, st2, st3;
    437   const int16_t *filter = &filter_x0[3];
    438   uint32_t filter45;
    439 
    440   filter45 = ((const int32_t *)filter)[0];
    441 
    442   for (y = h; y--;) {
    443     src = src_ptr;
    444     dst = dst_ptr;
    445 
    446     /* prefetch data to cache memory */
    447     prefetch_load(src_ptr + src_stride);
    448     prefetch_load(src_ptr + src_stride + 32);
    449     prefetch_load(src_ptr + src_stride + 64);
    450     prefetch_store(dst_ptr + dst_stride);
    451     prefetch_store(dst_ptr + dst_stride + 32);
    452 
    453     for (c = 0; c < 4; c++) {
    454       __asm__ __volatile__(
    455           "ulw              %[qload1],    0(%[src])                    \n\t"
    456           "ulw              %[qload2],    4(%[src])                    \n\t"
    457 
    458           /* even 1. pixel */
    459           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    460           "mthi             $zero,        $ac1                         \n\t"
    461           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    462           "mthi             $zero,        $ac2                         \n\t"
    463           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    464           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    465           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    466           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    467           "ulw              %[qload3],    8(%[src])                    \n\t"
    468           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
    469           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    470 
    471           /* even 2. pixel */
    472           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    473           "mthi             $zero,        $ac3                         \n\t"
    474           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    475           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    476           "ulw              %[qload1],    12(%[src])                   \n\t"
    477           "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
    478           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
    479           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    480 
    481           /* even 3. pixel */
    482           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    483           "mthi             $zero,        $ac1                         \n\t"
    484           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    485           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
    486           "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
    487           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    488           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
    489 
    490           /* even 4. pixel */
    491           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    492           "mthi             $zero,        $ac2                         \n\t"
    493           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    494           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
    495           "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
    496           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    497           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    498 
    499           /* even 5. pixel */
    500           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    501           "mthi             $zero,        $ac3                         \n\t"
    502           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
    503           "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
    504           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    505           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    506 
    507           /* even 6. pixel */
    508           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    509           "mthi             $zero,        $ac1                         \n\t"
    510           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
    511           "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
    512           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    513           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    514 
    515           /* even 7. pixel */
    516           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    517           "mthi             $zero,        $ac2                         \n\t"
    518           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
    519           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
    520           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    521           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    522 
    523           /* even 8. pixel */
    524           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    525           "mthi             $zero,        $ac3                         \n\t"
    526           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
    527           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
    528           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    529           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    530 
    531           /* ODD pixels */
    532           "ulw              %[qload1],    1(%[src])                    \n\t"
    533           "ulw              %[qload2],    5(%[src])                    \n\t"
    534 
    535           /* odd 1. pixel */
    536           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    537           "mthi             $zero,        $ac1                         \n\t"
    538           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    539           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    540           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    541           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    542           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
    543           "ulw              %[qload3],    9(%[src])                    \n\t"
    544           "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
    545           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    546           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    547 
    548           /* odd 2. pixel */
    549           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    550           "mthi             $zero,        $ac2                         \n\t"
    551           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    552           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    553           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
    554           "ulw              %[qload1],    13(%[src])                   \n\t"
    555           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
    556           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    557           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    558 
    559           /* odd 3. pixel */
    560           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    561           "mthi             $zero,        $ac3                         \n\t"
    562           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    563           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
    564           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
    565           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    566           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    567 
    568           /* odd 4. pixel */
    569           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    570           "mthi             $zero,        $ac1                         \n\t"
    571           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    572           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
    573           "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
    574           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    575           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    576 
    577           /* odd 5. pixel */
    578           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    579           "mthi             $zero,        $ac2                         \n\t"
    580           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
    581           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
    582           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    583           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    584 
    585           /* odd 6. pixel */
    586           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    587           "mthi             $zero,        $ac3                         \n\t"
    588           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
    589           "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
    590           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    591           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    592 
    593           /* odd 7. pixel */
    594           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    595           "mthi             $zero,        $ac1                         \n\t"
    596           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
    597           "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
    598           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    599 
    600           /* odd 8. pixel */
    601           "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
    602           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    603 
    604           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    605           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    606           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    607 
    608           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
    609           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
    610           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
    611 
    612           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
    613             [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
    614             [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
    615             [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
    616             [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
    617           : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
    618             [dst] "r"(dst), [src] "r"(src));
    619 
    620       src += 16;
    621       dst += 16;
    622     }
    623 
    624     /* Next row... */
    625     src_ptr += src_stride;
    626     dst_ptr += dst_stride;
    627   }
    628 }
    629 
    /*
     * Horizontal convolution entry point that dispatches on block width to the
     * DSPr2 "bi" horizontal kernels defined earlier in this file.
     *
     *   src, src_stride  - source pixels and row stride (stride is narrowed to
     *                      int32_t before being handed to the DSPr2 kernels).
     *   dst, dst_stride  - destination pixels and row stride (same narrowing).
     *   filter_x         - horizontal filter taps, forwarded to every kernel.
     *   x_step_q4        - must be 16; enforced only by assert(), so the check
     *                      disappears when NDEBUG is defined.
     *   filter_y,
     *   y_step_q4        - used only when falling back to aom_convolve8_horiz_c.
     *   w, h             - block width and height in pixels.
     *
     * Widths 4, 8, 16, 32 and 64 use the DSPr2 kernels; any other width is
     * forwarded unchanged to aom_convolve8_horiz_c.
     */
    630 void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    631                                uint8_t *dst, ptrdiff_t dst_stride,
    632                                const int16_t *filter_x, int x_step_q4,
    633                                const int16_t *filter_y, int y_step_q4, int w,
    634                                int h) {
    635   uint32_t pos = 38;
    636 
    637   assert(x_step_q4 == 16);
    638 
    639   prefetch_load((const uint8_t *)filter_x);
    640 
          /* Write pos into DSPControl using mask 1. This happens before any
           * kernel is called; the kernels' extp instructions are presumably the
           * consumers of this field - confirm against the MIPS DSP ASE manual. */
    641   /* bit position for extract from acc */
    642   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
    643                        :
    644                        : [pos] "r"(pos));
    645 
    646   /* prefetch data to cache memory */
    647   prefetch_load(src);
    648   prefetch_load(src + 32);
    649   prefetch_store(dst);
    650 
          /* Select the kernel by block width. */
    651   switch (w) {
    652     case 4:
    653       convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
    654                                 (int32_t)dst_stride, filter_x, (int32_t)h);
    655       break;
    656     case 8:
    657       convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
    658                                 (int32_t)dst_stride, filter_x, (int32_t)h);
    659       break;
            /* Widths 16 and 32 share one kernel; the trailing argument (1 or 2)
             * is presumably the number of 16-pixel groups per row - TODO confirm
             * against convolve_bi_horiz_16_dspr2. */
    660     case 16:
    661       convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
    662                                  (int32_t)dst_stride, filter_x, (int32_t)h, 1);
    663       break;
    664     case 32:
    665       convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
    666                                  (int32_t)dst_stride, filter_x, (int32_t)h, 2);
    667       break;
    668     case 64:
              /* Extra prefetches beyond the common ones issued above. */
    669       prefetch_load(src + 64);
    670       prefetch_store(dst + 32);
    671 
    672       convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
    673                                  (int32_t)dst_stride, filter_x, (int32_t)h);
    674       break;
    675     default:
              /* No DSPr2 kernel for this width: use the C implementation, passing
               * the original (un-narrowed) strides and the vertical parameters. */
    676       aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
    677                             x_step_q4, filter_y, y_step_q4, w, h);
    678       break;
    679   }
    680 }
    681 #endif
    682