Home | History | Annotate | Download | only in dspr2
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <stdio.h>
     13 
     14 #include "./vpx_config.h"
     15 #include "./vp9_rtcd.h"
     16 #include "vp9/common/vp9_common.h"
     17 #include "vpx/vpx_integer.h"
     18 #include "vpx_ports/mem.h"
     19 #include "vp9/common/vp9_convolve.h"
     20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
     21 
     22 #if HAVE_DSPR2
     23 static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
     24                                       int32_t src_stride,
     25                                       uint8_t *dst,
     26                                       int32_t dst_stride,
     27                                       const int16_t *filter_x0,
     28                                       int32_t h) {
     29   int32_t y;
     30   uint8_t *cm = vp9_ff_cropTbl;
     31   int32_t Temp1, Temp2, Temp3, Temp4;
     32   uint32_t vector4a = 64;
     33   uint32_t tp1, tp2;
     34   uint32_t p1, p2;
     35   const int16_t *filter = &filter_x0[3];
     36   uint32_t filter45;;
     37 
     38   filter45 = ((const int32_t *)filter)[0];
     39 
     40   for (y = h; y--;) {
     41     /* prefetch data to cache memory */
     42     vp9_prefetch_load(src + src_stride);
     43     vp9_prefetch_load(src + src_stride + 32);
     44     vp9_prefetch_store(dst + dst_stride);
     45 
     46     __asm__ __volatile__ (
     47         "ulw              %[tp1],      0(%[src])                      \n\t"
     48         "ulw              %[tp2],      4(%[src])                      \n\t"
     49 
     50         /* even 1. pixel */
     51         "mtlo             %[vector4a], $ac3                           \n\t"
     52         "mthi             $zero,       $ac3                           \n\t"
     53         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
     54         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
     55         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
     56         "extp             %[Temp1],    $ac3,           31             \n\t"
     57 
     58         /* even 2. pixel */
     59         "mtlo             %[vector4a], $ac2                           \n\t"
     60         "mthi             $zero,       $ac2                           \n\t"
     61         "balign           %[tp2],      %[tp1],         3              \n\t"
     62         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
     63         "extp             %[Temp3],    $ac2,           31             \n\t"
     64 
     65         /* odd 1. pixel */
     66         "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
     67         "mtlo             %[vector4a], $ac3                           \n\t"
     68         "mthi             $zero,       $ac3                           \n\t"
     69         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
     70         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
     71         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
     72         "extp             %[Temp2],    $ac3,           31             \n\t"
     73 
     74         /* odd 2. pixel */
     75         "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
     76         "mtlo             %[vector4a], $ac2                           \n\t"
     77         "mthi             $zero,       $ac2                           \n\t"
     78         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
     79         "extp             %[Temp4],    $ac2,           31             \n\t"
     80 
     81         /* clamp */
     82         "lbux             %[p1],       %[Temp2](%[cm])                \n\t"
     83         "lbux             %[p2],       %[Temp4](%[cm])                \n\t"
     84 
     85         /* store bytes */
     86         "sb               %[tp1],      0(%[dst])                      \n\t"
     87         "sb               %[p1],       1(%[dst])                      \n\t"
     88         "sb               %[tp2],      2(%[dst])                      \n\t"
     89         "sb               %[p2],       3(%[dst])                      \n\t"
     90 
     91         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
     92           [p1] "=&r" (p1), [p2] "=&r" (p2),
     93           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
     94           [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
     95         : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
     96           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
     97     );
     98 
     99     /* Next row... */
    100     src += src_stride;
    101     dst += dst_stride;
    102   }
    103 }
    104 
    105 static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
    106                                       int32_t src_stride,
    107                                       uint8_t *dst,
    108                                       int32_t dst_stride,
    109                                       const int16_t *filter_x0,
    110                                       int32_t h) {
    111   int32_t y;
    112   uint8_t *cm = vp9_ff_cropTbl;
    113   uint32_t vector4a = 64;
    114   int32_t Temp1, Temp2, Temp3;
    115   uint32_t tp1, tp2, tp3;
    116   uint32_t p1, p2, p3, p4;
    117   uint32_t st0, st1;
    118   const int16_t *filter = &filter_x0[3];
    119   uint32_t filter45;;
    120 
    121   filter45 = ((const int32_t *)filter)[0];
    122 
    123   for (y = h; y--;) {
    124     /* prefetch data to cache memory */
    125     vp9_prefetch_load(src + src_stride);
    126     vp9_prefetch_load(src + src_stride + 32);
    127     vp9_prefetch_store(dst + dst_stride);
    128 
    129     __asm__ __volatile__ (
    130         "ulw              %[tp1],      0(%[src])                      \n\t"
    131         "ulw              %[tp2],      4(%[src])                      \n\t"
    132 
    133         /* even 1. pixel */
    134         "mtlo             %[vector4a], $ac3                           \n\t"
    135         "mthi             $zero,       $ac3                           \n\t"
    136         "mtlo             %[vector4a], $ac2                           \n\t"
    137         "mthi             $zero,       $ac2                           \n\t"
    138         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    139         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    140         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    141         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
    142         "ulw              %[tp3],      8(%[src])                      \n\t"
    143         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
    144         "extp             %[Temp1],    $ac3,           31             \n\t"
    145 
    146         /* even 2. pixel */
    147         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
    148         "extp             %[Temp3],    $ac2,           31             \n\t"
    149 
    150         /* even 3. pixel */
    151         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
    152         "mtlo             %[vector4a], $ac1                           \n\t"
    153         "mthi             $zero,       $ac1                           \n\t"
    154         "dpa.w.ph         $ac1,        %[p3],          %[filter45]    \n\t"
    155         "extp             %[Temp1],    $ac1,           31             \n\t"
    156 
    157         /* even 4. pixel */
    158         "mtlo             %[vector4a], $ac2                           \n\t"
    159         "mthi             $zero,       $ac2                           \n\t"
    160         "mtlo             %[vector4a], $ac3                           \n\t"
    161         "mthi             $zero,       $ac3                           \n\t"
    162         "sb               %[st0],      0(%[dst])                      \n\t"
    163         "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
    164 
    165         "balign           %[tp3],      %[tp2],         3              \n\t"
    166         "balign           %[tp2],      %[tp1],         3              \n\t"
    167 
    168         "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
    169         "extp             %[Temp3],    $ac2,           31             \n\t"
    170 
    171         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
    172 
    173         /* odd 1. pixel */
    174         "mtlo             %[vector4a], $ac1                           \n\t"
    175         "mthi             $zero,       $ac1                           \n\t"
    176         "sb               %[st1],      2(%[dst])                      \n\t"
    177         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
    178         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
    179         "preceu.ph.qbr    %[p3],       %[tp3]                         \n\t"
    180         "preceu.ph.qbl    %[p4],       %[tp3]                         \n\t"
    181         "sb               %[st0],      4(%[dst])                      \n\t"
    182         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
    183         "extp             %[Temp2],    $ac3,           31             \n\t"
    184 
    185         /* odd 2. pixel */
    186         "mtlo             %[vector4a], $ac3                           \n\t"
    187         "mthi             $zero,       $ac3                           \n\t"
    188         "mtlo             %[vector4a], $ac2                           \n\t"
    189         "mthi             $zero,       $ac2                           \n\t"
    190         "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
    191         "dpa.w.ph         $ac1,        %[p2],          %[filter45]    \n\t"
    192         "extp             %[Temp3],    $ac1,           31             \n\t"
    193 
    194         /* odd 3. pixel */
    195         "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
    196         "dpa.w.ph         $ac3,        %[p3],          %[filter45]    \n\t"
    197         "extp             %[Temp2],    $ac3,           31             \n\t"
    198 
    199         /* odd 4. pixel */
    200         "sb               %[st1],      1(%[dst])                      \n\t"
    201         "sb               %[st0],      6(%[dst])                      \n\t"
    202         "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
    203         "extp             %[Temp1],    $ac2,           31             \n\t"
    204 
    205         /* clamp */
    206         "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
    207         "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
    208         "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
    209 
    210         /* store bytes */
    211         "sb               %[p4],       3(%[dst])                      \n\t"
    212         "sb               %[p2],       5(%[dst])                      \n\t"
    213         "sb               %[p1],       7(%[dst])                      \n\t"
    214 
    215         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
    216           [st0] "=&r" (st0), [st1] "=&r" (st1),
    217           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    218           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
    219         : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
    220           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    221     );
    222 
    223     /* Next row... */
    224     src += src_stride;
    225     dst += dst_stride;
    226   }
    227 }
    228 
    229 static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
    230                                        int32_t src_stride,
    231                                        uint8_t *dst_ptr,
    232                                        int32_t dst_stride,
    233                                        const int16_t *filter_x0,
    234                                        int32_t h,
    235                                        int32_t count) {
    236   int32_t y, c;
    237   const uint8_t *src;
    238   uint8_t *dst;
    239   uint8_t *cm = vp9_ff_cropTbl;
    240   uint32_t vector_64 = 64;
    241   int32_t Temp1, Temp2, Temp3;
    242   uint32_t qload1, qload2, qload3;
    243   uint32_t p1, p2, p3, p4, p5;
    244   uint32_t st1, st2, st3;
    245   const int16_t *filter = &filter_x0[3];
    246   uint32_t filter45;;
    247 
    248   filter45 = ((const int32_t *)filter)[0];
    249 
    250   for (y = h; y--;) {
    251     src = src_ptr;
    252     dst = dst_ptr;
    253 
    254     /* prefetch data to cache memory */
    255     vp9_prefetch_load(src_ptr + src_stride);
    256     vp9_prefetch_load(src_ptr + src_stride + 32);
    257     vp9_prefetch_store(dst_ptr + dst_stride);
    258 
    259     for (c = 0; c < count; c++) {
    260       __asm__ __volatile__ (
    261           "ulw              %[qload1],    0(%[src])                    \n\t"
    262           "ulw              %[qload2],    4(%[src])                    \n\t"
    263 
    264           /* even 1. pixel */
    265           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    266           "mthi             $zero,        $ac1                         \n\t"
    267           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    268           "mthi             $zero,        $ac2                         \n\t"
    269           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    270           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    271           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    272           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    273           "ulw              %[qload3],    8(%[src])                    \n\t"
    274           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
    275           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    276 
    277           /* even 2. pixel */
    278           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    279           "mthi             $zero,        $ac3                         \n\t"
    280           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    281           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    282           "ulw              %[qload1],    12(%[src])                   \n\t"
    283           "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
    284           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
    285           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    286 
    287           /* even 3. pixel */
    288           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    289           "mthi             $zero,        $ac1                         \n\t"
    290           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    291           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
    292           "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
    293           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    294           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
    295 
    296           /* even 4. pixel */
    297           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    298           "mthi             $zero,        $ac2                         \n\t"
    299           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    300           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
    301           "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
    302           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    303           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    304 
    305           /* even 5. pixel */
    306           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    307           "mthi             $zero,        $ac3                         \n\t"
    308           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
    309           "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
    310           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    311           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    312 
    313           /* even 6. pixel */
    314           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    315           "mthi             $zero,        $ac1                         \n\t"
    316           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
    317           "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
    318           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    319           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    320 
    321           /* even 7. pixel */
    322           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    323           "mthi             $zero,        $ac2                         \n\t"
    324           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
    325           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
    326           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    327           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    328 
    329           /* even 8. pixel */
    330           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    331           "mthi             $zero,        $ac3                         \n\t"
    332           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
    333           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
    334           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    335           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    336 
    337           /* ODD pixels */
    338           "ulw              %[qload1],    1(%[src])                    \n\t"
    339           "ulw              %[qload2],    5(%[src])                    \n\t"
    340 
    341           /* odd 1. pixel */
    342           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    343           "mthi             $zero,        $ac1                         \n\t"
    344           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    345           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    346           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    347           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    348           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
    349           "ulw              %[qload3],    9(%[src])                    \n\t"
    350           "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
    351           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    352           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    353 
    354           /* odd 2. pixel */
    355           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    356           "mthi             $zero,        $ac2                         \n\t"
    357           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    358           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    359           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
    360           "ulw              %[qload1],    13(%[src])                   \n\t"
    361           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
    362           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    363           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    364 
    365           /* odd 3. pixel */
    366           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    367           "mthi             $zero,        $ac3                         \n\t"
    368           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    369           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
    370           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
    371           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    372           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    373 
    374           /* odd 4. pixel */
    375           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    376           "mthi             $zero,        $ac1                         \n\t"
    377           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    378           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
    379           "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
    380           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    381           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    382 
    383           /* odd 5. pixel */
    384           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    385           "mthi             $zero,        $ac2                         \n\t"
    386           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
    387           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
    388           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    389           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    390 
    391           /* odd 6. pixel */
    392           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    393           "mthi             $zero,        $ac3                         \n\t"
    394           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
    395           "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
    396           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    397           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    398 
    399           /* odd 7. pixel */
    400           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    401           "mthi             $zero,        $ac1                         \n\t"
    402           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
    403           "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
    404           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    405 
    406           /* odd 8. pixel */
    407           "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
    408           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    409 
    410           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    411           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    412           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    413 
    414           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
    415           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
    416           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
    417 
    418           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
    419             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
    420             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    421             [p5] "=&r" (p5),
    422             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
    423           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
    424             [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    425       );
    426 
    427       src += 16;
    428       dst += 16;
    429     }
    430 
    431     /* Next row... */
    432     src_ptr += src_stride;
    433     dst_ptr += dst_stride;
    434   }
    435 }
    436 
    437 static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
    438                                        int32_t src_stride,
    439                                        uint8_t *dst_ptr,
    440                                        int32_t dst_stride,
    441                                        const int16_t *filter_x0,
    442                                        int32_t h) {
    443   int32_t y, c;
    444   const uint8_t *src;
    445   uint8_t *dst;
    446   uint8_t *cm = vp9_ff_cropTbl;
    447   uint32_t vector_64 = 64;
    448   int32_t Temp1, Temp2, Temp3;
    449   uint32_t qload1, qload2, qload3;
    450   uint32_t p1, p2, p3, p4, p5;
    451   uint32_t st1, st2, st3;
    452   const int16_t *filter = &filter_x0[3];
    453   uint32_t filter45;;
    454 
    455   filter45 = ((const int32_t *)filter)[0];
    456 
    457   for (y = h; y--;) {
    458     src = src_ptr;
    459     dst = dst_ptr;
    460 
    461     /* prefetch data to cache memory */
    462     vp9_prefetch_load(src_ptr + src_stride);
    463     vp9_prefetch_load(src_ptr + src_stride + 32);
    464     vp9_prefetch_load(src_ptr + src_stride + 64);
    465     vp9_prefetch_store(dst_ptr + dst_stride);
    466     vp9_prefetch_store(dst_ptr + dst_stride + 32);
    467 
    468     for (c = 0; c < 4; c++) {
    469       __asm__ __volatile__ (
    470           "ulw              %[qload1],    0(%[src])                    \n\t"
    471           "ulw              %[qload2],    4(%[src])                    \n\t"
    472 
    473           /* even 1. pixel */
    474           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    475           "mthi             $zero,        $ac1                         \n\t"
    476           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    477           "mthi             $zero,        $ac2                         \n\t"
    478           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    479           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    480           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    481           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    482           "ulw              %[qload3],    8(%[src])                    \n\t"
    483           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
    484           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    485 
    486           /* even 2. pixel */
    487           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    488           "mthi             $zero,        $ac3                         \n\t"
    489           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    490           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    491           "ulw              %[qload1],    12(%[src])                   \n\t"
    492           "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
    493           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
    494           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    495 
    496           /* even 3. pixel */
    497           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    498           "mthi             $zero,        $ac1                         \n\t"
    499           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    500           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
    501           "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
    502           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    503           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
    504 
    505           /* even 4. pixel */
    506           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    507           "mthi             $zero,        $ac2                         \n\t"
    508           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    509           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
    510           "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
    511           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    512           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    513 
    514           /* even 5. pixel */
    515           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    516           "mthi             $zero,        $ac3                         \n\t"
    517           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
    518           "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
    519           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    520           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    521 
    522           /* even 6. pixel */
    523           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    524           "mthi             $zero,        $ac1                         \n\t"
    525           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
    526           "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
    527           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    528           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    529 
    530           /* even 7. pixel */
    531           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    532           "mthi             $zero,        $ac2                         \n\t"
    533           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
    534           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
    535           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    536           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    537 
    538           /* even 8. pixel */
    539           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    540           "mthi             $zero,        $ac3                         \n\t"
    541           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
    542           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
    543           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    544           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    545 
    546           /* ODD pixels */
    547           "ulw              %[qload1],    1(%[src])                    \n\t"
    548           "ulw              %[qload2],    5(%[src])                    \n\t"
    549 
    550           /* odd 1. pixel */
    551           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    552           "mthi             $zero,        $ac1                         \n\t"
    553           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    554           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    555           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    556           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    557           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
    558           "ulw              %[qload3],    9(%[src])                    \n\t"
    559           "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
    560           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    561           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    562 
    563           /* odd 2. pixel */
    564           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    565           "mthi             $zero,        $ac2                         \n\t"
    566           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    567           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    568           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
    569           "ulw              %[qload1],    13(%[src])                   \n\t"
    570           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
    571           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    572           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    573 
    574           /* odd 3. pixel */
    575           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    576           "mthi             $zero,        $ac3                         \n\t"
    577           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    578           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
    579           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
    580           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    581           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    582 
    583           /* odd 4. pixel */
    584           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    585           "mthi             $zero,        $ac1                         \n\t"
    586           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    587           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
    588           "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
    589           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    590           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    591 
    592           /* odd 5. pixel */
    593           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    594           "mthi             $zero,        $ac2                         \n\t"
    595           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
    596           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
    597           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    598           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    599 
    600           /* odd 6. pixel */
    601           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    602           "mthi             $zero,        $ac3                         \n\t"
    603           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
    604           "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
    605           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    606           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    607 
    608           /* odd 7. pixel */
    609           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    610           "mthi             $zero,        $ac1                         \n\t"
    611           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
    612           "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
    613           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    614 
    615           /* odd 8. pixel */
    616           "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
    617           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    618 
    619           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    620           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    621           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    622 
    623           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
    624           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
    625           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
    626 
    627           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
    628             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
    629             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    630             [p5] "=&r" (p5),
    631             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
    632           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
    633             [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    634       );
    635 
    636       src += 16;
    637       dst += 16;
    638     }
    639 
    640     /* Next row... */
    641     src_ptr += src_stride;
    642     dst_ptr += dst_stride;
    643   }
    644 }
    645 
    /*
     * Horizontal convolution entry point that dispatches to the DSPr2
     * convolve_bi_horiz_*_dspr2 helpers.
     *
     * The helpers are used only when x_step_q4 equals 16 and w is one of
     * 4, 8, 16, 32 or 64.  Every other combination is forwarded, with all
     * arguments unchanged, to vp9_convolve8_horiz_c().
     *
     * src, src_stride   source pixels and their row stride
     * dst, dst_stride   destination pixels and their row stride
     * filter_x          horizontal filter taps, handed to the selected helper
     * x_step_q4         horizontal step; only the value 16 takes the DSPr2 path
     * filter_y,
     * y_step_q4         only passed through to vp9_convolve8_horiz_c()
     * w, h              block width and height
     */
    void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
      if (x_step_q4 == 16) {
        const uint32_t extract_pos = 38;
        const int32_t src_pitch = (int32_t)src_stride;
        const int32_t dst_pitch = (int32_t)dst_stride;
        const int32_t rows = (int32_t)h;

        vp9_prefetch_load((const uint8_t *)filter_x);

        /* set the bit position used when extracting from the accumulators */
        __asm__ __volatile__ (
          "wrdsp      %[pos],     1           \n\t"
          :
          : [pos] "r" (extract_pos)
        );

        /* warm the cache with the first source and destination bytes */
        vp9_prefetch_load(src);
        vp9_prefetch_load(src + 32);
        vp9_prefetch_store(dst);

        /* Handled widths return directly; anything else drops out of the
         * switch and reaches the generic C routine below. */
        switch (w) {
          case 4:
            convolve_bi_horiz_4_dspr2(src, src_pitch, dst, dst_pitch,
                                      filter_x, rows);
            return;
          case 8:
            convolve_bi_horiz_8_dspr2(src, src_pitch, dst, dst_pitch,
                                      filter_x, rows);
            return;
          case 16:
          case 32:
            /* last argument is 1 for w == 16 and 2 for w == 32 */
            convolve_bi_horiz_16_dspr2(src, src_pitch, dst, dst_pitch,
                                       filter_x, rows, w / 16);
            return;
          case 64:
            vp9_prefetch_load(src + 64);
            vp9_prefetch_store(dst + 32);

            convolve_bi_horiz_64_dspr2(src, src_pitch, dst, dst_pitch,
                                       filter_x, rows);
            return;
          default:
            break;
        }
      }

      /* generic path: x_step_q4 other than 16, or a width with no helper */
      vp9_convolve8_horiz_c(src, src_stride,
                            dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
    }
    713 #endif
    714