Home | History | Annotate | Download | only in dspr2
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <stdio.h>
     13 
     14 #include "./vpx_config.h"
     15 #include "./vp9_rtcd.h"
     16 #include "vp9/common/vp9_common.h"
     17 #include "vpx/vpx_integer.h"
     18 #include "vpx_ports/mem.h"
     19 #include "vp9/common/vp9_filter.h"
     20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
     21 
     22 #if HAVE_DSPR2
     23 static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
     24                                                  int32_t src_stride,
     25                                                  uint8_t *dst,
     26                                                  int32_t dst_stride,
     27                                                  const int16_t *filter_x0,
     28                                                  int32_t h) {
     29   int32_t       y;
     30   uint8_t       *cm = vp9_ff_cropTbl;
     31   uint8_t       *dst_ptr;
     32   int32_t       Temp1, Temp2;
     33   uint32_t      vector4a = 64;
     34   uint32_t      tp1, tp2;
     35   uint32_t      p1, p2;
     36   const int16_t *filter = &filter_x0[3];
     37   uint32_t      filter45;
     38 
     39   filter45 = ((const int32_t *)filter)[0];
     40 
     41   for (y = h; y--;) {
     42     dst_ptr = dst;
     43     /* prefetch data to cache memory */
     44     vp9_prefetch_load(src + src_stride);
     45     vp9_prefetch_load(src + src_stride + 32);
     46 
     47     __asm__ __volatile__ (
     48         "ulw              %[tp1],         0(%[src])                      \n\t"
     49         "ulw              %[tp2],         4(%[src])                      \n\t"
     50 
     51         /* even 1. pixel */
     52         "mtlo             %[vector4a],    $ac3                           \n\t"
     53         "mthi             $zero,          $ac3                           \n\t"
     54         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
     55         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
     56         "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
     57         "extp             %[Temp1],       $ac3,           31             \n\t"
     58 
     59         /* even 2. pixel */
     60         "mtlo             %[vector4a],    $ac2                           \n\t"
     61         "mthi             $zero,          $ac2                           \n\t"
     62         "balign           %[tp2],         %[tp1],         3              \n\t"
     63         "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
     64         "extp             %[Temp2],       $ac2,           31             \n\t"
     65 
     66         /* odd 1. pixel */
     67         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
     68         "mtlo             %[vector4a],    $ac3                           \n\t"
     69         "mthi             $zero,          $ac3                           \n\t"
     70         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
     71         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
     72         "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
     73         "extp             %[Temp1],       $ac3,           31             \n\t"
     74 
     75         /* odd 2. pixel */
     76         "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"
     77         "mtlo             %[vector4a],    $ac2                           \n\t"
     78         "mthi             $zero,          $ac2                           \n\t"
     79         "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
     80         "extp             %[Temp2],       $ac2,           31             \n\t"
     81 
     82         /* clamp */
     83         "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
     84         "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
     85 
     86         /* store bytes */
     87         "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
     88         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
     89 
     90         "sb               %[p1],          0(%[dst_ptr])                  \n\t"
     91         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
     92 
     93         "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
     94         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
     95 
     96         "sb               %[p2],          0(%[dst_ptr])                  \n\t"
     97         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
     98 
     99         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
    100           [p1] "=&r" (p1), [p2] "=&r" (p2),
    101           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    102           [dst_ptr] "+r" (dst_ptr)
    103         : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
    104           [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
    105     );
    106 
    107     /* Next row... */
    108     src += src_stride;
    109     dst += 1;
    110   }
    111 }
    112 
    113 static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
    114                                                  int32_t src_stride,
    115                                                  uint8_t *dst,
    116                                                  int32_t dst_stride,
    117                                                  const int16_t *filter_x0,
    118                                                  int32_t h) {
    119   int32_t y;
    120   uint8_t *cm = vp9_ff_cropTbl;
    121   uint8_t *dst_ptr;
    122   uint32_t vector4a = 64;
    123   int32_t Temp1, Temp2, Temp3;
    124   uint32_t tp1, tp2, tp3;
    125   uint32_t p1, p2, p3, p4;
    126   uint8_t *odd_dst;
    127   uint32_t dst_pitch_2 = (dst_stride << 1);
    128   const int16_t *filter = &filter_x0[3];
    129   uint32_t      filter45;
    130 
    131   filter45 = ((const int32_t *)filter)[0];
    132 
    133   for (y = h; y--;) {
    134     /* prefetch data to cache memory */
    135     vp9_prefetch_load(src + src_stride);
    136     vp9_prefetch_load(src + src_stride + 32);
    137 
    138     dst_ptr = dst;
    139     odd_dst = (dst_ptr + dst_stride);
    140 
    141     __asm__ __volatile__ (
    142         "ulw              %[tp1],         0(%[src])                       \n\t"
    143         "ulw              %[tp2],         4(%[src])                       \n\t"
    144 
    145         /* even 1. pixel */
    146         "mtlo             %[vector4a],    $ac3                            \n\t"
    147         "mthi             $zero,          $ac3                            \n\t"
    148         "mtlo             %[vector4a],    $ac2                            \n\t"
    149         "mthi             $zero,          $ac2                            \n\t"
    150         "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
    151         "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
    152         "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"
    153         "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"
    154         "ulw              %[tp3],         8(%[src])                       \n\t"
    155         "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
    156         "extp             %[Temp1],       $ac3,           31              \n\t"
    157 
    158         /* even 2. pixel */
    159         "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"
    160         "extp             %[Temp3],       $ac2,           31              \n\t"
    161 
    162         /* even 3. pixel */
    163         "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
    164         "mtlo             %[vector4a],    $ac1                            \n\t"
    165         "mthi             $zero,          $ac1                            \n\t"
    166         "balign           %[tp3],         %[tp2],         3              \n\t"
    167         "balign           %[tp2],         %[tp1],         3              \n\t"
    168         "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"
    169         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
    170         "extp             %[p3],          $ac1,           31              \n\t"
    171 
    172         /* even 4. pixel */
    173         "mtlo             %[vector4a],    $ac2                            \n\t"
    174         "mthi             $zero,          $ac2                            \n\t"
    175         "mtlo             %[vector4a],    $ac3                            \n\t"
    176         "mthi             $zero,          $ac3                            \n\t"
    177         "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
    178         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    179         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
    180         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    181 
    182         "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
    183         "extp             %[Temp3],       $ac2,           31              \n\t"
    184 
    185         "lbux             %[Temp1],         %[p3](%[cm])                    \n\t"
    186 
    187         /* odd 1. pixel */
    188         "mtlo             %[vector4a],    $ac1                            \n\t"
    189         "mthi             $zero,          $ac1                            \n\t"
    190         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
    191         "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
    192         "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
    193         "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
    194         "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"
    195         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    196 
    197         "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
    198         "extp             %[Temp2],       $ac3,           31              \n\t"
    199 
    200         /* odd 2. pixel */
    201         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
    202         "mtlo             %[vector4a],    $ac3                            \n\t"
    203         "mthi             $zero,          $ac3                            \n\t"
    204         "mtlo             %[vector4a],    $ac2                            \n\t"
    205         "mthi             $zero,          $ac2                            \n\t"
    206         "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"
    207         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
    208         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    209         "extp             %[Temp3],       $ac1,           31              \n\t"
    210 
    211         /* odd 3. pixel */
    212         "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
    213         "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"
    214         "extp             %[Temp2],       $ac3,           31              \n\t"
    215 
    216         /* odd 4. pixel */
    217         "sb               %[tp3],         0(%[odd_dst])                   \n\t"
    218         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
    219         "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
    220         "extp             %[Temp1],       $ac2,           31              \n\t"
    221 
    222         /* clamp */
    223         "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
    224         "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
    225         "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"
    226 
    227         /* store bytes */
    228         "sb               %[p4],          0(%[odd_dst])                   \n\t"
    229         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
    230 
    231         "sb               %[p2],          0(%[odd_dst])                   \n\t"
    232         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
    233 
    234         "sb               %[p1],          0(%[odd_dst])                   \n\t"
    235 
    236         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
    237           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    238           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
    239           [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
    240         : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
    241           [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
    242     );
    243 
    244     /* Next row... */
    245     src += src_stride;
    246     dst += 1;
    247   }
    248 }
    249 
    250 static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
    251                                                   int32_t src_stride,
    252                                                   uint8_t *dst_ptr,
    253                                                   int32_t dst_stride,
    254                                                   const int16_t *filter_x0,
    255                                                   int32_t h,
    256                                                   int32_t count) {
    257   int32_t       c, y;
    258   const uint8_t *src;
    259   uint8_t       *dst;
    260   uint8_t       *cm = vp9_ff_cropTbl;
    261   uint32_t      vector_64 = 64;
    262   int32_t       Temp1, Temp2, Temp3;
    263   uint32_t      qload1, qload2;
    264   uint32_t      p1, p2, p3, p4, p5;
    265   uint32_t      st1, st2, st3;
    266   uint32_t      dst_pitch_2 = (dst_stride << 1);
    267   uint8_t       *odd_dst;
    268   const int16_t *filter = &filter_x0[3];
    269   uint32_t      filter45;
    270 
    271   filter45 = ((const int32_t *)filter)[0];
    272 
    273   for (y = h; y--;) {
    274     /* prefetch data to cache memory */
    275     vp9_prefetch_load(src_ptr + src_stride);
    276     vp9_prefetch_load(src_ptr + src_stride + 32);
    277 
    278     src = src_ptr;
    279     dst = dst_ptr;
    280 
    281     odd_dst = (dst + dst_stride);
    282 
    283     for (c = 0; c < count; c++) {
    284       __asm__ __volatile__ (
    285           "ulw              %[qload1],        0(%[src])                       \n\t"
    286           "ulw              %[qload2],        4(%[src])                       \n\t"
    287 
    288           /* even 1. pixel */
    289           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
    290           "mthi             $zero,            $ac1                            \n\t"
    291           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
    292           "mthi             $zero,            $ac2                            \n\t"
    293           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
    294           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
    295           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
    296           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
    297           "ulw              %[qload1],        8(%[src])                       \n\t"
    298           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
    299           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
    300 
    301           /* even 2. pixel */
    302           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
    303           "mthi             $zero,            $ac3                            \n\t"
    304           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
    305           "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
    306           "ulw              %[qload2],        12(%[src])                      \n\t"
    307           "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
    308           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
    309           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
    310 
    311           /* even 3. pixel */
    312           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
    313           "mthi             $zero,            $ac1                            \n\t"
    314           "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
    315           "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
    316           "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
    317           "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
    318           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
    319           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
    320 
    321           /* even 4. pixel */
    322           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
    323           "mthi             $zero,            $ac2                            \n\t"
    324           "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
    325           "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
    326           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    327           "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
    328           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
    329           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
    330 
    331           /* even 5. pixel */
    332           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
    333           "mthi             $zero,            $ac3                            \n\t"
    334           "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
    335           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    336           "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
    337           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
    338           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
    339 
    340           /* even 6. pixel */
    341           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
    342           "mthi             $zero,            $ac1                            \n\t"
    343           "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
    344           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    345           "ulw              %[qload1],        20(%[src])                      \n\t"
    346           "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
    347           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
    348           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
    349 
    350           /* even 7. pixel */
    351           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
    352           "mthi             $zero,            $ac2                            \n\t"
    353           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
    354           "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
    355           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    356           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
    357           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
    358           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
    359 
    360           /* even 8. pixel */
    361           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
    362           "mthi             $zero,            $ac3                            \n\t"
    363           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
    364           "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
    365           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    366           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
    367           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
    368 
    369           /* ODD pixels */
    370           "ulw              %[qload1],        1(%[src])                       \n\t"
    371           "ulw              %[qload2],        5(%[src])                       \n\t"
    372 
    373           /* odd 1. pixel */
    374           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
    375           "mthi             $zero,            $ac1                            \n\t"
    376           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
    377           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
    378           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
    379           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
    380           "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
    381           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    382           "ulw              %[qload2],        9(%[src])                       \n\t"
    383           "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
    384           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
    385           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
    386 
    387           /* odd 2. pixel */
    388           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
    389           "mthi             $zero,            $ac2                            \n\t"
    390           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
    391           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
    392           "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
    393           "ulw              %[qload1],        13(%[src])                      \n\t"
    394           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
    395           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
    396           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
    397 
    398           /* odd 3. pixel */
    399           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
    400           "mthi             $zero,            $ac3                            \n\t"
    401           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
    402           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
    403           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    404           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
    405           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
    406           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
    407 
    408           /* odd 4. pixel */
    409           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
    410           "mthi             $zero,            $ac1                            \n\t"
    411           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
    412           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
    413           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    414           "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
    415           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
    416           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
    417 
    418           /* odd 5. pixel */
    419           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
    420           "mthi             $zero,            $ac2                            \n\t"
    421           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
    422           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    423           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
    424           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
    425           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
    426 
    427           /* odd 6. pixel */
    428           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
    429           "mthi             $zero,            $ac3                            \n\t"
    430           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
    431           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    432           "ulw              %[qload1],        21(%[src])                      \n\t"
    433           "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
    434           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
    435           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
    436 
    437           /* odd 7. pixel */
    438           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
    439           "mthi             $zero,            $ac1                            \n\t"
    440           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
    441           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
    442           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    443           "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
    444           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
    445 
    446           /* odd 8. pixel */
    447           "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
    448           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
    449 
    450           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
    451           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
    452           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
    453 
    454           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
    455           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    456 
    457           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
    458           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    459 
    460           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
    461 
    462           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
    463             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
    464             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    465             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
    466             [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
    467           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
    468             [cm] "r" (cm),
    469             [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
    470       );
    471 
    472       src += 16;
    473       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
    474       odd_dst = (dst + dst_stride);
    475     }
    476 
    477     /* Next row... */
    478     src_ptr += src_stride;
    479     dst_ptr += 1;
    480   }
    481 }
    482 
    483 static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
    484                                                   int32_t src_stride,
    485                                                   uint8_t *dst_ptr,
    486                                                   int32_t dst_stride,
    487                                                   const int16_t *filter_x0,
    488                                                   int32_t h) {
          /*
           * 2-tap horizontal filter for 64-pixel-wide rows, written out transposed:
           * the result for source column x of the current row is stored
           * x * dst_stride bytes past dst_ptr, and dst_ptr moves forward by one
           * byte for every source row.
           *
           *   src_ptr, src_stride  first source row and the distance between rows
           *   dst_ptr, dst_stride  destination and the distance between the bytes
           *                        written for consecutive source columns
           *   filter_x0            kernel; only taps [3] and [4] are read
           *   h                    number of source rows to process
           *
           * Every row is processed as 4 blocks of 16 outputs.  Within a block the
           * asm first computes the 8 even-numbered outputs from loads at
           * src + 0/4/8/12 and stores them through dst, stepping by
           * 2 * dst_stride.  It then computes the 8 odd-numbered outputs from
           * loads at src + 1/5/9/13 and stores them through odd_dst, which starts
           * one dst_stride after dst.  Each accumulator is preloaded with
           * vector_64 before the multiply-add, and each extracted value is used
           * as a byte index into cm before being stored.
           *
           * The caller vp9_convolve2_dspr2 issues a wrdsp before dispatching here;
           * presumably the extp instructions below depend on that setting --
           * confirm against the MIPS DSP ASE manual.
           *
           * NOTE(review): the words loaded from 20(src) and 21(src) are unpacked
           * into p5, but p5 is not read again before it is overwritten or the asm
           * ends, so those two loads look unused here -- confirm before removing.
           */
    489   int32_t       c, y;
    490   const uint8_t *src;
    491   uint8_t       *dst;
          /* lookup table indexed by the extracted sums (lbux ... (cm)) */
    492   uint8_t       *cm = vp9_ff_cropTbl;
          /* value placed in the low half of each accumulator before dpa.w.ph */
    493   uint32_t      vector_64 = 64;
    494   int32_t       Temp1, Temp2, Temp3;
    495   uint32_t      qload1, qload2;
    496   uint32_t      p1, p2, p3, p4, p5;
    497   uint32_t      st1, st2, st3;
          /* distance between two consecutive even (or two consecutive odd) outputs */
    498   uint32_t      dst_pitch_2 = (dst_stride << 1);
    499   uint8_t       *odd_dst;
    500   const int16_t *filter = &filter_x0[3];
    501   uint32_t      filter45;
    502 
          /*
           * Fetch taps [3] and [4] as one 32-bit word so that a single dpa.w.ph
           * can multiply a pair of pixels by both taps.
           * NOTE(review): this reads two int16_t values through an int32_t
           * pointer; it assumes the platform tolerates the alignment and aliasing
           * of that access -- confirm.
           */
    503   filter45 = ((const int32_t *)filter)[0];
    504 
          /* runs h times, one source row per iteration */
    505   for (y = h; y--;) {
    506     /* prefetch data to cache memory */
    507     vp9_prefetch_load(src_ptr + src_stride);
    508     vp9_prefetch_load(src_ptr + src_stride + 32);
    509     vp9_prefetch_load(src_ptr + src_stride + 64);
    510 
    511     src = src_ptr;
    512     dst = dst_ptr;
    513 
            /* odd-numbered outputs start one destination step after the even ones */
    514     odd_dst = (dst + dst_stride);
    515 
            /* 4 blocks of 16 outputs cover the 64-pixel row */
    516     for (c = 0; c < 4; c++) {
    517       __asm__ __volatile__ (
    518           "ulw              %[qload1],        0(%[src])                       \n\t"
    519           "ulw              %[qload2],        4(%[src])                       \n\t"
    520 
    521           /* even 1. pixel */
    522           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
    523           "mthi             $zero,            $ac1                            \n\t"
    524           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
    525           "mthi             $zero,            $ac2                            \n\t"
    526           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
    527           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
    528           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
    529           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
    530           "ulw              %[qload1],        8(%[src])                       \n\t"
    531           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
    532           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
    533 
    534           /* even 2. pixel */
    535           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
    536           "mthi             $zero,            $ac3                            \n\t"
    537           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
    538           "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
    539           "ulw              %[qload2],        12(%[src])                      \n\t"
    540           "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 2 */
    541           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
    542           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 2 */
    543 
    544           /* even 3. pixel */
    545           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
    546           "mthi             $zero,            $ac1                            \n\t"
    547           "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
    548           "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
    549           "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
    550           "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
    551           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
    552           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 2 */
    553 
    554           /* even 4. pixel */
    555           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
    556           "mthi             $zero,            $ac2                            \n\t"
    557           "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
    558           "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
    559           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    560           "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
    561           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
    562           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
    563 
    564           /* even 5. pixel */
    565           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
    566           "mthi             $zero,            $ac3                            \n\t"
    567           "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
    568           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    569           "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
    570           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
    571           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
    572 
    573           /* even 6. pixel */
    574           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
    575           "mthi             $zero,            $ac1                            \n\t"
    576           "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
    577           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    578           "ulw              %[qload1],        20(%[src])                      \n\t"
    579           "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
    580           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
    581           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
    582 
    583           /* even 7. pixel */
    584           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
    585           "mthi             $zero,            $ac2                            \n\t"
    586           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
    587           "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
    588           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    589           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
    590           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
    591           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
    592 
    593           /* even 8. pixel */
    594           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
    595           "mthi             $zero,            $ac3                            \n\t"
    596           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
    597           "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
    598           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    599           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
    600           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
    601 
    602           /* ODD pixels */
    603           "ulw              %[qload1],        1(%[src])                       \n\t"
    604           "ulw              %[qload2],        5(%[src])                       \n\t"
    605 
    606           /* odd 1. pixel */
    607           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
    608           "mthi             $zero,            $ac1                            \n\t"
    609           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
    610           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
    611           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
    612           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
    613           "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
    614           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    615           "ulw              %[qload2],        9(%[src])                       \n\t"
    616           "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
    617           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
    618           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
    619 
    620           /* odd 2. pixel */
    621           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
    622           "mthi             $zero,            $ac2                            \n\t"
    623           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
    624           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
    625           "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
    626           "ulw              %[qload1],        13(%[src])                      \n\t"
    627           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
    628           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
    629           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
    630 
    631           /* odd 3. pixel */
    632           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
    633           "mthi             $zero,            $ac3                            \n\t"
    634           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
    635           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
    636           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    637           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
    638           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
    639           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
    640 
    641           /* odd 4. pixel */
    642           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
    643           "mthi             $zero,            $ac1                            \n\t"
    644           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
    645           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
    646           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    647           "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
    648           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
    649           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
    650 
    651           /* odd 5. pixel */
    652           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
    653           "mthi             $zero,            $ac2                            \n\t"
    654           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
    655           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    656           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
    657           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
    658           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
    659 
    660           /* odd 6. pixel */
    661           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
    662           "mthi             $zero,            $ac3                            \n\t"
    663           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
    664           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    665           "ulw              %[qload1],        21(%[src])                      \n\t"
    666           "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
    667           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
    668           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
    669 
    670           /* odd 7. pixel */
    671           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
    672           "mthi             $zero,            $ac1                            \n\t"
    673           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
    674           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
    675           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    676           "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
    677           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
    678 
    679           /* odd 8. pixel */
    680           "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
    681           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
    682 
    683           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
    684           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
    685           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
    686 
    687           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
    688           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    689 
    690           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
    691           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    692 
    693           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
    694 
    695           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
    696             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
    697             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    698             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
    699             [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
    700           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
    701             [cm] "r" (cm),
    702             [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
    703       );
    704 
              /*
               * Move on to the next 16 source columns.  dst is recomputed from
               * dst_ptr instead of continuing from the value the asm left in it
               * (the asm does not advance dst after the last even store).
               */
    705       src += 16;
    706       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
    707       odd_dst = (dst + dst_stride);
    708     }
    709 
    710     /* Next row... */
    711     src_ptr += src_stride;
            /* transposed output: the next source row lands one byte further on */
    712     dst_ptr += 1;
    713   }
    714 }
    715 
    716 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
    717                                   uint8_t *dst, ptrdiff_t dst_stride,
    718                                   const int16_t *filter, int w, int h) {
    719   int x, y;
    720 
    721   for (y = 0; y < h; ++y) {
    722     for (x = 0; x < w; ++x) {
    723       int sum = 0;
    724 
    725       sum += src[x] * filter[3];
    726       sum += src[x + 1] * filter[4];
    727 
    728       dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
    729     }
    730 
    731     src += src_stride;
    732     dst += 1;
    733   }
    734 }
    735 
    736 void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    737                          uint8_t *dst, ptrdiff_t dst_stride,
    738                          const int16_t *filter,
    739                          int w, int h) {
    740   uint32_t pos = 38;
    741 
    742   /* bit positon for extract from acc */
    743   __asm__ __volatile__ (
    744     "wrdsp      %[pos],     1           \n\t"
    745     :
    746     : [pos] "r" (pos)
    747   );
    748 
    749   /* prefetch data to cache memory */
    750   vp9_prefetch_load(src);
    751   vp9_prefetch_load(src + 32);
    752 
    753   switch (w) {
    754     case 4:
    755       convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
    756                                            dst, dst_stride,
    757                                            filter, h);
    758       break;
    759     case 8:
    760       convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
    761                                            dst, dst_stride,
    762                                            filter, h);
    763       break;
    764     case 16:
    765     case 32:
    766       convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
    767                                             dst, dst_stride,
    768                                             filter, h,
    769                                             (w/16));
    770       break;
    771     case 64:
    772       vp9_prefetch_load(src + 32);
    773       convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
    774                                             dst, dst_stride,
    775                                             filter, h);
    776       break;
    777     default:
    778       convolve_bi_horiz_transposed(src, src_stride,
    779                                    dst, dst_stride,
    780                                    filter, w, h);
    781       break;
    782   }
    783 }
    784 #endif
    785