/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
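/* Vertical 8-tap convolution with averaging for widths that are multiples
 * of 4. Each inner-loop iteration produces four output pixels, one per DSP
 * accumulator ($ac0..$ac3). */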
static void convolve_avg_vert_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t w,
                                      int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vp9_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2, load3, load4;
  uint32_t      p1, p2;
  uint32_t      n1, n2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       vector1b, vector2b, vector3b, vector4b;
  int32_t       Temp1, Temp2;
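  /* Each 32-bit word packs two adjacent 16-bit filter taps, ready for the
   * paired-halfword multiply-accumulates (dpa.w.ph) below. */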
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];
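  /* Back up three rows so the 8-tap window is centered on the output row. */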
  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;
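      /* Load eight source rows in two groups of four, expand bytes to
       * halfwords (preceu), pair rows for the packed multiply-accumulates
       * (dpa.w.ph), then extract each rounded sum (extp; the 64 seeded into
       * lo provides the rounding), clamp it through the crop table (lbux)
       * and average it with the existing dst byte (addqh_r.w). */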
      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

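/* Unrolled variant of the 4-wide kernel above for w == 64: the width is
 * fixed and a second cache line of dst is prefetched per row; the inline
 * assembly body is otherwise identical. */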
static void convolve_avg_vert_64_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_y,
                                       int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vp9_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2, load3, load4;
  uint32_t      p1, p2;
  uint32_t      n1, n2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       vector1b, vector2b, vector3b, vector4b;
  int32_t       Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);
    vp9_prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
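  /* The taps are probed as packed 32-bit words: taps {2, 3} packing to
   * 0x00800000 means one middle tap is 128 and the rest are 0 (the unit
   * filter), so a plain average suffices; taps {0, 1} both zero indicates
   * a short filter handled by the 2-tap path. */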
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vp9_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vp9_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
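    /* Only unit-increment filtering (y_step_q4 == 16) is accelerated;
     * scaled convolutions fall through to the C implementation. */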
    if (16 == y_step_q4) {
      uint32_t pos = 38;

      /* bit position for extract from acc: extp reads bits 38..7,
         i.e. the filtered sum shifted right by 7 */
      __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
      );

      vp9_prefetch_store(dst);

      switch (w) {
        case 4:
        case 8:
        case 16:
        case 32:
          convolve_avg_vert_4_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_y, w, h);
          break;
        case 64:
          vp9_prefetch_store(dst + 32);
          convolve_avg_vert_64_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_y, h);
          break;
        default:
          vp9_convolve8_avg_vert_c(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, x_step_q4,
                                   filter_y, y_step_q4,
                                   w, h);
          break;
      }
    } else {
      vp9_convolve8_avg_vert_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
    }
  }
}

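/* Two-pass separable convolve-and-average: filter horizontally (starting 3
 * rows above dst, with 7 extra rows for the vertical taps) into a
 * 64-byte-wide intermediate buffer, then filter vertically and average the
 * result into dst. */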
void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

  assert(w <= 64);
  assert(h <= 64);

  if (intermediate_height < h)
    intermediate_height = h;

  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vp9_convolve8_avg_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);

  vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, intermediate_height);

  vp9_convolve8_avg_vert(temp + 64 * 3, 64,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
}

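/* Rounded byte-wise average of src into dst. adduh_r.qb averages four
 * unsigned bytes at a time ((a + b + 1) >> 1 per lane); each case unrolls
 * the row copy for its width, and other widths fall back to scalar C. */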
void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride,
                            int w, int h) {
  int x, y;
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      vp9_prefetch_load(src + 64);
      vp9_prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_load(src + src_stride + 64);
        vp9_prefetch_store(dst + dst_stride);
        vp9_prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         32(%[dst])     \n\t"
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         36(%[src])     \n\t"
            "ulw              %[tp4],         36(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         40(%[src])     \n\t"
            "ulw              %[tp2],         40(%[dst])     \n\t"
            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         44(%[src])     \n\t"
            "ulw              %[tp4],         44(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         48(%[src])     \n\t"
            "ulw              %[tp2],         48(%[dst])     \n\t"
            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         52(%[src])     \n\t"
            "ulw              %[tp4],         52(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         56(%[src])     \n\t"
            "ulw              %[tp2],         56(%[dst])     \n\t"
            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         60(%[src])     \n\t"
            "ulw              %[tp4],         60(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    default:
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif