Home | History | Annotate | Download | only in dspr2
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <stdio.h>
     13 
     14 #include "./vpx_config.h"
     15 #include "./vp9_rtcd.h"
     16 #include "vp9/common/vp9_common.h"
     17 #include "vpx/vpx_integer.h"
     18 #include "vpx_ports/mem.h"
     19 #include "vp9/common/vp9_convolve.h"
     20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
     21 
     22 #if HAVE_DSPR2
     23 static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
     24                                        int32_t src_stride,
     25                                        uint8_t *dst,
     26                                        int32_t dst_stride,
     27                                        const int16_t *filter_x0,
     28                                        int32_t h) {
     29   int32_t y;
     30   uint8_t *cm = vp9_ff_cropTbl;
     31   int32_t  vector1b, vector2b, vector3b, vector4b;
     32   int32_t  Temp1, Temp2, Temp3, Temp4;
     33   uint32_t vector4a = 64;
     34   uint32_t tp1, tp2;
     35   uint32_t p1, p2, p3, p4;
     36   uint32_t n1, n2, n3, n4;
     37   uint32_t tn1, tn2;
     38 
     39   vector1b = ((const int32_t *)filter_x0)[0];
     40   vector2b = ((const int32_t *)filter_x0)[1];
     41   vector3b = ((const int32_t *)filter_x0)[2];
     42   vector4b = ((const int32_t *)filter_x0)[3];
     43 
     44   for (y = h; y--;) {
     45     /* prefetch data to cache memory */
     46     vp9_prefetch_load(src + src_stride);
     47     vp9_prefetch_load(src + src_stride + 32);
     48     vp9_prefetch_store(dst + dst_stride);
     49 
     50     __asm__ __volatile__ (
     51         "ulw              %[tp1],         0(%[src])                      \n\t"
     52         "ulw              %[tp2],         4(%[src])                      \n\t"
     53 
     54         /* even 1. pixel */
     55         "mtlo             %[vector4a],    $ac3                           \n\t"
     56         "mthi             $zero,          $ac3                           \n\t"
     57         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
     58         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
     59         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
     60         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
     61         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
     62         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
     63         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
     64         "ulw              %[tn2],         8(%[src])                      \n\t"
     65         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
     66         "extp             %[Temp1],       $ac3,           31             \n\t"
     67 
     68         /* even 2. pixel */
     69         "mtlo             %[vector4a],    $ac2                           \n\t"
     70         "mthi             $zero,          $ac2                           \n\t"
     71         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
     72         "balign           %[tn1],         %[tn2],         3              \n\t"
     73         "balign           %[tn2],         %[tp2],         3              \n\t"
     74         "balign           %[tp2],         %[tp1],         3              \n\t"
     75         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
     76         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
     77         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
     78         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
     79         "extp             %[Temp3],       $ac2,           31             \n\t"
     80 
     81         "lbu              %[p2],          3(%[dst])                      \n\t"  /* load odd 2 */
     82 
     83         /* odd 1. pixel */
     84         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"  /* even 1 */
     85         "mtlo             %[vector4a],    $ac3                           \n\t"
     86         "mthi             $zero,          $ac3                           \n\t"
     87         "lbu              %[Temp1],       1(%[dst])                      \n\t"  /* load odd 1 */
     88         "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
     89         "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
     90         "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
     91         "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
     92         "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
     93         "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
     94         "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
     95         "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
     96         "extp             %[Temp2],       $ac3,           31             \n\t"
     97 
     98         "lbu              %[tn2],         0(%[dst])                      \n\t"  /* load even 1 */
     99 
    100         /* odd 2. pixel */
    101         "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"  /* even 2 */
    102         "mtlo             %[vector4a],    $ac2                           \n\t"
    103         "mthi             $zero,          $ac2                           \n\t"
    104         "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
    105         "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"  /* odd 1 */
    106         "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t"  /* average even 1 */
    107         "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
    108         "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
    109         "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
    110         "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
    111         "extp             %[Temp4],       $ac2,           31             \n\t"
    112 
    113         "lbu              %[tp1],         2(%[dst])                      \n\t"  /* load even 2 */
    114         "sb               %[tn2],         0(%[dst])                      \n\t"  /* store even 1 */
    115 
    116         /* clamp */
    117         "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t"  /* average odd 1 */
    118         "lbux             %[n2],          %[Temp4](%[cm])                \n\t"  /* odd 2 */
    119         "sb               %[Temp1],       1(%[dst])                      \n\t"  /* store odd 1 */
    120 
    121         "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t"  /* average even 2 */
    122         "sb               %[tp1],         2(%[dst])                      \n\t"  /* store even 2 */
    123 
    124         "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t"  /* average odd 2 */
    125         "sb               %[p2],          3(%[dst])                      \n\t"  /* store odd 2 */
    126 
    127         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
    128           [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
    129           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    130           [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
    131           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    132           [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
    133         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    134           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
    135           [vector4a] "r" (vector4a),
    136           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    137     );
    138 
    139     /* Next row... */
    140     src += src_stride;
    141     dst += dst_stride;
    142   }
    143 }
    144 
    145 static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
    146                                        int32_t src_stride,
    147                                        uint8_t *dst,
    148                                        int32_t dst_stride,
    149                                        const int16_t *filter_x0,
    150                                        int32_t h) {
    151   int32_t y;
    152   uint8_t *cm = vp9_ff_cropTbl;
    153   uint32_t vector4a = 64;
    154   int32_t vector1b, vector2b, vector3b, vector4b;
    155   int32_t Temp1, Temp2, Temp3;
    156   uint32_t tp1, tp2;
    157   uint32_t p1, p2, p3, p4, n1;
    158   uint32_t tn1, tn2, tn3;
    159   uint32_t st0, st1;
    160 
    161   vector1b = ((const int32_t *)filter_x0)[0];
    162   vector2b = ((const int32_t *)filter_x0)[1];
    163   vector3b = ((const int32_t *)filter_x0)[2];
    164   vector4b = ((const int32_t *)filter_x0)[3];
    165 
    166   for (y = h; y--;) {
    167     /* prefetch data to cache memory */
    168     vp9_prefetch_load(src + src_stride);
    169     vp9_prefetch_load(src + src_stride + 32);
    170     vp9_prefetch_store(dst + dst_stride);
    171 
    172     __asm__ __volatile__ (
    173         "ulw              %[tp1],         0(%[src])                      \n\t"
    174         "ulw              %[tp2],         4(%[src])                      \n\t"
    175 
    176         /* even 1. pixel */
    177         "mtlo             %[vector4a],    $ac3                           \n\t"
    178         "mthi             $zero,          $ac3                           \n\t"
    179         "mtlo             %[vector4a],    $ac2                           \n\t"
    180         "mthi             $zero,          $ac2                           \n\t"
    181         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
    182         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
    183         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
    184         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
    185         "ulw              %[tn2],         8(%[src])                      \n\t"
    186         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
    187         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
    188         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
    189         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
    190         "extp             %[Temp1],       $ac3,           31             \n\t"
    191         "lbu              %[Temp2],       0(%[dst])                      \n\t"
    192         "lbu              %[tn3],         2(%[dst])                      \n\t"
    193 
    194         /* even 2. pixel */
    195         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
    196         "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
    197         "ulw              %[tn1],         12(%[src])                     \n\t"
    198         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
    199         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
    200         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
    201         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
    202         "extp             %[Temp3],       $ac2,           31             \n\t"
    203 
    204         /* even 3. pixel */
    205         "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
    206         "mtlo             %[vector4a],    $ac1                           \n\t"
    207         "mthi             $zero,          $ac1                           \n\t"
    208         "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
    209         "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
    210         "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
    211         "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
    212         "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
    213         "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
    214         "extp             %[Temp1],       $ac1,           31             \n\t"
    215 
    216         "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
    217         "addqh_r.w        %[tn3],         %[tn3],         %[st1]         \n\t"
    218         "sb               %[Temp2],       0(%[dst])                      \n\t"
    219         "sb               %[tn3],         2(%[dst])                      \n\t"
    220 
    221         /* even 4. pixel */
    222         "mtlo             %[vector4a],    $ac2                           \n\t"
    223         "mthi             $zero,          $ac2                           \n\t"
    224         "mtlo             %[vector4a],    $ac3                           \n\t"
    225         "mthi             $zero,          $ac3                           \n\t"
    226 
    227         "balign           %[tn3],         %[tn1],         3              \n\t"
    228         "balign           %[tn1],         %[tn2],         3              \n\t"
    229         "balign           %[tn2],         %[tp2],         3              \n\t"
    230         "balign           %[tp2],         %[tp1],         3              \n\t"
    231 
    232         "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
    233         "lbu              %[Temp2],       4(%[dst])                      \n\t"
    234         "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
    235 
    236         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
    237         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
    238         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
    239         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
    240         "extp             %[Temp3],       $ac2,           31             \n\t"
    241 
    242         /* odd 1. pixel */
    243         "mtlo             %[vector4a],    $ac1                           \n\t"
    244         "mthi             $zero,          $ac1                           \n\t"
    245         "sb               %[Temp2],       4(%[dst])                      \n\t"
    246         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
    247         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
    248         "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
    249         "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
    250         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
    251         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
    252         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
    253         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
    254         "extp             %[Temp2],       $ac3,           31             \n\t"
    255 
    256         "lbu              %[tp1],         6(%[dst])                      \n\t"
    257 
    258         /* odd 2. pixel */
    259         "mtlo             %[vector4a],    $ac3                           \n\t"
    260         "mthi             $zero,          $ac3                           \n\t"
    261         "mtlo             %[vector4a],    $ac2                           \n\t"
    262         "mthi             $zero,          $ac2                           \n\t"
    263         "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
    264         "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
    265         "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
    266         "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
    267         "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
    268         "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
    269         "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
    270         "extp             %[Temp3],       $ac1,           31             \n\t"
    271 
    272         "lbu              %[tp2],         1(%[dst])                      \n\t"
    273         "lbu              %[tn2],         3(%[dst])                      \n\t"
    274         "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
    275 
    276         /* odd 3. pixel */
    277         "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
    278         "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
    279         "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
    280         "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
    281         "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
    282         "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
    283         "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
    284         "extp             %[Temp2],       $ac3,           31             \n\t"
    285 
    286         "lbu              %[tn3],         5(%[dst])                      \n\t"
    287 
    288         /* odd 4. pixel */
    289         "sb               %[tp2],         1(%[dst])                      \n\t"
    290         "sb               %[tp1],         6(%[dst])                      \n\t"
    291         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
    292         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
    293         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
    294         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
    295         "extp             %[Temp1],       $ac2,           31             \n\t"
    296 
    297         "lbu              %[tn1],         7(%[dst])                      \n\t"
    298 
    299         /* clamp */
    300         "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
    301         "addqh_r.w        %[tn2],         %[tn2],         %[p4]          \n\t"
    302 
    303         "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
    304         "addqh_r.w        %[tn3],         %[tn3],         %[p2]          \n\t"
    305 
    306         "lbux             %[n1],          %[Temp1](%[cm])                \n\t"
    307         "addqh_r.w        %[tn1],         %[tn1],         %[n1]          \n\t"
    308 
    309         /* store bytes */
    310         "sb               %[tn2],         3(%[dst])                      \n\t"
    311         "sb               %[tn3],         5(%[dst])                      \n\t"
    312         "sb               %[tn1],         7(%[dst])                      \n\t"
    313 
    314         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
    315           [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
    316           [st0] "=&r" (st0), [st1] "=&r" (st1),
    317           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    318           [n1] "=&r" (n1),
    319           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
    320         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    321           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
    322           [vector4a] "r" (vector4a),
    323           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    324     );
    325 
    326     /* Next row... */
    327     src += src_stride;
    328     dst += dst_stride;
    329   }
    330 }
    331 
    332 static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
    333                                         int32_t src_stride,
    334                                         uint8_t *dst_ptr,
    335                                         int32_t dst_stride,
    336                                         const int16_t *filter_x0,
    337                                         int32_t h,
    338                                         int32_t count) {
    339   int32_t y, c;
    340   const uint8_t *src;
    341   uint8_t *dst;
    342   uint8_t *cm = vp9_ff_cropTbl;
    343   uint32_t vector_64 = 64;
    344   int32_t filter12, filter34, filter56, filter78;
    345   int32_t Temp1, Temp2, Temp3;
    346   uint32_t qload1, qload2, qload3;
    347   uint32_t p1, p2, p3, p4, p5;
    348   uint32_t st1, st2, st3;
    349 
    350   filter12 = ((const int32_t *)filter_x0)[0];
    351   filter34 = ((const int32_t *)filter_x0)[1];
    352   filter56 = ((const int32_t *)filter_x0)[2];
    353   filter78 = ((const int32_t *)filter_x0)[3];
    354 
    355   for (y = h; y--;) {
    356     src = src_ptr;
    357     dst = dst_ptr;
    358 
    359     /* prefetch data to cache memory */
    360     vp9_prefetch_load(src_ptr + src_stride);
    361     vp9_prefetch_load(src_ptr + src_stride + 32);
    362     vp9_prefetch_store(dst_ptr + dst_stride);
    363 
    364     for (c = 0; c < count; c++) {
    365       __asm__ __volatile__ (
    366           "ulw              %[qload1],    0(%[src])                    \n\t"
    367           "ulw              %[qload2],    4(%[src])                    \n\t"
    368 
    369           /* even 1. pixel */
    370           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    371           "mthi             $zero,        $ac1                         \n\t"
    372           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    373           "mthi             $zero,        $ac2                         \n\t"
    374           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    375           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    376           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    377           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    378           "ulw              %[qload3],    8(%[src])                    \n\t"
    379           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
    380           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
    381           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
    382           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
    383           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    384           "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
    385 
    386           /* even 2. pixel */
    387           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    388           "mthi             $zero,        $ac3                         \n\t"
    389           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    390           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    391           "ulw              %[qload1],    12(%[src])                   \n\t"
    392           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
    393           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
    394           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
    395           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
    396           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
    397           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    398 
    399           "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
    400 
    401           /* even 3. pixel */
    402           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    403           "mthi             $zero,        $ac1                         \n\t"
    404           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
    405           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    406           "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
    407           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
    408           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
    409           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
    410           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
    411           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    412           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
    413 
    414           /* even 4. pixel */
    415           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    416           "mthi             $zero,        $ac2                         \n\t"
    417           "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
    418           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    419           "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
    420           "ulw              %[qload2],    16(%[src])                   \n\t"
    421           "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
    422           "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
    423           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
    424           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
    425           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
    426           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
    427           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    428           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    429 
    430           /* even 5. pixel */
    431           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    432           "mthi             $zero,        $ac3                         \n\t"
    433           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
    434           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    435           "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
    436           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
    437           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
    438           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
    439           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
    440           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    441           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    442 
    443           /* even 6. pixel */
    444           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    445           "mthi             $zero,        $ac1                         \n\t"
    446           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
    447           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    448           "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
    449           "ulw              %[qload3],    20(%[src])                   \n\t"
    450           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
    451           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
    452           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
    453           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
    454           "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
    455           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    456           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    457 
    458           /* even 7. pixel */
    459           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    460           "mthi             $zero,        $ac2                         \n\t"
    461           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
    462           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    463           "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
    464           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
    465           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
    466           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
    467           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
    468           "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
    469           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    470           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    471 
    472           "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
    473 
    474           /* even 8. pixel */
    475           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    476           "mthi             $zero,        $ac3                         \n\t"
    477           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
    478           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
    479           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
    480           "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
    481           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
    482           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
    483           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    484           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    485 
    486           /* ODD pixels */
    487           "ulw              %[qload1],    1(%[src])                   \n\t"
    488           "ulw              %[qload2],    5(%[src])                    \n\t"
    489 
    490           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
    491 
    492           /* odd 1. pixel */
    493           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    494           "mthi             $zero,        $ac1                         \n\t"
    495           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    496           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    497           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    498           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    499           "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
    500           "ulw              %[qload3],    9(%[src])                    \n\t"
    501           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
    502           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
    503           "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
    504           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
    505           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
    506           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    507           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    508 
    509           "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
    510 
    511           /* odd 2. pixel */
    512           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    513           "mthi             $zero,        $ac2                         \n\t"
    514           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
    515           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    516           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    517           "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
    518           "ulw              %[qload1],    13(%[src])                   \n\t"
    519           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
    520           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
    521           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
    522           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
    523           "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
    524           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    525           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    526 
    527           /* odd 3. pixel */
    528           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    529           "mthi             $zero,        $ac3                         \n\t"
    530           "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
    531           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    532           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
    533           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
    534           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
    535           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
    536           "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
    537           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    538           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    539 
    540           /* odd 4. pixel */
    541           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    542           "mthi             $zero,        $ac1                         \n\t"
    543           "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
    544           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    545           "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
    546           "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
    547           "ulw              %[qload2],    17(%[src])                   \n\t"
    548           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
    549           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
    550           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
    551           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
    552           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    553           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    554 
    555           "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
    556 
    557           /* odd 5. pixel */
    558           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    559           "mthi             $zero,        $ac2                         \n\t"
    560           "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
    561           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    562           "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
    563           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
    564           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
    565           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
    566           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
    567           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    568           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    569 
    570           "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
    571 
    572           /* odd 6. pixel */
    573           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    574           "mthi             $zero,        $ac3                         \n\t"
    575           "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
    576           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    577           "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
    578           "ulw              %[qload3],    21(%[src])                   \n\t"
    579           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
    580           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
    581           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
    582           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
    583           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    584           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    585 
    586           /* odd 7. pixel */
    587           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    588           "mthi             $zero,        $ac1                         \n\t"
    589           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
    590           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    591           "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
    592           "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
    593           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
    594           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
    595           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
    596           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
    597           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    598 
    599           "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
    600 
    601           /* odd 8. pixel */
    602           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
    603           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
    604           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
    605           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
    606           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    607 
    608           "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
    609 
    610           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    611           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
    612 
    613           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    614           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
    615 
    616           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    617           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
    618 
    619           "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
    620           "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
    621           "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
    622 
    623           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
    624             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
    625             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    626             [qload3] "=&r" (qload3), [p5] "=&r" (p5),
    627             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
    628           : [filter12] "r" (filter12), [filter34] "r" (filter34),
    629             [filter56] "r" (filter56), [filter78] "r" (filter78),
    630             [vector_64] "r" (vector_64),
    631             [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    632       );
    633 
    634       src += 16;
    635       dst += 16;
    636     }
    637 
    638     /* Next row... */
    639     src_ptr += src_stride;
    640     dst_ptr += dst_stride;
    641   }
    642 }
    643 
    644 static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
    645                                         int32_t src_stride,
    646                                         uint8_t *dst_ptr,
    647                                         int32_t dst_stride,
    648                                         const int16_t *filter_x0,
    649                                         int32_t h) {
    650   int32_t y, c;
    651   const uint8_t *src;
    652   uint8_t *dst;
    653   uint8_t *cm = vp9_ff_cropTbl;
    654   uint32_t vector_64 = 64;
    655   int32_t filter12, filter34, filter56, filter78;
    656   int32_t Temp1, Temp2, Temp3;
    657   uint32_t qload1, qload2, qload3;
    658   uint32_t p1, p2, p3, p4, p5;
    659   uint32_t st1, st2, st3;
    660 
    661   filter12 = ((const int32_t *)filter_x0)[0];
    662   filter34 = ((const int32_t *)filter_x0)[1];
    663   filter56 = ((const int32_t *)filter_x0)[2];
    664   filter78 = ((const int32_t *)filter_x0)[3];
    665 
    666   for (y = h; y--;) {
    667     src = src_ptr;
    668     dst = dst_ptr;
    669 
    670     /* prefetch data to cache memory */
    671     vp9_prefetch_load(src_ptr + src_stride);
    672     vp9_prefetch_load(src_ptr + src_stride + 32);
    673     vp9_prefetch_load(src_ptr + src_stride + 64);
    674     vp9_prefetch_store(dst_ptr + dst_stride);
    675     vp9_prefetch_store(dst_ptr + dst_stride + 32);
    676 
    677     for (c = 0; c < 4; c++) {
    678       __asm__ __volatile__ (
    679           "ulw              %[qload1],    0(%[src])                    \n\t"
    680           "ulw              %[qload2],    4(%[src])                    \n\t"
    681 
    682           /* even 1. pixel */
    683           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    684           "mthi             $zero,        $ac1                         \n\t"
    685           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    686           "mthi             $zero,        $ac2                         \n\t"
    687           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    688           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    689           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    690           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    691           "ulw              %[qload3],    8(%[src])                    \n\t"
    692           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
    693           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
    694           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
    695           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
    696           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    697           "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
    698 
    699           /* even 2. pixel */
    700           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    701           "mthi             $zero,        $ac3                         \n\t"
    702           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    703           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    704           "ulw              %[qload1],    12(%[src])                   \n\t"
    705           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
    706           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
    707           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
    708           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
    709           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
    710           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    711 
    712           "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
    713 
    714           /* even 3. pixel */
    715           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    716           "mthi             $zero,        $ac1                         \n\t"
    717           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
    718           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    719           "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
    720           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
    721           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
    722           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
    723           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
    724           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    725           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
    726 
    727           /* even 4. pixel */
    728           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    729           "mthi             $zero,        $ac2                         \n\t"
    730           "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
    731           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    732           "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
    733           "ulw              %[qload2],    16(%[src])                   \n\t"
    734           "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
    735           "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
    736           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
    737           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
    738           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
    739           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
    740           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    741           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    742 
    743           /* even 5. pixel */
    744           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    745           "mthi             $zero,        $ac3                         \n\t"
    746           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
    747           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    748           "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
    749           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
    750           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
    751           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
    752           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
    753           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    754           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    755 
    756           /* even 6. pixel */
    757           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    758           "mthi             $zero,        $ac1                         \n\t"
    759           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
    760           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    761           "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
    762           "ulw              %[qload3],    20(%[src])                   \n\t"
    763           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
    764           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
    765           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
    766           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
    767           "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
    768           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    769           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    770 
    771           /* even 7. pixel */
    772           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    773           "mthi             $zero,        $ac2                         \n\t"
    774           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
    775           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    776           "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
    777           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
    778           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
    779           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
    780           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
    781           "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
    782           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    783           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    784 
    785           "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
    786 
    787           /* even 8. pixel */
    788           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    789           "mthi             $zero,        $ac3                         \n\t"
    790           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
    791           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
    792           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
    793           "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
    794           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
    795           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
    796           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    797           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    798 
    799           /* ODD pixels */
    800           "ulw              %[qload1],    1(%[src])                   \n\t"
    801           "ulw              %[qload2],    5(%[src])                    \n\t"
    802 
    803           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
    804 
    805           /* odd 1. pixel */
    806           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    807           "mthi             $zero,        $ac1                         \n\t"
    808           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    809           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    810           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    811           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    812           "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
    813           "ulw              %[qload3],    9(%[src])                    \n\t"
    814           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
    815           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
    816           "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
    817           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
    818           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
    819           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    820           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    821 
    822           "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
    823 
    824           /* odd 2. pixel */
    825           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    826           "mthi             $zero,        $ac2                         \n\t"
    827           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
    828           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    829           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    830           "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
    831           "ulw              %[qload1],    13(%[src])                   \n\t"
    832           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
    833           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
    834           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
    835           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
    836           "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
    837           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    838           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    839 
    840           /* odd 3. pixel */
    841           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    842           "mthi             $zero,        $ac3                         \n\t"
    843           "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
    844           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    845           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
    846           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
    847           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
    848           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
    849           "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
    850           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    851           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    852 
    853           /* odd 4. pixel */
    854           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    855           "mthi             $zero,        $ac1                         \n\t"
    856           "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
    857           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    858           "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
    859           "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
    860           "ulw              %[qload2],    17(%[src])                   \n\t"
    861           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
    862           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
    863           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
    864           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
    865           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    866           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    867 
    868           "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
    869 
    870           /* odd 5. pixel */
    871           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    872           "mthi             $zero,        $ac2                         \n\t"
    873           "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
    874           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    875           "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
    876           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
    877           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
    878           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
    879           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
    880           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    881           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    882 
    883           "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
    884 
    885           /* odd 6. pixel */
    886           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    887           "mthi             $zero,        $ac3                         \n\t"
    888           "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
    889           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    890           "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
    891           "ulw              %[qload3],    21(%[src])                   \n\t"
    892           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
    893           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
    894           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
    895           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
    896           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    897           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    898 
    899           /* odd 7. pixel */
    900           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    901           "mthi             $zero,        $ac1                         \n\t"
    902           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
    903           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    904           "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
    905           "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
    906           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
    907           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
    908           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
    909           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
    910           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    911 
    912           "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
    913 
    914           /* odd 8. pixel */
    915           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
    916           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
    917           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
    918           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
    919           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    920 
    921           "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
    922 
    923           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    924           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
    925 
    926           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    927           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
    928 
    929           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    930           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
    931 
    932           "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
    933           "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
    934           "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
    935 
    936           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
    937             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
    938             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    939             [qload3] "=&r" (qload3), [p5] "=&r" (p5),
    940             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
    941           : [filter12] "r" (filter12), [filter34] "r" (filter34),
    942             [filter56] "r" (filter56), [filter78] "r" (filter78),
    943             [vector_64] "r" (vector_64),
    944             [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    945       );
    946 
    947       src += 16;
    948       dst += 16;
    949     }
    950 
    951     /* Next row... */
    952     src_ptr += src_stride;
    953     dst_ptr += dst_stride;
    954   }
    955 }
    956 
    957 void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    958                                    uint8_t *dst, ptrdiff_t dst_stride,
    959                                    const int16_t *filter_x, int x_step_q4,
    960                                    const int16_t *filter_y, int y_step_q4,
    961                                    int w, int h) {
    962   if (((const int32_t *)filter_x)[1] == 0x800000) {
    963     vp9_convolve_avg(src, src_stride,
    964                      dst, dst_stride,
    965                      filter_x, x_step_q4,
    966                      filter_y, y_step_q4,
    967                      w, h);
    968   } else if (((const int32_t *)filter_x)[0] == 0) {
    969     vp9_convolve2_avg_horiz_dspr2(src, src_stride,
    970                                   dst, dst_stride,
    971                                   filter_x, x_step_q4,
    972                                   filter_y, y_step_q4,
    973                                   w, h);
    974   } else {
    975     if (16 == x_step_q4) {
    976       uint32_t pos = 38;
    977 
    978       src -= 3;
    979 
    980       /* bit positon for extract from acc */
    981       __asm__ __volatile__ (
    982         "wrdsp      %[pos],     1           \n\t"
    983         :
    984         : [pos] "r" (pos)
    985       );
    986 
    987       /* prefetch data to cache memory */
    988       vp9_prefetch_load(src);
    989       vp9_prefetch_load(src + 32);
    990       vp9_prefetch_store(dst);
    991 
    992       switch (w) {
    993         case 4:
    994           convolve_avg_horiz_4_dspr2(src, src_stride,
    995                                      dst, dst_stride,
    996                                      filter_x, h);
    997           break;
    998         case 8:
    999           convolve_avg_horiz_8_dspr2(src, src_stride,
   1000                                      dst, dst_stride,
   1001                                      filter_x, h);
   1002           break;
   1003         case 16:
   1004           convolve_avg_horiz_16_dspr2(src, src_stride,
   1005                                       dst, dst_stride,
   1006                                       filter_x, h, 1);
   1007           break;
   1008         case 32:
   1009           convolve_avg_horiz_16_dspr2(src, src_stride,
   1010                                       dst, dst_stride,
   1011                                       filter_x, h, 2);
   1012           break;
   1013         case 64:
   1014           vp9_prefetch_load(src + 64);
   1015           vp9_prefetch_store(dst + 32);
   1016 
   1017           convolve_avg_horiz_64_dspr2(src, src_stride,
   1018                                       dst, dst_stride,
   1019                                       filter_x, h);
   1020           break;
   1021         default:
   1022           vp9_convolve8_avg_horiz_c(src + 3, src_stride,
   1023                                     dst, dst_stride,
   1024                                     filter_x, x_step_q4,
   1025                                     filter_y, y_step_q4,
   1026                                     w, h);
   1027           break;
   1028       }
   1029     } else {
   1030       vp9_convolve8_avg_horiz_c(src, src_stride,
   1031                                 dst, dst_stride,
   1032                                 filter_x, x_step_q4,
   1033                                 filter_y, y_step_q4,
   1034                                 w, h);
   1035     }
   1036   }
   1037 }
   1038 #endif
   1039