Home | History | Annotate | Download | only in mips
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <stdio.h>
     13 
     14 #include "./vpx_dsp_rtcd.h"
     15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
     16 #include "vpx_dsp/vpx_convolve.h"
     17 #include "vpx_dsp/vpx_dsp_common.h"
     18 #include "vpx_ports/mem.h"
     19 
     20 #if HAVE_DSPR2
     /*
      * Horizontal 8-tap convolution of a 4-pixel-wide block, averaged into dst,
      * implemented with MIPS DSPr2 inline assembly.
      *
      * For each of the h rows, 12 bytes are loaded from src and four output
      * pixels are produced. Every pixel is built the same way: an accumulator is
      * preset to 64, four dpa.w.ph dot-products add pixel pairs multiplied by the
      * packed filter words, extp pulls the result out of the accumulator, the
      * value is mapped through the vpx_ff_cropTbl lookup table, and the table
      * output is averaged (addqh_r.w) with the byte already stored in dst before
      * being written back with sb.
      *
      * src        - source pixels for the current row; advanced by src_stride
      *              after every row.
      * src_stride - byte distance between consecutive source rows.
      * dst        - destination pixels; dst[0..3] of each row are read, averaged
      *              with the filter output and overwritten.
      * dst_stride - byte distance between consecutive destination rows.
      * filter_x0  - eight 16-bit taps, read here as four 32-bit words (two taps
      *              per word).
      * h          - number of rows to process.
      *
      * NOTE(review): extp takes its bit position from the DSPControl register,
      * which is not written anywhere in this function - presumably the caller
      * configures it; confirm.
      * NOTE(review): filter_x0 is dereferenced through an int32_t pointer, which
      * assumes the tap array is 4-byte aligned - confirm against the callers.
      * NOTE(review): the asm also modifies accumulators $ac2 and $ac3 without
      * listing them as clobbers; confirm the toolchain's register names before
      * adding them.
      */
     21 static void convolve_avg_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
     22                                        uint8_t *dst, int32_t dst_stride,
     23                                        const int16_t *filter_x0, int32_t h) {
     24   int32_t y;
     25   uint8_t *cm = vpx_ff_cropTbl;
     26   int32_t vector1b, vector2b, vector3b, vector4b;
     27   int32_t Temp1, Temp2, Temp3, Temp4;
     28   uint32_t vector4a = 64;
     29   uint32_t tp1, tp2;
     30   uint32_t p1, p2, p3, p4;
     31   uint32_t n1, n2, n3, n4;
     32   uint32_t tn1, tn2;
     33 
          /* Each 32-bit word holds two adjacent 16-bit taps, matching the
           * paired-halfword operands consumed by dpa.w.ph below. */
     34   vector1b = ((const int32_t *)filter_x0)[0];
     35   vector2b = ((const int32_t *)filter_x0)[1];
     36   vector3b = ((const int32_t *)filter_x0)[2];
     37   vector4b = ((const int32_t *)filter_x0)[3];
     38 
     39   for (y = h; y--;) {
     40     /* prefetch data to cache memory */
     41     prefetch_load(src + src_stride);
     42     prefetch_load(src + src_stride + 32);
     43     prefetch_store(dst + dst_stride);
     44 
     45     __asm__ __volatile__(
                /* unaligned loads of source bytes 0..7; bytes 8..11 follow below */
     46         "ulw              %[tp1],         0(%[src])                      \n\t"
     47         "ulw              %[tp2],         4(%[src])                      \n\t"
     48 
     49         /* even 1. pixel */
     50         "mtlo             %[vector4a],    $ac3                           \n\t"
     51         "mthi             $zero,          $ac3                           \n\t"
     52         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
     53         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
     54         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
     55         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
     56         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
     57         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
     58         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
     59         "ulw              %[tn2],         8(%[src])                      \n\t"
     60         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
     61         "extp             %[Temp1],       $ac3,           31             \n\t"
     62 
     63         /* even 2. pixel */
     64         "mtlo             %[vector4a],    $ac2                           \n\t"
     65         "mthi             $zero,          $ac2                           \n\t"
     66         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
                /* one-byte-shifted copies of the source words, consumed by the
                 * odd pixels; the order matters because each balign reads a
                 * register that a later balign overwrites */
     67         "balign           %[tn1],         %[tn2],         3              \n\t"
     68         "balign           %[tn2],         %[tp2],         3              \n\t"
     69         "balign           %[tp2],         %[tp1],         3              \n\t"
     70         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
     71         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
     72         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
     73         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
     74         "extp             %[Temp3],       $ac2,           31             \n\t"
     75 
     76         "lbu              %[p2],          3(%[dst])                      \n\t" /* load odd 2 */
     77 
     78         /* odd 1. pixel */
     79         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t" /* even 1 */
     80         "mtlo             %[vector4a],    $ac3                           \n\t"
     81         "mthi             $zero,          $ac3                           \n\t"
     82         "lbu              %[Temp1],       1(%[dst])                      \n\t" /* load odd 1 */
     83         "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
     84         "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
     85         "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
     86         "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
     87         "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
     88         "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
     89         "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
     90         "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
     91         "extp             %[Temp2],       $ac3,           31             \n\t"
     92 
     93         "lbu              %[tn2],         0(%[dst])                      \n\t" /* load even 1 */
     94 
     95         /* odd 2. pixel */
     96         "lbux             %[tp2],         %[Temp3](%[cm])                \n\t" /* even 2 */
     97         "mtlo             %[vector4a],    $ac2                           \n\t"
     98         "mthi             $zero,          $ac2                           \n\t"
     99         "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
    100         "lbux             %[tn1],         %[Temp2](%[cm])                \n\t" /* odd 1 */
    101         "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t" /* average even 1 */
    102         "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
    103         "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
    104         "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
    105         "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
    106         "extp             %[Temp4],       $ac2,           31             \n\t"
    107 
    108         "lbu              %[tp1],         2(%[dst])                      \n\t" /* load even 2 */
    109         "sb               %[tn2],         0(%[dst])                      \n\t" /* store even 1 */
    110 
    111         /* clamp */
    112         "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t" /* average odd 1 */
    113         "lbux             %[n2],          %[Temp4](%[cm])                \n\t" /* odd 2 */
    114         "sb               %[Temp1],       1(%[dst])                      \n\t" /* store odd 1 */
    115 
    116         "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t" /* average even 2 */
    117         "sb               %[tp1],         2(%[dst])                      \n\t" /* store even 2 */
    118 
    119         "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t" /* average odd 2 */
    120         "sb               %[p2],          3(%[dst])                      \n\t" /* store odd 2 */
    121 
    122         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
    123           [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
    124           [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
    125           [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
    126           [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
    127         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    128           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
    129           [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
    130           [src] "r"(src)
                /* src, cm and dst are passed as plain register operands, yet the
                 * asm loads and stores through them; the "memory" clobber tells
                 * the compiler that memory is read and written here. */
                : "memory");
    131 
    132     /* Next row... */
    133     src += src_stride;
    134     dst += dst_stride;
    135   }
    136 }
    137 
    /*
     * Horizontal 8-tap convolution of an 8-pixel-wide block, averaged into dst,
     * implemented with MIPS DSPr2 inline assembly.
     *
     * For each of the h rows, 16 bytes are loaded from src and eight output
     * pixels are produced: the "even" results go to dst[0], dst[2], dst[4] and
     * dst[6], the "odd" results to dst[1], dst[3], dst[5] and dst[7]. Every
     * pixel is built the same way: an accumulator is preset to 64, four
     * dpa.w.ph dot-products add pixel pairs multiplied by the packed filter
     * words, extp pulls the result out of the accumulator, the value is mapped
     * through the vpx_ff_cropTbl lookup table, and the table output is averaged
     * (addqh_r.w) with the byte already stored in dst before being written back
     * with sb.
     *
     * The instruction stream is interleaved: accumulators are frequently preset
     * one section before the section that accumulates into them, so the section
     * comments name the pixel being accumulated, not necessarily the
     * accumulator being initialised.
     *
     * src        - source pixels for the current row; advanced by src_stride
     *              after every row.
     * src_stride - byte distance between consecutive source rows.
     * dst        - destination pixels; dst[0..7] of each row are read, averaged
     *              with the filter output and overwritten.
     * dst_stride - byte distance between consecutive destination rows.
     * filter_x0  - eight 16-bit taps, read here as four 32-bit words (two taps
     *              per word).
     * h          - number of rows to process.
     *
     * NOTE(review): extp takes its bit position from the DSPControl register,
     * which is not written anywhere in this function - presumably the caller
     * configures it; confirm.
     * NOTE(review): filter_x0 is dereferenced through an int32_t pointer, which
     * assumes the tap array is 4-byte aligned - confirm against the callers.
     * NOTE(review): the asm also modifies accumulators $ac1, $ac2 and $ac3
     * without listing them as clobbers; confirm the toolchain's register names
     * before adding them.
     */
    138 static void convolve_avg_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
    139                                        uint8_t *dst, int32_t dst_stride,
    140                                        const int16_t *filter_x0, int32_t h) {
    141   int32_t y;
    142   uint8_t *cm = vpx_ff_cropTbl;
    143   uint32_t vector4a = 64;
    144   int32_t vector1b, vector2b, vector3b, vector4b;
    145   int32_t Temp1, Temp2, Temp3;
    146   uint32_t tp1, tp2;
    147   uint32_t p1, p2, p3, p4, n1;
    148   uint32_t tn1, tn2, tn3;
    149   uint32_t st0, st1;
    150 
          /* Each 32-bit word holds two adjacent 16-bit taps, matching the
           * paired-halfword operands consumed by dpa.w.ph below. */
    151   vector1b = ((const int32_t *)filter_x0)[0];
    152   vector2b = ((const int32_t *)filter_x0)[1];
    153   vector3b = ((const int32_t *)filter_x0)[2];
    154   vector4b = ((const int32_t *)filter_x0)[3];
    155 
    156   for (y = h; y--;) {
    157     /* prefetch data to cache memory */
    158     prefetch_load(src + src_stride);
    159     prefetch_load(src + src_stride + 32);
    160     prefetch_store(dst + dst_stride);
    161 
    162     __asm__ __volatile__(
                /* unaligned loads of source bytes 0..7; bytes 8..15 follow below */
    163         "ulw              %[tp1],         0(%[src])                      \n\t"
    164         "ulw              %[tp2],         4(%[src])                      \n\t"
    165 
    166         /* even 1. pixel */
    167         "mtlo             %[vector4a],    $ac3                           \n\t"
    168         "mthi             $zero,          $ac3                           \n\t"
    169         "mtlo             %[vector4a],    $ac2                           \n\t"
    170         "mthi             $zero,          $ac2                           \n\t"
    171         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
    172         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
    173         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
    174         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
    175         "ulw              %[tn2],         8(%[src])                      \n\t"
    176         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
    177         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
    178         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
    179         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
    180         "extp             %[Temp1],       $ac3,           31             \n\t"
                /* current dst[0] and dst[2], averaged with even 1 / even 2 below */
    181         "lbu              %[Temp2],       0(%[dst])                      \n\t"
    182         "lbu              %[tn3],         2(%[dst])                      \n\t"
    183 
    184         /* even 2. pixel */
    185         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
    186         "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
    187         "ulw              %[tn1],         12(%[src])                     \n\t"
    188         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
    189         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
    190         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
    191         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
    192         "extp             %[Temp3],       $ac2,           31             \n\t"
    193 
    194         /* even 3. pixel */
    195         "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
    196         "mtlo             %[vector4a],    $ac1                           \n\t"
    197         "mthi             $zero,          $ac1                           \n\t"
    198         "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
    199         "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
    200         "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
    201         "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
    202         "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
    203         "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
    204         "extp             %[Temp1],       $ac1,           31             \n\t"
    205 
                /* average and store even 1 (dst[0]) and even 2 (dst[2]) */
    206         "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
    207         "addqh_r.w        %[tn3],         %[tn3],         %[st1]         \n\t"
    208         "sb               %[Temp2],       0(%[dst])                      \n\t"
    209         "sb               %[tn3],         2(%[dst])                      \n\t"
    210 
    211         /* even 4. pixel */
    212         "mtlo             %[vector4a],    $ac2                           \n\t"
    213         "mthi             $zero,          $ac2                           \n\t"
    214         "mtlo             %[vector4a],    $ac3                           \n\t"
    215         "mthi             $zero,          $ac3                           \n\t"
    216 
                /* one-byte-shifted copies of the source words, consumed by the
                 * odd pixels; the order matters because each balign reads a
                 * register that a later balign overwrites */
    217         "balign           %[tn3],         %[tn1],         3              \n\t"
    218         "balign           %[tn1],         %[tn2],         3              \n\t"
    219         "balign           %[tn2],         %[tp2],         3              \n\t"
    220         "balign           %[tp2],         %[tp1],         3              \n\t"
    221 
                /* even 3: table lookup, then average with current dst[4] */
    222         "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
    223         "lbu              %[Temp2],       4(%[dst])                      \n\t"
    224         "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
    225 
    226         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
    227         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
    228         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
    229         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
    230         "extp             %[Temp3],       $ac2,           31             \n\t"
    231 
    232         /* odd 1. pixel */
    233         "mtlo             %[vector4a],    $ac1                           \n\t"
    234         "mthi             $zero,          $ac1                           \n\t"
    235         "sb               %[Temp2],       4(%[dst])                      \n\t"
    236         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
    237         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
    238         "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
    239         "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
    240         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
    241         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
    242         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
    243         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
    244         "extp             %[Temp2],       $ac3,           31             \n\t"
    245 
                /* current dst[6], averaged with even 4 below */
    246         "lbu              %[tp1],         6(%[dst])                      \n\t"
    247 
    248         /* odd 2. pixel */
    249         "mtlo             %[vector4a],    $ac3                           \n\t"
    250         "mthi             $zero,          $ac3                           \n\t"
    251         "mtlo             %[vector4a],    $ac2                           \n\t"
    252         "mthi             $zero,          $ac2                           \n\t"
    253         "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
    254         "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
    255         "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
    256         "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
    257         "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
    258         "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
    259         "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
    260         "extp             %[Temp3],       $ac1,           31             \n\t"
    261 
                /* current dst[1] and dst[3]; also average even 4 into tp1 */
    262         "lbu              %[tp2],         1(%[dst])                      \n\t"
    263         "lbu              %[tn2],         3(%[dst])                      \n\t"
    264         "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
    265 
    266         /* odd 3. pixel */
    267         "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
    268         "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
    269         "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
    270         "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
    271         "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
    272         "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
    273         "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
    274         "extp             %[Temp2],       $ac3,           31             \n\t"
    275 
                /* current dst[5], averaged with odd 3 below */
    276         "lbu              %[tn3],         5(%[dst])                      \n\t"
    277 
    278         /* odd 4. pixel */
    279         "sb               %[tp2],         1(%[dst])                      \n\t"
    280         "sb               %[tp1],         6(%[dst])                      \n\t"
    281         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
    282         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
    283         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
    284         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
    285         "extp             %[Temp1],       $ac2,           31             \n\t"
    286 
                /* current dst[7], averaged with odd 4 below */
    287         "lbu              %[tn1],         7(%[dst])                      \n\t"
    288 
    289         /* clamp */
    290         "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
    291         "addqh_r.w        %[tn2],         %[tn2],         %[p4]          \n\t"
    292 
    293         "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
    294         "addqh_r.w        %[tn3],         %[tn3],         %[p2]          \n\t"
    295 
    296         "lbux             %[n1],          %[Temp1](%[cm])                \n\t"
    297         "addqh_r.w        %[tn1],         %[tn1],         %[n1]          \n\t"
    298 
    299         /* store bytes */
    300         "sb               %[tn2],         3(%[dst])                      \n\t"
    301         "sb               %[tn3],         5(%[dst])                      \n\t"
    302         "sb               %[tn1],         7(%[dst])                      \n\t"
    303 
    304         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
    305           [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
    306           [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
    307           [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
    308           [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
    309         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    310           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
    311           [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
    312           [src] "r"(src)
                /* src, cm and dst are passed as plain register operands, yet the
                 * asm loads and stores through them; the "memory" clobber tells
                 * the compiler that memory is read and written here. */
                : "memory");
    313 
    314     /* Next row... */
    315     src += src_stride;
    316     dst += dst_stride;
    317   }
    318 }
    319 
    320 static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
    321                                         int32_t src_stride, uint8_t *dst_ptr,
    322                                         int32_t dst_stride,
    323                                         const int16_t *filter_x0, int32_t h,
    324                                         int32_t count) {
    325   int32_t y, c;
    326   const uint8_t *src;
    327   uint8_t *dst;
    328   uint8_t *cm = vpx_ff_cropTbl;
    329   uint32_t vector_64 = 64;
    330   int32_t filter12, filter34, filter56, filter78;
    331   int32_t Temp1, Temp2, Temp3;
    332   uint32_t qload1, qload2, qload3;
    333   uint32_t p1, p2, p3, p4, p5;
    334   uint32_t st1, st2, st3;
    335 
    336   filter12 = ((const int32_t *)filter_x0)[0];
    337   filter34 = ((const int32_t *)filter_x0)[1];
    338   filter56 = ((const int32_t *)filter_x0)[2];
    339   filter78 = ((const int32_t *)filter_x0)[3];
    340 
    341   for (y = h; y--;) {
    342     src = src_ptr;
    343     dst = dst_ptr;
    344 
    345     /* prefetch data to cache memory */
    346     prefetch_load(src_ptr + src_stride);
    347     prefetch_load(src_ptr + src_stride + 32);
    348     prefetch_store(dst_ptr + dst_stride);
    349 
    350     for (c = 0; c < count; c++) {
    351       __asm__ __volatile__(
    352           "ulw              %[qload1],    0(%[src])                    \n\t"
    353           "ulw              %[qload2],    4(%[src])                    \n\t"
    354 
    355           /* even 1. pixel */
    356           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    357           "mthi             $zero,        $ac1                         \n\t"
    358           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    359           "mthi             $zero,        $ac2                         \n\t"
    360           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    361           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    362           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    363           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    364           "ulw              %[qload3],    8(%[src])                    \n\t"
    365           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
    366           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
    367           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
    368           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
    369           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    370           "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
    371 
    372           /* even 2. pixel */
    373           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    374           "mthi             $zero,        $ac3                         \n\t"
    375           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    376           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    377           "ulw              %[qload1],    12(%[src])                   \n\t"
    378           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
    379           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
    380           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
    381           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
    382           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
    383           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    384 
    385           "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
    386 
    387           /* even 3. pixel */
    388           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    389           "mthi             $zero,        $ac1                         \n\t"
    390           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
    391           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    392           "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
    393           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
    394           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
    395           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
    396           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
    397           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    398           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
    399 
    400           /* even 4. pixel */
    401           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    402           "mthi             $zero,        $ac2                         \n\t"
    403           "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
    404           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    405           "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
    406           "ulw              %[qload2],    16(%[src])                   \n\t"
    407           "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
    408           "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
    409           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
    410           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
    411           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
    412           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
    413           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    414           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    415 
    416           /* even 5. pixel */
    417           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    418           "mthi             $zero,        $ac3                         \n\t"
    419           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
    420           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    421           "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
    422           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
    423           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
    424           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
    425           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
    426           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    427           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    428 
    429           /* even 6. pixel */
    430           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    431           "mthi             $zero,        $ac1                         \n\t"
    432           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
    433           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    434           "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
    435           "ulw              %[qload3],    20(%[src])                   \n\t"
    436           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
    437           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
    438           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
    439           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
    440           "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
    441           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    442           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    443 
    444           /* even 7. pixel */
    445           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    446           "mthi             $zero,        $ac2                         \n\t"
    447           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
    448           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    449           "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
    450           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
    451           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
    452           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
    453           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
    454           "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
    455           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    456           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    457 
    458           "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
    459 
    460           /* even 8. pixel */
    461           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    462           "mthi             $zero,        $ac3                         \n\t"
    463           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
    464           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
    465           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
    466           "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
    467           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
    468           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
    469           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    470           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    471 
    472           /* ODD pixels */
    473           "ulw              %[qload1],    1(%[src])                   \n\t"
    474           "ulw              %[qload2],    5(%[src])                    \n\t"
    475 
    476           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
    477 
    478           /* odd 1. pixel */
    479           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    480           "mthi             $zero,        $ac1                         \n\t"
    481           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    482           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    483           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    484           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    485           "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
    486           "ulw              %[qload3],    9(%[src])                    \n\t"
    487           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
    488           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
    489           "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
    490           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
    491           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
    492           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    493           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    494 
    495           "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
    496 
    497           /* odd 2. pixel */
    498           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    499           "mthi             $zero,        $ac2                         \n\t"
    500           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
    501           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    502           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    503           "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
    504           "ulw              %[qload1],    13(%[src])                   \n\t"
    505           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
    506           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
    507           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
    508           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
    509           "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
    510           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    511           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    512 
    513           /* odd 3. pixel */
    514           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    515           "mthi             $zero,        $ac3                         \n\t"
    516           "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
    517           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    518           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
    519           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
    520           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
    521           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
    522           "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
    523           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    524           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    525 
    526           /* odd 4. pixel */
    527           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    528           "mthi             $zero,        $ac1                         \n\t"
    529           "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
    530           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    531           "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
    532           "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
    533           "ulw              %[qload2],    17(%[src])                   \n\t"
    534           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
    535           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
    536           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
    537           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
    538           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    539           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    540 
    541           "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
    542 
    543           /* odd 5. pixel */
    544           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    545           "mthi             $zero,        $ac2                         \n\t"
    546           "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
    547           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    548           "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
    549           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
    550           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
    551           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
    552           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
    553           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    554           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    555 
    556           "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
    557 
    558           /* odd 6. pixel */
    559           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    560           "mthi             $zero,        $ac3                         \n\t"
    561           "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
    562           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    563           "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
    564           "ulw              %[qload3],    21(%[src])                   \n\t"
    565           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
    566           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
    567           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
    568           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
    569           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    570           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    571 
    572           /* odd 7. pixel */
    573           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    574           "mthi             $zero,        $ac1                         \n\t"
    575           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
    576           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    577           "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
    578           "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
    579           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
    580           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
    581           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
    582           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
    583           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    584 
    585           "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
    586 
    587           /* odd 8. pixel */
    588           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
    589           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
    590           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
    591           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
    592           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    593 
    594           "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
    595 
    596           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    597           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
    598 
    599           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    600           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
    601 
    602           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    603           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
    604 
    605           "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
    606           "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
    607           "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
    608 
    609           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
    610             [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
    611             [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
    612             [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
    613             [Temp3] "=&r"(Temp3)
    614           : [filter12] "r"(filter12), [filter34] "r"(filter34),
    615             [filter56] "r"(filter56), [filter78] "r"(filter78),
    616             [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
    617             [src] "r"(src));
    618 
    619       src += 16;
    620       dst += 16;
    621     }
    622 
    623     /* Next row... */
    624     src_ptr += src_stride;
    625     dst_ptr += dst_stride;
    626   }
    627 }
    628 
    629 static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
    630                                         int32_t src_stride, uint8_t *dst_ptr,
    631                                         int32_t dst_stride,
    632                                         const int16_t *filter_x0, int32_t h) {
    633   int32_t y, c;
    634   const uint8_t *src;
    635   uint8_t *dst;
    636   uint8_t *cm = vpx_ff_cropTbl;
    637   uint32_t vector_64 = 64;
    638   int32_t filter12, filter34, filter56, filter78;
    639   int32_t Temp1, Temp2, Temp3;
    640   uint32_t qload1, qload2, qload3;
    641   uint32_t p1, p2, p3, p4, p5;
    642   uint32_t st1, st2, st3;
    643 
    644   filter12 = ((const int32_t *)filter_x0)[0];
    645   filter34 = ((const int32_t *)filter_x0)[1];
    646   filter56 = ((const int32_t *)filter_x0)[2];
    647   filter78 = ((const int32_t *)filter_x0)[3];
    648 
    649   for (y = h; y--;) {
    650     src = src_ptr;
    651     dst = dst_ptr;
    652 
    653     /* prefetch data to cache memory */
    654     prefetch_load(src_ptr + src_stride);
    655     prefetch_load(src_ptr + src_stride + 32);
    656     prefetch_load(src_ptr + src_stride + 64);
    657     prefetch_store(dst_ptr + dst_stride);
    658     prefetch_store(dst_ptr + dst_stride + 32);
    659 
    660     for (c = 0; c < 4; c++) {
    661       __asm__ __volatile__(
    662           "ulw              %[qload1],    0(%[src])                    \n\t"
    663           "ulw              %[qload2],    4(%[src])                    \n\t"
    664 
    665           /* even 1. pixel */
    666           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    667           "mthi             $zero,        $ac1                         \n\t"
    668           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    669           "mthi             $zero,        $ac2                         \n\t"
    670           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    671           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    672           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    673           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    674           "ulw              %[qload3],    8(%[src])                    \n\t"
    675           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
    676           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
    677           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
    678           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
    679           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    680           "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
    681 
    682           /* even 2. pixel */
    683           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    684           "mthi             $zero,        $ac3                         \n\t"
    685           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    686           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    687           "ulw              %[qload1],    12(%[src])                   \n\t"
    688           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
    689           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
    690           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
    691           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
    692           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
    693           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    694 
    695           "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
    696 
    697           /* even 3. pixel */
    698           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    699           "mthi             $zero,        $ac1                         \n\t"
    700           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
    701           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    702           "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
    703           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
    704           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
    705           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
    706           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
    707           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    708           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
    709 
    710           /* even 4. pixel */
    711           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    712           "mthi             $zero,        $ac2                         \n\t"
    713           "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
    714           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    715           "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
    716           "ulw              %[qload2],    16(%[src])                   \n\t"
    717           "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
    718           "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
    719           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
    720           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
    721           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
    722           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
    723           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    724           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    725 
    726           /* even 5. pixel */
    727           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    728           "mthi             $zero,        $ac3                         \n\t"
    729           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
    730           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    731           "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
    732           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
    733           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
    734           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
    735           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
    736           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    737           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    738 
    739           /* even 6. pixel */
    740           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    741           "mthi             $zero,        $ac1                         \n\t"
    742           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
    743           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    744           "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
    745           "ulw              %[qload3],    20(%[src])                   \n\t"
    746           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
    747           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
    748           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
    749           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
    750           "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
    751           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    752           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    753 
    754           /* even 7. pixel */
    755           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    756           "mthi             $zero,        $ac2                         \n\t"
    757           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
    758           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    759           "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
    760           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
    761           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
    762           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
    763           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
    764           "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
    765           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    766           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    767 
    768           "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
    769 
    770           /* even 8. pixel */
    771           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    772           "mthi             $zero,        $ac3                         \n\t"
    773           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
    774           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
    775           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
    776           "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
    777           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
    778           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
    779           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    780           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    781 
    782           /* ODD pixels */
    783           "ulw              %[qload1],    1(%[src])                   \n\t"
    784           "ulw              %[qload2],    5(%[src])                    \n\t"
    785 
    786           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
    787 
    788           /* odd 1. pixel */
    789           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    790           "mthi             $zero,        $ac1                         \n\t"
    791           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    792           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    793           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    794           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    795           "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
    796           "ulw              %[qload3],    9(%[src])                    \n\t"
    797           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
    798           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
    799           "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
    800           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
    801           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
    802           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    803           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    804 
    805           "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
    806 
    807           /* odd 2. pixel */
    808           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    809           "mthi             $zero,        $ac2                         \n\t"
    810           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
    811           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    812           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    813           "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
    814           "ulw              %[qload1],    13(%[src])                   \n\t"
    815           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
    816           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
    817           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
    818           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
    819           "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
    820           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    821           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    822 
    823           /* odd 3. pixel */
    824           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    825           "mthi             $zero,        $ac3                         \n\t"
    826           "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
    827           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    828           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
    829           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
    830           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
    831           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
    832           "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
    833           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    834           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    835 
    836           /* odd 4. pixel */
    837           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    838           "mthi             $zero,        $ac1                         \n\t"
    839           "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
    840           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    841           "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
    842           "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
    843           "ulw              %[qload2],    17(%[src])                   \n\t"
    844           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
    845           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
    846           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
    847           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
    848           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    849           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    850 
    851           "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
    852 
    853           /* odd 5. pixel */
    854           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    855           "mthi             $zero,        $ac2                         \n\t"
    856           "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
    857           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    858           "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
    859           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
    860           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
    861           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
    862           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
    863           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    864           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    865 
    866           "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
    867 
    868           /* odd 6. pixel */
    869           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    870           "mthi             $zero,        $ac3                         \n\t"
    871           "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
    872           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    873           "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
    874           "ulw              %[qload3],    21(%[src])                   \n\t"
    875           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
    876           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
    877           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
    878           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
    879           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    880           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    881 
    882           /* odd 7. pixel */
    883           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    884           "mthi             $zero,        $ac1                         \n\t"
    885           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
    886           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    887           "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
    888           "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
    889           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
    890           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
    891           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
    892           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
    893           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    894 
    895           "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
    896 
    897           /* odd 8. pixel */
    898           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
    899           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
    900           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
    901           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
    902           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    903 
    904           "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
    905 
    906           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    907           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
    908 
    909           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    910           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
    911 
    912           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    913           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
    914 
    915           "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
    916           "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
    917           "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
    918 
    919           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
    920             [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
    921             [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
    922             [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
    923             [Temp3] "=&r"(Temp3)
    924           : [filter12] "r"(filter12), [filter34] "r"(filter34),
    925             [filter56] "r"(filter56), [filter78] "r"(filter78),
    926             [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
    927             [src] "r"(src));
    928 
    929       src += 16;
    930       dst += 16;
    931     }
    932 
    933     /* Next row... */
    934     src_ptr += src_stride;
    935     dst_ptr += dst_stride;
    936   }
    937 }
    938 
    939 void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    940                                    uint8_t *dst, ptrdiff_t dst_stride,
    941                                    const int16_t *filter_x, int x_step_q4,
    942                                    const int16_t *filter_y, int y_step_q4,
    943                                    int w, int h) {
    944   assert(x_step_q4 == 16);
    945   assert(((const int32_t *)filter_x)[1] != 0x800000);
    946 
    947   if (((const int32_t *)filter_x)[0] == 0) {
    948     vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
    949                                   x_step_q4, filter_y, y_step_q4, w, h);
    950   } else {
    951     uint32_t pos = 38;
    952 
    953     src -= 3;
    954 
    955     /* bit positon for extract from acc */
    956     __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
    957                          :
    958                          : [pos] "r"(pos));
    959 
    960     /* prefetch data to cache memory */
    961     prefetch_load(src);
    962     prefetch_load(src + 32);
    963     prefetch_store(dst);
    964 
    965     switch (w) {
    966       case 4:
    967         convolve_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
    968                                    h);
    969         break;
    970       case 8:
    971         convolve_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
    972                                    h);
    973         break;
    974       case 16:
    975         convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
    976                                     h, 1);
    977         break;
    978       case 32:
    979         convolve_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride, filter_x,
    980                                     h, 2);
    981         break;
    982       case 64:
    983         prefetch_load(src + 64);
    984         prefetch_store(dst + 32);
    985 
    986         convolve_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride, filter_x,
    987                                     h);
    988         break;
    989       default:
    990         vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride,
    991                                   filter_x, x_step_q4, filter_y, y_step_q4, w,
    992                                   h);
    993         break;
    994     }
    995   }
    996 }
    997 #endif
    998