Home | History | Annotate | Download | only in dspr2
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <stdio.h>
     13 
     14 #include "./vpx_config.h"
     15 #include "./vp9_rtcd.h"
     16 #include "vp9/common/vp9_common.h"
     17 #include "vpx/vpx_integer.h"
     18 #include "vpx_ports/mem.h"
     19 #include "vp9/common/vp9_filter.h"
     20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
     21 
     22 #if HAVE_DSPR2
     /*
      * Backing storage for the pixel clamp lookup table: 256 entries for the
      * values 0..255 with CROP_WIDTH extra entries on each side. The contents
      * are written by vp9_dsputil_static_init().
      */
     23 uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
     /*
      * Set by vp9_dsputil_static_init() to &vp9_ff_cropTbl_a[CROP_WIDTH], so
      * that indices from -CROP_WIDTH to 255 + CROP_WIDTH stay inside the array.
      * Until that function has run this pointer is still zero-initialized.
      */
     24 uint8_t *vp9_ff_cropTbl;
     25 
     26 void vp9_dsputil_static_init(void) {
     27   int i;
     28 
     29   for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i;
     30 
     31   for (i = 0; i < CROP_WIDTH; i++) {
     32     vp9_ff_cropTbl_a[i] = 0;
     33     vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
     34   }
     35 
     36   vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH];
     37 }
     38 
     /*
      * Horizontal 8-tap filter for a block that is 4 pixels wide, with the
      * result written transposed.
      *
      * For every one of the h source rows, three unaligned words are loaded
      * from src (byte offsets 0, 4 and 8) and four filtered pixels are
      * produced. Pixel x of source row y is stored at dst[x * dst_stride + y]:
      * inside a row the store pointer advances by dst_stride per pixel, and
      * between rows dst advances by one byte.
      *
      * src         first byte read for the first row.
      *             NOTE(review): any tap-centering offset is presumably applied
      *             by the caller - confirm.
      * src_stride  distance in bytes between source rows.
      * dst         destination of the first output pixel.
      * dst_stride  distance in bytes between consecutive output pixels of one
      *             source row.
      * filter_x0   eight 16-bit taps, read here as four 32-bit words.
      *             NOTE(review): the int32_t cast assumes the taps are 4-byte
      *             aligned and bypasses strict-aliasing rules - confirm.
      * h           number of source rows to process.
      *
      * Each accumulator is seeded with 64 (vector4a) before the dot products.
      * The extracted sums are used as indices into vp9_ff_cropTbl (lbux), so
      * vp9_dsputil_static_init() must have run before this is called.
      * NOTE(review): "extp" depends on the DSPControl pos field, which is not
      * written in this function - presumably set up by the caller; confirm.
      * NOTE(review): the asm statement has no clobber list even though it
      * stores through dst_ptr and overwrites $ac2/$ac3.
      */
     39 static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
     40                                               int32_t src_stride,
     41                                               uint8_t *dst,
     42                                               int32_t dst_stride,
     43                                               const int16_t *filter_x0,
     44                                               int32_t h) {
     45   int32_t y;
     46   uint8_t *cm = vp9_ff_cropTbl;
     47   uint8_t *dst_ptr;
     48   int32_t vector1b, vector2b, vector3b, vector4b;
     49   int32_t Temp1, Temp2, Temp3, Temp4;
     50   uint32_t vector4a = 64;
     51   uint32_t tp1, tp2;
     52   uint32_t p1, p2, p3, p4;
     53   uint32_t tn1, tn2;
     54 
          /* Each word holds two adjacent 16-bit taps, matching the halfword
           * pairs that dpa.w.ph multiplies and accumulates. */
     55   vector1b = ((const int32_t *)filter_x0)[0];
     56   vector2b = ((const int32_t *)filter_x0)[1];
     57   vector3b = ((const int32_t *)filter_x0)[2];
     58   vector4b = ((const int32_t *)filter_x0)[3];
     59 
     60   for (y = h; y--;) {
            /* dst_ptr walks the four outputs of this row, dst_stride apart. */
     61     dst_ptr = dst;
     62     /* prefetch data to cache memory */
     63     vp9_prefetch_load(src + src_stride);
     64     vp9_prefetch_load(src + src_stride + 32);
     65 
     66     __asm__ __volatile__ (
     67         "ulw              %[tp1],         0(%[src])                      \n\t"
     68         "ulw              %[tp2],         4(%[src])                      \n\t"
     69 
     70         /* even 1. pixel */
     71         "mtlo             %[vector4a],    $ac3                           \n\t"
     72         "mthi             $zero,          $ac3                           \n\t"
     73         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
     74         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
     75         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
     76         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
     77         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
     78         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
     79         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
     80         "ulw              %[tn2],         8(%[src])                      \n\t"
     81         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
     82         "extp             %[Temp1],       $ac3,           31             \n\t"
     83 
     84         /* even 2. pixel */
     85         "mtlo             %[vector4a],    $ac2                           \n\t"
     86         "mthi             $zero,          $ac2                           \n\t"
     87         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
     88         "balign           %[tn1],         %[tn2],         3              \n\t"
     89         "balign           %[tn2],         %[tp2],         3              \n\t"
     90         "balign           %[tp2],         %[tp1],         3              \n\t"
     91         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
     92         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
     93         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
     94         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
     95         "extp             %[Temp3],       $ac2,           31             \n\t"
     96 
     97         /* odd 1. pixel */
     98         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
     99         "mtlo             %[vector4a],    $ac3                           \n\t"
    100         "mthi             $zero,          $ac3                           \n\t"
    101         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
    102         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
    103         "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
    104         "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
    105         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
    106         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
    107         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
    108         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
    109         "extp             %[Temp2],       $ac3,           31             \n\t"
    110 
    111         /* odd 2. pixel */
    112         "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
    113         "mtlo             %[vector4a],    $ac2                           \n\t"
    114         "mthi             $zero,          $ac2                           \n\t"
    115         "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
    116         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
    117         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
    118         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
    119         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
    120         "extp             %[Temp4],       $ac2,           31             \n\t"
    121 
    122         /* clamp */
    123         "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
    124         "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
    125 
    126         /* store bytes */
    127         "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
    128         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
    129 
    130         "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
    131         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
    132 
    133         "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
    134         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
    135 
    136         "sb               %[p2],          0(%[dst_ptr])                  \n\t"
    137         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
    138 
    139         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
    140           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    141           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
    142           [dst_ptr] "+r" (dst_ptr)
    143         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    144           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
    145           [vector4a] "r" (vector4a),
    146           [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
    147     );
    148 
    149     /* Next row... */
            /* One source row down corresponds to one byte along in dst,
             * which is what makes the output transposed. */
    150     src += src_stride;
    151     dst += 1;
    152   }
    153 }
    154 
    /*
     * Horizontal 8-tap filter for a block that is 8 pixels wide, with the
     * result written transposed.
     *
     * For every one of the h source rows, the even outputs are computed from
     * words loaded at byte offsets 0, 4, 8 and 12 of src, and the odd outputs
     * from words loaded at byte offsets 1, 5, 9 and 13. Pixel x of source row
     * y ends up at dst[x * dst_stride + y]: even pixels are stored through
     * dst_ptr and odd pixels through odd_dst (which starts at
     * dst + dst_stride), both advancing by 2 * dst_stride per store. Between
     * rows dst advances by one byte.
     *
     * src         first byte read for the first row.
     *             NOTE(review): any tap-centering offset is presumably applied
     *             by the caller - confirm.
     * src_stride  distance in bytes between source rows.
     * dst         destination of the first output pixel.
     * dst_stride  distance in bytes between consecutive output pixels of one
     *             source row.
     * filter_x0   eight 16-bit taps, read here as four 32-bit words.
     *             NOTE(review): the int32_t cast assumes the taps are 4-byte
     *             aligned and bypasses strict-aliasing rules - confirm.
     * h           number of source rows to process.
     *
     * Each accumulator is seeded with 64 (vector4a) before the dot products.
     * The extracted sums are used as indices into vp9_ff_cropTbl (lbux), so
     * vp9_dsputil_static_init() must have run before this is called.
     * NOTE(review): "extp" depends on the DSPControl pos field, which is not
     * written in this function - presumably set up by the caller; confirm.
     * NOTE(review): the asm statement has no clobber list even though it
     * stores through dst_ptr/odd_dst and overwrites $ac1/$ac2/$ac3.
     */
    155 static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
    156                                               int32_t src_stride,
    157                                               uint8_t *dst,
    158                                               int32_t dst_stride,
    159                                               const int16_t *filter_x0,
    160                                               int32_t h) {
    161   int32_t y;
    162   uint8_t *cm = vp9_ff_cropTbl;
    163   uint8_t *dst_ptr;
    164   uint32_t vector4a = 64;
    165   int32_t vector1b, vector2b, vector3b, vector4b;
    166   int32_t Temp1, Temp2, Temp3;
    167   uint32_t tp1, tp2, tp3;
    168   uint32_t p1, p2, p3, p4, n1;
    169   uint8_t *odd_dst;
          /* Even and odd outputs use separate store pointers, so each one
           * steps over two output positions at a time. */
    170   uint32_t dst_pitch_2 = (dst_stride << 1);
    171 
          /* Each word holds two adjacent 16-bit taps, matching the halfword
           * pairs that dpa.w.ph multiplies and accumulates. */
    172   vector1b = ((const int32_t *)filter_x0)[0];
    173   vector2b = ((const int32_t *)filter_x0)[1];
    174   vector3b = ((const int32_t *)filter_x0)[2];
    175   vector4b = ((const int32_t *)filter_x0)[3];
    176 
    177   for (y = h; y--;) {
    178     /* prefetch data to cache memory */
    179     vp9_prefetch_load(src + src_stride);
    180     vp9_prefetch_load(src + src_stride + 32);
    181 
            /* dst_ptr receives outputs 0, 2, 4, 6; odd_dst receives 1, 3, 5, 7. */
    182     dst_ptr = dst;
    183     odd_dst = (dst_ptr + dst_stride);
    184 
    185     __asm__ __volatile__ (
    186         "ulw              %[tp2],         0(%[src])                       \n\t"
    187         "ulw              %[tp1],         4(%[src])                       \n\t"
    188 
    189         /* even 1. pixel */
    190         "mtlo             %[vector4a],    $ac3                            \n\t"
    191         "mthi             $zero,          $ac3                            \n\t"
    192         "mtlo             %[vector4a],    $ac2                            \n\t"
    193         "mthi             $zero,          $ac2                            \n\t"
    194         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
    195         "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
    196         "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
    197         "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
    198         "ulw              %[tp3],         8(%[src])                       \n\t"
    199         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
    200         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
    201         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
    202         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
    203         "extp             %[Temp1],       $ac3,           31              \n\t"
    204 
    205         /* even 2. pixel */
    206         "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
    207         "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
    208         "ulw              %[tp2],         12(%[src])                      \n\t"
    209         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
    210         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
    211         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
    212         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
    213         "extp             %[Temp3],       $ac2,           31              \n\t"
    214 
    215         /* even 3. pixel */
    216         "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
    217         "mtlo             %[vector4a],    $ac1                            \n\t"
    218         "mthi             $zero,          $ac1                            \n\t"
    219         "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
    220         "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
    221         "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
    222         "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
    223         "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
    224         "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
    225         "extp             %[p3],          $ac1,           31              \n\t"
    226 
    227         /* even 4. pixel */
    228         "mtlo             %[vector4a],    $ac2                            \n\t"
    229         "mthi             $zero,          $ac2                            \n\t"
    230         "mtlo             %[vector4a],    $ac3                            \n\t"
    231         "mthi             $zero,          $ac3                            \n\t"
    232         "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
    233         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    234         "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
    235         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    236 
    237         "ulw              %[tp1],         1(%[src])                       \n\t"
    238         "ulw              %[tp3],         5(%[src])                       \n\t"
    239 
    240         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
    241         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
    242         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
    243         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
    244         "extp             %[Temp3],       $ac2,           31              \n\t"
    245 
    246         "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
    247 
    248         /* odd 1. pixel */
    249         "mtlo             %[vector4a],    $ac1                            \n\t"
    250         "mthi             $zero,          $ac1                            \n\t"
    251         "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
    252         "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
    253         "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
    254         "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
    255         "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
    256         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    257         "ulw              %[tp2],         9(%[src])                       \n\t"
    258 
    259         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
    260         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
    261         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
    262         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
    263         "extp             %[Temp2],       $ac3,           31              \n\t"
    264 
    265         /* odd 2. pixel */
    266         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
    267         "mtlo             %[vector4a],    $ac3                            \n\t"
    268         "mthi             $zero,          $ac3                            \n\t"
    269         "mtlo             %[vector4a],    $ac2                            \n\t"
    270         "mthi             $zero,          $ac2                            \n\t"
    271         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
    272         "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
    273         "ulw              %[Temp1],       13(%[src])                      \n\t"
    274         "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
    275         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
    276         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    277         "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
    278         "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
    279         "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
    280         "extp             %[Temp3],       $ac1,           31              \n\t"
    281 
    282         /* odd 3. pixel */
    283         "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
    284         "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
    285         "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
    286         "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
    287         "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
    288         "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
    289         "extp             %[Temp2],       $ac3,           31              \n\t"
    290 
    291         /* odd 4. pixel */
    292         "sb               %[tp3],         0(%[odd_dst])                   \n\t"
    293         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
    294         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
    295         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
    296         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
    297         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
    298         "extp             %[Temp1],       $ac2,           31              \n\t"
    299 
    300         /* clamp */
    301         "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
    302         "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
    303         "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
    304 
    305         /* store bytes */
    306         "sb               %[p4],          0(%[odd_dst])                   \n\t"
    307         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
    308 
    309         "sb               %[p2],          0(%[odd_dst])                   \n\t"
    310         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
    311 
    312         "sb               %[n1],          0(%[odd_dst])                   \n\t"
    313 
    314         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
    315           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    316           [n1] "=&r" (n1),
    317           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
    318           [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
    319         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    320           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
    321           [vector4a] "r" (vector4a), [cm] "r" (cm),
    322           [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
    323     );
    324 
    325     /* Next row... */
            /* One source row down corresponds to one byte along in dst,
             * which is what makes the output transposed. */
    326     src += src_stride;
    327     dst += 1;
    328   }
    329 }
    330 
    331 static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
    332                                                int32_t src_stride,
    333                                                uint8_t *dst_ptr,
    334                                                int32_t dst_stride,
    335                                                const int16_t *filter_x0,
    336                                                int32_t h,
    337                                                int32_t count) {
    338   int32_t c, y;
    339   const uint8_t *src;
    340   uint8_t *dst;
    341   uint8_t *cm = vp9_ff_cropTbl;
    342   uint32_t vector_64 = 64;
    343   int32_t  filter12, filter34, filter56, filter78;
    344   int32_t  Temp1, Temp2, Temp3;
    345   uint32_t qload1, qload2;
    346   uint32_t p1, p2, p3, p4, p5;
    347   uint32_t st1, st2, st3;
    348   uint32_t dst_pitch_2 = (dst_stride << 1);
    349   uint8_t  *odd_dst;
    350 
    351   filter12 = ((const int32_t *)filter_x0)[0];
    352   filter34 = ((const int32_t *)filter_x0)[1];
    353   filter56 = ((const int32_t *)filter_x0)[2];
    354   filter78 = ((const int32_t *)filter_x0)[3];
    355 
    356   for (y = h; y--;) {
    357     /* prefetch data to cache memory */
    358     vp9_prefetch_load(src_ptr + src_stride);
    359     vp9_prefetch_load(src_ptr + src_stride + 32);
    360 
    361     src = src_ptr;
    362     dst = dst_ptr;
    363 
    364     odd_dst = (dst + dst_stride);
    365 
    366     for (c = 0; c < count; c++) {
    367       __asm__ __volatile__ (
    368           "ulw              %[qload1],        0(%[src])                       \n\t"
    369           "ulw              %[qload2],        4(%[src])                       \n\t"
    370 
    371           /* even 1. pixel */
    372           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
    373           "mthi             $zero,            $ac1                            \n\t"
    374           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
    375           "mthi             $zero,            $ac2                            \n\t"
    376           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
    377           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
    378           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
    379           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
    380           "ulw              %[qload2],        8(%[src])                       \n\t"
    381           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
    382           "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
    383           "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
    384           "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
    385           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
    386 
    387           /* even 2. pixel */
    388           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
    389           "mthi             $zero,            $ac3                            \n\t"
    390           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
    391           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
    392           "ulw              %[qload1],        12(%[src])                      \n\t"
    393           "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
    394           "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
    395           "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
    396           "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
    397           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
    398           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
    399 
    400           /* even 3. pixel */
    401           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
    402           "mthi             $zero,            $ac1                            \n\t"
    403           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
    404           "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
    405           "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
    406           "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
    407           "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
    408           "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
    409           "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
    410           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
    411           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
    412 
    413           /* even 4. pixel */
    414           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
    415           "mthi             $zero,            $ac2                            \n\t"
    416           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
    417           "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
    418           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    419           "ulw              %[qload2],        16(%[src])                      \n\t"
    420           "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
    421           "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
    422           "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
    423           "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
    424           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
    425           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
    426 
    427           /* even 5. pixel */
    428           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
    429           "mthi             $zero,            $ac3                            \n\t"
    430           "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
    431           "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
    432           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    433           "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
    434           "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
    435           "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
    436           "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
    437           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
    438           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
    439 
    440           /* even 6. pixel */
    441           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
    442           "mthi             $zero,            $ac1                            \n\t"
    443           "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
    444           "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
    445           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    446           "ulw              %[qload1],        20(%[src])                      \n\t"
    447           "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
    448           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
    449           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
    450           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
    451           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
    452           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
    453 
    454           /* even 7. pixel */
    455           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
    456           "mthi             $zero,            $ac2                            \n\t"
    457           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
    458           "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
    459           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    460           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
    461           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
    462           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
    463           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
    464           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
    465           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
    466 
    467           /* even 8. pixel */
    468           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
    469           "mthi             $zero,            $ac3                            \n\t"
    470           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
    471           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
    472           "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
    473           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    474           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
    475           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
    476           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
    477           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
    478 
    479           /* ODD pixels */
    480           "ulw              %[qload1],        1(%[src])                       \n\t"
    481           "ulw              %[qload2],        5(%[src])                       \n\t"
    482 
    483           /* odd 1. pixel */
    484           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
    485           "mthi             $zero,            $ac1                            \n\t"
    486           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
    487           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
    488           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
    489           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
    490           "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
    491           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    492           "ulw              %[qload2],        9(%[src])                       \n\t"
    493           "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
    494           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
    495           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
    496           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
    497           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
    498           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
    499 
    500           /* odd 2. pixel */
    501           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
    502           "mthi             $zero,            $ac2                            \n\t"
    503           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
    504           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
    505           "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
    506           "ulw              %[qload1],        13(%[src])                      \n\t"
    507           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
    508           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
    509           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
    510           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
    511           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
    512           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
    513 
    514           /* odd 3. pixel */
    515           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
    516           "mthi             $zero,            $ac3                            \n\t"
    517           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
    518           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
    519           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    520           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
    521           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
    522           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
    523           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
    524           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
    525           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
    526 
    527           /* odd 4. pixel */
    528           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
    529           "mthi             $zero,            $ac1                            \n\t"
    530           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
    531           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
    532           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    533           "ulw              %[qload2],        17(%[src])                      \n\t"
    534           "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
    535           "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
    536           "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
    537           "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
    538           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
    539           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
    540 
    541           /* odd 5. pixel */
    542           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
    543           "mthi             $zero,            $ac2                            \n\t"
    544           "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
    545           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
    546           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    547           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
    548           "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
    549           "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
    550           "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
    551           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
    552           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
    553 
    554           /* odd 6. pixel */
    555           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
    556           "mthi             $zero,            $ac3                            \n\t"
    557           "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
    558           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
    559           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    560           "ulw              %[qload1],        21(%[src])                      \n\t"
    561           "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
    562           "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
    563           "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
    564           "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
    565           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
    566           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
    567 
    568           /* odd 7. pixel */
    569           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
    570           "mthi             $zero,            $ac1                            \n\t"
    571           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
    572           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
    573           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    574           "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
    575           "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
    576           "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
    577           "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
    578           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
    579 
    580           /* odd 8. pixel */
    581           "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
    582           "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
    583           "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
    584           "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
    585           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
    586 
    587           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
    588           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
    589           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
    590 
    591           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
    592           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    593 
    594           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
    595           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    596 
    597           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
    598 
    599           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
    600             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
    601             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    602             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
    603             [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
    604           : [filter12] "r" (filter12), [filter34] "r" (filter34),
    605             [filter56] "r" (filter56), [filter78] "r" (filter78),
    606             [vector_64] "r" (vector_64), [cm] "r" (cm),
    607             [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
    608       );
    609 
    610       src += 16;
    611       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
    612       odd_dst = (dst + dst_stride);
    613     }
    614 
    615     /* Next row... */
    616     src_ptr += src_stride;
    617 
    618     dst_ptr += 1;
    619   }
    620 }
    621 
    622 static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
    623                                                int32_t src_stride,
    624                                                uint8_t *dst_ptr,
    625                                                int32_t dst_stride,
    626                                                const int16_t *filter_x0,
    627                                                int32_t h) {
          /* Applies the 8-tap filter in filter_x0 horizontally to 64 pixels of
           * each of h source rows and stores the result transposed: consecutive
           * pixels of one source row are written dst_stride bytes apart, and
           * every new source row advances dst_ptr by one byte.
           *
           * src_ptr, src_stride  first source row and the distance between rows.
           * dst_ptr, dst_stride  destination of the transposed output and the
           *                      distance between destination rows.
           * filter_x0            eight int16_t taps, read below as four 32-bit
           *                      words holding two taps each.
           * h                    number of source rows to process.
           */
    628   int32_t c, y;
    629   const uint8_t *src;
    630   uint8_t *dst;
          /* Byte table indexed by the filter result to obtain the stored pixel. */
    631   uint8_t *cm = vp9_ff_cropTbl;
          /* Value written to the low word of an accumulator before the taps are
           * accumulated; presumably the rounding term for a 7-bit filter shift
           * (TODO confirm against FILTER_BITS). */
    632   uint32_t vector_64 = 64;
    633   int32_t  filter12, filter34, filter56, filter78;
    634   int32_t  Temp1, Temp2, Temp3;
    635   uint32_t qload1, qload2;
    636   uint32_t p1, p2, p3, p4, p5;
    637   uint32_t st1, st2, st3;
          /* Even and odd pixels are stored through separate pointers, so each
           * pointer steps by two destination rows per store. */
    638   uint32_t dst_pitch_2 = (dst_stride << 1);
    639   uint8_t  *odd_dst;
    640 
          /* Reinterpret the taps as 32-bit words so that each dpa.w.ph consumes a
           * pair of taps at once.
           * NOTE(review): this relies on filter_x0 being suitably aligned for
           * int32_t access -- confirm with the callers. */
    641   filter12 = ((const int32_t *)filter_x0)[0];
    642   filter34 = ((const int32_t *)filter_x0)[1];
    643   filter56 = ((const int32_t *)filter_x0)[2];
    644   filter78 = ((const int32_t *)filter_x0)[3];
    645 
    646   for (y = h; y--;) {
    647     /* prefetch the next source row into the cache */
    648     vp9_prefetch_load(src_ptr + src_stride);
    649     vp9_prefetch_load(src_ptr + src_stride + 32);
    650     vp9_prefetch_load(src_ptr + src_stride + 64);
    651 
    652     src = src_ptr;
    653     dst = dst_ptr;
    654 
            /* Odd-numbered pixels start one destination row after the even ones. */
    655     odd_dst = (dst + dst_stride);
    656 
            /* 4 groups of 16 output pixels = 64 pixels per source row. */
    657     for (c = 0; c < 4; c++) {
              /* One group of 16 pixels.  The eight even-numbered outputs are
               * computed first from words loaded at src + 0, 4, 8, ..., then the
               * eight odd-numbered outputs from words loaded at src + 1, 5, 9,
               * ...; the last load is at src + 21.  The accumulators $ac1-$ac3
               * are used in rotation, so each numbered section below also
               * initialises the accumulator of an upcoming pixel and looks up and
               * stores an earlier one.  The trailing comment on an instruction
               * names the pixel that instruction belongs to.
               *
               * extp takes its extraction position from state that is not set in
               * this function -- presumably the caller configures it (TODO
               * confirm).
               *
               * NOTE(review): the statement has no clobber list although it
               * stores through dst/odd_dst and overwrites $ac1-$ac3 -- confirm
               * this is safe for the compilers in use. */
    658       __asm__ __volatile__ (
    659           "ulw              %[qload1],        0(%[src])                       \n\t"
    660           "ulw              %[qload2],        4(%[src])                       \n\t"
    661 
    662           /* even 1. pixel */
    663           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
    664           "mthi             $zero,            $ac1                            \n\t"
    665           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
    666           "mthi             $zero,            $ac2                            \n\t"
    667           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
    668           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
    669           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
    670           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
    671           "ulw              %[qload2],        8(%[src])                       \n\t"
    672           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
    673           "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
    674           "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
    675           "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
    676           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
    677 
    678           /* even 2. pixel */
    679           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
    680           "mthi             $zero,            $ac3                            \n\t"
    681           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
    682           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
    683           "ulw              %[qload1],        12(%[src])                      \n\t"
    684           "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 2 */
    685           "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 2 */
    686           "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 2 */
    687           "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 2 */
    688           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
    689           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 2 */
    690 
    691           /* even 3. pixel */
    692           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
    693           "mthi             $zero,            $ac1                            \n\t"
    694           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
    695           "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
    696           "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
    697           "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
    698           "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
    699           "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
    700           "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
    701           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
    702           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 2 */
    703 
    704           /* even 4. pixel */
    705           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
    706           "mthi             $zero,            $ac2                            \n\t"
    707           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
    708           "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
    709           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    710           "ulw              %[qload2],        16(%[src])                      \n\t"
    711           "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
    712           "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
    713           "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
    714           "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
    715           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
    716           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
    717 
    718           /* even 5. pixel */
    719           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
    720           "mthi             $zero,            $ac3                            \n\t"
    721           "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
    722           "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
    723           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    724           "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
    725           "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
    726           "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
    727           "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
    728           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
    729           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
    730 
    731           /* even 6. pixel */
    732           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
    733           "mthi             $zero,            $ac1                            \n\t"
    734           "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
    735           "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
    736           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    737           "ulw              %[qload1],        20(%[src])                      \n\t"
    738           "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
    739           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
    740           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
    741           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
    742           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
    743           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
    744 
    745           /* even 7. pixel */
    746           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
    747           "mthi             $zero,            $ac2                            \n\t"
    748           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
    749           "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
    750           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    751           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
    752           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
    753           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
    754           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
    755           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
    756           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
    757 
    758           /* even 8. pixel */
    759           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
    760           "mthi             $zero,            $ac3                            \n\t"
    761           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
    762           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
    763           "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
    764           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    765           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
    766           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
    767           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
    768           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
    769 
    770           /* ODD pixels */
    771           "ulw              %[qload1],        1(%[src])                       \n\t"
    772           "ulw              %[qload2],        5(%[src])                       \n\t"
    773 
    774           /* odd 1. pixel */
    775           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
    776           "mthi             $zero,            $ac1                            \n\t"
    777           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
    778           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
    779           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
    780           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
    781           "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
    782           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
    783           "ulw              %[qload2],        9(%[src])                       \n\t"
    784           "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
    785           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
    786           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
    787           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
    788           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
    789           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
    790 
    791           /* odd 2. pixel */
    792           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
    793           "mthi             $zero,            $ac2                            \n\t"
    794           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
    795           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
    796           "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
    797           "ulw              %[qload1],        13(%[src])                      \n\t"
    798           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
    799           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
    800           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
    801           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
    802           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
    803           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
    804 
    805           /* odd 3. pixel */
    806           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
    807           "mthi             $zero,            $ac3                            \n\t"
    808           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
    809           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
    810           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    811           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
    812           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
    813           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
    814           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
    815           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
    816           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
    817 
    818           /* odd 4. pixel */
    819           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
    820           "mthi             $zero,            $ac1                            \n\t"
    821           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
    822           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
    823           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    824           "ulw              %[qload2],        17(%[src])                      \n\t"
    825           "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
    826           "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
    827           "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
    828           "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
    829           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
    830           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
    831 
    832           /* odd 5. pixel */
    833           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
    834           "mthi             $zero,            $ac2                            \n\t"
    835           "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
    836           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
    837           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    838           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
    839           "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
    840           "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
    841           "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
    842           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
    843           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
    844 
    845           /* odd 6. pixel */
    846           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
    847           "mthi             $zero,            $ac3                            \n\t"
    848           "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
    849           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
    850           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    851           "ulw              %[qload1],        21(%[src])                      \n\t"
    852           "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
    853           "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
    854           "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
    855           "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
    856           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
    857           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
    858 
    859           /* odd 7. pixel */
    860           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
    861           "mthi             $zero,            $ac1                            \n\t"
    862           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
    863           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
    864           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    865           "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
    866           "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
    867           "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
    868           "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
    869           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
    870 
    871           /* odd 8. pixel */
    872           "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
    873           "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
    874           "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
    875           "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
    876           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
    877 
                  /* drain the pipeline: look up and store the last three odd pixels */
    878           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
    879           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
    880           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
    881 
    882           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
    883           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    884 
    885           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
    886           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
    887 
    888           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
    889 
    890           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
    891             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
    892             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    893             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
    894             [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
    895           : [filter12] "r" (filter12), [filter34] "r" (filter34),
    896             [filter56] "r" (filter56), [filter78] "r" (filter78),
    897             [vector_64] "r" (vector_64), [cm] "r" (cm),
    898             [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
    899       );
    900 
              /* Advance to the next 16 source pixels.  dst was modified by the asm,
               * so recompute it from dst_ptr: the next group starts 16 destination
               * rows further down, with its odd pixels one row after that. */
    901       src += 16;
    902       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
    903       odd_dst = (dst + dst_stride);
    904     }
    905 
    906     /* Next row... */
    907     src_ptr += src_stride;
    908 
            /* ...which is written one byte further along in the destination. */
    909     dst_ptr += 1;
    910   }
    911 }
    912 
    913 void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
    914                                uint8_t *dst, ptrdiff_t dst_stride,
    915                                const int16_t *filter, int w, int h) {
    916   int x, y, k;
    917 
    918   for (y = 0; y < h; ++y) {
    919     for (x = 0; x < w; ++x) {
    920       int sum = 0;
    921 
    922       for (k = 0; k < 8; ++k)
    923         sum += src[x + k] * filter[k];
    924 
    925       dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
    926     }
    927 
    928     src += src_stride;
    929     dst += 1;
    930   }
    931 }
    932 
    933 void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
    934                            uint8_t *dst, ptrdiff_t dst_stride,
    935                            int w, int h) {
    936   int x, y;
    937 
    938   for (y = 0; y < h; ++y) {
    939     for (x = 0; x < w; ++x) {
    940       dst[x * dst_stride] = src[x];
    941     }
    942 
    943     src += src_stride;
    944     dst += 1;
    945   }
    946 }
    947 
    948 void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    949                          uint8_t *dst, ptrdiff_t dst_stride,
    950                          const int16_t *filter_x, int x_step_q4,
    951                          const int16_t *filter_y, int y_step_q4,
    952                          int w, int h) {
    953   DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
    954   int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
    955   uint32_t pos = 38;
    956 
    957   /* bit positon for extract from acc */
    958   __asm__ __volatile__ (
    959     "wrdsp      %[pos],     1           \n\t"
    960     :
    961     : [pos] "r" (pos)
    962   );
    963 
    964   if (intermediate_height < h)
    965     intermediate_height = h;
    966 
    967   if (x_step_q4 != 16 || y_step_q4 != 16)
    968     return vp9_convolve8_c(src, src_stride,
    969                            dst, dst_stride,
    970                            filter_x, x_step_q4,
    971                            filter_y, y_step_q4,
    972                            w, h);
    973 
    974   if ((((const int32_t *)filter_x)[1] == 0x800000)
    975       && (((const int32_t *)filter_y)[1] == 0x800000))
    976     return vp9_convolve_copy(src, src_stride,
    977                              dst, dst_stride,
    978                              filter_x, x_step_q4,
    979                              filter_y, y_step_q4,
    980                              w, h);
    981 
    982   /* copy the src to dst */
    983   if (filter_x[3] == 0x80) {
    984     copy_horiz_transposed(src - src_stride * 3, src_stride,
    985                           temp, intermediate_height,
    986                           w, intermediate_height);
    987   } else if (((const int32_t *)filter_x)[0] == 0) {
    988     vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
    989                         temp, intermediate_height,
    990                         filter_x,
    991                         w, intermediate_height);
    992   } else {
    993     src -= (src_stride * 3 + 3);
    994 
    995     /* prefetch data to cache memory */
    996     vp9_prefetch_load(src);
    997     vp9_prefetch_load(src + 32);
    998 
    999     switch (w) {
   1000       case 4:
   1001         convolve_horiz_4_transposed_dspr2(src, src_stride,
   1002                                           temp, intermediate_height,
   1003                                           filter_x, intermediate_height);
   1004         break;
   1005       case 8:
   1006         convolve_horiz_8_transposed_dspr2(src, src_stride,
   1007                                           temp, intermediate_height,
   1008                                           filter_x, intermediate_height);
   1009         break;
   1010       case 16:
   1011       case 32:
   1012         convolve_horiz_16_transposed_dspr2(src, src_stride,
   1013                                            temp, intermediate_height,
   1014                                            filter_x, intermediate_height,
   1015                                            (w/16));
   1016         break;
   1017       case 64:
   1018         vp9_prefetch_load(src + 32);
   1019         convolve_horiz_64_transposed_dspr2(src, src_stride,
   1020                                            temp, intermediate_height,
   1021                                            filter_x, intermediate_height);
   1022         break;
   1023       default:
   1024         convolve_horiz_transposed(src, src_stride,
   1025                                   temp, intermediate_height,
   1026                                   filter_x, w, intermediate_height);
   1027         break;
   1028     }
   1029   }
   1030 
   1031   /* copy the src to dst */
   1032   if (filter_y[3] == 0x80) {
   1033     copy_horiz_transposed(temp + 3, intermediate_height,
   1034                           dst, dst_stride,
   1035                           h, w);
   1036   } else if (((const int32_t *)filter_y)[0] == 0) {
   1037     vp9_convolve2_dspr2(temp + 3, intermediate_height,
   1038                         dst, dst_stride,
   1039                         filter_y,
   1040                         h, w);
   1041   } else {
   1042     switch (h) {
   1043       case 4:
   1044         convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
   1045                                           dst, dst_stride,
   1046                                           filter_y, w);
   1047         break;
   1048       case 8:
   1049         convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
   1050                                           dst, dst_stride,
   1051                                           filter_y, w);
   1052         break;
   1053       case 16:
   1054       case 32:
   1055         convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
   1056                                            dst, dst_stride,
   1057                                            filter_y, w, (h/16));
   1058         break;
   1059       case 64:
   1060         convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
   1061                                            dst, dst_stride,
   1062                                            filter_y, w);
   1063         break;
   1064       default:
   1065         convolve_horiz_transposed(temp, intermediate_height,
   1066                                   dst, dst_stride,
   1067                                   filter_y, h, w);
   1068         break;
   1069     }
   1070   }
   1071 }
   1072 
   1073 void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
   1074                              uint8_t *dst, ptrdiff_t dst_stride,
   1075                              const int16_t *filter_x, int filter_x_stride,
   1076                              const int16_t *filter_y, int filter_y_stride,
   1077                              int w, int h) {
   1078   int x, y;
   1079 
   1080   /* prefetch data to cache memory */
   1081   vp9_prefetch_load(src);
   1082   vp9_prefetch_load(src + 32);
   1083   vp9_prefetch_store(dst);
   1084 
   1085   switch (w) {
   1086     case 4:
   1087       {
   1088       uint32_t tp1;
   1089 
   1090       /* 1 word storage */
   1091       for (y = h; y--; ) {
   1092         vp9_prefetch_load(src + src_stride);
   1093         vp9_prefetch_load(src + src_stride + 32);
   1094         vp9_prefetch_store(dst + dst_stride);
   1095 
   1096         __asm__ __volatile__ (
   1097             "ulw              %[tp1],         (%[src])      \n\t"
   1098             "sw               %[tp1],         (%[dst])      \n\t"  /* store */
   1099 
   1100             : [tp1] "=&r" (tp1)
   1101             : [src] "r" (src), [dst] "r" (dst)
   1102         );
   1103 
   1104         src += src_stride;
   1105         dst += dst_stride;
   1106       }
   1107       }
   1108       break;
   1109     case 8:
   1110       {
   1111       uint32_t tp1, tp2;
   1112 
   1113       /* 2 word storage */
   1114       for (y = h; y--; ) {
   1115         vp9_prefetch_load(src + src_stride);
   1116         vp9_prefetch_load(src + src_stride + 32);
   1117         vp9_prefetch_store(dst + dst_stride);
   1118 
   1119         __asm__ __volatile__ (
   1120             "ulw              %[tp1],         0(%[src])      \n\t"
   1121             "ulw              %[tp2],         4(%[src])      \n\t"
   1122             "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
   1123             "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
   1124 
   1125             : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
   1126             : [src] "r" (src), [dst] "r" (dst)
   1127         );
   1128 
   1129         src += src_stride;
   1130         dst += dst_stride;
   1131       }
   1132       }
   1133       break;
   1134     case 16:
   1135       {
   1136       uint32_t tp1, tp2, tp3, tp4;
   1137 
   1138       /* 4 word storage */
   1139       for (y = h; y--; ) {
   1140         vp9_prefetch_load(src + src_stride);
   1141         vp9_prefetch_load(src + src_stride + 32);
   1142         vp9_prefetch_store(dst + dst_stride);
   1143 
   1144         __asm__ __volatile__ (
   1145             "ulw              %[tp1],         0(%[src])      \n\t"
   1146             "ulw              %[tp2],         4(%[src])      \n\t"
   1147             "ulw              %[tp3],         8(%[src])      \n\t"
   1148             "ulw              %[tp4],         12(%[src])     \n\t"
   1149 
   1150             "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
   1151             "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
   1152             "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
   1153             "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
   1154 
   1155             : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
   1156               [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
   1157             : [src] "r" (src), [dst] "r" (dst)
   1158         );
   1159 
   1160         src += src_stride;
   1161         dst += dst_stride;
   1162       }
   1163       }
   1164       break;
   1165     case 32:
   1166       {
   1167       uint32_t tp1, tp2, tp3, tp4;
   1168       uint32_t tp5, tp6, tp7, tp8;
   1169 
   1170       /* 8 word storage */
   1171       for (y = h; y--; ) {
   1172         vp9_prefetch_load(src + src_stride);
   1173         vp9_prefetch_load(src + src_stride + 32);
   1174         vp9_prefetch_store(dst + dst_stride);
   1175 
   1176         __asm__ __volatile__ (
   1177             "ulw              %[tp1],         0(%[src])      \n\t"
   1178             "ulw              %[tp2],         4(%[src])      \n\t"
   1179             "ulw              %[tp3],         8(%[src])      \n\t"
   1180             "ulw              %[tp4],         12(%[src])     \n\t"
   1181             "ulw              %[tp5],         16(%[src])     \n\t"
   1182             "ulw              %[tp6],         20(%[src])     \n\t"
   1183             "ulw              %[tp7],         24(%[src])     \n\t"
   1184             "ulw              %[tp8],         28(%[src])     \n\t"
   1185 
   1186             "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
   1187             "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
   1188             "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
   1189             "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
   1190             "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
   1191             "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
   1192             "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
   1193             "sw               %[tp8],         28(%[dst])     \n\t"  /* store */
   1194 
   1195             : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
   1196               [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
   1197               [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
   1198               [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
   1199             : [src] "r" (src), [dst] "r" (dst)
   1200         );
   1201 
   1202         src += src_stride;
   1203         dst += dst_stride;
   1204       }
   1205       }
   1206       break;
   1207     case 64:
   1208       {
   1209       uint32_t tp1, tp2, tp3, tp4;
   1210       uint32_t tp5, tp6, tp7, tp8;
   1211 
   1212       vp9_prefetch_load(src + 64);
   1213       vp9_prefetch_store(dst + 32);
   1214 
   1215       /* 16 word storage */
   1216       for (y = h; y--; ) {
   1217         vp9_prefetch_load(src + src_stride);
   1218         vp9_prefetch_load(src + src_stride + 32);
   1219         vp9_prefetch_load(src + src_stride + 64);
   1220         vp9_prefetch_store(dst + dst_stride);
   1221         vp9_prefetch_store(dst + dst_stride + 32);
   1222 
   1223         __asm__ __volatile__ (
   1224             "ulw              %[tp1],         0(%[src])      \n\t"
   1225             "ulw              %[tp2],         4(%[src])      \n\t"
   1226             "ulw              %[tp3],         8(%[src])      \n\t"
   1227             "ulw              %[tp4],         12(%[src])     \n\t"
   1228             "ulw              %[tp5],         16(%[src])     \n\t"
   1229             "ulw              %[tp6],         20(%[src])     \n\t"
   1230             "ulw              %[tp7],         24(%[src])     \n\t"
   1231             "ulw              %[tp8],         28(%[src])     \n\t"
   1232 
   1233             "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
   1234             "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
   1235             "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
   1236             "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
   1237             "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
   1238             "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
   1239             "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
   1240             "sw               %[tp8],         28(%[dst])     \n\t"  /* store */
   1241 
   1242             "ulw              %[tp1],         32(%[src])     \n\t"
   1243             "ulw              %[tp2],         36(%[src])     \n\t"
   1244             "ulw              %[tp3],         40(%[src])     \n\t"
   1245             "ulw              %[tp4],         44(%[src])     \n\t"
   1246             "ulw              %[tp5],         48(%[src])     \n\t"
   1247             "ulw              %[tp6],         52(%[src])     \n\t"
   1248             "ulw              %[tp7],         56(%[src])     \n\t"
   1249             "ulw              %[tp8],         60(%[src])     \n\t"
   1250 
   1251             "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
   1252             "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
   1253             "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
   1254             "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
   1255             "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
   1256             "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
   1257             "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
   1258             "sw               %[tp8],         60(%[dst])     \n\t"  /* store */
   1259 
   1260             : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
   1261               [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
   1262               [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
   1263               [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
   1264             : [src] "r" (src), [dst] "r" (dst)
   1265         );
   1266 
   1267         src += src_stride;
   1268         dst += dst_stride;
   1269       }
   1270       }
   1271       break;
   1272     default:
   1273       for (y = h; y--; ) {
   1274         for (x = 0; x < w; ++x) {
   1275           dst[x] = src[x];
   1276         }
   1277 
   1278         src += src_stride;
   1279         dst += dst_stride;
   1280       }
   1281       break;
   1282   }
   1283 }
   1284 #endif
   1285