Home | History | Annotate | Download | only in dspr2
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <stdio.h>
     13 
     14 #include "./vpx_config.h"
     15 #include "./vp9_rtcd.h"
     16 #include "vp9/common/vp9_common.h"
     17 #include "vpx/vpx_integer.h"
     18 #include "vpx_ports/mem.h"
     19 #include "vp9/common/vp9_convolve.h"
     20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
     21 
     22 #if HAVE_DSPR2
     23 static void convolve_horiz_4_dspr2(const uint8_t *src,
     24                                    int32_t src_stride,
     25                                    uint8_t *dst,
     26                                    int32_t dst_stride,
     27                                    const int16_t *filter_x0,
     28                                    int32_t h) {
          /* Horizontal 8-tap filter that produces 4 output bytes per row, for h
           * rows, using MIPS DSP inline assembly.
           *
           * src, src_stride: first source byte of the current row and the byte
           *   step to the next row. Each row loads the 12 bytes src[0..11].
           * dst, dst_stride: first output byte of the current row and the byte
           *   step to the next row. Each row stores dst[0..3].
           * filter_x0: eight 16-bit taps, fetched below as four 32-bit words
           *   (two taps per word) so each word can be fed straight to dpa.w.ph.
           * h: number of rows to process.
           *
           * NOTE(review): extp takes its bit position from the DSPControl "pos"
           * field, which this function never sets -- presumably the caller
           * programs it before calling; confirm.
           * NOTE(review): the asm writes $ac2/$ac3 and stores through dst but
           * declares no clobbers -- confirm this is safe with the compilers used.
           */
     29   int32_t y;
          /* lookup table indexed by each filter sum in the lbux loads below */
     30   uint8_t *cm = vp9_ff_cropTbl;
     31   int32_t vector1b, vector2b, vector3b, vector4b;
     32   int32_t Temp1, Temp2, Temp3, Temp4;
          /* value every accumulator is preloaded with before the taps are summed
           * (presumably the rounding term -- confirm against the extp position) */
     33   uint32_t vector4a = 64;
     34   uint32_t tp1, tp2;
     35   uint32_t p1, p2, p3, p4;
     36   uint32_t n1, n2, n3, n4;
     37   uint32_t tn1, tn2;
     38 
          /* Read the taps two at a time.
           * NOTE(review): this reads int16_t storage through an int32_t pointer,
           * so it relies on filter_x0 being suitably aligned and on the compiler
           * tolerating the type pun -- confirm. */
     39   vector1b = ((const int32_t *)filter_x0)[0];
     40   vector2b = ((const int32_t *)filter_x0)[1];
     41   vector3b = ((const int32_t *)filter_x0)[2];
     42   vector4b = ((const int32_t *)filter_x0)[3];
     43 
          /* y counts down from h; the body runs once per row */
     44   for (y = h; y--;) {
     45     /* prefetch data to cache memory */
            /* the addresses passed are in the NEXT row of src and dst */
     46     vp9_prefetch_load(src + src_stride);
     47     vp9_prefetch_load(src + src_stride + 32);
     48     vp9_prefetch_store(dst + dst_stride);
     49 
            /* Instruction summary for the block below:
             *   ulw            unaligned 32-bit load
             *   mtlo/mthi      set the low/high half of an accumulator
             *   preceu.ph.qbr  widen the right byte pair of a word to two
             *                  unsigned 16-bit lanes (qbl: the left pair)
             *   dpa.w.ph       accumulator += sum of the two lane products
             *   extp           extract a bit field from an accumulator
             *   balign         merge bytes of two registers into one word
             *   lbux           unsigned byte load from base + index
             */
     50     __asm__ __volatile__ (
                /* tp1 = word at src + 0, tp2 = word at src + 4 */
     51         "ulw              %[tp1],      0(%[src])                      \n\t"
     52         "ulw              %[tp2],      4(%[src])                      \n\t"
     53 
     54         /* even 1. pixel */
                /* result for dst[0]: $ac3 = vector4a + taps applied to p1..p4,
                 * the widened byte pairs of tp1 and tp2. The load of tn2 (word
                 * at src + 8) is interleaved here for the following pixels. */
     55         "mtlo             %[vector4a], $ac3                           \n\t"
     56         "mthi             $zero,       $ac3                           \n\t"
     57         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
     58         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
     59         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
     60         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
     61         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
     62         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
     63         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
     64         "ulw              %[tn2],      8(%[src])                      \n\t"
     65         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
     66         "extp             %[Temp1],    $ac3,           31             \n\t"
     67 
     68         /* even 2. pixel */
                /* result for dst[2]: same taps over p2, p3, p4 and a new p1 taken
                 * from the right byte pair of tn2.
                 * The three balign instructions rebuild tn1/tn2/tp2 from
                 * neighbouring registers for the odd pixels (presumably the same
                 * data one byte further along the row -- confirm for the target
                 * byte order). Their order matters: each one reads the register
                 * that the next one overwrites. tn1 is read here before anything
                 * has been written to it; only its right byte pair is used
                 * later. */
     69         "mtlo             %[vector4a], $ac2                           \n\t"
     70         "mthi             $zero,       $ac2                           \n\t"
     71         "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
     72         "balign           %[tn1],      %[tn2],         3              \n\t"
     73         "balign           %[tn2],      %[tp2],         3              \n\t"
     74         "balign           %[tp2],      %[tp1],         3              \n\t"
     75         "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
     76         "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
     77         "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
     78         "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
     79         "extp             %[Temp3],    $ac2,           31             \n\t"
     80 
     81         /* odd 1. pixel */
                /* result for dst[1], accumulated in $ac3 from n1..n4 (the widened
                 * byte pairs of the rebuilt tp2 and tn2). tp1 is no longer needed
                 * as source data, so it is reused for the table lookup of the
                 * even-1 sum. */
     82         "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
     83         "mtlo             %[vector4a], $ac3                           \n\t"
     84         "mthi             $zero,       $ac3                           \n\t"
     85         "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
     86         "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
     87         "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
     88         "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
     89         "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
     90         "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
     91         "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
     92         "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
     93         "extp             %[Temp2],    $ac3,           31             \n\t"
     94 
     95         /* odd 2. pixel */
                /* result for dst[3], accumulated in $ac2 from n2, n3, n4 and a
                 * new n1 taken from the right byte pair of tn1. tp2 is reused
                 * for the table lookup of the even-2 sum. */
     96         "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
     97         "mtlo             %[vector4a], $ac2                           \n\t"
     98         "mthi             $zero,       $ac2                           \n\t"
     99         "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
    100         "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
    101         "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
    102         "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
    103         "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
    104         "extp             %[Temp4],    $ac2,           31             \n\t"
    105 
    106         /* clamp */
                /* table lookups for the two odd sums; tn1 and n2 are reused as
                 * destinations */
    107         "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
    108         "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
    109 
    110         /* store bytes */
                /* even results go to dst[0] and dst[2], odd to dst[1] and dst[3] */
    111         "sb               %[tp1],      0(%[dst])                      \n\t"
    112         "sb               %[tn1],      1(%[dst])                      \n\t"
    113         "sb               %[tp2],      2(%[dst])                      \n\t"
    114         "sb               %[n2],       3(%[dst])                      \n\t"
    115 
                /* outputs: all scratch registers, marked early-clobber ("=&r") so
                 * none of them shares a register with an input operand */
    116         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
    117           [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
    118           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    119           [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
    120           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    121           [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
                /* inputs: packed taps, accumulator preload, table and row pointers */
    122         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    123           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
    124           [vector4a] "r" (vector4a),
    125           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    126     );
    127 
    128     /* Next row... */
    129     src += src_stride;
    130     dst += dst_stride;
    131   }
    132 }
    133 
    134 static void convolve_horiz_8_dspr2(const uint8_t *src,
    135                                    int32_t src_stride,
    136                                    uint8_t *dst,
    137                                    int32_t dst_stride,
    138                                    const int16_t *filter_x0,
    139                                    int32_t h) {
          /* Horizontal 8-tap filter that produces 8 output bytes per row, for h
           * rows, using MIPS DSP inline assembly.
           *
           * src, src_stride: first source byte of the current row and the byte
           *   step to the next row. Each row loads the 16 bytes src[0..15].
           * dst, dst_stride: first output byte of the current row and the byte
           *   step to the next row. Each row stores dst[0..7].
           * filter_x0: eight 16-bit taps, fetched below as four 32-bit words
           *   (two taps per word) so each word can be fed straight to dpa.w.ph.
           * h: number of rows to process.
           *
           * The assembly is software-pipelined: the table lookup and store of
           * one result are interleaved with the arithmetic of later results,
           * and accumulators are re-armed one section ahead of their use, so
           * the section labels are approximate.
           *
           * NOTE(review): extp takes its bit position from the DSPControl "pos"
           * field, which this function never sets -- presumably the caller
           * programs it before calling; confirm.
           * NOTE(review): the asm writes $ac1/$ac2/$ac3 and stores through dst
           * but declares no clobbers -- confirm this is safe with the compilers
           * used.
           */
    140   int32_t y;
          /* lookup table indexed by each filter sum in the lbux loads below */
    141   uint8_t *cm = vp9_ff_cropTbl;
          /* value every accumulator is preloaded with before the taps are summed
           * (presumably the rounding term -- confirm against the extp position) */
    142   uint32_t vector4a = 64;
    143   int32_t vector1b, vector2b, vector3b, vector4b;
    144   int32_t Temp1, Temp2, Temp3;
    145   uint32_t tp1, tp2;
    146   uint32_t p1, p2, p3, p4, n1;
    147   uint32_t tn1, tn2, tn3;
    148   uint32_t st0, st1;
    149 
          /* Read the taps two at a time.
           * NOTE(review): this reads int16_t storage through an int32_t pointer,
           * so it relies on filter_x0 being suitably aligned and on the compiler
           * tolerating the type pun -- confirm. */
    150   vector1b = ((const int32_t *)filter_x0)[0];
    151   vector2b = ((const int32_t *)filter_x0)[1];
    152   vector3b = ((const int32_t *)filter_x0)[2];
    153   vector4b = ((const int32_t *)filter_x0)[3];
    154 
          /* y counts down from h; the body runs once per row */
    155   for (y = h; y--;) {
    156     /* prefetch data to cache memory */
            /* the addresses passed are in the NEXT row of src and dst */
    157     vp9_prefetch_load(src + src_stride);
    158     vp9_prefetch_load(src + src_stride + 32);
    159     vp9_prefetch_store(dst + dst_stride);
    160 
    161     __asm__ __volatile__ (
                /* tp1 = word at src + 0, tp2 = word at src + 4 */
    162         "ulw              %[tp1],      0(%[src])                      \n\t"
    163         "ulw              %[tp2],      4(%[src])                      \n\t"
    164 
    165         /* even 1. pixel */
                /* result for dst[0] in $ac3 (extracted to Temp1); $ac2 is armed
                 * here as well, for the even-2 result. tn2 = word at src + 8. */
    166         "mtlo             %[vector4a], $ac3                           \n\t"
    167         "mthi             $zero,       $ac3                           \n\t"
    168         "mtlo             %[vector4a], $ac2                           \n\t"
    169         "mthi             $zero,       $ac2                           \n\t"
    170         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    171         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    172         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    173         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
    174         "ulw              %[tn2],      8(%[src])                      \n\t"
    175         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    176         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    177         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
    178         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
    179         "extp             %[Temp1],    $ac3,           31             \n\t"
    180 
    181         /* even 2. pixel */
                /* result for dst[2] in $ac2 (extracted to Temp3), from p2, p3, p4
                 * and the right byte pair of tn2; the left pair goes to n1 for
                 * later. tn1 = word at src + 12. */
    182         "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
    183         "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
    184         "ulw              %[tn1],      12(%[src])                     \n\t"
    185         "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
    186         "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
    187         "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
    188         "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
    189         "extp             %[Temp3],    $ac2,           31             \n\t"
    190 
    191         /* even 3. pixel */
                /* result for dst[4] in $ac1. The even-1 sum is looked up into st0
                 * first, which frees Temp1 to receive this result. */
    192         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
    193         "mtlo             %[vector4a], $ac1                           \n\t"
    194         "mthi             $zero,       $ac1                           \n\t"
    195         "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
    196         "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
    197         "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
    198         "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
    199         "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
    200         "extp             %[Temp1],    $ac1,           31             \n\t"
    201 
    202         /* even 4. pixel */
                /* result for dst[6] in $ac2 (extracted to Temp3); $ac3 is armed
                 * here for the odd-1 result. The even-1 byte is stored to dst[0]
                 * and the even-2 sum is looked up into st1 before Temp3 is
                 * overwritten. */
    203         "mtlo             %[vector4a], $ac2                           \n\t"
    204         "mthi             $zero,       $ac2                           \n\t"
    205         "mtlo             %[vector4a], $ac3                           \n\t"
    206         "mthi             $zero,       $ac3                           \n\t"
    207         "sb               %[st0],      0(%[dst])                      \n\t"
    208         "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
    209 
                /* Rebuild tn3/tn1/tn2/tp2 from neighbouring registers for the odd
                 * pixels (presumably the same data one byte further along the
                 * row -- confirm for the target byte order). Order matters: each
                 * balign reads the register that the next one overwrites. tn3 is
                 * read here before anything has been written to it; only its
                 * right byte pair is used later. */
    210         "balign           %[tn3],      %[tn1],         3              \n\t"
    211         "balign           %[tn1],      %[tn2],         3              \n\t"
    212         "balign           %[tn2],      %[tp2],         3              \n\t"
    213         "balign           %[tp2],      %[tp1],         3              \n\t"
    214 
    215         "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
    216         "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
    217         "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
    218         "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
    219         "extp             %[Temp3],    $ac2,           31             \n\t"
    220 
                /* look up the even-3 sum; it is stored to dst[4] below */
    221         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
    222 
    223         /* odd 1. pixel */
                /* result for dst[1] in $ac3 (extracted to Temp2), from the widened
                 * byte pairs of the rebuilt tp2 and tn2. $ac1 is armed here for
                 * the odd-2 result; the even-2 and even-3 bytes are stored to
                 * dst[2] and dst[4]. */
    224         "mtlo             %[vector4a], $ac1                           \n\t"
    225         "mthi             $zero,       $ac1                           \n\t"
    226         "sb               %[st1],      2(%[dst])                      \n\t"
    227         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
    228         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
    229         "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
    230         "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
    231         "sb               %[st0],      4(%[dst])                      \n\t"
    232         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    233         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    234         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
    235         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
    236         "extp             %[Temp2],    $ac3,           31             \n\t"
    237 
    238         /* odd 2. pixel */
                /* result for dst[3] in $ac1 (extracted to Temp3). $ac3 and $ac2
                 * are armed here for the odd-3 and odd-4 results. The even-4 sum
                 * is looked up into st0 before Temp3 is overwritten. */
    239         "mtlo             %[vector4a], $ac3                           \n\t"
    240         "mthi             $zero,       $ac3                           \n\t"
    241         "mtlo             %[vector4a], $ac2                           \n\t"
    242         "mthi             $zero,       $ac2                           \n\t"
    243         "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
    244         "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
    245         "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
    246         "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
    247         "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
    248         "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
    249         "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
    250         "extp             %[Temp3],    $ac1,           31             \n\t"
    251 
    252         /* odd 3. pixel */
                /* result for dst[5] in $ac3. The odd-1 sum is looked up into st1
                 * first, which frees Temp2 to receive this result. */
    253         "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
    254         "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
    255         "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
    256         "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
    257         "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
    258         "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
    259         "extp             %[Temp2],    $ac3,           31             \n\t"
    260 
    261         /* odd 4. pixel */
                /* result for dst[7] in $ac2 (extracted to Temp1); the odd-1 and
                 * even-4 bytes are stored to dst[1] and dst[6] */
    262         "sb               %[st1],      1(%[dst])                      \n\t"
    263         "sb               %[st0],      6(%[dst])                      \n\t"
    264         "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
    265         "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
    266         "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
    267         "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
    268         "extp             %[Temp1],    $ac2,           31             \n\t"
    269 
    270         /* clamp */
                /* table lookups for the odd-2, odd-3 and odd-4 sums; p4, p2 and n1
                 * are no longer needed as data and are reused as destinations */
    271         "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
    272         "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
    273         "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
    274 
    275         /* store bytes */
                /* remaining odd results go to dst[3], dst[5] and dst[7] */
    276         "sb               %[p4],       3(%[dst])                      \n\t"
    277         "sb               %[p2],       5(%[dst])                      \n\t"
    278         "sb               %[n1],       7(%[dst])                      \n\t"
    279 
                /* outputs: all scratch registers, marked early-clobber ("=&r") so
                 * none of them shares a register with an input operand */
    280         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
    281           [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
    282           [st0] "=&r" (st0), [st1] "=&r" (st1),
    283           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    284           [n1] "=&r" (n1),
    285           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
                /* inputs: packed taps, accumulator preload, table and row pointers */
    286         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    287           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
    288           [vector4a] "r" (vector4a),
    289           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    290     );
    291 
    292     /* Next row... */
    293     src += src_stride;
    294     dst += dst_stride;
    295   }
    296 }
    297 
    298 static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
    299                                     int32_t src_stride,
    300                                     uint8_t *dst_ptr,
    301                                     int32_t dst_stride,
    302                                     const int16_t *filter_x0,
    303                                     int32_t h,
    304                                     int32_t count) {
    305   int32_t y, c;
    306   const uint8_t *src;
    307   uint8_t *dst;
    308   uint8_t *cm = vp9_ff_cropTbl;
    309   uint32_t vector_64 = 64;
    310   int32_t filter12, filter34, filter56, filter78;
    311   int32_t Temp1, Temp2, Temp3;
    312   uint32_t qload1, qload2, qload3;
    313   uint32_t p1, p2, p3, p4, p5;
    314   uint32_t st1, st2, st3;
    315 
    316   filter12 = ((const int32_t *)filter_x0)[0];
    317   filter34 = ((const int32_t *)filter_x0)[1];
    318   filter56 = ((const int32_t *)filter_x0)[2];
    319   filter78 = ((const int32_t *)filter_x0)[3];
    320 
    321   for (y = h; y--;) {
    322     src = src_ptr;
    323     dst = dst_ptr;
    324 
    325     /* prefetch data to cache memory */
    326     vp9_prefetch_load(src_ptr + src_stride);
    327     vp9_prefetch_load(src_ptr + src_stride + 32);
    328     vp9_prefetch_store(dst_ptr + dst_stride);
    329 
    330     for (c = 0; c < count; c++) {
    331       __asm__ __volatile__ (
    332           "ulw              %[qload1],    0(%[src])                    \n\t"
    333           "ulw              %[qload2],    4(%[src])                    \n\t"
    334 
    335           /* even 1. pixel */
    336           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    337           "mthi             $zero,        $ac1                         \n\t"
    338           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    339           "mthi             $zero,        $ac2                         \n\t"
    340           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    341           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    342           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    343           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    344           "ulw              %[qload3],    8(%[src])                    \n\t"
    345           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
    346           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
    347           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
    348           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
    349           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    350 
    351           /* even 2. pixel */
    352           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    353           "mthi             $zero,        $ac3                         \n\t"
    354           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    355           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    356           "ulw              %[qload1],    12(%[src])                   \n\t"
    357           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
    358           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
    359           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
    360           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
    361           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
    362           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    363 
    364           /* even 3. pixel */
    365           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    366           "mthi             $zero,        $ac1                         \n\t"
    367           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    368           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
    369           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
    370           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
    371           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
    372           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
    373           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    374           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
    375 
    376           /* even 4. pixel */
    377           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    378           "mthi             $zero,        $ac2                         \n\t"
    379           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    380           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
    381           "ulw              %[qload2],    16(%[src])                   \n\t"
    382           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
    383           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
    384           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
    385           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
    386           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    387           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    388 
    389           /* even 5. pixel */
    390           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    391           "mthi             $zero,        $ac3                         \n\t"
    392           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    393           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
    394           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
    395           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
    396           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
    397           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
    398           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    399           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    400 
    401           /* even 6. pixel */
    402           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    403           "mthi             $zero,        $ac1                         \n\t"
    404           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    405           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
    406           "ulw              %[qload3],    20(%[src])                   \n\t"
    407           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
    408           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
    409           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
    410           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
    411           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    412           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    413 
    414           /* even 7. pixel */
    415           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    416           "mthi             $zero,        $ac2                         \n\t"
    417           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    418           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
    419           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
    420           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
    421           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
    422           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
    423           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    424           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    425 
    426           /* even 8. pixel */
    427           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    428           "mthi             $zero,        $ac3                         \n\t"
    429           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
    430           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
    431           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
    432           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
    433           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
    434           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    435           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    436 
    437           /* ODD pixels */
    438           "ulw              %[qload1],    1(%[src])                    \n\t"
    439           "ulw              %[qload2],    5(%[src])                    \n\t"
    440 
    441           /* odd 1. pixel */
    442           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    443           "mthi             $zero,        $ac1                         \n\t"
    444           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    445           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    446           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    447           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    448           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
    449           "ulw              %[qload3],    9(%[src])                    \n\t"
    450           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
    451           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
    452           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
    453           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
    454           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    455           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    456 
    457           /* odd 2. pixel */
    458           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    459           "mthi             $zero,        $ac2                         \n\t"
    460           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    461           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    462           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
    463           "ulw              %[qload1],    13(%[src])                   \n\t"
    464           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
    465           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
    466           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
    467           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
    468           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    469           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    470 
    471           /* odd 3. pixel */
    472           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    473           "mthi             $zero,        $ac3                         \n\t"
    474           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    475           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
    476           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
    477           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
    478           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
    479           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
    480           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    481           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    482 
    483           /* odd 4. pixel */
    484           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    485           "mthi             $zero,        $ac1                         \n\t"
    486           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    487           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
    488           "ulw              %[qload2],    17(%[src])                   \n\t"
    489           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
    490           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
    491           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
    492           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
    493           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    494           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    495 
    496           /* odd 5. pixel */
    497           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    498           "mthi             $zero,        $ac2                         \n\t"
    499           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    500           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
    501           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
    502           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
    503           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
    504           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
    505           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    506           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    507 
    508           /* odd 6. pixel */
    509           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    510           "mthi             $zero,        $ac3                         \n\t"
    511           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    512           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
    513           "ulw              %[qload3],    21(%[src])                   \n\t"
    514           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
    515           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
    516           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
    517           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
    518           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    519           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    520 
    521           /* odd 7. pixel */
    522           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    523           "mthi             $zero,        $ac1                         \n\t"
    524           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    525           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
    526           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
    527           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
    528           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
    529           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
    530           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    531 
    532           /* odd 8. pixel */
    533           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
    534           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
    535           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
    536           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
    537           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    538 
    539           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    540           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    541           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    542 
    543           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
    544           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
    545           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
    546 
    547           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
    548             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
    549             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    550             [p5] "=&r" (p5),
    551             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
    552           : [filter12] "r" (filter12), [filter34] "r" (filter34),
    553             [filter56] "r" (filter56), [filter78] "r" (filter78),
    554             [vector_64] "r" (vector_64),
    555             [cm] "r" (cm), [dst] "r" (dst),
    556             [src] "r" (src)
    557       );
    558 
    559       src += 16;
    560       dst += 16;
    561     }
    562 
    563     /* Next row... */
    564     src_ptr += src_stride;
    565     dst_ptr += dst_stride;
    566   }
    567 }
    568 
    /*
     * Horizontal 8-tap convolution of a 64-pixel-wide block, h rows tall.
     *
     *   src_ptr, src_stride  source rows. Output pixel i of a row is built from
     *                        src_ptr[i .. i + 7], i.e. the pointer must already
     *                        address the first tap (vp9_convolve8_horiz_dspr2
     *                        passes src - 3).
     *   dst_ptr, dst_stride  destination rows; 64 bytes are stored per row.
     *   filter_x0            eight 16-bit taps, read below as four 32-bit words
     *                        holding two taps each.
     *   h                    number of rows to process.
     *
     * Every row is handled as four chunks of 16 output pixels. Inside a chunk
     * the eight even-indexed outputs are computed first, then the eight
     * odd-indexed ones from loads shifted by one byte. Three accumulators
     * ($ac1..$ac3) are rotated so that the multiply-accumulate of one pixel
     * overlaps the table lookup and store of the previous ones; the trailing
     * comment on each asm line names the pixel that instruction belongs to.
     *
     * "extp ..., 31" reads 32 bits from the accumulator at the bit position
     * held in DSPControl.pos; this function does not set that field itself and
     * relies on the caller having done so (vp9_convolve8_horiz_dspr2 writes 38).
     * NOTE(review): with the 64 preloaded into each accumulator this looks like
     * a rounded shift right by 7 -- confirm against the MIPS DSP ASE manual.
     * NOTE(review): the preceu.ph.qbr/qbl pairing appears to assume
     * little-endian byte order -- confirm.
     */
    569 static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
    570                                     int32_t src_stride,
    571                                     uint8_t *dst_ptr,
    572                                     int32_t dst_stride,
    573                                     const int16_t *filter_x0,
    574                                     int32_t h) {
    575   int32_t y, c;
    576   const uint8_t *src;
    577   uint8_t *dst;
    578   uint8_t *cm = vp9_ff_cropTbl;  /* lookup table indexed by the extracted sum; presumably clamps to 0..255 */
    579   uint32_t vector_64 = 64;       /* initial low word of every accumulator */
    580   int32_t filter12, filter34, filter56, filter78;
    581   int32_t Temp1, Temp2, Temp3;
    582   uint32_t qload1, qload2, qload3;
    583   uint32_t p1, p2, p3, p4, p5;
    584   uint32_t st1, st2, st3;
    585 
          /* Fetch the taps two at a time as packed halfword pairs for dpa.w.ph. */
    586   filter12 = ((const int32_t *)filter_x0)[0];
    587   filter34 = ((const int32_t *)filter_x0)[1];
    588   filter56 = ((const int32_t *)filter_x0)[2];
    589   filter78 = ((const int32_t *)filter_x0)[3];
    590 
          /* Runs exactly h times; y itself is not used inside the loop. */
    591   for (y = h; y--;) {
    592     src = src_ptr;
    593     dst = dst_ptr;
    594 
    595     /* prefetch data to cache memory */
            /* The hints target the row after the current one. */
    596     vp9_prefetch_load(src_ptr + src_stride);
    597     vp9_prefetch_load(src_ptr + src_stride + 32);
    598     vp9_prefetch_load(src_ptr + src_stride + 64);
    599     vp9_prefetch_store(dst_ptr + dst_stride);
    600     vp9_prefetch_store(dst_ptr + dst_stride + 32);
    601 
            /* 4 chunks x 16 output pixels = 64 pixels per row. */
    602     for (c = 0; c < 4; c++) {
    603       __asm__ __volatile__ (
                  /* EVEN pixels: dst[0], dst[2], ..., dst[14] */
    604           "ulw              %[qload1],    0(%[src])                    \n\t"
    605           "ulw              %[qload2],    4(%[src])                    \n\t"
    606 
    607           /* even 1. pixel */
    608           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    609           "mthi             $zero,        $ac1                         \n\t"
    610           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    611           "mthi             $zero,        $ac2                         \n\t"
    612           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    613           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    614           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    615           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    616           "ulw              %[qload3],    8(%[src])                    \n\t"
    617           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
    618           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
    619           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
    620           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
    621           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    622 
    623           /* even 2. pixel */
    624           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    625           "mthi             $zero,        $ac3                         \n\t"
    626           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    627           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    628           "ulw              %[qload1],    12(%[src])                   \n\t"
    629           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 2 */
    630           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 2 */
    631           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 2 */
    632           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 2 */
    633           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
    634           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    635 
    636           /* even 3. pixel */
    637           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    638           "mthi             $zero,        $ac1                         \n\t"
    639           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    640           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
    641           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
    642           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
    643           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
    644           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
    645           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    646           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
    647 
    648           /* even 4. pixel */
    649           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    650           "mthi             $zero,        $ac2                         \n\t"
    651           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    652           "sb               %[st2],       2(%[dst])                    \n\t" /* even 2 */
    653           "ulw              %[qload2],    16(%[src])                   \n\t"
    654           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
    655           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
    656           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
    657           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
    658           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    659           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    660 
    661           /* even 5. pixel */
    662           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    663           "mthi             $zero,        $ac3                         \n\t"
    664           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    665           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
    666           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
    667           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
    668           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
    669           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
    670           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    671           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    672 
    673           /* even 6. pixel */
    674           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    675           "mthi             $zero,        $ac1                         \n\t"
    676           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    677           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
    678           "ulw              %[qload3],    20(%[src])                   \n\t"
    679           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
    680           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
    681           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
    682           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
    683           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    684           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    685 
    686           /* even 7. pixel */
    687           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    688           "mthi             $zero,        $ac2                         \n\t"
    689           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    690           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
    691           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
    692           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
    693           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
    694           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
    695           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    696           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    697 
    698           /* even 8. pixel */
    699           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    700           "mthi             $zero,        $ac3                         \n\t"
    701           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
    702           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
    703           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
    704           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
    705           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
    706           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    707           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    708 
    709           /* ODD pixels */
                  /* Same schedule as above with every load moved one byte right; */
                  /* results go to dst[1], dst[3], ..., dst[15].                  */
    710           "ulw              %[qload1],    1(%[src])                    \n\t"
    711           "ulw              %[qload2],    5(%[src])                    \n\t"
    712 
    713           /* odd 1. pixel */
    714           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    715           "mthi             $zero,        $ac1                         \n\t"
    716           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    717           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    718           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    719           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    720           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
    721           "ulw              %[qload3],    9(%[src])                    \n\t"
    722           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
    723           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
    724           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
    725           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
    726           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    727           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    728 
    729           /* odd 2. pixel */
    730           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    731           "mthi             $zero,        $ac2                         \n\t"
    732           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    733           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    734           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
    735           "ulw              %[qload1],    13(%[src])                   \n\t"
    736           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
    737           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
    738           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
    739           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
    740           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    741           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    742 
    743           /* odd 3. pixel */
    744           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    745           "mthi             $zero,        $ac3                         \n\t"
    746           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    747           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
    748           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
    749           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
    750           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
    751           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
    752           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    753           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    754 
    755           /* odd 4. pixel */
    756           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    757           "mthi             $zero,        $ac1                         \n\t"
    758           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    759           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
    760           "ulw              %[qload2],    17(%[src])                   \n\t"
    761           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
    762           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
    763           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
    764           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
    765           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    766           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    767 
    768           /* odd 5. pixel */
    769           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    770           "mthi             $zero,        $ac2                         \n\t"
    771           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    772           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
    773           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
    774           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
    775           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
    776           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
    777           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    778           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    779 
    780           /* odd 6. pixel */
    781           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    782           "mthi             $zero,        $ac3                         \n\t"
    783           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    784           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
    785           "ulw              %[qload3],    21(%[src])                   \n\t"
    786           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
    787           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
    788           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
    789           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
    790           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    791           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    792 
    793           /* odd 7. pixel */
    794           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    795           "mthi             $zero,        $ac1                         \n\t"
    796           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    797           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
    798           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
    799           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
    800           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
    801           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
    802           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    803 
    804           /* odd 8. pixel */
    805           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
    806           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
    807           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
    808           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
    809           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    810 
                  /* Drain the pipeline: look up and store the last three odd pixels. */
    811           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    812           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    813           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    814 
    815           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
    816           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
    817           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
    818 
                  /* All temporaries are early-clobber outputs ("=&r") so that none */
                  /* of them shares a register with the input operands below.       */
    819           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
    820             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
    821             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
    822             [p5] "=&r" (p5),
    823             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
    824           : [filter12] "r" (filter12), [filter34] "r" (filter34),
    825             [filter56] "r" (filter56), [filter78] "r" (filter78),
    826             [vector_64] "r" (vector_64),
    827             [cm] "r" (cm), [dst] "r" (dst),
    828             [src] "r" (src)
    829       );
    830 
              /* Advance to the next 16-pixel chunk of the same row. */
    831       src += 16;
    832       dst += 16;
    833     }
    834 
    835     /* Next row... */
    836     src_ptr += src_stride;
    837     dst_ptr += dst_stride;
    838   }
    839 }
    840 
    /*
     * Horizontal 8-tap convolution entry point for MIPS DSPr2.
     *
     * Dispatch, in the order the conditions are tested:
     *   1. 32-bit word 1 of filter_x (taps 2 and 3) equals 0x800000:
     *      everything is forwarded to vp9_convolve_copy().
     *   2. 32-bit word 0 of filter_x (taps 0 and 1) equals 0:
     *      everything is forwarded to vp9_convolve2_horiz_dspr2().
     *   3. x_step_q4 == 16: DSPControl.pos is programmed, src is moved back by
     *      3 and the width-specific kernel runs for w = 4, 8, 16, 32 or 64.
     *      Any other width goes to vp9_convolve8_horiz_c() with the original
     *      src restored.
     *   4. Otherwise: vp9_convolve8_horiz_c().
     *
     * filter_y and y_step_q4 are not used here except to pass them on.
     *
     * NOTE(review): 0x800000 matches tap[2] == 0 and tap[3] == 128 only when
     * the two int16 taps are packed little-endian -- presumably that is the
     * pass-through filter; confirm for big-endian builds.
     * NOTE(review): cases 1 and 2 are taken before x_step_q4 is examined, so a
     * call with x_step_q4 != 16 and such a filter does not reach the C
     * fallback -- confirm that callers never rely on that combination.
     */
    841 void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    842                                uint8_t *dst, ptrdiff_t dst_stride,
    843                                const int16_t *filter_x, int x_step_q4,
    844                                const int16_t *filter_y, int y_step_q4,
    845                                int w, int h) {
    846   if (((const int32_t *)filter_x)[1] == 0x800000) {
    847     vp9_convolve_copy(src, src_stride,
    848                       dst, dst_stride,
    849                       filter_x, x_step_q4,
    850                       filter_y, y_step_q4,
    851                       w, h);
    852   } else if (((const int32_t *)filter_x)[0] == 0) {
    853     vp9_convolve2_horiz_dspr2(src, src_stride,
    854                               dst, dst_stride,
    855                               filter_x, x_step_q4,
    856                               filter_y, y_step_q4,
    857                               w, h);
    858   } else {
    859     if (16 == x_step_q4) {
    860       uint32_t pos = 38;
    861 
    862       vp9_prefetch_load((const uint8_t *)filter_x);
              /* The kernels index taps from src[0], so step back to the first tap. */
    863       src -= 3;
    864 
    865       /* bit position for extract from acc */
              /* wrdsp with mask 1 writes only the pos field; the kernels' */
              /* "extp ..., 31" instructions read it.                     */
    866       __asm__ __volatile__ (
    867         "wrdsp      %[pos],     1           \n\t"
    868         :
    869         : [pos] "r" (pos)
    870       );
    871 
    872       /* prefetch data to cache memory */
    873       vp9_prefetch_load(src);
    874       vp9_prefetch_load(src + 32);
    875       vp9_prefetch_store(dst);
    876 
    877       switch (w) {
    878         case 4:
    879           convolve_horiz_4_dspr2(src, (int32_t)src_stride,
    880                                  dst, (int32_t)dst_stride,
    881                                  filter_x, (int32_t)h);
    882           break;
    883         case 8:
    884           convolve_horiz_8_dspr2(src, (int32_t)src_stride,
    885                                  dst, (int32_t)dst_stride,
    886                                  filter_x, (int32_t)h);
    887           break;
    888         case 16:
                  /* Last argument: number of 16-pixel chunks per row. */
    889           convolve_horiz_16_dspr2(src, (int32_t)src_stride,
    890                                   dst, (int32_t)dst_stride,
    891                                   filter_x, (int32_t)h, 1);
    892           break;
    893         case 32:
                  /* Width 32 reuses the 16-wide kernel with two chunks per row. */
    894           convolve_horiz_16_dspr2(src, (int32_t)src_stride,
    895                                   dst, (int32_t)dst_stride,
    896                                   filter_x, (int32_t)h, 2);
    897           break;
    898         case 64:
    899           vp9_prefetch_load(src + 64);
    900           vp9_prefetch_store(dst + 32);
    901 
    902           convolve_horiz_64_dspr2(src, (int32_t)src_stride,
    903                                   dst, (int32_t)dst_stride,
    904                                   filter_x, (int32_t)h);
    905           break;
    906         default:
                  /* No DSPr2 kernel for this width: undo the src -= 3 above. */
    907           vp9_convolve8_horiz_c(src + 3, src_stride,
    908                                 dst, dst_stride,
    909                                 filter_x, x_step_q4,
    910                                 filter_y, y_step_q4,
    911                                 w, h);
    912           break;
    913       }
    914     } else {
              /* Any step other than 16 is left to the C implementation. */
    915       vp9_convolve8_horiz_c(src, src_stride,
    916                             dst, dst_stride,
    917                             filter_x, x_step_q4,
    918                             filter_y, y_step_q4,
    919                             w, h);
    920     }
    921   }
    922 }
    923 #endif
    924