Home | History | Annotate | Download | only in mips
      1 /*
      2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  */
     11 
     12 #include <assert.h>
     13 #include <stdio.h>
     14 
     15 #include "config/aom_dsp_rtcd.h"
     16 
     17 #include "aom_dsp/mips/convolve_common_dspr2.h"
     18 #include "aom_dsp/aom_dsp_common.h"
     19 #include "aom_dsp/aom_filter.h"
     20 #include "aom_ports/mem.h"
     21 
     22 #if HAVE_DSPR2
     /*
      * Horizontal 8-tap filter for a block that is 4 pixels wide and h rows high,
      * written with MIPS DSP ASE (DSPr2) inline assembly.
      *
      * Per row the asm loads src[0..11] and stores dst[0..3].  Assuming a
      * little-endian target (TODO confirm), output pixel i (i = 0..3) is built
      * from
      *     64 + sum(k = 0..7) src[i + k] * filter_x0[k]
      * which is pulled out of the DSP accumulator with extp and then mapped
      * through the aom_ff_cropTbl byte table (cm[value]) before being stored.
      *
      *   src        - first input byte of the first row.
      *   src_stride - added to src after every row.
      *   dst        - first output byte of the first row.
      *   dst_stride - added to dst after every row.
      *   filter_x0  - eight int16_t taps, read as four 32-bit words.
      *   h          - number of rows to process.
      *
      * NOTE(review): extp takes its bit position from the DSPControl register,
      * which is not written anywhere in this function; presumably the caller
      * programs it (and 64 is the matching rounding term) - confirm.
      * NOTE(review): the asm loads through src and stores through dst but lists
      * neither memory operands nor a "memory" clobber, and $ac2/$ac3 are not in
      * a clobber list - confirm this is safe with the compilers in use.
      */
     23 static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
     24                                    uint8_t *dst, int32_t dst_stride,
     25                                    const int16_t *filter_x0, int32_t h) {
     26   int32_t y;
     27   uint8_t *cm = aom_ff_cropTbl;
     28   int32_t vector1b, vector2b, vector3b, vector4b;
     29   int32_t Temp1, Temp2, Temp3, Temp4;
     30   uint32_t vector4a = 64;
     31   uint32_t tp1, tp2;
     32   uint32_t p1, p2, p3, p4;
     33   uint32_t n1, n2, n3, n4;
     34   uint32_t tn1, tn2;
     35 
          /* Each 32-bit word carries two neighbouring 16-bit taps so that one
           * dpa.w.ph can multiply-accumulate a pair of pixels.
           * NOTE(review): reading int16_t storage through an int32_t pointer
           * assumes filter_x0 is 4-byte aligned and is not strict-aliasing
           * clean - confirm the build relies on this deliberately. */
     36   vector1b = ((const int32_t *)filter_x0)[0];
     37   vector2b = ((const int32_t *)filter_x0)[1];
     38   vector3b = ((const int32_t *)filter_x0)[2];
     39   vector4b = ((const int32_t *)filter_x0)[3];
     40 
          /* Runs the body h times for positive h (y counts down to zero). */
     41   for (y = h; y--;) {
     42     /* prefetch data to cache memory */
     43     prefetch_load(src + src_stride);
     44     prefetch_load(src + src_stride + 32);
     45     prefetch_store(dst + dst_stride);
     46 
     47     __asm__ __volatile__(
                  /* unaligned loads of src[0..3] and src[4..7] */
     48         "ulw              %[tp1],      0(%[src])                      \n\t"
     49         "ulw              %[tp2],      4(%[src])                      \n\t"
     50 
     51         /* even 1. pixel */
                  /* accumulator $ac3 = 64 (LO = vector4a, HI = 0); the bytes are
                   * widened to 16-bit pairs p1..p4 and multiplied by the tap
                   * pairs -> result for dst[0] in Temp1 */
     52         "mtlo             %[vector4a], $ac3                           \n\t"
     53         "mthi             $zero,       $ac3                           \n\t"
     54         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
     55         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
     56         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
     57         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
     58         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
     59         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
     60         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
     61         "ulw              %[tn2],      8(%[src])                      \n\t"
     62         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
     63         "extp             %[Temp1],    $ac3,           31             \n\t"
     64 
     65         /* even 2. pixel */
                  /* window slides by one pixel pair: p2, p3, p4 plus the new pair
                   * taken from tn2 -> result for dst[2] in Temp3 */
     66         "mtlo             %[vector4a], $ac2                           \n\t"
     67         "mthi             $zero,       $ac2                           \n\t"
     68         "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
                  /* balign ..., 3 computes rt = (rt << 24) | (rs >> 8).  Applied
                   * from the highest word down, it turns the loaded words into
                   * the same data starting one byte later (the inputs of the odd
                   * pixels) without reloading; this reading assumes little-endian
                   * byte order.  tn1 has no defined value before this point, so
                   * its top byte is junk, but only its two low bytes are consumed
                   * later (preceu.ph.qbr). */
     69         "balign           %[tn1],      %[tn2],         3              \n\t"
     70         "balign           %[tn2],      %[tp2],         3              \n\t"
     71         "balign           %[tp2],      %[tp1],         3              \n\t"
     72         "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
     73         "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
     74         "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
     75         "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
     76         "extp             %[Temp3],    $ac2,           31             \n\t"
     77 
     78         /* odd 1. pixel */
                  /* tp1 is free now and is reused for the table lookup of the
                   * first even result (cm[Temp1], later stored to dst[0]) */
     79         "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
     80         "mtlo             %[vector4a], $ac3                           \n\t"
     81         "mthi             $zero,       $ac3                           \n\t"
     82         "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
     83         "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
     84         "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
     85         "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
     86         "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
     87         "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
     88         "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
     89         "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
     90         "extp             %[Temp2],    $ac3,           31             \n\t"
     91 
     92         /* odd 2. pixel */
                  /* tp2 is reused for the lookup of the second even result
                   * (cm[Temp3], later stored to dst[2]) */
     93         "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
     94         "mtlo             %[vector4a], $ac2                           \n\t"
     95         "mthi             $zero,       $ac2                           \n\t"
     96         "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
     97         "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
     98         "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
     99         "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
    100         "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
    101         "extp             %[Temp4],    $ac2,           31             \n\t"
    102 
    103         /* clamp */
                  /* table lookups for the two odd results */
    104         "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
    105         "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
    106 
    107         /* store bytes */
                  /* even results go to dst[0] and dst[2], odd ones to dst[1] and
                   * dst[3] */
    108         "sb               %[tp1],      0(%[dst])                      \n\t"
    109         "sb               %[tn1],      1(%[dst])                      \n\t"
    110         "sb               %[tp2],      2(%[dst])                      \n\t"
    111         "sb               %[n2],       3(%[dst])                      \n\t"
    112 
                  /* every scratch register is an early-clobber output ("=&r") so
                   * it cannot share a register with any of the inputs */
    113         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
    114           [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
    115           [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
    116           [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
    117           [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
    118         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    119           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
    120           [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
    121           [src] "r"(src));
    122 
    123     /* Next row... */
    124     src += src_stride;
    125     dst += dst_stride;
    126   }
    127 }
    128 
    /*
     * Horizontal 8-tap filter for a block that is 8 pixels wide and h rows high,
     * written with MIPS DSP ASE (DSPr2) inline assembly.
     *
     * Per row the asm loads src[0..15] and stores dst[0..7].  Assuming a
     * little-endian target (TODO confirm), output pixel i (i = 0..7) is built
     * from
     *     64 + sum(k = 0..7) src[i + k] * filter_x0[k]
     * which is pulled out of a DSP accumulator with extp and then mapped
     * through the aom_ff_cropTbl byte table (cm[value]) before being stored.
     * The four even outputs are computed first from the loaded words; the odd
     * outputs follow after the words have been realigned by one byte.
     *
     *   src        - first input byte of the first row.
     *   src_stride - added to src after every row.
     *   dst        - first output byte of the first row.
     *   dst_stride - added to dst after every row.
     *   filter_x0  - eight int16_t taps, read as four 32-bit words.
     *   h          - number of rows to process.
     *
     * NOTE(review): extp takes its bit position from the DSPControl register,
     * which is not written anywhere in this function; presumably the caller
     * programs it (and 64 is the matching rounding term) - confirm.
     * NOTE(review): the asm loads through src and stores through dst but lists
     * neither memory operands nor a "memory" clobber, and $ac1..$ac3 are not in
     * a clobber list - confirm this is safe with the compilers in use.
     */
    129 static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
    130                                    uint8_t *dst, int32_t dst_stride,
    131                                    const int16_t *filter_x0, int32_t h) {
    132   int32_t y;
    133   uint8_t *cm = aom_ff_cropTbl;
    134   uint32_t vector4a = 64;
    135   int32_t vector1b, vector2b, vector3b, vector4b;
    136   int32_t Temp1, Temp2, Temp3;
    137   uint32_t tp1, tp2;
    138   uint32_t p1, p2, p3, p4, n1;
    139   uint32_t tn1, tn2, tn3;
    140   uint32_t st0, st1;
    141 
          /* Each 32-bit word carries two neighbouring 16-bit taps so that one
           * dpa.w.ph can multiply-accumulate a pair of pixels.
           * NOTE(review): reading int16_t storage through an int32_t pointer
           * assumes filter_x0 is 4-byte aligned and is not strict-aliasing
           * clean - confirm the build relies on this deliberately. */
    142   vector1b = ((const int32_t *)filter_x0)[0];
    143   vector2b = ((const int32_t *)filter_x0)[1];
    144   vector3b = ((const int32_t *)filter_x0)[2];
    145   vector4b = ((const int32_t *)filter_x0)[3];
    146 
          /* Runs the body h times for positive h (y counts down to zero). */
    147   for (y = h; y--;) {
    148     /* prefetch data to cache memory */
    149     prefetch_load(src + src_stride);
    150     prefetch_load(src + src_stride + 32);
    151     prefetch_store(dst + dst_stride);
    152 
    153     __asm__ __volatile__(
                  /* unaligned loads of src[0..3] and src[4..7] */
    154         "ulw              %[tp1],      0(%[src])                      \n\t"
    155         "ulw              %[tp2],      4(%[src])                      \n\t"
    156 
    157         /* even 1. pixel */
                  /* $ac3 (even 1) and $ac2 (even 2) are both preset to 64 here;
                   * throughout this block accumulators are initialised ahead of
                   * the section that actually accumulates into them */
    158         "mtlo             %[vector4a], $ac3                           \n\t"
    159         "mthi             $zero,       $ac3                           \n\t"
    160         "mtlo             %[vector4a], $ac2                           \n\t"
    161         "mthi             $zero,       $ac2                           \n\t"
    162         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    163         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    164         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    165         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
    166         "ulw              %[tn2],      8(%[src])                      \n\t"
    167         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    168         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    169         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
    170         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
    171         "extp             %[Temp1],    $ac3,           31             \n\t"
    172 
    173         /* even 2. pixel */
                  /* both halves of tn2 are widened (p1, n1); the next word,
                   * src[12..15], is loaded into tn1 -> result for dst[2] in Temp3 */
    174         "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
    175         "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
    176         "ulw              %[tn1],      12(%[src])                     \n\t"
    177         "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
    178         "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
    179         "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
    180         "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
    181         "extp             %[Temp3],    $ac2,           31             \n\t"
    182 
    183         /* even 3. pixel */
                  /* st0 = cm[Temp1] is the looked-up even 1 result; Temp1 is then
                   * reused for the even 3 result (dst[4]) */
    184         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
    185         "mtlo             %[vector4a], $ac1                           \n\t"
    186         "mthi             $zero,       $ac1                           \n\t"
    187         "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
    188         "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
    189         "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
    190         "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
    191         "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
    192         "extp             %[Temp1],    $ac1,           31             \n\t"
    193 
    194         /* even 4. pixel */
                  /* $ac2 is preset for even 4 and $ac3 for odd 1; the even 1
                   * result is stored and the even 2 result is looked up */
    195         "mtlo             %[vector4a], $ac2                           \n\t"
    196         "mthi             $zero,       $ac2                           \n\t"
    197         "mtlo             %[vector4a], $ac3                           \n\t"
    198         "mthi             $zero,       $ac3                           \n\t"
    199         "sb               %[st0],      0(%[dst])                      \n\t"
    200         "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
    201 
                  /* balign ..., 3 computes rt = (rt << 24) | (rs >> 8).  Applied
                   * from the highest word down, it turns the loaded words into
                   * the same data starting one byte later (the inputs of the odd
                   * pixels) without reloading; this reading assumes little-endian
                   * byte order.  tn3 has no defined value before this point, so
                   * its top byte is junk, but only its two low bytes are consumed
                   * later (preceu.ph.qbr). */
    202         "balign           %[tn3],      %[tn1],         3              \n\t"
    203         "balign           %[tn1],      %[tn2],         3              \n\t"
    204         "balign           %[tn2],      %[tp2],         3              \n\t"
    205         "balign           %[tp2],      %[tp1],         3              \n\t"
    206 
    207         "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
    208         "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
    209         "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
    210         "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
    211         "extp             %[Temp3],    $ac2,           31             \n\t"
    212 
                  /* lookup of the even 3 result (stored to dst[4] below) */
    213         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
    214 
    215         /* odd 1. pixel */
                  /* $ac1 is preset here for odd 2; odd 1 itself accumulates into
                   * $ac3, which was preset in the even 4 section.  The pending
                   * even 2 and even 3 results are stored to dst[2] and dst[4]. */
    216         "mtlo             %[vector4a], $ac1                           \n\t"
    217         "mthi             $zero,       $ac1                           \n\t"
    218         "sb               %[st1],      2(%[dst])                      \n\t"
    219         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
    220         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
    221         "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
    222         "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
    223         "sb               %[st0],      4(%[dst])                      \n\t"
    224         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    225         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    226         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
    227         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
    228         "extp             %[Temp2],    $ac3,           31             \n\t"
    229 
    230         /* odd 2. pixel */
                  /* $ac3 and $ac2 are preset for odd 3 and odd 4; odd 2 itself
                   * accumulates into $ac1.  st0 = cm[Temp3] is the even 4 result,
                   * looked up before Temp3 is overwritten with the odd 2 result. */
    231         "mtlo             %[vector4a], $ac3                           \n\t"
    232         "mthi             $zero,       $ac3                           \n\t"
    233         "mtlo             %[vector4a], $ac2                           \n\t"
    234         "mthi             $zero,       $ac2                           \n\t"
    235         "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
    236         "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
    237         "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
    238         "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
    239         "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
    240         "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
    241         "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
    242         "extp             %[Temp3],    $ac1,           31             \n\t"
    243 
    244         /* odd 3. pixel */
                  /* st1 = cm[Temp2] is the odd 1 result; Temp2 is then reused for
                   * the odd 3 result (dst[5]) */
    245         "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
    246         "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
    247         "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
    248         "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
    249         "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
    250         "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
    251         "extp             %[Temp2],    $ac3,           31             \n\t"
    252 
    253         /* odd 4. pixel */
                  /* the pending odd 1 and even 4 results are stored to dst[1] and
                   * dst[6]; the odd 4 result (dst[7]) lands in Temp1 */
    254         "sb               %[st1],      1(%[dst])                      \n\t"
    255         "sb               %[st0],      6(%[dst])                      \n\t"
    256         "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
    257         "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
    258         "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
    259         "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
    260         "extp             %[Temp1],    $ac2,           31             \n\t"
    261 
    262         /* clamp */
                  /* lookups for odd 2, odd 3 and odd 4; p4, p2 and n1 are no longer
                   * needed as filter inputs and serve as scratch registers */
    263         "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
    264         "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
    265         "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
    266 
    267         /* store bytes */
    268         "sb               %[p4],       3(%[dst])                      \n\t"
    269         "sb               %[p2],       5(%[dst])                      \n\t"
    270         "sb               %[n1],       7(%[dst])                      \n\t"
    271 
                  /* every scratch register is an early-clobber output ("=&r") so
                   * it cannot share a register with any of the inputs */
    272         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
    273           [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
    274           [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
    275           [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
    276           [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
    277         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    278           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
    279           [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
    280           [src] "r"(src));
    281 
    282     /* Next row... */
    283     src += src_stride;
    284     dst += dst_stride;
    285   }
    286 }
    287 
    288 static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
    289                                     uint8_t *dst_ptr, int32_t dst_stride,
    290                                     const int16_t *filter_x0, int32_t h,
    291                                     int32_t count) {
    292   int32_t y, c;
    293   const uint8_t *src;
    294   uint8_t *dst;
    295   uint8_t *cm = aom_ff_cropTbl;
    296   uint32_t vector_64 = 64;
    297   int32_t filter12, filter34, filter56, filter78;
    298   int32_t Temp1, Temp2, Temp3;
    299   uint32_t qload1, qload2, qload3;
    300   uint32_t p1, p2, p3, p4, p5;
    301   uint32_t st1, st2, st3;
    302 
    303   filter12 = ((const int32_t *)filter_x0)[0];
    304   filter34 = ((const int32_t *)filter_x0)[1];
    305   filter56 = ((const int32_t *)filter_x0)[2];
    306   filter78 = ((const int32_t *)filter_x0)[3];
    307 
    308   for (y = h; y--;) {
    309     src = src_ptr;
    310     dst = dst_ptr;
    311 
    312     /* prefetch data to cache memory */
    313     prefetch_load(src_ptr + src_stride);
    314     prefetch_load(src_ptr + src_stride + 32);
    315     prefetch_store(dst_ptr + dst_stride);
    316 
    317     for (c = 0; c < count; c++) {
    318       __asm__ __volatile__(
    319           "ulw              %[qload1],    0(%[src])                    \n\t"
    320           "ulw              %[qload2],    4(%[src])                    \n\t"
    321 
    322           /* even 1. pixel */
    323           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    324           "mthi             $zero,        $ac1                         \n\t"
    325           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    326           "mthi             $zero,        $ac2                         \n\t"
    327           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    328           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    329           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    330           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    331           "ulw              %[qload3],    8(%[src])                    \n\t"
    332           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
    333           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
    334           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
    335           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
    336           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    337 
    338           /* even 2. pixel */
    339           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    340           "mthi             $zero,        $ac3                         \n\t"
    341           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    342           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    343           "ulw              %[qload1],    12(%[src])                   \n\t"
    344           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
    345           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
    346           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
    347           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
    348           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
    349           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    350 
    351           /* even 3. pixel */
    352           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    353           "mthi             $zero,        $ac1                         \n\t"
    354           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    355           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
    356           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
    357           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
    358           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
    359           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
    360           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    361           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
    362 
    363           /* even 4. pixel */
    364           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    365           "mthi             $zero,        $ac2                         \n\t"
    366           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    367           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
    368           "ulw              %[qload2],    16(%[src])                   \n\t"
    369           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
    370           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
    371           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
    372           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
    373           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    374           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    375 
    376           /* even 5. pixel */
    377           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    378           "mthi             $zero,        $ac3                         \n\t"
    379           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    380           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
    381           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
    382           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
    383           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
    384           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
    385           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    386           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    387 
    388           /* even 6. pixel */
    389           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    390           "mthi             $zero,        $ac1                         \n\t"
    391           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    392           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
    393           "ulw              %[qload3],    20(%[src])                   \n\t"
    394           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
    395           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
    396           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
    397           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
    398           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    399           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    400 
    401           /* even 7. pixel */
    402           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    403           "mthi             $zero,        $ac2                         \n\t"
    404           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    405           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
    406           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
    407           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
    408           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
    409           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
    410           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    411           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    412 
    413           /* even 8. pixel */
    414           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    415           "mthi             $zero,        $ac3                         \n\t"
    416           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
    417           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
    418           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
    419           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
    420           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
    421           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    422           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    423 
    424           /* ODD pixels */
    425           "ulw              %[qload1],    1(%[src])                    \n\t"
    426           "ulw              %[qload2],    5(%[src])                    \n\t"
    427 
    428           /* odd 1. pixel */
    429           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    430           "mthi             $zero,        $ac1                         \n\t"
    431           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    432           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    433           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    434           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    435           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
    436           "ulw              %[qload3],    9(%[src])                    \n\t"
    437           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
    438           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
    439           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
    440           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
    441           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    442           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    443 
    444           /* odd 2. pixel */
    445           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    446           "mthi             $zero,        $ac2                         \n\t"
    447           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    448           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    449           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
    450           "ulw              %[qload1],    13(%[src])                   \n\t"
    451           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
    452           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
    453           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
    454           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
    455           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    456           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    457 
    458           /* odd 3. pixel */
    459           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    460           "mthi             $zero,        $ac3                         \n\t"
    461           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    462           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
    463           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
    464           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
    465           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
    466           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
    467           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    468           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    469 
    470           /* odd 4. pixel */
    471           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    472           "mthi             $zero,        $ac1                         \n\t"
    473           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    474           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
    475           "ulw              %[qload2],    17(%[src])                   \n\t"
    476           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
    477           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
    478           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
    479           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
    480           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    481           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    482 
    483           /* odd 5. pixel */
    484           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    485           "mthi             $zero,        $ac2                         \n\t"
    486           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    487           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
    488           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
    489           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
    490           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
    491           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
    492           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    493           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    494 
    495           /* odd 6. pixel */
    496           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    497           "mthi             $zero,        $ac3                         \n\t"
    498           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    499           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
    500           "ulw              %[qload3],    21(%[src])                   \n\t"
    501           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
    502           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
    503           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
    504           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
    505           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    506           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    507 
    508           /* odd 7. pixel */
    509           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    510           "mthi             $zero,        $ac1                         \n\t"
    511           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    512           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
    513           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
    514           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
    515           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
    516           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
    517           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    518 
    519           /* odd 8. pixel */
    520           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
    521           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
    522           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
    523           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
    524           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    525 
    526           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    527           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    528           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    529 
    530           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
    531           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
    532           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
    533 
    534           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
    535             [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
    536             [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
    537             [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
    538             [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
    539           : [filter12] "r"(filter12), [filter34] "r"(filter34),
    540             [filter56] "r"(filter56), [filter78] "r"(filter78),
    541             [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
    542             [src] "r"(src));
    543 
    544       src += 16;
    545       dst += 16;
    546     }
    547 
    548     /* Next row... */
    549     src_ptr += src_stride;
    550     dst_ptr += dst_stride;
    551   }
    552 }
    553 
    554 static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
    555                                     uint8_t *dst_ptr, int32_t dst_stride,
    556                                     const int16_t *filter_x0, int32_t h) {
          /*
           * Filters h rows of a 64-pixel-wide block horizontally with the eight
           * taps in filter_x0, handling each row as four 16-pixel chunks.
           *
           *   src_ptr, src_stride  top-row source pointer and byte step per row.
           *                        A chunk loads src[0..24], so one row reads
           *                        src_ptr[0..72].
           *   dst_ptr, dst_stride  top-row output pointer and byte step per row;
           *                        dst_ptr[0..63] is written for every row.
           *   filter_x0            eight int16_t taps, read as four 32-bit words.
           *   h                    number of rows.
           *
           * Per output pixel: mtlo/mthi preload an accumulator with 64, four
           * dpa.w.ph instructions add the eight pixel*tap products (preceu.ph.qbr
           * and preceu.ph.qbl widen source byte pairs to halfword pairs first),
           * extp pulls a 32-bit field out of the accumulator, and lbux uses that
           * value as a byte index into aom_ff_cropTbl to get the stored byte.
           * The extp bit position is taken from DSPControl, which is not set here;
           * aom_convolve8_horiz_dspr2 writes pos = 38 with wrdsp before calling.
           *
           * Work for neighbouring pixels is interleaved across $ac1-$ac3, so the
           * trailing comment on each instruction names the pixel it belongs to.
           */
    557   int32_t y, c;
    558   const uint8_t *src;
    559   uint8_t *dst;
    560   uint8_t *cm = aom_ff_cropTbl;
    561   uint32_t vector_64 = 64;
    562   int32_t filter12, filter34, filter56, filter78;
    563   int32_t Temp1, Temp2, Temp3;
    564   uint32_t qload1, qload2, qload3;
    565   uint32_t p1, p2, p3, p4, p5;
    566   uint32_t st1, st2, st3;
    567   /* view the eight 16-bit taps as four 32-bit words, one tap pair per word */
    568   filter12 = ((const int32_t *)filter_x0)[0];
    569   filter34 = ((const int32_t *)filter_x0)[1];
    570   filter56 = ((const int32_t *)filter_x0)[2];
    571   filter78 = ((const int32_t *)filter_x0)[3];
    572   /* one iteration per row; y counts down from h */
    573   for (y = h; y--;) {
    574     src = src_ptr;
    575     dst = dst_ptr;
    576 
    577     /* prefetch the next row's source and destination into cache */
    578     prefetch_load(src_ptr + src_stride);
    579     prefetch_load(src_ptr + src_stride + 32);
    580     prefetch_load(src_ptr + src_stride + 64);
    581     prefetch_store(dst_ptr + dst_stride);
    582     prefetch_store(dst_ptr + dst_stride + 32);
    583     /* four passes of 16 output pixels cover the 64-pixel row */
    584     for (c = 0; c < 4; c++) {
    585       __asm__ __volatile__(
    586           "ulw              %[qload1],    0(%[src])                    \n\t"
    587           "ulw              %[qload2],    4(%[src])                    \n\t"
    588           /* EVEN pixels: results go to dst[0], dst[2], ..., dst[14] */
    589           /* even 1. pixel */
    590           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
    591           "mthi             $zero,        $ac1                         \n\t"
    592           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
    593           "mthi             $zero,        $ac2                         \n\t"
    594           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    595           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    596           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    597           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    598           "ulw              %[qload3],    8(%[src])                    \n\t"
    599           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
    600           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
    601           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
    602           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
    603           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
    604 
    605           /* even 2. pixel */
    606           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
    607           "mthi             $zero,        $ac3                         \n\t"
    608           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    609           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    610           "ulw              %[qload1],    12(%[src])                   \n\t"
    611           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 2 */
    612           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 2 */
    613           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 2 */
    614           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 2 */
    615           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 2 */
    616           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
    617 
    618           /* even 3. pixel */
    619           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
    620           "mthi             $zero,        $ac1                         \n\t"
    621           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    622           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
    623           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
    624           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
    625           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
    626           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
    627           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
    628           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */
    629 
    630           /* even 4. pixel */
    631           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
    632           "mthi             $zero,        $ac2                         \n\t"
    633           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    634           "sb               %[st2],       2(%[dst])                    \n\t" /* even 2 */
    635           "ulw              %[qload2],    16(%[src])                   \n\t"
    636           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
    637           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
    638           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
    639           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
    640           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
    641           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
    642 
    643           /* even 5. pixel */
    644           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
    645           "mthi             $zero,        $ac3                         \n\t"
    646           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    647           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
    648           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
    649           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
    650           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
    651           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
    652           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
    653           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
    654 
    655           /* even 6. pixel */
    656           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
    657           "mthi             $zero,        $ac1                         \n\t"
    658           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    659           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
    660           "ulw              %[qload3],    20(%[src])                   \n\t"
    661           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
    662           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
    663           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
    664           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
    665           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
    666           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
    667 
    668           /* even 7. pixel */
    669           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
    670           "mthi             $zero,        $ac2                         \n\t"
    671           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    672           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
    673           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
    674           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
    675           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
    676           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
    677           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
    678           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
    679 
    680           /* even 8. pixel */
    681           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
    682           "mthi             $zero,        $ac3                         \n\t"
    683           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
    684           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
    685           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
    686           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
    687           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
    688           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
    689           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
    690 
    691           /* ODD pixels: results go to dst[1], dst[3], ..., dst[15] */
    692           "ulw              %[qload1],    1(%[src])                    \n\t"
    693           "ulw              %[qload2],    5(%[src])                    \n\t"
    694 
    695           /* odd 1. pixel */
    696           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
    697           "mthi             $zero,        $ac1                         \n\t"
    698           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
    699           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
    700           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
    701           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
    702           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
    703           "ulw              %[qload3],    9(%[src])                    \n\t"
    704           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
    705           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
    706           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
    707           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
    708           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
    709           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
    710 
    711           /* odd 2. pixel */
    712           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
    713           "mthi             $zero,        $ac2                         \n\t"
    714           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
    715           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
    716           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
    717           "ulw              %[qload1],    13(%[src])                   \n\t"
    718           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
    719           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
    720           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
    721           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
    722           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
    723           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
    724 
    725           /* odd 3. pixel */
    726           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
    727           "mthi             $zero,        $ac3                         \n\t"
    728           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
    729           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
    730           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
    731           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
    732           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
    733           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
    734           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
    735           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
    736 
    737           /* odd 4. pixel */
    738           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
    739           "mthi             $zero,        $ac1                         \n\t"
    740           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
    741           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
    742           "ulw              %[qload2],    17(%[src])                   \n\t"
    743           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
    744           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
    745           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
    746           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
    747           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
    748           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
    749 
    750           /* odd 5. pixel */
    751           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
    752           "mthi             $zero,        $ac2                         \n\t"
    753           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
    754           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
    755           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
    756           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
    757           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
    758           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
    759           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
    760           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
    761 
    762           /* odd 6. pixel */
    763           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
    764           "mthi             $zero,        $ac3                         \n\t"
    765           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
    766           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
    767           "ulw              %[qload3],    21(%[src])                   \n\t"
    768           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
    769           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
    770           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
    771           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
    772           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
    773           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
    774 
    775           /* odd 7. pixel */
    776           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
    777           "mthi             $zero,        $ac1                         \n\t"
    778           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
    779           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
    780           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
    781           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
    782           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
    783           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
    784           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
    785 
    786           /* odd 8. pixel */
    787           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
    788           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
    789           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
    790           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
    791           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
    792           /* drain: look up and store the last three odd results */
    793           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
    794           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
    795           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
    796 
    797           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
    798           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
    799           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
    800           /* every output operand is early-clobber (=&r); inputs are plain registers */
    801           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
    802             [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
    803             [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
    804             [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
    805             [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
    806           : [filter12] "r"(filter12), [filter34] "r"(filter34),
    807             [filter56] "r"(filter56), [filter78] "r"(filter78),
    808             [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
    809             [src] "r"(src));
    810       /* step to the next 16-pixel chunk of the same row */
    811       src += 16;
    812       dst += 16;
    813     }
    814 
    815     /* Next row... */
    816     src_ptr += src_stride;
    817     dst_ptr += dst_stride;
    818   }
    819 }
    820 
    821 void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
    822                                uint8_t *dst, ptrdiff_t dst_stride,
    823                                const int16_t *filter_x, int x_step_q4,
    824                                const int16_t *filter_y, int y_step_q4, int w,
    825                                int h) {
    826   /* Entry point for the horizontal 8-tap convolution on DSPr2. Filters whose
    827    * first two taps are both zero are handed to the 2-tap routine; otherwise a
    828    * width-specific kernel runs, with the C version covering any width other
    829    * than 4, 8, 16, 32 or 64. */
    830   const int32_t *const tap_pairs = (const int32_t *)filter_x;
    831   const uint8_t *window; /* src moved back by 3 bytes for the kernels */
    832   uint32_t extract_pos;
    833 
    834   assert(x_step_q4 == 16);
    835   assert(tap_pairs[1] != 0x800000);
    836 
    837   if (tap_pairs[0] == 0) {
    838     aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
    839                               x_step_q4, filter_y, y_step_q4, w, h);
    840     return;
    841   }
    842 
    843   prefetch_load((const uint8_t *)filter_x);
    844   window = src - 3;
    845 
    846   /* Program the accumulator bit position that the kernels' extp reads use. */
    847   extract_pos = 38;
    848   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
    849                        :
    850                        : [pos] "r"(extract_pos));
    851 
    852   /* Prefetch the start of the first source and destination rows. */
    853   prefetch_load(window);
    854   prefetch_load(window + 32);
    855   prefetch_store(dst);
    856 
    857   if (w == 4) {
    858     convolve_horiz_4_dspr2(window, (int32_t)src_stride, dst,
    859                            (int32_t)dst_stride, filter_x, (int32_t)h);
    860   } else if (w == 8) {
    861     convolve_horiz_8_dspr2(window, (int32_t)src_stride, dst,
    862                            (int32_t)dst_stride, filter_x, (int32_t)h);
    863   } else if (w == 16 || w == 32) {
    864     /* Last argument: 1 for w == 16, 2 for w == 32. */
    865     convolve_horiz_16_dspr2(window, (int32_t)src_stride, dst,
    866                             (int32_t)dst_stride, filter_x, (int32_t)h, w / 16);
    867   } else if (w == 64) {
    868     prefetch_load(window + 64);
    869     prefetch_store(dst + 32);
    870 
    871     convolve_horiz_64_dspr2(window, (int32_t)src_stride, dst,
    872                             (int32_t)dst_stride, filter_x, (int32_t)h);
    873   } else {
    874     /* The C fallback takes the caller's unshifted src. */
    875     aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
    876                           x_step_q4, filter_y, y_step_q4, w, h);
    877   }
    878 }
    879 #endif
    880