Home | History | Annotate | Download | only in mips
      1 /*
      2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <stdio.h>
     13 
     14 #include "./vpx_dsp_rtcd.h"
     15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
     16 #include "vpx_dsp/vpx_dsp_common.h"
     17 #include "vpx_dsp/vpx_filter.h"
     18 #include "vpx_ports/mem.h"
     19 
     20 #if HAVE_DSPR2
     /*
      * Two-tap horizontal filter for a 4-pixel-wide block, stored transposed.
      *
      * For each of the h source rows, four results are computed from the bytes
      * loaded at src[0..7] (two unaligned word loads).  Result x of a row is
      * stored at dst + x * dst_stride, and dst advances by one byte per source
      * row, so source rows end up as destination columns.
      *
      *   src, src_stride  source pixels and the byte step between source rows
      *   dst, dst_stride  destination and the byte step between consecutive
      *                    results of the same source row
      *   filter_x0        filter kernel; only filter_x0[3] and filter_x0[4] are
      *                    read, packed into one 32-bit operand (filter45)
      *   h                number of source rows to process
      *
      * Every accumulator is seeded with 64 (vector4a) before the
      * multiply-accumulate, and every extracted sum is used as an index into
      * vpx_ff_cropTbl (lbux) to clamp it to a byte.
      *
      * NOTE(review): extp takes its bit position from the DSPControl register,
      * which is not written in this function; presumably the caller sets it so
      * that the extracted value is (sum + 64) >> 7 -- confirm at the call site.
      * NOTE(review): filter45 is read through an int32_t pointer derived from
      * &filter_x0[3] (byte offset 6 from filter_x0); this relies on the target
      * tolerating that access (alignment and aliasing) -- confirm.
      * NOTE(review): the asm stores through dst_ptr and uses $ac2/$ac3, but the
      * statement lists no clobbers ("memory" or accumulators).
      */
     21 static void convolve_bi_horiz_4_transposed_dspr2(
     22     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
     23     const int16_t *filter_x0, int32_t h) {
     24   int32_t y;
     25   uint8_t *cm = vpx_ff_cropTbl;
     26   uint8_t *dst_ptr;
     27   int32_t Temp1, Temp2;
     28   uint32_t vector4a = 64;
     29   uint32_t tp1, tp2;
     30   uint32_t p1, p2;
     31   const int16_t *filter = &filter_x0[3];
     32   uint32_t filter45;
     33 
          /* Pack taps filter_x0[3] and filter_x0[4] into one paired-halfword
           * operand for dpa.w.ph. */
     34   filter45 = ((const int32_t *)filter)[0];
     35 
          /* One iteration per source row. */
     36   for (y = h; y--;) {
     37     dst_ptr = dst;
     38     /* prefetch data to cache memory */
     39     prefetch_load(src + src_stride);
     40     prefetch_load(src + src_stride + 32);
     41 
     42     __asm__ __volatile__(
                /* unaligned loads of the words at src + 0 and src + 4 */
     43         "ulw              %[tp1],         0(%[src])                      \n\t"
     44         "ulw              %[tp2],         4(%[src])                      \n\t"
     45 
     46         /* even 1. pixel */
     47         "mtlo             %[vector4a],    $ac3                           \n\t"
     48         "mthi             $zero,          $ac3                           \n\t"
     49         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
     50         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
     51         "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
     52         "extp             %[Temp1],       $ac3,           31             \n\t"
     53 
     54         /* even 2. pixel */
     55         "mtlo             %[vector4a],    $ac2                           \n\t"
     56         "mthi             $zero,          $ac2                           \n\t"
                /* NOTE(review): with a byte position of 3, balign should leave
                 * tp2 = (tp2 << 24) | (tp1 >> 8), i.e. on a little-endian target
                 * the word that starts one byte after tp1 (source for the odd
                 * pixels) -- confirm against the DSP ASE manual. */
     57         "balign           %[tp2],         %[tp1],         3              \n\t"
     58         "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
     59         "extp             %[Temp2],       $ac2,           31             \n\t"
     60 
     61         /* odd 1. pixel */
                /* tp1 is free now, so it is reused for the clamped even 1 result */
     62         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
     63         "mtlo             %[vector4a],    $ac3                           \n\t"
     64         "mthi             $zero,          $ac3                           \n\t"
     65         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
     66         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
     67         "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
     68         "extp             %[Temp1],       $ac3,           31             \n\t"
     69 
     70         /* odd 2. pixel */
                /* tp2 has been unpacked, so it is reused for the clamped even 2
                 * result */
     71         "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"
     72         "mtlo             %[vector4a],    $ac2                           \n\t"
     73         "mthi             $zero,          $ac2                           \n\t"
     74         "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
     75         "extp             %[Temp2],       $ac2,           31             \n\t"
     76 
     77         /* clamp */
     78         "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
     79         "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
     80 
     81         /* store bytes */
                /* order: even 1, odd 1, even 2, odd 2 -- one result per
                 * dst_stride step */
     82         "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
     83         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
     84 
     85         "sb               %[p1],          0(%[dst_ptr])                  \n\t"
     86         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
     87 
     88         "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
     89         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
     90 
     91         "sb               %[p2],          0(%[dst_ptr])                  \n\t"
     92         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
     93 
     94         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
     95           [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
     96         : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
     97           [src] "r"(src), [dst_stride] "r"(dst_stride));
     98 
     99     /* Next row... */
            /* the next source row is written one byte further along dst
             * (transposed output) */
    100     src += src_stride;
    101     dst += 1;
    102   }
    103 }
    104 
    /*
     * Two-tap horizontal filter for an 8-pixel-wide block, stored transposed.
     *
     * For each of the h source rows, eight results are computed from the three
     * words loaded at src + 0, src + 4 and src + 8.  Result x of a row is stored
     * at dst + x * dst_stride: the "even" results (1st, 3rd, 5th, 7th) are
     * written through dst_ptr starting at dst, the "odd" results through
     * odd_dst starting at dst + dst_stride, and both pointers step by
     * dst_pitch_2 (= 2 * dst_stride).  dst advances by one byte per source row,
     * so source rows end up as destination columns.
     *
     *   src, src_stride  source pixels and the byte step between source rows
     *   dst, dst_stride  destination and the byte step between consecutive
     *                    results of the same source row
     *   filter_x0        filter kernel; only filter_x0[3] and filter_x0[4] are
     *                    read, packed into one 32-bit operand (filter45)
     *   h                number of source rows to process
     *
     * Accumulators are seeded with 64 (vector4a) and every extracted sum is
     * clamped by indexing vpx_ff_cropTbl with lbux.
     *
     * NOTE(review): extp takes its bit position from the DSPControl register,
     * which is not written in this function -- presumably configured by the
     * caller; confirm.
     * NOTE(review): filter45 is read through an int32_t pointer derived from
     * &filter_x0[3]; this relies on the target tolerating that access
     * (alignment and aliasing) -- confirm.
     * NOTE(review): the asm stores to memory and uses $ac1..$ac3, but the
     * statement lists no clobbers.
     */
    105 static void convolve_bi_horiz_8_transposed_dspr2(
    106     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    107     const int16_t *filter_x0, int32_t h) {
    108   int32_t y;
    109   uint8_t *cm = vpx_ff_cropTbl;
    110   uint8_t *dst_ptr;
    111   uint32_t vector4a = 64;
    112   int32_t Temp1, Temp2, Temp3;
    113   uint32_t tp1, tp2, tp3;
    114   uint32_t p1, p2, p3, p4;
    115   uint8_t *odd_dst;
          /* step between two results of the same parity */
    116   uint32_t dst_pitch_2 = (dst_stride << 1);
    117   const int16_t *filter = &filter_x0[3];
    118   uint32_t filter45;
    119 
          /* Pack taps filter_x0[3] and filter_x0[4] into one paired-halfword
           * operand for dpa.w.ph. */
    120   filter45 = ((const int32_t *)filter)[0];
    121 
          /* One iteration per source row. */
    122   for (y = h; y--;) {
    123     /* prefetch data to cache memory */
    124     prefetch_load(src + src_stride);
    125     prefetch_load(src + src_stride + 32);
    126 
            /* even results start at dst, odd results one dst_stride later */
    127     dst_ptr = dst;
    128     odd_dst = (dst_ptr + dst_stride);
    129 
    130     __asm__ __volatile__(
    131         "ulw              %[tp1],         0(%[src])                       \n\t"
    132         "ulw              %[tp2],         4(%[src])                       \n\t"
    133 
    134         /* even 1. pixel */
    135         "mtlo             %[vector4a],    $ac3                            \n\t"
    136         "mthi             $zero,          $ac3                            \n\t"
    137         "mtlo             %[vector4a],    $ac2                            \n\t"
    138         "mthi             $zero,          $ac2                            \n\t"
    139         "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
    140         "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
    141         "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"
    142         "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"
    143         "ulw              %[tp3],         8(%[src])                       \n\t"
    144         "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
    145         "extp             %[Temp1],       $ac3,           31              \n\t"
    146 
    147         /* even 2. pixel */
    148         "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"
    149         "extp             %[Temp3],       $ac2,           31              \n\t"
    150 
    151         /* even 3. pixel */
    152         "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
    153         "mtlo             %[vector4a],    $ac1                            \n\t"
    154         "mthi             $zero,          $ac1                            \n\t"
                /* NOTE(review): the two balign instructions should shift the
                 * tp3:tp2:tp1 byte window by one byte, giving the words that
                 * feed the odd pixels (little-endian target assumed) -- confirm
                 * against the DSP ASE manual. */
    155         "balign           %[tp3],         %[tp2],         3              \n\t"
    156         "balign           %[tp2],         %[tp1],         3              \n\t"
    157         "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"
    158         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
    159         "extp             %[p3],          $ac1,           31              \n\t"
    160 
    161         /* even 4. pixel */
    162         "mtlo             %[vector4a],    $ac2                            \n\t"
    163         "mthi             $zero,          $ac2                            \n\t"
    164         "mtlo             %[vector4a],    $ac3                            \n\t"
    165         "mthi             $zero,          $ac3                            \n\t"
                /* store even 1 and even 2 */
    166         "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
    167         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    168         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
    169         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    170 
    171         "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
    172         "extp             %[Temp3],       $ac2,           31              \n\t"
    173 
                /* clamp even 3 (its raw sum was extracted into p3 above) */
    174         "lbux             %[Temp1],         %[p3](%[cm])                    "
    175         "\n\t"
    176 
    177         /* odd 1. pixel */
    178         "mtlo             %[vector4a],    $ac1                            \n\t"
    179         "mthi             $zero,          $ac1                            \n\t"
    180         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
    181         "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
    182         "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
    183         "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
                /* store even 3 */
    184         "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"
    185         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    186 
    187         "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
    188         "extp             %[Temp2],       $ac3,           31              \n\t"
    189 
    190         /* odd 2. pixel */
                /* clamp and (below) store even 4 */
    191         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
    192         "mtlo             %[vector4a],    $ac3                            \n\t"
    193         "mthi             $zero,          $ac3                            \n\t"
    194         "mtlo             %[vector4a],    $ac2                            \n\t"
    195         "mthi             $zero,          $ac2                            \n\t"
    196         "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"
    197         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
    198         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
    199         "extp             %[Temp3],       $ac1,           31              \n\t"
    200 
    201         /* odd 3. pixel */
    202         "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
    203         "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"
    204         "extp             %[Temp2],       $ac3,           31              \n\t"
    205 
    206         /* odd 4. pixel */
                /* store odd 1 */
    207         "sb               %[tp3],         0(%[odd_dst])                   \n\t"
    208         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
    209         "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
    210         "extp             %[Temp1],       $ac2,           31              \n\t"
    211 
    212         /* clamp */
    213         "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
    214         "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
    215         "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"
    216 
    217         /* store bytes */
                /* order: odd 2, odd 3, odd 4 */
    218         "sb               %[p4],          0(%[odd_dst])                   \n\t"
    219         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
    220 
    221         "sb               %[p2],          0(%[odd_dst])                   \n\t"
    222         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
    223 
    224         "sb               %[p1],          0(%[odd_dst])                   \n\t"
    225 
    226         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
    227           [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
    228           [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
    229           [odd_dst] "+r"(odd_dst)
    230         : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
    231           [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
    232 
    233     /* Next row... */
            /* the next source row is written one byte further along dst
             * (transposed output) */
    234     src += src_stride;
    235     dst += 1;
    236   }
    237 }
    238 
    /*
     * Two-tap horizontal filter for blocks made of 16-pixel groups, stored
     * transposed.
     *
     * For each of the h source rows, `count` groups of 16 results are computed.
     * Within a group the "even" results (1st, 3rd, ...) are written through dst
     * and the "odd" results through odd_dst (= dst + dst_stride); both pointers
     * step by dst_pitch_2 (= 2 * dst_stride), so result x of the group lands at
     * group_base + x * dst_stride.  After a group, src advances by 16 bytes and
     * dst is repositioned to dst_ptr + (c + 1) * 16 * dst_stride.  After a row,
     * src_ptr advances by src_stride and dst_ptr by one byte, so source rows end
     * up as destination columns.
     *
     *   src_ptr, src_stride  source pixels and the byte step between source rows
     *   dst_ptr, dst_stride  destination and the byte step between consecutive
     *                        results of the same source row
     *   filter_x0            filter kernel; only filter_x0[3] and filter_x0[4]
     *                        are read, packed into one 32-bit operand (filter45)
     *   h                    number of source rows to process
     *   count                number of 16-pixel groups per source row
     *
     * Accumulators are seeded with 64 (vector_64) and every extracted sum is
     * clamped by indexing vpx_ff_cropTbl with lbux.
     *
     * NOTE(review): the words loaded at 20(src) and 21(src) are unpacked into
     * p5, but no later instruction in this block uses those p5 values; the
     * loads still touch bytes up to src[24] for every group.  This looks like a
     * leftover -- confirm before removing.
     * NOTE(review): extp takes its bit position from the DSPControl register,
     * which is not written in this function -- presumably configured by the
     * caller; confirm.
     * NOTE(review): filter45 is read through an int32_t pointer derived from
     * &filter_x0[3]; this relies on the target tolerating that access
     * (alignment and aliasing) -- confirm.
     * NOTE(review): the asm stores to memory and uses $ac1..$ac3, but the
     * statement lists no clobbers.
     */
    239 static void convolve_bi_horiz_16_transposed_dspr2(
    240     const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
    241     int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
    242   int32_t c, y;
    243   const uint8_t *src;
    244   uint8_t *dst;
    245   uint8_t *cm = vpx_ff_cropTbl;
    246   uint32_t vector_64 = 64;
    247   int32_t Temp1, Temp2, Temp3;
    248   uint32_t qload1, qload2;
    249   uint32_t p1, p2, p3, p4, p5;
    250   uint32_t st1, st2, st3;
          /* step between two results of the same parity */
    251   uint32_t dst_pitch_2 = (dst_stride << 1);
    252   uint8_t *odd_dst;
    253   const int16_t *filter = &filter_x0[3];
    254   uint32_t filter45;
    255 
          /* Pack taps filter_x0[3] and filter_x0[4] into one paired-halfword
           * operand for dpa.w.ph. */
    256   filter45 = ((const int32_t *)filter)[0];
    257 
          /* One iteration per source row. */
    258   for (y = h; y--;) {
    259     /* prefetch data to cache memory */
    260     prefetch_load(src_ptr + src_stride);
    261     prefetch_load(src_ptr + src_stride + 32);
    262 
    263     src = src_ptr;
    264     dst = dst_ptr;
    265 
            /* even results start at dst, odd results one dst_stride later */
    266     odd_dst = (dst + dst_stride);
    267 
            /* One iteration per group of 16 results within the row. */
    268     for (c = 0; c < count; c++) {
    269       __asm__ __volatile__(
    270           "ulw              %[qload1],        0(%[src])                       "
    271           "\n\t"
    272           "ulw              %[qload2],        4(%[src])                       "
    273           "\n\t"
    274 
    275           /* even 1. pixel */
    276           "mtlo             %[vector_64],     $ac1                            "
    277           "\n\t" /* even 1 */
    278           "mthi             $zero,            $ac1                            "
    279           "\n\t"
    280           "mtlo             %[vector_64],     $ac2                            "
    281           "\n\t" /* even 2 */
    282           "mthi             $zero,            $ac2                            "
    283           "\n\t"
    284           "preceu.ph.qbr    %[p1],            %[qload1]                       "
    285           "\n\t"
    286           "preceu.ph.qbl    %[p2],            %[qload1]                       "
    287           "\n\t"
    288           "preceu.ph.qbr    %[p3],            %[qload2]                       "
    289           "\n\t"
    290           "preceu.ph.qbl    %[p4],            %[qload2]                       "
    291           "\n\t"
    292           "ulw              %[qload1],        8(%[src])                       "
    293           "\n\t"
    294           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
    295           "\n\t" /* even 1 */
    296           "extp             %[Temp1],         $ac1,           31              "
    297           "\n\t" /* even 1 */
    298 
    299           /* even 2. pixel */
    300           "mtlo             %[vector_64],     $ac3                            "
    301           "\n\t" /* even 3 */
    302           "mthi             $zero,            $ac3                            "
    303           "\n\t"
    304           "preceu.ph.qbr    %[p1],            %[qload1]                       "
    305           "\n\t"
    306           "preceu.ph.qbl    %[p5],            %[qload1]                       "
    307           "\n\t"
    308           "ulw              %[qload2],        12(%[src])                      "
    309           "\n\t"
    310           "dpa.w.ph         $ac2,             %[p2],          %[filter45]     "
    311           "\n\t" /* even 2 */
    312           "lbux             %[st1],           %[Temp1](%[cm])                 "
    313           "\n\t" /* even 1 */
    314           "extp             %[Temp2],         $ac2,           31              "
    315           "\n\t" /* even 2 */
    316 
    317           /* even 3. pixel */
    318           "mtlo             %[vector_64],     $ac1                            "
    319           "\n\t" /* even 4 */
    320           "mthi             $zero,            $ac1                            "
    321           "\n\t"
    322           "preceu.ph.qbr    %[p2],            %[qload2]                       "
    323           "\n\t"
    324           "sb               %[st1],           0(%[dst])                       "
    325           "\n\t" /* even 1 */
    326           "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
    327           "          \n\t"
    328           "dpa.w.ph         $ac3,             %[p3],          %[filter45]     "
    329           "\n\t" /* even 3 */
    330           "extp             %[Temp3],         $ac3,           31              "
    331           "\n\t" /* even 3 */
    332           "lbux             %[st2],           %[Temp2](%[cm])                 "
    333           "\n\t" /* even 2 */
    334 
    335           /* even 4. pixel */
    336           "mtlo             %[vector_64],     $ac2                            "
    337           "\n\t" /* even 5 */
    338           "mthi             $zero,            $ac2                            "
    339           "\n\t"
    340           "preceu.ph.qbl    %[p3],            %[qload2]                       "
    341           "\n\t"
    342           "sb               %[st2],           0(%[dst])                       "
    343           "\n\t" /* even 2 */
    344           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    345           "\n\t"
    346           "dpa.w.ph         $ac1,             %[p4],          %[filter45]     "
    347           "\n\t" /* even 4 */
    348           "extp             %[Temp1],         $ac1,           31              "
    349           "\n\t" /* even 4 */
    350           "lbux             %[st3],           %[Temp3](%[cm])                 "
    351           "\n\t" /* even 3 */
    352 
    353           /* even 5. pixel */
    354           "mtlo             %[vector_64],     $ac3                            "
    355           "\n\t" /* even 6 */
    356           "mthi             $zero,            $ac3                            "
    357           "\n\t"
    358           "sb               %[st3],           0(%[dst])                       "
    359           "\n\t" /* even 3 */
    360           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    361           "\n\t"
    362           "dpa.w.ph         $ac2,             %[p1],          %[filter45]     "
    363           "\n\t" /* even 5 */
    364           "extp             %[Temp2],         $ac2,           31              "
    365           "\n\t" /* even 5 */
    366           "lbux             %[st1],           %[Temp1](%[cm])                 "
    367           "\n\t" /* even 4 */
    368 
    369           /* even 6. pixel */
    370           "mtlo             %[vector_64],     $ac1                            "
    371           "\n\t" /* even 7 */
    372           "mthi             $zero,            $ac1                            "
    373           "\n\t"
    374           "sb               %[st1],           0(%[dst])                       "
    375           "\n\t" /* even 4 */
    376           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    377           "\n\t"
                  /* NOTE(review): the p5 unpacked from this word (even 7 section
                   * below) is overwritten before any dpa uses it. */
    378           "ulw              %[qload1],        20(%[src])                      "
    379           "\n\t"
    380           "dpa.w.ph         $ac3,             %[p5],          %[filter45]     "
    381           "\n\t" /* even 6 */
    382           "extp             %[Temp3],         $ac3,           31              "
    383           "\n\t" /* even 6 */
    384           "lbux             %[st2],           %[Temp2](%[cm])                 "
    385           "\n\t" /* even 5 */
    386 
    387           /* even 7. pixel */
    388           "mtlo             %[vector_64],     $ac2                            "
    389           "\n\t" /* even 8 */
    390           "mthi             $zero,            $ac2                            "
    391           "\n\t"
    392           "preceu.ph.qbr    %[p5],            %[qload1]                       "
    393           "\n\t"
    394           "sb               %[st2],           0(%[dst])                       "
    395           "\n\t" /* even 5 */
    396           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    397           "\n\t"
    398           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
    399           "\n\t" /* even 7 */
    400           "extp             %[Temp1],         $ac1,           31              "
    401           "\n\t" /* even 7 */
    402           "lbux             %[st3],           %[Temp3](%[cm])                 "
    403           "\n\t" /* even 6 */
    404 
    405           /* even 8. pixel */
    406           "mtlo             %[vector_64],     $ac3                            "
    407           "\n\t" /* odd 1 */
    408           "mthi             $zero,            $ac3                            "
    409           "\n\t"
    410           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
    411           "\n\t" /* even 8 */
    412           "sb               %[st3],           0(%[dst])                       "
    413           "\n\t" /* even 6 */
    414           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    415           "\n\t"
    416           "extp             %[Temp2],         $ac2,           31              "
    417           "\n\t" /* even 8 */
    418           "lbux             %[st1],           %[Temp1](%[cm])                 "
    419           "\n\t" /* even 7 */
    420 
    421           /* ODD pixels */
                  /* same schedule as above, with every load offset one byte
                   * higher */
    422           "ulw              %[qload1],        1(%[src])                       "
    423           "\n\t"
    424           "ulw              %[qload2],        5(%[src])                       "
    425           "\n\t"
    426 
    427           /* odd 1. pixel */
    428           "mtlo             %[vector_64],     $ac1                            "
    429           "\n\t" /* odd 2 */
    430           "mthi             $zero,            $ac1                            "
    431           "\n\t"
    432           "preceu.ph.qbr    %[p1],            %[qload1]                       "
    433           "\n\t"
    434           "preceu.ph.qbl    %[p2],            %[qload1]                       "
    435           "\n\t"
    436           "preceu.ph.qbr    %[p3],            %[qload2]                       "
    437           "\n\t"
    438           "preceu.ph.qbl    %[p4],            %[qload2]                       "
    439           "\n\t"
    440           "sb               %[st1],           0(%[dst])                       "
    441           "\n\t" /* even 7 */
    442           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    443           "\n\t"
    444           "ulw              %[qload2],        9(%[src])                       "
    445           "\n\t"
    446           "dpa.w.ph         $ac3,             %[p1],          %[filter45]     "
    447           "\n\t" /* odd 1 */
    448           "extp             %[Temp3],         $ac3,           31              "
    449           "\n\t" /* odd 1 */
    450           "lbux             %[st2],           %[Temp2](%[cm])                 "
    451           "\n\t" /* even 8 */
    452 
    453           /* odd 2. pixel */
    454           "mtlo             %[vector_64],     $ac2                            "
    455           "\n\t" /* odd 3 */
    456           "mthi             $zero,            $ac2                            "
    457           "\n\t"
    458           "preceu.ph.qbr    %[p1],            %[qload2]                       "
    459           "\n\t"
    460           "preceu.ph.qbl    %[p5],            %[qload2]                       "
    461           "\n\t"
                  /* last even store; dst is not advanced here because the C code
                   * recomputes it after the asm */
    462           "sb               %[st2],           0(%[dst])                       "
    463           "\n\t" /* even 8 */
    464           "ulw              %[qload1],        13(%[src])                      "
    465           "\n\t"
    466           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
    467           "\n\t" /* odd 2 */
    468           "extp             %[Temp1],         $ac1,           31              "
    469           "\n\t" /* odd 2 */
    470           "lbux             %[st3],           %[Temp3](%[cm])                 "
    471           "\n\t" /* odd 1 */
    472 
    473           /* odd 3. pixel */
    474           "mtlo             %[vector_64],     $ac3                            "
    475           "\n\t" /* odd 4 */
    476           "mthi             $zero,            $ac3                            "
    477           "\n\t"
    478           "preceu.ph.qbr    %[p2],            %[qload1]                       "
    479           "\n\t"
    480           "sb               %[st3],           0(%[odd_dst])                   "
    481           "\n\t" /* odd 1 */
    482           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    483           "\n\t"
    484           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
    485           "\n\t" /* odd 3 */
    486           "extp             %[Temp2],         $ac2,           31              "
    487           "\n\t" /* odd 3 */
    488           "lbux             %[st1],           %[Temp1](%[cm])                 "
    489           "\n\t" /* odd 2 */
    490 
    491           /* odd 4. pixel */
    492           "mtlo             %[vector_64],     $ac1                            "
    493           "\n\t" /* odd 5 */
    494           "mthi             $zero,            $ac1                            "
    495           "\n\t"
    496           "preceu.ph.qbl    %[p3],            %[qload1]                       "
    497           "\n\t"
    498           "sb               %[st1],           0(%[odd_dst])                   "
    499           "\n\t" /* odd 2 */
    500           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    501           "\n\t"
    502           "dpa.w.ph         $ac3,             %[p4],          %[filter45]     "
    503           "\n\t" /* odd 4 */
    504           "extp             %[Temp3],         $ac3,           31              "
    505           "\n\t" /* odd 4 */
    506           "lbux             %[st2],           %[Temp2](%[cm])                 "
    507           "\n\t" /* odd 3 */
    508 
    509           /* odd 5. pixel */
    510           "mtlo             %[vector_64],     $ac2                            "
    511           "\n\t" /* odd 6 */
    512           "mthi             $zero,            $ac2                            "
    513           "\n\t"
    514           "sb               %[st2],           0(%[odd_dst])                   "
    515           "\n\t" /* odd 3 */
    516           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    517           "\n\t"
    518           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
    519           "\n\t" /* odd 5 */
    520           "extp             %[Temp1],         $ac1,           31              "
    521           "\n\t" /* odd 5 */
    522           "lbux             %[st3],           %[Temp3](%[cm])                 "
    523           "\n\t" /* odd 4 */
    524 
    525           /* odd 6. pixel */
    526           "mtlo             %[vector_64],     $ac3                            "
    527           "\n\t" /* odd 7 */
    528           "mthi             $zero,            $ac3                            "
    529           "\n\t"
    530           "sb               %[st3],           0(%[odd_dst])                   "
    531           "\n\t" /* odd 4 */
    532           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    533           "\n\t"
                  /* NOTE(review): the p5 unpacked from this word (odd 7 section
                   * below) is never used by a later instruction. */
    534           "ulw              %[qload1],        21(%[src])                      "
    535           "\n\t"
    536           "dpa.w.ph         $ac2,             %[p5],          %[filter45]     "
    537           "\n\t" /* odd 6 */
    538           "extp             %[Temp2],         $ac2,           31              "
    539           "\n\t" /* odd 6 */
    540           "lbux             %[st1],           %[Temp1](%[cm])                 "
    541           "\n\t" /* odd 5 */
    542 
    543           /* odd 7. pixel */
    544           "mtlo             %[vector_64],     $ac1                            "
    545           "\n\t" /* odd 8 */
    546           "mthi             $zero,            $ac1                            "
    547           "\n\t"
    548           "preceu.ph.qbr    %[p5],            %[qload1]                       "
    549           "\n\t"
    550           "sb               %[st1],           0(%[odd_dst])                   "
    551           "\n\t" /* odd 5 */
    552           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    553           "\n\t"
    554           "dpa.w.ph         $ac3,             %[p2],          %[filter45]     "
    555           "\n\t" /* odd 7 */
    556           "extp             %[Temp3],         $ac3,           31              "
    557           "\n\t" /* odd 7 */
    558 
    559           /* odd 8. pixel */
    560           "dpa.w.ph         $ac1,             %[p3],          %[filter45]     "
    561           "\n\t" /* odd 8 */
    562           "extp             %[Temp1],         $ac1,           31              "
    563           "\n\t" /* odd 8 */
    564 
                  /* clamp the last three odd results */
    565           "lbux             %[st2],           %[Temp2](%[cm])                 "
    566           "\n\t" /* odd 6 */
    567           "lbux             %[st3],           %[Temp3](%[cm])                 "
    568           "\n\t" /* odd 7 */
    569           "lbux             %[st1],           %[Temp1](%[cm])                 "
    570           "\n\t" /* odd 8 */
    571 
                  /* store the last three odd results */
    572           "sb               %[st2],           0(%[odd_dst])                   "
    573           "\n\t" /* odd 6 */
    574           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    575           "\n\t"
    576 
    577           "sb               %[st3],           0(%[odd_dst])                   "
    578           "\n\t" /* odd 7 */
    579           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    580           "\n\t"
    581 
    582           "sb               %[st1],           0(%[odd_dst])                   "
    583           "\n\t" /* odd 8 */
    584 
    585           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
    586             [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
    587             [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
    588             [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
    589             [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
    590           : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
    591             [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
    592 
              /* Next group: 16 source bytes further on, 16 output rows further
               * down; the pointers modified inside the asm are recomputed from
               * dst_ptr. */
    593       src += 16;
    594       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
    595       odd_dst = (dst + dst_stride);
    596     }
    597 
    598     /* Next row... */
            /* the next source row is written one byte further along dst_ptr
             * (transposed output) */
    599     src_ptr += src_stride;
    600     dst_ptr += 1;
    601   }
    602 }
    603 
    604 static void convolve_bi_horiz_64_transposed_dspr2(
    605     const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
    606     int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
          /*
           * Two-tap horizontal filter over rows of 64 pixels, with the output
           * written transposed: consecutive outputs of one source row go to
           * consecutive destination rows, and each new source row moves one
           * byte to the right in the destination.
           *
           *   src_ptr    - first pixel of the first source row
           *   src_stride - byte distance between source rows
           *   dst_ptr    - destination for the first output of the first row
           *   dst_stride - byte distance between destination rows
           *   filter_x0  - filter taps; only filter_x0[3] and filter_x0[4] are read
           *   h          - number of source rows to process
           *
           * Each source row is processed as four chunks of 16 pixels. One pass of
           * the inline assembly produces 16 outputs: eight "even" ones stored
           * through dst and eight "odd" ones stored through odd_dst.  The three
           * accumulators $ac1..$ac3 are used in rotation so that the setup,
           * multiply-accumulate, table lookup and store of neighbouring pixels
           * overlap; the trailing comments name the pixel each instruction
           * belongs to.
           *
           * The extp instructions depend on the extract position that
           * vpx_convolve2_dspr2 writes with wrdsp before dispatching here.
           *
           * NOTE(review): the asm statement has no clobber list although it
           * stores through dst/odd_dst and writes $ac1..$ac3 - confirm this is
           * acceptable for the supported compilers.
           */
    607   int32_t c, y;
    608   const uint8_t *src;
    609   uint8_t *dst;
    610   uint8_t *cm = vpx_ff_cropTbl; /* lookup table indexed by the extracted sum; presumably clamps to 8 bits - confirm */
    611   uint32_t vector_64 = 64; /* preloaded into each accumulator; presumably the rounding term - confirm */
    612   int32_t Temp1, Temp2, Temp3;
    613   uint32_t qload1, qload2;
    614   uint32_t p1, p2, p3, p4, p5;
    615   uint32_t st1, st2, st3;
    616   uint32_t dst_pitch_2 = (dst_stride << 1); /* even and odd outputs each step two destination rows */
    617   uint8_t *odd_dst;
    618   const int16_t *filter = &filter_x0[3];
    619   uint32_t filter45;
    620
          /* Fetch taps 3 and 4 as one 32-bit word for dpa.w.ph.
           * NOTE(review): this reads int16_t data through an int32_t pointer;
           * confirm alignment and aliasing are safe on the target. */
    621   filter45 = ((const int32_t *)filter)[0];
    622
    623   for (y = h; y--;) {
    624     /* prefetch data to cache memory */
    625     prefetch_load(src_ptr + src_stride);
    626     prefetch_load(src_ptr + src_stride + 32);
    627     prefetch_load(src_ptr + src_stride + 64);
    628
    629     src = src_ptr;
    630     dst = dst_ptr;
    631
            /* Odd outputs start one destination row below the even ones. */
    632     odd_dst = (dst + dst_stride);
    633
            /* Four chunks of 16 source pixels cover the 64-pixel row. */
    634     for (c = 0; c < 4; c++) {
    635       __asm__ __volatile__(
    636           "ulw              %[qload1],        0(%[src])                       "
    637           "\n\t"
    638           "ulw              %[qload2],        4(%[src])                       "
    639           "\n\t"
    640
    641           /* even 1. pixel */
    642           "mtlo             %[vector_64],     $ac1                            "
    643           "\n\t" /* even 1 */
    644           "mthi             $zero,            $ac1                            "
    645           "\n\t"
    646           "mtlo             %[vector_64],     $ac2                            "
    647           "\n\t" /* even 2 */
    648           "mthi             $zero,            $ac2                            "
    649           "\n\t"
    650           "preceu.ph.qbr    %[p1],            %[qload1]                       "
    651           "\n\t"
    652           "preceu.ph.qbl    %[p2],            %[qload1]                       "
    653           "\n\t"
    654           "preceu.ph.qbr    %[p3],            %[qload2]                       "
    655           "\n\t"
    656           "preceu.ph.qbl    %[p4],            %[qload2]                       "
    657           "\n\t"
    658           "ulw              %[qload1],        8(%[src])                       "
    659           "\n\t"
    660           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
    661           "\n\t" /* even 1 */
    662           "extp             %[Temp1],         $ac1,           31              "
    663           "\n\t" /* even 1 */
    664
    665           /* even 2. pixel */
    666           "mtlo             %[vector_64],     $ac3                            "
    667           "\n\t" /* even 3 */
    668           "mthi             $zero,            $ac3                            "
    669           "\n\t"
    670           "preceu.ph.qbr    %[p1],            %[qload1]                       "
    671           "\n\t"
    672           "preceu.ph.qbl    %[p5],            %[qload1]                       "
    673           "\n\t"
    674           "ulw              %[qload2],        12(%[src])                      "
    675           "\n\t"
    676           "dpa.w.ph         $ac2,             %[p2],          %[filter45]     "
    677           "\n\t" /* even 2 */
    678           "lbux             %[st1],           %[Temp1](%[cm])                 "
    679           "\n\t" /* even 1 */
    680           "extp             %[Temp2],         $ac2,           31              "
    681           "\n\t" /* even 2 */
    682
    683           /* even 3. pixel */
    684           "mtlo             %[vector_64],     $ac1                            "
    685           "\n\t" /* even 4 */
    686           "mthi             $zero,            $ac1                            "
    687           "\n\t"
    688           "preceu.ph.qbr    %[p2],            %[qload2]                       "
    689           "\n\t"
    690           "sb               %[st1],           0(%[dst])                       "
    691           "\n\t" /* even 1 */
    692           "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
    693           "          \n\t"
    694           "dpa.w.ph         $ac3,             %[p3],          %[filter45]     "
    695           "\n\t" /* even 3 */
    696           "extp             %[Temp3],         $ac3,           31              "
    697           "\n\t" /* even 3 */
    698           "lbux             %[st2],           %[Temp2](%[cm])                 "
    699           "\n\t" /* even 2 */
    700
    701           /* even 4. pixel */
    702           "mtlo             %[vector_64],     $ac2                            "
    703           "\n\t" /* even 5 */
    704           "mthi             $zero,            $ac2                            "
    705           "\n\t"
    706           "preceu.ph.qbl    %[p3],            %[qload2]                       "
    707           "\n\t"
    708           "sb               %[st2],           0(%[dst])                       "
    709           "\n\t" /* even 2 */
    710           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    711           "\n\t"
    712           "dpa.w.ph         $ac1,             %[p4],          %[filter45]     "
    713           "\n\t" /* even 4 */
    714           "extp             %[Temp1],         $ac1,           31              "
    715           "\n\t" /* even 4 */
    716           "lbux             %[st3],           %[Temp3](%[cm])                 "
    717           "\n\t" /* even 3 */
    718
    719           /* even 5. pixel */
    720           "mtlo             %[vector_64],     $ac3                            "
    721           "\n\t" /* even 6 */
    722           "mthi             $zero,            $ac3                            "
    723           "\n\t"
    724           "sb               %[st3],           0(%[dst])                       "
    725           "\n\t" /* even 3 */
    726           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    727           "\n\t"
    728           "dpa.w.ph         $ac2,             %[p1],          %[filter45]     "
    729           "\n\t" /* even 5 */
    730           "extp             %[Temp2],         $ac2,           31              "
    731           "\n\t" /* even 5 */
    732           "lbux             %[st1],           %[Temp1](%[cm])                 "
    733           "\n\t" /* even 4 */
    734
    735           /* even 6. pixel */
    736           "mtlo             %[vector_64],     $ac1                            "
    737           "\n\t" /* even 7 */
    738           "mthi             $zero,            $ac1                            "
    739           "\n\t"
    740           "sb               %[st1],           0(%[dst])                       "
    741           "\n\t" /* even 4 */
    742           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    743           "\n\t"
    744           "ulw              %[qload1],        20(%[src])                      "
    745           "\n\t"
                  /* NOTE(review): the p5 later unpacked from this load is not read
                   * by any following dpa.w.ph before p5 is overwritten - confirm
                   * whether the load is still needed. */
    746           "dpa.w.ph         $ac3,             %[p5],          %[filter45]     "
    747           "\n\t" /* even 6 */
    748           "extp             %[Temp3],         $ac3,           31              "
    749           "\n\t" /* even 6 */
    750           "lbux             %[st2],           %[Temp2](%[cm])                 "
    751           "\n\t" /* even 5 */
    752
    753           /* even 7. pixel */
    754           "mtlo             %[vector_64],     $ac2                            "
    755           "\n\t" /* even 8 */
    756           "mthi             $zero,            $ac2                            "
    757           "\n\t"
    758           "preceu.ph.qbr    %[p5],            %[qload1]                       "
    759           "\n\t"
    760           "sb               %[st2],           0(%[dst])                       "
    761           "\n\t" /* even 5 */
    762           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    763           "\n\t"
    764           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
    765           "\n\t" /* even 7 */
    766           "extp             %[Temp1],         $ac1,           31              "
    767           "\n\t" /* even 7 */
    768           "lbux             %[st3],           %[Temp3](%[cm])                 "
    769           "\n\t" /* even 6 */
    770
    771           /* even 8. pixel */
    772           "mtlo             %[vector_64],     $ac3                            "
    773           "\n\t" /* odd 1 */
    774           "mthi             $zero,            $ac3                            "
    775           "\n\t"
    776           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
    777           "\n\t" /* even 8 */
    778           "sb               %[st3],           0(%[dst])                       "
    779           "\n\t" /* even 6 */
    780           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    781           "\n\t"
    782           "extp             %[Temp2],         $ac2,           31              "
    783           "\n\t" /* even 8 */
    784           "lbux             %[st1],           %[Temp1](%[cm])                 "
    785           "\n\t" /* even 7 */
    786
    787           /* ODD pixels */
                  /* Same sequence again, reading from one byte further into src. */
    788           "ulw              %[qload1],        1(%[src])                       "
    789           "\n\t"
    790           "ulw              %[qload2],        5(%[src])                       "
    791           "\n\t"
    792
    793           /* odd 1. pixel */
    794           "mtlo             %[vector_64],     $ac1                            "
    795           "\n\t" /* odd 2 */
    796           "mthi             $zero,            $ac1                            "
    797           "\n\t"
    798           "preceu.ph.qbr    %[p1],            %[qload1]                       "
    799           "\n\t"
    800           "preceu.ph.qbl    %[p2],            %[qload1]                       "
    801           "\n\t"
    802           "preceu.ph.qbr    %[p3],            %[qload2]                       "
    803           "\n\t"
    804           "preceu.ph.qbl    %[p4],            %[qload2]                       "
    805           "\n\t"
    806           "sb               %[st1],           0(%[dst])                       "
    807           "\n\t" /* even 7 */
    808           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
    809           "\n\t"
    810           "ulw              %[qload2],        9(%[src])                       "
    811           "\n\t"
    812           "dpa.w.ph         $ac3,             %[p1],          %[filter45]     "
    813           "\n\t" /* odd 1 */
    814           "extp             %[Temp3],         $ac3,           31              "
    815           "\n\t" /* odd 1 */
    816           "lbux             %[st2],           %[Temp2](%[cm])                 "
    817           "\n\t" /* even 8 */
    818
    819           /* odd 2. pixel */
    820           "mtlo             %[vector_64],     $ac2                            "
    821           "\n\t" /* odd 3 */
    822           "mthi             $zero,            $ac2                            "
    823           "\n\t"
    824           "preceu.ph.qbr    %[p1],            %[qload2]                       "
    825           "\n\t"
    826           "preceu.ph.qbl    %[p5],            %[qload2]                       "
    827           "\n\t"
    828           "sb               %[st2],           0(%[dst])                       "
    829           "\n\t" /* even 8 */
    830           "ulw              %[qload1],        13(%[src])                      "
    831           "\n\t"
    832           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
    833           "\n\t" /* odd 2 */
    834           "extp             %[Temp1],         $ac1,           31              "
    835           "\n\t" /* odd 2 */
    836           "lbux             %[st3],           %[Temp3](%[cm])                 "
    837           "\n\t" /* odd 1 */
    838
    839           /* odd 3. pixel */
    840           "mtlo             %[vector_64],     $ac3                            "
    841           "\n\t" /* odd 4 */
    842           "mthi             $zero,            $ac3                            "
    843           "\n\t"
    844           "preceu.ph.qbr    %[p2],            %[qload1]                       "
    845           "\n\t"
    846           "sb               %[st3],           0(%[odd_dst])                   "
    847           "\n\t" /* odd 1 */
    848           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    849           "\n\t"
    850           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
    851           "\n\t" /* odd 3 */
    852           "extp             %[Temp2],         $ac2,           31              "
    853           "\n\t" /* odd 3 */
    854           "lbux             %[st1],           %[Temp1](%[cm])                 "
    855           "\n\t" /* odd 2 */
    856
    857           /* odd 4. pixel */
    858           "mtlo             %[vector_64],     $ac1                            "
    859           "\n\t" /* odd 5 */
    860           "mthi             $zero,            $ac1                            "
    861           "\n\t"
    862           "preceu.ph.qbl    %[p3],            %[qload1]                       "
    863           "\n\t"
    864           "sb               %[st1],           0(%[odd_dst])                   "
    865           "\n\t" /* odd 2 */
    866           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    867           "\n\t"
    868           "dpa.w.ph         $ac3,             %[p4],          %[filter45]     "
    869           "\n\t" /* odd 4 */
    870           "extp             %[Temp3],         $ac3,           31              "
    871           "\n\t" /* odd 4 */
    872           "lbux             %[st2],           %[Temp2](%[cm])                 "
    873           "\n\t" /* odd 3 */
    874
    875           /* odd 5. pixel */
    876           "mtlo             %[vector_64],     $ac2                            "
    877           "\n\t" /* odd 6 */
    878           "mthi             $zero,            $ac2                            "
    879           "\n\t"
    880           "sb               %[st2],           0(%[odd_dst])                   "
    881           "\n\t" /* odd 3 */
    882           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    883           "\n\t"
    884           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
    885           "\n\t" /* odd 5 */
    886           "extp             %[Temp1],         $ac1,           31              "
    887           "\n\t" /* odd 5 */
    888           "lbux             %[st3],           %[Temp3](%[cm])                 "
    889           "\n\t" /* odd 4 */
    890
    891           /* odd 6. pixel */
    892           "mtlo             %[vector_64],     $ac3                            "
    893           "\n\t" /* odd 7 */
    894           "mthi             $zero,            $ac3                            "
    895           "\n\t"
    896           "sb               %[st3],           0(%[odd_dst])                   "
    897           "\n\t" /* odd 4 */
    898           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    899           "\n\t"
    900           "ulw              %[qload1],        21(%[src])                      "
    901           "\n\t"
                  /* NOTE(review): as with the load at offset 20, the p5 unpacked
                   * from this word is never read afterwards - confirm whether the
                   * load is still needed. */
    902           "dpa.w.ph         $ac2,             %[p5],          %[filter45]     "
    903           "\n\t" /* odd 6 */
    904           "extp             %[Temp2],         $ac2,           31              "
    905           "\n\t" /* odd 6 */
    906           "lbux             %[st1],           %[Temp1](%[cm])                 "
    907           "\n\t" /* odd 5 */
    908
    909           /* odd 7. pixel */
    910           "mtlo             %[vector_64],     $ac1                            "
    911           "\n\t" /* odd 8 */
    912           "mthi             $zero,            $ac1                            "
    913           "\n\t"
    914           "preceu.ph.qbr    %[p5],            %[qload1]                       "
    915           "\n\t"
    916           "sb               %[st1],           0(%[odd_dst])                   "
    917           "\n\t" /* odd 5 */
    918           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    919           "\n\t"
    920           "dpa.w.ph         $ac3,             %[p2],          %[filter45]     "
    921           "\n\t" /* odd 7 */
    922           "extp             %[Temp3],         $ac3,           31              "
    923           "\n\t" /* odd 7 */
    924
    925           /* odd 8. pixel */
    926           "dpa.w.ph         $ac1,             %[p3],          %[filter45]     "
    927           "\n\t" /* odd 8 */
    928           "extp             %[Temp1],         $ac1,           31              "
    929           "\n\t" /* odd 8 */
    930
                  /* Drain the pipeline: look up and store the last three odd outputs. */
    931           "lbux             %[st2],           %[Temp2](%[cm])                 "
    932           "\n\t" /* odd 6 */
    933           "lbux             %[st3],           %[Temp3](%[cm])                 "
    934           "\n\t" /* odd 7 */
    935           "lbux             %[st1],           %[Temp1](%[cm])                 "
    936           "\n\t" /* odd 8 */
    937
    938           "sb               %[st2],           0(%[odd_dst])                   "
    939           "\n\t" /* odd 6 */
    940           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    941           "\n\t"
    942
    943           "sb               %[st3],           0(%[odd_dst])                   "
    944           "\n\t" /* odd 7 */
    945           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
    946           "\n\t"
    947
    948           "sb               %[st1],           0(%[odd_dst])                   "
    949           "\n\t" /* odd 8 */
    950
                  /* Outputs: scratch registers (early-clobber) plus the two store
                   * pointers, which the asm advances in place. */
    951           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
    952             [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
    953             [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
    954             [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
    955             [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
    956           : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
    957             [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
    958
              /* Advance to the next 16 source pixels; their outputs start 16
               * destination rows further down.  dst is recomputed from dst_ptr
               * rather than taken from the pointer the asm left behind. */
    959       src += 16;
    960       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
    961       odd_dst = (dst + dst_stride);
    962     }
    963
    964     /* Next row... */
            /* The next source row is written one byte to the right (transposed). */
    965     src_ptr += src_stride;
    966     dst_ptr += 1;
    967   }
    968 }
    969 
    970 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
    971                                   uint8_t *dst, ptrdiff_t dst_stride,
    972                                   const int16_t *filter, int w, int h) {
          /*
           * Plain C two-tap horizontal filter with transposed output, used by
           * vpx_convolve2_dspr2 for widths that have no dedicated DSPr2 kernel.
           *
           *   src        - first pixel of the first source row
           *   src_stride - distance between source rows
           *   dst        - destination for the first output of the first row
           *   dst_stride - distance between destination rows
           *   filter     - filter taps; only filter[3] and filter[4] are read
           *   w, h       - width and height of the source area, in pixels
           *
           * Output x of source row y is written to dst[x * dst_stride + y].
           * Each output reads src[x] and src[x + 1], so w + 1 source pixels are
           * read per row.
           */
    973   int x, y;
    974
    975   for (y = 0; y < h; ++y) {
    976     for (x = 0; x < w; ++x) {
    977       int sum = 0;
    978
    979       sum += src[x] * filter[3];
    980       sum += src[x + 1] * filter[4];
    981
              /* Round and clip via the project helpers, then store down the
               * destination column rather than along the row. */
    982       dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
    983     }
    984
            /* Next source row; its outputs go one byte to the right in dst. */
    985     src += src_stride;
    986     dst += 1;
    987   }
    988 }
    989 
    990 void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    991                          ptrdiff_t dst_stride, const int16_t *filter, int w,
    992                          int h) {
          /*
           * Entry point for the two-tap horizontal filter with transposed
           * output.  Sets up the DSP extract position, then dispatches on the
           * block width w to a DSPr2 kernel (4, 8, 16, 32, 64) or to the plain C
           * fallback for any other width.  src/dst, their strides, filter and h
           * are forwarded unchanged to the selected routine.
           */
    993   uint32_t pos = 38;
    994
    995   /* bit position for extract from acc */
          /* Must run before any kernel below, since their extp instructions
           * depend on this setting. */
    996   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
    997                        :
    998                        : [pos] "r"(pos));
    999
   1000   /* prefetch data to cache memory */
   1001   prefetch_load(src);
   1002   prefetch_load(src + 32);
   1003
   1004   switch (w) {
   1005     case 4:
   1006       convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
   1007                                            filter, h);
   1008       break;
   1009     case 8:
   1010       convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
   1011                                            filter, h);
   1012       break;
   1013     case 16:
   1014     case 32:
              /* Widths 16 and 32 share one kernel; w / 16 is passed as its last
               * argument (presumably the number of 16-pixel chunks per row). */
   1015       convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
   1016                                             filter, h, (w / 16));
   1017       break;
   1018     case 64:
              /* NOTE(review): src + 32 was already prefetched before the switch;
               * possibly src + 64 was intended - confirm. */
   1019       prefetch_load(src + 32);
   1020       convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
   1021                                             filter, h);
   1022       break;
   1023     default:
              /* No dedicated kernel for this width: use the C implementation. */
   1024       convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
   1025                                    h);
   1026       break;
   1027   }
   1028 }
   1029 #endif
   1030