Home | History | Annotate | Download | only in dspr2
      1 /*
      2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 
     12 #include <stdlib.h>
     13 #include "vp8_rtcd.h"
     14 #include "vpx_ports/mem.h"
     15 
     16 #if HAVE_DSPR2
/* Number of guard entries placed on each side of the 256-entry middle range
 * of ff_cropTbl. */
#define CROP_WIDTH 256
/*
 * Clamping lookup table, filled in by dsputil_static_init():
 *   [0, CROP_WIDTH)                     -> 0
 *   [CROP_WIDTH, CROP_WIDTH + 256)      -> 0..255 (identity)
 *   [CROP_WIDTH + 256, end)             -> 255
 * The filter routines in this file index it through
 * `cm = ff_cropTbl + CROP_WIDTH`.
 */
unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
     19 
/*
 * Per-xoffset coefficient table with two 8-bit values packed into each
 * 16-bit entry (high byte, low byte).  The bytes equal the absolute values
 * of the signed taps stored in sub_pel_filters_int, grouped as
 * { outer pair, second pair, centre pair }.
 *
 * NOTE(review): nothing in the visible part of this file references this
 * table -- confirm whether it is still needed.
 */
static const unsigned short sub_pel_filterss[8][3] =
{
    {      0,      0,      0},
    {      0, 0x0601, 0x7b0c},  /* (0,0)  (6,1)    (123,12)  */
    { 0x0201, 0x0b08, 0x6c24},  /* (2,1)  (11,8)   (108,36)  */
    {      0, 0x0906, 0x5d32},  /* (0,0)  (9,6)    (93,50)   */
    { 0x0303, 0x1010, 0x4d4d},  /* (3,3)  (16,16)  (77,77)   */
    {      0, 0x0609, 0x325d},  /* (0,0)  (6,9)    (50,93)   */
    { 0x0102, 0x080b, 0x246c},  /* (1,2)  (8,11)   (36,108)  */
    {      0, 0x0106, 0x0c7b},  /* (0,0)  (1,6)    (12,123)  */
};
     31 
     32 
/*
 * Six signed 16-bit filter taps per xoffset, packed two per 32-bit word as
 * (upper halfword, lower halfword) = (t0,t1), (t2,t3), (t4,t5).
 * The six taps of every non-zero row sum to 128.
 * This is sub_pel_filters_inv with the two halfwords of each word swapped.
 *
 * NOTE(review): not referenced in the visible part of this file.
 */
static const int sub_pel_filters_int[8][3] =
{
    {          0,          0,          0},
    { 0x0000fffa, 0x007b000c, 0xffff0000},  /* 0,  -6, 123,  12,  -1, 0 */
    { 0x0002fff5, 0x006c0024, 0xfff80001},  /* 2, -11, 108,  36,  -8, 1 */
    { 0x0000fff7, 0x005d0032, 0xfffa0000},  /* 0,  -9,  93,  50,  -6, 0 */
    { 0x0003fff0, 0x004d004d, 0xfff00003},  /* 3, -16,  77,  77, -16, 3 */
    { 0x0000fffa, 0x0032005d, 0xfff70000},  /* 0,  -6,  50,  93,  -9, 0 */
    { 0x0001fff8, 0x0024006c, 0xfff50002},  /* 1,  -8,  36, 108, -11, 2 */
    { 0x0000ffff, 0x000c007b, 0xfffa0000},  /* 0,  -1,  12, 123,  -6, 0 */
};
     44 
     45 
/*
 * Six signed 16-bit filter taps per xoffset, packed two per 32-bit word as
 * (upper halfword, lower halfword) = (t1,t0), (t3,t2), (t5,t4).
 * The row comments below list the taps in order t0..t5; every non-zero row
 * sums to 128.
 *
 * The first-pass filters read word [2] to pick a code path: 0 (row 0) means
 * plain copy, a value above 65536 means the upper halfword t5 is non-zero
 * (rows 2, 4, 6) and all six taps are used, anything else (rows 1, 3, 5, 7)
 * has zero outer taps and is handled through sub_pel_filters_inv_tap_4.
 */
static const int sub_pel_filters_inv[8][3] =
{
    {          0,          0,          0},
    { 0xfffa0000, 0x000c007b, 0x0000ffff},  /* 0,  -6, 123,  12,  -1, 0 */
    { 0xfff50002, 0x0024006c, 0x0001fff8},  /* 2, -11, 108,  36,  -8, 1 */
    { 0xfff70000, 0x0032005d, 0x0000fffa},  /* 0,  -9,  93,  50,  -6, 0 */
    { 0xfff00003, 0x004d004d, 0x0003fff0},  /* 3, -16,  77,  77, -16, 3 */
    { 0xfffa0000, 0x005d0032, 0x0000fff7},  /* 0,  -6,  50,  93,  -9, 0 */
    { 0xfff80001, 0x006c0024, 0x0002fff5},  /* 1,  -8,  36, 108, -11, 2 */
    { 0xffff0000, 0x007b000c, 0x0000fffa},  /* 0,  -1,  12, 123,  -6, 0 */
};
     57 
     58 
/*
 * The four middle taps (t1..t4) of the odd-numbered rows of the six-tap
 * tables, packed as (upper halfword, lower halfword) = (t1,t2), (t3,t4).
 * Even-numbered rows are all zero.
 * This is sub_pel_filters_inv_tap_4 with the halfwords of each word swapped.
 *
 * NOTE(review): not referenced in the visible part of this file.
 */
static const int sub_pel_filters_int_tap_4[8][2] =
{
    {          0,          0},
    { 0xfffa007b, 0x000cffff},  /* -6, 123,  12, -1 */
    {          0,          0},
    { 0xfff7005d, 0x0032fffa},  /* -9,  93,  50, -6 */
    {          0,          0},
    { 0xfffa0032, 0x005dfff7},  /* -6,  50,  93, -9 */
    {          0,          0},
    { 0xffff000c, 0x007bfffa},  /* -1,  12, 123, -6 */
};
     70 
     71 
/*
 * The four middle taps (t1..t4) of the odd-numbered rows of
 * sub_pel_filters_inv, packed as (upper halfword, lower halfword) =
 * (t2,t1), (t4,t3).  The row comments list the taps in order t1..t4.
 * Even-numbered rows are all zero; the first-pass filters only reach this
 * table for rows whose outer taps are zero (rows 1, 3, 5, 7).
 */
static const int sub_pel_filters_inv_tap_4[8][2] =
{
    {          0,          0},
    { 0x007bfffa, 0xffff000c},  /* -6, 123,  12, -1 */
    {          0,          0},
    { 0x005dfff7, 0xfffa0032},  /* -9,  93,  50, -6 */
    {          0,          0},
    { 0x0032fffa, 0xfff7005d},  /* -6,  50,  93, -9 */
    {          0,          0},
    { 0x000cffff, 0xfffa007b},  /* -1,  12, 123, -6 */
};
     83 
     84 inline void prefetch_load(unsigned char *src)
     85 {
     86     __asm__ __volatile__ (
     87         "pref   0,  0(%[src])   \n\t"
     88         :
     89         : [src] "r" (src)
     90     );
     91 }
     92 
     93 
     94 inline void prefetch_store(unsigned char *dst)
     95 {
     96     __asm__ __volatile__ (
     97         "pref   1,  0(%[dst])   \n\t"
     98         :
     99         : [dst] "r" (dst)
    100     );
    101 }
    102 
    103 void dsputil_static_init(void)
    104 {
    105     int i;
    106 
    107     for (i = 0; i < 256; i++) ff_cropTbl[i + CROP_WIDTH] = i;
    108 
    109     for (i = 0; i < CROP_WIDTH; i++)
    110     {
    111         ff_cropTbl[i] = 0;
    112         ff_cropTbl[i + CROP_WIDTH + 256] = 255;
    113     }
    114 }
    115 
    116 void vp8_filter_block2d_first_pass_4
    117 (
    118     unsigned char *RESTRICT src_ptr,
    119     unsigned char *RESTRICT dst_ptr,
    120     unsigned int src_pixels_per_line,
    121     unsigned int output_height,
    122     int xoffset,
    123     int pitch
    124 )
    125 {
    126     unsigned int i;
    127     int Temp1, Temp2, Temp3, Temp4;
    128 
    129     unsigned int vector4a = 64;
    130     int vector1b, vector2b, vector3b;
    131     unsigned int tp1, tp2, tn1, tn2;
    132     unsigned int p1, p2, p3;
    133     unsigned int n1, n2, n3;
    134     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
    135 
    136     vector3b = sub_pel_filters_inv[xoffset][2];
    137 
    138     /* if (xoffset == 0) we don't need any filtering */
    139     if (vector3b == 0)
    140     {
    141         for (i = 0; i < output_height; i++)
    142         {
    143             /* prefetch src_ptr data to cache memory */
    144             prefetch_load(src_ptr + src_pixels_per_line);
    145             dst_ptr[0] = src_ptr[0];
    146             dst_ptr[1] = src_ptr[1];
    147             dst_ptr[2] = src_ptr[2];
    148             dst_ptr[3] = src_ptr[3];
    149 
    150             /* next row... */
    151             src_ptr += src_pixels_per_line;
    152             dst_ptr += 4;
    153         }
    154     }
    155     else
    156     {
    157         if (vector3b > 65536)
    158         {
    159             /* 6 tap filter */
    160 
    161             vector1b = sub_pel_filters_inv[xoffset][0];
    162             vector2b = sub_pel_filters_inv[xoffset][1];
    163 
    164             /* prefetch src_ptr data to cache memory */
    165             prefetch_load(src_ptr + src_pixels_per_line);
    166 
    167             for (i = output_height; i--;)
    168             {
    169                 /* apply filter with vectors pairs */
    170                 __asm__ __volatile__ (
    171                     "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
    172                     "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
    173 
    174                     /* even 1. pixel */
    175                     "mtlo             %[vector4a], $ac3                           \n\t"
    176                     "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    177                     "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    178                     "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    179                     "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    180                     "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    181                     "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
    182 
    183                     /* even 2. pixel */
    184                     "mtlo             %[vector4a], $ac2                           \n\t"
    185                     "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
    186                     "balign           %[tp2],      %[tp1],         3              \n\t"
    187                     "extp             %[Temp1],    $ac3,           9              \n\t"
    188                     "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
    189                     "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
    190                     "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
    191 
    192                     /* odd 1. pixel */
    193                     "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
    194                     "mtlo             %[vector4a], $ac3                           \n\t"
    195                     "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
    196                     "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
    197                     "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
    198                     "extp             %[Temp3],    $ac2,           9              \n\t"
    199                     "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
    200                     "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
    201                     "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
    202 
    203                     /* even 2. pixel */
    204                     "mtlo             %[vector4a], $ac2                           \n\t"
    205                     "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
    206                     "extp             %[Temp2],    $ac3,           9              \n\t"
    207                     "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
    208                     "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
    209                     "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
    210                     "extp             %[Temp4],    $ac2,           9              \n\t"
    211 
    212                     /* clamp */
    213                     "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
    214                     "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
    215                     "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
    216                     "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
    217 
    218                     /* store bytes */
    219                     "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
    220                     "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
    221                     "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
    222                     "sb               %[n2],       3(%[dst_ptr])                  \n\t"
    223 
    224                     : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
    225                       [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
    226                       [p3] "=&r" (p3), [n1] "=&r" (n1), [n2] "=&r" (n2),
    227                       [n3] "=&r" (n3), [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    228                       [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
    229                     : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    230                       [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
    231                       [vector3b] "r" (vector3b), [src_ptr] "r" (src_ptr)
    232                 );
    233 
    234                 /* Next row... */
    235                 src_ptr += src_pixels_per_line;
    236                 dst_ptr += pitch;
    237             }
    238         }
    239         else
    240         {
    241             /* 4 tap filter */
    242 
    243             vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
    244             vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
    245 
    246             for (i = output_height; i--;)
    247             {
    248                 /* apply filter with vectors pairs */
    249                 __asm__ __volatile__ (
    250                     "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
    251                     "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
    252 
    253                     /* even 1. pixel */
    254                     "mtlo             %[vector4a], $ac3                           \n\t"
    255                     "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    256                     "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    257                     "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    258                     "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    259                     "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    260 
    261                     /* even 2. pixel */
    262                     "mtlo             %[vector4a], $ac2                           \n\t"
    263                     "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
    264                     "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
    265                     "extp             %[Temp1],    $ac3,           9              \n\t"
    266 
    267                     /* odd 1. pixel */
    268                     "srl              %[tn1],      %[tp2],         8              \n\t"
    269                     "balign           %[tp2],      %[tp1],         3              \n\t"
    270                     "mtlo             %[vector4a], $ac3                           \n\t"
    271                     "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
    272                     "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
    273                     "preceu.ph.qbr    %[n3],       %[tn1]                         \n\t"
    274                     "extp             %[Temp3],    $ac2,           9              \n\t"
    275                     "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
    276                     "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
    277 
    278                     /* odd 2. pixel */
    279                     "mtlo             %[vector4a], $ac2                           \n\t"
    280                     "extp             %[Temp2],    $ac3,           9              \n\t"
    281                     "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
    282                     "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
    283                     "extp             %[Temp4],    $ac2,           9              \n\t"
    284 
    285                     /* clamp and store results */
    286                     "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
    287                     "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
    288                     "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
    289                     "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
    290                     "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
    291                     "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
    292                     "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
    293                     "sb               %[n2],       3(%[dst_ptr])                  \n\t"
    294 
    295                     : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
    296                       [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
    297                       [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
    298                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    299                       [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
    300                     : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    301                       [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
    302                       [src_ptr] "r" (src_ptr)
    303                 );
    304                 /*  Next row... */
    305                 src_ptr += src_pixels_per_line;
    306                 dst_ptr += pitch;
    307             }
    308         }
    309     }
    310 }
    311 
    312 void vp8_filter_block2d_first_pass_8_all
    313 (
    314     unsigned char *RESTRICT src_ptr,
    315     unsigned char *RESTRICT dst_ptr,
    316     unsigned int src_pixels_per_line,
    317     unsigned int output_height,
    318     int xoffset,
    319     int pitch
    320 )
    321 {
    322     unsigned int i;
    323     int Temp1, Temp2, Temp3, Temp4;
    324 
    325     unsigned int vector4a = 64;
    326     unsigned int vector1b, vector2b, vector3b;
    327     unsigned int tp1, tp2, tn1, tn2;
    328     unsigned int p1, p2, p3, p4;
    329     unsigned int n1, n2, n3, n4;
    330 
    331     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
    332 
    333     /* if (xoffset == 0) we don't need any filtering */
    334     if (xoffset == 0)
    335     {
    336         for (i = 0; i < output_height; i++)
    337         {
    338             /* prefetch src_ptr data to cache memory */
    339             prefetch_load(src_ptr + src_pixels_per_line);
    340 
    341             dst_ptr[0] = src_ptr[0];
    342             dst_ptr[1] = src_ptr[1];
    343             dst_ptr[2] = src_ptr[2];
    344             dst_ptr[3] = src_ptr[3];
    345             dst_ptr[4] = src_ptr[4];
    346             dst_ptr[5] = src_ptr[5];
    347             dst_ptr[6] = src_ptr[6];
    348             dst_ptr[7] = src_ptr[7];
    349 
    350             /* next row... */
    351             src_ptr += src_pixels_per_line;
    352             dst_ptr += 8;
    353         }
    354     }
    355     else
    356     {
    357         vector3b = sub_pel_filters_inv[xoffset][2];
    358 
    359         if (vector3b > 65536)
    360         {
    361             /* 6 tap filter */
    362 
    363             vector1b = sub_pel_filters_inv[xoffset][0];
    364             vector2b = sub_pel_filters_inv[xoffset][1];
    365 
    366             for (i = output_height; i--;)
    367             {
    368                 /* prefetch src_ptr data to cache memory */
    369                 prefetch_load(src_ptr + src_pixels_per_line);
    370 
    371                 /* apply filter with vectors pairs */
    372                 __asm__ __volatile__ (
    373                     "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
    374                     "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
    375 
    376                     /* even 1. pixel */
    377                     "mtlo             %[vector4a], $ac3                           \n\t"
    378                     "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    379                     "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    380                     "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    381                     "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    382                     "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    383                     "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
    384 
    385                     /* even 2. pixel */
    386                     "mtlo             %[vector4a], $ac2                           \n\t"
    387                     "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
    388                     "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
    389                     "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
    390                     "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
    391 
    392                     "balign           %[tp2],      %[tp1],         3              \n\t"
    393                     "extp             %[Temp1],    $ac3,           9              \n\t"
    394                     "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
    395 
    396                     /* odd 1. pixel */
    397                     "mtlo             %[vector4a], $ac3                           \n\t"
    398                     "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
    399                     "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
    400                     "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
    401                     "extp             %[Temp3],    $ac2,           9              \n\t"
    402                     "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
    403                     "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
    404                     "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
    405 
    406                     /* odd 2. pixel */
    407                     "mtlo             %[vector4a], $ac2                           \n\t"
    408                     "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
    409                     "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
    410                     "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
    411                     "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
    412                     "ulw              %[tp1],      6(%[src_ptr])                  \n\t"
    413                     "extp             %[Temp2],    $ac3,           9              \n\t"
    414                     "mtlo             %[vector4a], $ac3                           \n\t"
    415                     "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
    416                     "extp             %[Temp4],    $ac2,           9              \n\t"
    417 
    418                     : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
    419                       [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
    420                       [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
    421                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    422                       [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
    423                     : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    424                       [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
    425                       [src_ptr] "r" (src_ptr)
    426                 );
    427 
    428                 /* clamp and store results */
    429                 dst_ptr[0] = cm[Temp1];
    430                 dst_ptr[1] = cm[Temp2];
    431                 dst_ptr[2] = cm[Temp3];
    432                 dst_ptr[3] = cm[Temp4];
    433 
    434                 /* next 4 pixels */
    435                 __asm__ __volatile__ (
    436                     /* even 3. pixel */
    437                     "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
    438                     "dpa.w.ph         $ac3,        %[p1],          %[vector2b]    \n\t"
    439                     "dpa.w.ph         $ac3,        %[p2],          %[vector3b]    \n\t"
    440 
    441                     /* even 4. pixel */
    442                     "mtlo             %[vector4a], $ac2                           \n\t"
    443                     "preceu.ph.qbl    %[p4],       %[tp1]                         \n\t"
    444                     "dpa.w.ph         $ac2,        %[p1],          %[vector1b]    \n\t"
    445                     "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
    446                     "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
    447 
    448                     "ulw              %[tn1],      7(%[src_ptr])                  \n\t"
    449                     "extp             %[Temp1],    $ac3,           9              \n\t"
    450 
    451                     /* odd 3. pixel */
    452                     "mtlo             %[vector4a], $ac3                           \n\t"
    453                     "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
    454                     "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
    455                     "dpa.w.ph         $ac3,        %[n1],          %[vector2b]    \n\t"
    456                     "dpa.w.ph         $ac3,        %[n2],          %[vector3b]    \n\t"
    457                     "extp             %[Temp3],    $ac2,           9              \n\t"
    458 
    459                     /* odd 4. pixel */
    460                     "mtlo             %[vector4a], $ac2                           \n\t"
    461                     "preceu.ph.qbl    %[n4],       %[tn1]                         \n\t"
    462                     "dpa.w.ph         $ac2,        %[n1],          %[vector1b]    \n\t"
    463                     "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
    464                     "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
    465                     "extp             %[Temp2],    $ac3,           9              \n\t"
    466                     "extp             %[Temp4],    $ac2,           9              \n\t"
    467 
    468                     : [tn1] "=&r" (tn1), [n2] "=&r" (n2),
    469                       [p4] "=&r" (p4), [n4] "=&r" (n4),
    470                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    471                       [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
    472                     : [tp1] "r" (tp1), [vector1b] "r" (vector1b), [p2] "r" (p2),
    473                       [vector2b] "r" (vector2b), [n1] "r" (n1), [p1] "r" (p1),
    474                       [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
    475                       [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
    476                 );
    477 
    478                 /* clamp and store results */
    479                 dst_ptr[4] = cm[Temp1];
    480                 dst_ptr[5] = cm[Temp2];
    481                 dst_ptr[6] = cm[Temp3];
    482                 dst_ptr[7] = cm[Temp4];
    483 
    484                 src_ptr += src_pixels_per_line;
    485                 dst_ptr += pitch;
    486             }
    487         }
    488         else
    489         {
    490             /* 4 tap filter */
    491 
    492             vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
    493             vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
    494 
    495             for (i = output_height; i--;)
    496             {
    497                 /* prefetch src_ptr data to cache memory */
    498                 prefetch_load(src_ptr + src_pixels_per_line);
    499 
    500                 /* apply filter with vectors pairs */
    501                 __asm__ __volatile__ (
    502                     "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
    503 
    504                     /* even 1. pixel */
    505                     "mtlo             %[vector4a], $ac3                           \n\t"
    506                     "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    507                     "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    508                     "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    509                     "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    510 
    511                     "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
    512 
    513                     /* even 2. pixel  */
    514                     "mtlo             %[vector4a], $ac2                           \n\t"
    515                     "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    516                     "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
    517                     "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
    518                     "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
    519                     "extp             %[Temp1],    $ac3,           9              \n\t"
    520 
    521                     "balign           %[tp2],      %[tp1],         3              \n\t"
    522 
    523                     /* odd 1. pixel */
    524                     "mtlo             %[vector4a], $ac3                           \n\t"
    525                     "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
    526                     "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
    527                     "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
    528                     "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
    529                     "extp             %[Temp3],    $ac2,           9              \n\t"
    530 
    531                     "ulw              %[tn2],      4(%[src_ptr])                  \n\t"
    532 
    533                     /* odd 2. pixel */
    534                     "mtlo             %[vector4a], $ac2                           \n\t"
    535                     "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
    536                     "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
    537                     "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
    538                     "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
    539                     "ulw              %[tp1],      7(%[src_ptr])                  \n\t"
    540                     "extp             %[Temp2],    $ac3,           9              \n\t"
    541                     "mtlo             %[vector4a], $ac3                           \n\t"
    542                     "extp             %[Temp4],    $ac2,           9              \n\t"
    543 
    544                     : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
    545                       [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
    546                       [p3] "=&r" (p3), [p4] "=&r" (p4), [n1] "=&r" (n1),
    547                       [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
    548                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    549                       [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
    550                     : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    551                       [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
    552                 );
    553 
    554                 /* clamp and store results */
    555                 dst_ptr[0] = cm[Temp1];
    556                 dst_ptr[1] = cm[Temp2];
    557                 dst_ptr[2] = cm[Temp3];
    558                 dst_ptr[3] = cm[Temp4];
    559 
    560                 /* next 4 pixels */
    561                 __asm__ __volatile__ (
    562                     /* even 3. pixel */
    563                     "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
    564                     "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
    565 
    566                     /* even 4. pixel */
    567                     "mtlo             %[vector4a], $ac2                           \n\t"
    568                     "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
    569                     "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
    570                     "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
    571                     "extp             %[Temp1],    $ac3,           9              \n\t"
    572 
    573                     /* odd 3. pixel */
    574                     "mtlo             %[vector4a], $ac3                           \n\t"
    575                     "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
    576                     "dpa.w.ph         $ac3,        %[n4],          %[vector2b]    \n\t"
    577                     "ulw              %[tn1],      8(%[src_ptr])                  \n\t"
    578                     "extp             %[Temp3],    $ac2,           9              \n\t"
    579 
    580                     /* odd 4. pixel */
    581                     "mtlo             %[vector4a], $ac2                           \n\t"
    582                     "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
    583                     "dpa.w.ph         $ac2,        %[n4],          %[vector1b]    \n\t"
    584                     "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
    585                     "extp             %[Temp2],    $ac3,           9              \n\t"
    586                     "extp             %[Temp4],    $ac2,           9              \n\t"
    587 
    588                     : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2),
    589                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    590                       [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
    591                     : [tp1] "r" (tp1), [p3] "r" (p3), [p4] "r" (p4),
    592                       [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    593                       [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr),
    594                       [n3] "r" (n3), [n4] "r" (n4)
    595                 );
    596 
    597                 /* clamp and store results */
    598                 dst_ptr[4] = cm[Temp1];
    599                 dst_ptr[5] = cm[Temp2];
    600                 dst_ptr[6] = cm[Temp3];
    601                 dst_ptr[7] = cm[Temp4];
    602 
    603                 /* next row... */
    604                 src_ptr += src_pixels_per_line;
    605                 dst_ptr += pitch;
    606             }
    607         }
    608     }
    609 }
    610 
    611 
    612 void vp8_filter_block2d_first_pass16_6tap
    613 (
    614     unsigned char *RESTRICT src_ptr,
    615     unsigned char *RESTRICT dst_ptr,
    616     unsigned int src_pixels_per_line,
    617     unsigned int output_height,
    618     int xoffset,
    619     int pitch
    620 )
    621 {
    622     unsigned int i;
    623     int Temp1, Temp2, Temp3, Temp4;
    624 
    625     unsigned int vector4a;
    626     unsigned int vector1b, vector2b, vector3b;
    627     unsigned int tp1, tp2, tn1, tn2;
    628     unsigned int p1, p2, p3, p4;
    629     unsigned int n1, n2, n3, n4;
    630     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
    631 
    632     vector1b = sub_pel_filters_inv[xoffset][0];
    633     vector2b = sub_pel_filters_inv[xoffset][1];
    634     vector3b = sub_pel_filters_inv[xoffset][2];
    635     vector4a = 64;
    636 
    637     for (i = output_height; i--;)
    638     {
    639         /* prefetch src_ptr data to cache memory */
    640         prefetch_load(src_ptr + src_pixels_per_line);
    641 
    642         /* apply filter with vectors pairs */
    643         __asm__ __volatile__ (
    644             "ulw                %[tp1],      -2(%[src_ptr])                 \n\t"
    645             "ulw                %[tp2],      2(%[src_ptr])                  \n\t"
    646 
    647             /* even 1. pixel */
    648             "mtlo               %[vector4a], $ac3                           \n\t"
    649             "preceu.ph.qbr      %[p1],       %[tp1]                         \n\t"
    650             "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
    651             "preceu.ph.qbr      %[p3],       %[tp2]                         \n\t"
    652             "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
    653             "dpa.w.ph           $ac3,        %[p2],           %[vector2b]   \n\t"
    654             "dpa.w.ph           $ac3,        %[p3],           %[vector3b]   \n\t"
    655 
    656             /* even 2. pixel */
    657             "mtlo               %[vector4a], $ac2                           \n\t"
    658             "preceu.ph.qbl      %[p1],       %[tp2]                         \n\t"
    659             "dpa.w.ph           $ac2,        %[p2],           %[vector1b]   \n\t"
    660             "dpa.w.ph           $ac2,        %[p3],           %[vector2b]   \n\t"
    661             "dpa.w.ph           $ac2,        %[p1],           %[vector3b]   \n\t"
    662 
    663             "balign             %[tp2],      %[tp1],          3             \n\t"
    664             "ulw                %[tn2],      3(%[src_ptr])                  \n\t"
    665             "extp               %[Temp1],    $ac3,            9             \n\t"
    666 
    667             /* odd 1. pixel */
    668             "mtlo               %[vector4a], $ac3                           \n\t"
    669             "preceu.ph.qbr      %[n1],       %[tp2]                         \n\t"
    670             "preceu.ph.qbl      %[n2],       %[tp2]                         \n\t"
    671             "preceu.ph.qbr      %[n3],       %[tn2]                         \n\t"
    672             "extp               %[Temp3],    $ac2,            9             \n\t"
    673             "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
    674             "dpa.w.ph           $ac3,        %[n2],           %[vector2b]   \n\t"
    675             "dpa.w.ph           $ac3,        %[n3],           %[vector3b]   \n\t"
    676 
    677             /* odd 2. pixel */
    678             "mtlo               %[vector4a], $ac2                           \n\t"
    679             "preceu.ph.qbl      %[n1],       %[tn2]                         \n\t"
    680             "dpa.w.ph           $ac2,        %[n2],           %[vector1b]   \n\t"
    681             "dpa.w.ph           $ac2,        %[n3],           %[vector2b]   \n\t"
    682             "dpa.w.ph           $ac2,        %[n1],           %[vector3b]   \n\t"
    683             "ulw                %[tp1],      6(%[src_ptr])                  \n\t"
    684             "extp               %[Temp2],    $ac3,            9             \n\t"
    685             "mtlo               %[vector4a], $ac3                           \n\t"
    686             "preceu.ph.qbr      %[p2],       %[tp1]                         \n\t"
    687             "extp               %[Temp4],    $ac2,            9             \n\t"
    688 
    689             : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
    690               [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
    691               [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
    692               [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    693               [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
    694             : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    695               [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
    696               [src_ptr] "r" (src_ptr)
    697         );
    698 
    699         /* clamp and store results */
    700         dst_ptr[0] = cm[Temp1];
    701         dst_ptr[1] = cm[Temp2];
    702         dst_ptr[2] = cm[Temp3];
    703         dst_ptr[3] = cm[Temp4];
    704 
    705         /* next 4 pixels */
    706         __asm__ __volatile__ (
    707             /* even 3. pixel */
    708             "dpa.w.ph           $ac3,        %[p3],           %[vector1b]   \n\t"
    709             "dpa.w.ph           $ac3,        %[p1],           %[vector2b]   \n\t"
    710             "dpa.w.ph           $ac3,        %[p2],           %[vector3b]   \n\t"
    711 
    712             /* even 4. pixel */
    713             "mtlo               %[vector4a], $ac2                           \n\t"
    714             "preceu.ph.qbl      %[p4],       %[tp1]                         \n\t"
    715             "dpa.w.ph           $ac2,        %[p1],           %[vector1b]   \n\t"
    716             "dpa.w.ph           $ac2,        %[p2],           %[vector2b]   \n\t"
    717             "dpa.w.ph           $ac2,        %[p4],           %[vector3b]   \n\t"
    718             "ulw                %[tn1],      7(%[src_ptr])                  \n\t"
    719             "extp               %[Temp1],    $ac3,            9             \n\t"
    720 
    721             /* odd 3. pixel */
    722             "mtlo               %[vector4a], $ac3                           \n\t"
    723             "preceu.ph.qbr      %[n2],       %[tn1]                         \n\t"
    724             "dpa.w.ph           $ac3,        %[n3],           %[vector1b]   \n\t"
    725             "dpa.w.ph           $ac3,        %[n1],           %[vector2b]   \n\t"
    726             "dpa.w.ph           $ac3,        %[n2],           %[vector3b]   \n\t"
    727             "extp               %[Temp3],    $ac2,            9             \n\t"
    728 
    729             /* odd 4. pixel */
    730             "mtlo               %[vector4a], $ac2                           \n\t"
    731             "preceu.ph.qbl      %[n4],       %[tn1]                         \n\t"
    732             "dpa.w.ph           $ac2,        %[n1],           %[vector1b]   \n\t"
    733             "dpa.w.ph           $ac2,        %[n2],           %[vector2b]   \n\t"
    734             "dpa.w.ph           $ac2,        %[n4],           %[vector3b]   \n\t"
    735             "ulw                %[tp2],      10(%[src_ptr])                 \n\t"
    736             "extp               %[Temp2],    $ac3,            9             \n\t"
    737             "mtlo               %[vector4a], $ac3                           \n\t"
    738             "preceu.ph.qbr      %[p1],       %[tp2]                         \n\t"
    739             "extp               %[Temp4],    $ac2,            9             \n\t"
    740 
    741             : [tn1] "=&r" (tn1), [tp2] "=&r" (tp2), [n2] "=&r" (n2),
    742               [p4] "=&r" (p4), [n4] "=&r" (n4),
    743               [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    744               [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
    745             : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    746               [tp1] "r" (tp1), [n1] "r" (n1), [p1] "r" (p1),
    747               [vector4a] "r" (vector4a), [p2] "r" (p2), [vector3b] "r" (vector3b),
    748               [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
    749         );
    750 
    751         /* clamp and store results */
    752         dst_ptr[4] = cm[Temp1];
    753         dst_ptr[5] = cm[Temp2];
    754         dst_ptr[6] = cm[Temp3];
    755         dst_ptr[7] = cm[Temp4];
    756 
    757         /* next 4 pixels */
    758         __asm__ __volatile__ (
    759             /* even 5. pixel */
    760             "dpa.w.ph           $ac3,        %[p2],           %[vector1b]   \n\t"
    761             "dpa.w.ph           $ac3,        %[p4],           %[vector2b]   \n\t"
    762             "dpa.w.ph           $ac3,        %[p1],           %[vector3b]   \n\t"
    763 
    764             /* even 6. pixel */
    765             "mtlo               %[vector4a], $ac2                           \n\t"
    766             "preceu.ph.qbl      %[p3],       %[tp2]                         \n\t"
    767             "dpa.w.ph           $ac2,        %[p4],           %[vector1b]   \n\t"
    768             "dpa.w.ph           $ac2,        %[p1],           %[vector2b]   \n\t"
    769             "dpa.w.ph           $ac2,        %[p3],           %[vector3b]   \n\t"
    770 
    771             "ulw                %[tn1],      11(%[src_ptr])                 \n\t"
    772             "extp               %[Temp1],    $ac3,            9             \n\t"
    773 
    774             /* odd 5. pixel */
    775             "mtlo               %[vector4a], $ac3                           \n\t"
    776             "preceu.ph.qbr      %[n1],       %[tn1]                         \n\t"
    777             "dpa.w.ph           $ac3,        %[n2],           %[vector1b]   \n\t"
    778             "dpa.w.ph           $ac3,        %[n4],           %[vector2b]   \n\t"
    779             "dpa.w.ph           $ac3,        %[n1],           %[vector3b]   \n\t"
    780             "extp               %[Temp3],    $ac2,            9             \n\t"
    781 
    782             /* odd 6. pixel */
    783             "mtlo               %[vector4a], $ac2                           \n\t"
    784             "preceu.ph.qbl      %[n3],       %[tn1]                         \n\t"
    785             "dpa.w.ph           $ac2,        %[n4],           %[vector1b]   \n\t"
    786             "dpa.w.ph           $ac2,        %[n1],           %[vector2b]   \n\t"
    787             "dpa.w.ph           $ac2,        %[n3],           %[vector3b]   \n\t"
    788             "ulw                %[tp1],      14(%[src_ptr])                 \n\t"
    789             "extp               %[Temp2],    $ac3,            9             \n\t"
    790             "mtlo               %[vector4a], $ac3                           \n\t"
    791             "preceu.ph.qbr      %[p4],       %[tp1]                         \n\t"
    792             "extp               %[Temp4],    $ac2,            9             \n\t"
    793 
    794             : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
    795               [n1] "=&r" (n1), [p3] "=&r" (p3), [n3] "=&r" (n3),
    796               [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    797               [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
    798             : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    799               [tp2] "r" (tp2), [p2] "r" (p2), [n2] "r" (n2),
    800               [p4] "r" (p4), [n4] "r" (n4), [p1] "r" (p1), [src_ptr] "r" (src_ptr),
    801               [vector4a] "r" (vector4a), [vector3b] "r" (vector3b)
    802         );
    803 
    804         /* clamp and store results */
    805         dst_ptr[8] = cm[Temp1];
    806         dst_ptr[9] = cm[Temp2];
    807         dst_ptr[10] = cm[Temp3];
    808         dst_ptr[11] = cm[Temp4];
    809 
    810         /* next 4 pixels */
    811         __asm__ __volatile__ (
    812             /* even 7. pixel */
    813             "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
    814             "dpa.w.ph           $ac3,        %[p3],           %[vector2b]   \n\t"
    815             "dpa.w.ph           $ac3,        %[p4],           %[vector3b]   \n\t"
    816 
    817             /* even 8. pixel */
    818             "mtlo               %[vector4a], $ac2                           \n\t"
    819             "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
    820             "dpa.w.ph           $ac2,        %[p3],           %[vector1b]   \n\t"
    821             "dpa.w.ph           $ac2,        %[p4],           %[vector2b]   \n\t"
    822             "dpa.w.ph           $ac2,        %[p2],           %[vector3b]   \n\t"
    823             "ulw                %[tn1],      15(%[src_ptr])                 \n\t"
    824             "extp               %[Temp1],    $ac3,            9             \n\t"
    825 
    826             /* odd 7. pixel */
    827             "mtlo               %[vector4a], $ac3                           \n\t"
    828             "preceu.ph.qbr      %[n4],       %[tn1]                         \n\t"
    829             "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
    830             "dpa.w.ph           $ac3,        %[n3],           %[vector2b]   \n\t"
    831             "dpa.w.ph           $ac3,        %[n4],           %[vector3b]   \n\t"
    832             "extp               %[Temp3],    $ac2,            9             \n\t"
    833 
    834             /* odd 8. pixel */
    835             "mtlo               %[vector4a], $ac2                           \n\t"
    836             "preceu.ph.qbl      %[n2],       %[tn1]                         \n\t"
    837             "dpa.w.ph           $ac2,        %[n3],           %[vector1b]   \n\t"
    838             "dpa.w.ph           $ac2,        %[n4],           %[vector2b]   \n\t"
    839             "dpa.w.ph           $ac2,        %[n2],           %[vector3b]   \n\t"
    840             "extp               %[Temp2],    $ac3,            9             \n\t"
    841             "extp               %[Temp4],    $ac2,            9             \n\t"
    842 
    843             /* clamp and store results */
    844             "lbux               %[tp1],      %[Temp1](%[cm])                \n\t"
    845             "lbux               %[tn1],      %[Temp2](%[cm])                \n\t"
    846             "lbux               %[p2],       %[Temp3](%[cm])                \n\t"
    847             "sb                 %[tp1],      12(%[dst_ptr])                 \n\t"
    848             "sb                 %[tn1],      13(%[dst_ptr])                 \n\t"
    849             "lbux               %[n2],       %[Temp4](%[cm])                \n\t"
    850             "sb                 %[p2],       14(%[dst_ptr])                 \n\t"
    851             "sb                 %[n2],       15(%[dst_ptr])                 \n\t"
    852 
    853             : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), [n4] "=&r" (n4),
    854               [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
    855               [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
    856             : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
    857               [tp1] "r" (tp1), [p4] "r" (p4), [n1] "r" (n1), [p1] "r" (p1),
    858               [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), [p3] "r" (p3),
    859               [n3] "r" (n3), [src_ptr] "r" (src_ptr),
    860               [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
    861         );
    862 
    863         src_ptr += src_pixels_per_line;
    864         dst_ptr += pitch;
    865     }
    866 }
    867 
    868 
    869 void vp8_filter_block2d_first_pass16_0
    870 (
    871     unsigned char *RESTRICT src_ptr,
    872     unsigned char *RESTRICT output_ptr,
    873     unsigned int src_pixels_per_line
    874 )
    875 {
    876     int Temp1, Temp2, Temp3, Temp4;
    877     int i;
    878 
    879     /* prefetch src_ptr data to cache memory */
    880     prefetch_store(output_ptr + 32);
    881 
    882     /* copy memory from src buffer to dst buffer */
    883     for (i = 0; i < 7; i++)
    884     {
    885         __asm__ __volatile__ (
    886             "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
    887             "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
    888             "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
    889             "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
    890             "sw     %[Temp1],   0(%[output_ptr])                            \n\t"
    891             "sw     %[Temp2],   4(%[output_ptr])                            \n\t"
    892             "sw     %[Temp3],   8(%[output_ptr])                            \n\t"
    893             "sw     %[Temp4],   12(%[output_ptr])                           \n\t"
    894             "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
    895 
    896             : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
    897               [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
    898             : [src_pixels_per_line] "r" (src_pixels_per_line),
    899               [output_ptr] "r" (output_ptr)
    900         );
    901 
    902         __asm__ __volatile__ (
    903             "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
    904             "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
    905             "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
    906             "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
    907             "sw     %[Temp1],   16(%[output_ptr])                           \n\t"
    908             "sw     %[Temp2],   20(%[output_ptr])                           \n\t"
    909             "sw     %[Temp3],   24(%[output_ptr])                           \n\t"
    910             "sw     %[Temp4],   28(%[output_ptr])                           \n\t"
    911             "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
    912 
    913             : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
    914               [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
    915             : [src_pixels_per_line] "r" (src_pixels_per_line),
    916               [output_ptr] "r" (output_ptr)
    917         );
    918 
    919         __asm__ __volatile__ (
    920             "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
    921             "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
    922             "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
    923             "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
    924             "sw     %[Temp1],   32(%[output_ptr])                           \n\t"
    925             "sw     %[Temp2],   36(%[output_ptr])                           \n\t"
    926             "sw     %[Temp3],   40(%[output_ptr])                           \n\t"
    927             "sw     %[Temp4],   44(%[output_ptr])                           \n\t"
    928             "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
    929 
    930             : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
    931               [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
    932             : [src_pixels_per_line] "r" (src_pixels_per_line),
    933               [output_ptr] "r" (output_ptr)
    934         );
    935 
    936         output_ptr += 48;
    937     }
    938 }
    939 
    940 
    941 void vp8_filter_block2d_first_pass16_4tap
    942 (
    943     unsigned char *RESTRICT src_ptr,
    944     unsigned char *RESTRICT output_ptr,
    945     unsigned int src_pixels_per_line,
    946     unsigned int output_width,
    947     unsigned int output_height,
    948     int xoffset,
    949     int yoffset,
    950     unsigned char *RESTRICT dst_ptr,
    951     int pitch
    952 )
    953 {
    954     unsigned int i, j;
    955     int Temp1, Temp2, Temp3, Temp4;
    956 
    957     unsigned int vector4a;
    958     int vector1b, vector2b;
    959     unsigned int tp1, tp2, tp3, tn1;
    960     unsigned int p1, p2, p3;
    961     unsigned int n1, n2, n3;
    962     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
    963 
    964     vector4a = 64;
    965 
    966     vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
    967     vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
    968 
    969     /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */
    970     if (yoffset == 0)
    971     {
    972         output_height -= 5;
    973         src_ptr += (src_pixels_per_line + src_pixels_per_line);
    974 
    975         for (i = output_height; i--;)
    976         {
    977             __asm__ __volatile__ (
    978                 "ulw     %[tp3],   -1(%[src_ptr])               \n\t"
    979                 : [tp3] "=&r" (tp3)
    980                 : [src_ptr] "r" (src_ptr)
    981             );
    982 
    983             /* processing 4 adjacent pixels */
    984             for (j = 0; j < 16; j += 4)
    985             {
    986                 /* apply filter with vectors pairs */
    987                 __asm__ __volatile__ (
    988                     "ulw              %[tp2],      3(%[src_ptr])                    \n\t"
    989                     "move             %[tp1],      %[tp3]                           \n\t"
    990 
    991                     /* even 1. pixel */
    992                     "mtlo             %[vector4a], $ac3                             \n\t"
    993                     "mthi             $0,          $ac3                             \n\t"
    994                     "move             %[tp3],      %[tp2]                           \n\t"
    995                     "preceu.ph.qbr    %[p1],       %[tp1]                           \n\t"
    996                     "preceu.ph.qbl    %[p2],       %[tp1]                           \n\t"
    997                     "preceu.ph.qbr    %[p3],       %[tp2]                           \n\t"
    998                     "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     \n\t"
    999                     "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     \n\t"
   1000 
   1001                     /* even 2. pixel */
   1002                     "mtlo             %[vector4a], $ac2                             \n\t"
   1003                     "mthi             $0,          $ac2                             \n\t"
   1004                     "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     \n\t"
   1005                     "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     \n\t"
   1006                     "extr.w           %[Temp1],    $ac3,            7               \n\t"
   1007 
   1008                     /* odd 1. pixel */
   1009                     "ulw              %[tn1],      4(%[src_ptr])                    \n\t"
   1010                     "balign           %[tp2],      %[tp1],          3               \n\t"
   1011                     "mtlo             %[vector4a], $ac3                             \n\t"
   1012                     "mthi             $0,          $ac3                             \n\t"
   1013                     "preceu.ph.qbr    %[n1],       %[tp2]                           \n\t"
   1014                     "preceu.ph.qbl    %[n2],       %[tp2]                           \n\t"
   1015                     "preceu.ph.qbr    %[n3],       %[tn1]                           \n\t"
   1016                     "extr.w           %[Temp3],    $ac2,            7               \n\t"
   1017                     "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     \n\t"
   1018                     "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     \n\t"
   1019 
   1020                     /* odd 2. pixel */
   1021                     "mtlo             %[vector4a], $ac2                             \n\t"
   1022                     "mthi             $0,          $ac2                             \n\t"
   1023                     "extr.w           %[Temp2],    $ac3,            7               \n\t"
   1024                     "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     \n\t"
   1025                     "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     \n\t"
   1026                     "extr.w           %[Temp4],    $ac2,            7               \n\t"
   1027 
   1028                     /* clamp and store results */
   1029                     "lbux             %[tp1],      %[Temp1](%[cm])                  \n\t"
   1030                     "lbux             %[tn1],      %[Temp2](%[cm])                  \n\t"
   1031                     "lbux             %[tp2],      %[Temp3](%[cm])                  \n\t"
   1032                     "sb               %[tp1],      0(%[dst_ptr])                    \n\t"
   1033                     "sb               %[tn1],      1(%[dst_ptr])                    \n\t"
   1034                     "lbux             %[n2],       %[Temp4](%[cm])                  \n\t"
   1035                     "sb               %[tp2],      2(%[dst_ptr])                    \n\t"
   1036                     "sb               %[n2],       3(%[dst_ptr])                    \n\t"
   1037 
   1038                     : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
   1039                       [tn1] "=&r" (tn1), [p1] "=&r" (p1), [p2] "=&r" (p2),
   1040                       [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
   1041                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [p3] "=&r" (p3),
   1042                       [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
   1043                     : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
   1044                       [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
   1045                       [src_ptr] "r" (src_ptr)
   1046                 );
   1047 
   1048                 src_ptr += 4;
   1049             }
   1050 
   1051             /* Next row... */
   1052             src_ptr += src_pixels_per_line - 16;
   1053             dst_ptr += pitch;
   1054         }
   1055     }
   1056     else
   1057     {
   1058         for (i = output_height; i--;)
   1059         {
   1060             /* processing 4 adjacent pixels */
   1061             for (j = 0; j < 16; j += 4)
   1062             {
   1063                 /* apply filter with vectors pairs */
   1064                 __asm__ __volatile__ (
   1065                     "ulw              %[tp1],      -1(%[src_ptr])                   \n\t"
   1066                     "ulw              %[tp2],      3(%[src_ptr])                    \n\t"
   1067 
   1068                     /* even 1. pixel */
   1069                     "mtlo             %[vector4a], $ac3                             \n\t"
   1070                     "mthi             $0,          $ac3                             \n\t"
   1071                     "preceu.ph.qbr    %[p1],       %[tp1]                           \n\t"
   1072                     "preceu.ph.qbl    %[p2],       %[tp1]                           \n\t"
   1073                     "preceu.ph.qbr    %[p3],       %[tp2]                           \n\t"
   1074                     "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     \n\t"
   1075                     "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     \n\t"
   1076 
   1077                     /* even 2. pixel */
   1078                     "mtlo             %[vector4a], $ac2                             \n\t"
   1079                     "mthi             $0,          $ac2                             \n\t"
   1080                     "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     \n\t"
   1081                     "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     \n\t"
   1082                     "extr.w           %[Temp1],    $ac3,            7               \n\t"
   1083 
   1084                     /* odd 1. pixel */
   1085                     "ulw              %[tn1],      4(%[src_ptr])                    \n\t"
   1086                     "balign           %[tp2],      %[tp1],          3               \n\t"
   1087                     "mtlo             %[vector4a], $ac3                             \n\t"
   1088                     "mthi             $0,          $ac3                             \n\t"
   1089                     "preceu.ph.qbr    %[n1],       %[tp2]                           \n\t"
   1090                     "preceu.ph.qbl    %[n2],       %[tp2]                           \n\t"
   1091                     "preceu.ph.qbr    %[n3],       %[tn1]                           \n\t"
   1092                     "extr.w           %[Temp3],    $ac2,            7               \n\t"
   1093                     "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     \n\t"
   1094                     "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     \n\t"
   1095 
   1096                     /* odd 2. pixel */
   1097                     "mtlo             %[vector4a], $ac2                             \n\t"
   1098                     "mthi             $0,          $ac2                             \n\t"
   1099                     "extr.w           %[Temp2],    $ac3,            7               \n\t"
   1100                     "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     \n\t"
   1101                     "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     \n\t"
   1102                     "extr.w           %[Temp4],    $ac2,            7               \n\t"
   1103 
   1104                     /* clamp and store results */
   1105                     "lbux             %[tp1],      %[Temp1](%[cm])                  \n\t"
   1106                     "lbux             %[tn1],      %[Temp2](%[cm])                  \n\t"
   1107                     "lbux             %[tp2],      %[Temp3](%[cm])                  \n\t"
   1108                     "sb               %[tp1],      0(%[output_ptr])                 \n\t"
   1109                     "sb               %[tn1],      1(%[output_ptr])                 \n\t"
   1110                     "lbux             %[n2],       %[Temp4](%[cm])                  \n\t"
   1111                     "sb               %[tp2],      2(%[output_ptr])                 \n\t"
   1112                     "sb               %[n2],       3(%[output_ptr])                 \n\t"
   1113 
   1114                     : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
   1115                       [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
   1116                       [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
   1117                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
   1118                       [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
   1119                     : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
   1120                       [vector4a] "r" (vector4a), [cm] "r" (cm),
   1121                       [output_ptr] "r" (output_ptr), [src_ptr] "r" (src_ptr)
   1122                 );
   1123 
   1124                 src_ptr += 4;
   1125             }
   1126 
   1127             /* next row... */
   1128             src_ptr += src_pixels_per_line;
   1129             output_ptr += output_width;
   1130         }
   1131     }
   1132 }
   1133 
   1134 
   1135 void vp8_filter_block2d_second_pass4
   1136 (
   1137     unsigned char *RESTRICT src_ptr,
   1138     unsigned char *RESTRICT output_ptr,
   1139     int output_pitch,
   1140     int yoffset
   1141 )
   1142 {
            /*
             * Second (vertical) filter pass producing a 4x4 block of output pixels.
             *
             * The source is read as rows of 4 bytes: the filter taps for one output
             * pixel are fetched 4 bytes apart, and horizontally adjacent pixels are
             * 1 byte apart.  Each of the two loop iterations computes two output
             * rows (one per asm statement) and then advances src_ptr by 8.
             *
             *   src_ptr       first source sample of the first output row.  The
             *                 6 tap path reads bytes src_ptr[-8] .. src_ptr[27] over
             *                 the whole call, the 4 tap path src_ptr[-4] .. src_ptr[23].
             *   output_ptr    destination; 4 bytes are written per row.
             *   output_pitch  added to output_ptr after every output row.
             *   yoffset       row index into sub_pel_filterss (the table has 8 rows);
             *                 it is not range checked here.
             *
             * NOTE(review): the "extp" instructions take their bit position from
             * DSPControl, which is not written anywhere in this function --
             * presumably the caller sets it up; confirm before calling this from a
             * new place.
             * NOTE(review): none of the asm statements below declare the
             * accumulators ($ac0..$ac3) or memory as clobbered even though they
             * write the accumulators and read through src_ptr -- confirm this is
             * safe with the compilers in use.
             */
   1143     unsigned int i;
   1144 
   1145     int Temp1, Temp2, Temp3, Temp4;
   1146     unsigned int vector1b, vector2b, vector3b, vector4a;
   1147 
            /* Scratch registers for the asm statements.  "lbu" loads one byte into
             * each; "append" then packs a second byte below it.  The C code never
             * reads these variables. */
   1148     unsigned char src_ptr_l2;
   1149     unsigned char src_ptr_l1;
   1150     unsigned char src_ptr_0;
   1151     unsigned char src_ptr_r1;
   1152     unsigned char src_ptr_r2;
   1153     unsigned char src_ptr_r3;
   1154 
            /* lookup table used to clamp the filter results; cm is offset by
             * CROP_WIDTH entries into ff_cropTbl */
   1155     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
   1156 
            /* Initial value written to every accumulator with "mtlo" before the
             * dot products.  The coefficient pairs of a sub_pel_filterss row
             * combine to 128 (entries [0] and [2] added, entry [1] subtracted), so
             * 64 is presumably the round-to-nearest term. */
   1157     vector4a = 64;
   1158 
   1159     /* load filter coefficients: each value packs two 8-bit coefficients */
   1160     vector1b = sub_pel_filterss[yoffset][0];
   1161     vector2b = sub_pel_filterss[yoffset][2];
   1162     vector3b = sub_pel_filterss[yoffset][1];
   1163 
            /* vector1b holds the two outermost coefficients; when it is zero the
             * shorter 4 tap path below is used instead */
   1164     if (vector1b)
   1165     {
   1166         /* 6 tap filter */
   1167 
                /* two iterations, two output rows each */
   1168         for (i = 2; i--;)
   1169         {
   1170             /* prefetch src_ptr data to cache memory */
   1171             prefetch_load(src_ptr);
   1172 
   1173             /* tell the assembler (not the compiler) to stop reordering instructions; no ".set reorder" follows in this function */
   1174             __asm__ __volatile__ (
   1175                 ".set noreorder                                                 \n\t"
   1176                 :
   1177                 :
   1178             );
   1179 
   1180             /* apply filter with vectors pairs */
                    /* First output row of this iteration.  For every column the six
                     * taps are loaded 4 bytes apart, packed into three byte pairs
                     * (l2,r3) (0,r1) (l1,r2), and combined as
                     *   acc = vector4a + (l2,r3).vector1b + (0,r1).vector2b
                     *                  - (l1,r2).vector3b
                     * The work is spread over $ac2, $ac3, $ac0 and $ac1 so that the
                     * "extp" of one column is issued among the loads of the next. */
   1181             __asm__ __volatile__ (
                        /* column 0 */
   1182                 "lbu            %[src_ptr_l2],  -8(%[src_ptr])                  \n\t"
   1183                 "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
   1184                 "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   1185                 "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
   1186                 "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
   1187                 "lbu            %[src_ptr_r3],  12(%[src_ptr])                  \n\t"
   1188                 "mtlo           %[vector4a],    $ac2                            \n\t"
   1189 
   1190                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1191                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1192                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1193                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1194                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1195                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1196 
                        /* column 1; the column 0 result is extracted into Temp1 */
   1197                 "lbu            %[src_ptr_l2],  -7(%[src_ptr])                  \n\t"
   1198                 "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
   1199                 "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   1200                 "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
   1201                 "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
   1202                 "lbu            %[src_ptr_r3],  13(%[src_ptr])                  \n\t"
   1203                 "mtlo           %[vector4a],    $ac3                            \n\t"
   1204                 "extp           %[Temp1],       $ac2,           9               \n\t"
   1205 
   1206                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1207                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1208                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1209                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1210                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1211                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1212 
                        /* column 2; the column 1 result is extracted into Temp2 */
   1213                 "lbu            %[src_ptr_l2],  -6(%[src_ptr])                  \n\t"
   1214                 "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
   1215                 "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
   1216                 "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
   1217                 "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
   1218                 "lbu            %[src_ptr_r3],  14(%[src_ptr])                  \n\t"
   1219                 "mtlo           %[vector4a],    $ac0                            \n\t"
   1220                 "extp           %[Temp2],       $ac3,           9               \n\t"
   1221 
   1222                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1223                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1224                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1225                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1226                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1227                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1228 
                        /* column 3; the column 2 result is extracted into Temp3 */
   1229                 "lbu            %[src_ptr_l2],  -5(%[src_ptr])                  \n\t"
   1230                 "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
   1231                 "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
   1232                 "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
   1233                 "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
   1234                 "lbu            %[src_ptr_r3],  15(%[src_ptr])                  \n\t"
   1235                 "mtlo           %[vector4a],    $ac1                            \n\t"
   1236                 "extp           %[Temp3],       $ac0,           9               \n\t"
   1237 
   1238                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1239                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1240                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1241                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1242                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1243                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1244                 "extp           %[Temp4],       $ac1,           9               \n\t"
   1245 
                        /* Temp4 is written by the last instruction only, after every
                         * input has been consumed, so it is the one output without
                         * the early-clobber "&" modifier */
   1246                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
   1247                   [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
   1248                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   1249                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
   1250                   [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
   1251                 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
   1252                   [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
   1253                   [src_ptr] "r" (src_ptr)
   1254             );
   1255 
   1256             /* clamp and store results */
                    /* NOTE(review): "extp ..., 9" yields a zero-extended bit field, so
                     * TempN is never negative here -- confirm that every value it can
                     * take stays inside ff_cropTbl when used as an index into cm. */
   1257             output_ptr[0] = cm[Temp1];
   1258             output_ptr[1] = cm[Temp2];
   1259             output_ptr[2] = cm[Temp3];
   1260             output_ptr[3] = cm[Temp4];
   1261 
   1262             output_ptr += output_pitch;
   1263 
   1264             /* apply filter with vectors pairs */
                    /* second output row of this iteration: same sequence as above with
                     * every source offset moved forward by 4 bytes (one source row) */
   1265             __asm__ __volatile__ (
   1266                 "lbu            %[src_ptr_l2],  -4(%[src_ptr])                  \n\t"
   1267                 "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
   1268                 "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
   1269                 "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
   1270                 "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
   1271                 "lbu            %[src_ptr_r3],  16(%[src_ptr])                  \n\t"
   1272                 "mtlo           %[vector4a],    $ac2                            \n\t"
   1273                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1274                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1275                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1276                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1277                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1278                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1279 
   1280                 "lbu            %[src_ptr_l2],  -3(%[src_ptr])                  \n\t"
   1281                 "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
   1282                 "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
   1283                 "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
   1284                 "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
   1285                 "lbu            %[src_ptr_r3],  17(%[src_ptr])                  \n\t"
   1286                 "mtlo           %[vector4a],    $ac3                            \n\t"
   1287                 "extp           %[Temp1],       $ac2,           9               \n\t"
   1288 
   1289                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1290                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1291                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1292                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1293                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1294                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1295 
   1296                 "lbu            %[src_ptr_l2],  -2(%[src_ptr])                  \n\t"
   1297                 "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
   1298                 "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
   1299                 "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
   1300                 "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
   1301                 "lbu            %[src_ptr_r3],  18(%[src_ptr])                  \n\t"
   1302                 "mtlo           %[vector4a],    $ac0                            \n\t"
   1303                 "extp           %[Temp2],       $ac3,           9               \n\t"
   1304 
   1305                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1306                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1307                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1308                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1309                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1310                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1311 
   1312                 "lbu            %[src_ptr_l2],  -1(%[src_ptr])                  \n\t"
   1313                 "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
   1314                 "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
   1315                 "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
   1316                 "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
   1317                 "lbu            %[src_ptr_r3],  19(%[src_ptr])                  \n\t"
   1318                 "mtlo           %[vector4a],    $ac1                            \n\t"
   1319                 "extp           %[Temp3],       $ac0,           9               \n\t"
   1320 
   1321                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1322                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1323                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1324                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1325                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1326                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1327                 "extp           %[Temp4],       $ac1,           9               \n\t"
   1328 
   1329                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
   1330                   [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
   1331                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   1332                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
   1333                   [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
   1334                 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
   1335                   [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
   1336                   [src_ptr] "r" (src_ptr)
   1337             );
   1338 
   1339             /* clamp and store results */
   1340             output_ptr[0] = cm[Temp1];
   1341             output_ptr[1] = cm[Temp2];
   1342             output_ptr[2] = cm[Temp3];
   1343             output_ptr[3] = cm[Temp4];
   1344 
                    /* two source rows of 4 bytes were consumed by this iteration */
   1345             src_ptr += 8;
   1346             output_ptr += output_pitch;
   1347         }
   1348     }
   1349     else
   1350     {
   1351         /* 4 tap filter: same scheme without the outer (l2,r3) pair */
   1352 
   1353         /* prefetch src_ptr data to cache memory (once, before the loop, unlike the 6 tap path) */
   1354         prefetch_load(src_ptr);
   1355 
                /* two iterations, two output rows each */
   1356         for (i = 2; i--;)
   1357         {
   1358             /* tell the assembler (not the compiler) to stop reordering instructions; no ".set reorder" follows in this function */
   1359             __asm__ __volatile__ (
   1360                 ".set noreorder                                                 \n\t"
   1361                 :
   1362                 :
   1363             );
   1364 
   1365             /* apply filter with vectors pairs */
                    /* first output row of this iteration:
                     *   acc = vector4a + (0,r1).vector2b - (l1,r2).vector3b
                     * for columns 0..3, using $ac2, $ac3, $ac0, $ac1 in turn */
   1366             __asm__ __volatile__ (
   1367                 "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
   1368                 "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   1369                 "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
   1370                 "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
   1371                 "mtlo           %[vector4a],    $ac2                            \n\t"
   1372                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1373                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1374                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1375                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1376 
   1377                 "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
   1378                 "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   1379                 "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
   1380                 "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
   1381                 "mtlo           %[vector4a],    $ac3                            \n\t"
   1382                 "extp           %[Temp1],       $ac2,           9               \n\t"
   1383 
   1384                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1385                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1386                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1387                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1388 
   1389                 "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
   1390                 "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
   1391                 "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
   1392                 "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
   1393                 "mtlo           %[vector4a],    $ac0                            \n\t"
   1394                 "extp           %[Temp2],       $ac3,           9               \n\t"
   1395 
   1396                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1397                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1398                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1399                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1400 
   1401                 "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
   1402                 "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
   1403                 "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
   1404                 "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
   1405                 "mtlo           %[vector4a],    $ac1                            \n\t"
   1406                 "extp           %[Temp3],       $ac0,           9               \n\t"
   1407                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1408                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1409                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1410                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1411                 "extp           %[Temp4],       $ac1,           9               \n\t"
   1412 
   1413                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
   1414                   [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
   1415                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   1416                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
   1417                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   1418                   [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
   1419             );
   1420 
   1421             /* clamp and store results */
   1422             output_ptr[0] = cm[Temp1];
   1423             output_ptr[1] = cm[Temp2];
   1424             output_ptr[2] = cm[Temp3];
   1425             output_ptr[3] = cm[Temp4];
   1426 
   1427             output_ptr += output_pitch;
   1428 
   1429             /* apply filter with vectors pairs */
                    /* second output row of this iteration: source offsets moved
                     * forward by 4 bytes (one source row) */
   1430             __asm__ __volatile__ (
   1431                 "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
   1432                 "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
   1433                 "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
   1434                 "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
   1435                 "mtlo           %[vector4a],    $ac2                            \n\t"
   1436                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1437                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1438                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1439                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1440 
   1441                 "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
   1442                 "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
   1443                 "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
   1444                 "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
   1445                 "mtlo           %[vector4a],    $ac3                            \n\t"
   1446                 "extp           %[Temp1],       $ac2,           9               \n\t"
   1447 
   1448                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1449                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1450                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1451                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1452 
   1453                 "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
   1454                 "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
   1455                 "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
   1456                 "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
   1457                 "mtlo           %[vector4a],    $ac0                            \n\t"
   1458                 "extp           %[Temp2],       $ac3,           9               \n\t"
   1459 
   1460                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1461                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1462                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1463                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1464 
   1465                 "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
   1466                 "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
   1467                 "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
   1468                 "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
   1469                 "mtlo           %[vector4a],    $ac1                            \n\t"
   1470                 "extp           %[Temp3],       $ac0,           9               \n\t"
   1471                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1472                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1473                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1474                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1475                 "extp           %[Temp4],       $ac1,           9               \n\t"
   1476 
   1477                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
   1478                   [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
   1479                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   1480                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
   1481                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   1482                   [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
   1483             );
   1484 
   1485             /* clamp and store results */
   1486             output_ptr[0] = cm[Temp1];
   1487             output_ptr[1] = cm[Temp2];
   1488             output_ptr[2] = cm[Temp3];
   1489             output_ptr[3] = cm[Temp4];
   1490 
                    /* two source rows of 4 bytes were consumed by this iteration */
   1491             src_ptr += 8;
   1492             output_ptr += output_pitch;
   1493         }
   1494     }
   1495 }
   1496 
   1497 
   1498 void vp8_filter_block2d_second_pass_8
   1499 (
   1500     unsigned char *RESTRICT src_ptr,
   1501     unsigned char *RESTRICT output_ptr,
   1502     int output_pitch,
   1503     unsigned int output_height,
   1504     unsigned int output_width,
   1505     unsigned int yoffset
   1506 )
   1507 {
   1508     unsigned int i;
   1509 
   1510     int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
   1511     unsigned int vector1b, vector2b, vector3b, vector4a;
   1512 
   1513     unsigned char src_ptr_l2;
   1514     unsigned char src_ptr_l1;
   1515     unsigned char src_ptr_0;
   1516     unsigned char src_ptr_r1;
   1517     unsigned char src_ptr_r2;
   1518     unsigned char src_ptr_r3;
   1519     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
   1520 
   1521     vector4a = 64;
   1522 
   1523     vector1b = sub_pel_filterss[yoffset][0];
   1524     vector2b = sub_pel_filterss[yoffset][2];
   1525     vector3b = sub_pel_filterss[yoffset][1];
   1526 
   1527     if (vector1b)
   1528     {
   1529         /* 6 tap filter */
   1530 
   1531         /* prefetch src_ptr data to cache memory */
   1532         prefetch_load(src_ptr);
   1533 
   1534         for (i = output_height; i--;)
   1535         {
   1536             /* apply filter with vectors pairs */
   1537             __asm__ __volatile__ (
   1538                 "lbu            %[src_ptr_l2],  -16(%[src_ptr])                 \n\t"
   1539                 "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
   1540                 "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   1541                 "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
   1542                 "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
   1543                 "lbu            %[src_ptr_r3],  24(%[src_ptr])                  \n\t"
   1544                 "mtlo           %[vector4a],    $ac2                            \n\t"
   1545 
   1546                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1547                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1548                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1549                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1550                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1551                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1552 
   1553                 "lbu            %[src_ptr_l2],  -15(%[src_ptr])                 \n\t"
   1554                 "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
   1555                 "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   1556                 "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
   1557                 "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
   1558                 "lbu            %[src_ptr_r3],  25(%[src_ptr])                  \n\t"
   1559                 "mtlo           %[vector4a],    $ac3                            \n\t"
   1560                 "extp           %[Temp1],       $ac2,           9               \n\t"
   1561 
   1562                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1563                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1564                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1565                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1566                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1567                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1568 
   1569                 "lbu            %[src_ptr_l2],  -14(%[src_ptr])                 \n\t"
   1570                 "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
   1571                 "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
   1572                 "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
   1573                 "lbu            %[src_ptr_r2],  18(%[src_ptr])                  \n\t"
   1574                 "lbu            %[src_ptr_r3],  26(%[src_ptr])                  \n\t"
   1575                 "mtlo           %[vector4a],    $ac0                            \n\t"
   1576                 "extp           %[Temp2],       $ac3,           9               \n\t"
   1577 
   1578                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1579                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1580                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1581                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1582                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1583                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1584 
   1585                 "lbu            %[src_ptr_l2],  -13(%[src_ptr])                 \n\t"
   1586                 "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
   1587                 "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
   1588                 "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
   1589                 "lbu            %[src_ptr_r2],  19(%[src_ptr])                  \n\t"
   1590                 "lbu            %[src_ptr_r3],  27(%[src_ptr])                  \n\t"
   1591                 "mtlo           %[vector4a],    $ac1                            \n\t"
   1592                 "extp           %[Temp3],       $ac0,           9               \n\t"
   1593 
   1594                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1595                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1596                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1597                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1598                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1599                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1600 
   1601                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
   1602                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   1603                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
   1604                   [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
   1605                 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
   1606                   [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
   1607                   [src_ptr] "r" (src_ptr)
   1608             );
   1609 
   1610             /* apply filter with vectors pairs */
   1611             __asm__ __volatile__ (
   1612                 "lbu            %[src_ptr_l2],  -12(%[src_ptr])                 \n\t"
   1613                 "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
   1614                 "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
   1615                 "lbu            %[src_ptr_r1],  12(%[src_ptr])                  \n\t"
   1616                 "lbu            %[src_ptr_r2],  20(%[src_ptr])                  \n\t"
   1617                 "lbu            %[src_ptr_r3],  28(%[src_ptr])                  \n\t"
   1618                 "mtlo           %[vector4a],    $ac2                            \n\t"
   1619 
   1620                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1621                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1622                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1623                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1624                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1625                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1626                 "extp           %[Temp4],       $ac1,           9               \n\t"
   1627 
   1628                 "lbu            %[src_ptr_l2],  -11(%[src_ptr])                 \n\t"
   1629                 "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
   1630                 "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
   1631                 "lbu            %[src_ptr_r1],  13(%[src_ptr])                  \n\t"
   1632                 "lbu            %[src_ptr_r2],  21(%[src_ptr])                  \n\t"
   1633                 "lbu            %[src_ptr_r3],  29(%[src_ptr])                  \n\t"
   1634                 "mtlo           %[vector4a],    $ac3                            \n\t"
   1635                 "extp           %[Temp5],       $ac2,           9               \n\t"
   1636 
   1637                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1638                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1639                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1640                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1641                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1642                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1643 
   1644                 "lbu            %[src_ptr_l2],  -10(%[src_ptr])                 \n\t"
   1645                 "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
   1646                 "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
   1647                 "lbu            %[src_ptr_r1],  14(%[src_ptr])                  \n\t"
   1648                 "lbu            %[src_ptr_r2],  22(%[src_ptr])                  \n\t"
   1649                 "lbu            %[src_ptr_r3],  30(%[src_ptr])                  \n\t"
   1650                 "mtlo           %[vector4a],    $ac0                            \n\t"
   1651                 "extp           %[Temp6],       $ac3,           9               \n\t"
   1652 
   1653                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1654                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1655                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1656                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1657                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1658                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1659 
   1660                 "lbu            %[src_ptr_l2],  -9(%[src_ptr])                  \n\t"
   1661                 "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
   1662                 "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
   1663                 "lbu            %[src_ptr_r1],  15(%[src_ptr])                  \n\t"
   1664                 "lbu            %[src_ptr_r2],  23(%[src_ptr])                  \n\t"
   1665                 "lbu            %[src_ptr_r3],  31(%[src_ptr])                  \n\t"
   1666                 "mtlo           %[vector4a],    $ac1                            \n\t"
   1667                 "extp           %[Temp7],       $ac0,           9               \n\t"
   1668 
   1669                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1670                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1671                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1672                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1673                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1674                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1675                 "extp           %[Temp8],       $ac1,           9               \n\t"
   1676 
   1677                 : [Temp4] "=&r" (Temp4), [Temp5] "=&r" (Temp5),
   1678                   [Temp6] "=&r" (Temp6), [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
   1679                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   1680                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
   1681                   [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
   1682                 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
   1683                   [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
   1684                   [src_ptr] "r" (src_ptr)
   1685             );
   1686 
   1687             /* clamp and store results */
   1688             output_ptr[0] = cm[Temp1];
   1689             output_ptr[1] = cm[Temp2];
   1690             output_ptr[2] = cm[Temp3];
   1691             output_ptr[3] = cm[Temp4];
   1692             output_ptr[4] = cm[Temp5];
   1693             output_ptr[5] = cm[Temp6];
   1694             output_ptr[6] = cm[Temp7];
   1695             output_ptr[7] = cm[Temp8];
   1696 
   1697             src_ptr += 8;
   1698             output_ptr += output_pitch;
   1699         }
   1700     }
   1701     else
   1702     {
   1703         /* 4 tap filter */
   1704 
   1705         /* prefetch src_ptr data to cache memory */
   1706         prefetch_load(src_ptr);
   1707 
   1708         for (i = output_height; i--;)
   1709         {
   1710             __asm__ __volatile__ (
   1711                 "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
   1712                 "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   1713                 "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
   1714                 "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
   1715                 "mtlo           %[vector4a],    $ac2                            \n\t"
   1716                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1717                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1718                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1719                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1720 
   1721                 : [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   1722                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
   1723                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   1724                   [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
   1725             );
   1726 
   1727             __asm__ __volatile__ (
   1728                 "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
   1729                 "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   1730                 "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
   1731                 "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
   1732                 "mtlo           %[vector4a],    $ac3                            \n\t"
   1733                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1734                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1735                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1736                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1737                 "extp           %[Temp1],       $ac2,           9               \n\t"
   1738 
   1739                 : [Temp1] "=r" (Temp1),
   1740                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   1741                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
   1742                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   1743                   [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
   1744             );
   1745 
   1746             src_ptr_l1 = src_ptr[-6];
   1747             src_ptr_0  = src_ptr[2];
   1748             src_ptr_r1 = src_ptr[10];
   1749             src_ptr_r2 = src_ptr[18];
   1750 
   1751             __asm__ __volatile__ (
   1752                 "mtlo           %[vector4a],    $ac0                            \n\t"
   1753                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1754                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1755                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1756                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1757                 "extp           %[Temp2],       $ac3,           9               \n\t"
   1758 
   1759                 : [Temp2] "=r" (Temp2)
   1760                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   1761                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
   1762                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
   1763                   [vector4a] "r" (vector4a)
   1764             );
   1765 
   1766             src_ptr_l1 = src_ptr[-5];
   1767             src_ptr_0  = src_ptr[3];
   1768             src_ptr_r1 = src_ptr[11];
   1769             src_ptr_r2 = src_ptr[19];
   1770 
   1771             __asm__ __volatile__ (
   1772                 "mtlo           %[vector4a],    $ac1                            \n\t"
   1773                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1774                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1775                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1776                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1777                 "extp           %[Temp3],       $ac0,           9               \n\t"
   1778 
   1779                 : [Temp3] "=r" (Temp3)
   1780                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   1781                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
   1782                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
   1783                   [vector4a] "r" (vector4a)
   1784             );
   1785 
   1786             src_ptr_l1 = src_ptr[-4];
   1787             src_ptr_0  = src_ptr[4];
   1788             src_ptr_r1 = src_ptr[12];
   1789             src_ptr_r2 = src_ptr[20];
   1790 
   1791             __asm__ __volatile__ (
   1792                 "mtlo           %[vector4a],    $ac2                            \n\t"
   1793                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1794                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1795                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1796                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1797                 "extp           %[Temp4],       $ac1,           9               \n\t"
   1798 
   1799                 : [Temp4] "=r" (Temp4)
   1800                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   1801                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
   1802                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
   1803                   [vector4a] "r" (vector4a)
   1804             );
   1805 
   1806             src_ptr_l1 = src_ptr[-3];
   1807             src_ptr_0  = src_ptr[5];
   1808             src_ptr_r1 = src_ptr[13];
   1809             src_ptr_r2 = src_ptr[21];
   1810 
   1811             __asm__ __volatile__ (
   1812                 "mtlo           %[vector4a],    $ac3                            \n\t"
   1813                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1814                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1815                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1816                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1817                 "extp           %[Temp5],       $ac2,           9               \n\t"
   1818 
   1819                 : [Temp5] "=&r" (Temp5)
   1820                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   1821                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
   1822                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
   1823                   [vector4a] "r" (vector4a)
   1824             );
   1825 
   1826             src_ptr_l1 = src_ptr[-2];
   1827             src_ptr_0  = src_ptr[6];
   1828             src_ptr_r1 = src_ptr[14];
   1829             src_ptr_r2 = src_ptr[22];
   1830 
   1831             __asm__ __volatile__ (
   1832                 "mtlo           %[vector4a],    $ac0                            \n\t"
   1833                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1834                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1835                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1836                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1837                 "extp           %[Temp6],       $ac3,           9               \n\t"
   1838 
   1839                 : [Temp6] "=r" (Temp6)
   1840                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   1841                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
   1842                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
   1843                   [vector4a] "r" (vector4a)
   1844             );
   1845 
   1846             src_ptr_l1 = src_ptr[-1];
   1847             src_ptr_0  = src_ptr[7];
   1848             src_ptr_r1 = src_ptr[15];
   1849             src_ptr_r2 = src_ptr[23];
   1850 
   1851             __asm__ __volatile__ (
   1852                 "mtlo           %[vector4a],    $ac1                            \n\t"
   1853                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1854                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1855                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1856                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1857                 "extp           %[Temp7],       $ac0,           9               \n\t"
   1858                 "extp           %[Temp8],       $ac1,           9               \n\t"
   1859 
   1860                 : [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8)
   1861                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   1862                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
   1863                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
   1864                   [vector4a] "r" (vector4a)
   1865             );
   1866 
   1867             /* clamp and store results */
   1868             output_ptr[0] = cm[Temp1];
   1869             output_ptr[1] = cm[Temp2];
   1870             output_ptr[2] = cm[Temp3];
   1871             output_ptr[3] = cm[Temp4];
   1872             output_ptr[4] = cm[Temp5];
   1873             output_ptr[5] = cm[Temp6];
   1874             output_ptr[6] = cm[Temp7];
   1875             output_ptr[7] = cm[Temp8];
   1876 
   1877             src_ptr += 8;
   1878             output_ptr += output_pitch;
   1879         }
   1880     }
   1881 }
   1882 
   1883 
   1884 void vp8_filter_block2d_second_pass161
   1885 (
   1886     unsigned char *RESTRICT src_ptr,
   1887     unsigned char *RESTRICT output_ptr,
   1888     int output_pitch,
   1889     const unsigned short *vp8_filter
   1890 )
   1891 {
   1892     unsigned int i, j;
   1893 
   1894     int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
   1895     unsigned int vector4a;
   1896     unsigned int vector1b, vector2b, vector3b;
   1897 
   1898     unsigned char src_ptr_l2;
   1899     unsigned char src_ptr_l1;
   1900     unsigned char src_ptr_0;
   1901     unsigned char src_ptr_r1;
   1902     unsigned char src_ptr_r2;
   1903     unsigned char src_ptr_r3;
   1904     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
   1905 
   1906     vector4a = 64;
   1907 
   1908     vector1b = vp8_filter[0];
   1909     vector2b = vp8_filter[2];
   1910     vector3b = vp8_filter[1];
   1911 
   1912     if (vector1b == 0)
   1913     {
   1914         /* 4 tap filter */
   1915 
   1916         /* prefetch src_ptr data to cache memory */
   1917         prefetch_load(src_ptr + 16);
   1918 
   1919         for (i = 16; i--;)
   1920         {
   1921             /* unrolling for loop */
   1922             for (j = 0; j < 16; j += 8)
   1923             {
   1924                 /* apply filter with vectors pairs */
   1925                 __asm__ __volatile__ (
   1926                     "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
   1927                     "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   1928                     "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
   1929                     "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
   1930                     "mtlo           %[vector4a],    $ac2                            \n\t"
   1931                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1932                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1933                     "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1934                     "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1935 
   1936                     "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
   1937                     "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   1938                     "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
   1939                     "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
   1940                     "mtlo           %[vector4a],    $ac3                            \n\t"
   1941                     "extp           %[Temp1],       $ac2,           9               \n\t"
   1942 
   1943                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1944                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1945                     "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1946                     "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1947 
   1948                     "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
   1949                     "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
   1950                     "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
   1951                     "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
   1952                     "mtlo           %[vector4a],    $ac1                            \n\t"
   1953                     "extp           %[Temp2],       $ac3,           9               \n\t"
   1954 
   1955                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1956                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1957                     "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1958                     "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1959 
   1960                     "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
   1961                     "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
   1962                     "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
   1963                     "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
   1964                     "mtlo           %[vector4a],    $ac3                            \n\t"
   1965                     "extp           %[Temp3],       $ac1,           9               \n\t"
   1966 
   1967                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1968                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1969                     "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1970                     "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1971 
   1972                     "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
   1973                     "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
   1974                     "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
   1975                     "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
   1976                     "mtlo           %[vector4a],    $ac2                            \n\t"
   1977                     "extp           %[Temp4],       $ac3,           9               \n\t"
   1978 
   1979                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1980                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1981                     "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1982                     "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1983 
   1984                     "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
   1985                     "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
   1986                     "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
   1987                     "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
   1988                     "mtlo           %[vector4a],    $ac3                            \n\t"
   1989                     "extp           %[Temp5],       $ac2,           9               \n\t"
   1990 
   1991                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1992                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1993                     "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1994                     "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1995 
   1996                     "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
   1997                     "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
   1998                     "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
   1999                     "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
   2000                     "mtlo           %[vector4a],    $ac1                            \n\t"
   2001                     "extp           %[Temp6],       $ac3,           9               \n\t"
   2002 
   2003                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2004                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2005                     "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   2006                     "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2007 
   2008                     "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
   2009                     "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
   2010                     "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
   2011                     "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
   2012                     "mtlo           %[vector4a],    $ac3                            \n\t"
   2013                     "extp           %[Temp7],       $ac1,           9               \n\t"
   2014 
   2015                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2016                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2017                     "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   2018                     "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2019                     "extp           %[Temp8],       $ac3,           9               \n\t"
   2020 
   2021                     : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
   2022                       [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
   2023                       [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
   2024                       [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
   2025                       [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   2026                       [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
   2027                     : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
   2028                       [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
   2029                 );
   2030 
   2031                 /* clamp and store results */
   2032                 output_ptr[j] = cm[Temp1];
   2033                 output_ptr[j + 1] = cm[Temp2];
   2034                 output_ptr[j + 2] = cm[Temp3];
   2035                 output_ptr[j + 3] = cm[Temp4];
   2036                 output_ptr[j + 4] = cm[Temp5];
   2037                 output_ptr[j + 5] = cm[Temp6];
   2038                 output_ptr[j + 6] = cm[Temp7];
   2039                 output_ptr[j + 7] = cm[Temp8];
   2040 
   2041                 src_ptr += 8;
   2042             }
   2043 
   2044             output_ptr += output_pitch;
   2045         }
   2046     }
   2047     else
   2048     {
   2049         /* 6 tap filter */
   2050 
   2051         /* prefetch src_ptr data to cache memory */
   2052         prefetch_load(src_ptr + 16);
   2053 
   2054         /* unroll for loop */
   2055         for (i = 16; i--;)
   2056         {
   2057             /* apply filter with vectors pairs */
   2058             __asm__ __volatile__ (
   2059                 "lbu            %[src_ptr_l2],  -32(%[src_ptr])                 \n\t"
   2060                 "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
   2061                 "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   2062                 "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
   2063                 "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
   2064                 "lbu            %[src_ptr_r3],  48(%[src_ptr])                  \n\t"
   2065                 "mtlo           %[vector4a],    $ac2                            \n\t"
   2066 
   2067                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2068                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2069                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2070                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2071                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   2072                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2073 
   2074                 "lbu            %[src_ptr_l2],  -31(%[src_ptr])                 \n\t"
   2075                 "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
   2076                 "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   2077                 "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
   2078                 "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
   2079                 "lbu            %[src_ptr_r3],  49(%[src_ptr])                  \n\t"
   2080                 "mtlo           %[vector4a],    $ac0                            \n\t"
   2081                 "extp           %[Temp1],       $ac2,           9               \n\t"
   2082 
   2083                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2084                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2085                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2086                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2087                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   2088                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2089 
   2090                 "lbu            %[src_ptr_l2],  -30(%[src_ptr])                 \n\t"
   2091                 "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
   2092                 "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
   2093                 "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
   2094                 "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
   2095                 "lbu            %[src_ptr_r3],  50(%[src_ptr])                  \n\t"
   2096                 "mtlo           %[vector4a],    $ac1                            \n\t"
   2097                 "extp           %[Temp2],       $ac0,           9               \n\t"
   2098 
   2099                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2100                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2101                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2102                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2103                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   2104                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2105 
   2106                 "lbu            %[src_ptr_l2],  -29(%[src_ptr])                 \n\t"
   2107                 "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
   2108                 "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
   2109                 "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
   2110                 "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
   2111                 "lbu            %[src_ptr_r3],  51(%[src_ptr])                  \n\t"
   2112                 "mtlo           %[vector4a],    $ac3                            \n\t"
   2113                 "extp           %[Temp3],       $ac1,           9               \n\t"
   2114 
   2115                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2116                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2117                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2118                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2119                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   2120                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2121 
   2122                 "lbu            %[src_ptr_l2],  -28(%[src_ptr])                 \n\t"
   2123                 "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
   2124                 "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
   2125                 "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
   2126                 "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
   2127                 "lbu            %[src_ptr_r3],  52(%[src_ptr])                  \n\t"
   2128                 "mtlo           %[vector4a],    $ac2                            \n\t"
   2129                 "extp           %[Temp4],       $ac3,           9               \n\t"
   2130 
   2131                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2132                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2133                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2134                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2135                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   2136                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2137 
   2138                 "lbu            %[src_ptr_l2],  -27(%[src_ptr])                 \n\t"
   2139                 "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
   2140                 "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
   2141                 "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
   2142                 "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
   2143                 "lbu            %[src_ptr_r3],  53(%[src_ptr])                  \n\t"
   2144                 "mtlo           %[vector4a],    $ac0                            \n\t"
   2145                 "extp           %[Temp5],       $ac2,           9               \n\t"
   2146 
   2147                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2148                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2149                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2150                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2151                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   2152                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2153 
   2154                 "lbu            %[src_ptr_l2],  -26(%[src_ptr])                 \n\t"
   2155                 "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
   2156                 "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
   2157                 "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
   2158                 "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
   2159                 "lbu            %[src_ptr_r3],  54(%[src_ptr])                  \n\t"
   2160                 "mtlo           %[vector4a],    $ac1                            \n\t"
   2161                 "extp           %[Temp6],       $ac0,           9               \n\t"
   2162 
   2163                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2164                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2165                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2166                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2167                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   2168                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2169 
   2170                 "lbu            %[src_ptr_l2],  -25(%[src_ptr])                 \n\t"
   2171                 "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
   2172                 "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
   2173                 "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
   2174                 "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
   2175                 "lbu            %[src_ptr_r3],  55(%[src_ptr])                  \n\t"
   2176                 "mtlo           %[vector4a],    $ac3                            \n\t"
   2177                 "extp           %[Temp7],       $ac1,           9               \n\t"
   2178 
   2179                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2180                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2181                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2182                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2183                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   2184                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2185                 "extp           %[Temp8],       $ac3,           9               \n\t"
   2186 
   2187                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
   2188                   [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
   2189                   [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
   2190                   [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
   2191                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   2192                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
   2193                   [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
   2194                 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
   2195                   [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
   2196                   [src_ptr] "r" (src_ptr)
   2197             );
   2198 
   2199             /* clamp and store results */
   2200             output_ptr[0] = cm[Temp1];
   2201             output_ptr[1] = cm[Temp2];
   2202             output_ptr[2] = cm[Temp3];
   2203             output_ptr[3] = cm[Temp4];
   2204             output_ptr[4] = cm[Temp5];
   2205             output_ptr[5] = cm[Temp6];
   2206             output_ptr[6] = cm[Temp7];
   2207             output_ptr[7] = cm[Temp8];
   2208 
   2209             /* apply filter with vectors pairs */
   2210             __asm__ __volatile__ (
   2211                 "lbu            %[src_ptr_l2],  -24(%[src_ptr])                 \n\t"
   2212                 "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
   2213                 "lbu            %[src_ptr_0],   8(%[src_ptr])                   \n\t"
   2214                 "lbu            %[src_ptr_r1],  24(%[src_ptr])                  \n\t"
   2215                 "lbu            %[src_ptr_r2],  40(%[src_ptr])                  \n\t"
   2216                 "lbu            %[src_ptr_r3],  56(%[src_ptr])                  \n\t"
   2217                 "mtlo           %[vector4a],    $ac2                            \n\t"
   2218 
   2219                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2220                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2221                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2222                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2223                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   2224                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2225 
   2226                 "lbu            %[src_ptr_l2],  -23(%[src_ptr])                 \n\t"
   2227                 "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
   2228                 "lbu            %[src_ptr_0],   9(%[src_ptr])                   \n\t"
   2229                 "lbu            %[src_ptr_r1],  25(%[src_ptr])                  \n\t"
   2230                 "lbu            %[src_ptr_r2],  41(%[src_ptr])                  \n\t"
   2231                 "lbu            %[src_ptr_r3],  57(%[src_ptr])                  \n\t"
   2232                 "mtlo           %[vector4a],    $ac0                            \n\t"
   2233                 "extp           %[Temp1],       $ac2,           9               \n\t"
   2234 
   2235                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2236                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2237                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2238                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2239                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   2240                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2241 
   2242                 "lbu            %[src_ptr_l2],  -22(%[src_ptr])                 \n\t"
   2243                 "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
   2244                 "lbu            %[src_ptr_0],   10(%[src_ptr])                  \n\t"
   2245                 "lbu            %[src_ptr_r1],  26(%[src_ptr])                  \n\t"
   2246                 "lbu            %[src_ptr_r2],  42(%[src_ptr])                  \n\t"
   2247                 "lbu            %[src_ptr_r3],  58(%[src_ptr])                  \n\t"
   2248                 "mtlo           %[vector4a],    $ac1                            \n\t"
   2249                 "extp           %[Temp2],       $ac0,           9               \n\t"
   2250 
   2251                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2252                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2253                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2254                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2255                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   2256                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2257 
   2258                 "lbu            %[src_ptr_l2],  -21(%[src_ptr])                 \n\t"
   2259                 "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
   2260                 "lbu            %[src_ptr_0],   11(%[src_ptr])                  \n\t"
   2261                 "lbu            %[src_ptr_r1],  27(%[src_ptr])                  \n\t"
   2262                 "lbu            %[src_ptr_r2],  43(%[src_ptr])                  \n\t"
   2263                 "lbu            %[src_ptr_r3],  59(%[src_ptr])                  \n\t"
   2264                 "mtlo           %[vector4a],    $ac3                            \n\t"
   2265                 "extp           %[Temp3],       $ac1,           9               \n\t"
   2266 
   2267                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2268                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2269                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2270                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2271                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   2272                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2273 
   2274                 "lbu            %[src_ptr_l2],  -20(%[src_ptr])                 \n\t"
   2275                 "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
   2276                 "lbu            %[src_ptr_0],   12(%[src_ptr])                  \n\t"
   2277                 "lbu            %[src_ptr_r1],  28(%[src_ptr])                  \n\t"
   2278                 "lbu            %[src_ptr_r2],  44(%[src_ptr])                  \n\t"
   2279                 "lbu            %[src_ptr_r3],  60(%[src_ptr])                  \n\t"
   2280                 "mtlo           %[vector4a],    $ac2                            \n\t"
   2281                 "extp           %[Temp4],       $ac3,           9               \n\t"
   2282 
   2283                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2284                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2285                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2286                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2287                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   2288                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2289 
   2290                 "lbu            %[src_ptr_l2],  -19(%[src_ptr])                 \n\t"
   2291                 "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
   2292                 "lbu            %[src_ptr_0],   13(%[src_ptr])                  \n\t"
   2293                 "lbu            %[src_ptr_r1],  29(%[src_ptr])                  \n\t"
   2294                 "lbu            %[src_ptr_r2],  45(%[src_ptr])                  \n\t"
   2295                 "lbu            %[src_ptr_r3],  61(%[src_ptr])                  \n\t"
   2296                 "mtlo           %[vector4a],    $ac0                            \n\t"
   2297                 "extp           %[Temp5],       $ac2,           9               \n\t"
   2298 
   2299                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2300                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2301                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2302                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2303                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   2304                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2305 
   2306                 "lbu            %[src_ptr_l2],  -18(%[src_ptr])                 \n\t"
   2307                 "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
   2308                 "lbu            %[src_ptr_0],   14(%[src_ptr])                  \n\t"
   2309                 "lbu            %[src_ptr_r1],  30(%[src_ptr])                  \n\t"
   2310                 "lbu            %[src_ptr_r2],  46(%[src_ptr])                  \n\t"
   2311                 "lbu            %[src_ptr_r3],  62(%[src_ptr])                  \n\t"
   2312                 "mtlo           %[vector4a],    $ac1                            \n\t"
   2313                 "extp           %[Temp6],       $ac0,           9               \n\t"
   2314 
   2315                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2316                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2317                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2318                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2319                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   2320                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2321 
   2322                 "lbu            %[src_ptr_l2],  -17(%[src_ptr])                 \n\t"
   2323                 "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
   2324                 "lbu            %[src_ptr_0],   15(%[src_ptr])                  \n\t"
   2325                 "lbu            %[src_ptr_r1],  31(%[src_ptr])                  \n\t"
   2326                 "lbu            %[src_ptr_r2],  47(%[src_ptr])                  \n\t"
   2327                 "lbu            %[src_ptr_r3],  63(%[src_ptr])                  \n\t"
   2328                 "mtlo           %[vector4a],    $ac3                            \n\t"
   2329                 "extp           %[Temp7],       $ac1,           9               \n\t"
   2330 
   2331                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2332                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2333                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2334                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2335                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   2336                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2337                 "extp           %[Temp8],       $ac3,           9               \n\t"
   2338 
   2339                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
   2340                   [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
   2341                   [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
   2342                   [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
   2343                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
   2344                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
   2345                   [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
   2346                 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
   2347                   [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
   2348                   [src_ptr] "r" (src_ptr)
   2349             );
   2350 
   2351             src_ptr += 16;
   2352             output_ptr[8] = cm[Temp1];
   2353             output_ptr[9] = cm[Temp2];
   2354             output_ptr[10] = cm[Temp3];
   2355             output_ptr[11] = cm[Temp4];
   2356             output_ptr[12] = cm[Temp5];
   2357             output_ptr[13] = cm[Temp6];
   2358             output_ptr[14] = cm[Temp7];
   2359             output_ptr[15] = cm[Temp8];
   2360 
   2361             output_ptr += output_pitch;
   2362         }
   2363     }
   2364 }
   2365 
   2366 
   2367 void vp8_sixtap_predict4x4_dspr2 /* six-tap prediction of one 4x4 block: horizontal pass, then a vertical pass only when yoffset != 0 */
   2368 (
   2369     unsigned char *RESTRICT src_ptr, /* source pixels; the two-pass path starts reading 2 lines above this address */
   2370     int   src_pixels_per_line, /* distance between source lines; used here to step src_ptr back by 2 lines */
   2371     int  xoffset, /* forwarded to the horizontal first pass (presumably a sub-pel filter index -- TODO confirm) */
   2372     int  yoffset, /* non-zero selects the two-pass path and is forwarded to the vertical second pass */
   2373     unsigned char *RESTRICT dst_ptr, /* destination of the final result */
   2374     int dst_pitch /* destination pitch, forwarded to whichever pass writes to dst_ptr */
   2375 )
   2376 {
   2377     unsigned char FData[9 * 4]; /* Temp data buffer used in filtering (receives the first-pass output in the two-pass path) */
   2378     unsigned int pos = 16; /* value handed to wrdsp below */
   2379 
   2380     /* bit position for extract from acc (presumably consumed by the extp instructions in the filter helpers -- TODO confirm) */
   2381     __asm__ __volatile__ (
   2382         "wrdsp      %[pos],     1           \n\t"
   2383         :
   2384         : [pos] "r" (pos)
   2385     );
   2386 
   2387     if (yoffset)
   2388     {
   2389         /* First filter 1-D horizontally into FData, starting 2 source lines above src_ptr (9 and 4 are presumably the line count and the FData pitch -- TODO confirm against the helper) */
   2390         vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
   2391                                         src_pixels_per_line, 9, xoffset, 4);
   2392         /* then filter vertically, reading from FData + 8 (8 bytes past the start of the temp buffer) and writing to dst_ptr */
   2393         vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
   2394     }
   2395     else
   2396         /* if (yoffset == 0) vp8_filter_block2d_first_pass_4 saves data directly to dst_ptr; FData is not used */
   2397         vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line,
   2398                                         4, xoffset, dst_pitch);
   2399 }
   2400 
   2401 
   2402 void vp8_sixtap_predict8x8_dspr2
   2403 (
   2404     unsigned char   *RESTRICT src_ptr,
   2405     int  src_pixels_per_line,
   2406     int  xoffset,
   2407     int  yoffset,
   2408     unsigned char *RESTRICT dst_ptr,
   2409     int  dst_pitch
   2410 )
   2411 {
   2412 
   2413     unsigned char FData[13 * 8]; /* Temp data bufffer used in filtering */
   2414     unsigned int pos, Temp1, Temp2;
   2415 
   2416     pos = 16;
   2417 
   2418     /* bit positon for extract from acc */
   2419     __asm__ __volatile__ (
   2420         "wrdsp      %[pos],     1               \n\t"
   2421         :
   2422         : [pos] "r" (pos)
   2423     );
   2424 
   2425     if (yoffset)
   2426     {
   2427 
   2428         src_ptr = src_ptr - (2 * src_pixels_per_line);
   2429 
   2430         if (xoffset)
   2431             /* filter 1-D horizontally... */
   2432             vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
   2433                                                 13, xoffset, 8);
   2434 
   2435         else
   2436         {
   2437             /* prefetch src_ptr data to cache memory */
   2438             prefetch_load(src_ptr + 2 * src_pixels_per_line);
   2439 
   2440             __asm__ __volatile__ (
   2441                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2442                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2443                 "sw     %[Temp1],   0(%[FData])                             \n\t"
   2444                 "sw     %[Temp2],   4(%[FData])                             \n\t"
   2445                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2446 
   2447                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2448                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2449                 "sw     %[Temp1],   8(%[FData])                             \n\t"
   2450                 "sw     %[Temp2],   12(%[FData])                            \n\t"
   2451                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2452 
   2453                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2454                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2455                 "sw     %[Temp1],   16(%[FData])                            \n\t"
   2456                 "sw     %[Temp2],   20(%[FData])                            \n\t"
   2457                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2458 
   2459                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2460                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2461                 "sw     %[Temp1],   24(%[FData])                            \n\t"
   2462                 "sw     %[Temp2],   28(%[FData])                            \n\t"
   2463                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2464 
   2465                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2466                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2467                 "sw     %[Temp1],   32(%[FData])                            \n\t"
   2468                 "sw     %[Temp2],   36(%[FData])                            \n\t"
   2469                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2470 
   2471                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2472                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2473                 "sw     %[Temp1],   40(%[FData])                            \n\t"
   2474                 "sw     %[Temp2],   44(%[FData])                            \n\t"
   2475                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2476 
   2477                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2478                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2479                 "sw     %[Temp1],   48(%[FData])                            \n\t"
   2480                 "sw     %[Temp2],   52(%[FData])                            \n\t"
   2481                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2482 
   2483                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2484                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2485                 "sw     %[Temp1],   56(%[FData])                            \n\t"
   2486                 "sw     %[Temp2],   60(%[FData])                            \n\t"
   2487                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2488 
   2489                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2490                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2491                 "sw     %[Temp1],   64(%[FData])                            \n\t"
   2492                 "sw     %[Temp2],   68(%[FData])                            \n\t"
   2493                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2494 
   2495                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2496                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2497                 "sw     %[Temp1],   72(%[FData])                            \n\t"
   2498                 "sw     %[Temp2],   76(%[FData])                            \n\t"
   2499                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2500 
   2501                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2502                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2503                 "sw     %[Temp1],   80(%[FData])                            \n\t"
   2504                 "sw     %[Temp2],   84(%[FData])                            \n\t"
   2505                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2506 
   2507                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2508                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2509                 "sw     %[Temp1],   88(%[FData])                            \n\t"
   2510                 "sw     %[Temp2],   92(%[FData])                            \n\t"
   2511                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2512 
   2513                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2514                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2515                 "sw     %[Temp1],   96(%[FData])                            \n\t"
   2516                 "sw     %[Temp2],   100(%[FData])                           \n\t"
   2517 
   2518                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
   2519                 : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
   2520                   [src_pixels_per_line] "r" (src_pixels_per_line)
   2521             );
   2522         }
   2523 
   2524         /* filter verticaly... */
   2525         vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, yoffset);
   2526     }
   2527 
   2528     /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
   2529     else
   2530     {
   2531         if (xoffset)
   2532             vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
   2533                                                 8, xoffset, dst_pitch);
   2534 
   2535         else
   2536         {
   2537             /* copy from src buffer to dst buffer */
   2538             __asm__ __volatile__ (
   2539                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2540                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2541                 "sw     %[Temp1],   0(%[dst_ptr])                           \n\t"
   2542                 "sw     %[Temp2],   4(%[dst_ptr])                           \n\t"
   2543                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2544 
   2545                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2546                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2547                 "sw     %[Temp1],   8(%[dst_ptr])                           \n\t"
   2548                 "sw     %[Temp2],   12(%[dst_ptr])                          \n\t"
   2549                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2550 
   2551                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2552                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2553                 "sw     %[Temp1],   16(%[dst_ptr])                          \n\t"
   2554                 "sw     %[Temp2],   20(%[dst_ptr])                          \n\t"
   2555                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2556 
   2557                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2558                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2559                 "sw     %[Temp1],   24(%[dst_ptr])                          \n\t"
   2560                 "sw     %[Temp2],   28(%[dst_ptr])                          \n\t"
   2561                 "addu   %[src_ptr], %[src_ptr],   %[src_pixels_per_line]    \n\t"
   2562 
   2563                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2564                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2565                 "sw     %[Temp1],   32(%[dst_ptr])                          \n\t"
   2566                 "sw     %[Temp2],   36(%[dst_ptr])                          \n\t"
   2567                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2568 
   2569                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2570                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2571                 "sw     %[Temp1],   40(%[dst_ptr])                          \n\t"
   2572                 "sw     %[Temp2],   44(%[dst_ptr])                          \n\t"
   2573                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2574 
   2575                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2576                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2577                 "sw     %[Temp1],   48(%[dst_ptr])                          \n\t"
   2578                 "sw     %[Temp2],   52(%[dst_ptr])                          \n\t"
   2579                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2580 
   2581                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2582                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2583                 "sw     %[Temp1],   56(%[dst_ptr])                          \n\t"
   2584                 "sw     %[Temp2],   60(%[dst_ptr])                          \n\t"
   2585 
   2586                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
   2587                 : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
   2588                   [src_pixels_per_line] "r" (src_pixels_per_line)
   2589             );
   2590         }
   2591     }
   2592 }
   2593 
   2594 
   2595 void vp8_sixtap_predict8x4_dspr2
   2596 (
   2597     unsigned char   *RESTRICT src_ptr,
   2598     int  src_pixels_per_line,
   2599     int  xoffset,
   2600     int  yoffset,
   2601     unsigned char *RESTRICT dst_ptr,
   2602     int  dst_pitch
   2603 )
   2604 {
   2605     unsigned char FData[9 * 8]; /* Temp data bufffer used in filtering */
   2606     unsigned int pos, Temp1, Temp2;
   2607 
   2608     pos = 16;
   2609 
   2610     /* bit positon for extract from acc */
   2611     __asm__ __volatile__ (
   2612         "wrdsp      %[pos],     1           \n\t"
   2613         :
   2614         : [pos] "r" (pos)
   2615     );
   2616 
   2617     if (yoffset)
   2618     {
   2619 
   2620         src_ptr = src_ptr - (2 * src_pixels_per_line);
   2621 
   2622         if (xoffset)
   2623             /* filter 1-D horizontally... */
   2624             vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
   2625                                                 9, xoffset, 8);
   2626 
   2627         else
   2628         {
   2629             /* prefetch src_ptr data to cache memory */
   2630             prefetch_load(src_ptr + 2 * src_pixels_per_line);
   2631 
   2632             __asm__ __volatile__ (
   2633                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2634                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2635                 "sw     %[Temp1],   0(%[FData])                             \n\t"
   2636                 "sw     %[Temp2],   4(%[FData])                             \n\t"
   2637                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2638 
   2639                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2640                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2641                 "sw     %[Temp1],   8(%[FData])                             \n\t"
   2642                 "sw     %[Temp2],   12(%[FData])                            \n\t"
   2643                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2644 
   2645                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2646                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2647                 "sw     %[Temp1],   16(%[FData])                            \n\t"
   2648                 "sw     %[Temp2],   20(%[FData])                            \n\t"
   2649                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2650 
   2651                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2652                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2653                 "sw     %[Temp1],   24(%[FData])                            \n\t"
   2654                 "sw     %[Temp2],   28(%[FData])                            \n\t"
   2655                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2656 
   2657                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2658                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2659                 "sw     %[Temp1],   32(%[FData])                            \n\t"
   2660                 "sw     %[Temp2],   36(%[FData])                            \n\t"
   2661                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2662 
   2663                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2664                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2665                 "sw     %[Temp1],   40(%[FData])                            \n\t"
   2666                 "sw     %[Temp2],   44(%[FData])                            \n\t"
   2667                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2668 
   2669                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2670                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2671                 "sw     %[Temp1],   48(%[FData])                            \n\t"
   2672                 "sw     %[Temp2],   52(%[FData])                            \n\t"
   2673                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2674 
   2675                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2676                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2677                 "sw     %[Temp1],   56(%[FData])                            \n\t"
   2678                 "sw     %[Temp2],   60(%[FData])                            \n\t"
   2679                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2680 
   2681                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2682                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2683                 "sw     %[Temp1],   64(%[FData])                            \n\t"
   2684                 "sw     %[Temp2],   68(%[FData])                            \n\t"
   2685 
   2686                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
   2687                 : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
   2688                   [src_pixels_per_line] "r" (src_pixels_per_line)
   2689             );
   2690         }
   2691 
   2692         /* filter verticaly... */
   2693         vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, yoffset);
   2694     }
   2695 
   2696     /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
   2697     else
   2698     {
   2699         if (xoffset)
   2700             vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
   2701                                                 4, xoffset, dst_pitch);
   2702 
   2703         else
   2704         {
   2705             /* copy from src buffer to dst buffer */
   2706             __asm__ __volatile__ (
   2707                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2708                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2709                 "sw     %[Temp1],   0(%[dst_ptr])                           \n\t"
   2710                 "sw     %[Temp2],   4(%[dst_ptr])                           \n\t"
   2711                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2712 
   2713                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2714                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2715                 "sw     %[Temp1],   8(%[dst_ptr])                           \n\t"
   2716                 "sw     %[Temp2],   12(%[dst_ptr])                          \n\t"
   2717                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2718 
   2719                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2720                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2721                 "sw     %[Temp1],   16(%[dst_ptr])                          \n\t"
   2722                 "sw     %[Temp2],   20(%[dst_ptr])                          \n\t"
   2723                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2724 
   2725                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2726                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2727                 "sw     %[Temp1],   24(%[dst_ptr])                          \n\t"
   2728                 "sw     %[Temp2],   28(%[dst_ptr])                          \n\t"
   2729 
   2730                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
   2731                 : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
   2732                   [src_pixels_per_line] "r" (src_pixels_per_line)
   2733             );
   2734         }
   2735     }
   2736 }
   2737 
   2738 
   2739 void vp8_sixtap_predict16x16_dspr2
   2740 (
   2741     unsigned char   *RESTRICT src_ptr,
   2742     int  src_pixels_per_line,
   2743     int  xoffset,
   2744     int  yoffset,
   2745     unsigned char *RESTRICT dst_ptr,
   2746     int  dst_pitch
   2747 )
   2748 {
   2749     const unsigned short *VFilter;
   2750     unsigned char FData[21 * 16]; /* Temp data bufffer used in filtering */
   2751     unsigned int pos;
   2752 
   2753     VFilter = sub_pel_filterss[yoffset];
   2754 
   2755     pos = 16;
   2756 
   2757     /* bit positon for extract from acc */
   2758     __asm__ __volatile__ (
   2759         "wrdsp      %[pos],     1           \n\t"
   2760         :
   2761         : [pos] "r" (pos)
   2762     );
   2763 
   2764     if (yoffset)
   2765     {
   2766 
   2767         src_ptr = src_ptr - (2 * src_pixels_per_line);
   2768 
   2769         switch (xoffset)
   2770         {
   2771             /* filter 1-D horizontally... */
   2772         case 2:
   2773         case 4:
   2774         case 6:
   2775             /* 6 tap filter */
   2776             vp8_filter_block2d_first_pass16_6tap(src_ptr, FData, src_pixels_per_line,
   2777                                                  21, xoffset, 16);
   2778             break;
   2779 
   2780         case 0:
   2781             /* only copy buffer */
   2782             vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
   2783             break;
   2784 
   2785         case 1:
   2786         case 3:
   2787         case 5:
   2788         case 7:
   2789             /* 4 tap filter */
   2790             vp8_filter_block2d_first_pass16_4tap(src_ptr, FData, src_pixels_per_line, 16,
   2791                                                  21, xoffset, yoffset, dst_ptr, dst_pitch);
   2792             break;
   2793         }
   2794 
   2795         /* filter verticaly... */
   2796         vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
   2797     }
   2798     else
   2799     {
   2800         /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
   2801         switch (xoffset)
   2802         {
   2803         case 2:
   2804         case 4:
   2805         case 6:
   2806             /* 6 tap filter */
   2807             vp8_filter_block2d_first_pass16_6tap(src_ptr, dst_ptr, src_pixels_per_line,
   2808                                                  16, xoffset, dst_pitch);
   2809             break;
   2810 
   2811         case 1:
   2812         case 3:
   2813         case 5:
   2814         case 7:
   2815             /* 4 tap filter */
   2816             vp8_filter_block2d_first_pass16_4tap(src_ptr, dst_ptr, src_pixels_per_line, 16,
   2817                                                  21, xoffset, yoffset, dst_ptr, dst_pitch);
   2818             break;
   2819         }
   2820     }
   2821 }
   2822 
   2823 #endif
   2824