Home | History | Annotate | Download | only in dspr2
      1 /*
      2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <stdlib.h>
     12 #include "vp8_rtcd.h"
     13 #include "vpx_ports/mem.h"
     14 
     15 #if HAVE_DSPR2
     /* Width of each saturating guard region of ff_cropTbl: dsputil_static_init()
      * writes CROP_WIDTH entries of 0 before, and CROP_WIDTH entries of 255 after,
      * the 256-entry identity range. */
     16 #define CROP_WIDTH 256
     /* Clamp-to-byte lookup table, filled in by dsputil_static_init().  The filter
      * routines index it through `ff_cropTbl + CROP_WIDTH`, so an index v in
      * [-CROP_WIDTH, 255 + CROP_WIDTH] yields v clamped to 0..255. */
     17 unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
     18 
     /* 8 rows x 3 halfwords; every 16-bit entry holds two byte-sized values.
      * Nothing in the visible part of this file reads this table.
      * NOTE(review): the bytes look like the magnitudes of the signed values packed
      * in sub_pel_filters_int (e.g. row 1: 0x06,0x01 / 0x7b,0x0c) -- confirm. */
     19 static const unsigned short sub_pel_filterss[8][3] = {
     20   { 0, 0, 0 },
     21   { 0, 0x0601, 0x7b0c },
     22   { 0x0201, 0x0b08, 0x6c24 },
     23   { 0, 0x0906, 0x5d32 },
     24   { 0x0303, 0x1010, 0x4d4d },
     25   { 0, 0x0609, 0x325d },
     26   { 0x0102, 0x080b, 0x246c },
     27   { 0, 0x0106, 0x0c7b },
     28 };
     29 
     /* 8 rows x 3 words; each 32-bit entry packs two 16-bit two's-complement
      * values (e.g. 0x0000fffa = 0 : -6, 0x007b000c = 123 : 12).  Row 0 is all
      * zero.  sub_pel_filters_inv holds the same data with the two halves of
      * every word swapped.  Not read anywhere in the visible part of this file.
      * NOTE(review): initialisers above 0x7fffffff are unsigned constants that are
      * converted to int, which is implementation-defined in ISO C. */
     30 static const int sub_pel_filters_int[8][3] = {
     31   { 0, 0, 0 },
     32   { 0x0000fffa, 0x007b000c, 0xffff0000 },
     33   { 0x0002fff5, 0x006c0024, 0xfff80001 },
     34   { 0x0000fff7, 0x005d0032, 0xfffa0000 },
     35   { 0x0003fff0, 0x004d004d, 0xfff00003 },
     36   { 0x0000fffa, 0x0032005d, 0xfff70000 },
     37   { 0x0001fff8, 0x0024006c, 0xfff50002 },
     38   { 0x0000ffff, 0x000c007b, 0xfffa0000 },
     39 };
     40 
     /* Coefficient table read by the first-pass filters, indexed by xoffset.
      * Columns 0..2 are loaded into vector1b..vector3b and passed to dpa.w.ph as
      * pairs of 16-bit values.  Every entry equals the matching
      * sub_pel_filters_int entry with its 16-bit halves swapped.
      * In the 4- and 8-pixel-wide first-pass functions column 2 also selects the
      * code path: a value > 65536 (rows 2, 4, 6) takes the 6-tap path, while the
      * non-zero values <= 65536 (rows 1, 3, 5, 7) take the 4-tap path that reads
      * sub_pel_filters_inv_tap_4.  Row 0 is all zero (no filtering). */
     41 static const int sub_pel_filters_inv[8][3] = {
     42   { 0, 0, 0 },
     43   { 0xfffa0000, 0x000c007b, 0x0000ffff },
     44   { 0xfff50002, 0x0024006c, 0x0001fff8 },
     45   { 0xfff70000, 0x0032005d, 0x0000fffa },
     46   { 0xfff00003, 0x004d004d, 0x0003fff0 },
     47   { 0xfffa0000, 0x005d0032, 0x0000fff7 },
     48   { 0xfff80001, 0x006c0024, 0x0002fff5 },
     49   { 0xffff0000, 0x007b000c, 0x0000fffa },
     50 };
     51 
     52 /* clang-format off */
     /* 4-tap variant: 8 rows x 2 words, each word packing two 16-bit
      * two's-complement values.  Only the odd rows are populated; the even rows
      * are zero.  sub_pel_filters_inv_tap_4 holds the same data with the halves of
      * every word swapped.  Not read anywhere in the visible part of this file. */
     53 static const int sub_pel_filters_int_tap_4[8][2] = {
     54   {          0,          0},
     55   { 0xfffa007b, 0x000cffff},
     56   {          0,          0},
     57   { 0xfff7005d, 0x0032fffa},
     58   {          0,          0},
     59   { 0xfffa0032, 0x005dfff7},
     60   {          0,          0},
     61   { 0xffff000c, 0x007bfffa},
     62 };
     63 
     64 
     /* Coefficients for the 4-tap paths of the first-pass filters: entries
      * [xoffset][0] and [xoffset][1] are loaded into vector1b and vector2b and
      * passed to dpa.w.ph as pairs of 16-bit values.  Those paths are only taken
      * when sub_pel_filters_inv[xoffset][2] is non-zero and <= 65536, i.e. for
      * rows 1, 3, 5 and 7; the all-zero even rows are never selected by them. */
     65 static const int sub_pel_filters_inv_tap_4[8][2] = {
     66   {          0,          0},
     67   { 0x007bfffa, 0xffff000c},
     68   {          0,          0},
     69   { 0x005dfff7, 0xfffa0032},
     70   {          0,          0},
     71   { 0x0032fffa, 0xfff7005d},
     72   {          0,          0},
     73   { 0x000cffff, 0xfffa007b},
     74 };
     75 /* clang-format on */
     76 
     /* Issues a MIPS `pref` instruction with hint 0 (the "load" hint in the MIPS32
      * ISA) on the address held in src.  No C-visible state is modified.
      * NOTE(review): defined `inline` without `static` or `extern`; under C99
      * inline semantics this does not emit an external definition, so a call that
      * is not inlined may fail to link -- confirm how the build handles this. */
     77 inline void prefetch_load(unsigned char *src) {
     78   __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
     79 }
     80 
     /* Issues a MIPS `pref` instruction with hint 1 (the "store" hint in the
      * MIPS32 ISA) on the address held in dst.  No C-visible state is modified.
      * NOTE(review): defined `inline` without `static` or `extern`; under C99
      * inline semantics this does not emit an external definition, so a call that
      * is not inlined may fail to link -- confirm how the build handles this. */
     81 inline void prefetch_store(unsigned char *dst) {
     82   __asm__ __volatile__("pref   1,  0(%[dst])   \n\t" : : [dst] "r"(dst));
     83 }
     84 
     /* Fills ff_cropTbl so that (ff_cropTbl + CROP_WIDTH)[v] is v clamped to
      * 0..255 for every v in [-CROP_WIDTH, 255 + CROP_WIDTH].  Takes no arguments
      * and returns nothing; its only effect is writing the global table. */
     85 void dsputil_static_init(void) {
     86   int i;
     87 
          /* identity range: the middle 256 entries map 0..255 to themselves */
     88   for (i = 0; i < 256; ++i) ff_cropTbl[i + CROP_WIDTH] = i;
     89 
          /* guard ranges: entries below the identity range saturate to 0,
           * entries above it saturate to 255 */
     90   for (i = 0; i < CROP_WIDTH; ++i) {
     91     ff_cropTbl[i] = 0;
     92     ff_cropTbl[i + CROP_WIDTH + 256] = 255;
     93   }
     94 }
     95 
     /*
      * Horizontal first-pass sub-pixel filter producing 4 output bytes per row.
      *
      * The code path is chosen from sub_pel_filters_inv[xoffset][2]:
      *   == 0      -> plain copy of src_ptr[0..3]
      *   >  65536  -> 6-tap path, coefficients from sub_pel_filters_inv[xoffset]
      *   otherwise -> 4-tap path, coefficients from
      *                sub_pel_filters_inv_tap_4[xoffset]
      *
      * src_ptr             first pixel of the first source row; per row the 6-tap
      *                     path loads bytes src_ptr[-2..6] and the 4-tap path
      *                     loads src_ptr[-1..6]
      * dst_ptr             output buffer; 4 bytes are written per row
      * src_pixels_per_line distance in bytes between consecutive source rows
      * output_height       number of rows to produce
      * xoffset             row index into the coefficient tables (8 rows each)
      * pitch               distance in bytes between output rows on the filtered
      *                     paths; the copy path always advances dst_ptr by 4
      *
      * Every accumulator is preloaded with 64 (vector4a) before the dot products;
      * the extracted sums are clamped through the ff_cropTbl lookup (cm) inside
      * the asm and stored with `sb`.
      *
      * NOTE(review): `extp` takes its bit position from the DSPControl register,
      * which this function never writes -- presumably the caller sets it up
      * beforehand; confirm.
      */
     96 void vp8_filter_block2d_first_pass_4(unsigned char *RESTRICT src_ptr,
     97                                      unsigned char *RESTRICT dst_ptr,
     98                                      unsigned int src_pixels_per_line,
     99                                      unsigned int output_height, int xoffset,
    100                                      int pitch) {
    101   unsigned int i;
    102   int Temp1, Temp2, Temp3, Temp4;
    103 
    104   unsigned int vector4a = 64;
    105   int vector1b, vector2b, vector3b;
    106   unsigned int tp1, tp2, tn1, tn2;
    107   unsigned int p1, p2, p3;
    108   unsigned int n1, n2, n3;
    109   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
    110 
    111   vector3b = sub_pel_filters_inv[xoffset][2];
    112 
    113   /* if (xoffset == 0) we don't need any filtering */
    114   if (vector3b == 0) {
    115     for (i = 0; i < output_height; ++i) {
    116       /* prefetch src_ptr data to cache memory */
    117       prefetch_load(src_ptr + src_pixels_per_line);
    118       dst_ptr[0] = src_ptr[0];
    119       dst_ptr[1] = src_ptr[1];
    120       dst_ptr[2] = src_ptr[2];
    121       dst_ptr[3] = src_ptr[3];
    122 
    123       /* next row... */
    124       src_ptr += src_pixels_per_line;
              /* NOTE(review): advances by a fixed 4, not by pitch as the filtered
               * paths below do -- equivalent only when pitch == 4; confirm callers */
    125       dst_ptr += 4;
    126     }
    127   } else {
    128     if (vector3b > 65536) {
    129       /* 6 tap filter */
    130 
    131       vector1b = sub_pel_filters_inv[xoffset][0];
    132       vector2b = sub_pel_filters_inv[xoffset][1];
    133 
    134       /* prefetch src_ptr data to cache memory */
    135       prefetch_load(src_ptr + src_pixels_per_line);
    136 
    137       for (i = output_height; i--;) {
    138         /* apply filter with vectors pairs */
                /* NOTE(review): this asm writes memory through dst_ptr (sb) and
                 * reads through src_ptr and cm, but declares no "memory"
                 * clobber and no accumulator clobbers */
    139         __asm__ __volatile__(
    140             "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
    141             "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
    142 
    143             /* even 1. pixel */
    144             "mtlo             %[vector4a], $ac3                           \n\t"
    145             "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    146             "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    147             "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    148             "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    149             "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    150             "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
    151 
    152             /* even 2. pixel */
    153             "mtlo             %[vector4a], $ac2                           \n\t"
    154             "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
    155             "balign           %[tp2],      %[tp1],         3              \n\t"
    156             "extp             %[Temp1],    $ac3,           9              \n\t"
    157             "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
    158             "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
    159             "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
    160 
    161             /* odd 1. pixel */
    162             "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
    163             "mtlo             %[vector4a], $ac3                           \n\t"
    164             "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
    165             "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
    166             "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
    167             "extp             %[Temp3],    $ac2,           9              \n\t"
    168             "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
    169             "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
    170             "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
    171 
    172             /* odd 2. pixel */
    173             "mtlo             %[vector4a], $ac2                           \n\t"
    174             "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
    175             "extp             %[Temp2],    $ac3,           9              \n\t"
    176             "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
    177             "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
    178             "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
    179             "extp             %[Temp4],    $ac2,           9              \n\t"
    180 
    181             /* clamp */
    182             "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
    183             "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
    184             "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
    185             "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
    186 
    187             /* store bytes */
    188             "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
    189             "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
    190             "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
    191             "sb               %[n2],       3(%[dst_ptr])                  \n\t"
    192 
    193             : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
    194               [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
    195               [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
    196               [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
    197               [Temp4] "=&r"(Temp4)
    198             : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    199               [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
    200               [vector3b] "r"(vector3b), [src_ptr] "r"(src_ptr));
    201 
    202         /* Next row... */
    203         src_ptr += src_pixels_per_line;
    204         dst_ptr += pitch;
    205       }
    206     } else {
    207       /* 4 tap filter */
    208 
    209       vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
    210       vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
    211 
    212       for (i = output_height; i--;) {
    213         /* apply filter with vectors pairs */
                /* NOTE(review): this asm writes memory through dst_ptr (sb) and
                 * reads through src_ptr and cm, but declares no "memory"
                 * clobber and no accumulator clobbers */
    214         __asm__ __volatile__(
    215             "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
    216             "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
    217 
    218             /* even 1. pixel */
    219             "mtlo             %[vector4a], $ac3                           \n\t"
    220             "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    221             "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    222             "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    223             "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    224             "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    225 
    226             /* even 2. pixel */
    227             "mtlo             %[vector4a], $ac2                           \n\t"
    228             "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
    229             "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
    230             "extp             %[Temp1],    $ac3,           9              \n\t"
    231 
    232             /* odd 1. pixel */
    233             "srl              %[tn1],      %[tp2],         8              \n\t"
    234             "balign           %[tp2],      %[tp1],         3              \n\t"
    235             "mtlo             %[vector4a], $ac3                           \n\t"
    236             "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
    237             "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
    238             "preceu.ph.qbr    %[n3],       %[tn1]                         \n\t"
    239             "extp             %[Temp3],    $ac2,           9              \n\t"
    240             "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
    241             "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
    242 
    243             /* odd 2. pixel */
    244             "mtlo             %[vector4a], $ac2                           \n\t"
    245             "extp             %[Temp2],    $ac3,           9              \n\t"
    246             "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
    247             "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
    248             "extp             %[Temp4],    $ac2,           9              \n\t"
    249 
    250             /* clamp and store results */
    251             "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
    252             "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
    253             "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
    254             "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
    255             "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
    256             "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
    257             "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
    258             "sb               %[n2],       3(%[dst_ptr])                  \n\t"
    259 
    260             : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
    261               [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
    262               [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
    263               [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
    264             : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    265               [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
    266               [src_ptr] "r"(src_ptr));
    267         /*  Next row... */
    268         src_ptr += src_pixels_per_line;
    269         dst_ptr += pitch;
    270       }
    271     }
    272   }
    273 }
    274 
    /*
     * Horizontal first-pass sub-pixel filter producing 8 output bytes per row.
     *
     * xoffset == 0 copies src_ptr[0..7] unchanged.  Otherwise
     * sub_pel_filters_inv[xoffset][2] selects the 6-tap path (value > 65536,
     * coefficients from sub_pel_filters_inv[xoffset]) or the 4-tap path
     * (coefficients from sub_pel_filters_inv_tap_4[xoffset]).
     *
     * src_ptr             first pixel of the first source row; per row the 6-tap
     *                     path loads bytes src_ptr[-2..10] and the 4-tap path
     *                     loads src_ptr[-1..11]
     * dst_ptr             output buffer; 8 bytes are written per row
     * src_pixels_per_line distance in bytes between consecutive source rows
     * output_height       number of rows to produce
     * xoffset             row index into the coefficient tables (8 rows each)
     * pitch               distance in bytes between output rows on the filtered
     *                     paths; the copy path always advances dst_ptr by 8
     *
     * On the filtered paths each row is computed by two asm statements of four
     * pixels each; after each one the four extracted sums are clamped through the
     * ff_cropTbl lookup (cm) and stored from C.
     *
     * NOTE(review): the second asm statement of each path keeps accumulating into
     * $ac3, which the first statement preloads with its final `mtlo`.  The
     * accumulators are not declared as operands or clobbers, so this relies on
     * the compiler leaving them untouched between the two statements.
     * NOTE(review): `extp` takes its bit position from the DSPControl register,
     * which this function never writes -- presumably the caller sets it up
     * beforehand; confirm.
     */
    275 void vp8_filter_block2d_first_pass_8_all(unsigned char *RESTRICT src_ptr,
    276                                          unsigned char *RESTRICT dst_ptr,
    277                                          unsigned int src_pixels_per_line,
    278                                          unsigned int output_height,
    279                                          int xoffset, int pitch) {
    280   unsigned int i;
    281   int Temp1, Temp2, Temp3, Temp4;
    282 
    283   unsigned int vector4a = 64;
    284   unsigned int vector1b, vector2b, vector3b;
    285   unsigned int tp1, tp2, tn1, tn2;
    286   unsigned int p1, p2, p3, p4;
    287   unsigned int n1, n2, n3, n4;
    288 
    289   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
    290 
    291   /* if (xoffset == 0) we don't need any filtering */
    292   if (xoffset == 0) {
    293     for (i = 0; i < output_height; ++i) {
    294       /* prefetch src_ptr data to cache memory */
    295       prefetch_load(src_ptr + src_pixels_per_line);
    296 
    297       dst_ptr[0] = src_ptr[0];
    298       dst_ptr[1] = src_ptr[1];
    299       dst_ptr[2] = src_ptr[2];
    300       dst_ptr[3] = src_ptr[3];
    301       dst_ptr[4] = src_ptr[4];
    302       dst_ptr[5] = src_ptr[5];
    303       dst_ptr[6] = src_ptr[6];
    304       dst_ptr[7] = src_ptr[7];
    305 
    306       /* next row... */
    307       src_ptr += src_pixels_per_line;
              /* NOTE(review): advances by a fixed 8, not by pitch as the filtered
               * paths below do -- equivalent only when pitch == 8; confirm callers */
    308       dst_ptr += 8;
    309     }
    310   } else {
    311     vector3b = sub_pel_filters_inv[xoffset][2];
    312 
    313     if (vector3b > 65536) {
    314       /* 6 tap filter */
    315 
    316       vector1b = sub_pel_filters_inv[xoffset][0];
    317       vector2b = sub_pel_filters_inv[xoffset][1];
    318 
    319       for (i = output_height; i--;) {
    320         /* prefetch src_ptr data to cache memory */
    321         prefetch_load(src_ptr + src_pixels_per_line);
    322 
    323         /* apply filter with vectors pairs */
    324         __asm__ __volatile__(
    325             "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
    326             "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
    327 
    328             /* even 1. pixel */
    329             "mtlo             %[vector4a], $ac3                           \n\t"
    330             "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    331             "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    332             "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    333             "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    334             "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    335             "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
    336 
    337             /* even 2. pixel */
    338             "mtlo             %[vector4a], $ac2                           \n\t"
    339             "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
    340             "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
    341             "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
    342             "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
    343 
    344             "balign           %[tp2],      %[tp1],         3              \n\t"
    345             "extp             %[Temp1],    $ac3,           9              \n\t"
    346             "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
    347 
    348             /* odd 1. pixel */
    349             "mtlo             %[vector4a], $ac3                           \n\t"
    350             "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
    351             "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
    352             "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
    353             "extp             %[Temp3],    $ac2,           9              \n\t"
    354             "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
    355             "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
    356             "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
    357 
    358             /* odd 2. pixel */
    359             "mtlo             %[vector4a], $ac2                           \n\t"
    360             "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
    361             "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
    362             "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
    363             "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
    364             "ulw              %[tp1],      6(%[src_ptr])                  \n\t"
    365             "extp             %[Temp2],    $ac3,           9              \n\t"
    366             "mtlo             %[vector4a], $ac3                           \n\t"
    367             "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
    368             "extp             %[Temp4],    $ac2,           9              \n\t"
    369 
    370             : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2),
    371               [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
    372               [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
    373               [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
    374             : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    375               [vector4a] "r"(vector4a), [vector3b] "r"(vector3b),
    376               [src_ptr] "r"(src_ptr));
    377 
    378         /* clamp and store results */
    379         dst_ptr[0] = cm[Temp1];
    380         dst_ptr[1] = cm[Temp2];
    381         dst_ptr[2] = cm[Temp3];
    382         dst_ptr[3] = cm[Temp4];
    383 
    384         /* next 4 pixels */
                /* $ac3 was already preloaded with vector4a by the last mtlo of the
                 * previous asm statement; tp1, p1, p2, p3, n1 and n3 are carried
                 * over from it as inputs */
    385         __asm__ __volatile__(
    386             /* even 3. pixel */
    387             "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
    388             "dpa.w.ph         $ac3,        %[p1],          %[vector2b]    \n\t"
    389             "dpa.w.ph         $ac3,        %[p2],          %[vector3b]    \n\t"
    390 
    391             /* even 4. pixel */
    392             "mtlo             %[vector4a], $ac2                           \n\t"
    393             "preceu.ph.qbl    %[p4],       %[tp1]                         \n\t"
    394             "dpa.w.ph         $ac2,        %[p1],          %[vector1b]    \n\t"
    395             "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
    396             "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
    397 
    398             "ulw              %[tn1],      7(%[src_ptr])                  \n\t"
    399             "extp             %[Temp1],    $ac3,           9              \n\t"
    400 
    401             /* odd 3. pixel */
    402             "mtlo             %[vector4a], $ac3                           \n\t"
    403             "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
    404             "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
    405             "dpa.w.ph         $ac3,        %[n1],          %[vector2b]    \n\t"
    406             "dpa.w.ph         $ac3,        %[n2],          %[vector3b]    \n\t"
    407             "extp             %[Temp3],    $ac2,           9              \n\t"
    408 
    409             /* odd 4. pixel */
    410             "mtlo             %[vector4a], $ac2                           \n\t"
    411             "preceu.ph.qbl    %[n4],       %[tn1]                         \n\t"
    412             "dpa.w.ph         $ac2,        %[n1],          %[vector1b]    \n\t"
    413             "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
    414             "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
    415             "extp             %[Temp2],    $ac3,           9              \n\t"
    416             "extp             %[Temp4],    $ac2,           9              \n\t"
    417 
    418             : [tn1] "=&r"(tn1), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4),
    419               [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
    420               [Temp4] "=r"(Temp4)
    421             : [tp1] "r"(tp1), [vector1b] "r"(vector1b), [p2] "r"(p2),
    422               [vector2b] "r"(vector2b), [n1] "r"(n1), [p1] "r"(p1),
    423               [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3),
    424               [n3] "r"(n3), [src_ptr] "r"(src_ptr));
    425 
    426         /* clamp and store results */
    427         dst_ptr[4] = cm[Temp1];
    428         dst_ptr[5] = cm[Temp2];
    429         dst_ptr[6] = cm[Temp3];
    430         dst_ptr[7] = cm[Temp4];
    431 
    432         src_ptr += src_pixels_per_line;
    433         dst_ptr += pitch;
    434       }
    435     } else {
    436       /* 4 tap filter */
    437 
    438       vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
    439       vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
    440 
    441       for (i = output_height; i--;) {
    442         /* prefetch src_ptr data to cache memory */
    443         prefetch_load(src_ptr + src_pixels_per_line);
    444 
    445         /* apply filter with vectors pairs */
    446         __asm__ __volatile__(
    447             "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
    448 
    449             /* even 1. pixel */
    450             "mtlo             %[vector4a], $ac3                           \n\t"
    451             "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
    452             "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
    453             "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
    454             "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
    455 
    456             "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
    457 
    458             /* even 2. pixel  */
    459             "mtlo             %[vector4a], $ac2                           \n\t"
    460             "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
    461             "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
    462             "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
    463             "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
    464             "extp             %[Temp1],    $ac3,           9              \n\t"
    465 
    466             "balign           %[tp2],      %[tp1],         3              \n\t"
    467 
    468             /* odd 1. pixel */
    469             "mtlo             %[vector4a], $ac3                           \n\t"
    470             "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
    471             "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
    472             "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
    473             "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
    474             "extp             %[Temp3],    $ac2,           9              \n\t"
    475 
    476             "ulw              %[tn2],      4(%[src_ptr])                  \n\t"
    477 
    478             /* odd 2. pixel */
    479             "mtlo             %[vector4a], $ac2                           \n\t"
    480             "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
    481             "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
    482             "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
    483             "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
    484             "ulw              %[tp1],      7(%[src_ptr])                  \n\t"
    485             "extp             %[Temp2],    $ac3,           9              \n\t"
    486             "mtlo             %[vector4a], $ac3                           \n\t"
    487             "extp             %[Temp4],    $ac2,           9              \n\t"
    488 
    489             : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2),
    490               [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
    491               [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), [n4] "=&r"(n4),
    492               [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
    493               [Temp4] "=r"(Temp4)
    494             : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    495               [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
    496 
    497         /* clamp and store results */
    498         dst_ptr[0] = cm[Temp1];
    499         dst_ptr[1] = cm[Temp2];
    500         dst_ptr[2] = cm[Temp3];
    501         dst_ptr[3] = cm[Temp4];
    502 
    503         /* next 4 pixels */
                /* $ac3 was already preloaded with vector4a by the last mtlo of the
                 * previous asm statement; tp1, p3, p4, n3 and n4 are carried over
                 * from it as inputs */
    504         __asm__ __volatile__(
    505             /* even 3. pixel */
    506             "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
    507             "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
    508 
    509             /* even 4. pixel */
    510             "mtlo             %[vector4a], $ac2                           \n\t"
    511             "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
    512             "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
    513             "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
    514             "extp             %[Temp1],    $ac3,           9              \n\t"
    515 
    516             /* odd 3. pixel */
    517             "mtlo             %[vector4a], $ac3                           \n\t"
    518             "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
    519             "dpa.w.ph         $ac3,        %[n4],          %[vector2b]    \n\t"
    520             "ulw              %[tn1],      8(%[src_ptr])                  \n\t"
    521             "extp             %[Temp3],    $ac2,           9              \n\t"
    522 
    523             /* odd 4. pixel */
    524             "mtlo             %[vector4a], $ac2                           \n\t"
    525             "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
    526             "dpa.w.ph         $ac2,        %[n4],          %[vector1b]    \n\t"
    527             "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
    528             "extp             %[Temp2],    $ac3,           9              \n\t"
    529             "extp             %[Temp4],    $ac2,           9              \n\t"
    530 
    531             : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2),
    532               [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
    533               [Temp4] "=r"(Temp4)
    534             : [tp1] "r"(tp1), [p3] "r"(p3), [p4] "r"(p4),
    535               [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    536               [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr), [n3] "r"(n3),
    537               [n4] "r"(n4));
    538 
    539         /* clamp and store results */
    540         dst_ptr[4] = cm[Temp1];
    541         dst_ptr[5] = cm[Temp2];
    542         dst_ptr[6] = cm[Temp3];
    543         dst_ptr[7] = cm[Temp4];
    544 
    545         /* next row... */
    546         src_ptr += src_pixels_per_line;
    547         dst_ptr += pitch;
    548       }
    549     }
    550   }
    551 }
    552 
    /*
     * First-pass (horizontal) 6-tap filter for a 16-pixel-wide block.
     *
     * For each of output_height rows the source bytes at offsets -2 .. 18 from
     * src_ptr are filtered with the three packed tap pairs taken from
     * sub_pel_filters_inv[xoffset], and 16 result bytes are written to
     * dst_ptr[0..15] after a lookup through cm (ff_cropTbl + CROP_WIDTH).
     * After each row src_ptr advances by src_pixels_per_line and dst_ptr by
     * pitch.
     *
     * xoffset indexes an [8][3] table and is not range-checked here.
     *
     * Maintenance notes:
     *  - The DSP accumulators $ac2/$ac3 carry state from one asm statement to
     *    the next (each statement ends by preloading $ac3 for the following
     *    one), so the four statements must stay in this order.
     *  - "extp" extracts relative to the DSPControl pos field, which is not set
     *    in this function; presumably the caller configures it -- TODO confirm.
     *  - Any operand that an asm statement writes is declared as an output
     *    ("=..." or "+..."); GCC assumes input-only operands are unchanged.
     */
    553 void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr,
    554                                           unsigned char *RESTRICT dst_ptr,
    555                                           unsigned int src_pixels_per_line,
    556                                           unsigned int output_height,
    557                                           int xoffset, int pitch) {
    558   unsigned int i;
    559   int Temp1, Temp2, Temp3, Temp4;
    560
    561   unsigned int vector4a;
    562   unsigned int vector1b, vector2b, vector3b;
    563   unsigned int tp1, tp2, tn1, tn2;
    564   unsigned int p1, p2, p3, p4;
    565   unsigned int n1, n2, n3, n4;
    566   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
    567
          /* each table word packs two 16-bit taps, consumed by dpa.w.ph */
    568   vector1b = sub_pel_filters_inv[xoffset][0];
    569   vector2b = sub_pel_filters_inv[xoffset][1];
    570   vector3b = sub_pel_filters_inv[xoffset][2];
          /* accumulator start value; 64 is half of 128, the sum of a tap row */
    571   vector4a = 64;
    572
    573   for (i = output_height; i--;) {
    574     /* prefetch the next source row */
    575     prefetch_load(src_ptr + src_pixels_per_line);
    576
    577     /* pixels 0..3: apply filter with vectors pairs */
    578     __asm__ __volatile__(
    579         "ulw                %[tp1],      -2(%[src_ptr])                 \n\t"
    580         "ulw                %[tp2],      2(%[src_ptr])                  \n\t"
    581
    582         /* even 1. pixel */
    583         "mtlo               %[vector4a], $ac3                           \n\t"
    584         "preceu.ph.qbr      %[p1],       %[tp1]                         \n\t"
    585         "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
    586         "preceu.ph.qbr      %[p3],       %[tp2]                         \n\t"
    587         "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
    588         "dpa.w.ph           $ac3,        %[p2],           %[vector2b]   \n\t"
    589         "dpa.w.ph           $ac3,        %[p3],           %[vector3b]   \n\t"
    590
    591         /* even 2. pixel */
    592         "mtlo               %[vector4a], $ac2                           \n\t"
    593         "preceu.ph.qbl      %[p1],       %[tp2]                         \n\t"
    594         "dpa.w.ph           $ac2,        %[p2],           %[vector1b]   \n\t"
    595         "dpa.w.ph           $ac2,        %[p3],           %[vector2b]   \n\t"
    596         "dpa.w.ph           $ac2,        %[p1],           %[vector3b]   \n\t"
    597
    598         "balign             %[tp2],      %[tp1],          3             \n\t"
    599         "ulw                %[tn2],      3(%[src_ptr])                  \n\t"
    600         "extp               %[Temp1],    $ac3,            9             \n\t"
    601
    602         /* odd 1. pixel */
    603         "mtlo               %[vector4a], $ac3                           \n\t"
    604         "preceu.ph.qbr      %[n1],       %[tp2]                         \n\t"
    605         "preceu.ph.qbl      %[n2],       %[tp2]                         \n\t"
    606         "preceu.ph.qbr      %[n3],       %[tn2]                         \n\t"
    607         "extp               %[Temp3],    $ac2,            9             \n\t"
    608         "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
    609         "dpa.w.ph           $ac3,        %[n2],           %[vector2b]   \n\t"
    610         "dpa.w.ph           $ac3,        %[n3],           %[vector3b]   \n\t"
    611
    612         /* odd 2. pixel */
    613         "mtlo               %[vector4a], $ac2                           \n\t"
    614         "preceu.ph.qbl      %[n1],       %[tn2]                         \n\t"
    615         "dpa.w.ph           $ac2,        %[n2],           %[vector1b]   \n\t"
    616         "dpa.w.ph           $ac2,        %[n3],           %[vector2b]   \n\t"
    617         "dpa.w.ph           $ac2,        %[n1],           %[vector3b]   \n\t"
    618         "ulw                %[tp1],      6(%[src_ptr])                  \n\t"
    619         "extp               %[Temp2],    $ac3,            9             \n\t"
    620         "mtlo               %[vector4a], $ac3                           \n\t"
    621         "preceu.ph.qbr      %[p2],       %[tp1]                         \n\t"
    622         "extp               %[Temp4],    $ac2,            9             \n\t"
    623
    624         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2), [p1] "=&r"(p1),
    625           [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1), [n2] "=&r"(n2),
    626           [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
    627           [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
    628         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    629           [vector4a] "r"(vector4a), [vector3b] "r"(vector3b),
    630           [src_ptr] "r"(src_ptr));
    631
    632     /* clamp and store results */
    633     dst_ptr[0] = cm[Temp1];
    634     dst_ptr[1] = cm[Temp2];
    635     dst_ptr[2] = cm[Temp3];
    636     dst_ptr[3] = cm[Temp4];
    637
    638     /* next 4 pixels ($ac3 was preloaded by the previous statement) */
    639     __asm__ __volatile__(
    640         /* even 3. pixel */
    641         "dpa.w.ph           $ac3,        %[p3],           %[vector1b]   \n\t"
    642         "dpa.w.ph           $ac3,        %[p1],           %[vector2b]   \n\t"
    643         "dpa.w.ph           $ac3,        %[p2],           %[vector3b]   \n\t"
    644
    645         /* even 4. pixel */
    646         "mtlo               %[vector4a], $ac2                           \n\t"
    647         "preceu.ph.qbl      %[p4],       %[tp1]                         \n\t"
    648         "dpa.w.ph           $ac2,        %[p1],           %[vector1b]   \n\t"
    649         "dpa.w.ph           $ac2,        %[p2],           %[vector2b]   \n\t"
    650         "dpa.w.ph           $ac2,        %[p4],           %[vector3b]   \n\t"
    651         "ulw                %[tn1],      7(%[src_ptr])                  \n\t"
    652         "extp               %[Temp1],    $ac3,            9             \n\t"
    653
    654         /* odd 3. pixel */
    655         "mtlo               %[vector4a], $ac3                           \n\t"
    656         "preceu.ph.qbr      %[n2],       %[tn1]                         \n\t"
    657         "dpa.w.ph           $ac3,        %[n3],           %[vector1b]   \n\t"
    658         "dpa.w.ph           $ac3,        %[n1],           %[vector2b]   \n\t"
    659         "dpa.w.ph           $ac3,        %[n2],           %[vector3b]   \n\t"
    660         "extp               %[Temp3],    $ac2,            9             \n\t"
    661
    662         /* odd 4. pixel */
    663         "mtlo               %[vector4a], $ac2                           \n\t"
    664         "preceu.ph.qbl      %[n4],       %[tn1]                         \n\t"
    665         "dpa.w.ph           $ac2,        %[n1],           %[vector1b]   \n\t"
    666         "dpa.w.ph           $ac2,        %[n2],           %[vector2b]   \n\t"
    667         "dpa.w.ph           $ac2,        %[n4],           %[vector3b]   \n\t"
    668         "ulw                %[tp2],      10(%[src_ptr])                 \n\t"
    669         "extp               %[Temp2],    $ac3,            9             \n\t"
    670         "mtlo               %[vector4a], $ac3                           \n\t"
    671         "preceu.ph.qbr      %[p1],       %[tp2]                         \n\t"
    672         "extp               %[Temp4],    $ac2,            9             \n\t"
    673
                /* p1 is read above and rewritten near the end: read-write operand */
    674         : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4),
    675           [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
    676           [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1)
    677         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1),
    678           [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2),
    679           [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
    680           [src_ptr] "r"(src_ptr));
    681
    682     /* clamp and store results */
    683     dst_ptr[4] = cm[Temp1];
    684     dst_ptr[5] = cm[Temp2];
    685     dst_ptr[6] = cm[Temp3];
    686     dst_ptr[7] = cm[Temp4];
    687
    688     /* next 4 pixels */
    689     __asm__ __volatile__(
    690         /* even 5. pixel */
    691         "dpa.w.ph           $ac3,        %[p2],           %[vector1b]   \n\t"
    692         "dpa.w.ph           $ac3,        %[p4],           %[vector2b]   \n\t"
    693         "dpa.w.ph           $ac3,        %[p1],           %[vector3b]   \n\t"
    694
    695         /* even 6. pixel */
    696         "mtlo               %[vector4a], $ac2                           \n\t"
    697         "preceu.ph.qbl      %[p3],       %[tp2]                         \n\t"
    698         "dpa.w.ph           $ac2,        %[p4],           %[vector1b]   \n\t"
    699         "dpa.w.ph           $ac2,        %[p1],           %[vector2b]   \n\t"
    700         "dpa.w.ph           $ac2,        %[p3],           %[vector3b]   \n\t"
    701
    702         "ulw                %[tn1],      11(%[src_ptr])                 \n\t"
    703         "extp               %[Temp1],    $ac3,            9             \n\t"
    704
    705         /* odd 5. pixel */
    706         "mtlo               %[vector4a], $ac3                           \n\t"
    707         "preceu.ph.qbr      %[n1],       %[tn1]                         \n\t"
    708         "dpa.w.ph           $ac3,        %[n2],           %[vector1b]   \n\t"
    709         "dpa.w.ph           $ac3,        %[n4],           %[vector2b]   \n\t"
    710         "dpa.w.ph           $ac3,        %[n1],           %[vector3b]   \n\t"
    711         "extp               %[Temp3],    $ac2,            9             \n\t"
    712
    713         /* odd 6. pixel */
    714         "mtlo               %[vector4a], $ac2                           \n\t"
    715         "preceu.ph.qbl      %[n3],       %[tn1]                         \n\t"
    716         "dpa.w.ph           $ac2,        %[n4],           %[vector1b]   \n\t"
    717         "dpa.w.ph           $ac2,        %[n1],           %[vector2b]   \n\t"
    718         "dpa.w.ph           $ac2,        %[n3],           %[vector3b]   \n\t"
    719         "ulw                %[tp1],      14(%[src_ptr])                 \n\t"
    720         "extp               %[Temp2],    $ac3,            9             \n\t"
    721         "mtlo               %[vector4a], $ac3                           \n\t"
    722         "preceu.ph.qbr      %[p4],       %[tp1]                         \n\t"
    723         "extp               %[Temp4],    $ac2,            9             \n\t"
    724
                /* p4 is read above and rewritten near the end: read-write operand */
    725         : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3),
    726           [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
    727           [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4)
    728         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2),
    729           [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1),
    730           [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a),
    731           [vector3b] "r"(vector3b));
    732
    733     /* clamp and store results */
    734     dst_ptr[8] = cm[Temp1];
    735     dst_ptr[9] = cm[Temp2];
    736     dst_ptr[10] = cm[Temp3];
    737     dst_ptr[11] = cm[Temp4];
    738
    739     /* last 4 pixels; the table lookup and byte stores are done in asm */
    740     __asm__ __volatile__(
    741         /* even 7. pixel */
    742         "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
    743         "dpa.w.ph           $ac3,        %[p3],           %[vector2b]   \n\t"
    744         "dpa.w.ph           $ac3,        %[p4],           %[vector3b]   \n\t"
    745
    746         /* even 8. pixel */
    747         "mtlo               %[vector4a], $ac2                           \n\t"
    748         "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
    749         "dpa.w.ph           $ac2,        %[p3],           %[vector1b]   \n\t"
    750         "dpa.w.ph           $ac2,        %[p4],           %[vector2b]   \n\t"
    751         "dpa.w.ph           $ac2,        %[p2],           %[vector3b]   \n\t"
    752         "ulw                %[tn1],      15(%[src_ptr])                 \n\t"
    753         "extp               %[Temp1],    $ac3,            9             \n\t"
    754
    755         /* odd 7. pixel */
    756         "mtlo               %[vector4a], $ac3                           \n\t"
    757         "preceu.ph.qbr      %[n4],       %[tn1]                         \n\t"
    758         "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
    759         "dpa.w.ph           $ac3,        %[n3],           %[vector2b]   \n\t"
    760         "dpa.w.ph           $ac3,        %[n4],           %[vector3b]   \n\t"
    761         "extp               %[Temp3],    $ac2,            9             \n\t"
    762
    763         /* odd 8. pixel */
    764         "mtlo               %[vector4a], $ac2                           \n\t"
    765         "preceu.ph.qbl      %[n2],       %[tn1]                         \n\t"
    766         "dpa.w.ph           $ac2,        %[n3],           %[vector1b]   \n\t"
    767         "dpa.w.ph           $ac2,        %[n4],           %[vector2b]   \n\t"
    768         "dpa.w.ph           $ac2,        %[n2],           %[vector3b]   \n\t"
    769         "extp               %[Temp2],    $ac3,            9             \n\t"
    770         "extp               %[Temp4],    $ac2,            9             \n\t"
    771
    772         /* clamp and store results */
    773         "lbux               %[tp1],      %[Temp1](%[cm])                \n\t"
    774         "lbux               %[tn1],      %[Temp2](%[cm])                \n\t"
    775         "lbux               %[p2],       %[Temp3](%[cm])                \n\t"
    776         "sb                 %[tp1],      12(%[dst_ptr])                 \n\t"
    777         "sb                 %[tn1],      13(%[dst_ptr])                 \n\t"
    778         "lbux               %[n2],       %[Temp4](%[cm])                \n\t"
    779         "sb                 %[p2],       14(%[dst_ptr])                 \n\t"
    780         "sb                 %[n2],       15(%[dst_ptr])                 \n\t"
    781
                /*
                 * Temp4 is written (extp) before cm and dst_ptr are read, so it
                 * must be early-clobber here.  tp1 is read by preceu and later
                 * overwritten by lbux, so it is a read-write operand.  The sb
                 * stores go through dst_ptr, hence the "memory" clobber.
                 */
    782         : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4),
    783           [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
    784           [Temp4] "=&r"(Temp4), [tp1] "+r"(tp1)
    785         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    786           [p4] "r"(p4), [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a),
    787           [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
    788           [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)
                : "memory");
    789
    790     src_ptr += src_pixels_per_line;
    791     dst_ptr += pitch;
    792   }
    793 }
    794 
    /*
     * First pass for the "no horizontal filtering" case: plain row copy.
     *
     * Copies 21 rows (7 loop iterations x 3 rows) of 16 bytes each.  Source
     * rows are src_pixels_per_line bytes apart and are read with unaligned
     * word loads (ulw); the destination rows are packed back to back, 16 bytes
     * per row, and written with sw.  sw is an aligned word store, so
     * output_ptr is presumably 4-byte aligned -- TODO confirm at the caller.
     *
     * The asm statements write memory that is not listed as an operand, so
     * each one carries a "memory" clobber.
     */
    795 void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
    796                                        unsigned char *RESTRICT output_ptr,
    797                                        unsigned int src_pixels_per_line) {
    798   int Temp1, Temp2, Temp3, Temp4;
    799   int i;
    800
    801   /* store-prefetch the output buffer (note: output_ptr, not src_ptr) */
    802   prefetch_store(output_ptr + 32);
    803
    804   /* copy memory from src buffer to dst buffer, three rows per iteration */
    805   for (i = 0; i < 7; ++i) {
            /* row 0 of this group -> output bytes 0..15 */
    806     __asm__ __volatile__(
    807         "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
    808         "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
    809         "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
    810         "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
    811         "sw     %[Temp1],   0(%[output_ptr])                            \n\t"
    812         "sw     %[Temp2],   4(%[output_ptr])                            \n\t"
    813         "sw     %[Temp3],   8(%[output_ptr])                            \n\t"
    814         "sw     %[Temp4],   12(%[output_ptr])                           \n\t"
    815         "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
    816
    817         : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
    818           [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
    819         : [src_pixels_per_line] "r"(src_pixels_per_line),
    820           [output_ptr] "r"(output_ptr)
                : "memory");
    821
            /* row 1 of this group -> output bytes 16..31 */
    822     __asm__ __volatile__(
    823         "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
    824         "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
    825         "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
    826         "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
    827         "sw     %[Temp1],   16(%[output_ptr])                           \n\t"
    828         "sw     %[Temp2],   20(%[output_ptr])                           \n\t"
    829         "sw     %[Temp3],   24(%[output_ptr])                           \n\t"
    830         "sw     %[Temp4],   28(%[output_ptr])                           \n\t"
    831         "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
    832
    833         : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
    834           [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
    835         : [src_pixels_per_line] "r"(src_pixels_per_line),
    836           [output_ptr] "r"(output_ptr)
                : "memory");
    837
            /* row 2 of this group -> output bytes 32..47 */
    838     __asm__ __volatile__(
    839         "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
    840         "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
    841         "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
    842         "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
    843         "sw     %[Temp1],   32(%[output_ptr])                           \n\t"
    844         "sw     %[Temp2],   36(%[output_ptr])                           \n\t"
    845         "sw     %[Temp3],   40(%[output_ptr])                           \n\t"
    846         "sw     %[Temp4],   44(%[output_ptr])                           \n\t"
    847         "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
    848
    849         : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
    850           [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
    851         : [src_pixels_per_line] "r"(src_pixels_per_line),
    852           [output_ptr] "r"(output_ptr)
                : "memory");
    853
            /* three 16-byte rows were written */
    854     output_ptr += 48;
    855   }
    856 }
    857 
    /*
     * First-pass (horizontal) 4-tap filter for a 16-pixel-wide block.
     *
     * Each row is processed as four groups of four pixels.  A group reads the
     * source bytes at offsets -1 .. 7 from the current src_ptr, applies the two
     * packed tap pairs from sub_pel_filters_inv_tap_4[xoffset] (accumulator
     * preloaded with 64, result shifted right by 7 with extr.w), looks each
     * value up through cm (ff_cropTbl + CROP_WIDTH) and stores four bytes.
     *
     *  - yoffset == 0: results go straight to dst_ptr, rows pitch bytes apart.
     *    output_height is reduced by 5 and src_ptr is moved down two rows
     *    first; presumably the caller passes the height and start row that a
     *    following vertical pass would need -- TODO confirm at the caller.
     *  - yoffset != 0: results go to output_ptr, rows output_width bytes apart.
     *
     * xoffset indexes sub_pel_filters_inv_tap_4 directly and is not
     * range-checked here.
     *
     * Fixes relative to the previous revision:
     *  - The store base now advances with j inside the group loop.  Before,
     *    every group of a row was stored at bytes 0..3, so only four bytes per
     *    row were produced.
     *  - The yoffset != 0 branch now compensates for the 16 bytes src_ptr has
     *    already advanced inside the group loop, as the other branch does.
     *  - tp3 is read at the top of the asm (it carries the word loaded for the
     *    previous group), so it is a read-write operand rather than
     *    write-only.
     *  - The asm stores through a pointer, so it carries a "memory" clobber.
     */
    858 void vp8_filter_block2d_first_pass16_4tap(
    859     unsigned char *RESTRICT src_ptr, unsigned char *RESTRICT output_ptr,
    860     unsigned int src_pixels_per_line, unsigned int output_width,
    861     unsigned int output_height, int xoffset, int yoffset,
    862     unsigned char *RESTRICT dst_ptr, int pitch) {
    863   unsigned int i, j;
    864   int Temp1, Temp2, Temp3, Temp4;
    865
    866   unsigned int vector4a;
    867   int vector1b, vector2b;
    868   unsigned int tp1, tp2, tp3, tn1;
    869   unsigned int p1, p2, p3;
    870   unsigned int n1, n2, n3;
    871   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
    872
          /* accumulator start value added before the >> 7 extraction */
    873   vector4a = 64;
    874
    875   vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
    876   vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
    877
    878   /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */
    879   if (yoffset == 0) {
    880     output_height -= 5;
    881     src_ptr += (src_pixels_per_line + src_pixels_per_line);
    882
    883     for (i = output_height; i--;) {
              /* prime tp3 with the first word of the row; each group then hands
               * its second word on to the next group through tp3 */
    884       __asm__ __volatile__("ulw     %[tp3],   -1(%[src_ptr])               \n\t"
    885                            : [tp3] "=&r"(tp3)
    886                            : [src_ptr] "r"(src_ptr));
    887
    888       /* processing 4 adjacent pixels */
    889       for (j = 0; j < 16; j += 4) {
    890         /* apply filter with vectors pairs */
    891         __asm__ __volatile__(
    892             "ulw              %[tp2],      3(%[src_ptr])                    "
    893             "\n\t"
    894             "move             %[tp1],      %[tp3]                           "
    895             "\n\t"
    896
    897             /* even 1. pixel */
    898             "mtlo             %[vector4a], $ac3                             "
    899             "\n\t"
    900             "mthi             $0,          $ac3                             "
    901             "\n\t"
    902             "move             %[tp3],      %[tp2]                           "
    903             "\n\t"
    904             "preceu.ph.qbr    %[p1],       %[tp1]                           "
    905             "\n\t"
    906             "preceu.ph.qbl    %[p2],       %[tp1]                           "
    907             "\n\t"
    908             "preceu.ph.qbr    %[p3],       %[tp2]                           "
    909             "\n\t"
    910             "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     "
    911             "\n\t"
    912             "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     "
    913             "\n\t"
    914
    915             /* even 2. pixel */
    916             "mtlo             %[vector4a], $ac2                             "
    917             "\n\t"
    918             "mthi             $0,          $ac2                             "
    919             "\n\t"
    920             "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     "
    921             "\n\t"
    922             "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     "
    923             "\n\t"
    924             "extr.w           %[Temp1],    $ac3,            7               "
    925             "\n\t"
    926
    927             /* odd 1. pixel */
    928             "ulw              %[tn1],      4(%[src_ptr])                    "
    929             "\n\t"
    930             "balign           %[tp2],      %[tp1],          3               "
    931             "\n\t"
    932             "mtlo             %[vector4a], $ac3                             "
    933             "\n\t"
    934             "mthi             $0,          $ac3                             "
    935             "\n\t"
    936             "preceu.ph.qbr    %[n1],       %[tp2]                           "
    937             "\n\t"
    938             "preceu.ph.qbl    %[n2],       %[tp2]                           "
    939             "\n\t"
    940             "preceu.ph.qbr    %[n3],       %[tn1]                           "
    941             "\n\t"
    942             "extr.w           %[Temp3],    $ac2,            7               "
    943             "\n\t"
    944             "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     "
    945             "\n\t"
    946             "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     "
    947             "\n\t"
    948
    949             /* odd 2. pixel */
    950             "mtlo             %[vector4a], $ac2                             "
    951             "\n\t"
    952             "mthi             $0,          $ac2                             "
    953             "\n\t"
    954             "extr.w           %[Temp2],    $ac3,            7               "
    955             "\n\t"
    956             "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     "
    957             "\n\t"
    958             "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     "
    959             "\n\t"
    960             "extr.w           %[Temp4],    $ac2,            7               "
    961             "\n\t"
    962
    963             /* clamp and store results */
    964             "lbux             %[tp1],      %[Temp1](%[cm])                  "
    965             "\n\t"
    966             "lbux             %[tn1],      %[Temp2](%[cm])                  "
    967             "\n\t"
    968             "lbux             %[tp2],      %[Temp3](%[cm])                  "
    969             "\n\t"
    970             "sb               %[tp1],      0(%[dst_ptr])                    "
    971             "\n\t"
    972             "sb               %[tn1],      1(%[dst_ptr])                    "
    973             "\n\t"
    974             "lbux             %[n2],       %[Temp4](%[cm])                  "
    975             "\n\t"
    976             "sb               %[tp2],      2(%[dst_ptr])                    "
    977             "\n\t"
    978             "sb               %[n2],       3(%[dst_ptr])                    "
    979             "\n\t"
    980
                    /* tp3 is consumed before it is rewritten: "+&r", not "=&r".
                     * The store base is dst_ptr + j so each group lands in its
                     * own four bytes of the row. */
    981             : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "+&r"(tp3),
    982               [tn1] "=&r"(tn1), [p1] "=&r"(p1), [p2] "=&r"(p2), [n1] "=&r"(n1),
    983               [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
    984               [Temp2] "=&r"(Temp2), [p3] "=&r"(p3), [Temp3] "=&r"(Temp3),
    985               [Temp4] "=&r"(Temp4)
    986             : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
    987               [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr + j),
    988               [src_ptr] "r"(src_ptr)
                    : "memory");
    989
    990         src_ptr += 4;
    991       }
    992
    993       /* Next row... (src_ptr already moved 16 bytes in the group loop) */
    994       src_ptr += src_pixels_per_line - 16;
    995       dst_ptr += pitch;
    996     }
    997   } else {
    998     for (i = output_height; i--;) {
    999       /* processing 4 adjacent pixels */
   1000       for (j = 0; j < 16; j += 4) {
   1001         /* apply filter with vectors pairs */
   1002         __asm__ __volatile__(
   1003             "ulw              %[tp1],      -1(%[src_ptr])                   "
   1004             "\n\t"
   1005             "ulw              %[tp2],      3(%[src_ptr])                    "
   1006             "\n\t"
   1007
   1008             /* even 1. pixel */
   1009             "mtlo             %[vector4a], $ac3                             "
   1010             "\n\t"
   1011             "mthi             $0,          $ac3                             "
   1012             "\n\t"
   1013             "preceu.ph.qbr    %[p1],       %[tp1]                           "
   1014             "\n\t"
   1015             "preceu.ph.qbl    %[p2],       %[tp1]                           "
   1016             "\n\t"
   1017             "preceu.ph.qbr    %[p3],       %[tp2]                           "
   1018             "\n\t"
   1019             "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     "
   1020             "\n\t"
   1021             "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     "
   1022             "\n\t"
   1023
   1024             /* even 2. pixel */
   1025             "mtlo             %[vector4a], $ac2                             "
   1026             "\n\t"
   1027             "mthi             $0,          $ac2                             "
   1028             "\n\t"
   1029             "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     "
   1030             "\n\t"
   1031             "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     "
   1032             "\n\t"
   1033             "extr.w           %[Temp1],    $ac3,            7               "
   1034             "\n\t"
   1035
   1036             /* odd 1. pixel */
   1037             "ulw              %[tn1],      4(%[src_ptr])                    "
   1038             "\n\t"
   1039             "balign           %[tp2],      %[tp1],          3               "
   1040             "\n\t"
   1041             "mtlo             %[vector4a], $ac3                             "
   1042             "\n\t"
   1043             "mthi             $0,          $ac3                             "
   1044             "\n\t"
   1045             "preceu.ph.qbr    %[n1],       %[tp2]                           "
   1046             "\n\t"
   1047             "preceu.ph.qbl    %[n2],       %[tp2]                           "
   1048             "\n\t"
   1049             "preceu.ph.qbr    %[n3],       %[tn1]                           "
   1050             "\n\t"
   1051             "extr.w           %[Temp3],    $ac2,            7               "
   1052             "\n\t"
   1053             "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     "
   1054             "\n\t"
   1055             "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     "
   1056             "\n\t"
   1057
   1058             /* odd 2. pixel */
   1059             "mtlo             %[vector4a], $ac2                             "
   1060             "\n\t"
   1061             "mthi             $0,          $ac2                             "
   1062             "\n\t"
   1063             "extr.w           %[Temp2],    $ac3,            7               "
   1064             "\n\t"
   1065             "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     "
   1066             "\n\t"
   1067             "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     "
   1068             "\n\t"
   1069             "extr.w           %[Temp4],    $ac2,            7               "
   1070             "\n\t"
   1071
   1072             /* clamp and store results */
   1073             "lbux             %[tp1],      %[Temp1](%[cm])                  "
   1074             "\n\t"
   1075             "lbux             %[tn1],      %[Temp2](%[cm])                  "
   1076             "\n\t"
   1077             "lbux             %[tp2],      %[Temp3](%[cm])                  "
   1078             "\n\t"
   1079             "sb               %[tp1],      0(%[output_ptr])                 "
   1080             "\n\t"
   1081             "sb               %[tn1],      1(%[output_ptr])                 "
   1082             "\n\t"
   1083             "lbux             %[n2],       %[Temp4](%[cm])                  "
   1084             "\n\t"
   1085             "sb               %[tp2],      2(%[output_ptr])                 "
   1086             "\n\t"
   1087             "sb               %[n2],       3(%[output_ptr])                 "
   1088             "\n\t"
   1089
                    /* store base is output_ptr + j so each group lands in its
                     * own four bytes of the row */
   1090             : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
   1091               [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
   1092               [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
   1093               [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
   1094             : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
   1095               [vector4a] "r"(vector4a), [cm] "r"(cm),
   1096               [output_ptr] "r"(output_ptr + j), [src_ptr] "r"(src_ptr)
                    : "memory");
   1097
   1098         src_ptr += 4;
   1099       }
   1100
   1101       /* next row... (src_ptr already moved 16 bytes in the group loop) */
   1102       src_ptr += src_pixels_per_line - 16;
   1103       output_ptr += output_width;
   1104     }
   1105   }
   1106 }
   1107 
   1108 void vp8_filter_block2d_second_pass4(unsigned char *RESTRICT src_ptr,
   1109                                      unsigned char *RESTRICT output_ptr,
   1110                                      int output_pitch, int yoffset) {
   1111   unsigned int i;
   1112 
   1113   int Temp1, Temp2, Temp3, Temp4;
   1114   unsigned int vector1b, vector2b, vector3b, vector4a;
   1115 
   1116   unsigned char src_ptr_l2;
   1117   unsigned char src_ptr_l1;
   1118   unsigned char src_ptr_0;
   1119   unsigned char src_ptr_r1;
   1120   unsigned char src_ptr_r2;
   1121   unsigned char src_ptr_r3;
   1122 
   1123   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
   1124 
   1125   vector4a = 64;
   1126 
   1127   /* load filter coefficients */
   1128   vector1b = sub_pel_filterss[yoffset][0];
   1129   vector2b = sub_pel_filterss[yoffset][2];
   1130   vector3b = sub_pel_filterss[yoffset][1];
   1131 
   1132   if (vector1b) {
   1133     /* 6 tap filter */
   1134 
   1135     for (i = 2; i--;) {
   1136       /* prefetch src_ptr data to cache memory */
   1137       prefetch_load(src_ptr);
   1138 
   1139       /* do not allow compiler to reorder instructions */
   1140       __asm__ __volatile__(
   1141           ".set noreorder                                                 \n\t"
   1142           :
   1143           :);
   1144 
   1145       /* apply filter with vectors pairs */
   1146       __asm__ __volatile__(
   1147           "lbu            %[src_ptr_l2],  -8(%[src_ptr])                  \n\t"
   1148           "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
   1149           "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   1150           "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
   1151           "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
   1152           "lbu            %[src_ptr_r3],  12(%[src_ptr])                  \n\t"
   1153           "mtlo           %[vector4a],    $ac2                            \n\t"
   1154 
   1155           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1156           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1157           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1158           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1159           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1160           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1161 
   1162           "lbu            %[src_ptr_l2],  -7(%[src_ptr])                  \n\t"
   1163           "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
   1164           "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   1165           "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
   1166           "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
   1167           "lbu            %[src_ptr_r3],  13(%[src_ptr])                  \n\t"
   1168           "mtlo           %[vector4a],    $ac3                            \n\t"
   1169           "extp           %[Temp1],       $ac2,           9               \n\t"
   1170 
   1171           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1172           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1173           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1174           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1175           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1176           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1177 
   1178           "lbu            %[src_ptr_l2],  -6(%[src_ptr])                  \n\t"
   1179           "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
   1180           "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
   1181           "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
   1182           "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
   1183           "lbu            %[src_ptr_r3],  14(%[src_ptr])                  \n\t"
   1184           "mtlo           %[vector4a],    $ac0                            \n\t"
   1185           "extp           %[Temp2],       $ac3,           9               \n\t"
   1186 
   1187           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1188           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1189           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1190           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1191           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1192           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1193 
   1194           "lbu            %[src_ptr_l2],  -5(%[src_ptr])                  \n\t"
   1195           "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
   1196           "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
   1197           "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
   1198           "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
   1199           "lbu            %[src_ptr_r3],  15(%[src_ptr])                  \n\t"
   1200           "mtlo           %[vector4a],    $ac1                            \n\t"
   1201           "extp           %[Temp3],       $ac0,           9               \n\t"
   1202 
   1203           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1204           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1205           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1206           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1207           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1208           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1209           "extp           %[Temp4],       $ac1,           9               \n\t"
   1210 
   1211           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
   1212             [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
   1213             [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
   1214             [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2),
   1215             [src_ptr_r3] "=&r"(src_ptr_r3)
   1216           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
   1217             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
   1218             [src_ptr] "r"(src_ptr));
   1219 
   1220       /* clamp and store results */
   1221       output_ptr[0] = cm[Temp1];
   1222       output_ptr[1] = cm[Temp2];
   1223       output_ptr[2] = cm[Temp3];
   1224       output_ptr[3] = cm[Temp4];
   1225 
   1226       output_ptr += output_pitch;
   1227 
   1228       /* apply filter with vector pairs */
   1229       __asm__ __volatile__(
   1230           "lbu            %[src_ptr_l2],  -4(%[src_ptr])                  \n\t"
   1231           "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
   1232           "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
   1233           "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
   1234           "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
   1235           "lbu            %[src_ptr_r3],  16(%[src_ptr])                  \n\t"
   1236           "mtlo           %[vector4a],    $ac2                            \n\t"
   1237           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1238           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1239           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1240           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1241           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1242           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1243 
   1244           "lbu            %[src_ptr_l2],  -3(%[src_ptr])                  \n\t"
   1245           "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
   1246           "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
   1247           "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
   1248           "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
   1249           "lbu            %[src_ptr_r3],  17(%[src_ptr])                  \n\t"
   1250           "mtlo           %[vector4a],    $ac3                            \n\t"
   1251           "extp           %[Temp1],       $ac2,           9               \n\t"
   1252 
   1253           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1254           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1255           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1256           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1257           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1258           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1259 
   1260           "lbu            %[src_ptr_l2],  -2(%[src_ptr])                  \n\t"
   1261           "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
   1262           "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
   1263           "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
   1264           "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
   1265           "lbu            %[src_ptr_r3],  18(%[src_ptr])                  \n\t"
   1266           "mtlo           %[vector4a],    $ac0                            \n\t"
   1267           "extp           %[Temp2],       $ac3,           9               \n\t"
   1268 
   1269           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1270           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1271           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1272           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1273           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1274           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1275 
   1276           "lbu            %[src_ptr_l2],  -1(%[src_ptr])                  \n\t"
   1277           "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
   1278           "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
   1279           "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
   1280           "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
   1281           "lbu            %[src_ptr_r3],  19(%[src_ptr])                  \n\t"
   1282           "mtlo           %[vector4a],    $ac1                            \n\t"
   1283           "extp           %[Temp3],       $ac0,           9               \n\t"
   1284 
   1285           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1286           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1287           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1288           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1289           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1290           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1291           "extp           %[Temp4],       $ac1,           9               \n\t"
   1292 
   1293           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
   1294             [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
   1295             [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
   1296             [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2),
   1297             [src_ptr_r3] "=&r"(src_ptr_r3)
   1298           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
   1299             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
   1300             [src_ptr] "r"(src_ptr));
   1301 
   1302       /* clamp and store results */
   1303       output_ptr[0] = cm[Temp1];
   1304       output_ptr[1] = cm[Temp2];
   1305       output_ptr[2] = cm[Temp3];
   1306       output_ptr[3] = cm[Temp4];
   1307 
   1308       src_ptr += 8;
   1309       output_ptr += output_pitch;
   1310     }
   1311   } else {
   1312     /* 4 tap filter */
   1313 
   1314     /* prefetch src_ptr data to cache memory */
   1315     prefetch_load(src_ptr);
   1316 
   1317     for (i = 2; i--;) {
   1318       /* tell the assembler not to reorder instructions */
   1319       __asm__ __volatile__(
   1320           ".set noreorder                                                 \n\t"
   1321           :
   1322           :);
   1323 
   1324       /* apply filter with vector pairs */
   1325       __asm__ __volatile__(
   1326           "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
   1327           "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   1328           "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
   1329           "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
   1330           "mtlo           %[vector4a],    $ac2                            \n\t"
   1331           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1332           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1333           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1334           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1335 
   1336           "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
   1337           "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   1338           "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
   1339           "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
   1340           "mtlo           %[vector4a],    $ac3                            \n\t"
   1341           "extp           %[Temp1],       $ac2,           9               \n\t"
   1342 
   1343           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1344           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1345           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1346           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1347 
   1348           "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
   1349           "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
   1350           "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
   1351           "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
   1352           "mtlo           %[vector4a],    $ac0                            \n\t"
   1353           "extp           %[Temp2],       $ac3,           9               \n\t"
   1354 
   1355           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1356           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1357           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1358           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1359 
   1360           "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
   1361           "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
   1362           "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
   1363           "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
   1364           "mtlo           %[vector4a],    $ac1                            \n\t"
   1365           "extp           %[Temp3],       $ac0,           9               \n\t"
   1366           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1367           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1368           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1369           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1370           "extp           %[Temp4],       $ac1,           9               \n\t"
   1371 
   1372           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
   1373             [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
   1374             [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
   1375             [src_ptr_r2] "=&r"(src_ptr_r2)
   1376           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   1377             [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
   1378 
   1379       /* clamp and store results */
   1380       output_ptr[0] = cm[Temp1];
   1381       output_ptr[1] = cm[Temp2];
   1382       output_ptr[2] = cm[Temp3];
   1383       output_ptr[3] = cm[Temp4];
   1384 
   1385       output_ptr += output_pitch;
   1386 
   1387       /* apply filter with vector pairs */
   1388       __asm__ __volatile__(
   1389           "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
   1390           "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
   1391           "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
   1392           "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
   1393           "mtlo           %[vector4a],    $ac2                            \n\t"
   1394           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1395           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1396           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1397           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1398 
   1399           "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
   1400           "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
   1401           "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
   1402           "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
   1403           "mtlo           %[vector4a],    $ac3                            \n\t"
   1404           "extp           %[Temp1],       $ac2,           9               \n\t"
   1405 
   1406           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1407           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1408           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1409           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1410 
   1411           "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
   1412           "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
   1413           "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
   1414           "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
   1415           "mtlo           %[vector4a],    $ac0                            \n\t"
   1416           "extp           %[Temp2],       $ac3,           9               \n\t"
   1417 
   1418           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1419           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1420           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1421           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1422 
   1423           "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
   1424           "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
   1425           "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
   1426           "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
   1427           "mtlo           %[vector4a],    $ac1                            \n\t"
   1428           "extp           %[Temp3],       $ac0,           9               \n\t"
   1429           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1430           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1431           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1432           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1433           "extp           %[Temp4],       $ac1,           9               \n\t"
   1434 
   1435           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
   1436             [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
   1437             [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
   1438             [src_ptr_r2] "=&r"(src_ptr_r2)
   1439           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   1440             [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
   1441 
   1442       /* clamp and store results */
   1443       output_ptr[0] = cm[Temp1];
   1444       output_ptr[1] = cm[Temp2];
   1445       output_ptr[2] = cm[Temp3];
   1446       output_ptr[3] = cm[Temp4];
   1447 
   1448       src_ptr += 8;
   1449       output_ptr += output_pitch;
   1450     }
   1451   }
   1452 }
   1453 
   1454 void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr,
   1455                                       unsigned char *RESTRICT output_ptr,
   1456                                       int output_pitch,
   1457                                       unsigned int output_height,
   1458                                       unsigned int output_width,
   1459                                       unsigned int yoffset) {
   1460   unsigned int i;
   1461 
   1462   int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
   1463   unsigned int vector1b, vector2b, vector3b, vector4a;
   1464 
   1465   unsigned char src_ptr_l2;
   1466   unsigned char src_ptr_l1;
   1467   unsigned char src_ptr_0;
   1468   unsigned char src_ptr_r1;
   1469   unsigned char src_ptr_r2;
   1470   unsigned char src_ptr_r3;
   1471   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
   1472   (void)output_width;
   1473 
   1474   vector4a = 64;
   1475 
   1476   vector1b = sub_pel_filterss[yoffset][0];
   1477   vector2b = sub_pel_filterss[yoffset][2];
   1478   vector3b = sub_pel_filterss[yoffset][1];
   1479 
   1480   if (vector1b) {
   1481     /* 6 tap filter */
   1482 
   1483     /* prefetch src_ptr data to cache memory */
   1484     prefetch_load(src_ptr);
   1485 
   1486     for (i = output_height; i--;) {
   1487       /* apply filter with vector pairs */
   1488       __asm__ __volatile__(
   1489           "lbu            %[src_ptr_l2],  -16(%[src_ptr])                 \n\t"
   1490           "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
   1491           "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   1492           "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
   1493           "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
   1494           "lbu            %[src_ptr_r3],  24(%[src_ptr])                  \n\t"
   1495           "mtlo           %[vector4a],    $ac2                            \n\t"
   1496 
   1497           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1498           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1499           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1500           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1501           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1502           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1503 
   1504           "lbu            %[src_ptr_l2],  -15(%[src_ptr])                 \n\t"
   1505           "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
   1506           "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   1507           "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
   1508           "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
   1509           "lbu            %[src_ptr_r3],  25(%[src_ptr])                  \n\t"
   1510           "mtlo           %[vector4a],    $ac3                            \n\t"
   1511           "extp           %[Temp1],       $ac2,           9               \n\t"
   1512 
   1513           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1514           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1515           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1516           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1517           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1518           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1519 
   1520           "lbu            %[src_ptr_l2],  -14(%[src_ptr])                 \n\t"
   1521           "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
   1522           "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
   1523           "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
   1524           "lbu            %[src_ptr_r2],  18(%[src_ptr])                  \n\t"
   1525           "lbu            %[src_ptr_r3],  26(%[src_ptr])                  \n\t"
   1526           "mtlo           %[vector4a],    $ac0                            \n\t"
   1527           "extp           %[Temp2],       $ac3,           9               \n\t"
   1528 
   1529           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1530           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1531           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1532           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1533           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1534           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1535 
   1536           "lbu            %[src_ptr_l2],  -13(%[src_ptr])                 \n\t"
   1537           "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
   1538           "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
   1539           "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
   1540           "lbu            %[src_ptr_r2],  19(%[src_ptr])                  \n\t"
   1541           "lbu            %[src_ptr_r3],  27(%[src_ptr])                  \n\t"
   1542           "mtlo           %[vector4a],    $ac1                            \n\t"
   1543           "extp           %[Temp3],       $ac0,           9               \n\t"
   1544 
   1545           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1546           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1547           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1548           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1549           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1550           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1551 
   1552           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
   1553             [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
   1554             [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
   1555             [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
   1556           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
   1557             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
   1558             [src_ptr] "r"(src_ptr));
   1559 
   1560       /* apply filter with vector pairs */
   1561       __asm__ __volatile__(
   1562           "lbu            %[src_ptr_l2],  -12(%[src_ptr])                 \n\t"
   1563           "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
   1564           "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
   1565           "lbu            %[src_ptr_r1],  12(%[src_ptr])                  \n\t"
   1566           "lbu            %[src_ptr_r2],  20(%[src_ptr])                  \n\t"
   1567           "lbu            %[src_ptr_r3],  28(%[src_ptr])                  \n\t"
   1568           "mtlo           %[vector4a],    $ac2                            \n\t"
   1569 
   1570           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1571           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1572           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1573           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1574           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1575           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1576           "extp           %[Temp4],       $ac1,           9               \n\t"
   1577 
   1578           "lbu            %[src_ptr_l2],  -11(%[src_ptr])                 \n\t"
   1579           "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
   1580           "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
   1581           "lbu            %[src_ptr_r1],  13(%[src_ptr])                  \n\t"
   1582           "lbu            %[src_ptr_r2],  21(%[src_ptr])                  \n\t"
   1583           "lbu            %[src_ptr_r3],  29(%[src_ptr])                  \n\t"
   1584           "mtlo           %[vector4a],    $ac3                            \n\t"
   1585           "extp           %[Temp5],       $ac2,           9               \n\t"
   1586 
   1587           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1588           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1589           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1590           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1591           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1592           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1593 
   1594           "lbu            %[src_ptr_l2],  -10(%[src_ptr])                 \n\t"
   1595           "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
   1596           "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
   1597           "lbu            %[src_ptr_r1],  14(%[src_ptr])                  \n\t"
   1598           "lbu            %[src_ptr_r2],  22(%[src_ptr])                  \n\t"
   1599           "lbu            %[src_ptr_r3],  30(%[src_ptr])                  \n\t"
   1600           "mtlo           %[vector4a],    $ac0                            \n\t"
   1601           "extp           %[Temp6],       $ac3,           9               \n\t"
   1602 
   1603           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1604           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1605           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1606           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1607           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1608           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1609 
   1610           "lbu            %[src_ptr_l2],  -9(%[src_ptr])                  \n\t"
   1611           "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
   1612           "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
   1613           "lbu            %[src_ptr_r1],  15(%[src_ptr])                  \n\t"
   1614           "lbu            %[src_ptr_r2],  23(%[src_ptr])                  \n\t"
   1615           "lbu            %[src_ptr_r3],  31(%[src_ptr])                  \n\t"
   1616           "mtlo           %[vector4a],    $ac1                            \n\t"
   1617           "extp           %[Temp7],       $ac0,           9               \n\t"
   1618 
   1619           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   1620           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1621           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1622           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   1623           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1624           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1625           "extp           %[Temp8],       $ac1,           9               \n\t"
   1626 
   1627           : [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
   1628             [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
   1629             [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
   1630             [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
   1631             [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
   1632           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
   1633             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
   1634             [src_ptr] "r"(src_ptr));
   1635 
   1636       /* clamp and store results */
   1637       output_ptr[0] = cm[Temp1];
   1638       output_ptr[1] = cm[Temp2];
   1639       output_ptr[2] = cm[Temp3];
   1640       output_ptr[3] = cm[Temp4];
   1641       output_ptr[4] = cm[Temp5];
   1642       output_ptr[5] = cm[Temp6];
   1643       output_ptr[6] = cm[Temp7];
   1644       output_ptr[7] = cm[Temp8];
   1645 
   1646       src_ptr += 8;
   1647       output_ptr += output_pitch;
   1648     }
   1649   } else {
   1650     /* 4 tap filter */
   1651 
   1652     /* prefetch src_ptr data to cache memory */
   1653     prefetch_load(src_ptr);
   1654 
   1655     for (i = output_height; i--;) {
   1656       __asm__ __volatile__(
   1657           "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
   1658           "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   1659           "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
   1660           "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
   1661           "mtlo           %[vector4a],    $ac2                            \n\t"
   1662           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1663           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1664           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1665           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1666 
   1667           : [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
   1668             [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2)
   1669           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   1670             [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
   1671 
   1672       __asm__ __volatile__(
   1673           "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
   1674           "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   1675           "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
   1676           "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
   1677           "mtlo           %[vector4a],    $ac3                            \n\t"
   1678           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1679           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1680           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1681           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1682           "extp           %[Temp1],       $ac2,           9               \n\t"
   1683 
   1684           : [Temp1] "=r"(Temp1), [src_ptr_l1] "=&r"(src_ptr_l1),
   1685             [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
   1686             [src_ptr_r2] "=&r"(src_ptr_r2)
   1687           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   1688             [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
   1689 
   1690       src_ptr_l1 = src_ptr[-6];
   1691       src_ptr_0 = src_ptr[2];
   1692       src_ptr_r1 = src_ptr[10];
   1693       src_ptr_r2 = src_ptr[18];
   1694 
   1695       __asm__ __volatile__(
   1696           "mtlo           %[vector4a],    $ac0                            \n\t"
   1697           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1698           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1699           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1700           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1701           "extp           %[Temp2],       $ac3,           9               \n\t"
   1702 
   1703           : [Temp2] "=r"(Temp2)
   1704           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   1705             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
   1706             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
   1707             [vector4a] "r"(vector4a));
   1708 
   1709       src_ptr_l1 = src_ptr[-5];
   1710       src_ptr_0 = src_ptr[3];
   1711       src_ptr_r1 = src_ptr[11];
   1712       src_ptr_r2 = src_ptr[19];
   1713 
   1714       __asm__ __volatile__(
   1715           "mtlo           %[vector4a],    $ac1                            \n\t"
   1716           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1717           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1718           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1719           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1720           "extp           %[Temp3],       $ac0,           9               \n\t"
   1721 
   1722           : [Temp3] "=r"(Temp3)
   1723           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   1724             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
   1725             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
   1726             [vector4a] "r"(vector4a));
   1727 
   1728       src_ptr_l1 = src_ptr[-4];
   1729       src_ptr_0 = src_ptr[4];
   1730       src_ptr_r1 = src_ptr[12];
   1731       src_ptr_r2 = src_ptr[20];
   1732 
   1733       __asm__ __volatile__(
   1734           "mtlo           %[vector4a],    $ac2                            \n\t"
   1735           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1736           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1737           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   1738           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1739           "extp           %[Temp4],       $ac1,           9               \n\t"
   1740 
   1741           : [Temp4] "=r"(Temp4)
   1742           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   1743             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
   1744             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
   1745             [vector4a] "r"(vector4a));
   1746 
   1747       src_ptr_l1 = src_ptr[-3];
   1748       src_ptr_0 = src_ptr[5];
   1749       src_ptr_r1 = src_ptr[13];
   1750       src_ptr_r2 = src_ptr[21];
   1751 
   1752       __asm__ __volatile__(
   1753           "mtlo           %[vector4a],    $ac3                            \n\t"
   1754           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1755           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1756           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   1757           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1758           "extp           %[Temp5],       $ac2,           9               \n\t"
   1759 
   1760           : [Temp5] "=&r"(Temp5)
   1761           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   1762             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
   1763             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
   1764             [vector4a] "r"(vector4a));
   1765 
   1766       src_ptr_l1 = src_ptr[-2];
   1767       src_ptr_0 = src_ptr[6];
   1768       src_ptr_r1 = src_ptr[14];
   1769       src_ptr_r2 = src_ptr[22];
   1770 
   1771       __asm__ __volatile__(
   1772           "mtlo           %[vector4a],    $ac0                            \n\t"
   1773           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1774           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1775           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   1776           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1777           "extp           %[Temp6],       $ac3,           9               \n\t"
   1778 
   1779           : [Temp6] "=r"(Temp6)
   1780           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   1781             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
   1782             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
   1783             [vector4a] "r"(vector4a));
   1784 
   1785       src_ptr_l1 = src_ptr[-1];
   1786       src_ptr_0 = src_ptr[7];
   1787       src_ptr_r1 = src_ptr[15];
   1788       src_ptr_r2 = src_ptr[23];
   1789 
   1790       __asm__ __volatile__(
   1791           "mtlo           %[vector4a],    $ac1                            \n\t"
   1792           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   1793           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   1794           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   1795           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   1796           "extp           %[Temp7],       $ac0,           9               \n\t"
   1797           "extp           %[Temp8],       $ac1,           9               \n\t"
   1798 
   1799           : [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8)
   1800           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   1801             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
   1802             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
   1803             [vector4a] "r"(vector4a));
   1804 
   1805       /* clamp and store results */
   1806       output_ptr[0] = cm[Temp1];
   1807       output_ptr[1] = cm[Temp2];
   1808       output_ptr[2] = cm[Temp3];
   1809       output_ptr[3] = cm[Temp4];
   1810       output_ptr[4] = cm[Temp5];
   1811       output_ptr[5] = cm[Temp6];
   1812       output_ptr[6] = cm[Temp7];
   1813       output_ptr[7] = cm[Temp8];
   1814 
   1815       src_ptr += 8;
   1816       output_ptr += output_pitch;
   1817     }
   1818   }
   1819 }
   1820 
   1821 void vp8_filter_block2d_second_pass161(unsigned char *RESTRICT src_ptr,
   1822                                        unsigned char *RESTRICT output_ptr,
   1823                                        int output_pitch,
   1824                                        const unsigned short *vp8_filter) {
   1825   unsigned int i, j;
   1826 
   1827   int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
   1828   unsigned int vector4a;
   1829   unsigned int vector1b, vector2b, vector3b;
   1830 
   1831   unsigned char src_ptr_l2;
   1832   unsigned char src_ptr_l1;
   1833   unsigned char src_ptr_0;
   1834   unsigned char src_ptr_r1;
   1835   unsigned char src_ptr_r2;
   1836   unsigned char src_ptr_r3;
   1837   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
   1838 
   1839   vector4a = 64;
   1840 
   1841   vector1b = vp8_filter[0];
   1842   vector2b = vp8_filter[2];
   1843   vector3b = vp8_filter[1];
   1844 
   1845   if (vector1b == 0) {
   1846     /* 4 tap filter */
   1847 
   1848     /* prefetch src_ptr data to cache memory */
   1849     prefetch_load(src_ptr + 16);
   1850 
   1851     for (i = 16; i--;) {
   1852       /* unrolling for loop */
   1853       for (j = 0; j < 16; j += 8) {
   1854         /* apply filter with vector pairs */
   1855         __asm__ __volatile__(
   1856             "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 "
   1857             "\n\t"
   1858             "lbu            %[src_ptr_0],   0(%[src_ptr])                   "
   1859             "\n\t"
   1860             "lbu            %[src_ptr_r1],  16(%[src_ptr])                  "
   1861             "\n\t"
   1862             "lbu            %[src_ptr_r2],  32(%[src_ptr])                  "
   1863             "\n\t"
   1864             "mtlo           %[vector4a],    $ac2                            "
   1865             "\n\t"
   1866             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
   1867             "\n\t"
   1868             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
   1869             "\n\t"
   1870             "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     "
   1871             "\n\t"
   1872             "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     "
   1873             "\n\t"
   1874 
   1875             "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 "
   1876             "\n\t"
   1877             "lbu            %[src_ptr_0],   1(%[src_ptr])                   "
   1878             "\n\t"
   1879             "lbu            %[src_ptr_r1],  17(%[src_ptr])                  "
   1880             "\n\t"
   1881             "lbu            %[src_ptr_r2],  33(%[src_ptr])                  "
   1882             "\n\t"
   1883             "mtlo           %[vector4a],    $ac3                            "
   1884             "\n\t"
   1885             "extp           %[Temp1],       $ac2,           9               "
   1886             "\n\t"
   1887 
   1888             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
   1889             "\n\t"
   1890             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
   1891             "\n\t"
   1892             "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     "
   1893             "\n\t"
   1894             "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     "
   1895             "\n\t"
   1896 
   1897             "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 "
   1898             "\n\t"
   1899             "lbu            %[src_ptr_0],   2(%[src_ptr])                   "
   1900             "\n\t"
   1901             "lbu            %[src_ptr_r1],  18(%[src_ptr])                  "
   1902             "\n\t"
   1903             "lbu            %[src_ptr_r2],  34(%[src_ptr])                  "
   1904             "\n\t"
   1905             "mtlo           %[vector4a],    $ac1                            "
   1906             "\n\t"
   1907             "extp           %[Temp2],       $ac3,           9               "
   1908             "\n\t"
   1909 
   1910             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
   1911             "\n\t"
   1912             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
   1913             "\n\t"
   1914             "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     "
   1915             "\n\t"
   1916             "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     "
   1917             "\n\t"
   1918 
   1919             "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 "
   1920             "\n\t"
   1921             "lbu            %[src_ptr_0],   3(%[src_ptr])                   "
   1922             "\n\t"
   1923             "lbu            %[src_ptr_r1],  19(%[src_ptr])                  "
   1924             "\n\t"
   1925             "lbu            %[src_ptr_r2],  35(%[src_ptr])                  "
   1926             "\n\t"
   1927             "mtlo           %[vector4a],    $ac3                            "
   1928             "\n\t"
   1929             "extp           %[Temp3],       $ac1,           9               "
   1930             "\n\t"
   1931 
   1932             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
   1933             "\n\t"
   1934             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
   1935             "\n\t"
   1936             "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     "
   1937             "\n\t"
   1938             "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     "
   1939             "\n\t"
   1940 
   1941             "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 "
   1942             "\n\t"
   1943             "lbu            %[src_ptr_0],   4(%[src_ptr])                   "
   1944             "\n\t"
   1945             "lbu            %[src_ptr_r1],  20(%[src_ptr])                  "
   1946             "\n\t"
   1947             "lbu            %[src_ptr_r2],  36(%[src_ptr])                  "
   1948             "\n\t"
   1949             "mtlo           %[vector4a],    $ac2                            "
   1950             "\n\t"
   1951             "extp           %[Temp4],       $ac3,           9               "
   1952             "\n\t"
   1953 
   1954             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
   1955             "\n\t"
   1956             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
   1957             "\n\t"
   1958             "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     "
   1959             "\n\t"
   1960             "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     "
   1961             "\n\t"
   1962 
   1963             "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 "
   1964             "\n\t"
   1965             "lbu            %[src_ptr_0],   5(%[src_ptr])                   "
   1966             "\n\t"
   1967             "lbu            %[src_ptr_r1],  21(%[src_ptr])                  "
   1968             "\n\t"
   1969             "lbu            %[src_ptr_r2],  37(%[src_ptr])                  "
   1970             "\n\t"
   1971             "mtlo           %[vector4a],    $ac3                            "
   1972             "\n\t"
   1973             "extp           %[Temp5],       $ac2,           9               "
   1974             "\n\t"
   1975 
   1976             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
   1977             "\n\t"
   1978             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
   1979             "\n\t"
   1980             "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     "
   1981             "\n\t"
   1982             "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     "
   1983             "\n\t"
   1984 
   1985             "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 "
   1986             "\n\t"
   1987             "lbu            %[src_ptr_0],   6(%[src_ptr])                   "
   1988             "\n\t"
   1989             "lbu            %[src_ptr_r1],  22(%[src_ptr])                  "
   1990             "\n\t"
   1991             "lbu            %[src_ptr_r2],  38(%[src_ptr])                  "
   1992             "\n\t"
   1993             "mtlo           %[vector4a],    $ac1                            "
   1994             "\n\t"
   1995             "extp           %[Temp6],       $ac3,           9               "
   1996             "\n\t"
   1997 
   1998             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
   1999             "\n\t"
   2000             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
   2001             "\n\t"
   2002             "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     "
   2003             "\n\t"
   2004             "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     "
   2005             "\n\t"
   2006 
   2007             "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  "
   2008             "\n\t"
   2009             "lbu            %[src_ptr_0],   7(%[src_ptr])                   "
   2010             "\n\t"
   2011             "lbu            %[src_ptr_r1],  23(%[src_ptr])                  "
   2012             "\n\t"
   2013             "lbu            %[src_ptr_r2],  39(%[src_ptr])                  "
   2014             "\n\t"
   2015             "mtlo           %[vector4a],    $ac3                            "
   2016             "\n\t"
   2017             "extp           %[Temp7],       $ac1,           9               "
   2018             "\n\t"
   2019 
   2020             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
   2021             "\n\t"
   2022             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
   2023             "\n\t"
   2024             "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     "
   2025             "\n\t"
   2026             "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     "
   2027             "\n\t"
   2028             "extp           %[Temp8],       $ac3,           9               "
   2029             "\n\t"
   2030 
   2031             : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
   2032               [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
   2033               [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
   2034               [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
   2035               [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2)
   2036             : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
   2037               [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
   2038 
   2039         /* clamp and store results */
   2040         output_ptr[j] = cm[Temp1];
   2041         output_ptr[j + 1] = cm[Temp2];
   2042         output_ptr[j + 2] = cm[Temp3];
   2043         output_ptr[j + 3] = cm[Temp4];
   2044         output_ptr[j + 4] = cm[Temp5];
   2045         output_ptr[j + 5] = cm[Temp6];
   2046         output_ptr[j + 6] = cm[Temp7];
   2047         output_ptr[j + 7] = cm[Temp8];
   2048 
   2049         src_ptr += 8;
   2050       }
   2051 
   2052       output_ptr += output_pitch;
   2053     }
   2054   } else {
   2055     /* 6 tap filter */
   2056 
   2057     /* prefetch src_ptr data to cache memory */
   2058     prefetch_load(src_ptr + 16);
   2059 
   2060     /* unroll for loop */
   2061     for (i = 16; i--;) {
   2062       /* apply filter with vector pairs */
   2063       __asm__ __volatile__(
   2064           "lbu            %[src_ptr_l2],  -32(%[src_ptr])                 \n\t"
   2065           "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
   2066           "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
   2067           "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
   2068           "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
   2069           "lbu            %[src_ptr_r3],  48(%[src_ptr])                  \n\t"
   2070           "mtlo           %[vector4a],    $ac2                            \n\t"
   2071 
   2072           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2073           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2074           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2075           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2076           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   2077           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2078 
   2079           "lbu            %[src_ptr_l2],  -31(%[src_ptr])                 \n\t"
   2080           "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
   2081           "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
   2082           "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
   2083           "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
   2084           "lbu            %[src_ptr_r3],  49(%[src_ptr])                  \n\t"
   2085           "mtlo           %[vector4a],    $ac0                            \n\t"
   2086           "extp           %[Temp1],       $ac2,           9               \n\t"
   2087 
   2088           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2089           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2090           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2091           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2092           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   2093           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2094 
   2095           "lbu            %[src_ptr_l2],  -30(%[src_ptr])                 \n\t"
   2096           "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
   2097           "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
   2098           "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
   2099           "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
   2100           "lbu            %[src_ptr_r3],  50(%[src_ptr])                  \n\t"
   2101           "mtlo           %[vector4a],    $ac1                            \n\t"
   2102           "extp           %[Temp2],       $ac0,           9               \n\t"
   2103 
   2104           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2105           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2106           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2107           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2108           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   2109           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2110 
   2111           "lbu            %[src_ptr_l2],  -29(%[src_ptr])                 \n\t"
   2112           "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
   2113           "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
   2114           "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
   2115           "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
   2116           "lbu            %[src_ptr_r3],  51(%[src_ptr])                  \n\t"
   2117           "mtlo           %[vector4a],    $ac3                            \n\t"
   2118           "extp           %[Temp3],       $ac1,           9               \n\t"
   2119 
   2120           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2121           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2122           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2123           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2124           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   2125           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2126 
   2127           "lbu            %[src_ptr_l2],  -28(%[src_ptr])                 \n\t"
   2128           "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
   2129           "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
   2130           "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
   2131           "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
   2132           "lbu            %[src_ptr_r3],  52(%[src_ptr])                  \n\t"
   2133           "mtlo           %[vector4a],    $ac2                            \n\t"
   2134           "extp           %[Temp4],       $ac3,           9               \n\t"
   2135 
   2136           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2137           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2138           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2139           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2140           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   2141           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2142 
   2143           "lbu            %[src_ptr_l2],  -27(%[src_ptr])                 \n\t"
   2144           "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
   2145           "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
   2146           "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
   2147           "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
   2148           "lbu            %[src_ptr_r3],  53(%[src_ptr])                  \n\t"
   2149           "mtlo           %[vector4a],    $ac0                            \n\t"
   2150           "extp           %[Temp5],       $ac2,           9               \n\t"
   2151 
   2152           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2153           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2154           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2155           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2156           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   2157           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2158 
   2159           "lbu            %[src_ptr_l2],  -26(%[src_ptr])                 \n\t"
   2160           "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
   2161           "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
   2162           "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
   2163           "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
   2164           "lbu            %[src_ptr_r3],  54(%[src_ptr])                  \n\t"
   2165           "mtlo           %[vector4a],    $ac1                            \n\t"
   2166           "extp           %[Temp6],       $ac0,           9               \n\t"
   2167 
   2168           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2169           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2170           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2171           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2172           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   2173           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2174 
   2175           "lbu            %[src_ptr_l2],  -25(%[src_ptr])                 \n\t"
   2176           "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
   2177           "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
   2178           "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
   2179           "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
   2180           "lbu            %[src_ptr_r3],  55(%[src_ptr])                  \n\t"
   2181           "mtlo           %[vector4a],    $ac3                            \n\t"
   2182           "extp           %[Temp7],       $ac1,           9               \n\t"
   2183 
   2184           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2185           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2186           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2187           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2188           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   2189           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2190           "extp           %[Temp8],       $ac3,           9               \n\t"
   2191 
   2192           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
   2193             [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
   2194             [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
   2195             [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
   2196             [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
   2197             [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
   2198           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
   2199             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
   2200             [src_ptr] "r"(src_ptr));
   2201 
   2202       /* clamp and store results */
   2203       output_ptr[0] = cm[Temp1];
   2204       output_ptr[1] = cm[Temp2];
   2205       output_ptr[2] = cm[Temp3];
   2206       output_ptr[3] = cm[Temp4];
   2207       output_ptr[4] = cm[Temp5];
   2208       output_ptr[5] = cm[Temp6];
   2209       output_ptr[6] = cm[Temp7];
   2210       output_ptr[7] = cm[Temp8];
   2211 
   2212       /* apply filter with vectors pairs */
   2213       __asm__ __volatile__(
   2214           "lbu            %[src_ptr_l2],  -24(%[src_ptr])                 \n\t"
   2215           "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
   2216           "lbu            %[src_ptr_0],   8(%[src_ptr])                   \n\t"
   2217           "lbu            %[src_ptr_r1],  24(%[src_ptr])                  \n\t"
   2218           "lbu            %[src_ptr_r2],  40(%[src_ptr])                  \n\t"
   2219           "lbu            %[src_ptr_r3],  56(%[src_ptr])                  \n\t"
   2220           "mtlo           %[vector4a],    $ac2                            \n\t"
   2221 
   2222           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2223           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2224           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2225           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2226           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   2227           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2228 
   2229           "lbu            %[src_ptr_l2],  -23(%[src_ptr])                 \n\t"
   2230           "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
   2231           "lbu            %[src_ptr_0],   9(%[src_ptr])                   \n\t"
   2232           "lbu            %[src_ptr_r1],  25(%[src_ptr])                  \n\t"
   2233           "lbu            %[src_ptr_r2],  41(%[src_ptr])                  \n\t"
   2234           "lbu            %[src_ptr_r3],  57(%[src_ptr])                  \n\t"
   2235           "mtlo           %[vector4a],    $ac0                            \n\t"
   2236           "extp           %[Temp1],       $ac2,           9               \n\t"
   2237 
   2238           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2239           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2240           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2241           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2242           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   2243           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2244 
   2245           "lbu            %[src_ptr_l2],  -22(%[src_ptr])                 \n\t"
   2246           "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
   2247           "lbu            %[src_ptr_0],   10(%[src_ptr])                  \n\t"
   2248           "lbu            %[src_ptr_r1],  26(%[src_ptr])                  \n\t"
   2249           "lbu            %[src_ptr_r2],  42(%[src_ptr])                  \n\t"
   2250           "lbu            %[src_ptr_r3],  58(%[src_ptr])                  \n\t"
   2251           "mtlo           %[vector4a],    $ac1                            \n\t"
   2252           "extp           %[Temp2],       $ac0,           9               \n\t"
   2253 
   2254           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2255           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2256           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2257           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2258           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   2259           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2260 
   2261           "lbu            %[src_ptr_l2],  -21(%[src_ptr])                 \n\t"
   2262           "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
   2263           "lbu            %[src_ptr_0],   11(%[src_ptr])                  \n\t"
   2264           "lbu            %[src_ptr_r1],  27(%[src_ptr])                  \n\t"
   2265           "lbu            %[src_ptr_r2],  43(%[src_ptr])                  \n\t"
   2266           "lbu            %[src_ptr_r3],  59(%[src_ptr])                  \n\t"
   2267           "mtlo           %[vector4a],    $ac3                            \n\t"
   2268           "extp           %[Temp3],       $ac1,           9               \n\t"
   2269 
   2270           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2271           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2272           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2273           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2274           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   2275           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2276 
   2277           "lbu            %[src_ptr_l2],  -20(%[src_ptr])                 \n\t"
   2278           "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
   2279           "lbu            %[src_ptr_0],   12(%[src_ptr])                  \n\t"
   2280           "lbu            %[src_ptr_r1],  28(%[src_ptr])                  \n\t"
   2281           "lbu            %[src_ptr_r2],  44(%[src_ptr])                  \n\t"
   2282           "lbu            %[src_ptr_r3],  60(%[src_ptr])                  \n\t"
   2283           "mtlo           %[vector4a],    $ac2                            \n\t"
   2284           "extp           %[Temp4],       $ac3,           9               \n\t"
   2285 
   2286           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2287           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2288           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2289           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2290           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
   2291           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2292 
   2293           "lbu            %[src_ptr_l2],  -19(%[src_ptr])                 \n\t"
   2294           "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
   2295           "lbu            %[src_ptr_0],   13(%[src_ptr])                  \n\t"
   2296           "lbu            %[src_ptr_r1],  29(%[src_ptr])                  \n\t"
   2297           "lbu            %[src_ptr_r2],  45(%[src_ptr])                  \n\t"
   2298           "lbu            %[src_ptr_r3],  61(%[src_ptr])                  \n\t"
   2299           "mtlo           %[vector4a],    $ac0                            \n\t"
   2300           "extp           %[Temp5],       $ac2,           9               \n\t"
   2301 
   2302           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2303           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2304           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2305           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2306           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
   2307           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2308 
   2309           "lbu            %[src_ptr_l2],  -18(%[src_ptr])                 \n\t"
   2310           "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
   2311           "lbu            %[src_ptr_0],   14(%[src_ptr])                  \n\t"
   2312           "lbu            %[src_ptr_r1],  30(%[src_ptr])                  \n\t"
   2313           "lbu            %[src_ptr_r2],  46(%[src_ptr])                  \n\t"
   2314           "lbu            %[src_ptr_r3],  62(%[src_ptr])                  \n\t"
   2315           "mtlo           %[vector4a],    $ac1                            \n\t"
   2316           "extp           %[Temp6],       $ac0,           9               \n\t"
   2317 
   2318           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2319           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2320           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2321           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2322           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
   2323           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2324 
   2325           "lbu            %[src_ptr_l2],  -17(%[src_ptr])                 \n\t"
   2326           "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
   2327           "lbu            %[src_ptr_0],   15(%[src_ptr])                  \n\t"
   2328           "lbu            %[src_ptr_r1],  31(%[src_ptr])                  \n\t"
   2329           "lbu            %[src_ptr_r2],  47(%[src_ptr])                  \n\t"
   2330           "lbu            %[src_ptr_r3],  63(%[src_ptr])                  \n\t"
   2331           "mtlo           %[vector4a],    $ac3                            \n\t"
   2332           "extp           %[Temp7],       $ac1,           9               \n\t"
   2333 
   2334           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
   2335           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
   2336           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
   2337           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
   2338           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
   2339           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
   2340           "extp           %[Temp8],       $ac3,           9               \n\t"
   2341 
   2342           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
   2343             [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
   2344             [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
   2345             [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
   2346             [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
   2347             [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
   2348           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
   2349             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
   2350             [src_ptr] "r"(src_ptr));
   2351 
   2352       src_ptr += 16;
   2353       output_ptr[8] = cm[Temp1];
   2354       output_ptr[9] = cm[Temp2];
   2355       output_ptr[10] = cm[Temp3];
   2356       output_ptr[11] = cm[Temp4];
   2357       output_ptr[12] = cm[Temp5];
   2358       output_ptr[13] = cm[Temp6];
   2359       output_ptr[14] = cm[Temp7];
   2360       output_ptr[15] = cm[Temp8];
   2361 
   2362       output_ptr += output_pitch;
   2363     }
   2364   }
   2365 }
   2366 
   2367 void vp8_sixtap_predict4x4_dspr2(unsigned char *RESTRICT src_ptr,
   2368                                  int src_pixels_per_line, int xoffset,
   2369                                  int yoffset, unsigned char *RESTRICT dst_ptr,
   2370                                  int dst_pitch) {
   2371   unsigned char FData[9 * 4]; /* Temp data bufffer used in filtering */
   2372   unsigned int pos = 16;
   2373 
   2374   /* bit positon for extract from acc */
   2375   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
   2376                        :
   2377                        : [pos] "r"(pos));
   2378 
   2379   if (yoffset) {
   2380     /* First filter 1-D horizontally... */
   2381     vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
   2382                                     src_pixels_per_line, 9, xoffset, 4);
   2383     /* then filter verticaly... */
   2384     vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
   2385   } else
   2386     /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
   2387     vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line, 4,
   2388                                     xoffset, dst_pitch);
   2389 }
   2390 
   2391 void vp8_sixtap_predict8x8_dspr2(unsigned char *RESTRICT src_ptr,
   2392                                  int src_pixels_per_line, int xoffset,
   2393                                  int yoffset, unsigned char *RESTRICT dst_ptr,
   2394                                  int dst_pitch) {
   2395   unsigned char FData[13 * 8]; /* Temp data bufffer used in filtering */
   2396   unsigned int pos, Temp1, Temp2;
   2397 
   2398   pos = 16;
   2399 
   2400   /* bit positon for extract from acc */
   2401   __asm__ __volatile__("wrdsp      %[pos],     1               \n\t"
   2402                        :
   2403                        : [pos] "r"(pos));
   2404 
   2405   if (yoffset) {
   2406     src_ptr = src_ptr - (2 * src_pixels_per_line);
   2407 
   2408     if (xoffset) /* filter 1-D horizontally... */
   2409       vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
   2410                                           13, xoffset, 8);
   2411 
   2412     else {
   2413       /* prefetch src_ptr data to cache memory */
   2414       prefetch_load(src_ptr + 2 * src_pixels_per_line);
   2415 
   2416       __asm__ __volatile__(
   2417           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2418           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2419           "sw     %[Temp1],   0(%[FData])                             \n\t"
   2420           "sw     %[Temp2],   4(%[FData])                             \n\t"
   2421           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2422 
   2423           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2424           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2425           "sw     %[Temp1],   8(%[FData])                             \n\t"
   2426           "sw     %[Temp2],   12(%[FData])                            \n\t"
   2427           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2428 
   2429           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2430           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2431           "sw     %[Temp1],   16(%[FData])                            \n\t"
   2432           "sw     %[Temp2],   20(%[FData])                            \n\t"
   2433           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2434 
   2435           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2436           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2437           "sw     %[Temp1],   24(%[FData])                            \n\t"
   2438           "sw     %[Temp2],   28(%[FData])                            \n\t"
   2439           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2440 
   2441           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2442           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2443           "sw     %[Temp1],   32(%[FData])                            \n\t"
   2444           "sw     %[Temp2],   36(%[FData])                            \n\t"
   2445           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2446 
   2447           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2448           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2449           "sw     %[Temp1],   40(%[FData])                            \n\t"
   2450           "sw     %[Temp2],   44(%[FData])                            \n\t"
   2451           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2452 
   2453           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2454           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2455           "sw     %[Temp1],   48(%[FData])                            \n\t"
   2456           "sw     %[Temp2],   52(%[FData])                            \n\t"
   2457           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2458 
   2459           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2460           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2461           "sw     %[Temp1],   56(%[FData])                            \n\t"
   2462           "sw     %[Temp2],   60(%[FData])                            \n\t"
   2463           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2464 
   2465           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2466           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2467           "sw     %[Temp1],   64(%[FData])                            \n\t"
   2468           "sw     %[Temp2],   68(%[FData])                            \n\t"
   2469           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2470 
   2471           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2472           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2473           "sw     %[Temp1],   72(%[FData])                            \n\t"
   2474           "sw     %[Temp2],   76(%[FData])                            \n\t"
   2475           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2476 
   2477           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2478           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2479           "sw     %[Temp1],   80(%[FData])                            \n\t"
   2480           "sw     %[Temp2],   84(%[FData])                            \n\t"
   2481           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2482 
   2483           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2484           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2485           "sw     %[Temp1],   88(%[FData])                            \n\t"
   2486           "sw     %[Temp2],   92(%[FData])                            \n\t"
   2487           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2488 
   2489           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2490           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2491           "sw     %[Temp1],   96(%[FData])                            \n\t"
   2492           "sw     %[Temp2],   100(%[FData])                           \n\t"
   2493 
   2494           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
   2495           : [FData] "r"(FData), [src_ptr] "r"(src_ptr),
   2496             [src_pixels_per_line] "r"(src_pixels_per_line));
   2497     }
   2498 
   2499     /* filter verticaly... */
   2500     vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8,
   2501                                      yoffset);
   2502   }
   2503 
   2504   /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
   2505   else {
   2506     if (xoffset)
   2507       vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
   2508                                           8, xoffset, dst_pitch);
   2509 
   2510     else {
   2511       /* copy from src buffer to dst buffer */
   2512       __asm__ __volatile__(
   2513           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2514           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2515           "sw     %[Temp1],   0(%[dst_ptr])                           \n\t"
   2516           "sw     %[Temp2],   4(%[dst_ptr])                           \n\t"
   2517           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2518 
   2519           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2520           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2521           "sw     %[Temp1],   8(%[dst_ptr])                           \n\t"
   2522           "sw     %[Temp2],   12(%[dst_ptr])                          \n\t"
   2523           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2524 
   2525           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2526           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2527           "sw     %[Temp1],   16(%[dst_ptr])                          \n\t"
   2528           "sw     %[Temp2],   20(%[dst_ptr])                          \n\t"
   2529           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2530 
   2531           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2532           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2533           "sw     %[Temp1],   24(%[dst_ptr])                          \n\t"
   2534           "sw     %[Temp2],   28(%[dst_ptr])                          \n\t"
   2535           "addu   %[src_ptr], %[src_ptr],   %[src_pixels_per_line]    \n\t"
   2536 
   2537           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2538           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2539           "sw     %[Temp1],   32(%[dst_ptr])                          \n\t"
   2540           "sw     %[Temp2],   36(%[dst_ptr])                          \n\t"
   2541           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2542 
   2543           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2544           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2545           "sw     %[Temp1],   40(%[dst_ptr])                          \n\t"
   2546           "sw     %[Temp2],   44(%[dst_ptr])                          \n\t"
   2547           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2548 
   2549           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2550           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2551           "sw     %[Temp1],   48(%[dst_ptr])                          \n\t"
   2552           "sw     %[Temp2],   52(%[dst_ptr])                          \n\t"
   2553           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2554 
   2555           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2556           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2557           "sw     %[Temp1],   56(%[dst_ptr])                          \n\t"
   2558           "sw     %[Temp2],   60(%[dst_ptr])                          \n\t"
   2559 
   2560           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
   2561           : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr),
   2562             [src_pixels_per_line] "r"(src_pixels_per_line));
   2563     }
   2564   }
   2565 }
   2566 
   2567 void vp8_sixtap_predict8x4_dspr2(unsigned char *RESTRICT src_ptr,
   2568                                  int src_pixels_per_line, int xoffset,
   2569                                  int yoffset, unsigned char *RESTRICT dst_ptr,
   2570                                  int dst_pitch) {
   2571   unsigned char FData[9 * 8]; /* Temp data bufffer used in filtering */
   2572   unsigned int pos, Temp1, Temp2;
   2573 
   2574   pos = 16;
   2575 
   2576   /* bit positon for extract from acc */
   2577   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
   2578                        :
   2579                        : [pos] "r"(pos));
   2580 
   2581   if (yoffset) {
   2582     src_ptr = src_ptr - (2 * src_pixels_per_line);
   2583 
   2584     if (xoffset) /* filter 1-D horizontally... */
   2585       vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
   2586                                           9, xoffset, 8);
   2587 
   2588     else {
   2589       /* prefetch src_ptr data to cache memory */
   2590       prefetch_load(src_ptr + 2 * src_pixels_per_line);
   2591 
   2592       __asm__ __volatile__(
   2593           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2594           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2595           "sw     %[Temp1],   0(%[FData])                             \n\t"
   2596           "sw     %[Temp2],   4(%[FData])                             \n\t"
   2597           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2598 
   2599           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2600           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2601           "sw     %[Temp1],   8(%[FData])                             \n\t"
   2602           "sw     %[Temp2],   12(%[FData])                            \n\t"
   2603           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2604 
   2605           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2606           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2607           "sw     %[Temp1],   16(%[FData])                            \n\t"
   2608           "sw     %[Temp2],   20(%[FData])                            \n\t"
   2609           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2610 
   2611           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2612           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2613           "sw     %[Temp1],   24(%[FData])                            \n\t"
   2614           "sw     %[Temp2],   28(%[FData])                            \n\t"
   2615           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2616 
   2617           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2618           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2619           "sw     %[Temp1],   32(%[FData])                            \n\t"
   2620           "sw     %[Temp2],   36(%[FData])                            \n\t"
   2621           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2622 
   2623           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2624           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2625           "sw     %[Temp1],   40(%[FData])                            \n\t"
   2626           "sw     %[Temp2],   44(%[FData])                            \n\t"
   2627           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2628 
   2629           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2630           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2631           "sw     %[Temp1],   48(%[FData])                            \n\t"
   2632           "sw     %[Temp2],   52(%[FData])                            \n\t"
   2633           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2634 
   2635           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2636           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2637           "sw     %[Temp1],   56(%[FData])                            \n\t"
   2638           "sw     %[Temp2],   60(%[FData])                            \n\t"
   2639           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2640 
   2641           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2642           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2643           "sw     %[Temp1],   64(%[FData])                            \n\t"
   2644           "sw     %[Temp2],   68(%[FData])                            \n\t"
   2645 
   2646           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
   2647           : [FData] "r"(FData), [src_ptr] "r"(src_ptr),
   2648             [src_pixels_per_line] "r"(src_pixels_per_line));
   2649     }
   2650 
   2651     /* filter verticaly... */
   2652     vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8,
   2653                                      yoffset);
   2654   }
   2655 
   2656   /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
   2657   else {
   2658     if (xoffset)
   2659       vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
   2660                                           4, xoffset, dst_pitch);
   2661 
   2662     else {
   2663       /* copy from src buffer to dst buffer */
   2664       __asm__ __volatile__(
   2665           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2666           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2667           "sw     %[Temp1],   0(%[dst_ptr])                           \n\t"
   2668           "sw     %[Temp2],   4(%[dst_ptr])                           \n\t"
   2669           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2670 
   2671           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2672           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2673           "sw     %[Temp1],   8(%[dst_ptr])                           \n\t"
   2674           "sw     %[Temp2],   12(%[dst_ptr])                          \n\t"
   2675           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2676 
   2677           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2678           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2679           "sw     %[Temp1],   16(%[dst_ptr])                          \n\t"
   2680           "sw     %[Temp2],   20(%[dst_ptr])                          \n\t"
   2681           "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
   2682 
   2683           "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
   2684           "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
   2685           "sw     %[Temp1],   24(%[dst_ptr])                          \n\t"
   2686           "sw     %[Temp2],   28(%[dst_ptr])                          \n\t"
   2687 
   2688           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
   2689           : [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr),
   2690             [src_pixels_per_line] "r"(src_pixels_per_line));
   2691     }
   2692   }
   2693 }
   2694 
   2695 void vp8_sixtap_predict16x16_dspr2(unsigned char *RESTRICT src_ptr,
   2696                                    int src_pixels_per_line, int xoffset,
   2697                                    int yoffset, unsigned char *RESTRICT dst_ptr,
   2698                                    int dst_pitch) {
   2699   const unsigned short *VFilter;
   2700   unsigned char FData[21 * 16]; /* Temp data bufffer used in filtering */
   2701   unsigned int pos;
   2702 
   2703   VFilter = sub_pel_filterss[yoffset];
   2704 
   2705   pos = 16;
   2706 
   2707   /* bit positon for extract from acc */
   2708   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
   2709                        :
   2710                        : [pos] "r"(pos));
   2711 
   2712   if (yoffset) {
   2713     src_ptr = src_ptr - (2 * src_pixels_per_line);
   2714 
   2715     switch (xoffset) {
   2716       /* filter 1-D horizontally... */
   2717       case 2:
   2718       case 4:
   2719       case 6:
   2720         /* 6 tap filter */
   2721         vp8_filter_block2d_first_pass16_6tap(
   2722             src_ptr, FData, src_pixels_per_line, 21, xoffset, 16);
   2723         break;
   2724 
   2725       case 0:
   2726         /* only copy buffer */
   2727         vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
   2728         break;
   2729 
   2730       case 1:
   2731       case 3:
   2732       case 5:
   2733       case 7:
   2734         /* 4 tap filter */
   2735         vp8_filter_block2d_first_pass16_4tap(
   2736             src_ptr, FData, src_pixels_per_line, 16, 21, xoffset, yoffset,
   2737             dst_ptr, dst_pitch);
   2738         break;
   2739     }
   2740 
   2741     /* filter verticaly... */
   2742     vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
   2743   } else {
   2744     /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
   2745     switch (xoffset) {
   2746       case 2:
   2747       case 4:
   2748       case 6:
   2749         /* 6 tap filter */
   2750         vp8_filter_block2d_first_pass16_6tap(
   2751             src_ptr, dst_ptr, src_pixels_per_line, 16, xoffset, dst_pitch);
   2752         break;
   2753 
   2754       case 1:
   2755       case 3:
   2756       case 5:
   2757       case 7:
   2758         /* 4 tap filter */
   2759         vp8_filter_block2d_first_pass16_4tap(
   2760             src_ptr, dst_ptr, src_pixels_per_line, 16, 21, xoffset, yoffset,
   2761             dst_ptr, dst_pitch);
   2762         break;
   2763     }
   2764   }
   2765 }
   2766 
   2767 #endif
   2768