Home | History | Annotate | Download | only in dsp
      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // MIPS version of speed-critical encoding functions.
     11 //
     12 // Author(s): Darko Laus (darko.laus (at) imgtec.com)
     13 //            Mirko Raus (mirko.raus (at) imgtec.com)
     14 
     15 #include "src/dsp/dsp.h"
     16 
     17 #if defined(WEBP_USE_MIPS_DSP_R2)
     18 
     19 #include "src/dsp/mips_macro.h"
     20 #include "src/enc/cost_enc.h"
     21 #include "src/enc/vp8i_enc.h"
     22 
     // Integer multipliers handed to the inline assembly of ITransformOne() as
     // the %[kC1] / %[kC2] input operands.
     // NOTE(review): presumably the fixed-point constants of the inverse
     // transform -- confirm against the portable C implementation.
     23 static const int kC1 = 20091 + (1 << 16);
     24 static const int kC2 = 35468;
     25 
     26 // O - output
     27 // I - input (macro doesn't change it)
        // Emits four addq.ph / subq.ph pairs (paired-halfword add and subtract):
        //   O0 = I0 + I1,  O1 = I0 - I1
        //   O2 = I2 + I3,  O3 = I2 - I3
        //   O4 = I4 + I5,  O5 = I4 - I5
        //   O6 = I6 + I7,  O7 = I6 - I7
        // The instructions are emitted in exactly this order. Callers in this
        // file pass the same register as an output and as an input, so an output
        // may only alias an input that is no longer read by a later instruction.
     28 #define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7,                      \
     29                           I0, I1, I2, I3, I4, I5, I6, I7)                      \
     30   "addq.ph          %[" #O0 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
     31   "subq.ph          %[" #O1 "],   %[" #I0 "],  %[" #I1 "]     \n\t"            \
     32   "addq.ph          %[" #O2 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
     33   "subq.ph          %[" #O3 "],   %[" #I2 "],  %[" #I3 "]     \n\t"            \
     34   "addq.ph          %[" #O4 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
     35   "subq.ph          %[" #O5 "],   %[" #I4 "],  %[" #I5 "]     \n\t"            \
     36   "addq.ph          %[" #O6 "],   %[" #I6 "],  %[" #I7 "]     \n\t"            \
     37   "subq.ph          %[" #O7 "],   %[" #I6 "],  %[" #I7 "]     \n\t"
     38 
     39 // IO - input/output
        // Applies absq_s.ph (saturating absolute value of each halfword) in place
        // to the eight named asm operands, one instruction per operand.
     40 #define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7)                         \
     41   "absq_s.ph        %[" #IO0 "],   %[" #IO0 "]                \n\t"            \
     42   "absq_s.ph        %[" #IO1 "],   %[" #IO1 "]                \n\t"            \
     43   "absq_s.ph        %[" #IO2 "],   %[" #IO2 "]                \n\t"            \
     44   "absq_s.ph        %[" #IO3 "],   %[" #IO3 "]                \n\t"            \
     45   "absq_s.ph        %[" #IO4 "],   %[" #IO4 "]                \n\t"            \
     46   "absq_s.ph        %[" #IO5 "],   %[" #IO5 "]                \n\t"            \
     47   "absq_s.ph        %[" #IO6 "],   %[" #IO6 "]                \n\t"            \
     48   "absq_s.ph        %[" #IO7 "],   %[" #IO7 "]                \n\t"
     49 
     50 // dpa.w.ph $ac0 temp0 ,temp1
     51 //  $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
     52 // dpax.w.ph $ac0 temp0 ,temp1
     53 //  $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
     54 // O - output
     55 // I - input (macro doesn't change it)
        // Accumulates eight halfword dot products in $ac0 and returns the low
        // word of the accumulator in O0:
        //   1. "mult $ac0, $zero, $zero" resets the accumulator (0 * 0).
        //   2. Straight (dpa) and crossed (dpax) products are added alternately
        //      for the operand pairs
        //        (I2, I0)   (I5, I6)x  (I8, I9)   (I11, I4)x
        //        (I12, I7)  (I13, I1)x (I14, I3)  (I15, I10)x
        //      where 'x' marks the crossed (dpax) form.
        //   3. "mflo" copies the low 32 bits of $ac0 into O0.
        // O0 is written only by the final instruction, so it may name a register
        // that is also one of the inputs. The enclosing asm statement has to list
        // "hi" and "lo" as clobbered.
     56 #define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7,                           \
     57                  I8, I9, I10, I11, I12, I13, I14, I15)                         \
     58     "mult            $ac0,      $zero,     $zero              \n\t"            \
     59     "dpa.w.ph        $ac0,      %[" #I2 "],  %[" #I0 "]       \n\t"            \
     60     "dpax.w.ph       $ac0,      %[" #I5 "],  %[" #I6 "]       \n\t"            \
     61     "dpa.w.ph        $ac0,      %[" #I8 "],  %[" #I9 "]       \n\t"            \
     62     "dpax.w.ph       $ac0,      %[" #I11 "], %[" #I4 "]       \n\t"            \
     63     "dpa.w.ph        $ac0,      %[" #I12 "], %[" #I7 "]       \n\t"            \
     64     "dpax.w.ph       $ac0,      %[" #I13 "], %[" #I1 "]       \n\t"            \
     65     "dpa.w.ph        $ac0,      %[" #I14 "], %[" #I3 "]       \n\t"            \
     66     "dpax.w.ph       $ac0,      %[" #I15 "], %[" #I10 "]      \n\t"            \
     67     "mflo            %[" #O0 "],  $ac0                        \n\t"
     68 
     // Output-operand list for an asm statement that uses temp1..temp17.
     // It extends OUTPUT_EARLY_CLOBBER_REGS_10() (not defined in this file;
     // presumably in mips_macro.h and covering temp1..temp10 -- confirm) with
     // temp11..temp17, each declared as an early-clobber output ("=&r").
     // The enclosing function must declare int variables with these exact names.
     69 #define OUTPUT_EARLY_CLOBBER_REGS_17()                                         \
     70   OUTPUT_EARLY_CLOBBER_REGS_10(),                                              \
     71   [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13),         \
     72   [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16),         \
     73   [temp17]"=&r"(temp17)
     74 
     75 // macro for one horizontal pass in FTransform
     76 // temp0..temp15 holds tmp[0]..tmp[15]
     77 // A - offset in bytes to load from src and ref buffers
     78 // TEMP0..TEMP3 - registers for corresponding tmp elements
        //
        // Operands expected from the enclosing asm statement: %[args] (array whose
        // words 0 and 1 are the src and ref pointers), %[c2217], %[c5352], and the
        // scratch registers %[temp16]..%[temp19].
        //
        // Steps, in emission order:
        //   - load the src and ref pointers, then one 32-bit word from each at byte
        //     offset BPS * A ("lw": NOTE(review) this is the aligned load, so both
        //     rows are assumed to be word aligned -- confirm with callers);
        //   - widen the four bytes of each word to halfwords (preceu.ph.qbl/qbr)
        //     and subtract ref from src, giving four differences d0..d3 held as
        //     (d3,d2) and (d1,d0) halfword pairs (d0 = lowest byte of the word);
        //   - rotate/add/sub to form a0 = d0 + d3, a1 = d1 + d2, a2 = d1 - d2,
        //     a3 = d0 - d3, then sign-extend each to a full register;
        //   - results:
        //       TEMP0 = (a0 + a1) << 3
        //       TEMP1 = (a2 * c2217 + a3 * c5352 + 1812) >> 9
        //       TEMP2 = (a0 - a1) << 3
        //       TEMP3 = (a3 * c2217 - a2 * c5352 +  937) >> 9
     79 #define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                         \
     80   "lw              %[" #TEMP0 "],   0(%[args])                          \n\t"  \
     81   "lw              %[" #TEMP1 "],   4(%[args])                          \n\t"  \
     82   "lw              %[" #TEMP2 "],   " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t"  \
     83   "lw              %[" #TEMP3 "],   " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t"  \
     84   "preceu.ph.qbl   %[" #TEMP0 "],   %[" #TEMP2 "]                       \n\t"  \
     85   "preceu.ph.qbl   %[" #TEMP1 "],   %[" #TEMP3 "]                       \n\t"  \
     86   "preceu.ph.qbr   %[" #TEMP2 "],   %[" #TEMP2 "]                       \n\t"  \
     87   "preceu.ph.qbr   %[" #TEMP3 "],   %[" #TEMP3 "]                       \n\t"  \
     88   "subq.ph         %[" #TEMP0 "],   %[" #TEMP0 "],   %[" #TEMP1 "]      \n\t"  \
     89   "subq.ph         %[" #TEMP2 "],   %[" #TEMP2 "],   %[" #TEMP3 "]      \n\t"  \
     90   "rotr            %[" #TEMP0 "],   %[" #TEMP0 "],   16                 \n\t"  \
     91   "addq.ph         %[" #TEMP1 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
     92   "subq.ph         %[" #TEMP3 "],   %[" #TEMP2 "],   %[" #TEMP0 "]      \n\t"  \
     93   "seh             %[" #TEMP0 "],   %[" #TEMP1 "]                       \n\t"  \
     94   "sra             %[temp16],     %[" #TEMP1 "],   16                   \n\t"  \
     95   "seh             %[temp19],     %[" #TEMP3 "]                         \n\t"  \
     96   "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   16                 \n\t"  \
     97   "subu            %[" #TEMP2 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
     98   "addu            %[" #TEMP0 "],   %[" #TEMP0 "],   %[temp16]          \n\t"  \
     99   "mul             %[temp17],     %[temp19],     %[c2217]               \n\t"  \
    100   "mul             %[temp18],     %[" #TEMP3 "],   %[c5352]             \n\t"  \
    101   "mul             %[" #TEMP1 "],   %[temp19],     %[c5352]             \n\t"  \
    102   "mul             %[temp16],     %[" #TEMP3 "],   %[c2217]             \n\t"  \
    103   "sll             %[" #TEMP2 "],   %[" #TEMP2 "],   3                  \n\t"  \
    104   "sll             %[" #TEMP0 "],   %[" #TEMP0 "],   3                  \n\t"  \
    105   "subu            %[" #TEMP3 "],   %[temp17],     %[temp18]            \n\t"  \
    106   "addu            %[" #TEMP1 "],   %[temp16],     %[" #TEMP1 "]        \n\t"  \
    107   "addiu           %[" #TEMP3 "],   %[" #TEMP3 "],   937                \n\t"  \
    108   "addiu           %[" #TEMP1 "],   %[" #TEMP1 "],   1812               \n\t"  \
    109   "sra             %[" #TEMP3 "],   %[" #TEMP3 "],   9                  \n\t"  \
    110   "sra             %[" #TEMP1 "],   %[" #TEMP1 "],   9                  \n\t"
    111 
    112 // macro for one vertical pass in FTransform
    113 // temp0..temp15 holds tmp[0]..tmp[15]
    114 // A..D - offsets in bytes to store to out buffer
    115 // TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
        //
        // Operands expected from the enclosing asm statement: %[c2217], %[c5352],
        // scratch registers %[temp16]..%[temp19] and %[temp20], which must already
        // hold the destination pointer.
        //
        // With a0 = TEMP0 + TEMP12, a1 = TEMP4 + TEMP8, a2 = TEMP4 - TEMP8 and
        // a3 = TEMP0 - TEMP12, four halfwords are stored ("sh"):
        //   offset A: (a0 + a1 + 7) >> 4
        //   offset B: ((a2 * c2217 + a3 * c5352 + 12000) >> 16) + (a3 != 0)
        //   offset C: (a0 - a1 + 7) >> 4
        //   offset D: (a3 * c2217 - a2 * c5352 + 51000) >> 16
        // The "+ (a3 != 0)" term is implemented with movn: the incremented value
        // in temp16 replaces TEMP12 only when temp19 (= a3) is non-zero.
        // The constant 51000 is added in two steps (30000 + 21000), presumably
        // because it does not fit the 16-bit signed immediate of addiu.
        // All four TEMP registers are overwritten.
    116 #define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)                 \
    117   "addu            %[temp16],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
    118   "subu            %[temp19],     %[" #TEMP0 "],   %[" #TEMP12 "]   \n\t"      \
    119   "addu            %[temp17],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
    120   "subu            %[temp18],     %[" #TEMP4 "],   %[" #TEMP8 "]    \n\t"      \
    121   "mul             %[" #TEMP8 "],   %[temp19],     %[c2217]         \n\t"      \
    122   "mul             %[" #TEMP12 "],  %[temp18],     %[c2217]         \n\t"      \
    123   "mul             %[" #TEMP4 "],   %[temp19],     %[c5352]         \n\t"      \
    124   "mul             %[temp18],     %[temp18],     %[c5352]           \n\t"      \
    125   "addiu           %[temp16],     %[temp16],     7                  \n\t"      \
    126   "addu            %[" #TEMP0 "],   %[temp16],     %[temp17]        \n\t"      \
    127   "sra             %[" #TEMP0 "],   %[" #TEMP0 "],   4              \n\t"      \
    128   "addu            %[" #TEMP12 "],  %[" #TEMP12 "],  %[" #TEMP4 "]  \n\t"      \
    129   "subu            %[" #TEMP4 "],   %[temp16],     %[temp17]        \n\t"      \
    130   "sra             %[" #TEMP4 "],   %[" #TEMP4 "],   4              \n\t"      \
    131   "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   30000          \n\t"      \
    132   "addiu           %[" #TEMP12 "],  %[" #TEMP12 "],  12000          \n\t"      \
    133   "addiu           %[" #TEMP8 "],   %[" #TEMP8 "],   21000          \n\t"      \
    134   "subu            %[" #TEMP8 "],   %[" #TEMP8 "],   %[temp18]      \n\t"      \
    135   "sra             %[" #TEMP12 "],  %[" #TEMP12 "],  16             \n\t"      \
    136   "sra             %[" #TEMP8 "],   %[" #TEMP8 "],   16             \n\t"      \
    137   "addiu           %[temp16],     %[" #TEMP12 "],  1                \n\t"      \
    138   "movn            %[" #TEMP12 "],  %[temp16],     %[temp19]        \n\t"      \
    139   "sh              %[" #TEMP0 "],   " #A "(%[temp20])               \n\t"      \
    140   "sh              %[" #TEMP4 "],   " #C "(%[temp20])               \n\t"      \
    141   "sh              %[" #TEMP8 "],   " #D "(%[temp20])               \n\t"      \
    142   "sh              %[" #TEMP12 "],  " #B "(%[temp20])               \n\t"
    143 
    // Forward 4x4 transform of the difference between src and ref, written as
    // sixteen 16-bit values to out.
    //   src, ref - pixel blocks; four rows are read from each, BPS bytes apart
    //   out      - destination; halfwords are stored at byte offsets 0..30
    // The three pointers are handed to the assembly through the local args[]
    // array, so only one register is needed to reach all of them. The asm
    // statement declares "memory", "hi" and "lo" as clobbered.
    // OUTPUT_EARLY_CLOBBER_REGS_18() is not defined in this file (presumably in
    // mips_macro.h, covering temp1..temp18 -- confirm); temp0, temp19 and temp20
    // are appended to it explicitly below.
    144 static void FTransform_MIPSdspR2(const uint8_t* src, const uint8_t* ref,
    145                                  int16_t* out) {
    146   const int c2217 = 2217;
    147   const int c5352 = 5352;
    148   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
    149   int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
    150   int temp17, temp18, temp19, temp20;
    151   const int* const args[3] =
    152       { (const int*)src, (const int*)ref, (const int*)out };
    153 
    154   __asm__ volatile (
            // One horizontal pass per row (A = 0..3); the sixteen intermediate
            // values stay in temp0..temp15.
    155     HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3)
    156     HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7)
    157     HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11)
    158     HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
            // temp20 = args[2], i.e. the out pointer used by the stores below.
    159     "lw            %[temp20],     8(%[args])                  \n\t"
            // One vertical pass per column; each stores four halfwords that are
            // 8 bytes apart in out.
    160     VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12)
    161     VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13)
    162     VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
    163     VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
    164     OUTPUT_EARLY_CLOBBER_REGS_18(),
    165       [temp0]"=&r"(temp0), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
    166     : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
    167     : "memory", "hi", "lo"
    168   );
    169 }
    170 
        // The pass macros are only meant for FTransform_MIPSdspR2.
    171 #undef VERTICAL_PASS
    172 #undef HORIZONTAL_PASS
    173 
    // Processes one block entirely in inline assembly: reads 16-bit values from
    // in (byte offsets 0..30 appear below), reads ref and writes dst through
    // macros that take BPS as their stride argument.
    // NOTE(review): presumably this is the 4x4 inverse transform whose result is
    // added to ref and stored, saturated, into dst -- the helper macros used here
    // (LOAD_IN_X2, ADD_SUB_HALVES, MUL_SHIFT_SUM, INSERT_HALF_X2, SRA_16,
    // SHIFT_R_SUM_X2, PACK_2_HALVES_TO_WORD, LOAD_WITH_OFFSET_X4,
    // CONVERT_2_BYTES_TO_HALF, STORE_SAT_SUM_X2) are not defined in this file, so
    // confirm against mips_macro.h.
    //   ref - source pixels combined with the transform output
    //   in  - input values; "ulw" is used, so no word alignment is required
    //   dst - destination pixels
    // kC1 and kC2 are passed as inputs although the text below never names them;
    // presumably MUL_SHIFT_SUM refers to %[kC1] / %[kC2].
    // The statement declares "memory", "hi" and "lo" as clobbered.
    174 static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
    175                                       uint8_t* dst) {
    176   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
    177   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
    178 
    179   __asm__ volatile (
            // First pass over the input values (the second pass is labelled
            // "horizontal" further down).
    180     "ulw              %[temp1],   0(%[in])                 \n\t"
    181     "ulw              %[temp2],   16(%[in])                \n\t"
    182     LOAD_IN_X2(temp5, temp6, 24, 26)
    183     ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
    184     LOAD_IN_X2(temp1, temp2, 8, 10)
    185     MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
    186                   temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
    187                   temp13, temp11, temp14, temp12)
    188     INSERT_HALF_X2(temp8, temp7, temp10, temp9)
    189     "ulw              %[temp17],  4(%[in])                 \n\t"
    190     "ulw              %[temp18],  20(%[in])                \n\t"
    191     ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
    192     ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
    193     ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
    194     LOAD_IN_X2(temp17, temp18, 12, 14)
    195     LOAD_IN_X2(temp9, temp10, 28, 30)
    196     MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
    197                   temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
    198                   temp15, temp4, temp16, temp17)
    199     INSERT_HALF_X2(temp11, temp12, temp13, temp14)
    200     ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
    201     ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
    202 
    203     // horizontal
    204     SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
    205     INSERT_HALF_X2(temp1, temp6, temp5, temp2)
    206     SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
            // temp2 = 4 in both halfwords; it is added to temp1 and temp6 below.
    207     "repl.ph          %[temp2],   0x4                      \n\t"
    208     INSERT_HALF_X2(temp3, temp8, temp17, temp4)
    209     "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
    210     "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
    211     ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
    212     ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
    213     MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
    214                   temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
    215                   temp6, temp17, temp8, temp18)
    216     MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
    217                   temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
    218                   temp18, temp12, temp17, temp16)
    219     INSERT_HALF_X2(temp1, temp3, temp9, temp13)
    220     INSERT_HALF_X2(temp6, temp8, temp11, temp15)
    221     SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
    222                    temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
    223                    temp6)
    224     PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
    225                           temp16, temp11, temp10, temp15, temp14)
            // Fetch from ref (row indices 0..3, stride BPS), widen, combine with
            // the values computed above and store to dst with the same layout.
    226     LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref,
    227                         0, 0, 0, 0,
    228                         0, 1, 2, 3,
    229                         BPS)
    230     CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
    231                             temp11, temp10, temp11, temp14, temp15)
    232     STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
    233                      temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
    234                      dst, 0, 1, 2, 3, BPS)
    235 
    236     OUTPUT_EARLY_CLOBBER_REGS_18()
    237     : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
    238     : "memory", "hi", "lo"
    239   );
    240 }
    241 
    // Runs ITransformOne() on the block at (ref, in, dst). When do_two is
    // non-zero a second block is processed as well; it starts 4 bytes further
    // in ref and dst and 16 coefficients further in in.
    242 static void ITransform_MIPSdspR2(const uint8_t* ref, const int16_t* in,
    243                                  uint8_t* dst, int do_two) {
    244   int i;
    245   for (i = 0; i < (do_two ? 2 : 1); ++i) {
    246     ITransformOne(ref + 4 * i, in + 16 * i, dst + 4 * i);
    247   }
    248 }
    249 
    // Returns abs(sum_b - sum_a) >> 5, where sum_a and sum_b are weighted sums
    // produced by the same assembly sequence for block a and for block b.
    //   a, b - the two blocks to compare (four rows each, stride BPS, as given
    //          by the LOAD_WITH_OFFSET_X4 arguments)
    //   w    - weights, fetched through LOAD_WITH_OFFSET_X4 in two groups
    // For each block: bytes are widened to halfwords, several ADD_SUB_HALVES_X4
    // stages combine them, ABS_X8 takes absolute values and MUL_HALF forms the
    // dot product with the weights.
    // NOTE(review): presumably the add/sub stages are a 4x4 Hadamard-style
    // transform -- confirm against the portable C implementation.
    // LOAD_WITH_OFFSET_X4, CONVERT_2_BYTES_TO_HALF and PACK_2_HALVES_TO_WORD are
    // not defined in this file.
    250 static int Disto4x4_MIPSdspR2(const uint8_t* const a, const uint8_t* const b,
    251                               const uint16_t* const w) {
    252   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
    253   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
    254 
    255   __asm__ volatile (
            // ---- block a: the result ends up in temp17 ----
    256     LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a,
    257                         0, 0, 0, 0,
    258                         0, 1, 2, 3,
    259                         BPS)
    260     CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9,temp10, temp11,
    261                             temp12, temp1, temp2, temp3, temp4)
    262     ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
    263                       temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
    264     PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
    265                           temp7, temp2, temp4, temp6, temp8)
    266     ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
    267                       temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
    268     ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
    269                       temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
    270     ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
    271                       temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
    272     ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
            // Weights go into the eight registers not holding absolute values.
    273     LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
    274                         0, 4, 8, 12,
    275                         0, 0, 0, 0,
    276                         0)
    277     LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
    278                         0, 4, 8, 12,
    279                         1, 1, 1, 1,
    280                         16)
    281     MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
    282              temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
            // ---- block b: same sequence; temp17 is left untouched ----
    283     LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b,
    284                         0, 0, 0, 0,
    285                         0, 1, 2, 3,
    286                         BPS)
    287     CONVERT_2_BYTES_TO_HALF(temp5,temp6, temp7, temp8, temp9,temp10, temp11,
    288                             temp12, temp1, temp2, temp3, temp4)
    289     ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
    290                       temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
    291     PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
    292                           temp7, temp2, temp4, temp6, temp8)
    293     ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
    294                       temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
    295     ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
    296                       temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
    297     ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
    298                       temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
    299     ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
    300     LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
    301                         0, 4, 8, 12,
    302                         0, 0, 0, 0,
    303                         0)
    304     LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
    305                         0, 4, 8, 12,
    306                         1, 1, 1, 1,
    307                         16)
            // temp3 is both an input and the output here; that is safe because
            // MUL_HALF writes its output only in its last instruction.
    308     MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
    309              temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
    310     OUTPUT_EARLY_CLOBBER_REGS_17()
    311     : [a]"r"(a), [b]"r"(b), [w]"r"(w)
    312     : "memory", "hi", "lo"
    313   );
          // temp17 = weighted sum for a, temp3 = weighted sum for b.
    314   return abs(temp3 - temp17) >> 5;
    315 }
    316 
    // Sums Disto4x4_MIPSdspR2() over the sixteen 4x4 sub-blocks of a 16x16 area.
    // Sub-blocks are visited row by row; consecutive rows of sub-blocks are
    // 4 * BPS bytes apart and consecutive sub-blocks in a row are 4 bytes apart.
    // The same weights w are used for every sub-block.
    317 static int Disto16x16_MIPSdspR2(const uint8_t* const a,
    318                                 const uint8_t* const b,
    319                                 const uint16_t* const w) {
    320   int sum = 0;
    321   int blk;
    322   for (blk = 0; blk < 16; ++blk) {
    323     const int col = (blk & 3) * 4;             // x offset: 0, 4, 8, 12
    324     const int row_off = (blk >> 2) * 4 * BPS;  // y offset of the block row
    325     sum += Disto4x4_MIPSdspR2(a + col + row_off, b + col + row_off, w);
    326   }
    327   return sum;
    328 }
    329 
    330 //------------------------------------------------------------------------------
    331 // Intra predictions
    332 
    // Assembly text that fills row J of the destination with %[value]:
    // unaligned word stores ("usw") at byte offsets 0 and 4 of the row, plus
    // offsets 8 and 12 when SIZE is 16. The row starts J * BPS bytes after
    // %[dst]. The SIZE test is an assembler ".if" on the stringized argument,
    // so it is resolved at assembly time, not at run time.
    333 #define FILL_PART(J, SIZE)                                            \
    334     "usw        %[value],  0+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    335     "usw        %[value],  4+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    336   ".if " #SIZE " == 16                                     \n\t"      \
    337     "usw        %[value],  8+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    338     "usw        %[value], 12+" #J "*" XSTR(BPS) "(%[dst])  \n\t"      \
    339   ".endif                                                  \n\t"
    340 
    // Statement macro: fills a SIZE x SIZE block at DST (row stride BPS) with
    // the byte VALUE. SIZE is tested against 16 by the assembler; any other
    // value gives the 8-row, 8-byte-wide variant.
    //   - VALUE is copied into a local int, and "replv.qb" replicates its low
    //     byte into all four bytes of the register;
    //   - rows 0..7 are always written, rows 8..15 only when SIZE is 16.
    // The local is a read-write operand ("+&r") and "memory" is declared as
    // clobbered because the stores go through the DST pointer.
    341 #define FILL_8_OR_16(DST, VALUE, SIZE) do {                         \
    342   int value = (VALUE);                                              \
    343   __asm__ volatile (                                                \
    344     "replv.qb   %[value],  %[value]                      \n\t"      \
    345     FILL_PART( 0, SIZE)                                             \
    346     FILL_PART( 1, SIZE)                                             \
    347     FILL_PART( 2, SIZE)                                             \
    348     FILL_PART( 3, SIZE)                                             \
    349     FILL_PART( 4, SIZE)                                             \
    350     FILL_PART( 5, SIZE)                                             \
    351     FILL_PART( 6, SIZE)                                             \
    352     FILL_PART( 7, SIZE)                                             \
    353   ".if " #SIZE " == 16                                   \n\t"      \
    354     FILL_PART( 8, 16)                                               \
    355     FILL_PART( 9, 16)                                               \
    356     FILL_PART(10, 16)                                               \
    357     FILL_PART(11, 16)                                               \
    358     FILL_PART(12, 16)                                               \
    359     FILL_PART(13, 16)                                               \
    360     FILL_PART(14, 16)                                               \
    361     FILL_PART(15, 16)                                               \
    362   ".endif                                                \n\t"      \
    363     : [value]"+&r"(value)                                           \
    364     : [dst]"r"((DST))                                               \
    365     : "memory"                                                      \
    366   );                                                                \
    367 } while (0)
    368 
    // Generator for VerticalPred8() and VerticalPred16().
    // Generated function: when TOP is non-NULL, the first SIZE bytes of TOP are
    // copied into each of the SIZE rows of DST (row stride BPS); otherwise the
    // SIZE x SIZE block is filled with the value 127.
    // SIZE is also stringized into an assembler ".if" by FILL_8_OR_16, so it
    // must be a literal 8 or 16.
    369 #define VERTICAL_PRED(DST, TOP, SIZE)                                          \
    370 static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST),                     \
    371                                            const uint8_t* (TOP)) {             \
    372   int j;                                                                       \
    373   if ((TOP)) {                                                                 \
    374     for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE));       \
    375   } else {                                                                     \
    376     FILL_8_OR_16((DST), 127, (SIZE));                                          \
    377   }                                                                            \
    378 }
    379 
    380 VERTICAL_PRED(dst, top, 8)
    381 VERTICAL_PRED(dst, top, 16)
    382 
    383 #undef VERTICAL_PRED
    384 
    // Generator for HorizontalPred8() and HorizontalPred16().
    // Generated function: when LEFT is non-NULL, row j of DST (row stride BPS)
    // is set to SIZE copies of LEFT[j], for j = 0..SIZE-1; otherwise the
    // SIZE x SIZE block is filled with the value 129.
    // SIZE is also stringized into an assembler ".if" by FILL_8_OR_16, so it
    // must be a literal 8 or 16.
    385 #define HORIZONTAL_PRED(DST, LEFT, SIZE)                                       \
    386 static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST),                   \
    387                                              const uint8_t* (LEFT)) {          \
    388   if (LEFT) {                                                                  \
    389     int j;                                                                     \
    390     for (j = 0; j < (SIZE); ++j) {                                             \
    391       memset((DST) + j * BPS, (LEFT)[j], (SIZE));                              \
    392     }                                                                          \
    393   } else {                                                                     \
    394     FILL_8_OR_16((DST), 129, (SIZE));                                          \
    395   }                                                                            \
    396 }
    397 
    398 HORIZONTAL_PRED(dst, left, 8)
    399 HORIZONTAL_PRED(dst, left, 16)
    400 
    401 #undef HORIZONTAL_PRED
    402 
    // Assembly text operating on eight bytes held in %[temp0] and %[temp1]:
    //   - each byte is widened to a halfword (preceu.ph.qbl / .qbr), using
    //     %[temp2] and %[temp3] for the upper pairs;
    //   - %[leftY_1] is added to every halfword;
    //   - each halfword is shifted left by 7 with saturation (shll_s.ph) and
    //     the four halfwords of a register pair are reduced back to bytes with
    //     unsigned saturation (precrqu_s.qb.ph).
    // The eight resulting bytes are left in %[temp0] and %[temp1].
    // NOTE(review): the shift-then-reduce pair presumably clamps each sum to
    // 0..255 -- confirm against the MIPS DSP ASE definition of both instructions.
    403 #define CLIPPING()                                                             \
    404   "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
    405   "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
    406   "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
    407   "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
    408   "addu.ph         %[temp2],   %[temp2],   %[leftY_1]    \n\t"                 \
    409   "addu.ph         %[temp0],   %[temp0],   %[leftY_1]    \n\t"                 \
    410   "addu.ph         %[temp3],   %[temp3],   %[leftY_1]    \n\t"                 \
    411   "addu.ph         %[temp1],   %[temp1],   %[leftY_1]    \n\t"                 \
    412   "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
    413   "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
    414   "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
    415   "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
    416   "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
    417   "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"
    418 
    // Statement macro that writes one row of SIZE bytes to DST.
    // It relies on two names being in scope where it is expanded: the row index
    // y and the packed value left_1 (both are provided by CLIP_TO_DST).
    //   - leftY_1 starts as LEFT[y] in both halfwords; "replv.ph" replicates the
    //     low halfword into both halves and "subu.ph" subtracts left_1, so each
    //     halfword becomes LEFT[y] minus the halfword stored in left_1;
    //   - eight bytes are loaded from TOP (unaligned), passed through CLIPPING()
    //     and stored to DST (unaligned);
    //   - when SIZE is 16 (assembler ".if"), the same is done for bytes 8..15.
    419 #define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do {                              \
    420   int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y];                            \
    421   int temp0, temp1, temp2, temp3;                                              \
    422   __asm__ volatile (                                                           \
    423     "replv.ph        %[leftY_1], %[leftY_1]              \n\t"                 \
    424     "ulw             %[temp0],   0(%[top])               \n\t"                 \
    425     "ulw             %[temp1],   4(%[top])               \n\t"                 \
    426     "subu.ph         %[leftY_1], %[leftY_1], %[left_1]   \n\t"                 \
    427     CLIPPING()                                                                 \
    428     "usw             %[temp0],   0(%[dst])               \n\t"                 \
    429     "usw             %[temp1],   4(%[dst])               \n\t"                 \
    430   ".if " #SIZE " == 16                                   \n\t"                 \
    431     "ulw             %[temp0],   8(%[top])               \n\t"                 \
    432     "ulw             %[temp1],   12(%[top])              \n\t"                 \
    433     CLIPPING()                                                                 \
    434     "usw             %[temp0],   8(%[dst])               \n\t"                 \
    435     "usw             %[temp1],   12(%[dst])              \n\t"                 \
    436   ".endif                                                \n\t"                 \
    437     : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),       \
    438       [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
    439     : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST))                    \
    440     : "memory"                                                                 \
    441   );                                                                           \
    442 } while (0)
    443 
    // Statement macro that produces SIZE rows with CLIP_8B_TO_DST.
    // It declares the two names that CLIP_8B_TO_DST picks up from its
    // surroundings: the row counter y and left_1, which holds LEFT[-1] in both
    // halfwords (so LEFT must have a readable element before index 0).
    // DST is advanced by BPS after every row, i.e. the macro modifies its DST
    // argument, which therefore has to be a modifiable pointer variable.
    444 #define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do {                                 \
    445   int y;                                                                       \
    446   const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1];                     \
    447   for (y = 0; y < (SIZE); ++y) {                                               \
    448     CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE));                              \
    449     (DST) += BPS;                                                              \
    450   }                                                                            \
    451 } while (0)
    452 
    // Generator for TrueMotion8() and TrueMotion16(). The generated function
    // selects one of four paths depending on which neighbours are available:
    //   LEFT and TOP  -> CLIP_TO_DST (per-pixel combination of both)
    //   LEFT only     -> HorizontalPred##SIZE
    //   TOP only      -> VerticalPred##SIZE
    //   neither       -> constant fill with 129
    // The helper macros are undefined right after the two instantiations.
    453 #define TRUE_MOTION(DST, LEFT, TOP, SIZE)                                      \
    454 static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\
    455                                          const uint8_t* (TOP)) {               \
    456   if ((LEFT) != NULL) {                                                        \
    457     if ((TOP) != NULL) {                                                       \
    458       CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE));                               \
    459     } else {                                                                   \
    460       HorizontalPred##SIZE((DST), (LEFT));                                     \
    461     }                                                                          \
    462   } else {                                                                     \
    463     /* true motion without left samples (hence: with default 129 value)    */  \
    464     /* is equivalent to VE prediction where you just copy the top samples. */  \
    465     /* Note that if top samples are not available, the default value is    */  \
    466     /* then 129, and not 127 as in the VerticalPred case.                  */  \
    467     if ((TOP) != NULL) {                                                       \
    468       VerticalPred##SIZE((DST), (TOP));                                        \
    469     } else {                                                                   \
    470       FILL_8_OR_16((DST), 129, (SIZE));                                        \
    471     }                                                                          \
    472   }                                                                            \
    473 }
    474 
    475 TRUE_MOTION(dst, left, top, 8)
    476 TRUE_MOTION(dst, left, top, 16)
    477 
    478 #undef TRUE_MOTION
    479 #undef CLIP_TO_DST
    480 #undef CLIP_8B_TO_DST
    481 #undef CLIPPING
    482 
    // Computes the DC prediction value for a 16x16 block from the neighbouring
    // `top` and/or `left` samples and passes it to FILL_8_OR_16(dst, DC, 16).
    // FILL_8_OR_16 is defined outside this chunk; presumably it writes DC to
    // every pixel of the 16x16 block -- confirm.
    //
    // Either pointer may be NULL. With S(x) the sum of the 16 bytes read from x
    // (LOAD_WITH_OFFSET_X4 comes from mips_macro.h; presumably four word loads at
    // byte offsets 0, 4, 8 and 12):
    //   top and left present : DC = (S(top) + S(left) + 16) >> 5
    //   only one present     : DC = (2 * S(present) + 16) >> 5
    //   neither present      : DC = 0x80
    483 static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
    484                                  const uint8_t* top) {
    485   int DC, DC1;
    486   int temp0, temp1, temp2, temp3;
    487 
    488   __asm__ volatile(
            // No top row: the left-only / no-neighbour cases start at label 2.
    489     "beqz        %[top],   2f                  \n\t"
    490     LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top,
    491                         0, 4, 8, 12,
    492                         0, 0, 0, 0,
    493                         0)
            // raddu.w.qb adds the four unsigned bytes of a word; the addu chain
            // then totals the whole top row into DC.
    494     "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    495     "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    496     "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    497     "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    498     "addu        %[temp0], %[temp0], %[temp1]  \n\t"
    499     "addu        %[temp2], %[temp2], %[temp3]  \n\t"
    500     "addu        %[DC],    %[temp0], %[temp2]  \n\t"
            // Default DC1 to the top sum, so that a missing left column makes the
            // top row count twice.
    501     "move        %[DC1],   %[DC]               \n\t"
    502     "beqz        %[left],  1f                  \n\t"
    503     LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
    504                         0, 4, 8, 12,
    505                         0, 0, 0, 0,
    506                         0)
            // Left column is available: DC1 becomes the sum of the left samples.
    507     "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    508     "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    509     "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    510     "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    511     "addu        %[temp0], %[temp0], %[temp1]  \n\t"
    512     "addu        %[temp2], %[temp2], %[temp3]  \n\t"
    513     "addu        %[DC1],   %[temp0], %[temp2]  \n\t"
    514   "1:                                          \n\t"
    515     "addu        %[DC],   %[DC],     %[DC1]    \n\t"
    516     "j           3f                            \n\t"
    517   "2:                                          \n\t"
            // Top is missing. If left is missing as well, use the default (label 4).
    518     "beqz        %[left],  4f                  \n\t"
    519     LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
    520                         0, 4, 8, 12,
    521                         0, 0, 0, 0,
    522                         0)
    523     "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    524     "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    525     "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    526     "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    527     "addu        %[temp0], %[temp0], %[temp1]  \n\t"
    528     "addu        %[temp2], %[temp2], %[temp3]  \n\t"
    529     "addu        %[DC],    %[temp0], %[temp2]  \n\t"
            // Only the left column exists: double its sum.
    530     "addu        %[DC],    %[DC],    %[DC]     \n\t"
    531   "3:                                          \n\t"
            // Rounding shift: (DC + 16) >> 5.
    532     "shra_r.w    %[DC],    %[DC],    5         \n\t"
    533     "j           5f                            \n\t"
    534   "4:                                          \n\t"
            // Neither neighbour is available.
    535     "li          %[DC],    0x80                \n\t"
    536   "5:                                          \n\t"
    537     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
    538       [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
    539     : [left]"r"(left), [top]"r"(top)
    540     : "memory"
    541   );
    542 
    543   FILL_8_OR_16(dst, DC, 16);
    544 }
    545 
    // Computes the DC prediction value for an 8x8 block from the neighbouring
    // `top` and/or `left` samples and passes it to FILL_8_OR_16(dst, DC, 8).
    // FILL_8_OR_16 is defined outside this chunk; presumably it writes DC to
    // every pixel of the 8x8 block -- confirm.
    //
    // Either pointer may be NULL. With S(x) the sum of the 8 bytes x[0..7]:
    //   top and left present : DC = (S(top) + S(left) + 8) >> 4
    //   only one present     : DC = (2 * S(present) + 8) >> 4
    //   neither present      : DC = 0x80
    546 static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
    547                                 const uint8_t* top) {
    548   int DC, DC1;
    549   int temp0, temp1, temp2, temp3;
    550 
    551   __asm__ volatile(
            // No top row: the left-only / no-neighbour cases start at label 2.
    552     "beqz        %[top],   2f                  \n\t"
    553     "ulw         %[temp0], 0(%[top])           \n\t"
    554     "ulw         %[temp1], 4(%[top])           \n\t"
            // raddu.w.qb adds the four unsigned bytes of a word.
    555     "raddu.w.qb  %[temp0], %[temp0]            \n\t"
    556     "raddu.w.qb  %[temp1], %[temp1]            \n\t"
    557     "addu        %[DC],    %[temp0], %[temp1]  \n\t"
            // Default DC1 to the top sum, so that a missing left column makes the
            // top row count twice.
    558     "move        %[DC1],   %[DC]               \n\t"
    559     "beqz        %[left],  1f                  \n\t"
    560     "ulw         %[temp2], 0(%[left])          \n\t"
    561     "ulw         %[temp3], 4(%[left])          \n\t"
    562     "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    563     "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    564     "addu        %[DC1],   %[temp2], %[temp3]  \n\t"
    565   "1:                                          \n\t"
    566     "addu        %[DC],    %[DC],    %[DC1]    \n\t"
    567     "j           3f                            \n\t"
    568   "2:                                          \n\t"
            // Top is missing. If left is missing as well, use the default (label 4).
    569     "beqz        %[left],  4f                  \n\t"
    570     "ulw         %[temp2], 0(%[left])          \n\t"
    571     "ulw         %[temp3], 4(%[left])          \n\t"
    572     "raddu.w.qb  %[temp2], %[temp2]            \n\t"
    573     "raddu.w.qb  %[temp3], %[temp3]            \n\t"
    574     "addu        %[DC],    %[temp2], %[temp3]  \n\t"
            // Only the left column exists: double its sum.
    575     "addu        %[DC],    %[DC],    %[DC]     \n\t"
    576   "3:                                          \n\t"
            // Rounding shift: (DC + 8) >> 4.
    577     "shra_r.w    %[DC], %[DC], 4               \n\t"
    578     "j           5f                            \n\t"
    579   "4:                                          \n\t"
            // Neither neighbour is available.
    580     "li          %[DC], 0x80                   \n\t"
    581   "5:                                          \n\t"
    582     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
    583       [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
    584     : [left]"r"(left), [top]"r"(top)
    585     : "memory"
    586   );
    587 
    588   FILL_8_OR_16(dst, DC, 8);
    589 }
    590 
    // 4x4 DC predictor: sums the four bytes top[0..3] and the four bytes
    // top[-5..-2], computes (sum + 4) >> 3 and writes that value to all 16 pixels
    // of the 4x4 block at `dst` (four 4-byte rows, BPS bytes apart).
    591 static void DC4(uint8_t* dst, const uint8_t* top) {
    592   int temp0, temp1;
    593   __asm__ volatile(
    594     "ulw          %[temp0],   0(%[top])               \n\t"
    595     "ulw          %[temp1],   -5(%[top])              \n\t"
            // Sum the four bytes of each word, then combine both sums.
    596     "raddu.w.qb   %[temp0],   %[temp0]                \n\t"
    597     "raddu.w.qb   %[temp1],   %[temp1]                \n\t"
    598     "addu         %[temp0],   %[temp0],    %[temp1]   \n\t"
            // Rounded average of the 8 samples.
    599     "addiu        %[temp0],   %[temp0],    4          \n\t"
    600     "srl          %[temp0],   %[temp0],    3          \n\t"
            // Replicate the low byte into all four byte lanes of the word.
    601     "replv.qb     %[temp0],   %[temp0]                \n\t"
    602     "usw          %[temp0],   0*" XSTR(BPS) "(%[dst]) \n\t"
    603     "usw          %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
    604     "usw          %[temp0],   2*" XSTR(BPS) "(%[dst]) \n\t"
    605     "usw          %[temp0],   3*" XSTR(BPS) "(%[dst]) \n\t"
    606     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
    607     : [top]"r"(top), [dst]"r"(dst)
            // dst is written through an input operand, hence the "memory" clobber.
    608     : "memory"
    609   );
    610 }
    611 
    // 4x4 true-motion predictor. Reads the bytes top[0..3], the byte top[-1] and
    // the word at top[-5..-2], then writes four 4-byte rows to `dst`, BPS bytes
    // apart.
    // Each output pixel is top[x] + (s - top[-1]) clamped to [0, 255], where s is
    // one of the four bytes top[-5..-2] and stays constant along a row.
    // NOTE(review): which of top[-5..-2] feeds which row depends on byte order;
    // confirm against the C reference before relying on it.
    612 static void TM4(uint8_t* dst, const uint8_t* top) {
    613   int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
          // Mask that keeps the low byte of each 16-bit half.
    614   const int c35 = 0xff00ff;
    615   __asm__ volatile (
    616     "lbu              %[temp1],  0(%[top])                     \n\t"
    617     "lbu              %[a10],    1(%[top])                     \n\t"
    618     "lbu              %[temp2],  2(%[top])                     \n\t"
    619     "lbu              %[a32],    3(%[top])                     \n\t"
    620     "ulw              %[temp0],  -5(%[top])                    \n\t"
    621     "lbu              %[temp4],  -1(%[top])                    \n\t"
            // Pack the top samples as halfword pairs: a10 = {top[1], top[0]},
            // a32 = {top[3], top[2]}.
    622     "append           %[a10],    %[temp1],   16                \n\t"
    623     "append           %[a32],    %[temp2],   16                \n\t"
            // top[-1] in both halfwords.
    624     "replv.ph         %[temp4],  %[temp4]                      \n\t"
            // Split the word from top[-5..-2] into two halfword pairs (odd bytes
            // in temp1, even bytes in temp0) and subtract top[-1] from each.
    625     "shrl.ph          %[temp1],  %[temp0],   8                 \n\t"
    626     "and              %[temp0],  %[temp0],   %[c35]            \n\t"
    627     "subu.ph          %[temp1],  %[temp1],   %[temp4]          \n\t"
    628     "subu.ph          %[temp0],  %[temp0],   %[temp4]          \n\t"
            // Broadcast each of the four differences into both halfwords of its
            // own register (temp2, temp3, temp4, temp5).
    629     "srl              %[temp2],  %[temp1],   16                \n\t"
    630     "srl              %[temp3],  %[temp0],   16                \n\t"
    631     "replv.ph         %[temp2],  %[temp2]                      \n\t"
    632     "replv.ph         %[temp3],  %[temp3]                      \n\t"
    633     "replv.ph         %[temp4],  %[temp1]                      \n\t"
    634     "replv.ph         %[temp5],  %[temp0]                      \n\t"
            // Add the top samples, then saturate: shll_s.ph by 7 followed by
            // precrqu_s.qb.ph clamps each 16-bit sum to an unsigned byte.
    635     "addu.ph          %[temp0],  %[temp3],   %[a10]            \n\t"
    636     "addu.ph          %[temp1],  %[temp3],   %[a32]            \n\t"
    637     "addu.ph          %[temp3],  %[temp2],   %[a10]            \n\t"
    638     "addu.ph          %[temp2],  %[temp2],   %[a32]            \n\t"
    639     "shll_s.ph        %[temp0],  %[temp0],   7                 \n\t"
    640     "shll_s.ph        %[temp1],  %[temp1],   7                 \n\t"
    641     "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
    642     "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
    643     "precrqu_s.qb.ph  %[temp0],  %[temp1],   %[temp0]          \n\t"
    644     "precrqu_s.qb.ph  %[temp1],  %[temp2],   %[temp3]          \n\t"
            // Same computation for the remaining two rows.
    645     "addu.ph          %[temp2],  %[temp5],   %[a10]            \n\t"
    646     "addu.ph          %[temp3],  %[temp5],   %[a32]            \n\t"
    647     "addu.ph          %[temp5],  %[temp4],   %[a10]            \n\t"
    648     "addu.ph          %[temp4],  %[temp4],   %[a32]            \n\t"
    649     "shll_s.ph        %[temp2],  %[temp2],   7                 \n\t"
    650     "shll_s.ph        %[temp3],  %[temp3],   7                 \n\t"
    651     "shll_s.ph        %[temp4],  %[temp4],   7                 \n\t"
    652     "shll_s.ph        %[temp5],  %[temp5],   7                 \n\t"
    653     "precrqu_s.qb.ph  %[temp2],  %[temp3],   %[temp2]          \n\t"
    654     "precrqu_s.qb.ph  %[temp3],  %[temp4],   %[temp5]          \n\t"
    655     "usw              %[temp1],  0*" XSTR(BPS) "(%[dst])       \n\t"
    656     "usw              %[temp0],  1*" XSTR(BPS) "(%[dst])       \n\t"
    657     "usw              %[temp3],  2*" XSTR(BPS) "(%[dst])       \n\t"
    658     "usw              %[temp2],  3*" XSTR(BPS) "(%[dst])       \n\t"
    659     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    660       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    661       [a10]"=&r"(a10), [a32]"=&r"(a32)
    662     : [c35]"r"(c35), [top]"r"(top), [dst]"r"(dst)
    663     : "memory"
    664   );
    665 }
    666 
    // 4x4 vertical predictor with smoothing. Reads top[-1..4] and computes, for
    // each of the four columns i = 0..3,
    //   (top[i-1] + 2 * top[i] + top[i+1] + 2) >> 2
    // then stores the resulting 4-byte row at each of the four rows of `dst`
    // (BPS bytes apart).
    // NOTE(review): the lane bookkeeping was traced for little-endian byte order.
    667 static void VE4(uint8_t* dst, const uint8_t* top) {
    668   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
    669   __asm__ volatile(
    670     "ulw             %[temp0],   -1(%[top])              \n\t"
    671     "ulh             %[temp1],   3(%[top])               \n\t"
            // Widen the loaded bytes to 16-bit lanes.
    672     "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    673     "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    674     "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
            // temp5 / temp6 hold the centre taps; they are doubled below.
    675     "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
    676     "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
    677     "shll.ph         %[temp5],   %[temp5],    1          \n\t"
    678     "shll.ph         %[temp6],   %[temp6],    1          \n\t"
            // Add the two outer taps to each doubled centre tap.
    679     "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
    680     "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
    681     "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
    682     "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
            // Rounding shift: (x + 2) >> 2.
    683     "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
    684     "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
            // Narrow the four 16-bit results back to one word of four bytes.
    685     "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
    686     "usw             %[temp4],   0*" XSTR(BPS) "(%[dst]) \n\t"
    687     "usw             %[temp4],   1*" XSTR(BPS) "(%[dst]) \n\t"
    688     "usw             %[temp4],   2*" XSTR(BPS) "(%[dst]) \n\t"
    689     "usw             %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
    690     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    691       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    692       [temp6]"=&r"(temp6)
    693     : [top]"r"(top), [dst]"r"(dst)
    694     : "memory"
    695   );
    696 }
    697 
    // 4x4 horizontal predictor with smoothing. Reads top[-5..-1], applies the
    // 3-tap filter (a + 2 * b + c + 2) >> 2 along those samples and fills each of
    // the four rows of `dst` (BPS bytes apart) with one replicated filtered value.
    // The byte top[-5] is used for two taps of the last row (temp4 holds it in
    // both halfwords).
    // NOTE(review): the lane bookkeeping was traced for little-endian byte order.
    698 static void HE4(uint8_t* dst, const uint8_t* top) {
    699   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
    700   __asm__ volatile(
    701     "ulw             %[temp0],   -4(%[top])              \n\t"
    702     "lbu             %[temp1],   -5(%[top])              \n\t"
            // Widen the loaded bytes to 16-bit lanes.
    703     "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    704     "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
    705     "replv.ph        %[temp4],   %[temp1]                \n\t"
            // temp5 / temp6 hold the centre taps; they are doubled below.
    706     "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
    707     "packrl.ph       %[temp6],   %[temp2],    %[temp4]   \n\t"
    708     "shll.ph         %[temp5],   %[temp5],    1          \n\t"
    709     "shll.ph         %[temp6],   %[temp6],    1          \n\t"
    710     "addq.ph         %[temp3],   %[temp3],    %[temp5]   \n\t"
    711     "addq.ph         %[temp3],   %[temp3],    %[temp2]   \n\t"
    712     "addq.ph         %[temp2],   %[temp2],    %[temp6]   \n\t"
    713     "addq.ph         %[temp2],   %[temp2],    %[temp4]   \n\t"
            // Rounding shift: (x + 2) >> 2.
    714     "shra_r.ph       %[temp3],   %[temp3],    2          \n\t"
    715     "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
            // Broadcast each filtered value across a full 4-byte row: the low
            // halfword results first, then (after the shift) the high ones.
    716     "replv.qb        %[temp0],   %[temp3]                \n\t"
    717     "replv.qb        %[temp1],   %[temp2]                \n\t"
    718     "srl             %[temp3],   %[temp3],    16         \n\t"
    719     "srl             %[temp2],   %[temp2],    16         \n\t"
    720     "replv.qb        %[temp3],   %[temp3]                \n\t"
    721     "replv.qb        %[temp2],   %[temp2]                \n\t"
    722     "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
    723     "usw             %[temp0],   1*" XSTR(BPS) "(%[dst]) \n\t"
    724     "usw             %[temp2],   2*" XSTR(BPS) "(%[dst]) \n\t"
    725     "usw             %[temp1],   3*" XSTR(BPS) "(%[dst]) \n\t"
    726     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    727       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    728       [temp6]"=&r"(temp6)
    729     : [top]"r"(top), [dst]"r"(dst)
    730     : "memory"
    731   );
    732 }
    733 
    // 4x4 diagonal predictor (presumably VP8's "down-right" mode -- confirm).
    // Reads the words at top[-5..-2] and top[-1..2] plus the bytes top[1..3],
    // forms rounded 3-tap averages (a + 2 * b + c + 2) >> 2 of neighbouring
    // samples and writes four 4-byte rows to `dst`, BPS bytes apart.
    // Rows 3 and 1 are stored first; rows 2 and 0 are then obtained by shifting
    // those row registers by one byte and inserting one further filtered value
    // (`prepend ..., 8`).
    734 static void RD4(uint8_t* dst, const uint8_t* top) {
    735   int temp0, temp1, temp2, temp3, temp4, temp5;
    736   int temp6, temp7, temp8, temp9, temp10, temp11;
    737   __asm__ volatile(
    738     "ulw             %[temp0],    -5(%[top])               \n\t"
    739     "ulw             %[temp1],    -1(%[top])               \n\t"
            // Widen the eight loaded bytes to 16-bit lanes.
    740     "preceu.ph.qbl   %[temp2],    %[temp0]                 \n\t"
    741     "preceu.ph.qbr   %[temp3],    %[temp0]                 \n\t"
    742     "preceu.ph.qbr   %[temp4],    %[temp1]                 \n\t"
    743     "preceu.ph.qbl   %[temp5],    %[temp1]                 \n\t"
            // temp6..temp8 hold the centre taps (doubled by the shll.ph below).
    744     "packrl.ph       %[temp6],    %[temp2],    %[temp3]    \n\t"
    745     "packrl.ph       %[temp7],    %[temp4],    %[temp2]    \n\t"
    746     "packrl.ph       %[temp8],    %[temp5],    %[temp4]    \n\t"
    747     "shll.ph         %[temp6],    %[temp6],    1           \n\t"
    748     "addq.ph         %[temp9],    %[temp2],    %[temp6]    \n\t"
    749     "shll.ph         %[temp7],    %[temp7],    1           \n\t"
    750     "addq.ph         %[temp9],    %[temp9],    %[temp3]    \n\t"
    751     "shll.ph         %[temp8],    %[temp8],    1           \n\t"
    752     "shra_r.ph       %[temp9],    %[temp9],    2           \n\t"
    753     "addq.ph         %[temp10],   %[temp4],    %[temp7]    \n\t"
    754     "addq.ph         %[temp11],   %[temp5],    %[temp8]    \n\t"
    755     "addq.ph         %[temp10],   %[temp10],   %[temp2]    \n\t"
    756     "addq.ph         %[temp11],   %[temp11],   %[temp4]    \n\t"
    757     "shra_r.ph       %[temp10],   %[temp10],   2           \n\t"
    758     "shra_r.ph       %[temp11],   %[temp11],   2           \n\t"
            // Scalar path for the last filtered value:
            // (top[3] + 2 * top[2] + top[1] + 2) >> 2.
    759     "lbu             %[temp0],    3(%[top])                \n\t"
    760     "lbu             %[temp1],    2(%[top])                \n\t"
    761     "lbu             %[temp2],    1(%[top])                \n\t"
    762     "sll             %[temp1],    %[temp1],    1           \n\t"
    763     "addu            %[temp0],    %[temp0],    %[temp1]    \n\t"
    764     "addu            %[temp0],    %[temp0],    %[temp2]    \n\t"
    765     "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]    \n\t"
    766     "shra_r.w        %[temp0],    %[temp0],    2           \n\t"
    767     "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]   \n\t"
    768     "usw             %[temp9],    3*" XSTR(BPS) "(%[dst])  \n\t"
    769     "usw             %[temp10],   1*" XSTR(BPS) "(%[dst])  \n\t"
            // Slide each row register by one byte, inserting the next value.
    770     "prepend         %[temp9],    %[temp11],   8           \n\t"
    771     "prepend         %[temp10],   %[temp0],    8           \n\t"
    772     "usw             %[temp9],    2*" XSTR(BPS) "(%[dst])  \n\t"
    773     "usw             %[temp10],   0*" XSTR(BPS) "(%[dst])  \n\t"
    774     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    775       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    776       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    777       [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
    778     : [top]"r"(top), [dst]"r"(dst)
    779     : "memory"
    780   );
    781 }
    782 
    // 4x4 directional predictor (presumably VP8's "vertical-right" mode --
    // confirm). Reads the words at top[-4..-1] and top[0..3], combines rounded
    // 2-tap averages (addqh_r.ph) and rounded 3-tap averages
    // ((a + 2 * b + c + 2) >> 2) of neighbouring samples, and writes four 4-byte
    // rows to `dst`, BPS bytes apart.
    // Rows 0 and 1 are stored first; rows 2 and 3 reuse those row registers,
    // shifted by one byte with one extra value inserted (`append ..., 8`).
    783 static void VR4(uint8_t* dst, const uint8_t* top) {
    784   int temp0, temp1, temp2, temp3, temp4;
    785   int temp5, temp6, temp7, temp8, temp9;
    786   __asm__ volatile (
    787     "ulw              %[temp0],   -4(%[top])              \n\t"
    788     "ulw              %[temp1],   0(%[top])               \n\t"
            // Widen bytes to 16-bit lanes; the qbla/qbra forms take alternating
            // bytes of the word.
    789     "preceu.ph.qbl    %[temp2],   %[temp0]                \n\t"
    790     "preceu.ph.qbr    %[temp0],   %[temp0]                \n\t"
    791     "preceu.ph.qbla   %[temp3],   %[temp1]                \n\t"
    792     "preceu.ph.qbra   %[temp1],   %[temp1]                \n\t"
    793     "packrl.ph        %[temp7],   %[temp3],    %[temp2]   \n\t"
            // temp4 / temp5: rounded 2-tap averages.
    794     "addqh_r.ph       %[temp4],   %[temp1],    %[temp3]   \n\t"
    795     "move             %[temp6],   %[temp1]                \n\t"
    796     "append           %[temp1],   %[temp2],    16         \n\t"
    797     "shll.ph          %[temp9],   %[temp6],    1          \n\t"
    798     "addqh_r.ph       %[temp5],   %[temp7],    %[temp6]   \n\t"
    799     "shll.ph          %[temp8],   %[temp7],    1          \n\t"
            // temp3 / temp1 / temp6: 3-tap sums, rounded and shifted below.
    800     "addu.ph          %[temp3],   %[temp7],    %[temp3]   \n\t"
    801     "addu.ph          %[temp1],   %[temp1],    %[temp6]   \n\t"
    802     "packrl.ph        %[temp7],   %[temp2],    %[temp0]   \n\t"
    803     "addu.ph          %[temp6],   %[temp0],    %[temp2]   \n\t"
    804     "addu.ph          %[temp3],   %[temp3],    %[temp9]   \n\t"
    805     "addu.ph          %[temp1],   %[temp1],    %[temp8]   \n\t"
    806     "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    807     "shra_r.ph        %[temp3],   %[temp3],    2          \n\t"
    808     "shra_r.ph        %[temp1],   %[temp1],    2          \n\t"
    809     "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    810     "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
            // Interleave and narrow the 16-bit results into byte rows.
    811     "precrq.ph.w      %[temp8],   %[temp4],    %[temp5]   \n\t"
    812     "append           %[temp4],   %[temp5],    16         \n\t"
    813     "precrq.ph.w      %[temp2],   %[temp3],    %[temp1]   \n\t"
    814     "append           %[temp3],   %[temp1],    16         \n\t"
    815     "precr.qb.ph      %[temp8],   %[temp8],    %[temp4]   \n\t"
    816     "precr.qb.ph      %[temp3],   %[temp2],    %[temp3]   \n\t"
    817     "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
    818     "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
            // Rows 3 and 2: shift rows 1 and 0 by one byte and insert the two
            // values held in temp6.
    819     "append           %[temp3],   %[temp6],    8          \n\t"
    820     "srl              %[temp6],   %[temp6],    16         \n\t"
    821     "append           %[temp8],   %[temp6],    8          \n\t"
    822     "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
    823     "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
    824     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    825       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    826       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    827       [temp9]"=&r"(temp9)
    828     : [top]"r"(top), [dst]"r"(dst)
    829     : "memory"
    830   );
    831 }
    832 
    // 4x4 diagonal predictor (presumably VP8's "down-left" mode -- confirm).
    // Reads the words at top[0..3] and top[4..7], forms rounded 3-tap averages
    // (a + 2 * b + c + 2) >> 2 of neighbouring samples and writes four 4-byte
    // rows to `dst`, BPS bytes apart.
    // Rows 0 and 2 are stored first; rows 1 and 3 are then obtained by shifting
    // those row registers by one byte and inserting one further filtered value
    // (`prepend ..., 8`).
    833 static void LD4(uint8_t* dst, const uint8_t* top) {
    834   int temp0, temp1, temp2, temp3, temp4, temp5;
    835   int temp6, temp7, temp8, temp9, temp10, temp11;
    836   __asm__ volatile(
    837     "ulw             %[temp0],    0(%[top])               \n\t"
    838     "ulw             %[temp1],    4(%[top])               \n\t"
            // Widen the eight loaded bytes to 16-bit lanes.
    839     "preceu.ph.qbl   %[temp2],    %[temp0]                \n\t"
    840     "preceu.ph.qbr   %[temp3],    %[temp0]                \n\t"
    841     "preceu.ph.qbr   %[temp4],    %[temp1]                \n\t"
    842     "preceu.ph.qbl   %[temp5],    %[temp1]                \n\t"
            // temp6..temp8 hold the centre taps (doubled by the shll.ph below).
    843     "packrl.ph       %[temp6],    %[temp2],    %[temp3]   \n\t"
    844     "packrl.ph       %[temp7],    %[temp4],    %[temp2]   \n\t"
    845     "packrl.ph       %[temp8],    %[temp5],    %[temp4]   \n\t"
    846     "shll.ph         %[temp6],    %[temp6],    1          \n\t"
    847     "addq.ph         %[temp9],    %[temp2],    %[temp6]   \n\t"
    848     "shll.ph         %[temp7],    %[temp7],    1          \n\t"
    849     "addq.ph         %[temp9],    %[temp9],    %[temp3]   \n\t"
    850     "shll.ph         %[temp8],    %[temp8],    1          \n\t"
    851     "shra_r.ph       %[temp9],    %[temp9],    2          \n\t"
    852     "addq.ph         %[temp10],   %[temp4],    %[temp7]   \n\t"
    853     "addq.ph         %[temp11],   %[temp5],    %[temp8]   \n\t"
    854     "addq.ph         %[temp10],   %[temp10],   %[temp2]   \n\t"
    855     "addq.ph         %[temp11],   %[temp11],   %[temp4]   \n\t"
    856     "shra_r.ph       %[temp10],   %[temp10],   2          \n\t"
    857     "shra_r.ph       %[temp11],   %[temp11],   2          \n\t"
            // Scalar path for the last value: twice the top byte of the second
            // word plus the sum of that word's two upper bytes, rounded and
            // shifted right by 2. NOTE(review): on little-endian this is
            // (top[6] + 3 * top[7] + 2) >> 2 -- confirm byte order.
    858     "srl             %[temp1],    %[temp1],    24         \n\t"
    859     "sll             %[temp1],    %[temp1],    1          \n\t"
    860     "raddu.w.qb      %[temp5],    %[temp5]                \n\t"
    861     "precr.qb.ph     %[temp9],    %[temp10],   %[temp9]   \n\t"
    862     "precr.qb.ph     %[temp10],   %[temp11],   %[temp10]  \n\t"
    863     "addu            %[temp1],    %[temp1],    %[temp5]   \n\t"
    864     "shra_r.w        %[temp1],    %[temp1],    2          \n\t"
    865     "usw             %[temp9],    0*" XSTR(BPS) "(%[dst]) \n\t"
    866     "usw             %[temp10],   2*" XSTR(BPS) "(%[dst]) \n\t"
            // Slide each row register by one byte, inserting the next value.
    867     "prepend         %[temp9],    %[temp11],   8          \n\t"
    868     "prepend         %[temp10],   %[temp1],    8          \n\t"
    869     "usw             %[temp9],    1*" XSTR(BPS) "(%[dst]) \n\t"
    870     "usw             %[temp10],   3*" XSTR(BPS) "(%[dst]) \n\t"
    871     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    872       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    873       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    874       [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
    875     : [top]"r"(top), [dst]"r"(dst)
    876     : "memory"
    877   );
    878 }
    879 
    // 4x4 directional predictor (presumably VP8's "vertical-left" mode --
    // confirm). Reads the words at top[0..3] and top[4..7], combines rounded
    // 2-tap averages (addqh_r.ph) and rounded 3-tap averages
    // ((a + 2 * b + c + 2) >> 2) of neighbouring samples, and writes four 4-byte
    // rows to `dst`, BPS bytes apart.
    // Rows 0 and 1 are stored first; rows 2 and 3 reuse those row registers,
    // shifted by one byte with one extra value inserted (`prepend ..., 8`).
    880 static void VL4(uint8_t* dst, const uint8_t* top) {
    881   int temp0, temp1, temp2, temp3, temp4;
    882   int temp5, temp6, temp7, temp8, temp9;
    883   __asm__ volatile (
    884     "ulw              %[temp0],   0(%[top])               \n\t"
    885     "ulw              %[temp1],   4(%[top])               \n\t"
            // Widen bytes to 16-bit lanes; the qbla/qbra forms take alternating
            // bytes of the word.
    886     "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
    887     "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
    888     "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
    889     "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
            // temp4 / temp5: rounded 2-tap averages.
    890     "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
    891     "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
    892     "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
    893     "shll.ph          %[temp9],   %[temp2],    1          \n\t"
    894     "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
    895     "shll.ph          %[temp8],   %[temp7],    1          \n\t"
            // temp2 / temp0 / temp6: 3-tap sums, rounded and shifted below.
    896     "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
    897     "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
    898     "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
    899     "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
    900     "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
    901     "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
    902     "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    903     "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
    904     "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
    905     "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    906     "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
            // Interleave and narrow the 16-bit results into byte rows.
    907     "precrq.ph.w      %[temp8],   %[temp5],    %[temp4]   \n\t"
    908     "append           %[temp5],   %[temp4],    16         \n\t"
    909     "precrq.ph.w      %[temp3],   %[temp2],    %[temp0]   \n\t"
    910     "append           %[temp2],   %[temp0],    16         \n\t"
    911     "precr.qb.ph      %[temp8],   %[temp8],    %[temp5]   \n\t"
    912     "precr.qb.ph      %[temp3],   %[temp3],    %[temp2]   \n\t"
    913     "usw              %[temp8],   0*" XSTR(BPS) "(%[dst]) \n\t"
            // Rows 2 and 3: shift rows 0 and 1 by one byte and insert the two
            // values held in temp6.
    914     "prepend          %[temp8],   %[temp6],    8          \n\t"
    915     "usw              %[temp3],   1*" XSTR(BPS) "(%[dst]) \n\t"
    916     "srl              %[temp6],   %[temp6],    16         \n\t"
    917     "prepend          %[temp3],   %[temp6],    8          \n\t"
    918     "usw              %[temp8],   2*" XSTR(BPS) "(%[dst]) \n\t"
    919     "usw              %[temp3],   3*" XSTR(BPS) "(%[dst]) \n\t"
    920     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    921       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    922       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    923       [temp9]"=&r"(temp9)
    924     : [top]"r"(top), [dst]"r"(dst)
    925     : "memory"
    926   );
    927 }
    928 
    // 4x4 directional predictor (presumably VP8's "horizontal-down" mode --
    // confirm). Reads the words at top[-5..-2] and top[-1..2], combines rounded
    // 2-tap averages (addqh_r.ph) and rounded 3-tap averages
    // ((a + 2 * b + c + 2) >> 2) of neighbouring samples, and writes four 4-byte
    // rows to `dst`, BPS bytes apart.
    // The arithmetic sequence is the same as in VL4 above; only the load offsets
    // and the final packing of the rows differ.
    929 static void HD4(uint8_t* dst, const uint8_t* top) {
    930   int temp0, temp1, temp2, temp3, temp4;
    931   int temp5, temp6, temp7, temp8, temp9;
    932   __asm__ volatile (
    933     "ulw              %[temp0],   -5(%[top])              \n\t"
    934     "ulw              %[temp1],   -1(%[top])              \n\t"
            // Widen bytes to 16-bit lanes; the qbla/qbra forms take alternating
            // bytes of the word.
    935     "preceu.ph.qbla   %[temp2],   %[temp0]                \n\t"
    936     "preceu.ph.qbra   %[temp0],   %[temp0]                \n\t"
    937     "preceu.ph.qbl    %[temp3],   %[temp1]                \n\t"
    938     "preceu.ph.qbr    %[temp1],   %[temp1]                \n\t"
            // temp4 / temp5: rounded 2-tap averages.
    939     "addqh_r.ph       %[temp4],   %[temp0],    %[temp2]   \n\t"
    940     "packrl.ph        %[temp7],   %[temp1],    %[temp0]   \n\t"
    941     "precrq.ph.w      %[temp6],   %[temp1],    %[temp2]   \n\t"
    942     "shll.ph          %[temp9],   %[temp2],    1          \n\t"
    943     "addqh_r.ph       %[temp5],   %[temp7],    %[temp2]   \n\t"
    944     "shll.ph          %[temp8],   %[temp7],    1          \n\t"
            // temp2 / temp0 / temp6: 3-tap sums, rounded and shifted below.
    945     "addu.ph          %[temp2],   %[temp2],    %[temp6]   \n\t"
    946     "addu.ph          %[temp0],   %[temp0],    %[temp7]   \n\t"
    947     "packrl.ph        %[temp7],   %[temp3],    %[temp1]   \n\t"
    948     "addu.ph          %[temp6],   %[temp1],    %[temp3]   \n\t"
    949     "addu.ph          %[temp2],   %[temp2],    %[temp8]   \n\t"
    950     "addu.ph          %[temp0],   %[temp0],    %[temp9]   \n\t"
    951     "shll.ph          %[temp7],   %[temp7],    1          \n\t"
    952     "shra_r.ph        %[temp2],   %[temp2],    2          \n\t"
    953     "shra_r.ph        %[temp0],   %[temp0],    2          \n\t"
    954     "addu.ph          %[temp6],   %[temp6],    %[temp7]   \n\t"
    955     "shra_r.ph        %[temp6],   %[temp6],    2          \n\t"
            // Rows 0 and 1 are packed from the upper halfwords of the results.
    956     "precrq.ph.w      %[temp1],   %[temp2],    %[temp5]   \n\t"
    957     "precrq.ph.w      %[temp3],   %[temp0],    %[temp4]   \n\t"
    958     "precr.qb.ph      %[temp7],   %[temp6],    %[temp1]   \n\t"
    959     "precr.qb.ph      %[temp6],   %[temp1],    %[temp3]   \n\t"
    960     "usw              %[temp7],   0*" XSTR(BPS) "(%[dst]) \n\t"
    961     "usw              %[temp6],   1*" XSTR(BPS) "(%[dst]) \n\t"
            // Rows 2 and 3 are packed from the lower halfwords.
    962     "append           %[temp2],   %[temp5],    16         \n\t"
    963     "append           %[temp0],   %[temp4],    16         \n\t"
    964     "precr.qb.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
    965     "precr.qb.ph      %[temp4],   %[temp2],    %[temp0]   \n\t"
    966     "usw              %[temp5],   2*" XSTR(BPS) "(%[dst]) \n\t"
    967     "usw              %[temp4],   3*" XSTR(BPS) "(%[dst]) \n\t"
    968     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    969       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    970       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
    971       [temp9]"=&r"(temp9)
    972     : [top]"r"(top), [dst]"r"(dst)
    973     : "memory"
    974   );
    975 }
    976 
    // 4x4 directional predictor (presumably VP8's "horizontal-up" mode --
    // confirm). Reads only the word at top[-5..-2], combines rounded 2-tap
    // averages (addqh_r.ph) and rounded 3-tap averages
    // ((a + 2 * b + c + 2) >> 2) of those samples, and writes four 4-byte rows to
    // `dst`, BPS bytes apart.
    // Row 3 is a single input byte replicated across the row (temp7).
    977 static void HU4(uint8_t* dst, const uint8_t* top) {
    978   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    979   __asm__ volatile (
    980     "ulw             %[temp0],   -5(%[top])              \n\t"
            // Widen the four loaded bytes to 16-bit lanes.
    981     "preceu.ph.qbl   %[temp1],   %[temp0]                \n\t"
    982     "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
    983     "packrl.ph       %[temp3],   %[temp1],    %[temp2]   \n\t"
            // Low byte of temp2 replicated into all four lanes: this is row 3.
    984     "replv.qb        %[temp7],   %[temp2]                \n\t"
            // temp4 / temp5: rounded 2-tap averages.
    985     "addqh_r.ph      %[temp4],   %[temp1],    %[temp3]   \n\t"
    986     "addqh_r.ph      %[temp5],   %[temp3],    %[temp2]   \n\t"
            // temp6 / temp0: 3-tap sums, rounded and shifted below.
    987     "shll.ph         %[temp6],   %[temp3],    1          \n\t"
    988     "addu.ph         %[temp3],   %[temp2],    %[temp3]   \n\t"
    989     "addu.ph         %[temp6],   %[temp1],    %[temp6]   \n\t"
    990     "shll.ph         %[temp0],   %[temp2],    1          \n\t"
    991     "addu.ph         %[temp6],   %[temp6],    %[temp2]   \n\t"
    992     "addu.ph         %[temp0],   %[temp3],    %[temp0]   \n\t"
    993     "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
    994     "shra_r.ph       %[temp0],   %[temp0],    2          \n\t"
            // Interleave and narrow the 16-bit results into byte rows.
    995     "packrl.ph       %[temp3],   %[temp6],    %[temp5]   \n\t"
    996     "precrq.ph.w     %[temp2],   %[temp6],    %[temp4]   \n\t"
    997     "append          %[temp0],   %[temp5],    16         \n\t"
    998     "precr.qb.ph     %[temp3],   %[temp3],    %[temp2]   \n\t"
    999     "usw             %[temp3],   0*" XSTR(BPS) "(%[dst]) \n\t"
   1000     "precr.qb.ph     %[temp1],   %[temp7],    %[temp0]   \n\t"
   1001     "usw             %[temp7],   3*" XSTR(BPS) "(%[dst]) \n\t"
            // Row 1 is assembled from halves of the row-2 and row-0 registers.
   1002     "packrl.ph       %[temp2],   %[temp1],    %[temp3]   \n\t"
   1003     "usw             %[temp1],   2*" XSTR(BPS) "(%[dst]) \n\t"
   1004     "usw             %[temp2],   1*" XSTR(BPS) "(%[dst]) \n\t"
   1005     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
   1006       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
   1007       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
   1008     : [top]"r"(top), [dst]"r"(dst)
   1009     : "memory"
   1010   );
   1011 }
   1012 
   1013 //------------------------------------------------------------------------------
   1014 // Chroma 8x8 prediction (paragraph 12.2)
   1015 
   1016 static void IntraChromaPreds_MIPSdspR2(uint8_t* dst, const uint8_t* left,
   1017                                        const uint8_t* top) {
          // Writes the four 8x8 chroma predictions (DC, vertical, horizontal,
          // TrueMotion) at offsets C8DC8 / C8VE8 / C8HE8 / C8TM8 from 'dst', first
          // for the U block and then for the V block.
          // 'top' and 'left' may be NULL: they are tested before being advanced
          // and are handed to the helpers as-is, so any NULL handling lives in
          // the helpers (defined outside this excerpt).
   1018   // U block
   1019   DCMode8(C8DC8 + dst, left, top);
   1020   VerticalPred8(C8VE8 + dst, top);
   1021   HorizontalPred8(C8HE8 + dst, left);
   1022   TrueMotion8(C8TM8 + dst, left, top);
   1023   // V block
          // The V data is taken 8 bytes further in 'dst' and 'top', and 16 bytes
          // further in 'left'.
   1024   dst += 8;
   1025   if (top) top += 8;
   1026   if (left) left += 16;
   1027   DCMode8(C8DC8 + dst, left, top);
   1028   VerticalPred8(C8VE8 + dst, top);
   1029   HorizontalPred8(C8HE8 + dst, left);
   1030   TrueMotion8(C8TM8 + dst, left, top);
   1031 }
   1032 
   1033 //------------------------------------------------------------------------------
   1034 // luma 16x16 prediction (paragraph 12.3)
   1035 
   1036 static void Intra16Preds_MIPSdspR2(uint8_t* dst,
   1037                                    const uint8_t* left, const uint8_t* top) {
          // Writes the four 16x16 luma predictions (DC, vertical, horizontal,
          // TrueMotion) at offsets I16DC16 / I16VE16 / I16HE16 / I16TM16 from
          // 'dst'. 'left' and 'top' are forwarded unchanged to the helpers, which
          // are defined outside this excerpt.
   1038   DCMode16(I16DC16 + dst, left, top);
   1039   VerticalPred16(I16VE16 + dst, top);
   1040   HorizontalPred16(I16HE16 + dst, left);
   1041   TrueMotion16(I16TM16 + dst, left, top);
   1042 }
   1043 
   1044 // Left samples are top[-5 .. -2], top_left is top[-1], top are
   1045 // located at top[0..3], and top right is top[4..7]
        // Runs all ten 4x4 luma predictors; each one writes its result at its own
        // offset (I4DC4, I4TM4, ..., I4HU4) from 'dst' and reads its samples
        // relative to 'top' using the layout above.
   1046 static void Intra4Preds_MIPSdspR2(uint8_t* dst, const uint8_t* top) {
   1047   DC4(I4DC4 + dst, top);
   1048   TM4(I4TM4 + dst, top);
   1049   VE4(I4VE4 + dst, top);
   1050   HE4(I4HE4 + dst, top);
   1051   RD4(I4RD4 + dst, top);
   1052   VR4(I4VR4 + dst, top);
   1053   LD4(I4LD4 + dst, top);
   1054   VL4(I4VL4 + dst, top);
   1055   HD4(I4HD4 + dst, top);
   1056   HU4(I4HU4 + dst, top);
   1057 }
   1058 
   1059 //------------------------------------------------------------------------------
   1060 // Metric
   1061 
   1062 #if !defined(WORK_AROUND_GCC)
   1063 
   1064 #define GET_SSE_INNER(A)                                                  \
   1065   "lw               %[temp0],    " #A "(%[a])                  \n\t"      \
   1066   "lw               %[temp1],    " #A "(%[b])                  \n\t"      \
   1067   "preceu.ph.qbr    %[temp2],    %[temp0]                      \n\t"      \
   1068   "preceu.ph.qbl    %[temp0],    %[temp0]                      \n\t"      \
   1069   "preceu.ph.qbr    %[temp3],    %[temp1]                      \n\t"      \
   1070   "preceu.ph.qbl    %[temp1],    %[temp1]                      \n\t"      \
   1071   "subq.ph          %[temp2],    %[temp2],    %[temp3]         \n\t"      \
   1072   "subq.ph          %[temp0],    %[temp0],    %[temp1]         \n\t"      \
   1073   "dpa.w.ph         $ac0,        %[temp2],    %[temp2]         \n\t"      \
   1074   "dpa.w.ph         $ac0,        %[temp0],    %[temp0]         \n\t"
   1075
        // GET_SSE_INNER(A) (above): loads one 32-bit word from a + A and from
        // b + A, widens the 4 + 4 bytes to halfwords, subtracts them pairwise and
        // adds the four squared differences to accumulator $ac0 (two dpa.w.ph).
        // It overwrites temp0..temp3. The offset A is pasted into the assembly
        // text, so it must be a constant expression the assembler can evaluate.
        // NOTE(review): plain 'lw' is used rather than 'ulw', so a + A and b + A
        // are presumably expected to be 4-byte aligned -- confirm against callers.
        //
        // GET_SSE(A, B, C, D): four GET_SSE_INNER expansions, i.e. 16 pixels.
   1076 #define GET_SSE(A, B, C, D)               \
   1077   GET_SSE_INNER(A)                        \
   1078   GET_SSE_INNER(B)                        \
   1079   GET_SSE_INNER(C)                        \
   1080   GET_SSE_INNER(D)
   1081 
   1082 static int SSE16x16_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
          // Sum of squared differences between 'a' and 'b' over 16 rows of 16
          // bytes, with rows BPS bytes apart. The sum is built in accumulator
          // $ac0 and its low 32 bits are returned.
   1083   int count;
   1084   int temp0, temp1, temp2, temp3;
   1085   __asm__ volatile (
            // mult of $zero by $zero clears the accumulator before summing.
   1086     "mult   $zero,    $zero                            \n\t"
            // One GET_SSE per row: four words at column offsets 0, 4, 8 and 12.
   1087     GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
   1088     GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
   1089     GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
   1090     GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
   1091     GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
   1092     GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
   1093     GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
   1094     GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
   1095     GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS)
   1096     GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS)
   1097     GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
   1098     GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
   1099     GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
   1100     GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
   1101     GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
   1102     GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
            // Only the low word of the accumulator is read back.
   1103     "mflo   %[count]                                   \n\t"
   1104     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
   1105       [temp3]"=&r"(temp3), [count]"=&r"(count)
   1106     : [a]"r"(a), [b]"r"(b)
   1107     : "memory", "hi", "lo"
   1108   );
   1109   return count;
   1110 }
   1111 
   1112 static int SSE16x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
          // Sum of squared differences between 'a' and 'b' over 8 rows of 16
          // bytes, with rows BPS bytes apart. The sum is built in accumulator
          // $ac0 (cleared first by the mult) and its low 32 bits are returned.
   1113   int count;
   1114   int temp0, temp1, temp2, temp3;
   1115   __asm__ volatile (
   1116     "mult   $zero,    $zero                            \n\t"
            // One GET_SSE per row: four words at column offsets 0, 4, 8 and 12.
   1117     GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS)
   1118     GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS)
   1119     GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS)
   1120     GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS)
   1121     GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS)
   1122     GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS)
   1123     GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS)
   1124     GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS)
   1125     "mflo   %[count]                                   \n\t"
   1126     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
   1127       [temp3]"=&r"(temp3), [count]"=&r"(count)
   1128     : [a]"r"(a), [b]"r"(b)
   1129     : "memory", "hi", "lo"
   1130   );
   1131   return count;
   1132 }
   1133 
   1134 static int SSE8x8_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
          // Sum of squared differences between 'a' and 'b' over 8 rows of 8
          // bytes, with rows BPS bytes apart. The sum is built in accumulator
          // $ac0 (cleared first by the mult) and its low 32 bits are returned.
   1135   int count;
   1136   int temp0, temp1, temp2, temp3;
   1137   __asm__ volatile (
   1138     "mult   $zero,    $zero                            \n\t"
            // Each GET_SSE covers two rows: words at column offsets 0 and 4.
   1139     GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
   1140     GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
   1141     GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
   1142     GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
   1143     "mflo   %[count]                                   \n\t"
   1144     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
   1145       [temp3]"=&r"(temp3), [count]"=&r"(count)
   1146     : [a]"r"(a), [b]"r"(b)
   1147     : "memory", "hi", "lo"
   1148   );
   1149   return count;
   1150 }
   1151 
   1152 static int SSE4x4_MIPSdspR2(const uint8_t* a, const uint8_t* b) {
          // Sum of squared differences between 'a' and 'b' over 4 rows of 4
          // bytes, with rows BPS bytes apart. The sum is built in accumulator
          // $ac0 (cleared first by the mult) and its low 32 bits are returned.
   1153   int count;
   1154   int temp0, temp1, temp2, temp3;
   1155   __asm__ volatile (
   1156     "mult   $zero,    $zero                            \n\t"
            // A single GET_SSE covers all four rows, one word per row.
   1157     GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
   1158     "mflo   %[count]                                   \n\t"
   1159     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
   1160       [temp3]"=&r"(temp3), [count]"=&r"(count)
   1161     : [a]"r"(a), [b]"r"(b)
   1162     : "memory", "hi", "lo"
   1163   );
   1164   return count;
   1165 }
   1166 
   1167 #undef GET_SSE
   1168 #undef GET_SSE_INNER
   1169 
   1170 #endif  // !WORK_AROUND_GCC
   1171 
   1172 #undef FILL_8_OR_16
   1173 #undef FILL_PART
   1174 #undef OUTPUT_EARLY_CLOBBER_REGS_17
   1175 #undef MUL_HALF
   1176 #undef ABS_X8
   1177 #undef ADD_SUB_HALVES_X4
        // None of the helper macros above is used below this point.
        // ADD_SUB_HALVES_X4 is defined near the top of the file; the other five
        // are presumably defined in the part of the file preceding this section.
   1178 
   1179 //------------------------------------------------------------------------------
   1180 // Quantization
   1181 //
   1182 
   1183 // Macro for one step of the QuantizeBlock loop; it handles two adjacent
   1184 // coefficients at a time, with the QUANTDIV macro inlined.
   1185 // J  - byte offset of the pair in in/sharpen/iq/q (kZigzag[n] * 2)
   1186 // K  - byte offset of the pair in the 32-bit zthresh/bias tables (kZigzag[n] * 4)
   1187 // N  - byte offset in 'out' for the first coefficient of the pair (n * 2)
   1188 // N1 - byte offset in 'out' for the second coefficient of the pair; as the
        //      QuantizeBlock_MIPSdspR2 call sites show, this is not always N + 2.
        //
        // The first coefficient (offset J) is handled as the low halfword of the
        // loaded word and the second (offset J + 2) as the high halfword.
        // |coeff| + sharpen is compared against zthresh for both at once, then one
        // of four paths runs:
        //   fall-through: only the first coefficient exceeds its threshold,
        //   2:            only the second coefficient exceeds its threshold,
        //   1:            both do (quantized together with paired-halfword ops),
        //   0:            neither does; both 'out' entries and both 'in' entries
        //                 are zeroed.
        // A coefficient that passes gets level = (coeff * iq + bias) >> 17, capped
        // at the maximum level, with its sign restored; 'out' receives level, 'in'
        // receives level * q, and level is OR-ed into 'ret'.
        // In path 1 the cap is selected from max_level1, which holds the maximum
        // in BOTH halfwords: pick.ph takes each halfword from the matching half of
        // its source, and the upper half of max_level is zero.
        // Labels are numeric local labels so the macro can be expanded repeatedly.
   1189 #define QUANTIZE_ONE(J, K, N, N1)                                         \
   1190   "ulw         %[temp1],     " #J "(%[ppin])                 \n\t"        \
   1191   "ulw         %[temp2],     " #J "(%[ppsharpen])            \n\t"        \
   1192   "lhu         %[temp3],     " #K "(%[ppzthresh])            \n\t"        \
   1193   "lhu         %[temp6],     " #K "+4(%[ppzthresh])          \n\t"        \
   1194   "absq_s.ph   %[temp4],     %[temp1]                        \n\t"        \
   1195   "ins         %[temp3],     %[temp6],         16,       16  \n\t"        \
   1196   "addu.ph     %[coeff],     %[temp4],         %[temp2]      \n\t"        \
   1197   "shra.ph     %[sign],      %[temp1],         15            \n\t"        \
   1198   "li          %[level],     0x10001                         \n\t"        \
   1199   "cmp.lt.ph   %[temp3],     %[coeff]                        \n\t"        \
   1200   "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
   1201   "pick.ph     %[temp5],     %[level],         $0            \n\t"        \
   1202   "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
   1203   "beqz        %[temp5],     0f                              \n\t"        \
   1204   "lhu         %[temp3],     " #J "(%[ppq])                  \n\t"        \
   1205   "beq         %[temp5],     %[level],         1f            \n\t"        \
   1206   "andi        %[temp5],     %[temp5],         0x1           \n\t"        \
   1207   "andi        %[temp4],     %[coeff],         0xffff        \n\t"        \
   1208   "beqz        %[temp5],     2f                              \n\t"        \
   1209   "mul         %[level],     %[temp4],         %[temp1]      \n\t"        \
   1210   "sh          $0,           " #J "+2(%[ppin])               \n\t"        \
   1211   "sh          $0,           " #N1 "(%[pout])                \n\t"        \
   1212   "addu        %[level],     %[level],         %[temp2]      \n\t"        \
   1213   "sra         %[level],     %[level],         17            \n\t"        \
   1214   "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
   1215   "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
   1216   "andi        %[temp6],     %[sign],          0xffff        \n\t"        \
   1217   "xor         %[level],     %[level],         %[temp6]      \n\t"        \
   1218   "subu        %[level],     %[level],         %[temp6]      \n\t"        \
   1219   "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
   1220   "or          %[ret],       %[ret],           %[level]      \n\t"        \
   1221   "sh          %[level],     " #N "(%[pout])                 \n\t"        \
   1222   "sh          %[temp5],     " #J "(%[ppin])                 \n\t"        \
   1223   "j           3f                                            \n\t"        \
   1224 "2:                                                          \n\t"        \
   1225   "lhu         %[temp1],     " #J "+2(%[ppiq])               \n\t"        \
   1226   "srl         %[temp5],     %[coeff],         16            \n\t"        \
   1227   "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
   1228   "lw          %[temp2],     " #K "+4(%[ppbias])             \n\t"        \
   1229   "lhu         %[temp3],     " #J "+2(%[ppq])                \n\t"        \
   1230   "addu        %[level],     %[level],         %[temp2]      \n\t"        \
   1231   "sra         %[level],     %[level],         17            \n\t"        \
   1232   "srl         %[temp6],     %[sign],          16            \n\t"        \
   1233   "slt         %[temp4],     %[max_level],     %[level]      \n\t"        \
   1234   "movn        %[level],     %[max_level],     %[temp4]      \n\t"        \
   1235   "xor         %[level],     %[level],         %[temp6]      \n\t"        \
   1236   "subu        %[level],     %[level],         %[temp6]      \n\t"        \
   1237   "mul         %[temp5],     %[level],         %[temp3]      \n\t"        \
   1238   "sh          $0,           " #J "(%[ppin])                 \n\t"        \
   1239   "sh          $0,           " #N "(%[pout])                 \n\t"        \
   1240   "or          %[ret],       %[ret],           %[level]      \n\t"        \
   1241   "sh          %[temp5],     " #J "+2(%[ppin])               \n\t"        \
   1242   "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
   1243   "j           3f                                            \n\t"        \
   1244 "1:                                                          \n\t"        \
   1245   "lhu         %[temp1],     " #J "(%[ppiq])                 \n\t"        \
   1246   "lw          %[temp2],     " #K "(%[ppbias])               \n\t"        \
   1247   "ulw         %[temp3],     " #J "(%[ppq])                  \n\t"        \
   1248   "andi        %[temp5],     %[coeff],         0xffff        \n\t"        \
   1249   "srl         %[temp0],     %[coeff],         16            \n\t"        \
   1250   "lhu         %[temp6],     " #J "+2(%[ppiq])               \n\t"        \
   1251   "lw          %[coeff],     " #K "+4(%[ppbias])             \n\t"        \
   1252   "mul         %[level],     %[temp5],         %[temp1]      \n\t"        \
   1253   "mul         %[temp4],     %[temp0],         %[temp6]      \n\t"        \
   1254   "addu        %[level],     %[level],         %[temp2]      \n\t"        \
   1255   "addu        %[temp4],     %[temp4],         %[coeff]      \n\t"        \
   1256   "precrq.ph.w %[level],     %[temp4],         %[level]      \n\t"        \
   1257   "shra.ph     %[level],     %[level],         1             \n\t"        \
   1258   "cmp.lt.ph   %[max_level1],%[level]                        \n\t"        \
   1259   "pick.ph     %[level],     %[max_level1],    %[level]      \n\t"        \
   1260   "xor         %[level],     %[level],         %[sign]       \n\t"        \
   1261   "subu.ph     %[level],     %[level],         %[sign]       \n\t"        \
   1262   "mul.ph      %[temp3],     %[level],         %[temp3]      \n\t"        \
   1263   "or          %[ret],       %[ret],           %[level]      \n\t"        \
   1264   "sh          %[level],     " #N "(%[pout])                 \n\t"        \
   1265   "srl         %[level],     %[level],         16            \n\t"        \
   1266   "sh          %[level],     " #N1 "(%[pout])                \n\t"        \
   1267   "usw         %[temp3],     " #J "(%[ppin])                 \n\t"        \
   1268   "j           3f                                            \n\t"        \
   1269 "0:                                                          \n\t"        \
   1270   "sh          $0,           " #N "(%[pout])                 \n\t"        \
   1271   "sh          $0,           " #N1 "(%[pout])                \n\t"        \
   1272   "usw         $0,           " #J "(%[ppin])                 \n\t"        \
   1273 "3:                                                          \n\t"
   1274 
   1275 static int QuantizeBlock_MIPSdspR2(int16_t in[16], int16_t out[16],
   1276                                    const VP8Matrix* const mtx) {
          // Quantizes the 16 coefficients of 'in' using the tables in 'mtx'
          // (sharpen_, zthresh_, q_, iq_, bias_), two coefficients per
          // QUANTIZE_ONE expansion.
          // 'out' receives the quantized levels and 'in' is overwritten in place
          // with level * q (or 0).
          // Returns 1 if any stored level is non-zero, 0 otherwise: every level
          // is OR-ed into 'ret'.
   1277   int temp0, temp1, temp2, temp3, temp4, temp5,temp6;
   1278   int sign, coeff, level;
   1279   int max_level = MAX_LEVEL;
   1280   int max_level1 = max_level << 16 | max_level;  // max_level in both halfwords
   1281   int ret = 0;
   1282
   1283   int16_t* ppin             = &in[0];
   1284   int16_t* pout             = &out[0];
   1285   const uint16_t* ppsharpen = &mtx->sharpen_[0];
   1286   const uint32_t* ppzthresh = &mtx->zthresh_[0];
   1287   const uint16_t* ppq       = &mtx->q_[0];
   1288   const uint16_t* ppiq      = &mtx->iq_[0];
   1289   const uint32_t* ppbias    = &mtx->bias_[0];
   1290
   1291   __asm__ volatile (
            // Arguments are byte offsets: (pair in in/sharpen/iq/q,
            // pair in zthresh/bias, first result in out, second result in out).
            // The 'in' offsets advance linearly; the 'out' offsets do not.
   1292     QUANTIZE_ONE( 0,  0,  0,  2)
   1293     QUANTIZE_ONE( 4,  8, 10, 12)
   1294     QUANTIZE_ONE( 8, 16,  4,  8)
   1295     QUANTIZE_ONE(12, 24, 14, 24)
   1296     QUANTIZE_ONE(16, 32,  6, 16)
   1297     QUANTIZE_ONE(20, 40, 22, 26)
   1298     QUANTIZE_ONE(24, 48, 18, 20)
   1299     QUANTIZE_ONE(28, 56, 28, 30)
   1300
   1301     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
   1302       [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
   1303       [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
   1304       [sign]"=&r"(sign), [coeff]"=&r"(coeff),
   1305       [level]"=&r"(level), [temp6]"=&r"(temp6), [ret]"+&r"(ret)
   1306     : [ppin]"r"(ppin), [pout]"r"(pout), [max_level1]"r"(max_level1),
   1307       [ppiq]"r"(ppiq), [max_level]"r"(max_level),
   1308       [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
   1309       [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
   1310     : "memory", "hi", "lo"
   1311   );
   1312
   1313   return (ret != 0);
   1314 }
   1315 
   1316 static int Quantize2Blocks_MIPSdspR2(int16_t in[32], int16_t out[32],
   1317                                      const VP8Matrix* const mtx) {
          // Quantizes two consecutive 16-coefficient blocks with the same matrix.
          // Bit 0 of the result is the first block's QuantizeBlock_MIPSdspR2
          // return value, bit 1 is the second block's.
   1318   int nz;
   1319   nz  = QuantizeBlock_MIPSdspR2(in + 0 * 16, out + 0 * 16, mtx) << 0;
   1320   nz |= QuantizeBlock_MIPSdspR2(in + 1 * 16, out + 1 * 16, mtx) << 1;
   1321   return nz;
   1322 }
   1323 
   1324 #undef QUANTIZE_ONE
   1325 
   1326 // Macro for one horizontal pass in FTransformWHT.
   1327 // temp0..temp7 hold tmp[0]..tmp[15] (two 16-bit values per register).
   1328 // A, B, C, D - offsets in bytes to load from in buffer
   1329 // TEMP0, TEMP1 - registers for corresponding tmp elements
        // With a, b, c, d the 16-bit values loaded from offsets A, B, C, D, the
        // macro leaves (high halfword : low halfword):
        //   TEMP0 = (a + b - c - d) : (a + b + c + d)
        //   TEMP1 = (a - b + c - d) : (a - b - c + d)
        // temp8 and temp9 are used as scratch registers.
   1330 #define HORIZONTAL_PASS_WHT(A, B, C, D, TEMP0, TEMP1)                          \
   1331   "lh              %[" #TEMP0 "],  " #A "(%[in])            \n\t"              \
   1332   "lh              %[" #TEMP1 "],  " #B "(%[in])            \n\t"              \
   1333   "lh              %[temp8],     " #C "(%[in])              \n\t"              \
   1334   "lh              %[temp9],     " #D "(%[in])              \n\t"              \
   1335   "ins             %[" #TEMP1 "],  %[" #TEMP0 "],  16,  16  \n\t"              \
   1336   "ins             %[temp9],     %[temp8],     16,  16      \n\t"              \
   1337   "subq.ph         %[temp8],     %[" #TEMP1 "],  %[temp9]   \n\t"              \
   1338   "addq.ph         %[temp9],     %[" #TEMP1 "],  %[temp9]   \n\t"              \
   1339   "precrq.ph.w     %[" #TEMP0 "],  %[temp8],     %[temp9]   \n\t"              \
   1340   "append          %[temp8],     %[temp9],     16           \n\t"              \
   1341   "subq.ph         %[" #TEMP1 "],  %[" #TEMP0 "],  %[temp8] \n\t"              \
   1342   "addq.ph         %[" #TEMP0 "],  %[" #TEMP0 "],  %[temp8] \n\t"              \
   1343   "rotr            %[" #TEMP1 "],  %[" #TEMP1 "],  16       \n\t"
   1344 
   1345 // Macro for one vertical pass in FTransformWHT.
   1346 // temp0..temp7 hold tmp[0]..tmp[15] (two 16-bit values per register).
   1347 // A, B, C, D - offsets in bytes to store to out buffer
   1348 // TEMP0, TEMP2, TEMP4 and TEMP6 - registers for corresponding tmp elements
        // Works on both halfwords of each register at once. With t0, t2, t4, t6 the
        // incoming values of TEMP0, TEMP2, TEMP4, TEMP6, the words stored are:
        //   A: ((t0 + t4) + (t2 + t6)) >> 1
        //   B: ((t0 - t4) + (t2 - t6)) >> 1
        //   C: ((t0 - t4) - (t2 - t6)) >> 1
        //   D: ((t0 + t4) - (t2 + t6)) >> 1
        // The halving is done by addqh.ph / subqh.ph. All four TEMP registers are
        // overwritten; temp8 and temp9 are scratch.
   1349 #define VERTICAL_PASS_WHT(A, B, C, D, TEMP0, TEMP2, TEMP4, TEMP6)              \
   1350   "addq.ph         %[temp8],     %[" #TEMP0 "],  %[" #TEMP4 "]    \n\t"        \
   1351   "addq.ph         %[temp9],     %[" #TEMP2 "],  %[" #TEMP6 "]    \n\t"        \
   1352   "subq.ph         %[" #TEMP2 "],  %[" #TEMP2 "],  %[" #TEMP6 "]  \n\t"        \
   1353   "subq.ph         %[" #TEMP6 "],  %[" #TEMP0 "],  %[" #TEMP4 "]  \n\t"        \
   1354   "addqh.ph        %[" #TEMP0 "],  %[temp8],     %[temp9]         \n\t"        \
   1355   "subqh.ph        %[" #TEMP4 "],  %[" #TEMP6 "],  %[" #TEMP2 "]  \n\t"        \
   1356   "addqh.ph        %[" #TEMP2 "],  %[" #TEMP2 "],  %[" #TEMP6 "]  \n\t"        \
   1357   "subqh.ph        %[" #TEMP6 "],  %[temp8],     %[temp9]         \n\t"        \
   1358   "usw             %[" #TEMP0 "],  " #A "(%[out])                 \n\t"        \
   1359   "usw             %[" #TEMP2 "],  " #B "(%[out])                 \n\t"        \
   1360   "usw             %[" #TEMP4 "],  " #C "(%[out])                 \n\t"        \
   1361   "usw             %[" #TEMP6 "],  " #D "(%[out])                 \n\t"
   1362 
   1363 static void FTransformWHT_MIPSdspR2(const int16_t* in, int16_t* out) {
          // Forward WHT (per the macro names) of 16 input values into 16 outputs.
          // Inputs are read at byte offsets 0, 32, 64, ..., 480 from 'in', i.e.
          // every 16th int16_t. Outputs are written as eight 32-bit words covering
          // byte offsets 0..31 of 'out'.
   1364   int temp0, temp1, temp2, temp3, temp4;
   1365   int temp5, temp6, temp7, temp8, temp9;
   1366
   1367   __asm__ volatile (
            // Four horizontal passes fill temp0..temp7 with the 16 intermediates.
   1368     HORIZONTAL_PASS_WHT(  0,  32,  64,  96, temp0, temp1)
   1369     HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3)
   1370     HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5)
   1371     HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7)
            // Two vertical passes, each handling two columns packed per register.
   1372     VERTICAL_PASS_WHT(0,  8, 16, 24, temp0, temp2, temp4, temp6)
   1373     VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7)
   1374     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
   1375       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
   1376       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
   1377       [temp9]"=&r"(temp9)
   1378     : [in]"r"(in), [out]"r"(out)
   1379     : "memory"
   1380   );
   1381 }
   1382 
   1383 #undef VERTICAL_PASS_WHT
   1384 #undef HORIZONTAL_PASS_WHT
   1385 
   1386 // Macro for converting coefficients to histogram bins.
   1387 // Converts 8 coefficients at a time (four 32-bit words of two int16 each).
   1388 // A, B, C, D - offsets in bytes to load from out buffer
        // Per coefficient: bin = min(abs(coeff) >> 3, 31). The clamp is done by a
        // saturating left shift by 10 followed by a logical right shift by 10.
        // The bin is then multiplied by 4 (shll.ph 2) to form a byte offset from
        // 'dist', and the 32-bit counter at that address is incremented
        // (lw / addiu / sw). Uses temp0..temp8.
   1389 #define CONVERT_COEFFS_TO_BIN(A, B, C, D)                                      \
   1390   "ulw        %[temp0],  " #A "(%[out])                \n\t"                   \
   1391   "ulw        %[temp1],  " #B "(%[out])                \n\t"                   \
   1392   "ulw        %[temp2],  " #C "(%[out])                \n\t"                   \
   1393   "ulw        %[temp3],  " #D "(%[out])                \n\t"                   \
   1394   "absq_s.ph  %[temp0],  %[temp0]                      \n\t"                   \
   1395   "absq_s.ph  %[temp1],  %[temp1]                      \n\t"                   \
   1396   "absq_s.ph  %[temp2],  %[temp2]                      \n\t"                   \
   1397   "absq_s.ph  %[temp3],  %[temp3]                      \n\t"                   \
   1398   "shra.ph    %[temp0],  %[temp0],    3                \n\t"                   \
   1399   "shra.ph    %[temp1],  %[temp1],    3                \n\t"                   \
   1400   "shra.ph    %[temp2],  %[temp2],    3                \n\t"                   \
   1401   "shra.ph    %[temp3],  %[temp3],    3                \n\t"                   \
   1402   "shll_s.ph  %[temp0],  %[temp0],    10               \n\t"                   \
   1403   "shll_s.ph  %[temp1],  %[temp1],    10               \n\t"                   \
   1404   "shll_s.ph  %[temp2],  %[temp2],    10               \n\t"                   \
   1405   "shll_s.ph  %[temp3],  %[temp3],    10               \n\t"                   \
   1406   "shrl.ph    %[temp0],  %[temp0],    10               \n\t"                   \
   1407   "shrl.ph    %[temp1],  %[temp1],    10               \n\t"                   \
   1408   "shrl.ph    %[temp2],  %[temp2],    10               \n\t"                   \
   1409   "shrl.ph    %[temp3],  %[temp3],    10               \n\t"                   \
   1410   "shll.ph    %[temp0],  %[temp0],    2                \n\t"                   \
   1411   "shll.ph    %[temp1],  %[temp1],    2                \n\t"                   \
   1412   "shll.ph    %[temp2],  %[temp2],    2                \n\t"                   \
   1413   "shll.ph    %[temp3],  %[temp3],    2                \n\t"                   \
   1414   "ext        %[temp4],  %[temp0],    0,       16      \n\t"                   \
   1415   "ext        %[temp0],  %[temp0],    16,      16      \n\t"                   \
   1416   "addu       %[temp4],  %[temp4],    %[dist]          \n\t"                   \
   1417   "addu       %[temp0],  %[temp0],    %[dist]          \n\t"                   \
   1418   "ext        %[temp5],  %[temp1],    0,       16      \n\t"                   \
   1419   "lw         %[temp8],  0(%[temp4])                   \n\t"                   \
   1420   "ext        %[temp1],  %[temp1],    16,      16      \n\t"                   \
   1421   "addu       %[temp5],  %[temp5],    %[dist]          \n\t"                   \
   1422   "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
   1423   "sw         %[temp8],  0(%[temp4])                   \n\t"                   \
   1424   "lw         %[temp8],  0(%[temp0])                   \n\t"                   \
   1425   "addu       %[temp1],  %[temp1],    %[dist]          \n\t"                   \
   1426   "ext        %[temp6],  %[temp2],    0,       16      \n\t"                   \
   1427   "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
   1428   "sw         %[temp8],  0(%[temp0])                   \n\t"                   \
   1429   "lw         %[temp8],  0(%[temp5])                   \n\t"                   \
   1430   "ext        %[temp2],  %[temp2],    16,      16      \n\t"                   \
   1431   "addu       %[temp6],  %[temp6],    %[dist]          \n\t"                   \
   1432   "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
   1433   "sw         %[temp8],  0(%[temp5])                   \n\t"                   \
   1434   "lw         %[temp8],  0(%[temp1])                   \n\t"                   \
   1435   "addu       %[temp2],  %[temp2],    %[dist]          \n\t"                   \
   1436   "ext        %[temp7],  %[temp3],    0,       16      \n\t"                   \
   1437   "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
   1438   "sw         %[temp8],  0(%[temp1])                   \n\t"                   \
   1439   "lw         %[temp8],  0(%[temp6])                   \n\t"                   \
   1440   "ext        %[temp3],  %[temp3],    16,      16      \n\t"                   \
   1441   "addu       %[temp7],  %[temp7],    %[dist]          \n\t"                   \
   1442   "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
   1443   "sw         %[temp8],  0(%[temp6])                   \n\t"                   \
   1444   "lw         %[temp8],  0(%[temp2])                   \n\t"                   \
   1445   "addu       %[temp3],  %[temp3],    %[dist]          \n\t"                   \
   1446   "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
   1447   "sw         %[temp8],  0(%[temp2])                   \n\t"                   \
   1448   "lw         %[temp8],  0(%[temp7])                   \n\t"                   \
   1449   "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
   1450   "sw         %[temp8],  0(%[temp7])                   \n\t"                   \
   1451   "lw         %[temp8],  0(%[temp3])                   \n\t"                   \
   1452   "addiu      %[temp8],  %[temp8],    1                \n\t"                   \
   1453   "sw         %[temp8],  0(%[temp3])                   \n\t"
   1454 
   1455 static void CollectHistogram_MIPSdspR2(const uint8_t* ref, const uint8_t* pred,
   1456                                        int start_block, int end_block,
   1457                                        VP8Histogram* const histo) {
          // For each block j in [start_block, end_block): runs VP8FTransform on
          // ref/pred at offset VP8DspScan[j], then bins the 16 resulting
          // coefficients into the local 'distribution' counters. The counters are
          // finally handed to VP8SetHistogramData together with 'histo'.
   1458   int j;
   1459   int distribution[MAX_COEFF_THRESH + 1] = { 0 };
          // NOTE(review): max_coeff is passed to the asm statement below as an
          // input operand, but CONVERT_COEFFS_TO_BIN never references it.
   1460   const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
   1461   for (j = start_block; j < end_block; ++j) {
   1462     int16_t out[16];
   1463     int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
   1464
   1465     VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
   1466
   1467     // Convert coefficients to bin.
            // Two expansions of 8 coefficients each cover out[0..15].
   1468     __asm__ volatile (
   1469       CONVERT_COEFFS_TO_BIN( 0,  4,  8, 12)
   1470       CONVERT_COEFFS_TO_BIN(16, 20, 24, 28)
   1471       : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
   1472         [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
   1473         [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
   1474       : [dist]"r"(distribution), [out]"r"(out), [max_coeff]"r"(max_coeff)
   1475       : "memory"
   1476     );
   1477   }
   1478   VP8SetHistogramData(distribution, histo);
   1479 }
   1480 
   1481 #undef CONVERT_COEFFS_TO_BIN
   1482 
   1483 //------------------------------------------------------------------------------
   1484 // Entry point
   1485 
   1486 extern void VP8EncDspInitMIPSdspR2(void);
   1487
        // Points the encoder's DSP hooks (VP8FTransform, VP8EncPredLuma16, ...) at
        // the MIPS DSP R2 implementations in this file. Several of the functions
        // assigned here (FTransform, ITransform, Disto4x4, Disto16x16) are defined
        // in the part of the file that precedes this section.
   1488 WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
   1489   VP8FTransform = FTransform_MIPSdspR2;
   1490   VP8FTransformWHT = FTransformWHT_MIPSdspR2;
   1491   VP8ITransform = ITransform_MIPSdspR2;
   1492
   1493   VP8TDisto4x4 = Disto4x4_MIPSdspR2;
   1494   VP8TDisto16x16 = Disto16x16_MIPSdspR2;
   1495
   1496   VP8EncPredLuma16 = Intra16Preds_MIPSdspR2;
   1497   VP8EncPredChroma8 = IntraChromaPreds_MIPSdspR2;
   1498   VP8EncPredLuma4 = Intra4Preds_MIPSdspR2;
   1499
          // The SSE functions only exist when WORK_AROUND_GCC is not defined (same
          // guard as around their definitions); otherwise these hooks are left
          // untouched here.
   1500 #if !defined(WORK_AROUND_GCC)
   1501   VP8SSE16x16 = SSE16x16_MIPSdspR2;
   1502   VP8SSE8x8 = SSE8x8_MIPSdspR2;
   1503   VP8SSE16x8 = SSE16x8_MIPSdspR2;
   1504   VP8SSE4x4 = SSE4x4_MIPSdspR2;
   1505 #endif
   1506
   1507   VP8EncQuantizeBlock = QuantizeBlock_MIPSdspR2;
   1508   VP8EncQuantize2Blocks = Quantize2Blocks_MIPSdspR2;
   1509
   1510   VP8CollectHistogram = CollectHistogram_MIPSdspR2;
   1511 }
   1512 
   1513 #else  // !WEBP_USE_MIPS_DSP_R2
   1514 
   1515 WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2)  // used when WEBP_USE_MIPS_DSP_R2 is not defined; the macro comes from outside this file (presumably it emits an empty init function -- confirm)
   1516 
   1517 #endif  // WEBP_USE_MIPS_DSP_R2
   1518