Home | History | Annotate | Download | only in dsp
      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // Image transforms and color space conversion methods for lossless decoder.
     11 //
     12 // Author(s):  Djordje Pesut    (djordje.pesut (at) imgtec.com)
     13 //             Jovan Zelincevic (jovan.zelincevic (at) imgtec.com)
     14 
     15 #include "./dsp.h"
     16 
     17 #if defined(WEBP_USE_MIPS_DSP_R2)
     18 
     19 #include "./lossless.h"
     20 
     // MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)
     //
     // Defines a static function FUNC_NAME that passes every element of rows
     // [y_start, y_end) through the 'color_map' lookup table, reading from 'src'
     // and writing to 'dst'. Both pointers simply advance by 'width' elements
     // per row; no separate stride is applied.
     //
     // Each row is handled in two parts:
     //  - (width >> 2) iterations of an inline-asm body that processes four
     //    elements at a time. The assembler-level '.ifc' directives compare the
     //    stringified TYPE and keep only the matching load/store sequence:
     //      uint8_t : byte loads give the index directly; after the lookup,
     //                bits 8..15 of the table entry are extracted and stored
     //                as bytes.
     //      uint32_t: word loads, index taken from bits 8..15 of each word;
     //                the full 32-bit table entry is stored.
     //    In both cases the index is shifted left by 2 to form the byte offset
     //    used by the indexed word load (lwx) from 'color_map'.
     //  - (width & 3) leftover elements, done in C via GET_INDEX / GET_VALUE.
     //
     // NOTE(review): the asm hard-codes the index/value extraction, so it is
     // presumably meant to match what GET_INDEX / GET_VALUE do for the two
     // instantiated types -- confirm against their definitions.
     21 #define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)                 \
     22 static void FUNC_NAME(const TYPE* src,                                         \
     23                       const uint32_t* const color_map,                         \
     24                       TYPE* dst, int y_start, int y_end,                       \
     25                       int width) {                                             \
     26   int y;                                                                       \
     27   for (y = y_start; y < y_end; ++y) {                                          \
     28     int x;                                                                     \
     29     for (x = 0; x < (width >> 2); ++x) {                                       \
     30       int tmp1, tmp2, tmp3, tmp4;                                              \
     31       __asm__ volatile (                                                       \
     32       ".ifc        " #TYPE ",  uint8_t                  \n\t"                  \
     33         "lbu       %[tmp1],  0(%[src])                  \n\t"                  \
     34         "lbu       %[tmp2],  1(%[src])                  \n\t"                  \
     35         "lbu       %[tmp3],  2(%[src])                  \n\t"                  \
     36         "lbu       %[tmp4],  3(%[src])                  \n\t"                  \
     37         "addiu     %[src],   %[src],      4             \n\t"                  \
     38       ".endif                                           \n\t"                  \
     39       ".ifc        " #TYPE ",  uint32_t                 \n\t"                  \
     40         "lw        %[tmp1],  0(%[src])                  \n\t"                  \
     41         "lw        %[tmp2],  4(%[src])                  \n\t"                  \
     42         "lw        %[tmp3],  8(%[src])                  \n\t"                  \
     43         "lw        %[tmp4],  12(%[src])                 \n\t"                  \
     44         "ext       %[tmp1],  %[tmp1],     8,        8   \n\t"                  \
     45         "ext       %[tmp2],  %[tmp2],     8,        8   \n\t"                  \
     46         "ext       %[tmp3],  %[tmp3],     8,        8   \n\t"                  \
     47         "ext       %[tmp4],  %[tmp4],     8,        8   \n\t"                  \
     48         "addiu     %[src],   %[src],      16            \n\t"                  \
     49       ".endif                                           \n\t"                  \
     50         "sll       %[tmp1],  %[tmp1],     2             \n\t"                  \
     51         "sll       %[tmp2],  %[tmp2],     2             \n\t"                  \
     52         "sll       %[tmp3],  %[tmp3],     2             \n\t"                  \
     53         "sll       %[tmp4],  %[tmp4],     2             \n\t"                  \
     54         "lwx       %[tmp1],  %[tmp1](%[color_map])      \n\t"                  \
     55         "lwx       %[tmp2],  %[tmp2](%[color_map])      \n\t"                  \
     56         "lwx       %[tmp3],  %[tmp3](%[color_map])      \n\t"                  \
     57         "lwx       %[tmp4],  %[tmp4](%[color_map])      \n\t"                  \
     58       ".ifc        " #TYPE ",  uint8_t                  \n\t"                  \
     59         "ext       %[tmp1],  %[tmp1],     8,        8   \n\t"                  \
     60         "ext       %[tmp2],  %[tmp2],     8,        8   \n\t"                  \
     61         "ext       %[tmp3],  %[tmp3],     8,        8   \n\t"                  \
     62         "ext       %[tmp4],  %[tmp4],     8,        8   \n\t"                  \
     63         "sb        %[tmp1],  0(%[dst])                  \n\t"                  \
     64         "sb        %[tmp2],  1(%[dst])                  \n\t"                  \
     65         "sb        %[tmp3],  2(%[dst])                  \n\t"                  \
     66         "sb        %[tmp4],  3(%[dst])                  \n\t"                  \
     67         "addiu     %[dst],   %[dst],      4             \n\t"                  \
     68       ".endif                                           \n\t"                  \
     69       ".ifc        " #TYPE ",  uint32_t                 \n\t"                  \
     70         "sw        %[tmp1],  0(%[dst])                  \n\t"                  \
     71         "sw        %[tmp2],  4(%[dst])                  \n\t"                  \
     72         "sw        %[tmp3],  8(%[dst])                  \n\t"                  \
     73         "sw        %[tmp4],  12(%[dst])                 \n\t"                  \
     74         "addiu     %[dst],   %[dst],      16            \n\t"                  \
     75       ".endif                                           \n\t"                  \
     76         : [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3),             \
     77           [tmp4]"=&r"(tmp4), [src]"+&r"(src), [dst]"+r"(dst)                   \
     78         : [color_map]"r"(color_map)                                            \
     79         : "memory"                                                             \
     80       );                                                                       \
     81     }                                                                          \
     82     for (x = 0; x < (width & 3); ++x) {                                        \
     83       *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]);                        \
     84     }                                                                          \
     85   }                                                                            \
     86 }
     87 
     // Instantiate the two color-map routines from the MAP_COLOR_FUNCS
     // template: MapARGB works on uint32_t elements, MapAlpha on uint8_t
     // elements. The template macro is local to this file and is undefined
     // right after use.
     88 MAP_COLOR_FUNCS(MapARGB, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
     89 MAP_COLOR_FUNCS(MapAlpha, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
     90 
     91 #undef MAP_COLOR_FUNCS
     92 
     // Returns, for each of the four 8-bit channels packed in the arguments,
     // c0 + c1 - c2 clamped to [0, 255].
     //
     // The channels are widened to 16-bit lanes (preceu.ph.qbr / .qbl take the
     // right / left byte pair of a word), combined with halfword subtract and
     // add, then narrowed back: the saturating left shift by 7 followed by
     // precrqu_s.qb.ph performs the clamp and repacks the four lanes into one
     // 32-bit word (left pair from temp2, right pair from temp1).
     //
     // temp0 is the only output without an early-clobber marker; it is first
     // written by the last instruction that reads an input operand.
     93 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
     94                                                    uint32_t c2) {
     95   int temp0, temp1, temp2, temp3, temp4, temp5;
     96   __asm__ volatile (
     97     "preceu.ph.qbr   %[temp1],   %[c0]                 \n\t"
     98     "preceu.ph.qbl   %[temp2],   %[c0]                 \n\t"
     99     "preceu.ph.qbr   %[temp3],   %[c1]                 \n\t"
    100     "preceu.ph.qbl   %[temp4],   %[c1]                 \n\t"
    101     "preceu.ph.qbr   %[temp5],   %[c2]                 \n\t"
    102     "preceu.ph.qbl   %[temp0],   %[c2]                 \n\t"
    103     "subq.ph         %[temp3],   %[temp3],   %[temp5]  \n\t"
    104     "subq.ph         %[temp4],   %[temp4],   %[temp0]  \n\t"
    105     "addq.ph         %[temp1],   %[temp1],   %[temp3]  \n\t"
    106     "addq.ph         %[temp2],   %[temp2],   %[temp4]  \n\t"
    107     "shll_s.ph       %[temp1],   %[temp1],   7         \n\t"
    108     "shll_s.ph       %[temp2],   %[temp2],   7         \n\t"
    109     "precrqu_s.qb.ph %[temp2],   %[temp2],   %[temp1]  \n\t"
    110     : [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    111       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5)
    112     : [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
    113     : "memory"
    114   );
    115   return temp2;
    116 }
    117 
    // Per 8-bit channel: let avg = (c0 + c1) >> 1 (adduh.qb, no rounding);
    // the result is avg + (avg - c2) / 2, clamped to [0, 255].
    //
    // The difference avg - c2 is computed in 16-bit lanes. Its sign bit
    // (shrl.ph by 15) is added back before the arithmetic shift right by 1,
    // so the halving rounds toward zero for negative differences as well.
    // As in ClampedAddSubtractFull, shll_s.ph by 7 followed by
    // precrqu_s.qb.ph clamps each lane and repacks the four channels.
    //
    // temp0 and temp4 are outputs without an early-clobber marker; both are
    // first written no earlier than the last instruction reading an input.
    118 static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
    119                                                    uint32_t c2) {
    120   int temp0, temp1, temp2, temp3, temp4, temp5;
    121   __asm__ volatile (
    122     "adduh.qb         %[temp5],   %[c0],      %[c1]       \n\t"
    123     "preceu.ph.qbr    %[temp3],   %[c2]                   \n\t"
    124     "preceu.ph.qbr    %[temp1],   %[temp5]                \n\t"
    125     "preceu.ph.qbl    %[temp2],   %[temp5]                \n\t"
    126     "preceu.ph.qbl    %[temp4],   %[c2]                   \n\t"
    127     "subq.ph          %[temp3],   %[temp1],   %[temp3]    \n\t"
    128     "subq.ph          %[temp4],   %[temp2],   %[temp4]    \n\t"
    129     "shrl.ph          %[temp5],   %[temp3],   15          \n\t"
    130     "shrl.ph          %[temp0],   %[temp4],   15          \n\t"
    131     "addq.ph          %[temp3],   %[temp3],   %[temp5]    \n\t"
    132     "addq.ph          %[temp4],   %[temp0],   %[temp4]    \n\t"
    133     "shra.ph          %[temp3],   %[temp3],   1           \n\t"
    134     "shra.ph          %[temp4],   %[temp4],   1           \n\t"
    135     "addq.ph          %[temp1],   %[temp1],   %[temp3]    \n\t"
    136     "addq.ph          %[temp2],   %[temp2],   %[temp4]    \n\t"
    137     "shll_s.ph        %[temp1],   %[temp1],   7           \n\t"
    138     "shll_s.ph        %[temp2],   %[temp2],   7           \n\t"
    139     "precrqu_s.qb.ph  %[temp1],   %[temp2],   %[temp1]    \n\t"
    140     : [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    141       [temp3]"=&r"(temp3), [temp4]"=r"(temp4), [temp5]"=&r"(temp5)
    142     : [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
    143     : "memory"
    144   );
    145   return temp1;
    146 }
    147 
    // Chooses between 'a' and 'b' by comparing their per-byte distance to 'c'.
    //
    // With Sb = sum over the four bytes of |b - c| and Sa = the same sum for
    // |a - c|, the function returns 'a' when Sb - Sa <= 0 and 'b' otherwise.
    //
    // Each absolute difference is formed as max - min: cmpgdu.lt.qb sets the
    // per-byte condition bits and the two pick.qb instructions select the
    // larger and the smaller byte respectively. raddu.w.qb then sums the four
    // byte differences, slti produces the (Sb - Sa < 1) flag, and movz
    // replaces 'a' with 'b' when that flag is zero.
    148 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
    149   int temp0, temp1, temp2, temp3, temp4, temp5;
    150   __asm__ volatile (
    151     "cmpgdu.lt.qb %[temp1], %[c],     %[b]             \n\t"
    152     "pick.qb      %[temp1], %[b],     %[c]             \n\t"
    153     "pick.qb      %[temp2], %[c],     %[b]             \n\t"
    154     "cmpgdu.lt.qb %[temp4], %[c],     %[a]             \n\t"
    155     "pick.qb      %[temp4], %[a],     %[c]             \n\t"
    156     "pick.qb      %[temp5], %[c],     %[a]             \n\t"
    157     "subu.qb      %[temp3], %[temp1], %[temp2]         \n\t"
    158     "subu.qb      %[temp0], %[temp4], %[temp5]         \n\t"
    159     "raddu.w.qb   %[temp3], %[temp3]                   \n\t"
    160     "raddu.w.qb   %[temp0], %[temp0]                   \n\t"
    161     "subu         %[temp3], %[temp3], %[temp0]         \n\t"
    162     "slti         %[temp0], %[temp3], 0x1              \n\t"
    163     "movz         %[a],     %[b],     %[temp0]         \n\t"
    164     : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
    165       [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp0]"=&r"(temp0),
    166       [a]"+&r"(a)
    167     : [b]"r"(b), [c]"r"(c)
    168   );
    169   return a;
    170 }
    171 
    // Per-byte unsigned average of a0 and a1: each 8-bit lane of the result
    // is (x + y) >> 1, computed by a single adduh.qb (the non-rounding
    // halving add), so no carry leaks between lanes.
    172 static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
    173   __asm__ volatile (
    174     "adduh.qb    %[a0], %[a0], %[a1]       \n\t"
    175     : [a0]"+r"(a0)
    176     : [a1]"r"(a1)
    177   );
    178   return a0;
    179 }
    180 
    // Three-input per-byte average built from two Average2 steps: a0 and a2
    // are averaged first, and that result is then averaged with a1 (so a1
    // carries twice the weight of a0 or a2).
    181 static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
    182   return Average2(Average2(a0, a2), a1);
    183 }
    184 
    // Four-input per-byte average: averages the pairs (a0, a1) and (a2, a3)
    // with Average2, then averages the two intermediate results.
    185 static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
    186                                      uint32_t a2, uint32_t a3) {
    187   return Average2(Average2(a0, a1), Average2(a2, a3));
    188 }
    189 
    // Predictor 5: Average3 of 'left', top[0] and top[1].
    // NOTE(review): 'top' presumably points at the pixel directly above the
    // current one, making top[1] its right-hand neighbour -- confirm at the
    // call site.
    190 static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
    191   return Average3(left, top[0], top[1]);
    192 }
    193 
    // Predictor 6: per-byte average (Average2) of 'left' and top[-1].
    194 static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
    195   return Average2(left, top[-1]);
    196 }
    197 
    // Predictor 7: per-byte average (Average2) of 'left' and top[0].
    198 static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
    199   return Average2(left, top[0]);
    200 }
    201 
    // Predictor 8: per-byte average (Average2) of top[-1] and top[0].
    // 'left' is unused; the cast to void only silences the unused-parameter
    // warning while keeping the signature shared by all predictors here.
    202 static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
    203   (void)left;
    204   return Average2(top[-1], top[0]);
    205 }
    206 
    // Predictor 9: per-byte average (Average2) of top[0] and top[1].
    // 'left' is unused; the cast to void only silences the unused-parameter
    // warning while keeping the signature shared by all predictors here.
    207 static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
    208   (void)left;
    209   return Average2(top[0], top[1]);
    210 }
    211 
    // Predictor 10: Average4 of 'left', top[-1], top[0] and top[1], i.e. the
    // average of Average2(left, top[-1]) and Average2(top[0], top[1]).
    212 static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
    213   return Average4(left, top[-1], top[0], top[1]);
    214 }
    215 
    // Predictor 11: Select() between top[0] and 'left', using top[-1] as the
    // reference value both candidates are compared against.
    216 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
    217   return Select(top[0], left, top[-1]);
    218 }
    219 
    // Predictor 12: ClampedAddSubtractFull(left, top[0], top[-1]), i.e. per
    // channel left + top[0] - top[-1], clamped.
    220 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
    221   return ClampedAddSubtractFull(left, top[0], top[-1]);
    222 }
    223 
    // Predictor 13: ClampedAddSubtractHalf(left, top[0], top[-1]), i.e. the
    // average of 'left' and top[0] plus half of its difference to top[-1],
    // clamped per channel.
    224 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
    225   return ClampedAddSubtractHalf(left, top[0], top[-1]);
    226 }
    227 
    228 // Add green to blue and red channels (i.e. perform the inverse transform of
    229 // 'subtract green').
    //
    // Works in place on 'num_pixels' 32-bit pixels starting at 'data'.
    // For every pixel, bits 8..15 are extracted (ext), replicated into both
    // 16-bit halves (replv.ph, giving 0x00GG00GG) and added byte-wise with
    // addu.qb, so bytes 0 and 2 each gain that value modulo 256 while bytes
    // 1 and 3 are left unchanged.
    //
    // The asm has two loops: the first consumes four pixels per iteration up
    // to p_loop1_end (num_pixels rounded down to a multiple of 4), the second
    // handles the remaining pixels one at a time up to p_loop2_end. Both are
    // skipped by the leading beq when there is nothing to do. The code runs
    // under '.set noreorder', so the instruction written (indented) right
    // after each branch is its delay slot.
    230 static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
    231   uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    232   uint32_t* const p_loop1_end = data + (num_pixels & ~3);
    233   uint32_t* const p_loop2_end = data + num_pixels;
    234   __asm__ volatile (
    235     ".set       push                                          \n\t"
    236     ".set       noreorder                                     \n\t"
    237     "beq        %[data],         %[p_loop1_end],     3f       \n\t"
    238     " nop                                                     \n\t"
            // Loop 0: four pixels per iteration.
    239   "0:                                                         \n\t"
    240     "lw         %[temp0],        0(%[data])                   \n\t"
    241     "lw         %[temp1],        4(%[data])                   \n\t"
    242     "lw         %[temp2],        8(%[data])                   \n\t"
    243     "lw         %[temp3],        12(%[data])                  \n\t"
    244     "ext        %[temp4],        %[temp0],           8,    8  \n\t"
    245     "ext        %[temp5],        %[temp1],           8,    8  \n\t"
    246     "ext        %[temp6],        %[temp2],           8,    8  \n\t"
    247     "ext        %[temp7],        %[temp3],           8,    8  \n\t"
    248     "addiu      %[data],         %[data],            16       \n\t"
    249     "replv.ph   %[temp4],        %[temp4]                     \n\t"
    250     "replv.ph   %[temp5],        %[temp5]                     \n\t"
    251     "replv.ph   %[temp6],        %[temp6]                     \n\t"
    252     "replv.ph   %[temp7],        %[temp7]                     \n\t"
    253     "addu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
    254     "addu.qb    %[temp1],        %[temp1],           %[temp5] \n\t"
    255     "addu.qb    %[temp2],        %[temp2],           %[temp6] \n\t"
    256     "addu.qb    %[temp3],        %[temp3],           %[temp7] \n\t"
            // 'data' was already advanced, hence the negative store offsets.
    257     "sw         %[temp0],        -16(%[data])                 \n\t"
    258     "sw         %[temp1],        -12(%[data])                 \n\t"
    259     "sw         %[temp2],        -8(%[data])                  \n\t"
    260     "bne        %[data],         %[p_loop1_end],     0b       \n\t"
    261     " sw        %[temp3],        -4(%[data])                  \n\t"
    262   "3:                                                         \n\t"
    263     "beq        %[data],         %[p_loop2_end],     2f       \n\t"
    264     " nop                                                     \n\t"
            // Loop 1: remaining pixels, one per iteration.
    265   "1:                                                         \n\t"
    266     "lw         %[temp0],        0(%[data])                   \n\t"
    267     "addiu      %[data],         %[data],            4        \n\t"
    268     "ext        %[temp4],        %[temp0],           8,    8  \n\t"
    269     "replv.ph   %[temp4],        %[temp4]                     \n\t"
    270     "addu.qb    %[temp0],        %[temp0],           %[temp4] \n\t"
    271     "bne        %[data],         %[p_loop2_end],     1b       \n\t"
    272     " sw        %[temp0],        -4(%[data])                  \n\t"
    273   "2:                                                         \n\t"
    274     ".set       pop                                           \n\t"
    275     : [data]"+&r"(data), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
    276       [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
    277       [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
    278     : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
    279     : "memory"
    280   );
    281 }
    282 
    // Applies the inverse color transform described by 'm' in place to
    // 'num_pixels' 32-bit pixels at 'data'.
    //
    // The asm loop handles two pixels per iteration, num_pixels & ~1 in total
    // (it is skipped entirely when that count is zero):
    //  - Up front, the three multipliers are replicated into both 16-bit
    //    lanes (replv.ph) and sign-extended from 8 to 16 bits (shll.ph then
    //    shra.ph by 8).
    //  - For each pixel pair, bits 8..15 of both pixels are gathered into two
    //    halfword lanes, sign-extended, and multiplied by green_to_red_ and
    //    green_to_blue_. The products are arithmetically shifted right by 5
    //    and added to the lanes holding bits 16..23 and bits 0..7.
    //  - The updated bits 16..23 are then sign-extended, multiplied by
    //    red_to_blue_, shifted right by 5 and added to bits 0..7 as well.
    //  - Only those two bytes of each pixel are written back, with byte
    //    stores relative to the already-advanced pointer.
    //    NOTE(review): the store offsets (-2 / -6 and -4 / -8) address bits
    //    16..23 and 0..7 on a little-endian layout -- confirm the target
    //    endianness.
    // mul.ph is the reason 'hi' and 'lo' are listed as clobbered.
    //
    // 'data' is an in/out asm operand, so after the loop it points at the
    // first unprocessed pixel; a trailing odd pixel is handed to
    // VP8LTransformColorInverse_C().
    283 static void TransformColorInverse(const VP8LMultipliers* const m,
    284                                   uint32_t* data, int num_pixels) {
    285   int temp0, temp1, temp2, temp3, temp4, temp5;
    286   uint32_t argb, argb1, new_red;
    287   const uint32_t G_to_R = m->green_to_red_;
    288   const uint32_t G_to_B = m->green_to_blue_;
    289   const uint32_t R_to_B = m->red_to_blue_;
    290   uint32_t* const p_loop_end = data + (num_pixels & ~1);
    291   __asm__ volatile (
    292     ".set            push                                    \n\t"
    293     ".set            noreorder                               \n\t"
    294     "beq             %[data],      %[p_loop_end],  1f        \n\t"
    295     " nop                                                    \n\t"
    296     "replv.ph        %[temp0],     %[G_to_R]                 \n\t"
    297     "replv.ph        %[temp1],     %[G_to_B]                 \n\t"
    298     "replv.ph        %[temp2],     %[R_to_B]                 \n\t"
    299     "shll.ph         %[temp0],     %[temp0],       8         \n\t"
    300     "shll.ph         %[temp1],     %[temp1],       8         \n\t"
    301     "shll.ph         %[temp2],     %[temp2],       8         \n\t"
    302     "shra.ph         %[temp0],     %[temp0],       8         \n\t"
    303     "shra.ph         %[temp1],     %[temp1],       8         \n\t"
    304     "shra.ph         %[temp2],     %[temp2],       8         \n\t"
            // Main loop: two pixels (argb, argb1) per iteration.
    305   "0:                                                        \n\t"
    306     "lw              %[argb],      0(%[data])                \n\t"
    307     "lw              %[argb1],     4(%[data])                \n\t"
    308     "addiu           %[data],      %[data],        8         \n\t"
    309     "precrq.qb.ph    %[temp3],     %[argb],        %[argb1]  \n\t"
    310     "preceu.ph.qbra  %[temp3],     %[temp3]                  \n\t"
    311     "shll.ph         %[temp3],     %[temp3],       8         \n\t"
    312     "shra.ph         %[temp3],     %[temp3],       8         \n\t"
    313     "mul.ph          %[temp5],     %[temp3],       %[temp0]  \n\t"
    314     "mul.ph          %[temp3],     %[temp3],       %[temp1]  \n\t"
    315     "precrq.ph.w     %[new_red],   %[argb],        %[argb1]  \n\t"
    316     "ins             %[argb1],     %[argb],        16,   16  \n\t"
    317     "shra.ph         %[temp5],     %[temp5],       5         \n\t"
    318     "shra.ph         %[temp3],     %[temp3],       5         \n\t"
    319     "addu.ph         %[new_red],   %[new_red],     %[temp5]  \n\t"
    320     "addu.ph         %[argb1],     %[argb1],       %[temp3]  \n\t"
    321     "preceu.ph.qbra  %[temp5],     %[new_red]                \n\t"
    322     "shll.ph         %[temp4],     %[temp5],       8         \n\t"
    323     "shra.ph         %[temp4],     %[temp4],       8         \n\t"
    324     "mul.ph          %[temp4],     %[temp4],       %[temp2]  \n\t"
    325     "sb              %[temp5],     -2(%[data])               \n\t"
    326     "sra             %[temp5],     %[temp5],       16        \n\t"
    327     "shra.ph         %[temp4],     %[temp4],       5         \n\t"
    328     "addu.ph         %[argb1],     %[argb1],       %[temp4]  \n\t"
    329     "preceu.ph.qbra  %[temp3],     %[argb1]                  \n\t"
    330     "sb              %[temp5],     -6(%[data])               \n\t"
    331     "sb              %[temp3],     -4(%[data])               \n\t"
    332     "sra             %[temp3],     %[temp3],       16        \n\t"
    333     "bne             %[data],      %[p_loop_end],  0b        \n\t"
    334     " sb             %[temp3],     -8(%[data])               \n\t"
    335   "1:                                                        \n\t"
    336     ".set            pop                                     \n\t"
    337     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    338       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    339       [new_red]"=&r"(new_red), [argb]"=&r"(argb),
    340       [argb1]"=&r"(argb1), [data]"+&r"(data)
    341     : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
    342       [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
    343     : "memory", "hi", "lo"
    344   );
    345 
    346   // Fall-back to C-version for left-overs.
    347   if (num_pixels & 1) VP8LTransformColorInverse_C(m, data, 1);
    348 }
    349 
    // Converts 'num_pixels' 32-bit pixels read from 'src' into 3 bytes per
    // pixel written to 'dst' (one byte of every input word is dropped).
    //
    //  - Loop 0 takes four pixels at a time (up to p_loop1_end), shuffles
    //    them with ins / sll / rotr / wsbh / balign into three packed words
    //    and writes 12 bytes with unaligned word stores (usw).
    //  - Loop 1 handles the remaining pixels one by one: bits 16..23 of the
    //    pixel are stored with sb as the first output byte, and the low
    //    halfword, byte-swapped by wsbh, is stored with ush as the next two.
    //    NOTE(review): the resulting byte order in memory depends on target
    //    endianness; the function name suggests R, G, B -- confirm.
    //
    // 'dst' needs no particular alignment since only usw / ush / sb are used
    // for stores. Delay slots (indented lines after branches) are filled by
    // hand because of '.set noreorder'.
    350 static void ConvertBGRAToRGB(const uint32_t* src,
    351                              int num_pixels, uint8_t* dst) {
    352   int temp0, temp1, temp2, temp3;
    353   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
    354   const uint32_t* const p_loop2_end = src + num_pixels;
    355   __asm__ volatile (
    356     ".set       push                                       \n\t"
    357     ".set       noreorder                                  \n\t"
    358     "beq        %[src],      %[p_loop1_end],    3f         \n\t"
    359     " nop                                                  \n\t"
            // Loop 0: four pixels in, twelve bytes out.
    360   "0:                                                      \n\t"
    361     "lw         %[temp3],    12(%[src])                    \n\t"
    362     "lw         %[temp2],    8(%[src])                     \n\t"
    363     "lw         %[temp1],    4(%[src])                     \n\t"
    364     "lw         %[temp0],    0(%[src])                     \n\t"
    365     "ins        %[temp3],    %[temp2],          24,   8    \n\t"
    366     "sll        %[temp2],    %[temp2],          8          \n\t"
    367     "rotr       %[temp3],    %[temp3],          16         \n\t"
    368     "ins        %[temp2],    %[temp1],          0,    16   \n\t"
    369     "sll        %[temp1],    %[temp1],          8          \n\t"
    370     "wsbh       %[temp3],    %[temp3]                      \n\t"
    371     "balign     %[temp0],    %[temp1],          1          \n\t"
    372     "wsbh       %[temp2],    %[temp2]                      \n\t"
    373     "wsbh       %[temp0],    %[temp0]                      \n\t"
    374     "usw        %[temp3],    8(%[dst])                     \n\t"
    375     "rotr       %[temp0],    %[temp0],          16         \n\t"
    376     "usw        %[temp2],    4(%[dst])                     \n\t"
    377     "addiu      %[src],      %[src],            16         \n\t"
    378     "usw        %[temp0],    0(%[dst])                     \n\t"
    379     "bne        %[src],      %[p_loop1_end],    0b         \n\t"
    380     " addiu     %[dst],      %[dst],            12         \n\t"
    381   "3:                                                      \n\t"
    382     "beq        %[src],      %[p_loop2_end],    2f         \n\t"
    383     " nop                                                  \n\t"
            // Loop 1: one pixel in, three bytes out.
    384   "1:                                                      \n\t"
    385     "lw         %[temp0],    0(%[src])                     \n\t"
    386     "addiu      %[src],      %[src],            4          \n\t"
    387     "wsbh       %[temp1],    %[temp0]                      \n\t"
    388     "addiu      %[dst],      %[dst],            3          \n\t"
    389     "ush        %[temp1],    -2(%[dst])                    \n\t"
    390     "sra        %[temp0],    %[temp0],          16         \n\t"
    391     "bne        %[src],      %[p_loop2_end],    1b         \n\t"
    392     " sb        %[temp0],    -3(%[dst])                    \n\t"
    393   "2:                                                      \n\t"
    394     ".set       pop                                        \n\t"
    395     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    396       [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
    397     : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
    398     : "memory"
    399   );
    400 }
    401 
    // Converts 'num_pixels' 32-bit pixels from 'src' into 4 bytes per pixel
    // at 'dst', reordering the bytes of each word.
    //
    // Every pixel goes through the same two steps: wsbh swaps the two bytes
    // inside each halfword, then balign with identical source/destination
    // and a byte position of 1 rotates the word by one byte. The result is
    // written with an unaligned word store (usw), so 'dst' needs no
    // particular alignment.
    // NOTE(review): the in-memory byte order of the output depends on target
    // endianness; the function name suggests R, G, B, A -- confirm.
    //
    // Loop 0 processes four pixels per iteration up to p_loop1_end, loop 1
    // the remaining ones individually up to p_loop2_end. The 'dst' increment
    // sits in each loop branch's delay slot ('.set noreorder').
    402 static void ConvertBGRAToRGBA(const uint32_t* src,
    403                               int num_pixels, uint8_t* dst) {
    404   int temp0, temp1, temp2, temp3;
    405   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
    406   const uint32_t* const p_loop2_end = src + num_pixels;
    407   __asm__ volatile (
    408     ".set       push                                       \n\t"
    409     ".set       noreorder                                  \n\t"
    410     "beq        %[src],      %[p_loop1_end],    3f         \n\t"
    411     " nop                                                  \n\t"
            // Loop 0: four pixels per iteration.
    412   "0:                                                      \n\t"
    413     "lw         %[temp0],    0(%[src])                     \n\t"
    414     "lw         %[temp1],    4(%[src])                     \n\t"
    415     "lw         %[temp2],    8(%[src])                     \n\t"
    416     "lw         %[temp3],    12(%[src])                    \n\t"
    417     "wsbh       %[temp0],    %[temp0]                      \n\t"
    418     "wsbh       %[temp1],    %[temp1]                      \n\t"
    419     "wsbh       %[temp2],    %[temp2]                      \n\t"
    420     "wsbh       %[temp3],    %[temp3]                      \n\t"
    421     "addiu      %[src],      %[src],            16         \n\t"
    422     "balign     %[temp0],    %[temp0],          1          \n\t"
    423     "balign     %[temp1],    %[temp1],          1          \n\t"
    424     "balign     %[temp2],    %[temp2],          1          \n\t"
    425     "balign     %[temp3],    %[temp3],          1          \n\t"
    426     "usw        %[temp0],    0(%[dst])                     \n\t"
    427     "usw        %[temp1],    4(%[dst])                     \n\t"
    428     "usw        %[temp2],    8(%[dst])                     \n\t"
    429     "usw        %[temp3],    12(%[dst])                    \n\t"
    430     "bne        %[src],      %[p_loop1_end],    0b         \n\t"
    431     " addiu     %[dst],      %[dst],            16         \n\t"
    432   "3:                                                      \n\t"
    433     "beq        %[src],      %[p_loop2_end],    2f         \n\t"
    434     " nop                                                  \n\t"
            // Loop 1: remaining pixels, one per iteration.
    435   "1:                                                      \n\t"
    436     "lw         %[temp0],    0(%[src])                     \n\t"
    437     "wsbh       %[temp0],    %[temp0]                      \n\t"
    438     "addiu      %[src],      %[src],            4          \n\t"
    439     "balign     %[temp0],    %[temp0],          1          \n\t"
    440     "usw        %[temp0],    0(%[dst])                     \n\t"
    441     "bne        %[src],      %[p_loop2_end],    1b         \n\t"
    442     " addiu     %[dst],      %[dst],            4          \n\t"
    443   "2:                                                      \n\t"
    444     ".set       pop                                        \n\t"
    445     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    446       [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
    447     : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
    448     : "memory"
    449   );
    450 }
    451 
    // Converts 'num_pixels' 32-bit pixels from 'src' into 2 bytes per pixel
    // at 'dst', keeping the upper 4 bits of each 8-bit channel.
    //
    // Per pixel word:
    //  - bits 28..31 are copied into bits 0..3, so byte 0 holds the top
    //    nibbles of the original bytes 0 and 3;
    //  - bits 12..15 are copied into bits 16..19, so byte 2 holds the top
    //    nibbles of the original bytes 2 and 1;
    //  - precr.qb.ph then packs the low byte of every halfword, leaving just
    //    those two combined bytes per pixel.
    // When WEBP_SWAP_16BIT_CSP is not defined, an extra wsbh swaps the two
    // bytes of each 16-bit output before it is stored; with the macro defined
    // the packed value is stored as is.
    //
    // Loop 0 converts four pixels per iteration (two usw stores, 8 bytes),
    // loop 1 converts the remaining pixels one at a time (one ush store).
    // All stores are the unaligned variants; the 'dst' increment occupies the
    // loop branch's delay slot ('.set noreorder').
    452 static void ConvertBGRAToRGBA4444(const uint32_t* src,
    453                                   int num_pixels, uint8_t* dst) {
    454   int temp0, temp1, temp2, temp3, temp4, temp5;
    455   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
    456   const uint32_t* const p_loop2_end = src + num_pixels;
    457   __asm__ volatile (
    458     ".set           push                                       \n\t"
    459     ".set           noreorder                                  \n\t"
    460     "beq            %[src],      %[p_loop1_end],    3f         \n\t"
    461     " nop                                                      \n\t"
            // Loop 0: four pixels per iteration.
    462   "0:                                                          \n\t"
    463     "lw             %[temp0],    0(%[src])                     \n\t"
    464     "lw             %[temp1],    4(%[src])                     \n\t"
    465     "lw             %[temp2],    8(%[src])                     \n\t"
    466     "lw             %[temp3],    12(%[src])                    \n\t"
    467     "ext            %[temp4],    %[temp0],          28,   4    \n\t"
    468     "ext            %[temp5],    %[temp0],          12,   4    \n\t"
    469     "ins            %[temp0],    %[temp4],          0,    4    \n\t"
    470     "ext            %[temp4],    %[temp1],          28,   4    \n\t"
    471     "ins            %[temp0],    %[temp5],          16,   4    \n\t"
    472     "ext            %[temp5],    %[temp1],          12,   4    \n\t"
    473     "ins            %[temp1],    %[temp4],          0,    4    \n\t"
    474     "ext            %[temp4],    %[temp2],          28,   4    \n\t"
    475     "ins            %[temp1],    %[temp5],          16,   4    \n\t"
    476     "ext            %[temp5],    %[temp2],          12,   4    \n\t"
    477     "ins            %[temp2],    %[temp4],          0,    4    \n\t"
    478     "ext            %[temp4],    %[temp3],          28,   4    \n\t"
    479     "ins            %[temp2],    %[temp5],          16,   4    \n\t"
    480     "ext            %[temp5],    %[temp3],          12,   4    \n\t"
    481     "ins            %[temp3],    %[temp4],          0,    4    \n\t"
    482     "precr.qb.ph    %[temp1],    %[temp1],          %[temp0]   \n\t"
    483     "ins            %[temp3],    %[temp5],          16,   4    \n\t"
    484     "addiu          %[src],      %[src],            16         \n\t"
    485     "precr.qb.ph    %[temp3],    %[temp3],          %[temp2]   \n\t"
    486 #ifdef WEBP_SWAP_16BIT_CSP
    487     "usw            %[temp1],    0(%[dst])                     \n\t"
    488     "usw            %[temp3],    4(%[dst])                     \n\t"
    489 #else
    490     "wsbh           %[temp1],    %[temp1]                      \n\t"
    491     "wsbh           %[temp3],    %[temp3]                      \n\t"
    492     "usw            %[temp1],    0(%[dst])                     \n\t"
    493     "usw            %[temp3],    4(%[dst])                     \n\t"
    494 #endif
    495     "bne            %[src],      %[p_loop1_end],    0b         \n\t"
    496     " addiu         %[dst],      %[dst],            8          \n\t"
    497   "3:                                                          \n\t"
    498     "beq            %[src],      %[p_loop2_end],    2f         \n\t"
    499     " nop                                                      \n\t"
            // Loop 1: remaining pixels, one per iteration.
    500   "1:                                                          \n\t"
    501     "lw             %[temp0],    0(%[src])                     \n\t"
    502     "ext            %[temp4],    %[temp0],          28,   4    \n\t"
    503     "ext            %[temp5],    %[temp0],          12,   4    \n\t"
    504     "ins            %[temp0],    %[temp4],          0,    4    \n\t"
    505     "ins            %[temp0],    %[temp5],          16,   4    \n\t"
    506     "addiu          %[src],      %[src],            4          \n\t"
    507     "precr.qb.ph    %[temp0],    %[temp0],          %[temp0]   \n\t"
    508 #ifdef WEBP_SWAP_16BIT_CSP
    509     "ush            %[temp0],    0(%[dst])                     \n\t"
    510 #else
    511     "wsbh           %[temp0],    %[temp0]                      \n\t"
    512     "ush            %[temp0],    0(%[dst])                     \n\t"
    513 #endif
    514     "bne            %[src],      %[p_loop2_end],    1b         \n\t"
    515     " addiu         %[dst],      %[dst],            2          \n\t"
    516   "2:                                                          \n\t"
    517     ".set           pop                                        \n\t"
    518     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    519       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    520       [dst]"+&r"(dst), [src]"+&r"(src)
    521     : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
    522     : "memory"
    523   );
    524 }
    525 
    526 static void ConvertBGRAToRGB565(const uint32_t* src,
    527                                 int num_pixels, uint8_t* dst) {
          // Converts num_pixels 32-bit words read from src into 16-bit values
          // written to dst (2 bytes per pixel). For every input word the output
          // halfword is built as:
          //   out[15:11] = in[23:19], out[10:5] = in[15:10], out[4:0] = in[7:3].
          // NOTE(review): going by the function name these are the top 5/6/5 bits
          // of R/G/B of a 0xAARRGGBB pixel -- confirm against the C reference.
          // Unless WEBP_SWAP_16BIT_CSP is defined, the two bytes of every output
          // halfword are swapped (wsbh) before being stored.
    528   int temp0, temp1, temp2, temp3, temp4, temp5;
          // End of the part of src that is processed four pixels per iteration
          // (num_pixels rounded down to a multiple of 4).
    529   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
          // End of the whole input; the pixels between p_loop1_end and this
          // pointer are processed one per iteration.
    530   const uint32_t* const p_loop2_end = src + num_pixels;
    531   __asm__ volatile (
    532     ".set           push                                       \n\t"
            // noreorder: the instruction written right after each branch (the
            // lines with one extra leading space) runs in the branch delay slot.
    533     ".set           noreorder                                  \n\t"
            // Skip the 4-pixel loop when there is no complete group of four.
    534     "beq            %[src],      %[p_loop1_end],    3f         \n\t"
    535     " nop                                                      \n\t"
    536   "0:                                                          \n\t"
            // Load four pixels.
    537     "lw             %[temp0],    0(%[src])                     \n\t"
    538     "lw             %[temp1],    4(%[src])                     \n\t"
    539     "lw             %[temp2],    8(%[src])                     \n\t"
    540     "lw             %[temp3],    12(%[src])                    \n\t"
            // Per pixel: ext ..,8,16 grabs bits [23:8], ext ..,5,11 grabs bits
            // [15:5] and ext ..,3,5 grabs bits [7:3]; the two ins then overwrite
            // the low 11 and the low 5 bits of the first value. The sequences of
            // the four pixels are interleaved. Results: pixel 0 in temp4, pixel 1
            // in temp0, pixel 2 in temp1, pixel 3 in temp2. Each 'append' then
            // shifts its destination left by 16 and fills the low half from the
            // second operand, leaving pixels 1:0 in temp0 and pixels 3:2 in temp2
            // (the later pixel in the upper halfword).
    541     "ext            %[temp4],    %[temp0],          8,    16   \n\t"
    542     "ext            %[temp5],    %[temp0],          5,    11   \n\t"
    543     "ext            %[temp0],    %[temp0],          3,    5    \n\t"
    544     "ins            %[temp4],    %[temp5],          0,    11   \n\t"
    545     "ext            %[temp5],    %[temp1],          5,    11   \n\t"
    546     "ins            %[temp4],    %[temp0],          0,    5    \n\t"
    547     "ext            %[temp0],    %[temp1],          8,    16   \n\t"
    548     "ext            %[temp1],    %[temp1],          3,    5    \n\t"
    549     "ins            %[temp0],    %[temp5],          0,    11   \n\t"
    550     "ext            %[temp5],    %[temp2],          5,    11   \n\t"
    551     "ins            %[temp0],    %[temp1],          0,    5    \n\t"
    552     "ext            %[temp1],    %[temp2],          8,    16   \n\t"
    553     "ext            %[temp2],    %[temp2],          3,    5    \n\t"
    554     "ins            %[temp1],    %[temp5],          0,    11   \n\t"
    555     "ext            %[temp5],    %[temp3],          5,    11   \n\t"
    556     "ins            %[temp1],    %[temp2],          0,    5    \n\t"
    557     "ext            %[temp2],    %[temp3],          8,    16   \n\t"
    558     "ext            %[temp3],    %[temp3],          3,    5    \n\t"
    559     "ins            %[temp2],    %[temp5],          0,    11   \n\t"
    560     "append         %[temp0],    %[temp4],          16         \n\t"
    561     "ins            %[temp2],    %[temp3],          0,    5    \n\t"
    562     "addiu          %[src],      %[src],            16         \n\t"
    563     "append         %[temp2],    %[temp1],          16         \n\t"
            // Store the 8 output bytes with unaligned word stores; without
            // WEBP_SWAP_16BIT_CSP the bytes inside each halfword are swapped first.
    564 #ifdef WEBP_SWAP_16BIT_CSP
    565     "usw            %[temp0],    0(%[dst])                     \n\t"
    566     "usw            %[temp2],    4(%[dst])                     \n\t"
    567 #else
    568     "wsbh           %[temp0],    %[temp0]                      \n\t"
    569     "wsbh           %[temp2],    %[temp2]                      \n\t"
    570     "usw            %[temp0],    0(%[dst])                     \n\t"
    571     "usw            %[temp2],    4(%[dst])                     \n\t"
    572 #endif
            // Loop until src reaches p_loop1_end; dst is advanced in the delay
            // slot, i.e. on every iteration including the last one.
    573     "bne            %[src],      %[p_loop1_end],    0b         \n\t"
    574     " addiu         %[dst],      %[dst],            8          \n\t"
    575   "3:                                                          \n\t"
            // Tail: nothing to do if all pixels were consumed above.
    576     "beq            %[src],      %[p_loop2_end],    2f         \n\t"
    577     " nop                                                      \n\t"
    578   "1:                                                          \n\t"
            // Same bit packing as above for a single pixel; the result is built
            // in temp4 and its low halfword is stored with 'ush'.
    579     "lw             %[temp0],    0(%[src])                     \n\t"
    580     "ext            %[temp4],    %[temp0],          8,    16   \n\t"
    581     "ext            %[temp5],    %[temp0],          5,    11   \n\t"
    582     "ext            %[temp0],    %[temp0],          3,    5    \n\t"
    583     "ins            %[temp4],    %[temp5],          0,    11   \n\t"
    584     "addiu          %[src],      %[src],            4          \n\t"
    585     "ins            %[temp4],    %[temp0],          0,    5    \n\t"
    586 #ifdef WEBP_SWAP_16BIT_CSP
    587     "ush            %[temp4],    0(%[dst])                     \n\t"
    588 #else
    589     "wsbh           %[temp4],    %[temp4]                      \n\t"
    590     "ush            %[temp4],    0(%[dst])                     \n\t"
    591 #endif
            // dst advances by one 16-bit pixel in the delay slot.
    592     "bne            %[src],      %[p_loop2_end],    1b         \n\t"
    593     " addiu         %[dst],      %[dst],            2          \n\t"
    594   "2:                                                          \n\t"
    595     ".set           pop                                        \n\t"
            // Scratch registers are early-clobber outputs; dst and src are
            // read-write because the asm advances them. "memory" is clobbered
            // because the asm stores through dst.
    596     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    597       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
    598       [dst]"+&r"(dst), [src]"+&r"(src)
    599     : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
    600     : "memory"
    601   );
    602 }
    603 
    604 static void ConvertBGRAToBGR(const uint32_t* src,
    605                              int num_pixels, uint8_t* dst) {
          // Writes 3 bytes to dst for each of the num_pixels 32-bit words read
          // from src: the low 24 bits of every word are kept and its top byte is
          // dropped.
          // NOTE(review): the word-level packing below yields "low three bytes of
          // each pixel, in pixel order" only when words are stored little-endian;
          // confirm the targets this file is built for.
    606   int temp0, temp1, temp2, temp3;
          // End of the part of src that is processed four pixels per iteration.
    607   const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
          // End of the whole input; remaining pixels are processed one at a time.
    608   const uint32_t* const p_loop2_end = src + num_pixels;
    609   __asm__ volatile (
    610     ".set       push                                         \n\t"
            // noreorder: the instruction written right after each branch (the
            // lines with one extra leading space) runs in the branch delay slot.
    611     ".set       noreorder                                    \n\t"
            // Skip the 4-pixel loop when there is no complete group of four.
    612     "beq        %[src],      %[p_loop1_end],    3f           \n\t"
    613     " nop                                                    \n\t"
    614   "0:                                                        \n\t"
            // Load four pixels (16 input bytes) and pack them into three words
            // (12 output bytes).
    615     "lw         %[temp0],    0(%[src])                       \n\t"
    616     "lw         %[temp1],    4(%[src])                       \n\t"
    617     "lw         %[temp2],    8(%[src])                       \n\t"
    618     "lw         %[temp3],    12(%[src])                      \n\t"
            // Word 0: pixel 0 with its top byte replaced by the low byte of
            // pixel 1.
    619     "ins        %[temp0],    %[temp1],          24,    8     \n\t"
            // Word 1: bits [23:8] of pixel 1 in the low half (the bits shifted in
            // by sra are overwritten next), low 16 bits of pixel 2 in the upper
            // half.
    620     "sra        %[temp1],    %[temp1],          8            \n\t"
    621     "ins        %[temp1],    %[temp2],          16,    16    \n\t"
            // Word 2: move bits [23:16] of pixel 2 into the top byte of temp2,
            // then balign shifts pixel 3 left by one byte and fills the freed low
            // byte with that top byte.
    622     "sll        %[temp2],    %[temp2],          8            \n\t"
    623     "balign     %[temp3],    %[temp2],          1            \n\t"
    624     "addiu      %[src],      %[src],            16           \n\t"
            // Unaligned word stores, since dst is a byte pointer.
    625     "usw        %[temp0],    0(%[dst])                       \n\t"
    626     "usw        %[temp1],    4(%[dst])                       \n\t"
    627     "usw        %[temp3],    8(%[dst])                       \n\t"
            // Loop until src reaches p_loop1_end; dst is advanced in the delay
            // slot.
    628     "bne        %[src],      %[p_loop1_end],    0b           \n\t"
    629     " addiu     %[dst],      %[dst],            12           \n\t"
    630   "3:                                                        \n\t"
            // Tail: nothing to do if all pixels were consumed above.
    631     "beq        %[src],      %[p_loop2_end],    2f           \n\t"
    632     " nop                                                    \n\t"
    633   "1:                                                        \n\t"
            // One pixel per iteration. Both pointers are advanced first, so the
            // stores use negative offsets: the low 16 bits go to dst-3 and, after
            // shifting the word right by 16, the next byte goes to dst-1 from the
            // delay slot of the loop branch.
    634     "lw         %[temp0],    0(%[src])                       \n\t"
    635     "addiu      %[src],      %[src],            4            \n\t"
    636     "addiu      %[dst],      %[dst],            3            \n\t"
    637     "ush        %[temp0],    -3(%[dst])                      \n\t"
    638     "sra        %[temp0],    %[temp0],          16           \n\t"
    639     "bne        %[src],      %[p_loop2_end],    1b           \n\t"
    640     " sb        %[temp0],    -1(%[dst])                      \n\t"
    641   "2:                                                        \n\t"
    642     ".set       pop                                          \n\t"
            // Scratch registers are early-clobber outputs; dst and src are
            // read-write because the asm advances them. "memory" is clobbered
            // because the asm stores through dst.
    643     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
    644       [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
    645     : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
    646     : "memory"
    647   );
    648 }
    649 
    650 //------------------------------------------------------------------------------
    651 // Entry point
    652 
    653 extern void VP8LDspInitMIPSdspR2(void);
    654 
        // Installs the MIPS DSP R2 routines into the VP8L function-pointer hooks.
        // The assignments are independent of one another; they are grouped by
        // purpose below.
    655 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
          // Output color-space conversions.
    656   VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
    657   VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
    658   VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
    659   VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
    660   VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
          // Inverse transforms.
    661   VP8LTransformColorInverse = TransformColorInverse;
    662   VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
          // Color-map lookups.
    663   VP8LMapColor8b = MapAlpha;
    664   VP8LMapColor32b = MapARGB;
          // Predictors: only slots 5 through 13 are overridden here.
    665   VP8LPredictors[5] = Predictor5;
    666   VP8LPredictors[6] = Predictor6;
    667   VP8LPredictors[7] = Predictor7;
    668   VP8LPredictors[8] = Predictor8;
    669   VP8LPredictors[9] = Predictor9;
    670   VP8LPredictors[10] = Predictor10;
    671   VP8LPredictors[11] = Predictor11;
    672   VP8LPredictors[12] = Predictor12;
    673   VP8LPredictors[13] = Predictor13;
    674 }
    675 
    676 #else  // !WEBP_USE_MIPS_DSP_R2
    677 
        // NOTE(review): WEBP_DSP_INIT_STUB is not defined in this file; presumably
        // it expands to an empty VP8LDspInitMIPSdspR2() so the symbol still exists
        // when WEBP_USE_MIPS_DSP_R2 is not defined -- confirm in dsp.h.
    678 WEBP_DSP_INIT_STUB(VP8LDspInitMIPSdspR2)
    679 
    680 #endif  // WEBP_USE_MIPS_DSP_R2
    681