Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/basic_types.h"
     12 #include "libyuv/row.h"
     13 
     14 #ifdef __cplusplus
     15 namespace libyuv {
     16 extern "C" {
     17 #endif
     18 
     19 // This module is for GCC MIPS DSPR2
     20 #if !defined(LIBYUV_DISABLE_MIPS) && \
     21     defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
     22     (_MIPS_SIM == _MIPS_SIM_ABI32)
     23 
     24 void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
     25                               uint8* dst, int dst_width) {
     26   __asm__ __volatile__(
     27     ".set push                                     \n"
     28     ".set noreorder                                \n"
     29 
     30     "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
     31     "beqz           $t9, 2f                        \n"
     32     " nop                                          \n"
     33 
     34     ".p2align       2                              \n"
     35   "1:                                              \n"
     36     "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
     37     "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
     38     "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
     39     "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
     40     "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
     41     "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
     42     "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
     43     "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
     44     // TODO(fbarchard): Use odd pixels instead of even.
     45     "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
     46     "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
     47     "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
     48     "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
     49     "addiu          %[src_ptr], %[src_ptr], 32     \n"
     50     "addiu          $t9, $t9, -1                   \n"
     51     "sw             $t8, 0(%[dst])                 \n"
     52     "sw             $t0, 4(%[dst])                 \n"
     53     "sw             $t1, 8(%[dst])                 \n"
     54     "sw             $t2, 12(%[dst])                \n"
     55     "bgtz           $t9, 1b                        \n"
     56     " addiu         %[dst], %[dst], 16             \n"
     57 
     58   "2:                                              \n"
     59     "andi           $t9, %[dst_width], 0xf         \n"  // residue
     60     "beqz           $t9, 3f                        \n"
     61     " nop                                          \n"
     62 
     63   "21:                                             \n"
     64     "lbu            $t0, 0(%[src_ptr])             \n"
     65     "addiu          %[src_ptr], %[src_ptr], 2      \n"
     66     "addiu          $t9, $t9, -1                   \n"
     67     "sb             $t0, 0(%[dst])                 \n"
     68     "bgtz           $t9, 21b                       \n"
     69     " addiu         %[dst], %[dst], 1              \n"
     70 
     71   "3:                                              \n"
     72     ".set pop                                      \n"
     73   : [src_ptr] "+r" (src_ptr),
     74     [dst] "+r" (dst)
     75   : [dst_width] "r" (dst_width)
     76   : "t0", "t1", "t2", "t3", "t4", "t5",
     77     "t6", "t7", "t8", "t9"
     78   );
     79 }
     80 
     81 void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
     82                                  uint8* dst, int dst_width) {
     83   const uint8* t = src_ptr + src_stride;
     84 
     85   __asm__ __volatile__ (
     86     ".set push                                    \n"
     87     ".set noreorder                               \n"
     88 
     89     "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
     90     "bltz           $t9, 2f                       \n"
     91     " nop                                         \n"
     92 
     93     ".p2align       2                             \n"
     94   "1:                                             \n"
     95     "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
     96     "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
     97     "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
     98     "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
     99     "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
    100     "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
    101     "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
    102     "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
    103     "addiu          $t9, $t9, -1                  \n"
    104     "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
    105     "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
    106     "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
    107     "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
    108     "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
    109     "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
    110     "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
    111     "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
    112     "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
    113     "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
    114     "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
    115     "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
    116     "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
    117     "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
    118     "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
    119     "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
    120     "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
    121     "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
    122     "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
    123     "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
    124     "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
    125     "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
    126     "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
    127     "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
    128     "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
    129     "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
    130     "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
    131     "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
    132     "addiu          %[src_ptr], %[src_ptr], 16    \n"
    133     "addiu          %[t], %[t], 16                \n"
    134     "sb             $t0, 0(%[dst])                \n"
    135     "sb             $t4, 1(%[dst])                \n"
    136     "sb             $t1, 2(%[dst])                \n"
    137     "sb             $t5, 3(%[dst])                \n"
    138     "sb             $t2, 4(%[dst])                \n"
    139     "sb             $t6, 5(%[dst])                \n"
    140     "sb             $t3, 6(%[dst])                \n"
    141     "sb             $t7, 7(%[dst])                \n"
    142     "bgtz           $t9, 1b                       \n"
    143     " addiu         %[dst], %[dst], 8             \n"
    144 
    145   "2:                                             \n"
    146     "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
    147     "beqz           $t9, 3f                       \n"
    148     " nop                                         \n"
    149 
    150     "21:                                          \n"
    151     "lwr            $t1, 0(%[src_ptr])            \n"
    152     "lwl            $t1, 3(%[src_ptr])            \n"
    153     "lwr            $t2, 0(%[t])                  \n"
    154     "lwl            $t2, 3(%[t])                  \n"
    155     "srl            $t8, $t1, 16                  \n"
    156     "ins            $t1, $t2, 16, 16              \n"
    157     "ins            $t2, $t8, 0, 16               \n"
    158     "raddu.w.qb     $t1, $t1                      \n"
    159     "raddu.w.qb     $t2, $t2                      \n"
    160     "shra_r.w       $t1, $t1, 2                   \n"
    161     "shra_r.w       $t2, $t2, 2                   \n"
    162     "sb             $t1, 0(%[dst])                \n"
    163     "sb             $t2, 1(%[dst])                \n"
    164     "addiu          %[src_ptr], %[src_ptr], 4     \n"
    165     "addiu          $t9, $t9, -2                  \n"
    166     "addiu          %[t], %[t], 4                 \n"
    167     "bgtz           $t9, 21b                      \n"
    168     " addiu         %[dst], %[dst], 2             \n"
    169 
    170   "3:                                             \n"
    171     ".set pop                                     \n"
    172 
    173   : [src_ptr] "+r" (src_ptr),
    174     [dst] "+r" (dst), [t] "+r" (t)
    175   : [dst_width] "r" (dst_width)
    176   : "t0", "t1", "t2", "t3", "t4", "t5",
    177     "t6", "t7", "t8", "t9"
    178   );
    179 }
    180 
    181 void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    182                               uint8* dst, int dst_width) {
    183   __asm__ __volatile__ (
    184       ".set push                                    \n"
    185       ".set noreorder                               \n"
    186 
    187       "srl            $t9, %[dst_width], 3          \n"
    188       "beqz           $t9, 2f                       \n"
    189       " nop                                         \n"
    190 
    191       ".p2align       2                             \n"
    192      "1:                                            \n"
    193       "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
    194       "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
    195       "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
    196       "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
    197       "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
    198       "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
    199       "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
    200       "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
    201       "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
    202       "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
    203       "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
    204       "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
    205       "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
    206       "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
    207       "addiu          %[src_ptr], %[src_ptr], 32    \n"
    208       "addiu          $t9, $t9, -1                  \n"
    209       "sw             $t1, 0(%[dst])                \n"
    210       "sw             $t5, 4(%[dst])                \n"
    211       "bgtz           $t9, 1b                       \n"
    212       " addiu         %[dst], %[dst], 8             \n"
    213 
    214     "2:                                             \n"
    215       "andi           $t9, %[dst_width], 7          \n"  // residue
    216       "beqz           $t9, 3f                       \n"
    217       " nop                                         \n"
    218 
    219     "21:                                            \n"
    220       "lbu            $t1, 0(%[src_ptr])            \n"
    221       "addiu          %[src_ptr], %[src_ptr], 4     \n"
    222       "addiu          $t9, $t9, -1                  \n"
    223       "sb             $t1, 0(%[dst])                \n"
    224       "bgtz           $t9, 21b                      \n"
    225       " addiu         %[dst], %[dst], 1             \n"
    226 
    227     "3:                                             \n"
    228       ".set pop                                     \n"
    229       : [src_ptr] "+r" (src_ptr),
    230         [dst] "+r" (dst)
    231       : [dst_width] "r" (dst_width)
    232       : "t1", "t2", "t3", "t4", "t5",
    233         "t6", "t7", "t8", "t9"
    234   );
    235 }
    236 
    237 void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    238                                  uint8* dst, int dst_width) {
    239   intptr_t stride = src_stride;
    240   const uint8* s1 = src_ptr + stride;
    241   const uint8* s2 = s1 + stride;
    242   const uint8* s3 = s2 + stride;
    243 
    244   __asm__ __volatile__ (
    245       ".set push                                  \n"
    246       ".set noreorder                             \n"
    247 
    248       "srl           $t9, %[dst_width], 1         \n"
    249       "andi          $t8, %[dst_width], 1         \n"
    250 
    251       ".p2align      2                            \n"
    252      "1:                                          \n"
    253       "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
    254       "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
    255       "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
    256       "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
    257       "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
    258       "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
    259       "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
    260       "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
    261       "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
    262       "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
    263       "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
    264       "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
    265       "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
    266       "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
    267       "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
    268       "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
    269       "add           $t0, $t0, $t1                \n"
    270       "add           $t1, $t2, $t3                \n"
    271       "add           $t0, $t0, $t1                \n"
    272       "add           $t4, $t4, $t5                \n"
    273       "add           $t6, $t6, $t7                \n"
    274       "add           $t4, $t4, $t6                \n"
    275       "shra_r.w      $t0, $t0, 4                  \n"
    276       "shra_r.w      $t4, $t4, 4                  \n"
    277       "sb            $t0, 0(%[dst])               \n"
    278       "sb            $t4, 1(%[dst])               \n"
    279       "addiu         %[src_ptr], %[src_ptr], 8    \n"
    280       "addiu         %[s1], %[s1], 8              \n"
    281       "addiu         %[s2], %[s2], 8              \n"
    282       "addiu         %[s3], %[s3], 8              \n"
    283       "addiu         $t9, $t9, -1                 \n"
    284       "bgtz          $t9, 1b                      \n"
    285       " addiu        %[dst], %[dst], 2            \n"
    286       "beqz          $t8, 2f                      \n"
    287       " nop                                       \n"
    288 
    289       "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
    290       "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
    291       "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
    292       "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
    293       "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
    294       "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
    295       "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
    296       "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
    297       "add           $t0, $t0, $t1                \n"
    298       "add           $t1, $t2, $t3                \n"
    299       "add           $t0, $t0, $t1                \n"
    300       "shra_r.w      $t0, $t0, 4                  \n"
    301       "sb            $t0, 0(%[dst])               \n"
    302 
    303       "2:                                         \n"
    304       ".set pop                                   \n"
    305 
    306       : [src_ptr] "+r" (src_ptr),
    307         [dst] "+r" (dst),
    308         [s1] "+r" (s1),
    309         [s2] "+r" (s2),
    310         [s3] "+r" (s3)
    311       : [dst_width] "r" (dst_width)
    312       : "t0", "t1", "t2", "t3", "t4", "t5",
    313         "t6","t7", "t8", "t9"
    314   );
    315 }
    316 
    317 void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    318                                uint8* dst, int dst_width) {
    319   __asm__ __volatile__ (
    320       ".set push                                          \n"
    321       ".set noreorder                                     \n"
    322       ".p2align        2                                  \n"
    323     "1:                                                   \n"
    324       "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
    325       "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
    326       "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
    327       "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
    328       "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
    329       "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
    330       "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
    331       "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
    332       "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
    333       "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|30|
    334       "addiu           %[dst_width], %[dst_width], -24    \n"
    335       "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
    336       "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
    337       "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
    338       "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
    339       "addiu           %[src_ptr], %[src_ptr], 32         \n"
    340       "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
    341       "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
    342       "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
    343       "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
    344       "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
    345       "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
    346       "sw              $t1, 0(%[dst])                     \n"
    347       "sw              $t0, 4(%[dst])                     \n"
    348       "sw              $t3, 8(%[dst])                     \n"
    349       "sw              $t5, 12(%[dst])                    \n"
    350       "sw              $t9, 16(%[dst])                    \n"
    351       "sw              $t7, 20(%[dst])                    \n"
    352       "bnez            %[dst_width], 1b                   \n"
    353       " addiu          %[dst], %[dst], 24                 \n"
    354       ".set pop                                           \n"
    355       : [src_ptr] "+r" (src_ptr),
    356         [dst] "+r" (dst),
    357         [dst_width] "+r" (dst_width)
    358       :
    359       : "t0", "t1", "t2", "t3", "t4", "t5",
    360         "t6","t7", "t8", "t9"
    361   );
    362 }
    363 
    364 void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    365                                      uint8* d, int dst_width) {
    366   __asm__ __volatile__ (
    367       ".set push                                         \n"
    368       ".set noreorder                                    \n"
    369       "repl.ph           $t3, 3                          \n"  // 0x00030003
    370 
    371      ".p2align           2                               \n"
    372     "1:                                                  \n"
    373       "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
    374       "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
    375       "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
    376       "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
    377       "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
    378       "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
    379       "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
    380       "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
    381       "raddu.w.qb        $t0, $t0                        \n"
    382       "raddu.w.qb        $t1, $t1                        \n"
    383       "shra_r.w          $t0, $t0, 1                     \n"
    384       "shra_r.w          $t1, $t1, 1                     \n"
    385       "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
    386       "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
    387       "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
    388       "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
    389       "addu.ph           $t2, $t2, $t4                   \n"
    390       "addu.ph           $t6, $t6, $t5                   \n"
    391       "sll               $t5, $t0, 1                     \n"
    392       "add               $t0, $t5, $t0                   \n"
    393       "shra_r.ph         $t2, $t2, 2                     \n"
    394       "shra_r.ph         $t6, $t6, 2                     \n"
    395       "shll.ph           $t4, $t2, 1                     \n"
    396       "addq.ph           $t4, $t4, $t2                   \n"
    397       "addu              $t0, $t0, $t1                   \n"
    398       "addiu             %[src_ptr], %[src_ptr], 4       \n"
    399       "shra_r.w          $t0, $t0, 2                     \n"
    400       "addu.ph           $t6, $t6, $t4                   \n"
    401       "shra_r.ph         $t6, $t6, 2                     \n"
    402       "srl               $t1, $t6, 16                    \n"
    403       "addiu             %[dst_width], %[dst_width], -3  \n"
    404       "sb                $t1, 0(%[d])                    \n"
    405       "sb                $t0, 1(%[d])                    \n"
    406       "sb                $t6, 2(%[d])                    \n"
    407       "bgtz              %[dst_width], 1b                \n"
    408       " addiu            %[d], %[d], 3                   \n"
    409     "3:                                                  \n"
    410       ".set pop                                          \n"
    411       : [src_ptr] "+r" (src_ptr),
    412         [src_stride] "+r" (src_stride),
    413         [d] "+r" (d),
    414         [dst_width] "+r" (dst_width)
    415       :
    416       : "t0", "t1", "t2", "t3",
    417         "t4", "t5", "t6"
    418   );
    419 }
    420 
    421 void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    422                                      uint8* d, int dst_width) {
    423   __asm__ __volatile__ (
    424       ".set push                                           \n"
    425       ".set noreorder                                      \n"
    426       "repl.ph           $t2, 3                            \n"  // 0x00030003
    427 
    428       ".p2align          2                                 \n"
    429     "1:                                                    \n"
    430       "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
    431       "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
    432       "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
    433       "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
    434       "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
    435       "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
    436       "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
    437       "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
    438       "raddu.w.qb        $t0, $t0                          \n"
    439       "raddu.w.qb        $t1, $t1                          \n"
    440       "shra_r.w          $t0, $t0, 1                       \n"
    441       "shra_r.w          $t1, $t1, 1                       \n"
    442       "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
    443       "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
    444       "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
    445       "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
    446       "addu.ph           $t4, $t4, $t3                     \n"
    447       "addu.ph           $t6, $t6, $t5                     \n"
    448       "shra_r.ph         $t6, $t6, 2                       \n"
    449       "shra_r.ph         $t4, $t4, 2                       \n"
    450       "addu.ph           $t6, $t6, $t4                     \n"
    451       "addiu             %[src_ptr], %[src_ptr], 4         \n"
    452       "shra_r.ph         $t6, $t6, 1                       \n"
    453       "addu              $t0, $t0, $t1                     \n"
    454       "addiu             %[dst_width], %[dst_width], -3    \n"
    455       "shra_r.w          $t0, $t0, 1                       \n"
    456       "srl               $t1, $t6, 16                      \n"
    457       "sb                $t1, 0(%[d])                      \n"
    458       "sb                $t0, 1(%[d])                      \n"
    459       "sb                $t6, 2(%[d])                      \n"
    460       "bgtz              %[dst_width], 1b                  \n"
    461       " addiu            %[d], %[d], 3                     \n"
    462     "3:                                                    \n"
    463       ".set pop                                            \n"
    464       : [src_ptr] "+r" (src_ptr),
    465         [src_stride] "+r" (src_stride),
    466         [d] "+r" (d),
    467         [dst_width] "+r" (dst_width)
    468       :
    469       : "t0", "t1", "t2", "t3",
    470         "t4", "t5", "t6"
    471   );
    472 }
    473 
    474 void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    475                                uint8* dst, int dst_width) {
    476   __asm__ __volatile__ (
    477       ".set push                                     \n"
    478       ".set noreorder                                \n"
    479 
    480       ".p2align   2                                  \n"
    481     "1:                                              \n"
    482       "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
    483       "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
    484       "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
    485       "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
    486       "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
    487       "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
    488       "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
    489       "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
    490       "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
    491       "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
    492       "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
    493       "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
    494       "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
    495       "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
    496       "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
    497       "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
    498       "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
    499       "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
    500       "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
    501       "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
    502       "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
    503       "addiu      %[src_ptr], %[src_ptr], 32         \n"
    504       "addiu      %[dst_width], %[dst_width], -12    \n"
    505       "addiu      $t8,%[dst_width], -12              \n"
    506       "sw         $t1, 0(%[dst])                     \n"
    507       "sw         $t4, 4(%[dst])                     \n"
    508       "sw         $t6, 8(%[dst])                     \n"
    509       "bgez       $t8, 1b                            \n"
    510       " addiu     %[dst], %[dst], 12                 \n"
    511       ".set pop                                      \n"
    512       : [src_ptr] "+r" (src_ptr),
    513         [dst] "+r" (dst),
    514         [dst_width] "+r" (dst_width)
    515       :
    516       : "t0", "t1", "t2", "t3", "t4",
    517         "t5", "t6", "t7", "t8"
    518   );
    519 }
    520 
    521 void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    522                                      uint8* dst_ptr, int dst_width) {
          // 3/8 box downscale over two rows: S is the row at src_ptr, T is the row
          // at src_ptr + src_stride. Each loop pass reads 8 bytes of each row with
          // word loads and writes 3 bytes:
          //   dst_ptr[0] = ((S0+S1+S2 + T0+T1+T2) * 0x2AAA) >> 16
          //   dst_ptr[1] = ((S3+S4+S5 + T3+T4+T5) * 0x2AAA) >> 16
          //   dst_ptr[2] =  (S6+S7 + T6+T7) >> 2
          // The loop body runs before dst_width is tested and dst_width drops by 3
          // per pass, so at least 3 bytes are always written.
          // NOTE(review): the |S3|S2|S1|S0| lane comments assume little-endian byte
          // order, and the lw loads presumably need 4-byte aligned rows - confirm
          // against the callers.
    523   intptr_t stride = src_stride;
    524   const uint8* t = src_ptr + stride;
          // 0x2AAA is 65536 / 6 rounded down, so (sum * c) >> 16 approximates sum / 6.
    525   const int c = 0x2AAA;
    526 
    527   __asm__ __volatile__ (
    528       ".set push                                         \n"
    529       ".set noreorder                                    \n"
    530 
    531       ".p2align        2                                 \n"
    532     "1:                                                  \n"
    533       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
    534       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
    535       "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
    536       "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
    537       "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
    538       "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
    539       "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
    540       "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
    541       "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
    542       "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
    543       "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
    544       "srl             $t4, $t4, 2                       \n"  // t4 / 4
    545       "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
    546       "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
    547       "addu            $t6, $t5, $t6                     \n"  // columns 3..5
    548       "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
    549       "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
    550       "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
    551       "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
    552       "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
    553       "addu            $t0, $t0, $t2                     \n"  // columns 0..2
    554       "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
    555       "addiu           %[src_ptr], %[src_ptr], 8         \n"
    556       "addiu           %[t], %[t], 8                     \n"
    557       "addiu           %[dst_width], %[dst_width], -3    \n"
    558       "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
    559       "srl             $t6, $t6, 16                      \n"  // product >> 16
    560       "srl             $t0, $t0, 16                      \n"  // product >> 16
    561       "sb              $t4, -1(%[dst_ptr])               \n"  // 3rd output
    562       "sb              $t6, -2(%[dst_ptr])               \n"  // 2nd output
    563       "bgtz            %[dst_width], 1b                  \n"
    564       " sb             $t0, -3(%[dst_ptr])               \n"  // delay slot
    565       ".set pop                                          \n"
    566       : [src_ptr] "+r" (src_ptr),
    567         [dst_ptr] "+r" (dst_ptr),
    568         [t] "+r" (t),
    569         [dst_width] "+r" (dst_width)
    570       : [c] "r" (c)
              // "hi"/"lo": mul may leave HI/LO unpredictable. "memory": the asm
              // loads from both source rows and stores through dst_ptr, and none
              // of that is expressed as a memory operand.
    571       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "hi", "lo", "memory"
    572   );
    573 }
    574 
    575 void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
    576                                      ptrdiff_t src_stride,
    577                                      uint8* dst_ptr, int dst_width) {
          // 3/8 box downscale over three rows: S is the row at src_ptr, T the row
          // at src_ptr + src_stride and R the row at src_ptr + 2 * src_stride.
          // Each loop pass reads 8 bytes of each row with word loads and writes
          // 3 bytes:
          //   dst_ptr[0] = ((S0+S1+S2 + T0+T1+T2 + R0+R1+R2) * 0x1C71) >> 16
          //   dst_ptr[1] = ((S3+S4+S5 + T3+T4+T5 + R3+R4+R5) * 0x1C71) >> 16
          //   dst_ptr[2] = ((S6+S7 + T6+T7 + R6+R7) * 0x2AAA) >> 16
          // The loop body runs before dst_width is tested and dst_width drops by 3
          // per pass, so at least 3 bytes are always written.
          // NOTE(review): the |S3|S2|S1|S0| lane comments assume little-endian byte
          // order, and the lw loads presumably need 4-byte aligned rows - confirm
          // against the callers.
    578   intptr_t stride = src_stride;
    579   const uint8* s1 = src_ptr + stride;
    580   stride += stride;
    581   const uint8* s2 = src_ptr + stride;
          // 0x1C71 is 65536 / 9 and 0x2AAA is 65536 / 6 (both rounded down), so
          // (sum * cN) >> 16 approximates sum / 9 and sum / 6 respectively.
    582   const int c1 = 0x1C71;
    583   const int c2 = 0x2AAA;
    584 
    585   __asm__ __volatile__ (
    586       ".set push                                         \n"
    587       ".set noreorder                                    \n"
    588 
    589       ".p2align        2                                 \n"
    590     "1:                                                  \n"
    591       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
    592       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
    593       "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
    594       "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
    595       "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
    596       "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
    597       "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
    598       "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
    599       "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
    600       "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
    601       "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
    602       "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
    603       "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
    604       "addu            $t7, $t7, $t8                     \n"
    605       "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
    606       "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
    607       "addu            $t6, $t6, $t8                     \n"  // columns 6..7
    608       "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
    609       "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
    610       "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
    611       "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
    612       "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
    613       "addu            $t7, $t7, $t8                     \n"  // columns 3..5
    614       "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
    615       "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
    616       "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
    617       "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
    618       "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
    619       "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
    620       "raddu.w.qb      $t4, $t4                          \n"  // R2+R1+R0+0
    621       "addu            $t0, $t0, $t2                     \n"
    622       "addu            $t0, $t0, $t4                     \n"  // columns 0..2
    623       "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
    624       "addiu           %[src_ptr], %[src_ptr], 8         \n"
    625       "addiu           %[s1], %[s1], 8                   \n"
    626       "addiu           %[s2], %[s2], 8                   \n"
    627       "addiu           %[dst_width], %[dst_width], -3    \n"
    628       "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
    629       "srl             $t6, $t6, 16                      \n"  // product >> 16
    630       "srl             $t7, $t7, 16                      \n"  // product >> 16
    631       "srl             $t0, $t0, 16                      \n"  // product >> 16
    632       "sb              $t6, -1(%[dst_ptr])               \n"  // 3rd output
    633       "sb              $t7, -2(%[dst_ptr])               \n"  // 2nd output
    634       "bgtz            %[dst_width], 1b                  \n"
    635       " sb             $t0, -3(%[dst_ptr])               \n"  // delay slot
    636       ".set pop                                          \n"
    637       : [src_ptr] "+r" (src_ptr),
    638         [dst_ptr] "+r" (dst_ptr),
    639         [s1] "+r" (s1),
    640         [s2] "+r" (s2),
    641         [dst_width] "+r" (dst_width)
    642       : [c1] "r" (c1), [c2] "r" (c2)
              // "hi"/"lo": mul may leave HI/LO unpredictable. "memory": the asm
              // loads from the three source rows and stores through dst_ptr, and
              // none of that is expressed as a memory operand.
    643       : "t0", "t1", "t2", "t3", "t4",
    644         "t5", "t6", "t7", "t8", "hi", "lo", "memory"
    645   );
    646 }
    647 
    648 #endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
    649 
    650 #ifdef __cplusplus
    651 }  // extern "C"
    652 }  // namespace libyuv
    653 #endif
    654 
    655