Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/basic_types.h"
     12 #include "libyuv/row.h"
     13 
     14 #ifdef __cplusplus
     15 namespace libyuv {
     16 extern "C" {
     17 #endif
     18 
     19 // This module is for GCC MIPS DSPR2
     20 #if !defined(LIBYUV_DISABLE_MIPS) && \
     21     defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
     22     (_MIPS_SIM == _MIPS_SIM_ABI32)
     23 
     24 void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
     25                          uint8* dst, int dst_width) {
          // Point-samples one row down to half width: the even-indexed source
          // bytes (0, 2, 4, ...) are copied to dst. src_stride is not referenced.
          // The main loop turns 32 source bytes into 16 output bytes per pass using
          // word loads/stores; the residue loop then copies one byte per pass.
          // NOTE(review): lw/sw presumably need src_ptr and dst to be 4-byte
          // aligned -- confirm that the callers guarantee this.
     26   __asm__ __volatile__(
     27     ".set push                                     \n"
     28     ".set noreorder                                \n"  // branch delay slots are filled by hand
     29 
     30     "srl            $t9, %[dst_width], 4           \n"  // t9 = dst_width / 16 (16 pixels per pass)
     31     "beqz           $t9, 2f                        \n"  // fewer than 16 pixels: residue only
     32     " nop                                          \n"
     33 
     34   "1:                                              \n"
     35     "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
     36     "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
     37     "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
     38     "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
     39     "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
     40     "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
     41     "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
     42     "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
     43     // TODO(fbarchard): Use odd pixels instead of even.
            // precr.qb.ph keeps the low byte of every halfword, i.e. the even bytes.
     44     "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
     45     "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
     46     "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
     47     "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
     48     "addiu          %[src_ptr], %[src_ptr], 32     \n"
     49     "addiu          $t9, $t9, -1                   \n"
     50     "sw             $t8, 0(%[dst])                 \n"
     51     "sw             $t0, 4(%[dst])                 \n"
     52     "sw             $t1, 8(%[dst])                 \n"
     53     "sw             $t2, 12(%[dst])                \n"
     54     "bgtz           $t9, 1b                        \n"
     55     " addiu         %[dst], %[dst], 16             \n"  // delay slot: advance dst on every pass
     56 
     57   "2:                                              \n"
     58     "andi           $t9, %[dst_width], 0xf         \n"  // residue = dst_width % 16
     59     "beqz           $t9, 3f                        \n"
     60     " nop                                          \n"
     61 
     62   "21:                                             \n"
     63     "lbu            $t0, 0(%[src_ptr])             \n"  // take one byte ...
     64     "addiu          %[src_ptr], %[src_ptr], 2      \n"  // ... and skip the next one
     65     "addiu          $t9, $t9, -1                   \n"
     66     "sb             $t0, 0(%[dst])                 \n"
     67     "bgtz           $t9, 21b                       \n"
     68     " addiu         %[dst], %[dst], 1              \n"  // delay slot
     69 
     70   "3:                                              \n"
     71     ".set pop                                      \n"
     72   : [src_ptr] "+r" (src_ptr),
     73     [dst] "+r" (dst)
     74   : [dst_width] "r" (dst_width)
     75   : "t0", "t1", "t2", "t3", "t4", "t5",
     76     "t6", "t7", "t8", "t9"
     77   );
     78 }
     79 
     80 void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
     81                             uint8* dst, int dst_width) {
          // Box-filters two rows down to half width: every output byte is the
          // rounded average (sum + 2) >> 2 of a 2x2 block taken from the row at
          // src_ptr and the row at src_ptr + src_stride.
          // The main loop emits 8 pixels per pass from 16 bytes of each row; the
          // residue loop emits 2 pixels per pass.
          // Lane comments number the src_ptr row bytes 0..15 and the bytes of the
          // second row 16..31.
          // NOTE(review): the main loop uses aligned word loads, so both rows are
          // presumably 4-byte aligned -- confirm against the callers.
     82   const uint8* t = src_ptr + src_stride;
     83 
     84   __asm__ __volatile__ (
     85     ".set push                                    \n"
     86     ".set noreorder                               \n"  // branch delay slots are filled by hand
     87 
     88     "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
            // srl never yields a negative value for widths below 8, so the guard has
            // to test for zero; otherwise the 8-pixel loop runs once and overruns
            // dst and both source rows.
     89     "beqz           $t9, 2f                       \n"
     90     " nop                                         \n"
     91 
     92   "1:                                             \n"
     93     "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
     94     "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
     95     "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
     96     "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
     97     "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
     98     "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
     99     "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
    100     "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
    101     "addiu          $t9, $t9, -1                  \n"
    102     "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
    103     "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
    104     "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
    105     "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
    106     "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
    107     "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
    108     "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
    109     "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
    110     "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
    111     "ins            $t5, $t8, 0, 16               \n"  // |23|22|7|6|
    112     "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
    113     "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
    114     "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
    115     "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
    116     "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
    117     "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
    118     "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
    119     "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
    120     "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
    121     "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
    122     "shra_r.w       $t6, $t6, 2                   \n"  // |t6+2|>>2
    123     "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
    124     "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
    125     "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
    126     "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
    127     "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
    128     "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
    129     "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
    130     "addiu          %[src_ptr], %[src_ptr], 16    \n"
    131     "addiu          %[t], %[t], 16                \n"
    132     "sb             $t0, 0(%[dst])                \n"
    133     "sb             $t4, 1(%[dst])                \n"
    134     "sb             $t1, 2(%[dst])                \n"
    135     "sb             $t5, 3(%[dst])                \n"
    136     "sb             $t2, 4(%[dst])                \n"
    137     "sb             $t6, 5(%[dst])                \n"
    138     "sb             $t3, 6(%[dst])                \n"
    139     "sb             $t7, 7(%[dst])                \n"
    140     "bgtz           $t9, 1b                       \n"
    141     " addiu         %[dst], %[dst], 8             \n"  // delay slot
    142 
    143   "2:                                             \n"
    144     "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
    145     "beqz           $t9, 3f                       \n"
    146     " nop                                         \n"
    147 
            // Residue: each pass reads 4 bytes from both rows (lwr/lwl pair, so no
            // alignment is needed) and stores 2 pixels; t9 drops by 2 per pass.
            // NOTE(review): an odd residue therefore still stores a full pair --
            // confirm the callers tolerate the extra byte.
    148     "21:                                          \n"
    149     "lwr            $t1, 0(%[src_ptr])            \n"
    150     "lwl            $t1, 3(%[src_ptr])            \n"
    151     "lwr            $t2, 0(%[t])                  \n"
    152     "lwl            $t2, 3(%[t])                  \n"
    153     "srl            $t8, $t1, 16                  \n"
    154     "ins            $t1, $t2, 16, 16              \n"
    155     "ins            $t2, $t8, 0, 16               \n"
    156     "raddu.w.qb     $t1, $t1                      \n"
    157     "raddu.w.qb     $t2, $t2                      \n"
    158     "shra_r.w       $t1, $t1, 2                   \n"
    159     "shra_r.w       $t2, $t2, 2                   \n"
    160     "sb             $t1, 0(%[dst])                \n"
    161     "sb             $t2, 1(%[dst])                \n"
    162     "addiu          %[src_ptr], %[src_ptr], 4     \n"
    163     "addiu          $t9, $t9, -2                  \n"
    164     "addiu          %[t], %[t], 4                 \n"
    165     "bgtz           $t9, 21b                      \n"
    166     " addiu         %[dst], %[dst], 2             \n"  // delay slot
    167 
    168   "3:                                             \n"
    169     ".set pop                                     \n"
    170 
    171   : [src_ptr] "+r" (src_ptr),
    172     [dst] "+r" (dst), [t] "+r" (t)
    173   : [dst_width] "r" (dst_width)
    174   : "t0", "t1", "t2", "t3", "t4", "t5",
    175     "t6", "t7", "t8", "t9"
    176   );
    177 }
    178 
    179 void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    180                          uint8* dst, int dst_width) {
          // Point-samples one row down to quarter width: source bytes 0, 4, 8, ...
          // are copied to dst. src_stride is not referenced.
          // The main loop turns 32 source bytes into 8 output bytes per pass; the
          // residue loop copies one byte per pass.
          // NOTE(review): lw/sw presumably need src_ptr and dst to be 4-byte
          // aligned -- confirm that the callers guarantee this.
    181   __asm__ __volatile__ (
    182       ".set push                                    \n"
    183       ".set noreorder                               \n"  // branch delay slots are filled by hand
    184 
    185       "srl            $t9, %[dst_width], 3          \n"  // t9 = dst_width / 8 (8 pixels per pass)
    186       "beqz           $t9, 2f                       \n"  // fewer than 8 pixels: residue only
    187       " nop                                         \n"
    188 
    189      "1:                                            \n"
    190       "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
    191       "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
    192       "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
    193       "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
    194       "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
    195       "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
    196       "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
    197       "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
              // First round keeps every 2nd byte, second round every 2nd of those.
    198       "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
    199       "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
    200       "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
    201       "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
    202       "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
    203       "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
    204       "addiu          %[src_ptr], %[src_ptr], 32    \n"
    205       "addiu          $t9, $t9, -1                  \n"
    206       "sw             $t1, 0(%[dst])                \n"
    207       "sw             $t5, 4(%[dst])                \n"
    208       "bgtz           $t9, 1b                       \n"
    209       " addiu         %[dst], %[dst], 8             \n"  // delay slot
    210 
    211     "2:                                             \n"
    212       "andi           $t9, %[dst_width], 7          \n"  // residue = dst_width % 8
    213       "beqz           $t9, 3f                       \n"
    214       " nop                                         \n"
    215 
    216     "21:                                            \n"
    217       "lbu            $t1, 0(%[src_ptr])            \n"  // take one byte ...
    218       "addiu          %[src_ptr], %[src_ptr], 4     \n"  // ... and skip the next three
    219       "addiu          $t9, $t9, -1                  \n"
    220       "sb             $t1, 0(%[dst])                \n"
    221       "bgtz           $t9, 21b                      \n"
    222       " addiu         %[dst], %[dst], 1             \n"  // delay slot
    223 
    224     "3:                                             \n"
    225       ".set pop                                     \n"
    226       : [src_ptr] "+r" (src_ptr),
    227         [dst] "+r" (dst)
    228       : [dst_width] "r" (dst_width)
    229       : "t1", "t2", "t3", "t4", "t5",
    230         "t6", "t7", "t8", "t9"
    231   );
    232 }
    233 
    234 void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    235                             uint8* dst, int dst_width) {
          // Box-filters four rows down to quarter width: every output byte is the
          // rounded average (sum + 8) >> 4 of a 4x4 block taken from the rows at
          // src_ptr, src_ptr + src_stride, + 2 * src_stride and + 3 * src_stride.
          // The loop emits 2 pixels per pass; one trailing pixel is handled after
          // it when dst_width is odd.
          // The numbers in the lane comments only tag the 16 bytes of a block;
          // they are not source offsets.
          // NOTE(review): lw presumably needs all four rows 4-byte aligned --
          // confirm against the callers.
    236   intptr_t stride = src_stride;
    237   const uint8* s1 = src_ptr + stride;
    238   const uint8* s2 = s1 + stride;
    239   const uint8* s3 = s2 + stride;
    240 
    241   __asm__ __volatile__ (
    242       ".set push                                  \n"
    243       ".set noreorder                             \n"
    244 
    245       "srl           $t9, %[dst_width], 1         \n"  // t9 = number of pixel pairs
    246       "andi          $t8, %[dst_width], 1         \n"  // t8 = 1 when a single pixel is left over
              // Without this guard a width of 0 or 1 would still run the pair loop
              // once and write two bytes past what the caller asked for.
              "beqz          $t9, 11f                     \n"
              " nop                                       \n"
    247 
    248      "1:                                          \n"
    249       "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
    250       "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
    251       "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
    252       "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
    253       "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
    254       "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
    255       "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
    256       "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
    257       "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
    258       "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
    259       "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
    260       "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
    261       "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
    262       "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
    263       "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
    264       "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
    265       "add           $t0, $t0, $t1                \n"
    266       "add           $t1, $t2, $t3                \n"
    267       "add           $t0, $t0, $t1                \n"  // t0 = sum of first 4x4 block
    268       "add           $t4, $t4, $t5                \n"
    269       "add           $t6, $t6, $t7                \n"
    270       "add           $t4, $t4, $t6                \n"  // t4 = sum of second 4x4 block
    271       "shra_r.w      $t0, $t0, 4                  \n"  // (t0 + 8) >> 4
    272       "shra_r.w      $t4, $t4, 4                  \n"  // (t4 + 8) >> 4
    273       "sb            $t0, 0(%[dst])               \n"
    274       "sb            $t4, 1(%[dst])               \n"
    275       "addiu         %[src_ptr], %[src_ptr], 8    \n"
    276       "addiu         %[s1], %[s1], 8              \n"
    277       "addiu         %[s2], %[s2], 8              \n"
    278       "addiu         %[s3], %[s3], 8              \n"
    279       "addiu         $t9, $t9, -1                 \n"
    280       "bgtz          $t9, 1b                      \n"
    281       " addiu        %[dst], %[dst], 2            \n"  // delay slot

             "11:                                         \n"  // odd-pixel tail
    282       "beqz          $t8, 2f                      \n"
    283       " nop                                       \n"
    284 
    285       "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
    286       "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
    287       "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
    288       "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
    289       "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
    290       "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
    291       "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
    292       "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
    293       "add           $t0, $t0, $t1                \n"
    294       "add           $t1, $t2, $t3                \n"
    295       "add           $t0, $t0, $t1                \n"
    296       "shra_r.w      $t0, $t0, 4                  \n"  // (t0 + 8) >> 4
    297       "sb            $t0, 0(%[dst])               \n"
    298 
    299       "2:                                         \n"
    300       ".set pop                                   \n"
    301 
    302       : [src_ptr] "+r" (src_ptr),
    303         [dst] "+r" (dst),
    304         [s1] "+r" (s1),
    305         [s2] "+r" (s2),
    306         [s3] "+r" (s3)
    307       : [dst_width] "r" (dst_width)
    308       : "t0", "t1", "t2", "t3", "t4", "t5",
    309         "t6","t7", "t8", "t9"
    310   );
    311 }
    312 
    313 void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    314                           uint8* dst, int dst_width) {
          // Point-samples one row down to 3/4 width: out of every 4 source bytes
          // the ones at offsets 0, 1 and 3 are kept. src_stride is not referenced.
          // Each pass turns 32 source bytes into 24 output bytes.
          // The loop has no entry check and exits only when dst_width reaches
          // exactly 0.
          // NOTE(review): this presumably relies on callers passing a positive
          // multiple of 24 and 4-byte aligned pointers -- confirm.
    315   __asm__ __volatile__ (
    316       ".set push                                          \n"
    317       ".set noreorder                                     \n"
    318     "1:                                                   \n"
    319       "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
    320       "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
    321       "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
    322       "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
    323       "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
    324       "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
    325       "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
    326       "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
    327       "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
    328       "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|29|
    329       "addiu           %[dst_width], %[dst_width], -24    \n"
    330       "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
    331       "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
    332       "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
    333       "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
    334       "addiu           %[src_ptr], %[src_ptr], 32         \n"
    335       "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
    336       "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
    337       "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
    338       "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
    339       "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
    340       "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
    341       "sw              $t1, 0(%[dst])                     \n"
    342       "sw              $t0, 4(%[dst])                     \n"
    343       "sw              $t3, 8(%[dst])                     \n"
    344       "sw              $t5, 12(%[dst])                    \n"
    345       "sw              $t9, 16(%[dst])                    \n"
    346       "sw              $t7, 20(%[dst])                    \n"
    347       "bnez            %[dst_width], 1b                   \n"  // loops until the counter is exactly 0
    348       " addiu          %[dst], %[dst], 24                 \n"  // delay slot
    349       ".set pop                                           \n"
    350       : [src_ptr] "+r" (src_ptr),
    351         [dst] "+r" (dst),
    352         [dst_width] "+r" (dst_width)
    353       :
    354       : "t0", "t1", "t2", "t3", "t4", "t5",
    355         "t6","t7", "t8", "t9"
    356   );
    357 }
    358 
    359 void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    360                                 uint8* d, int dst_width) {
          // Filters two rows down to 3/4 width. Each pass reads 4 bytes of row S
          // (at src_ptr) and 4 bytes of row T (at src_ptr + src_stride, via lwx)
          // and stores 3 output bytes. Per row the 4 pixels p0..p3 are reduced to
          //   a = (3*p0 + p1 + 2) >> 2
          //   b = (p1 + p2 + 1) >> 1
          //   c = (p2 + 3*p3 + 2) >> 2
          // and the two rows are then blended as (3*S + T + 2) >> 2.
          // The loop always runs at least once and repeats while dst_width, reduced
          // by 3 per pass, is still positive.
          // NOTE(review): presumably dst_width is a positive multiple of 3 and the
          // source rows are 4-byte aligned -- confirm against the callers.
    361   __asm__ __volatile__ (
    362       ".set push                                         \n"
    363       ".set noreorder                                    \n"
    364       "repl.ph           $t3, 3                          \n"  // 0x00030003
    365 
    366     "1:                                                  \n"
    367       "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
    368       "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
    369       "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
    370       "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
    371       "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
    372       "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
    373       "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
    374       "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
    375       "raddu.w.qb        $t0, $t0                        \n"  // S2+S1
    376       "raddu.w.qb        $t1, $t1                        \n"  // T2+T1
    377       "shra_r.w          $t0, $t0, 1                     \n"  // b for row S
    378       "shra_r.w          $t1, $t1, 1                     \n"  // b for row T
    379       "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
    380       "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
    381       "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
    382       "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
    383       "addu.ph           $t2, $t2, $t4                   \n"  // |S1+3*S0|S2+3*S3|
    384       "addu.ph           $t6, $t6, $t5                   \n"  // |T1+3*T0|T2+3*T3|
    385       "sll               $t5, $t0, 1                     \n"
    386       "add               $t0, $t5, $t0                   \n"  // 3 * (b for row S)
    387       "shra_r.ph         $t2, $t2, 2                     \n"  // |a|c| for row S
    388       "shra_r.ph         $t6, $t6, 2                     \n"  // |a|c| for row T
    389       "shll.ph           $t4, $t2, 1                     \n"
    390       "addq.ph           $t4, $t4, $t2                   \n"  // 3 * |a|c| of row S
    391       "addu              $t0, $t0, $t1                   \n"  // 3*b(S) + b(T)
    392       "addiu             %[src_ptr], %[src_ptr], 4       \n"
    393       "shra_r.w          $t0, $t0, 2                     \n"  // blended b
    394       "addu.ph           $t6, $t6, $t4                   \n"
    395       "shra_r.ph         $t6, $t6, 2                     \n"  // blended |a|c|
    396       "srl               $t1, $t6, 16                    \n"  // blended a
    397       "addiu             %[dst_width], %[dst_width], -3  \n"
    398       "sb                $t1, 0(%[d])                    \n"
    399       "sb                $t0, 1(%[d])                    \n"
    400       "sb                $t6, 2(%[d])                    \n"
    401       "bgtz              %[dst_width], 1b                \n"
    402       " addiu            %[d], %[d], 3                   \n"  // delay slot
    403     "3:                                                  \n"  // not referenced by any branch in this block
    404       ".set pop                                          \n"
    405       : [src_ptr] "+r" (src_ptr),
    406         [src_stride] "+r" (src_stride),
    407         [d] "+r" (d),
    408         [dst_width] "+r" (dst_width)
    409       :
    410       : "t0", "t1", "t2", "t3",
    411         "t4", "t5", "t6"
    412   );
    413 }
    414 
    415 void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    416                                 uint8* d, int dst_width) {
          // Filters two rows down to 3/4 width. Each pass reads 4 bytes of row S
          // (at src_ptr) and 4 bytes of row T (at src_ptr + src_stride, via lwx)
          // and stores 3 output bytes. Per row the 4 pixels p0..p3 are reduced to
          //   a = (3*p0 + p1 + 2) >> 2
          //   b = (p1 + p2 + 1) >> 1
          //   c = (p2 + 3*p3 + 2) >> 2
          // and the two rows are then averaged as (S + T + 1) >> 1.
          // The loop always runs at least once and repeats while dst_width, reduced
          // by 3 per pass, is still positive.
          // NOTE(review): presumably dst_width is a positive multiple of 3 and the
          // source rows are 4-byte aligned -- confirm against the callers.
    417   __asm__ __volatile__ (
    418       ".set push                                           \n"
    419       ".set noreorder                                      \n"
    420       "repl.ph           $t2, 3                            \n"  // 0x00030003
    421 
    422     "1:                                                    \n"
    423       "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
    424       "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
    425       "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
    426       "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
    427       "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
    428       "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
    429       "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
    430       "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
    431       "raddu.w.qb        $t0, $t0                          \n"  // S2+S1
    432       "raddu.w.qb        $t1, $t1                          \n"  // T2+T1
    433       "shra_r.w          $t0, $t0, 1                       \n"  // b for row S
    434       "shra_r.w          $t1, $t1, 1                       \n"  // b for row T
    435       "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
    436       "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
    437       "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
    438       "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
    439       "addu.ph           $t4, $t4, $t3                     \n"  // |S1+3*S0|S2+3*S3|
    440       "addu.ph           $t6, $t6, $t5                     \n"  // |T1+3*T0|T2+3*T3|
    441       "shra_r.ph         $t6, $t6, 2                       \n"  // |a|c| for row T
    442       "shra_r.ph         $t4, $t4, 2                       \n"  // |a|c| for row S
    443       "addu.ph           $t6, $t6, $t4                     \n"
    444       "addiu             %[src_ptr], %[src_ptr], 4         \n"
    445       "shra_r.ph         $t6, $t6, 1                       \n"  // averaged |a|c|
    446       "addu              $t0, $t0, $t1                     \n"  // b(S) + b(T)
    447       "addiu             %[dst_width], %[dst_width], -3    \n"
    448       "shra_r.w          $t0, $t0, 1                       \n"  // averaged b
    449       "srl               $t1, $t6, 16                      \n"  // averaged a
    450       "sb                $t1, 0(%[d])                      \n"
    451       "sb                $t0, 1(%[d])                      \n"
    452       "sb                $t6, 2(%[d])                      \n"
    453       "bgtz              %[dst_width], 1b                  \n"
    454       " addiu            %[d], %[d], 3                     \n"  // delay slot
    455     "3:                                                    \n"  // not referenced by any branch in this block
    456       ".set pop                                            \n"
    457       : [src_ptr] "+r" (src_ptr),
    458         [src_stride] "+r" (src_stride),
    459         [d] "+r" (d),
    460         [dst_width] "+r" (dst_width)
    461       :
    462       : "t0", "t1", "t2", "t3",
    463         "t4", "t5", "t6"
    464   );
    465 }
    466 
    467 void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    468                           uint8* dst, int dst_width) {
          // Point-samples one row down to 3/8 width. Each pass reads 32 source
          // bytes and stores the 12 bytes at offsets 0, 3, 6, 8, 11, 14, 16, 19,
          // 22, 24, 27 and 30. src_stride is not referenced.
          // The loop always runs at least once and repeats while 12 or more output
          // pixels remain; a remainder smaller than 12 is not written here.
          // NOTE(review): lw/sw presumably need src_ptr and dst to be 4-byte
          // aligned -- confirm that the callers guarantee this.
    469   __asm__ __volatile__ (
    470       ".set push                                     \n"
    471       ".set noreorder                                \n"
    472 
    473     "1:                                              \n"
    474       "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
    475       "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
    476       "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
    477       "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
    478       "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
    479       "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
    480       "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
    481       "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
    482       "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
    483       "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
    484       "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
    485       "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
    486       "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
    487       "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
    488       "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
    489       "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
    490       "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
    491       "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
    492       "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
    493       "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
    494       "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
    495       "addiu      %[src_ptr], %[src_ptr], 32         \n"
    496       "addiu      %[dst_width], %[dst_width], -12    \n"  // pixels left after this pass
    497       "addiu      $t8,%[dst_width], -12              \n"  // t8 >= 0 when another full pass fits
    498       "sw         $t1, 0(%[dst])                     \n"
    499       "sw         $t4, 4(%[dst])                     \n"
    500       "sw         $t6, 8(%[dst])                     \n"
    501       "bgez       $t8, 1b                            \n"
    502       " addiu     %[dst], %[dst], 12                 \n"  // delay slot
    503       ".set pop                                      \n"
    504       : [src_ptr] "+r" (src_ptr),
    505         [dst] "+r" (dst),
    506         [dst_width] "+r" (dst_width)
    507       :
    508       : "t0", "t1", "t2", "t3", "t4",
    509         "t5", "t6", "t7", "t8"
    510   );
    511 }
    512 
    // Scales two adjacent source rows down to 3/8 width with a box filter.
    // S is the row at src_ptr, T the row at src_ptr + src_stride. Each loop pass
    // consumes 8 bytes from both rows and writes 3 bytes to dst_ptr:
    //   dst[0] = ((S0+S1+S2 + T0+T1+T2) * 0x2AAA) >> 16
    //   dst[1] = ((S3+S4+S5 + T3+T4+T5) * 0x2AAA) >> 16
    //   dst[2] =  (S6+S7 + T6+T7) >> 2
    // dst_width is decremented by 3 per pass and tested after the body, so the
    // body always runs at least once and repeats while dst_width stays > 0.
    // NOTE(review): rows are read with lw (word loads); presumably callers
    // guarantee word-aligned rows - confirm against the call sites.
    // Clobbers: "hi"/"lo" are listed because mul leaves HI/LO unpredictable on
    // pre-R6 MIPS32; "memory" is listed because the asm reads and writes through
    // pointers that are passed as register operands only.
    513 void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    514                                 uint8* dst_ptr, int dst_width) {
    515   intptr_t stride = src_stride;
    516   const uint8* t = src_ptr + stride;  // second source row (T)
    517   const int c = 0x2AAA;  // 65536 / 6 truncated; with >> 16 this divides by 6
    518 
    519   __asm__ __volatile__ (
    520       ".set push                                         \n"
    521       ".set noreorder                                    \n"
    522 
    523     "1:                                                  \n"
    524       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
    525       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
    526       "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
    527       "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
    528       "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
    529       "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
    530       "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
    531       "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
    532       "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
    533       "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
    534       "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
    535       "srl             $t4, $t4, 2                       \n"  // t4 / 4
    536       "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
    537       "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
    538       "addu            $t6, $t5, $t6                     \n"  // S3+S4+S5+T3+T4+T5
    539       "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
    540       "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
    541       "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
    542       "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
    543       "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
    544       "addu            $t0, $t0, $t2                     \n"  // S0+S1+S2+T0+T1+T2
    545       "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
    546       "addiu           %[src_ptr], %[src_ptr], 8         \n"
    547       "addiu           %[t], %[t], 8                     \n"
    548       "addiu           %[dst_width], %[dst_width], -3    \n"
    549       "addiu           %[dst_ptr], %[dst_ptr], 3         \n"  // stores use -N
    550       "srl             $t6, $t6, 16                      \n"  // dst[1] value
    551       "srl             $t0, $t0, 16                      \n"  // dst[0] value
    552       "sb              $t4, -1(%[dst_ptr])               \n"  // dst[2]
    553       "sb              $t6, -2(%[dst_ptr])               \n"  // dst[1]
    554       "bgtz            %[dst_width], 1b                  \n"
    555       " sb             $t0, -3(%[dst_ptr])               \n"  // dst[0], delay slot
    556       ".set pop                                          \n"
    557       : [src_ptr] "+r" (src_ptr),
    558         [dst_ptr] "+r" (dst_ptr),
    559         [t] "+r" (t),
    560         [dst_width] "+r" (dst_width)
    561       : [c] "r" (c)
    562       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "hi", "lo", "memory"
    563   );
    564 }
    565 
    // Scales three adjacent source rows down to 3/8 width with a box filter.
    // S is the row at src_ptr, T the row at src_ptr + src_stride and R the row at
    // src_ptr + 2 * src_stride. Each loop pass consumes 8 bytes from every row
    // and writes 3 bytes to dst_ptr:
    //   dst[0] = ((S0+S1+S2 + T0+T1+T2 + R0+R1+R2) * 0x1C71) >> 16
    //   dst[1] = ((S3+S4+S5 + T3+T4+T5 + R3+R4+R5) * 0x1C71) >> 16
    //   dst[2] = ((S6+S7 + T6+T7 + R6+R7) * 0x2AAA) >> 16
    // dst_width is decremented by 3 per pass and tested after the body, so the
    // body always runs at least once and repeats while dst_width stays > 0.
    // NOTE(review): rows are read with lw (word loads); presumably callers
    // guarantee word-aligned rows - confirm against the call sites.
    // Clobbers: "hi"/"lo" are listed because mul leaves HI/LO unpredictable on
    // pre-R6 MIPS32; "memory" is listed because the asm reads and writes through
    // pointers that are passed as register operands only.
    566 void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
    567                                 ptrdiff_t src_stride,
    568                                 uint8* dst_ptr, int dst_width) {
    569   intptr_t stride = src_stride;
    570   const uint8* s1 = src_ptr + stride;  // second source row (T)
    571   stride += stride;
    572   const uint8* s2 = src_ptr + stride;  // third source row (R)
    573   const int c1 = 0x1C71;  // 65536 / 9 truncated; with >> 16 divides by 9
    574   const int c2 = 0x2AAA;  // 65536 / 6 truncated; with >> 16 divides by 6
    575 
    576   __asm__ __volatile__ (
    577       ".set push                                         \n"
    578       ".set noreorder                                    \n"
    579 
    580     "1:                                                  \n"
    581       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
    582       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
    583       "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
    584       "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
    585       "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
    586       "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
    587       "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
    588       "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
    589       "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
    590       "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
    591       "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
    592       "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
    593       "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
    594       "addu            $t7, $t7, $t8                     \n"  // + R5+R4
    595       "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
    596       "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
    597       "addu            $t6, $t6, $t8                     \n"  // S6+S7+T6+T7+R6+R7
    598       "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
    599       "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
    600       "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
    601       "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
    602       "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
    603       "addu            $t7, $t7, $t8                     \n"  // S3..S5+T3..T5+R3..R5
    604       "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
    605       "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
    606       "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
    607       "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
    608       "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
    609       "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
    610       "raddu.w.qb      $t4, $t4                          \n"  // R2+R1+R0+0
    611       "addu            $t0, $t0, $t2                     \n"
    612       "addu            $t0, $t0, $t4                     \n"  // S0..S2+T0..T2+R0..R2
    613       "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
    614       "addiu           %[src_ptr], %[src_ptr], 8         \n"
    615       "addiu           %[s1], %[s1], 8                   \n"
    616       "addiu           %[s2], %[s2], 8                   \n"
    617       "addiu           %[dst_width], %[dst_width], -3    \n"
    618       "addiu           %[dst_ptr], %[dst_ptr], 3         \n"  // stores use -N
    619       "srl             $t6, $t6, 16                      \n"  // dst[2] value
    620       "srl             $t7, $t7, 16                      \n"  // dst[1] value
    621       "srl             $t0, $t0, 16                      \n"  // dst[0] value
    622       "sb              $t6, -1(%[dst_ptr])               \n"  // dst[2]
    623       "sb              $t7, -2(%[dst_ptr])               \n"  // dst[1]
    624       "bgtz            %[dst_width], 1b                  \n"
    625       " sb             $t0, -3(%[dst_ptr])               \n"  // dst[0], delay slot
    626       ".set pop                                          \n"
    627       : [src_ptr] "+r" (src_ptr),
    628         [dst_ptr] "+r" (dst_ptr),
    629         [s1] "+r" (s1),
    630         [s2] "+r" (s2),
    631         [dst_width] "+r" (dst_width)
    632       : [c1] "r" (c1), [c2] "r" (c2)
    633       : "t0", "t1", "t2", "t3", "t4",
    634         "t5", "t6", "t7", "t8", "hi", "lo", "memory"
    635   );
    636 }
    637 
    638 #endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
    639 
    640 #ifdef __cplusplus
    641 }  // extern "C"
    642 }  // namespace libyuv
    643 #endif
    644 
    645