Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/basic_types.h"
     12 #include "libyuv/row.h"
     13 
     14 #ifdef __cplusplus
     15 namespace libyuv {
     16 extern "C" {
     17 #endif
     18 
     19 // This module is for GCC MIPS DSPR2
     20 #if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
     21     (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
     22 
     23 void ScaleRowDown2_DSPR2(const uint8* src_ptr,
     24                          ptrdiff_t src_stride,
     25                          uint8* dst,
     26                          int dst_width) {
     27   __asm__ __volatile__(
     28       ".set push                                     \n"
     29       ".set noreorder                                \n"
     30 
     31       "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
     32       "beqz           $t9, 2f                        \n"
     33       " nop                                          \n"
     34 
     35       "1:                                            \n"
     36       "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
     37       "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
     38       "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
     39       "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
     40       "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
     41       "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
     42       "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
     43       "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
     44       // TODO(fbarchard): Use odd pixels instead of even.
     45       "precrq.qb.ph   $t8, $t1, $t0                  \n"  // |7|5|3|1|
     46       "precrq.qb.ph   $t0, $t3, $t2                  \n"  // |15|13|11|9|
     47       "precrq.qb.ph   $t1, $t5, $t4                  \n"  // |23|21|19|17|
     48       "precrq.qb.ph   $t2, $t7, $t6                  \n"  // |31|29|27|25|
     49       "addiu          %[src_ptr], %[src_ptr], 32     \n"
     50       "addiu          $t9, $t9, -1                   \n"
     51       "sw             $t8, 0(%[dst])                 \n"
     52       "sw             $t0, 4(%[dst])                 \n"
     53       "sw             $t1, 8(%[dst])                 \n"
     54       "sw             $t2, 12(%[dst])                \n"
     55       "bgtz           $t9, 1b                        \n"
     56       " addiu         %[dst], %[dst], 16             \n"
     57 
     58       "2:                                            \n"
     59       "andi           $t9, %[dst_width], 0xf         \n"  // residue
     60       "beqz           $t9, 3f                        \n"
     61       " nop                                          \n"
     62 
     63       "21:                                           \n"
     64       "lbu            $t0, 1(%[src_ptr])             \n"
     65       "addiu          %[src_ptr], %[src_ptr], 2      \n"
     66       "addiu          $t9, $t9, -1                   \n"
     67       "sb             $t0, 0(%[dst])                 \n"
     68       "bgtz           $t9, 21b                       \n"
     69       " addiu         %[dst], %[dst], 1              \n"
     70 
     71       "3:                                            \n"
     72       ".set pop                                      \n"
     73       : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
     74       : [dst_width] "r"(dst_width)
     75       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
     76 }
     77 
     78 void ScaleRowDown2Box_DSPR2(const uint8* src_ptr,
     79                             ptrdiff_t src_stride,
     80                             uint8* dst,
     81                             int dst_width) {
     82   const uint8* t = src_ptr + src_stride;
     83 
     84   __asm__ __volatile__(
     85       ".set push                                    \n"
     86       ".set noreorder                               \n"
     87 
     88       "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
     89       "bltz           $t9, 2f                       \n"
     90       " nop                                         \n"
     91 
     92       "1:                                           \n"
     93       "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
     94       "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
     95       "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
     96       "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
     97       "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
     98       "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
     99       "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
    100       "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
    101       "addiu          $t9, $t9, -1                  \n"
    102       "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
    103       "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
    104       "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
    105       "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
    106       "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
    107       "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
    108       "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
    109       "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
    110       "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
    111       "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
    112       "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
    113       "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
    114       "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
    115       "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
    116       "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
    117       "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
    118       "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
    119       "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
    120       "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
    121       "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
    122       "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
    123       "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
    124       "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
    125       "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
    126       "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
    127       "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
    128       "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
    129       "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
    130       "addiu          %[src_ptr], %[src_ptr], 16    \n"
    131       "addiu          %[t], %[t], 16                \n"
    132       "sb             $t0, 0(%[dst])                \n"
    133       "sb             $t4, 1(%[dst])                \n"
    134       "sb             $t1, 2(%[dst])                \n"
    135       "sb             $t5, 3(%[dst])                \n"
    136       "sb             $t2, 4(%[dst])                \n"
    137       "sb             $t6, 5(%[dst])                \n"
    138       "sb             $t3, 6(%[dst])                \n"
    139       "sb             $t7, 7(%[dst])                \n"
    140       "bgtz           $t9, 1b                       \n"
    141       " addiu         %[dst], %[dst], 8             \n"
    142 
    143       "2:                                           \n"
    144       "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
    145       "beqz           $t9, 3f                       \n"
    146       " nop                                         \n"
    147 
    148       "21:                                          \n"
    149       "lwr            $t1, 0(%[src_ptr])            \n"
    150       "lwl            $t1, 3(%[src_ptr])            \n"
    151       "lwr            $t2, 0(%[t])                  \n"
    152       "lwl            $t2, 3(%[t])                  \n"
    153       "srl            $t8, $t1, 16                  \n"
    154       "ins            $t1, $t2, 16, 16              \n"
    155       "ins            $t2, $t8, 0, 16               \n"
    156       "raddu.w.qb     $t1, $t1                      \n"
    157       "raddu.w.qb     $t2, $t2                      \n"
    158       "shra_r.w       $t1, $t1, 2                   \n"
    159       "shra_r.w       $t2, $t2, 2                   \n"
    160       "sb             $t1, 0(%[dst])                \n"
    161       "sb             $t2, 1(%[dst])                \n"
    162       "addiu          %[src_ptr], %[src_ptr], 4     \n"
    163       "addiu          $t9, $t9, -2                  \n"
    164       "addiu          %[t], %[t], 4                 \n"
    165       "bgtz           $t9, 21b                      \n"
    166       " addiu         %[dst], %[dst], 2             \n"
    167 
    168       "3:                                           \n"
    169       ".set pop                                     \n"
    170 
    171       : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [t] "+r"(t)
    172       : [dst_width] "r"(dst_width)
    173       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
    174 }
    175 
    176 void ScaleRowDown4_DSPR2(const uint8* src_ptr,
    177                          ptrdiff_t src_stride,
    178                          uint8* dst,
    179                          int dst_width) {
    180   __asm__ __volatile__(
    181       ".set push                                    \n"
    182       ".set noreorder                               \n"
    183 
    184       "srl            $t9, %[dst_width], 3          \n"
    185       "beqz           $t9, 2f                       \n"
    186       " nop                                         \n"
    187 
    188       "1:                                           \n"
    189       "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
    190       "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
    191       "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
    192       "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
    193       "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
    194       "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
    195       "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
    196       "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
    197       "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
    198       "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
    199       "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
    200       "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
    201       "precrq.qb.ph   $t1, $t2, $t1                 \n"  // |14|10|6|2|
    202       "precrq.qb.ph   $t5, $t6, $t5                 \n"  // |30|26|22|18|
    203       "addiu          %[src_ptr], %[src_ptr], 32    \n"
    204       "addiu          $t9, $t9, -1                  \n"
    205       "sw             $t1, 0(%[dst])                \n"
    206       "sw             $t5, 4(%[dst])                \n"
    207       "bgtz           $t9, 1b                       \n"
    208       " addiu         %[dst], %[dst], 8             \n"
    209 
    210       "2:                                           \n"
    211       "andi           $t9, %[dst_width], 7          \n"  // residue
    212       "beqz           $t9, 3f                       \n"
    213       " nop                                         \n"
    214 
    215       "21:                                          \n"
    216       "lbu            $t1, 2(%[src_ptr])            \n"
    217       "addiu          %[src_ptr], %[src_ptr], 4     \n"
    218       "addiu          $t9, $t9, -1                  \n"
    219       "sb             $t1, 0(%[dst])                \n"
    220       "bgtz           $t9, 21b                      \n"
    221       " addiu         %[dst], %[dst], 1             \n"
    222 
    223       "3:                                           \n"
    224       ".set pop                                     \n"
    225       : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst)
    226       : [dst_width] "r"(dst_width)
    227       : "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
    228 }
    229 
    230 void ScaleRowDown4Box_DSPR2(const uint8* src_ptr,
    231                             ptrdiff_t src_stride,
    232                             uint8* dst,
    233                             int dst_width) {
    234   intptr_t stride = src_stride;
    235   const uint8* s1 = src_ptr + stride;
    236   const uint8* s2 = s1 + stride;
    237   const uint8* s3 = s2 + stride;
    238 
    239   __asm__ __volatile__(
    240       ".set push                                  \n"
    241       ".set noreorder                             \n"
    242 
    243       "srl           $t9, %[dst_width], 1         \n"
    244       "andi          $t8, %[dst_width], 1         \n"
    245 
    246       "1:                                         \n"
    247       "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
    248       "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
    249       "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
    250       "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
    251       "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
    252       "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
    253       "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
    254       "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
    255       "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
    256       "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
    257       "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
    258       "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
    259       "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
    260       "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
    261       "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
    262       "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
    263       "add           $t0, $t0, $t1                \n"
    264       "add           $t1, $t2, $t3                \n"
    265       "add           $t0, $t0, $t1                \n"
    266       "add           $t4, $t4, $t5                \n"
    267       "add           $t6, $t6, $t7                \n"
    268       "add           $t4, $t4, $t6                \n"
    269       "shra_r.w      $t0, $t0, 4                  \n"
    270       "shra_r.w      $t4, $t4, 4                  \n"
    271       "sb            $t0, 0(%[dst])               \n"
    272       "sb            $t4, 1(%[dst])               \n"
    273       "addiu         %[src_ptr], %[src_ptr], 8    \n"
    274       "addiu         %[s1], %[s1], 8              \n"
    275       "addiu         %[s2], %[s2], 8              \n"
    276       "addiu         %[s3], %[s3], 8              \n"
    277       "addiu         $t9, $t9, -1                 \n"
    278       "bgtz          $t9, 1b                      \n"
    279       " addiu        %[dst], %[dst], 2            \n"
    280       "beqz          $t8, 2f                      \n"
    281       " nop                                       \n"
    282 
    283       "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
    284       "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
    285       "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
    286       "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
    287       "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
    288       "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
    289       "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
    290       "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
    291       "add           $t0, $t0, $t1                \n"
    292       "add           $t1, $t2, $t3                \n"
    293       "add           $t0, $t0, $t1                \n"
    294       "shra_r.w      $t0, $t0, 4                  \n"
    295       "sb            $t0, 0(%[dst])               \n"
    296 
    297       "2:                                         \n"
    298       ".set pop                                   \n"
    299 
    300       : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [s1] "+r"(s1), [s2] "+r"(s2),
    301         [s3] "+r"(s3)
    302       : [dst_width] "r"(dst_width)
    303       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
    304 }
    305 
    306 void ScaleRowDown34_DSPR2(const uint8* src_ptr,
    307                           ptrdiff_t src_stride,
    308                           uint8* dst,
    309                           int dst_width) {
    310   __asm__ __volatile__(
    311       ".set push                                          \n"
    312       ".set noreorder                                     \n"
    313       "1:                                                 \n"
    314       "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
    315       "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
    316       "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
    317       "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
    318       "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
    319       "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
    320       "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
    321       "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
    322       "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
    323       "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|30|
    324       "addiu           %[dst_width], %[dst_width], -24    \n"
    325       "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
    326       "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
    327       "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
    328       "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
    329       "addiu           %[src_ptr], %[src_ptr], 32         \n"
    330       "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
    331       "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
    332       "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
    333       "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
    334       "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
    335       "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
    336       "sw              $t1, 0(%[dst])                     \n"
    337       "sw              $t0, 4(%[dst])                     \n"
    338       "sw              $t3, 8(%[dst])                     \n"
    339       "sw              $t5, 12(%[dst])                    \n"
    340       "sw              $t9, 16(%[dst])                    \n"
    341       "sw              $t7, 20(%[dst])                    \n"
    342       "bnez            %[dst_width], 1b                   \n"
    343       " addiu          %[dst], %[dst], 24                 \n"
    344       ".set pop                                           \n"
    345       : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
    346       :
    347       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
    348 }
    349 
    350 void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr,
    351                                 ptrdiff_t src_stride,
    352                                 uint8* d,
    353                                 int dst_width) {
    354   __asm__ __volatile__(
    355       ".set push                                         \n"
    356       ".set noreorder                                    \n"
    357       "repl.ph           $t3, 3                          \n"  // 0x00030003
    358 
    359       "1:                                                \n"
    360       "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
    361       "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
    362       "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
    363       "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
    364       "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
    365       "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
    366       "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
    367       "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
    368       "raddu.w.qb        $t0, $t0                        \n"
    369       "raddu.w.qb        $t1, $t1                        \n"
    370       "shra_r.w          $t0, $t0, 1                     \n"
    371       "shra_r.w          $t1, $t1, 1                     \n"
    372       "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
    373       "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
    374       "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
    375       "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
    376       "addu.ph           $t2, $t2, $t4                   \n"
    377       "addu.ph           $t6, $t6, $t5                   \n"
    378       "sll               $t5, $t0, 1                     \n"
    379       "add               $t0, $t5, $t0                   \n"
    380       "shra_r.ph         $t2, $t2, 2                     \n"
    381       "shra_r.ph         $t6, $t6, 2                     \n"
    382       "shll.ph           $t4, $t2, 1                     \n"
    383       "addq.ph           $t4, $t4, $t2                   \n"
    384       "addu              $t0, $t0, $t1                   \n"
    385       "addiu             %[src_ptr], %[src_ptr], 4       \n"
    386       "shra_r.w          $t0, $t0, 2                     \n"
    387       "addu.ph           $t6, $t6, $t4                   \n"
    388       "shra_r.ph         $t6, $t6, 2                     \n"
    389       "srl               $t1, $t6, 16                    \n"
    390       "addiu             %[dst_width], %[dst_width], -3  \n"
    391       "sb                $t1, 0(%[d])                    \n"
    392       "sb                $t0, 1(%[d])                    \n"
    393       "sb                $t6, 2(%[d])                    \n"
    394       "bgtz              %[dst_width], 1b                \n"
    395       " addiu            %[d], %[d], 3                   \n"
    396       "3:                                                \n"
    397       ".set pop                                          \n"
    398       : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
    399         [dst_width] "+r"(dst_width)
    400       :
    401       : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
    402 }
    403 
    404 void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr,
    405                                 ptrdiff_t src_stride,
    406                                 uint8* d,
    407                                 int dst_width) {
    408   __asm__ __volatile__(
    409       ".set push                                           \n"
    410       ".set noreorder                                      \n"
    411       "repl.ph           $t2, 3                            \n"  // 0x00030003
    412 
    413       "1:                                                  \n"
    414       "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
    415       "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
    416       "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
    417       "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
    418       "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
    419       "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
    420       "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
    421       "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
    422       "raddu.w.qb        $t0, $t0                          \n"
    423       "raddu.w.qb        $t1, $t1                          \n"
    424       "shra_r.w          $t0, $t0, 1                       \n"
    425       "shra_r.w          $t1, $t1, 1                       \n"
    426       "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
    427       "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
    428       "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
    429       "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
    430       "addu.ph           $t4, $t4, $t3                     \n"
    431       "addu.ph           $t6, $t6, $t5                     \n"
    432       "shra_r.ph         $t6, $t6, 2                       \n"
    433       "shra_r.ph         $t4, $t4, 2                       \n"
    434       "addu.ph           $t6, $t6, $t4                     \n"
    435       "addiu             %[src_ptr], %[src_ptr], 4         \n"
    436       "shra_r.ph         $t6, $t6, 1                       \n"
    437       "addu              $t0, $t0, $t1                     \n"
    438       "addiu             %[dst_width], %[dst_width], -3    \n"
    439       "shra_r.w          $t0, $t0, 1                       \n"
    440       "srl               $t1, $t6, 16                      \n"
    441       "sb                $t1, 0(%[d])                      \n"
    442       "sb                $t0, 1(%[d])                      \n"
    443       "sb                $t6, 2(%[d])                      \n"
    444       "bgtz              %[dst_width], 1b                  \n"
    445       " addiu            %[d], %[d], 3                     \n"
    446       "3:                                                  \n"
    447       ".set pop                                            \n"
    448       : [src_ptr] "+r"(src_ptr), [src_stride] "+r"(src_stride), [d] "+r"(d),
    449         [dst_width] "+r"(dst_width)
    450       :
    451       : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
    452 }
    453 
    454 void ScaleRowDown38_DSPR2(const uint8* src_ptr,
    455                           ptrdiff_t src_stride,
    456                           uint8* dst,
    457                           int dst_width) {
    458   __asm__ __volatile__(
    459       ".set push                                     \n"
    460       ".set noreorder                                \n"
    461 
    462       "1:                                            \n"
    463       "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
    464       "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
    465       "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
    466       "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
    467       "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
    468       "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
    469       "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
    470       "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
    471       "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
    472       "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
    473       "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
    474       "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
    475       "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
    476       "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
    477       "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
    478       "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
    479       "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
    480       "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
    481       "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
    482       "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
    483       "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
    484       "addiu      %[src_ptr], %[src_ptr], 32         \n"
    485       "addiu      %[dst_width], %[dst_width], -12    \n"
    486       "addiu      $t8,%[dst_width], -12              \n"
    487       "sw         $t1, 0(%[dst])                     \n"
    488       "sw         $t4, 4(%[dst])                     \n"
    489       "sw         $t6, 8(%[dst])                     \n"
    490       "bgez       $t8, 1b                            \n"
    491       " addiu     %[dst], %[dst], 12                 \n"
    492       ".set pop                                      \n"
    493       : [src_ptr] "+r"(src_ptr), [dst] "+r"(dst), [dst_width] "+r"(dst_width)
    494       :
    495       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
    496 }
    497 
    498 void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr,
    499                                 ptrdiff_t src_stride,
    500                                 uint8* dst_ptr,
    501                                 int dst_width) {
    502   intptr_t stride = src_stride;
    503   const uint8* t = src_ptr + stride;
    504   const int c = 0x2AAA;
    505 
    506   __asm__ __volatile__(
    507       ".set push                                         \n"
    508       ".set noreorder                                    \n"
    509 
    510       "1:                                                \n"
    511       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
    512       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
    513       "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
    514       "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
    515       "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
    516       "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
    517       "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
    518       "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
    519       "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
    520       "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
    521       "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
    522       "srl             $t4, $t4, 2                       \n"  // t4 / 4
    523       "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
    524       "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
    525       "addu            $t6, $t5, $t6                     \n"
    526       "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
    527       "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
    528       "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
    529       "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
    530       "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
    531       "addu            $t0, $t0, $t2                     \n"
    532       "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
    533       "addiu           %[src_ptr], %[src_ptr], 8         \n"
    534       "addiu           %[t], %[t], 8                     \n"
    535       "addiu           %[dst_width], %[dst_width], -3    \n"
    536       "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
    537       "srl             $t6, $t6, 16                      \n"
    538       "srl             $t0, $t0, 16                      \n"
    539       "sb              $t4, -1(%[dst_ptr])               \n"
    540       "sb              $t6, -2(%[dst_ptr])               \n"
    541       "bgtz            %[dst_width], 1b                  \n"
    542       " sb             $t0, -3(%[dst_ptr])               \n"
    543       ".set pop                                          \n"
    544       : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [t] "+r"(t),
    545         [dst_width] "+r"(dst_width)
    546       : [c] "r"(c)
    547       : "t0", "t1", "t2", "t3", "t4", "t5", "t6");
    548 }
    549 
    550 void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
    551                                 ptrdiff_t src_stride,
    552                                 uint8* dst_ptr,
    553                                 int dst_width) {
    554   intptr_t stride = src_stride;
    555   const uint8* s1 = src_ptr + stride;
    556   stride += stride;
    557   const uint8* s2 = src_ptr + stride;
    558   const int c1 = 0x1C71;
    559   const int c2 = 0x2AAA;
    560 
    561   __asm__ __volatile__(
    562       ".set push                                         \n"
    563       ".set noreorder                                    \n"
    564 
    565       "1:                                                \n"
    566       "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
    567       "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
    568       "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
    569       "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
    570       "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
    571       "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
    572       "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
    573       "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
    574       "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
    575       "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
    576       "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
    577       "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
    578       "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
    579       "addu            $t7, $t7, $t8                     \n"
    580       "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
    581       "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
    582       "addu            $t6, $t6, $t8                     \n"
    583       "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
    584       "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
    585       "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
    586       "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
    587       "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
    588       "addu            $t7, $t7, $t8                     \n"
    589       "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
    590       "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
    591       "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
    592       "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
    593       "raddu.w.qb      $t0, $t0                          \n"
    594       "raddu.w.qb      $t2, $t2                          \n"
    595       "raddu.w.qb      $t4, $t4                          \n"
    596       "addu            $t0, $t0, $t2                     \n"
    597       "addu            $t0, $t0, $t4                     \n"
    598       "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
    599       "addiu           %[src_ptr], %[src_ptr], 8         \n"
    600       "addiu           %[s1], %[s1], 8                   \n"
    601       "addiu           %[s2], %[s2], 8                   \n"
    602       "addiu           %[dst_width], %[dst_width], -3    \n"
    603       "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
    604       "srl             $t6, $t6, 16                      \n"
    605       "srl             $t7, $t7, 16                      \n"
    606       "srl             $t0, $t0, 16                      \n"
    607       "sb              $t6, -1(%[dst_ptr])               \n"
    608       "sb              $t7, -2(%[dst_ptr])               \n"
    609       "bgtz            %[dst_width], 1b                  \n"
    610       " sb             $t0, -3(%[dst_ptr])               \n"
    611       ".set pop                                          \n"
    612       : [src_ptr] "+r"(src_ptr), [dst_ptr] "+r"(dst_ptr), [s1] "+r"(s1),
    613         [s2] "+r"(s2), [dst_width] "+r"(dst_width)
    614       : [c1] "r"(c1), [c2] "r"(c2)
    615       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
    616 }
    617 
    618 void ScaleAddRow_DSPR2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
    619   int x;
    620   for (x = 0; x < ((src_width - 1)); x += 8) {
    621     uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4;
    622     uint32 tmp_t5, tmp_t6, tmp_t7, tmp_t8;
    623     __asm__ __volatile__(
    624         ".set push                                                \n"
    625         ".set noreorder                                           \n"
    626         "lw                %[tmp_t5],   0(%[src_ptr])             \n"
    627         "lw                %[tmp_t6],   4(%[src_ptr])             \n"
    628         "lw                %[tmp_t1],   0(%[dst_ptr])             \n"
    629         "lw                %[tmp_t2],   4(%[dst_ptr])             \n"
    630         "lw                %[tmp_t3],   8(%[dst_ptr])             \n"
    631         "lw                %[tmp_t4],   12(%[dst_ptr])            \n"
    632         "preceu.ph.qbr     %[tmp_t7],   %[tmp_t5]                 \n"
    633         "preceu.ph.qbl     %[tmp_t8],   %[tmp_t5]                 \n"
    634         "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t7]  \n"
    635         "addu.ph           %[tmp_t2],   %[tmp_t2],     %[tmp_t8]  \n"
    636         "preceu.ph.qbr     %[tmp_t7],   %[tmp_t6]                 \n"
    637         "preceu.ph.qbl     %[tmp_t8],   %[tmp_t6]                 \n"
    638         "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t7]  \n"
    639         "addu.ph           %[tmp_t4],   %[tmp_t4],     %[tmp_t8]  \n"
    640         "sw                %[tmp_t1],   0(%[dst_ptr])             \n"
    641         "sw                %[tmp_t2],   4(%[dst_ptr])             \n"
    642         "sw                %[tmp_t3],   8(%[dst_ptr])             \n"
    643         "sw                %[tmp_t4],   12(%[dst_ptr])            \n"
    644         ".set pop                                                 \n"
    645         :
    646         [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2), [tmp_t3] "=&r"(tmp_t3),
    647         [tmp_t4] "=&r"(tmp_t4), [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
    648         [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [src_ptr] "+r"(src_ptr)
    649         : [dst_ptr] "r"(dst_ptr));
    650     src_ptr += 8;
    651     dst_ptr += 8;
    652   }
    653 
    654   if ((src_width)&7) {
    655     for (x = 0; x < ((src_width - 1) & 7); x += 1) {
    656       dst_ptr[0] += src_ptr[0];
    657       src_ptr += 1;
    658       dst_ptr += 1;
    659     }
    660   }
    661 }
    662 
    663 #endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
    664 
    665 #ifdef __cplusplus
    666 }  // extern "C"
    667 }  // namespace libyuv
    668 #endif
    669