/* (Viewer navigation header removed: "Home | History | Annotate | Download | only in source") */
      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/rotate_row.h"
     12 #include "libyuv/row.h"
     13 
     14 #include "libyuv/basic_types.h"
     15 
     16 #ifdef __cplusplus
     17 namespace libyuv {
     18 extern "C" {
     19 #endif
     20 
     21 #if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) && \
     22     (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32)
     23 
// Transposes an 8xN tile one byte-column at a time: reads one byte from each
// of 8 source rows (rows are src_stride apart), packs them into two 32-bit
// words with precr.qb.ph, and stores them as one 8-byte destination row
// (rows are dst_stride apart).  Branches once up front to either a
// word-aligned `sw` store path or an unaligned `swr`/`swl` path, depending
// on the alignment of dst and dst_stride.
// NOTE(review): unlike TransposeWx8_Fast_DSPR2 and TransposeUVWx8_DSPR2
// below, there is no width==0 early-out here -- presumably callers always
// pass width > 0; confirm before reusing in a new context.
void TransposeWx8_DSPR2(const uint8* src,
                        int src_stride,
                        uint8* dst,
                        int dst_stride,
                        int width) {
  __asm__ __volatile__(
      ".set push                                         \n"
      ".set noreorder                                    \n"
      "sll              $t2, %[src_stride], 0x1          \n"  // src_stride x 2
      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
      "addu             $t3, $t2, %[src_stride]          \n"  // src_stride x 3
      "addu             $t5, $t4, %[src_stride]          \n"  // src_stride x 5
      "addu             $t6, $t2, $t4                    \n"  // src_stride x 6
      "andi             $t0, %[dst], 0x3                 \n"
      "andi             $t1, %[dst_stride], 0x3          \n"
      "or               $t0, $t0, $t1                    \n"  // t0 != 0 => unaligned dst path
      "bnez             $t0, 11f                         \n"
      " subu            $t7, $t9, %[src_stride]          \n"  // delay slot: src_stride x 7
      // dst + dst_stride word aligned
      "1:                                                \n"
      "lbu              $t0, 0(%[src])                   \n"  // one byte from each of rows 0..3
      "lbux             $t1, %[src_stride](%[src])       \n"
      "lbux             $t8, $t2(%[src])                 \n"
      "lbux             $t9, $t3(%[src])                 \n"
      "sll              $t1, $t1, 16                     \n"
      "sll              $t9, $t9, 16                     \n"
      "or               $t0, $t0, $t1                    \n"  // t0 = |xx|r1|xx|r0|
      "or               $t8, $t8, $t9                    \n"  // t8 = |xx|r3|xx|r2|
      "precr.qb.ph      $s0, $t8, $t0                    \n"  // s0 = |r3|r2|r1|r0|
      "lbux             $t0, $t4(%[src])                 \n"  // one byte from each of rows 4..7
      "lbux             $t1, $t5(%[src])                 \n"
      "lbux             $t8, $t6(%[src])                 \n"
      "lbux             $t9, $t7(%[src])                 \n"
      "sll              $t1, $t1, 16                     \n"
      "sll              $t9, $t9, 16                     \n"
      "or               $t0, $t0, $t1                    \n"
      "or               $t8, $t8, $t9                    \n"
      "precr.qb.ph      $s1, $t8, $t0                    \n"  // s1 = |r7|r6|r5|r4|
      "sw               $s0, 0(%[dst])                   \n"
      "addiu            %[width], -1                     \n"
      "addiu            %[src], 1                        \n"  // next source column
      "sw               $s1, 4(%[dst])                   \n"
      "bnez             %[width], 1b                     \n"
      " addu            %[dst], %[dst], %[dst_stride]    \n"  // delay slot: next dst row
      "b                2f                               \n"
      // dst + dst_stride unaligned
      // NOTE(review): with .set noreorder the delay slot of the `b 2f`
      // above is the first `lbu` below -- it executes on the aligned exit
      // path too, but only loads $t0, which is dead at label 2.
      "11:                                               \n"
      "lbu              $t0, 0(%[src])                   \n"
      "lbux             $t1, %[src_stride](%[src])       \n"
      "lbux             $t8, $t2(%[src])                 \n"
      "lbux             $t9, $t3(%[src])                 \n"
      "sll              $t1, $t1, 16                     \n"
      "sll              $t9, $t9, 16                     \n"
      "or               $t0, $t0, $t1                    \n"
      "or               $t8, $t8, $t9                    \n"
      "precr.qb.ph      $s0, $t8, $t0                    \n"
      "lbux             $t0, $t4(%[src])                 \n"
      "lbux             $t1, $t5(%[src])                 \n"
      "lbux             $t8, $t6(%[src])                 \n"
      "lbux             $t9, $t7(%[src])                 \n"
      "sll              $t1, $t1, 16                     \n"
      "sll              $t9, $t9, 16                     \n"
      "or               $t0, $t0, $t1                    \n"
      "or               $t8, $t8, $t9                    \n"
      "precr.qb.ph      $s1, $t8, $t0                    \n"
      "swr              $s0, 0(%[dst])                   \n"  // unaligned word store (right/left pair)
      "swl              $s0, 3(%[dst])                   \n"
      "addiu            %[width], -1                     \n"
      "addiu            %[src], 1                        \n"
      "swr              $s1, 4(%[dst])                   \n"
      "swl              $s1, 7(%[dst])                   \n"
      "bnez             %[width], 11b                    \n"
      "addu             %[dst], %[dst], %[dst_stride]    \n"  // delay slot: next dst row
      "2:                                                \n"
      ".set pop                                          \n"
      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1");
}
    104 
// Faster variant of TransposeWx8: transposes an 8x4 sub-tile per iteration
// using 32-bit loads (lw/lwx) and three rounds of DSPr2 precr/precrq byte
// shuffles, writing four 8-byte destination rows per pass.  The loop
// counter is width >> 2, so width is assumed to be a multiple of 4 (any
// remainder columns are silently dropped -- presumably callers guarantee
// width % 4 == 0; confirm).  As in TransposeWx8_DSPR2, a word-aligned `sw`
// path and an unaligned `swr`/`swl` path are selected once up front.
// NOTE(review): $at (written as $AT) is used as the loop counter under
// ".set noat" but does not appear in the clobber list -- verify against
// upstream that this is intentional.
void TransposeWx8_Fast_DSPR2(const uint8* src,
                             int src_stride,
                             uint8* dst,
                             int dst_stride,
                             int width) {
  __asm__ __volatile__(
      ".set noat                                         \n"
      ".set push                                         \n"
      ".set noreorder                                    \n"
      "beqz             %[width], 2f                     \n"  // nothing to do
      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
      "addu             $t3, $t2, %[src_stride]          \n"  // src_stride x 3
      "addu             $t5, $t4, %[src_stride]          \n"  // src_stride x 5
      "addu             $t6, $t2, $t4                    \n"  // src_stride x 6

      "srl              $AT, %[width], 0x2               \n"  // loop count = width / 4
      "andi             $t0, %[dst], 0x3                 \n"
      "andi             $t1, %[dst_stride], 0x3          \n"
      "or               $t0, $t0, $t1                    \n"  // t0 != 0 => unaligned dst path
      "bnez             $t0, 11f                         \n"
      " subu            $t7, $t9, %[src_stride]          \n"  // delay slot: src_stride x 7
      // dst + dst_stride word aligned
      "1:                                                \n"
      "lw               $t0, 0(%[src])                   \n"  // 4 columns from rows 0..3
      "lwx              $t1, %[src_stride](%[src])       \n"
      "lwx              $t8, $t2(%[src])                 \n"
      "lwx              $t9, $t3(%[src])                 \n"

      // t0 = | 30 | 20 | 10 | 00 |
      // t1 = | 31 | 21 | 11 | 01 |
      // t8 = | 32 | 22 | 12 | 02 |
      // t9 = | 33 | 23 | 13 | 03 |

      "precr.qb.ph     $s0, $t1, $t0                     \n"
      "precr.qb.ph     $s1, $t9, $t8                     \n"
      "precrq.qb.ph    $s2, $t1, $t0                     \n"
      "precrq.qb.ph    $s3, $t9, $t8                     \n"

      // s0 = | 21 | 01 | 20 | 00 |
      // s1 = | 23 | 03 | 22 | 02 |
      // s2 = | 31 | 11 | 30 | 10 |
      // s3 = | 33 | 13 | 32 | 12 |

      "precr.qb.ph     $s4, $s1, $s0                     \n"
      "precrq.qb.ph    $s5, $s1, $s0                     \n"
      "precr.qb.ph     $s6, $s3, $s2                     \n"
      "precrq.qb.ph    $s7, $s3, $s2                     \n"

      // s4 = | 03 | 02 | 01 | 00 |
      // s5 = | 23 | 22 | 21 | 20 |
      // s6 = | 13 | 12 | 11 | 10 |
      // s7 = | 33 | 32 | 31 | 30 |

      "lwx              $t0, $t4(%[src])                 \n"  // 4 columns from rows 4..7
      "lwx              $t1, $t5(%[src])                 \n"
      "lwx              $t8, $t6(%[src])                 \n"
      "lwx              $t9, $t7(%[src])                 \n"

      // t0 = | 34 | 24 | 14 | 04 |
      // t1 = | 35 | 25 | 15 | 05 |
      // t8 = | 36 | 26 | 16 | 06 |
      // t9 = | 37 | 27 | 17 | 07 |

      "precr.qb.ph     $s0, $t1, $t0                     \n"
      "precr.qb.ph     $s1, $t9, $t8                     \n"
      "precrq.qb.ph    $s2, $t1, $t0                     \n"
      "precrq.qb.ph    $s3, $t9, $t8                     \n"

      // s0 = | 25 | 05 | 24 | 04 |
      // s1 = | 27 | 07 | 26 | 06 |
      // s2 = | 35 | 15 | 34 | 14 |
      // s3 = | 37 | 17 | 36 | 16 |

      "precr.qb.ph     $t0, $s1, $s0                     \n"
      "precrq.qb.ph    $t1, $s1, $s0                     \n"
      "precr.qb.ph     $t8, $s3, $s2                     \n"
      "precrq.qb.ph    $t9, $s3, $s2                     \n"

      // t0 = | 07 | 06 | 05 | 04 |
      // t1 = | 27 | 26 | 25 | 24 |
      // t8 = | 17 | 16 | 15 | 14 |
      // t9 = | 37 | 36 | 35 | 34 |

      "addu            $s0, %[dst], %[dst_stride]        \n"  // dst rows 1..3
      "addu            $s1, $s0, %[dst_stride]           \n"
      "addu            $s2, $s1, %[dst_stride]           \n"

      "sw              $s4, 0(%[dst])                    \n"  // four transposed 8-byte rows
      "sw              $t0, 4(%[dst])                    \n"
      "sw              $s6, 0($s0)                       \n"
      "sw              $t8, 4($s0)                       \n"
      "sw              $s5, 0($s1)                       \n"
      "sw              $t1, 4($s1)                       \n"
      "sw              $s7, 0($s2)                       \n"
      "sw              $t9, 4($s2)                       \n"

      "addiu            $AT, -1                          \n"
      "addiu            %[src], 4                        \n"  // advance 4 source columns

      "bnez             $AT, 1b                          \n"
      " addu            %[dst], $s2, %[dst_stride]       \n"  // delay slot: dst row 4
      "b                2f                               \n"
      // dst + dst_stride unaligned
      "11:                                               \n"
      "lw               $t0, 0(%[src])                   \n"
      "lwx              $t1, %[src_stride](%[src])       \n"
      "lwx              $t8, $t2(%[src])                 \n"
      "lwx              $t9, $t3(%[src])                 \n"

      // t0 = | 30 | 20 | 10 | 00 |
      // t1 = | 31 | 21 | 11 | 01 |
      // t8 = | 32 | 22 | 12 | 02 |
      // t9 = | 33 | 23 | 13 | 03 |

      "precr.qb.ph     $s0, $t1, $t0                     \n"
      "precr.qb.ph     $s1, $t9, $t8                     \n"
      "precrq.qb.ph    $s2, $t1, $t0                     \n"
      "precrq.qb.ph    $s3, $t9, $t8                     \n"

      // s0 = | 21 | 01 | 20 | 00 |
      // s1 = | 23 | 03 | 22 | 02 |
      // s2 = | 31 | 11 | 30 | 10 |
      // s3 = | 33 | 13 | 32 | 12 |

      "precr.qb.ph     $s4, $s1, $s0                     \n"
      "precrq.qb.ph    $s5, $s1, $s0                     \n"
      "precr.qb.ph     $s6, $s3, $s2                     \n"
      "precrq.qb.ph    $s7, $s3, $s2                     \n"

      // s4 = | 03 | 02 | 01 | 00 |
      // s5 = | 23 | 22 | 21 | 20 |
      // s6 = | 13 | 12 | 11 | 10 |
      // s7 = | 33 | 32 | 31 | 30 |

      "lwx              $t0, $t4(%[src])                 \n"
      "lwx              $t1, $t5(%[src])                 \n"
      "lwx              $t8, $t6(%[src])                 \n"
      "lwx              $t9, $t7(%[src])                 \n"

      // t0 = | 34 | 24 | 14 | 04 |
      // t1 = | 35 | 25 | 15 | 05 |
      // t8 = | 36 | 26 | 16 | 06 |
      // t9 = | 37 | 27 | 17 | 07 |

      "precr.qb.ph     $s0, $t1, $t0                     \n"
      "precr.qb.ph     $s1, $t9, $t8                     \n"
      "precrq.qb.ph    $s2, $t1, $t0                     \n"
      "precrq.qb.ph    $s3, $t9, $t8                     \n"

      // s0 = | 25 | 05 | 24 | 04 |
      // s1 = | 27 | 07 | 26 | 06 |
      // s2 = | 35 | 15 | 34 | 14 |
      // s3 = | 37 | 17 | 36 | 16 |

      "precr.qb.ph     $t0, $s1, $s0                     \n"
      "precrq.qb.ph    $t1, $s1, $s0                     \n"
      "precr.qb.ph     $t8, $s3, $s2                     \n"
      "precrq.qb.ph    $t9, $s3, $s2                     \n"

      // t0 = | 07 | 06 | 05 | 04 |
      // t1 = | 27 | 26 | 25 | 24 |
      // t8 = | 17 | 16 | 15 | 14 |
      // t9 = | 37 | 36 | 35 | 34 |

      "addu            $s0, %[dst], %[dst_stride]        \n"
      "addu            $s1, $s0, %[dst_stride]           \n"
      "addu            $s2, $s1, %[dst_stride]           \n"

      "swr              $s4, 0(%[dst])                   \n"  // unaligned stores (right/left pairs)
      "swl              $s4, 3(%[dst])                   \n"
      "swr              $t0, 4(%[dst])                   \n"
      "swl              $t0, 7(%[dst])                   \n"
      "swr              $s6, 0($s0)                      \n"
      "swl              $s6, 3($s0)                      \n"
      "swr              $t8, 4($s0)                      \n"
      "swl              $t8, 7($s0)                      \n"
      "swr              $s5, 0($s1)                      \n"
      "swl              $s5, 3($s1)                      \n"
      "swr              $t1, 4($s1)                      \n"
      "swl              $t1, 7($s1)                      \n"
      "swr              $s7, 0($s2)                      \n"
      "swl              $s7, 3($s2)                      \n"
      "swr              $t9, 4($s2)                      \n"
      "swl              $t9, 7($s2)                      \n"

      "addiu            $AT, -1                          \n"
      "addiu            %[src], 4                        \n"

      "bnez             $AT, 11b                         \n"
      " addu            %[dst], $s2, %[dst_stride]       \n"  // delay slot: dst row 4
      "2:                                                \n"
      ".set pop                                          \n"
      ".set at                                           \n"
      : [src] "+r"(src), [dst] "+r"(dst), [width] "+r"(width)
      : [src_stride] "r"(src_stride), [dst_stride] "r"(dst_stride)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
        "s2", "s3", "s4", "s5", "s6", "s7");
}
    305 
// Transposes an interleaved two-plane (e.g. U/V) 8-row block, splitting the
// planes on the fly: each 32-bit source load covers two byte-pair pixels
// (|B|A|b|a| in the register comments below -- lowercase = even column,
// uppercase = odd column).  Per iteration it handles 2 pixel columns, so the
// loop counter is width >> 1 (width assumed even -- presumably guaranteed by
// callers; confirm).  Plane-A bytes go to dst_a and dst_a + dst_stride_a,
// plane-B bytes to dst_b and dst_b + dst_stride_b; both destinations then
// advance by 2x their stride.  One aligned `sw` path and one unaligned
// `swr`/`swl` path, chosen from the combined alignment of both dst pointers
// and both dst strides.
void TransposeUVWx8_DSPR2(const uint8* src,
                          int src_stride,
                          uint8* dst_a,
                          int dst_stride_a,
                          uint8* dst_b,
                          int dst_stride_b,
                          int width) {
  __asm__ __volatile__(
      ".set push                                         \n"
      ".set noreorder                                    \n"
      "beqz            %[width], 2f                      \n"  // nothing to do
      " sll            $t2, %[src_stride], 0x1           \n"  // src_stride x 2
      "sll             $t4, %[src_stride], 0x2           \n"  // src_stride x 4
      "sll             $t9, %[src_stride], 0x3           \n"  // src_stride x 8
      "addu            $t3, $t2, %[src_stride]           \n"  // src_stride x 3
      "addu            $t5, $t4, %[src_stride]           \n"  // src_stride x 5
      "addu            $t6, $t2, $t4                     \n"  // src_stride x 6
      "subu            $t7, $t9, %[src_stride]           \n"  // src_stride x 7
      "srl             $t1, %[width], 1                  \n"  // loop count = width / 2

      // check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
      "andi            $t0, %[dst_a], 0x3                \n"
      "andi            $t8, %[dst_b], 0x3                \n"
      "or              $t0, $t0, $t8                     \n"
      "andi            $t8, %[dst_stride_a], 0x3         \n"
      "andi            $s5, %[dst_stride_b], 0x3         \n"
      "or              $t8, $t8, $s5                     \n"
      "or              $t0, $t0, $t8                     \n"  // t0 != 0 => unaligned path
      "bnez            $t0, 11f                          \n"
      " nop                                              \n"
      // dst + dst_stride word aligned (both, a & b dst addresses)
      "1:                                                \n"
      "lw              $t0, 0(%[src])                    \n"  // |B0|A0|b0|a0|
      "lwx             $t8, %[src_stride](%[src])        \n"  // |B1|A1|b1|a1|
      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"  // second output row for plane A
      "lwx             $t9, $t2(%[src])                  \n"  // |B2|A2|b2|a2|
      "lwx             $s0, $t3(%[src])                  \n"  // |B3|A3|b3|a3|
      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"  // second output row for plane B

      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B1|A1|B0|A0|
      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B3|A3|B2|A2|
      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A3|A2|A1|A0|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B3|B2|B1|B0|

      "sll             $t0, $t0, 16                      \n"
      "packrl.ph       $s1, $t8, $t0                     \n"  // |b1|a1|b0|a0|
      "sll             $t9, $t9, 16                      \n"
      "packrl.ph       $s2, $s0, $t9                     \n"  // |b3|a3|b2|a2|

      "sw              $s3, 0($s5)                       \n"  // odd-column rows
      "sw              $s4, 0($s6)                       \n"

      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a3|a2|a1|a0|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b3|b2|b1|b0|

      "lwx             $t0, $t4(%[src])                  \n"  // |B4|A4|b4|a4|
      "lwx             $t8, $t5(%[src])                  \n"  // |B5|A5|b5|a5|
      "lwx             $t9, $t6(%[src])                  \n"  // |B6|A6|b6|a6|
      "lwx             $s0, $t7(%[src])                  \n"  // |B7|A7|b7|a7|
      "sw              $s3, 0(%[dst_a])                  \n"  // even-column rows
      "sw              $s4, 0(%[dst_b])                  \n"

      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B5|A5|B4|A4|
      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B6|A6|B7|A7|
      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A7|A6|A5|A4|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B7|B6|B5|B4|

      "sll             $t0, $t0, 16                      \n"
      "packrl.ph       $s1, $t8, $t0                     \n"  // |b5|a5|b4|a4|
      "sll             $t9, $t9, 16                      \n"
      "packrl.ph       $s2, $s0, $t9                     \n"  // |b7|a7|b6|a6|
      "sw              $s3, 4($s5)                       \n"
      "sw              $s4, 4($s6)                       \n"

      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a7|a6|a5|a4|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b7|b6|b5|b4|

      "addiu           %[src], 4                         \n"  // advance 2 pixel columns
      "addiu           $t1, -1                           \n"
      "sll             $t0, %[dst_stride_a], 1           \n"  // dst_stride_a x 2
      "sll             $t8, %[dst_stride_b], 1           \n"  // dst_stride_b x 2
      "sw              $s3, 4(%[dst_a])                  \n"
      "sw              $s4, 4(%[dst_b])                  \n"
      "addu            %[dst_a], %[dst_a], $t0           \n"
      "bnez            $t1, 1b                           \n"
      " addu           %[dst_b], %[dst_b], $t8           \n"  // delay slot
      "b               2f                                \n"
      " nop                                              \n"

      // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
      "11:                                               \n"
      "lw              $t0, 0(%[src])                    \n"  // |B0|A0|b0|a0|
      "lwx             $t8, %[src_stride](%[src])        \n"  // |B1|A1|b1|a1|
      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
      "lwx             $t9, $t2(%[src])                  \n"  // |B2|A2|b2|a2|
      "lwx             $s0, $t3(%[src])                  \n"  // |B3|A3|b3|a3|
      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"

      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B1|A1|B0|A0|
      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B3|A3|B2|A2|
      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A3|A2|A1|A0|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B3|B2|B1|B0|

      "sll             $t0, $t0, 16                      \n"
      "packrl.ph       $s1, $t8, $t0                     \n"  // |b1|a1|b0|a0|
      "sll             $t9, $t9, 16                      \n"
      "packrl.ph       $s2, $s0, $t9                     \n"  // |b3|a3|b2|a2|

      "swr             $s3, 0($s5)                       \n"  // unaligned stores (right/left pairs)
      "swl             $s3, 3($s5)                       \n"
      "swr             $s4, 0($s6)                       \n"
      "swl             $s4, 3($s6)                       \n"

      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a3|a2|a1|a0|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b3|b2|b1|b0|

      "lwx             $t0, $t4(%[src])                  \n"  // |B4|A4|b4|a4|
      "lwx             $t8, $t5(%[src])                  \n"  // |B5|A5|b5|a5|
      "lwx             $t9, $t6(%[src])                  \n"  // |B6|A6|b6|a6|
      "lwx             $s0, $t7(%[src])                  \n"  // |B7|A7|b7|a7|
      "swr             $s3, 0(%[dst_a])                  \n"
      "swl             $s3, 3(%[dst_a])                  \n"
      "swr             $s4, 0(%[dst_b])                  \n"
      "swl             $s4, 3(%[dst_b])                  \n"

      "precrq.ph.w     $s1, $t8, $t0                     \n"  // |B5|A5|B4|A4|
      "precrq.ph.w     $s2, $s0, $t9                     \n"  // |B6|A6|B7|A7|
      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |A7|A6|A5|A4|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |B7|B6|B5|B4|

      "sll             $t0, $t0, 16                      \n"
      "packrl.ph       $s1, $t8, $t0                     \n"  // |b5|a5|b4|a4|
      "sll             $t9, $t9, 16                      \n"
      "packrl.ph       $s2, $s0, $t9                     \n"  // |b7|a7|b6|a6|

      "swr             $s3, 4($s5)                       \n"
      "swl             $s3, 7($s5)                       \n"
      "swr             $s4, 4($s6)                       \n"
      "swl             $s4, 7($s6)                       \n"

      "precr.qb.ph     $s3, $s2, $s1                     \n"  // |a7|a6|a5|a4|
      "precrq.qb.ph    $s4, $s2, $s1                     \n"  // |b7|b6|b5|b4|

      "addiu           %[src], 4                         \n"
      "addiu           $t1, -1                           \n"
      "sll             $t0, %[dst_stride_a], 1           \n"
      "sll             $t8, %[dst_stride_b], 1           \n"
      "swr             $s3, 4(%[dst_a])                  \n"
      "swl             $s3, 7(%[dst_a])                  \n"
      "swr             $s4, 4(%[dst_b])                  \n"
      "swl             $s4, 7(%[dst_b])                  \n"
      "addu            %[dst_a], %[dst_a], $t0           \n"
      "bnez            $t1, 11b                          \n"
      " addu           %[dst_b], %[dst_b], $t8           \n"  // delay slot

      "2:                                                \n"
      ".set pop                                          \n"
      : [src] "+r"(src), [dst_a] "+r"(dst_a), [dst_b] "+r"(dst_b),
        [width] "+r"(width), [src_stride] "+r"(src_stride)
      : [dst_stride_a] "r"(dst_stride_a), [dst_stride_b] "r"(dst_stride_b)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "s0", "s1",
        "s2", "s3", "s4", "s5", "s6");
}
    469 
    470 #endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
    471 
    472 #ifdef __cplusplus
    473 }  // extern "C"
    474 }  // namespace libyuv
    475 #endif
    476