Home | History | Annotate | Download | only in ext
      1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include <algorithm>
      6 #include "skia/ext/convolver.h"
      7 #include "skia/ext/convolver_mips_dspr2.h"
      8 #include "third_party/skia/include/core/SkTypes.h"
      9 
     10 namespace skia {
     11 // Convolves horizontally along a single row. The row data is given in
     12 // |src_data| and continues for the num_values() of the filter.
     13 void ConvolveHorizontally_mips_dspr2(const unsigned char* src_data,
     14                                      const ConvolutionFilter1D& filter,
     15                                      unsigned char* out_row,
     16                                      bool has_alpha) {
     17 #if SIMD_MIPS_DSPR2
     18   int row_to_filter = 0;
     19   int num_values = filter.num_values();
     20   if (has_alpha) {
     21     for (int out_x = 0; out_x < num_values; out_x++) {
     22       // Get the filter that determines the current output pixel.
     23       int filter_offset, filter_length;
     24       const ConvolutionFilter1D::Fixed* filter_values =
     25         filter.FilterForValue(out_x, &filter_offset, &filter_length);
     26       int filter_x = 0;
     27 
     28       __asm__ __volatile__ (
     29         ".set push                                  \n"
     30         ".set noreorder                             \n"
     31 
     32         "beqz            %[filter_len], 3f          \n"
     33         " sll            $t0, %[filter_offset], 2   \n"
     34         "addu            %[rtf], %[src_data], $t0   \n"
     35         "mtlo            $0, $ac0                   \n"
     36         "mtlo            $0, $ac1                   \n"
     37         "mtlo            $0, $ac2                   \n"
     38         "mtlo            $0, $ac3                   \n"
     39         "srl             $t7, %[filter_len], 2      \n"
     40         "beqz            $t7, 2f                    \n"
     41         " li             %[fx], 0                   \n"
     42 
     43         "11:                                        \n"
     44         "addu            $t4, %[filter_val], %[fx]  \n"
     45         "sll             $t5, %[fx], 1              \n"
     46         "ulw             $t6, 0($t4)                \n" // t6 = |cur[1]|cur[0]|
     47         "ulw             $t8, 4($t4)                \n" // t8 = |cur[3]|cur[2]|
     48         "addu            $t0, %[rtf], $t5           \n"
     49         "lw              $t1, 0($t0)                \n" // t1 = |a0|b0|g0|r0|
     50         "lw              $t2, 4($t0)                \n" // t2 = |a1|b1|g1|r1|
     51         "lw              $t3, 8($t0)                \n" // t3 = |a2|b2|g2|r2|
     52         "lw              $t4, 12($t0)               \n" // t4 = |a3|b3|g3|r3|
     53         "precrq.qb.ph    $t0, $t2, $t1              \n" // t0 = |a1|g1|a0|g0|
     54         "precr.qb.ph     $t5, $t2, $t1              \n" // t5 = |b1|r1|b0|r0|
     55         "preceu.ph.qbla  $t1, $t0                   \n" // t1 = |0|a1|0|a0|
     56         "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g1|0|g0|
     57         "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b1|0|b0|
     58         "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r1|0|r0|
     59         "dpa.w.ph        $ac0, $t1, $t6             \n" // ac0+(cur*a1)+(cur*a0)
     60         "dpa.w.ph        $ac1, $t0, $t6             \n" // ac1+(cur*b1)+(cur*b0)
     61         "dpa.w.ph        $ac2, $t2, $t6             \n" // ac2+(cur*g1)+(cur*g0)
     62         "dpa.w.ph        $ac3, $t5, $t6             \n" // ac3+(cur*r1)+(cur*r0)
     63         "precrq.qb.ph    $t0, $t4, $t3              \n" // t0 = |a3|g3|a2|g2|
     64         "precr.qb.ph     $t5, $t4, $t3              \n" // t5 = |b3|r3|b2|r2|
     65         "preceu.ph.qbla  $t1, $t0                   \n" // t1 = |0|a3|0|a2|
     66         "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g3|0|g2|
     67         "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b3|0|b2|
     68         "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r3|0|r2|
     69         "dpa.w.ph        $ac0, $t1, $t8             \n" // ac0+(cur*a3)+(cur*a2)
     70         "dpa.w.ph        $ac1, $t0, $t8             \n" // ac1+(cur*b3)+(cur*b2)
     71         "dpa.w.ph        $ac2, $t2, $t8             \n" // ac2+(cur*g3)+(cur*g2)
     72         "dpa.w.ph        $ac3, $t5, $t8             \n" // ac3+(cur*r3)+(cur*r2)
     73         "addiu           $t7, $t7, -1               \n"
     74         "bgtz            $t7, 11b                   \n"
     75         " addiu          %[fx], %[fx], 8            \n"
     76 
     77         "2:                                         \n"
     78         "andi            $t7, %[filter_len], 0x3    \n" // residual
     79         "beqz            $t7, 3f                    \n"
     80         " nop                                       \n"
     81 
     82         "21:                                        \n"
     83         "sll             $t1, %[fx], 1              \n"
     84         "addu            $t2, %[filter_val], %[fx]  \n"
     85         "addu            $t0, %[rtf], $t1           \n"
     86         "lh              $t6, 0($t2)                \n" // t6 = filter_val[fx]
     87         "lbu             $t1, 0($t0)                \n" // t1 = row[fx * 4 + 0]
     88         "lbu             $t2, 1($t0)                \n" // t2 = row[fx * 4 + 1]
     89         "lbu             $t3, 2($t0)                \n" // t3 = row[fx * 4 + 2]
     90         "lbu             $t4, 3($t0)                \n" // t4 = row[fx * 4 + 2]
     91         "maddu           $ac3, $t6, $t1             \n"
     92         "maddu           $ac2, $t6, $t2             \n"
     93         "maddu           $ac1, $t6, $t3             \n"
     94         "maddu           $ac0, $t6, $t4             \n"
     95         "addiu           $t7, $t7, -1               \n"
     96         "bgtz            $t7, 21b                   \n"
     97         " addiu          %[fx], %[fx], 2            \n"
     98 
     99         "3:                                         \n"
    100         "extrv.w         $t0, $ac0, %[kShiftBits]   \n" // a >> kShiftBits
    101         "extrv.w         $t1, $ac1, %[kShiftBits]   \n" // b >> kShiftBits
    102         "extrv.w         $t2, $ac2, %[kShiftBits]   \n" // g >> kShiftBits
    103         "extrv.w         $t3, $ac3, %[kShiftBits]   \n" // r >> kShiftBits
    104         "sll             $t5, %[out_x], 2           \n"
    105         "repl.ph         $t6, 128                   \n" // t6 = | 128 | 128 |
    106         "addu            $t5, %[out_row], $t5       \n"
    107         "append          $t2, $t3, 16               \n"
    108         "append          $t0, $t1, 16               \n"
    109         "subu.ph         $t1, $t0, $t6              \n"
    110         "shll_s.ph       $t1, $t1, 8                \n"
    111         "shra.ph         $t1, $t1, 8                \n"
    112         "addu.ph         $t1, $t1, $t6              \n"
    113         "subu.ph         $t3, $t2, $t6              \n"
    114         "shll_s.ph       $t3, $t3, 8                \n"
    115         "shra.ph         $t3, $t3, 8                \n"
    116         "addu.ph         $t3, $t3, $t6              \n"
    117         "precr.qb.ph     $t0, $t1, $t3              \n"
    118         "usw             $t0, 0($t5)                \n"
    119 
    120         ".set pop                                   \n"
    121       : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),
    122         [rtf] "+r" (row_to_filter)
    123       : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),
    124         [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),
    125         [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)
    126       : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
    127         "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"
    128       );
    129     }
    130   } else {
    131     for (int out_x = 0; out_x < num_values; out_x++) {
    132       // Get the filter that determines the current output pixel.
    133       int filter_offset, filter_length;
    134       const ConvolutionFilter1D::Fixed* filter_values =
    135         filter.FilterForValue(out_x, &filter_offset, &filter_length);
    136       int filter_x = 0;
    137       __asm__ __volatile__ (
    138         ".set push                                  \n"
    139         ".set noreorder                             \n"
    140 
    141         "beqz            %[filter_len], 3f          \n"
    142         " sll            $t0, %[filter_offset], 2   \n"
    143         "addu            %[rtf], %[src_data], $t0   \n"
    144         "mtlo            $0, $ac1                   \n"
    145         "mtlo            $0, $ac2                   \n"
    146         "mtlo            $0, $ac3                   \n"
    147         "srl             $t7, %[filter_len], 2      \n"
    148         "beqz            $t7, 2f                    \n"
    149         " li             %[fx], 0                   \n"
    150 
    151         "11:                                        \n"
    152         "addu            $t4, %[filter_val], %[fx]  \n"
    153         "sll             $t5, %[fx], 1              \n"
    154         "ulw             $t6, 0($t4)                \n" // t6 = |cur[1]|cur[0]|
    155         "ulw             $t8, 4($t4)                \n" // t8 = |cur[3]|cur[2]|
    156         "addu            $t0, %[rtf], $t5           \n"
    157         "lw              $t1, 0($t0)                \n" // t1 = |a0|b0|g0|r0|
    158         "lw              $t2, 4($t0)                \n" // t2 = |a1|b1|g1|r1|
    159         "lw              $t3, 8($t0)                \n" // t3 = |a2|b2|g2|r2|
    160         "lw              $t4, 12($t0)               \n" // t4 = |a3|b3|g3|r3|
    161         "precrq.qb.ph    $t0, $t2, $t1              \n" // t0 = |a1|g1|a0|g0|
    162         "precr.qb.ph     $t5, $t2, $t1              \n" // t5 = |b1|r1|b0|r0|
    163         "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g1|0|g0|
    164         "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b1|0|b0|
    165         "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r1|0|r0|
    166         "dpa.w.ph        $ac1, $t0, $t6             \n" // ac1+(cur*b1)+(cur*b0)
    167         "dpa.w.ph        $ac2, $t2, $t6             \n" // ac2+(cur*g1)+(cur*g0)
    168         "dpa.w.ph        $ac3, $t5, $t6             \n" // ac3+(cur*r1)+(cur*r0)
    169         "precrq.qb.ph    $t0, $t4, $t3              \n" // t0 = |a3|g3|a2|g2|
    170         "precr.qb.ph     $t5, $t4, $t3              \n" // t5 = |b3|r3|b2|r2|
    171         "preceu.ph.qbra  $t2, $t0                   \n" // t2 = |0|g3|0|g2|
    172         "preceu.ph.qbla  $t0, $t5                   \n" // t0 = |0|b3|0|b2|
    173         "preceu.ph.qbra  $t5, $t5                   \n" // t5 = |0|r3|0|r2|
    174         "dpa.w.ph        $ac1, $t0, $t8             \n" // ac1+(cur*b3)+(cur*b2)
    175         "dpa.w.ph        $ac2, $t2, $t8             \n" // ac2+(cur*g3)+(cur*g2)
    176         "dpa.w.ph        $ac3, $t5, $t8             \n" // ac3+(cur*r3)+(cur*r2)
    177         "addiu           $t7, $t7, -1               \n"
    178         "bgtz            $t7, 11b                   \n"
    179         " addiu          %[fx], %[fx], 8            \n"
    180 
    181         "2:                                         \n"
    182         "andi            $t7, %[filter_len], 0x3    \n" // residual
    183         "beqz            $t7, 3f                    \n"
    184         " nop                                       \n"
    185 
    186         "21:                                        \n"
    187         "sll             $t1, %[fx], 1              \n"
    188         "addu            $t2, %[filter_val], %[fx]  \n"
    189         "addu            $t0, %[rtf], $t1           \n"
    190         "lh              $t6, 0($t2)                \n" // t6 = filter_val[fx]
    191         "lbu             $t1, 0($t0)                \n" // t1 = row[fx * 4 + 0]
    192         "lbu             $t2, 1($t0)                \n" // t2 = row[fx * 4 + 1]
    193         "lbu             $t3, 2($t0)                \n" // t3 = row[fx * 4 + 2]
    194         "maddu           $ac3, $t6, $t1             \n"
    195         "maddu           $ac2, $t6, $t2             \n"
    196         "maddu           $ac1, $t6, $t3             \n"
    197         "addiu           $t7, $t7, -1               \n"
    198         "bgtz            $t7, 21b                   \n"
    199         " addiu          %[fx], %[fx], 2            \n"
    200 
    201         "3:                                         \n"
    202         "extrv.w         $t1, $ac1, %[kShiftBits]   \n" // b >> kShiftBits
    203         "extrv.w         $t2, $ac2, %[kShiftBits]   \n" // g >> kShiftBits
    204         "extrv.w         $t3, $ac3, %[kShiftBits]   \n" // r >> kShiftBits
    205         "repl.ph         $t6, 128                   \n" // t6 = | 128 | 128 |
    206         "sll             $t8, %[out_x], 2           \n"
    207         "addu            $t8, %[out_row], $t8       \n"
    208         "append          $t2, $t3, 16               \n"
    209         "andi            $t1, 0xFFFF                \n"
    210         "subu.ph         $t5, $t1, $t6              \n"
    211         "shll_s.ph       $t5, $t5, 8                \n"
    212         "shra.ph         $t5, $t5, 8                \n"
    213         "addu.ph         $t5, $t5, $t6              \n"
    214         "subu.ph         $t4, $t2, $t6              \n"
    215         "shll_s.ph       $t4, $t4, 8                \n"
    216         "shra.ph         $t4, $t4, 8                \n"
    217         "addu.ph         $t4, $t4, $t6              \n"
    218         "precr.qb.ph     $t0, $t5, $t4              \n"
    219         "usw             $t0, 0($t8)                \n"
    220 
    221         ".set pop                                   \n"
    222       : [fx] "+r" (filter_x), [out_x] "+r" (out_x), [out_row] "+r" (out_row),
    223         [rtf] "+r" (row_to_filter)
    224       : [filter_val] "r" (filter_values), [filter_len] "r" (filter_length),
    225         [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits),
    226         [filter_offset] "r" (filter_offset), [src_data] "r" (src_data)
    227       : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
    228         "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8"
    229       );
    230     }
    231   }
    232 #endif
    233 }
    234 void ConvolveVertically_mips_dspr2(const ConvolutionFilter1D::Fixed* filter_val,
    235                                    int filter_length,
    236                                    unsigned char* const* source_data_rows,
    237                                    int pixel_width,
    238                                    unsigned char* out_row,
    239                                    bool has_alpha) {
    240 #if SIMD_MIPS_DSPR2
    241   // We go through each column in the output and do a vertical convolution,
    242   // generating one output pixel each time.
    243   int byte_offset;
    244   int cnt;
    245   int filter_y;
    246   if (has_alpha) {
    247     for (int out_x = 0; out_x < pixel_width; out_x++) {
    248       __asm__ __volatile__ (
    249         ".set push                                   \n"
    250         ".set noreorder                              \n"
    251 
    252         "beqz            %[filter_len], 3f           \n"
    253         " sll            %[offset], %[out_x], 2      \n"
    254         "mtlo            $0, $ac0                    \n"
    255         "mtlo            $0, $ac1                    \n"
    256         "mtlo            $0, $ac2                    \n"
    257         "mtlo            $0, $ac3                    \n"
    258         "srl             %[cnt], %[filter_len], 2    \n"
    259         "beqz            %[cnt], 2f                  \n"
    260         " li             %[fy], 0                    \n"
    261 
    262         "11:                                         \n"
    263         "sll             $t1, %[fy], 1               \n"
    264         "addu            $t0, %[src_data_rows], $t1  \n"
    265         "lw              $t1, 0($t0)                 \n"
    266         "lw              $t2, 4($t0)                 \n"
    267         "lw              $t3, 8($t0)                 \n"
    268         "lw              $t4, 12($t0)                \n"
    269         "addu            $t1, $t1, %[offset]         \n"
    270         "addu            $t2, $t2, %[offset]         \n"
    271         "addu            $t3, $t3, %[offset]         \n"
    272         "addu            $t4, $t4, %[offset]         \n"
    273         "lw              $t1, 0($t1)                 \n" // t1 = |a0|b0|g0|r0|
    274         "lw              $t2, 0($t2)                 \n" // t2 = |a1|b1|g1|r1|
    275         "lw              $t3, 0($t3)                 \n" // t3 = |a0|b0|g0|r0|
    276         "lw              $t4, 0($t4)                 \n" // t4 = |a1|b1|g1|r1|
    277         "precrq.qb.ph    $t5, $t2, $t1               \n" // t5 = |a1|g1|a0|g0|
    278         "precr.qb.ph     $t6, $t2, $t1               \n" // t6 = |b1|r1|b0|r0|
    279         "preceu.ph.qbla  $t0, $t5                    \n" // t0 = |0|a1|0|a0|
    280         "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g1|0|g0|
    281         "preceu.ph.qbla  $t2, $t6                    \n" // t2 = |0|b1|0|b0|
    282         "preceu.ph.qbra  $t5, $t6                    \n" // t5 = |0|r1|0|r0|
    283         "addu            $t6, %[filter_val], %[fy]   \n"
    284         "ulw             $t7, 0($t6)                 \n" // t7 = |cur_1|cur_0|
    285         "ulw             $t6, 4($t6)                 \n" // t6 = |cur_3|cur_2|
    286         "dpa.w.ph        $ac0, $t5, $t7              \n" // (cur*r1)+(cur*r0)
    287         "dpa.w.ph        $ac1, $t1, $t7              \n" // (cur*g1)+(cur*g0)
    288         "dpa.w.ph        $ac2, $t2, $t7              \n" // (cur*b1)+(cur*b0)
    289         "dpa.w.ph        $ac3, $t0, $t7              \n" // (cur*a1)+(cur*a0)
    290         "precrq.qb.ph    $t5, $t4, $t3               \n" // t5 = |a3|g3|a2|g2|
    291         "precr.qb.ph     $t7, $t4, $t3               \n" // t7 = |b3|r3|b2|r2|
    292         "preceu.ph.qbla  $t0, $t5                    \n" // t0 = |0|a3|0|a2|
    293         "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g3|0|g2|
    294         "preceu.ph.qbla  $t2, $t7                    \n" // t2 = |0|b3|0|b2|
    295         "preceu.ph.qbra  $t5, $t7                    \n" // t5 = |0|r3|0|r2|
    296         "dpa.w.ph        $ac0, $t5, $t6              \n" // (cur*r3)+(cur*r2)
    297         "dpa.w.ph        $ac1, $t1, $t6              \n" // (cur*g3)+(cur*g2)
    298         "dpa.w.ph        $ac2, $t2, $t6              \n" // (cur*b3)+(cur*b2)
    299         "dpa.w.ph        $ac3, $t0, $t6              \n" // (cur*a3)+(cur*a2)
    300         "addiu           %[cnt], %[cnt], -1          \n"
    301         "bgtz            %[cnt], 11b                 \n"
    302         " addiu          %[fy], %[fy], 8             \n"
    303 
    304         "2:                                          \n"
    305         "andi            %[cnt], %[filter_len], 0x3  \n" // residual
    306         "beqz            %[cnt], 3f                  \n"
    307         " nop                                        \n"
    308 
    309         "21:                                         \n"
    310         "addu            $t0, %[filter_val], %[fy]   \n"
    311         "lh              $t4, 0($t0)                 \n" // t4=filter_val[fx]
    312         "sll             $t1, %[fy], 1               \n"
    313         "addu            $t0, %[src_data_rows], $t1  \n"
    314         "lw              $t1, 0($t0)                 \n"
    315         "addu            $t0, $t1, %[offset]         \n"
    316         "lbu             $t1, 0($t0)                 \n" // t1 = row[fx*4 + 0]
    317         "lbu             $t2, 1($t0)                 \n" // t2 = row[fx*4 + 1]
    318         "lbu             $t3, 2($t0)                 \n" // t3 = row[fx*4 + 2]
    319         "lbu             $t0, 3($t0)                 \n" // t4 = row[fx*4 + 2]
    320         "maddu           $ac0, $t4, $t1              \n"
    321         "maddu           $ac1, $t4, $t2              \n"
    322         "maddu           $ac2, $t4, $t3              \n"
    323         "maddu           $ac3, $t4, $t0              \n"
    324         "addiu           %[cnt], %[cnt], -1          \n"
    325         "bgtz            %[cnt], 21b                 \n"
    326         " addiu          %[fy], %[fy], 2             \n"
    327 
    328         "3:                                          \n"
    329         "extrv.w         $t3, $ac0, %[kShiftBits]    \n" // a >> kShiftBits
    330         "extrv.w         $t2, $ac1, %[kShiftBits]    \n" // b >> kShiftBits
    331         "extrv.w         $t1, $ac2, %[kShiftBits]    \n" // g >> kShiftBits
    332         "extrv.w         $t0, $ac3, %[kShiftBits]    \n" // r >> kShiftBits
    333         "repl.ph         $t4, 128                    \n" // t4 = | 128 | 128 |
    334         "addu            $t5, %[out_row], %[offset]  \n"
    335         "append          $t2, $t3, 16                \n" // t2 = |0|g|0|r|
    336         "append          $t0, $t1, 16                \n" // t0 = |0|a|0|b|
    337         "subu.ph         $t1, $t0, $t4               \n"
    338         "shll_s.ph       $t1, $t1, 8                 \n"
    339         "shra.ph         $t1, $t1, 8                 \n"
    340         "addu.ph         $t1, $t1, $t4               \n" // Clamp(a)|Clamp(b)
    341         "subu.ph         $t2, $t2, $t4               \n"
    342         "shll_s.ph       $t2, $t2, 8                 \n"
    343         "shra.ph         $t2, $t2, 8                 \n"
    344         "addu.ph         $t2, $t2, $t4               \n" // Clamp(g)|Clamp(r)
    345         "andi            $t3, $t1, 0xFF              \n" // t3 = ClampTo8(b)
    346         "cmp.lt.ph       $t3, $t2                    \n" // cmp b, g, r
    347         "pick.ph         $t0, $t2, $t3               \n"
    348         "andi            $t3, $t0, 0xFF              \n"
    349         "srl             $t4, $t0, 16                \n"
    350         "cmp.lt.ph       $t3, $t4                    \n"
    351         "pick.ph         $t0, $t4, $t3               \n" // t0 = max_color_ch
    352         "srl             $t3, $t1, 16                \n" // t1 = ClampTo8(a)
    353         "cmp.lt.ph       $t3, $t0                    \n"
    354         "pick.ph         $t0, $t0, $t3               \n"
    355         "ins             $t1, $t0, 16, 8             \n"
    356         "precr.qb.ph     $t0, $t1, $t2               \n" // t0 = |a|b|g|r|
    357         "usw             $t0, 0($t5)                 \n"
    358 
    359         ".set pop                                    \n"
    360       : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),
    361         [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),
    362         [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)
    363       : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),
    364         [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)
    365       : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
    366         "t0", "t1", "t2", "t3", "t4", "t5", "t6","t7", "memory"
    367       );
    368     }
    369   } else {
    370     for (int out_x = 0; out_x < pixel_width; out_x++) {
    371       __asm__ __volatile__ (
    372         ".set push                                   \n"
    373         ".set noreorder                              \n"
    374 
    375         "beqz            %[filter_len], 3f           \n"
    376         " sll            %[offset], %[out_x], 2      \n"
    377         "mtlo            $0, $ac0                    \n"
    378         "mtlo            $0, $ac1                    \n"
    379         "mtlo            $0, $ac2                    \n"
    380         "srl             %[cnt], %[filter_len], 2    \n"
    381         "beqz            %[cnt], 2f                  \n"
    382         " li             %[fy], 0                    \n"
    383 
    384         "11:                                         \n"
    385         "sll             $t1, %[fy], 1               \n"
    386         "addu            $t0, %[src_data_rows], $t1  \n"
    387         "lw              $t1, 0($t0)                 \n"
    388         "lw              $t2, 4($t0)                 \n"
    389         "lw              $t3, 8($t0)                 \n"
    390         "lw              $t4, 12($t0)                \n"
    391         "addu            $t1, $t1, %[offset]         \n"
    392         "addu            $t2, $t2, %[offset]         \n"
    393         "addu            $t3, $t3, %[offset]         \n"
    394         "addu            $t4, $t4, %[offset]         \n"
    395         "lw              $t1, 0($t1)                 \n" // t1 = |a0|b0|g0|r0|
    396         "lw              $t2, 0($t2)                 \n" // t2 = |a1|b1|g1|r1|
    397         "lw              $t3, 0($t3)                 \n" // t3 = |a0|b0|g0|r0|
    398         "lw              $t4, 0($t4)                 \n" // t4 = |a1|b1|g1|r1|
    399         "precrq.qb.ph    $t5, $t2, $t1               \n" // t5 = |a1|g1|a0|g0|
    400         "precr.qb.ph     $t6, $t2, $t1               \n" // t6 = |b1|r1|b0|r0|
    401         "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g1|0|g0|
    402         "preceu.ph.qbla  $t2, $t6                    \n" // t2 = |0|b1|0|b0|
    403         "preceu.ph.qbra  $t5, $t6                    \n" // t5 = |0|r1|0|r0|
    404         "addu            $t6, %[filter_val], %[fy]   \n"
    405         "ulw             $t0, 0($t6)                 \n" // t0 = |cur_1|cur_0|
    406         "ulw             $t6, 4($t6)                 \n" // t6 = |cur_1|cur_0|
    407         "dpa.w.ph        $ac0, $t5, $t0              \n" // (cur*r1)+(cur*r0)
    408         "dpa.w.ph        $ac1, $t1, $t0              \n" // (cur*g1)+(cur*g0)
    409         "dpa.w.ph        $ac2, $t2, $t0              \n" // (cur*b1)+(cur*b0)
    410         "precrq.qb.ph    $t5, $t4, $t3               \n" // t5 = |a3|g3|a2|g2|
    411         "precr.qb.ph     $t0, $t4, $t3               \n" // t0 = |b3|r3|b2|r2|
    412         "preceu.ph.qbra  $t1, $t5                    \n" // t1 = |0|g3|0|g2|
    413         "preceu.ph.qbla  $t2, $t0                    \n" // t2 = |0|b3|0|b2|
    414         "preceu.ph.qbra  $t5, $t0                    \n" // t5 = |0|r3|0|r2|
    415         "dpa.w.ph        $ac0, $t5, $t6              \n" // (cur*r1)+(cur*r0)
    416         "dpa.w.ph        $ac1, $t1, $t6              \n" // (cur*g1)+(cur*g0)
    417         "dpa.w.ph        $ac2, $t2, $t6              \n" // (cur*b1)+(cur*b0)
    418         "addiu           %[cnt], %[cnt], -1          \n"
    419         "bgtz            %[cnt], 11b                 \n"
    420         " addiu          %[fy], %[fy], 8             \n"
    421 
    422         "2:                                          \n"
    423         "andi            %[cnt], %[filter_len], 0x3  \n" // residual
    424         "beqz            %[cnt], 3f                  \n"
    425         " nop                                        \n"
    426 
    427         "21:                                         \n"
    428         "addu            $t0, %[filter_val], %[fy]   \n"
    429         "lh              $t4, 0($t0)                 \n" // filter_val[fx]
    430         "sll             $t1, %[fy], 1               \n"
    431         "addu            $t0, %[src_data_rows], $t1  \n"
    432         "lw              $t1, 0($t0)                 \n"
    433         "addu            $t0, $t1, %[offset]         \n"
    434         "lbu             $t1, 0($t0)                 \n" // t1 = row[fx*4 + 0]
    435         "lbu             $t2, 1($t0)                 \n" // t2 = row[fx*4 + 1]
    436         "lbu             $t3, 2($t0)                 \n" // t3 = row[fx*4 + 2]
    437         "maddu           $ac0, $t4, $t1              \n"
    438         "maddu           $ac1, $t4, $t2              \n"
    439         "maddu           $ac2, $t4, $t3              \n"
    440         "addiu           %[cnt], %[cnt], -1          \n"
    441         "bgtz            %[cnt], 21b                 \n"
    442         " addiu          %[fy], %[fy], 2             \n"
    443 
    444         "3:                                          \n"
    445         "extrv.w         $t3, $ac0, %[kShiftBits]    \n" // r >> kShiftBits
    446         "extrv.w         $t2, $ac1, %[kShiftBits]    \n" // g >> kShiftBits
    447         "extrv.w         $t1, $ac2, %[kShiftBits]    \n" // b >> kShiftBits
    448         "repl.ph         $t6, 128                    \n" // t6 = | 128 | 128 |
    449         "addu            $t5, %[out_row], %[offset]  \n"
    450         "append          $t2, $t3, 16                \n" // t2 = |0|g|0|r|
    451         "andi            $t1, $t1, 0xFFFF            \n"
    452         "subu.ph         $t1, $t1, $t6               \n"
    453         "shll_s.ph       $t1, $t1, 8                 \n"
    454         "shra.ph         $t1, $t1, 8                 \n"
    455         "addu.ph         $t1, $t1, $t6               \n" // Clamp(a)|Clamp(b)
    456         "subu.ph         $t2, $t2, $t6               \n"
    457         "shll_s.ph       $t2, $t2, 8                 \n"
    458         "shra.ph         $t2, $t2, 8                 \n"
    459         "addu.ph         $t2, $t2, $t6               \n" // Clamp(g)|Clamp(r)
    460         "li              $t0, 0xFF                   \n"
    461         "ins             $t1, $t0, 16, 8             \n"
    462         "precr.qb.ph     $t0, $t1, $t2               \n" // t0 = |a|b|g|r|
    463         "usw             $t0, 0($t5)                 \n"
    464 
    465         ".set pop                                    \n"
    466       : [filter_val] "+r" (filter_val), [filter_len] "+r" (filter_length),
    467         [offset] "+r" (byte_offset), [fy] "+r" (filter_y), [cnt] "+r" (cnt),
    468         [out_x] "+r" (out_x), [pixel_width] "+r" (pixel_width)
    469       : [src_data_rows] "r" (source_data_rows), [out_row] "r" (out_row),
    470         [kShiftBits] "r" (ConvolutionFilter1D::kShiftBits)
    471       : "lo", "hi", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo", "$ac3hi",
    472         "t0", "t1", "t2", "t3", "t4", "t5", "t6", "memory"
    473       );
    474     }
    475   }
    476 #endif
    477 }
    478 } // namespace skia
    479