/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <string.h>

#include "libyuv/row.h"

// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#define ALPHA_VAL (-1)

// Fill YUV -> RGB conversion constants into vectors
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
  {                                                              \
    ub = __msa_fill_w(yuvconst->kUVToB[0]);                      \
    vr = __msa_fill_w(yuvconst->kUVToR[1]);                      \
    ug = __msa_fill_w(yuvconst->kUVToG[0]);                      \
    vg = __msa_fill_w(yuvconst->kUVToG[1]);                      \
    bb = __msa_fill_w(yuvconst->kUVBiasB[0]);                    \
    bg = __msa_fill_w(yuvconst->kUVBiasG[0]);                    \
    br = __msa_fill_w(yuvconst->kUVBiasR[0]);                    \
    yg = __msa_fill_w(yuvconst->kYToRgb[0]);                     \
  }

// Load YUV 422 pixel data
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v)  \
  {                                                              \
    uint64 y_m;                                                  \
    uint32 u_m, v_m;                                             \
    v4i32 zero_m = {0};                                          \
    y_m = LD(psrc_y);                                            \
    u_m = LW(psrc_u);                                            \
    v_m = LW(psrc_v);                                            \
    out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \
    out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m);        \
    out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m);        \
  }

// Clip input vector elements between 0 and 255
#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
  {                                               \
    v4i32 max_m = __msa_ldi_w(0xFF);              \
                                                  \
    in0 = __msa_maxi_s_w(in0, 0);                 \
    in1 = __msa_maxi_s_w(in1, 0);                 \
    in2 = __msa_maxi_s_w(in2, 0);                 \
    in3 = __msa_maxi_s_w(in3, 0);                 \
    in4 = __msa_maxi_s_w(in4, 0);                 \
    in5 = __msa_maxi_s_w(in5, 0);                 \
    in0 = __msa_min_s_w(max_m, in0);              \
    in1 = __msa_min_s_w(max_m, in1);              \
    in2 = __msa_min_s_w(max_m, in2);              \
    in3 = __msa_min_s_w(max_m, in3);              \
    in4 = __msa_min_s_w(max_m, in4);              \
    in5 = __msa_min_s_w(max_m, in5);              \
  }

// Convert 8 pixels of YUV 420 to RGB.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
  {                                                                            \
    v8i16 vec0_m, vec1_m;                                                      \
    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m;                              \
    v4i32 reg5_m, reg6_m, reg7_m;                                              \
    v16i8 zero_m = {0};                                                        \
                                                                               \
    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y);                    \
    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv);                 \
    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m);                \
    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m);                \
    reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m);                \
    reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m);                \
    reg0_m *= yg;                                                              \
    reg1_m *= yg;                                                              \
    reg2_m *= ubvr;                                                            \
    reg3_m *= ubvr;                                                            \
    reg0_m = __msa_srai_w(reg0_m, 16);                                         \
    reg1_m = __msa_srai_w(reg1_m, 16);                                         \
    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg);                       \
    reg5_m = __msa_ilvev_w(reg2_m, reg2_m);                                    \
    reg6_m = __msa_ilvev_w(reg3_m, reg3_m);                                    \
    reg7_m = __msa_ilvr_w(reg4_m, reg4_m);                                     \
    reg2_m = __msa_ilvod_w(reg2_m, reg2_m);                                    \
    reg3_m = __msa_ilvod_w(reg3_m, reg3_m);                                    \
    reg4_m = __msa_ilvl_w(reg4_m, reg4_m);                                     \
    reg5_m = reg0_m - reg5_m;                                                  \
    reg6_m = reg1_m - reg6_m;                                                  \
    reg2_m = reg0_m - reg2_m;                                                  \
    reg3_m = reg1_m - reg3_m;                                                  \
    reg7_m = reg0_m - reg7_m;                                                  \
    reg4_m = reg1_m - reg4_m;                                                  \
    reg5_m += bb;                                                              \
    reg6_m += bb;                                                              \
    reg7_m += bg;                                                              \
    reg4_m += bg;                                                              \
    reg2_m += br;                                                              \
    reg3_m += br;                                                              \
    reg5_m = __msa_srai_w(reg5_m, 6);                                          \
    reg6_m = __msa_srai_w(reg6_m, 6);                                          \
    reg7_m = __msa_srai_w(reg7_m, 6);                                          \
    reg4_m = __msa_srai_w(reg4_m, 6);                                          \
    reg2_m = __msa_srai_w(reg2_m, 6);                                          \
    reg3_m = __msa_srai_w(reg3_m, 6);                                          \
    CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m);               \
    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m);                       \
    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m);                       \
    out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m);                       \
  }

// Pack and Store 8 ARGB values.
#define STOREARGB(in0, in1, in2, in3, pdst_argb)           \
  {                                                        \
    v8i16 vec0_m, vec1_m;                                  \
    v16u8 dst0_m, dst1_m;                                  \
    vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
    vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
    dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m);          \
    dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m);          \
    ST_UB2(dst0_m, dst1_m, pdst_argb, 16);                 \
  }

// Takes ARGB input and calculates Y.
#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
                y_out)                                                     \
  {                                                                        \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
    v8u16 reg0_m, reg1_m;                                                  \
                                                                           \
    vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0);             \
    vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2);             \
    vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0);             \
    vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2);             \
    reg0_m = __msa_dotp_u_h(vec0_m, const0);                               \
    reg1_m = __msa_dotp_u_h(vec1_m, const0);                               \
    reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1);                      \
    reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1);                      \
    reg0_m += const2;                                                      \
    reg1_m += const2;                                                      \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift);                    \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift);                    \
    y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);            \
  }

// Loads current and next row of ARGB input and averages it to calculate U and V
#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3)               \
  {                                                                       \
    v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
    v16u8 vec8_m, vec9_m;                                                 \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
    v8u16 reg8_m, reg9_m;                                                 \
                                                                          \
    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 0);                         \
    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 16);                        \
    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 32);                        \
    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 48);                        \
    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 0);                         \
    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 16);                        \
    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 32);                        \
    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 48);                        \
    vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
    vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
    reg0_m = __msa_hadd_u_h(vec0_m, vec0_m);                              \
    reg1_m = __msa_hadd_u_h(vec1_m, vec1_m);                              \
    reg2_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
    reg3_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
    reg4_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
    reg5_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
    reg6_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
    reg7_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
    argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
    argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 64);                        \
    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 80);                        \
    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 96);                        \
    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 112);                       \
    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 64);                        \
    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 80);                        \
    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 96);                        \
    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 112);                       \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
    vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
    vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
    reg0_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
    reg1_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
    reg2_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
    reg3_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
    reg4_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
    reg5_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
    reg6_m = __msa_hadd_u_h(vec8_m, vec8_m);                              \
    reg7_m = __msa_hadd_u_h(vec9_m, vec9_m);                              \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
    argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
    argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
  }

// Takes ARGB input and calculates U and V.
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
                 shf0, shf1, shf2, shf3, v_out, u_out)                       \
  {                                                                          \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m;                                    \
                                                                             \
    vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0);          \
    vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2);          \
    vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0);          \
    vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2);          \
    vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0);          \
    vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2);          \
    vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0);          \
    vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2);          \
    reg0_m = __msa_dotp_u_h(vec0_m, const1);                                 \
    reg1_m = __msa_dotp_u_h(vec1_m, const1);                                 \
    reg2_m = __msa_dotp_u_h(vec4_m, const1);                                 \
    reg3_m = __msa_dotp_u_h(vec5_m, const1);                                 \
    reg0_m += const3;                                                        \
    reg1_m += const3;                                                        \
    reg2_m += const3;                                                        \
    reg3_m += const3;                                                        \
    reg0_m -= __msa_dotp_u_h(vec2_m, const0);                                \
    reg1_m -= __msa_dotp_u_h(vec3_m, const0);                                \
    reg2_m -= __msa_dotp_u_h(vec6_m, const2);                                \
    reg3_m -= __msa_dotp_u_h(vec7_m, const2);                                \
    v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m);              \
    u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m);              \
  }

// Load I444 pixel data
#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
  {                                                           \
    uint64 y_m, u_m, v_m;                                     \
    v2i64 zero_m = {0};                                       \
    y_m = LD(psrc_y);                                         \
    u_m = LD(psrc_u);                                         \
    v_m = LD(psrc_v);                                         \
    out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m);     \
    out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m);     \
    out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m);     \
  }

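// Reverses a row of bytes: loads 64 bytes per iteration from the end of the
// source row, byte-reverses each vector and stores them in reversed order.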
void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
  int x;
  v16u8 src0, src1, src2, src3;
  v16u8 dst0, dst1, dst2, dst3;
  v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  src += width - 64;

  for (x = 0; x < width; x += 64) {
    LD_UB4(src, 16, src3, src2, src1, src0);
    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
    dst += 64;
    src -= 64;
  }
}

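// Reverses a row of ARGB pixels, swapping whole 4-byte pixels rather than
// bytes; 16 pixels (64 bytes) per iteration.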
void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
  int x;
  v16u8 src0, src1, src2, src3;
  v16u8 dst0, dst1, dst2, dst3;
  v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
  src += width * 4 - 64;

  for (x = 0; x < width; x += 16) {
    LD_UB4(src, 16, src3, src2, src1, src0);
    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
    dst += 64;
    src -= 64;
  }
}

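// Interleaves I422 Y, U and V planes into packed YUY2 (Y U Y V byte order);
// 32 pixels per iteration.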
void I422ToYUY2Row_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_yuy2,
                       int width) {
  int x;
  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
  v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;

  for (x = 0; x < width; x += 32) {
    src_u0 = LD_UB(src_u);
    src_v0 = LD_UB(src_v);
    LD_UB2(src_y, 16, src_y0, src_y1);
    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
    ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
    ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
    ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
    src_u += 16;
    src_v += 16;
    src_y += 32;
    dst_yuy2 += 64;
  }
}

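// Interleaves I422 Y, U and V planes into packed UYVY (U Y V Y byte order);
// 32 pixels per iteration.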
void I422ToUYVYRow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_uyvy,
                       int width) {
  int x;
  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
  v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;

  for (x = 0; x < width; x += 32) {
    src_u0 = LD_UB(src_u);
    src_v0 = LD_UB(src_v);
    LD_UB2(src_y, 16, src_y0, src_y1);
    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
    ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
    ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
    ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
    src_u += 16;
    src_v += 16;
    src_y += 32;
    dst_uyvy += 64;
  }
}

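// Converts 8 pixels of I422 to ARGB per iteration using the coefficients in
// yuvconstants; alpha is set to 255.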
void I422ToARGBRow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    rgb_buf += 32;
  }
}

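// Same conversion as I422ToARGBRow_MSA, but stores the alpha byte first
// (RGBA byte order); 8 pixels per iteration.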
void I422ToRGBARow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(alpha, vec0, vec1, vec2, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    rgb_buf += 32;
  }
}

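// Converts 8 pixels of I422 to ARGB per iteration, taking per-pixel alpha
// from the src_a plane instead of a constant.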
void I422AlphaToARGBRow_MSA(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            const uint8* src_a,
                            uint8* rgb_buf,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  int x;
  int64 data_a;
  v16u8 src0, src1, src2, src3;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v4i32 zero = {0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    data_a = LD(src_a);
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
    STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    src_a += 8;
    rgb_buf += 32;
  }
}

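// Converts 16 pixels of I422 to packed 24-bit RGB (no alpha) per iteration,
// writing 48 output bytes.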
void I422ToRGB24Row_MSA(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* rgb_buf,
                        const struct YuvConstants* yuvconstants,
                        int32 width) {
  int x;
  int64 data_u, data_v;
  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 reg0, reg1, reg2, reg3;
  v2i64 zero = {0};
  v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
  v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
  v16i8 shuffler2 = {26, 6,  7,  27, 8,  9,  28, 10,
                     11, 29, 12, 13, 30, 14, 15, 31};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
    data_u = LD(src_u);
    data_v = LD(src_v);
    src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
    src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec3, vec4, vec5);
    reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
    reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
    reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
    ST_UB2(dst0, dst1, rgb_buf, 16);
    ST_UB(dst2, (rgb_buf + 32));
    src_y += 16;
    src_u += 8;
    src_v += 8;
    rgb_buf += 48;
  }
}

// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
void I422ToRGB565Row_MSA(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec2, vec1);
    vec0 = __msa_srai_h(vec0, 3);
    vec1 = __msa_srai_h(vec1, 3);
    vec2 = __msa_srai_h(vec2, 2);
    vec1 = __msa_slli_h(vec1, 11);
    vec2 = __msa_slli_h(vec2, 5);
    vec0 |= vec1;
    dst0 = (v16u8)(vec2 | vec0);
    ST_UB(dst0, dst_rgb565);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_rgb565 += 16;
  }
}

// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
void I422ToARGB4444Row_MSA(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_argb4444,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v8u16 reg0, reg1, reg2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    reg0 = (v8u16)__msa_srai_h(vec0, 4);
    reg1 = (v8u16)__msa_srai_h(vec1, 4);
    reg2 = (v8u16)__msa_srai_h(vec2, 4);
    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
    reg1 |= const_0xF000;
    reg0 |= reg2;
    dst0 = (v16u8)(reg1 | reg0);
    ST_UB(dst0, dst_argb4444);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb4444 += 16;
  }
}

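// Converts 8 pixels of I422 to ARGB1555 per iteration; the alpha bit is
// always set (OR with 0x8000).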
void I422ToARGB1555Row_MSA(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_argb1555,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v8u16 reg0, reg1, reg2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    reg0 = (v8u16)__msa_srai_h(vec0, 3);
    reg1 = (v8u16)__msa_srai_h(vec1, 3);
    reg2 = (v8u16)__msa_srai_h(vec2, 3);
    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
    reg1 |= const_0x8000;
    reg0 |= reg2;
    dst0 = (v16u8)(reg1 | reg0);
    ST_UB(dst0, dst_argb1555);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb1555 += 16;
  }
}

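// Extracts the Y plane from packed YUY2 (even bytes); 32 pixels per iteration.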
void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_y, 16);
    src_yuy2 += 64;
    dst_y += 32;
  }
}

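// Extracts U and V from two rows of YUY2, averaging the two rows; 32 pixels
// wide per iteration.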
void YUY2ToUVRow_MSA(const uint8* src_yuy2,
                     int src_stride_yuy2,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2;
  int x;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
    LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    vec0 = __msa_aver_u_b(src0, src2);
    vec1 = __msa_aver_u_b(src1, src3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_yuy2 += 64;
    src_yuy2_next += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

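// Extracts U and V from a single row of YUY2 (no vertical averaging);
// 32 pixels per iteration.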
void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_yuy2 += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

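// Extracts the Y plane from packed UYVY (odd bytes); 32 pixels per iteration.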
void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_y, 16);
    src_uyvy += 64;
    dst_y += 32;
  }
}

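// Extracts U and V from two rows of UYVY, averaging the two rows; 32 pixels
// wide per iteration.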
void UYVYToUVRow_MSA(const uint8* src_uyvy,
                     int src_stride_uyvy,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy;
  int x;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
    LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec0 = __msa_aver_u_b(src0, src2);
    vec1 = __msa_aver_u_b(src1, src3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_uyvy += 64;
    src_uyvy_next += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

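// Extracts U and V from a single row of UYVY; 32 pixels per iteration.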
void UYVYToUV422Row_MSA(const uint8* src_uyvy,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_uyvy += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

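// Computes Y from ARGB using fixed-point weights 0x19/0x81/0x42 for B/G/R
// and a +0x1080 rounding bias, then >> 8; 16 pixels per iteration.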
void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  v16i8 zero = {0};
  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
    reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
    reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
    reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
    reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
    reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
    reg0 *= const_0x19;
    reg1 *= const_0x19;
    reg2 *= const_0x81;
    reg3 *= const_0x81;
    reg4 *= const_0x42;
    reg5 *= const_0x42;
    reg0 += reg2;
    reg1 += reg3;
    reg0 += reg4;
    reg1 += reg5;
    reg0 += const_0x1080;
    reg1 += const_0x1080;
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}

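// Computes 2x2-subsampled U and V from two rows of ARGB (box average of the
// current and next row); 32 source pixels per iteration.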
void ARGBToUVRow_MSA(const uint8* src_argb0,
                     int src_stride_argb,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  int x;
  const uint8* src_argb0_next = src_argb0 + src_stride_argb;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
  v16u8 dst0, dst1;
  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
    reg0 = __msa_hadd_u_h(vec8, vec8);
    reg1 = __msa_hadd_u_h(vec9, vec9);
    reg2 = __msa_hadd_u_h(vec4, vec4);
    reg3 = __msa_hadd_u_h(vec5, vec5);
    reg4 = __msa_hadd_u_h(vec0, vec0);
    reg5 = __msa_hadd_u_h(vec1, vec1);
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
    reg0 += __msa_hadd_u_h(vec8, vec8);
    reg1 += __msa_hadd_u_h(vec9, vec9);
    reg2 += __msa_hadd_u_h(vec4, vec4);
    reg3 += __msa_hadd_u_h(vec5, vec5);
    reg4 += __msa_hadd_u_h(vec0, vec0);
    reg5 += __msa_hadd_u_h(vec1, vec1);
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
    reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
    reg6 = reg0 * const_0x70;
    reg7 = reg1 * const_0x70;
    reg8 = reg2 * const_0x4A;
    reg9 = reg3 * const_0x4A;
    reg6 += const_0x8080;
    reg7 += const_0x8080;
    reg8 += reg4 * const_0x26;
    reg9 += reg5 * const_0x26;
    reg0 *= const_0x12;
    reg1 *= const_0x12;
    reg2 *= const_0x5E;
    reg3 *= const_0x5E;
    reg4 *= const_0x70;
    reg5 *= const_0x70;
    reg2 += reg0;
    reg3 += reg1;
    reg4 += const_0x8080;
    reg5 += const_0x8080;
    reg6 -= reg8;
    reg7 -= reg9;
    reg4 -= reg2;
    reg5 -= reg3;
    reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
    reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
    dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_argb0 += 128;
    src_argb0_next += 128;
    dst_u += 16;
    dst_v += 16;
  }
}

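// Drops the alpha byte from ARGB to produce 24-bit RGB24; 16 pixels
// (48 output bytes) per iteration.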
void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
  v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
  v16i8 shuffler1 = {5,  6,  8,  9,  10, 12, 13, 14,
                     16, 17, 18, 20, 21, 22, 24, 25};
  v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
                     21, 22, 24, 25, 26, 28, 29, 30};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_rgb, 16);
    ST_UB(dst2, (dst_rgb + 32));
    src_argb += 64;
    dst_rgb += 48;
  }
}

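// Converts ARGB to RAW (R, G, B byte order), dropping alpha and swapping the
// B and R channels; 16 pixels per iteration.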
void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
  v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
  v16i8 shuffler1 = {5,  4,  10, 9,  8,  14, 13, 12,
                     18, 17, 16, 22, 21, 20, 26, 25};
  v16i8 shuffler2 = {8,  14, 13, 12, 18, 17, 16, 22,
                     21, 20, 26, 25, 24, 30, 29, 28};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_rgb, 16);
    ST_UB(dst2, (dst_rgb + 32));
    src_argb += 64;
    dst_rgb += 48;
  }
}

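// Packs ARGB into RGB565 (5:6:5); 8 pixels per iteration.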
void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, dst0;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
    vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
    vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
    vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
    vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
    vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
    vec0 = __msa_binsli_b(vec0, vec1, 2);
    vec1 = __msa_binsli_b(vec2, vec3, 4);
    vec4 = __msa_binsli_b(vec4, vec5, 2);
    vec5 = __msa_binsli_b(vec6, vec7, 4);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
    dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

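// Packs ARGB into ARGB1555 (1:5:5:5); 8 pixels per iteration.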
void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, dst0;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
    vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
    vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
    vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
    vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
    vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
    vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
    vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
    vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
    vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
    vec0 = __msa_binsli_b(vec0, vec1, 2);
    vec5 = __msa_binsli_b(vec5, vec6, 2);
    vec1 = __msa_binsli_b(vec2, vec3, 5);
    vec6 = __msa_binsli_b(vec7, vec8, 5);
    vec1 = __msa_binsli_b(vec1, vec4, 0);
    vec6 = __msa_binsli_b(vec6, vec9, 0);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
    dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

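// Packs ARGB into ARGB4444 by keeping the upper 4 bits of each channel;
// 8 pixels per iteration.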
void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1;
  v16u8 vec0, vec1;
  v16u8 dst0;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
    vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
    src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
    src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
    vec0 = __msa_binsli_b(vec0, src0, 3);
    vec1 = __msa_binsli_b(vec1, src1, 3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

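// Computes full-resolution (4:4:4) U and V from ARGB; 16 pixels per
// iteration, no subsampling.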
   1080 void ARGBToUV444Row_MSA(const uint8* src_argb,
   1081                         uint8* dst_u,
   1082                         uint8* dst_v,
   1083                         int32 width) {
   1084   int32 x;
   1085   v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
   1086   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
   1087   v8u16 vec8, vec9, vec10, vec11;
   1088   v8u16 const_112 = (v8u16)__msa_ldi_h(112);
   1089   v8u16 const_74 = (v8u16)__msa_ldi_h(74);
   1090   v8u16 const_38 = (v8u16)__msa_ldi_h(38);
   1091   v8u16 const_94 = (v8u16)__msa_ldi_h(94);
   1092   v8u16 const_18 = (v8u16)__msa_ldi_h(18);
   1093   v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
   1094   v16i8 zero = {0};
   1095 
   1096   for (x = width; x > 0; x -= 16) {
   1097     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
   1098     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
   1099     src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
   1100     src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
   1101     reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
   1102     reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
   1103     reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
   1104     reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
   1105     src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
   1106     src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
   1107     src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
   1108     vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
   1109     vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
   1110     vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
   1111     vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
   1112     vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
   1113     vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
   1114     vec10 = vec0 * const_18;
   1115     vec11 = vec1 * const_18;
   1116     vec8 = vec2 * const_94;
   1117     vec9 = vec3 * const_94;
   1118     vec6 = vec4 * const_112;
   1119     vec7 = vec5 * const_112;
   1120     vec0 *= const_112;
   1121     vec1 *= const_112;
   1122     vec2 *= const_74;
   1123     vec3 *= const_74;
   1124     vec4 *= const_38;
   1125     vec5 *= const_38;
   1126     vec8 += vec10;
   1127     vec9 += vec11;
   1128     vec6 += const_32896;
   1129     vec7 += const_32896;
   1130     vec0 += const_32896;
   1131     vec1 += const_32896;
   1132     vec2 += vec4;
   1133     vec3 += vec5;
   1134     vec0 -= vec2;
   1135     vec1 -= vec3;
   1136     vec6 -= vec8;
   1137     vec7 -= vec9;
   1138     vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
   1139     vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
   1140     vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
   1141     vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
   1142     dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   1143     dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
   1144     ST_UB(dst0, dst_u);
   1145     ST_UB(dst1, dst_v);
   1146     src_argb += 64;
   1147     dst_u += 16;
   1148     dst_v += 16;
   1149   }
   1150 }
   1151 
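         // Multiply two ARGB rows channel by channel, scaling the product back to
         // 8 bits; 4 pixels per loop.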
   1152 void ARGBMultiplyRow_MSA(const uint8* src_argb0,
   1153                          const uint8* src_argb1,
   1154                          uint8* dst_argb,
   1155                          int width) {
   1156   int x;
   1157   v16u8 src0, src1, dst0;
   1158   v8u16 vec0, vec1, vec2, vec3;
   1159   v4u32 reg0, reg1, reg2, reg3;
   1160   v8i16 zero = {0};
   1161 
   1162   for (x = 0; x < width; x += 4) {
   1163     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
   1164     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
   1165     vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
   1166     vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
   1167     vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
   1168     vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
   1169     reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
   1170     reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
   1171     reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
   1172     reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
   1173     reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
   1174     reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
   1175     reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
   1176     reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
   1177     reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
   1178     reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
   1179     reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
   1180     reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
   1181     vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
   1182     vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
   1183     dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   1184     ST_UB(dst0, dst_argb);
   1185     src_argb0 += 16;
   1186     src_argb1 += 16;
   1187     dst_argb += 16;
   1188   }
   1189 }
   1190 
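         // Add two ARGB rows with unsigned saturation, 8 pixels per loop.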
   1191 void ARGBAddRow_MSA(const uint8* src_argb0,
   1192                     const uint8* src_argb1,
   1193                     uint8* dst_argb,
   1194                     int width) {
   1195   int x;
   1196   v16u8 src0, src1, src2, src3, dst0, dst1;
   1197 
   1198   for (x = 0; x < width; x += 8) {
   1199     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
   1200     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
   1201     src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
   1202     src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
   1203     dst0 = __msa_adds_u_b(src0, src2);
   1204     dst1 = __msa_adds_u_b(src1, src3);
   1205     ST_UB2(dst0, dst1, dst_argb, 16);
   1206     src_argb0 += 32;
   1207     src_argb1 += 32;
   1208     dst_argb += 32;
   1209   }
   1210 }
   1211 
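         // Subtract src_argb1 from src_argb0 with unsigned saturation, 8 pixels per loop.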
   1212 void ARGBSubtractRow_MSA(const uint8* src_argb0,
   1213                          const uint8* src_argb1,
   1214                          uint8* dst_argb,
   1215                          int width) {
   1216   int x;
   1217   v16u8 src0, src1, src2, src3, dst0, dst1;
   1218 
   1219   for (x = 0; x < width; x += 8) {
   1220     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
   1221     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
   1222     src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
   1223     src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
   1224     dst0 = __msa_subs_u_b(src0, src2);
   1225     dst1 = __msa_subs_u_b(src1, src3);
   1226     ST_UB2(dst0, dst1, dst_argb, 16);
   1227     src_argb0 += 32;
   1228     src_argb1 += 32;
   1229     dst_argb += 32;
   1230   }
   1231 }
   1232 
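         // Premultiply B, G and R by alpha; the alpha channel is left unchanged.
         // 8 pixels per loop.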
   1233 void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
   1234   int x;
   1235   v16u8 src0, src1, dst0, dst1;
   1236   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
   1237   v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
   1238   v8i16 zero = {0};
   1239   v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
   1240 
   1241   for (x = 0; x < width; x += 8) {
   1242     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
   1243     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
   1244     vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
   1245     vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
   1246     vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
   1247     vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
   1248     vec4 = (v8u16)__msa_fill_h(vec0[3]);
   1249     vec5 = (v8u16)__msa_fill_h(vec0[7]);
   1250     vec6 = (v8u16)__msa_fill_h(vec1[3]);
   1251     vec7 = (v8u16)__msa_fill_h(vec1[7]);
   1252     vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
   1253     vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
   1254     vec6 = (v8u16)__msa_fill_h(vec2[3]);
   1255     vec7 = (v8u16)__msa_fill_h(vec2[7]);
   1256     vec8 = (v8u16)__msa_fill_h(vec3[3]);
   1257     vec9 = (v8u16)__msa_fill_h(vec3[7]);
   1258     vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
   1259     vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
   1260     reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
   1261     reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
   1262     reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
   1263     reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
   1264     reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
   1265     reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
   1266     reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
   1267     reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
   1268     reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
   1269     reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
   1270     reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
   1271     reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
   1272     reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
   1273     reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
   1274     reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
   1275     reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
   1276     reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
   1277     reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
   1278     reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
   1279     reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
   1280     reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
   1281     reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
   1282     reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
   1283     reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
   1284     vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
   1285     vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
   1286     vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
   1287     vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
   1288     dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   1289     dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
   1290     dst0 = __msa_bmnz_v(dst0, src0, mask);
   1291     dst1 = __msa_bmnz_v(dst1, src1, mask);
   1292     ST_UB2(dst0, dst1, dst_argb, 16);
   1293     src_argb += 32;
   1294     dst_argb += 32;
   1295   }
   1296 }
   1297 
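         // Convert ARGB to RGB565 with dithering: the 4 bytes of dither4 are added
         // to B, G and R per column before truncating to 5:6:5. 8 pixels per loop.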
   1298 void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
   1299                                uint8* dst_rgb,
   1300                                uint32 dither4,
   1301                                int width) {
   1302   int x;
   1303   v16u8 src0, src1, dst0, vec0, vec1;
   1304   v8i16 vec_d0;
   1305   v8i16 reg0, reg1, reg2;
   1306   v16i8 zero = {0};
   1307   v8i16 max = __msa_ldi_h(0xFF);
   1308 
    1309   vec_d0 = (v8i16)__msa_fill_w(dither4);              // replicate the 4 dither bytes
    1310   vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);  // zero-extend them to halfword lanes
   1311 
   1312   for (x = 0; x < width; x += 8) {
   1313     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
   1314     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
   1315     vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
   1316     vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
   1317     reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
   1318     reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
   1319     reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
   1320     reg0 += vec_d0;
   1321     reg1 += vec_d0;
   1322     reg2 += vec_d0;
   1323     reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
   1324     reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
   1325     reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
   1326     reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
   1327     reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
   1328     reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
   1329     reg0 = __msa_srai_h(reg0, 3);
   1330     reg2 = __msa_srai_h(reg2, 3);
   1331     reg1 = __msa_srai_h(reg1, 2);
   1332     reg2 = __msa_slli_h(reg2, 11);
   1333     reg1 = __msa_slli_h(reg1, 5);
   1334     reg0 |= reg1;
   1335     dst0 = (v16u8)(reg0 | reg2);
   1336     ST_UB(dst0, dst_rgb);
   1337     src_argb += 32;
   1338     dst_rgb += 16;
   1339   }
   1340 }
   1341 
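         // Reorder the bytes of each ARGB pixel according to the 4-entry shuffler
         // table; 8 pixels per loop.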
   1342 void ARGBShuffleRow_MSA(const uint8* src_argb,
   1343                         uint8* dst_argb,
   1344                         const uint8* shuffler,
   1345                         int width) {
   1346   int x;
   1347   v16u8 src0, src1, dst0, dst1;
   1348   v16i8 vec0;
   1349   v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
   1350   int32 val = LW((int32*)shuffler);
   1351 
    1352   vec0 = (v16i8)__msa_fill_w(val);  // replicate the 4 shuffle indices
    1353   shuffler_vec += vec0;             // per-pixel byte indices for all 4 pixels of a vector
   1354 
   1355   for (x = 0; x < width; x += 8) {
   1356     src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
   1357     src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
   1358     dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
   1359     dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
   1360     ST_UB2(dst0, dst1, dst_argb, 16);
   1361     src_argb += 32;
   1362     dst_argb += 32;
   1363   }
   1364 }
   1365 
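         // Scale each ARGB channel by the corresponding byte of 'value',
         // 4 pixels per loop.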
   1366 void ARGBShadeRow_MSA(const uint8* src_argb,
   1367                       uint8* dst_argb,
   1368                       int width,
   1369                       uint32 value) {
   1370   int x;
   1371   v16u8 src0, dst0;
   1372   v8u16 vec0, vec1;
   1373   v4u32 reg0, reg1, reg2, reg3, rgba_scale;
   1374   v8i16 zero = {0};
   1375 
    1376   rgba_scale[0] = value;  // 4 packed per-channel scale bytes
   1377   rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
   1378   rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);
   1379 
   1380   for (x = 0; x < width; x += 4) {
   1381     src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
   1382     vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
   1383     vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
   1384     reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
   1385     reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
   1386     reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
   1387     reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
   1388     reg0 *= rgba_scale;
   1389     reg1 *= rgba_scale;
   1390     reg2 *= rgba_scale;
   1391     reg3 *= rgba_scale;
   1392     reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
   1393     reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
   1394     reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
   1395     reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
   1396     vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
   1397     vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
   1398     dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   1399     ST_UB(dst0, dst_argb);
   1400     src_argb += 16;
   1401     dst_argb += 16;
   1402   }
   1403 }
   1404 
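         // Convert ARGB to gray (B = G = R = luma, alpha preserved), 8 pixels per loop.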
   1405 void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
   1406   int x;
   1407   v16u8 src0, src1, vec0, vec1, dst0, dst1;
   1408   v8u16 reg0;
   1409   v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
   1410   v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
   1411 
   1412   for (x = 0; x < width; x += 8) {
   1413     src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
   1414     src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
   1415     vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
   1416     vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
   1417     reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
   1418     reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
   1419     reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
   1420     vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
   1421     vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
   1422     dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
   1423     dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
   1424     ST_UB2(dst0, dst1, dst_argb, 16);
   1425     src_argb += 32;
   1426     dst_argb += 32;
   1427   }
   1428 }
   1429 
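         // Apply a sepia tone filter to ARGB in place (alpha preserved), 8 pixels per loop.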
   1430 void ARGBSepiaRow_MSA(uint8* dst_argb, int width) {
   1431   int x;
   1432   v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
   1433   v8u16 reg0, reg1, reg2;
   1434   v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
   1435   v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
   1436   v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
   1437   v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
   1438   v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
   1439   v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
   1440   v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);
   1441 
   1442   for (x = 0; x < width; x += 8) {
   1443     src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
   1444     src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
   1445     vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
   1446     vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
   1447     vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
   1448     reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
   1449     reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
   1450     reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
   1451     reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
   1452     reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
   1453     reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
   1454     reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
   1455     reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
   1456     reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
   1457     reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
   1458     reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
   1459     vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
   1460     vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
   1461     vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
   1462     vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
   1463     vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
   1464     dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
   1465     dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
   1466     ST_UB2(dst0, dst1, dst_argb, 16);
   1467     dst_argb += 32;
   1468   }
   1469 }
   1470 
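         // Expand packed ARGB4444 to ARGB8888 by replicating each 4-bit channel,
         // 16 pixels per loop.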
   1471 void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
   1472                            uint8* dst_argb,
   1473                            int width) {
   1474   int x;
   1475   v16u8 src0, src1;
   1476   v8u16 vec0, vec1, vec2, vec3;
   1477   v16u8 dst0, dst1, dst2, dst3;
   1478 
   1479   for (x = 0; x < width; x += 16) {
   1480     src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
   1481     src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
   1482     vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
   1483     vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
   1484     vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
   1485     vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
   1486     vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
   1487     vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
   1488     vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
   1489     vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
   1490     dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
   1491     dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
   1492     dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
   1493     dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
   1494     ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
   1495     src_argb4444 += 32;
   1496     dst_argb += 64;
   1497   }
   1498 }
   1499 
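         // Expand packed ARGB1555 to ARGB8888 (5-bit channels widened to 8 bits,
         // the alpha bit to 0 or 255), 16 pixels per loop.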
   1500 void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
   1501                            uint8* dst_argb,
   1502                            int width) {
   1503   int x;
   1504   v8u16 src0, src1;
   1505   v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
   1506   v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
   1507   v16u8 dst0, dst1, dst2, dst3;
   1508   v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
   1509 
   1510   for (x = 0; x < width; x += 16) {
   1511     src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0);
   1512     src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16);
   1513     vec0 = src0 & const_0x1F;
   1514     vec1 = src1 & const_0x1F;
   1515     src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
   1516     src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
   1517     vec2 = src0 & const_0x1F;
   1518     vec3 = src1 & const_0x1F;
   1519     src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
   1520     src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
   1521     vec4 = src0 & const_0x1F;
   1522     vec5 = src1 & const_0x1F;
   1523     src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
   1524     src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
   1525     reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   1526     reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
   1527     reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
   1528     reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
   1529     reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
   1530     reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
   1531     reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
   1532     reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
   1533     reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
   1534     reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
    1535     reg3 = -reg3;  // extend the 1-bit alpha to 0 or 255
   1536     reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
   1537     reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
   1538     reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
   1539     reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
   1540     dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
   1541     dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
   1542     dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
   1543     dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
   1544     ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
   1545     src_argb1555 += 32;
   1546     dst_argb += 64;
   1547   }
   1548 }
   1549 
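         // Expand packed RGB565 to ARGB8888 with opaque alpha, 16 pixels per loop.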
   1550 void RGB565ToARGBRow_MSA(const uint8* src_rgb565, uint8* dst_argb, int width) {
   1551   int x;
   1552   v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
   1553   v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
   1554   v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
   1555   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   1556   v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
   1557   v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
   1558   v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
   1559 
   1560   for (x = 0; x < width; x += 16) {
   1561     src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0);
   1562     src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16);
   1563     vec0 = src0 & const_0x1F;
   1564     vec1 = src0 & const_0x7E0;
   1565     vec2 = src0 & const_0xF800;
   1566     vec3 = src1 & const_0x1F;
   1567     vec4 = src1 & const_0x7E0;
   1568     vec5 = src1 & const_0xF800;
   1569     reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
   1570     reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
   1571     reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
   1572     reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
   1573     reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
   1574     reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
   1575     reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
   1576     reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
   1577     reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
   1578     reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
   1579     reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
   1580     reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
   1581     res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
   1582     res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
   1583     res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
   1584     res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
   1585     dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
   1586     dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
   1587     dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
   1588     dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
   1589     ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
   1590     src_rgb565 += 32;
   1591     dst_argb += 64;
   1592   }
   1593 }
   1594 
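         // Convert packed RGB24 (3 bytes per pixel) to ARGB with opaque alpha,
         // 16 pixels per loop.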
   1595 void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) {
   1596   int x;
   1597   v16u8 src0, src1, src2;
   1598   v16u8 vec0, vec1, vec2;
   1599   v16u8 dst0, dst1, dst2, dst3;
   1600   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   1601   v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
   1602 
   1603   for (x = 0; x < width; x += 16) {
   1604     src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0);
   1605     src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16);
   1606     src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32);
   1607     vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
   1608     vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
   1609     vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
   1610     dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
   1611     dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
   1612     dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
   1613     dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
   1614     ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
   1615     src_rgb24 += 48;
   1616     dst_argb += 64;
   1617   }
   1618 }
   1619 
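         // Convert packed RAW (RGB24 with R and B swapped) to ARGB with opaque
         // alpha, 16 pixels per loop.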
   1620 void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) {
   1621   int x;
   1622   v16u8 src0, src1, src2;
   1623   v16u8 vec0, vec1, vec2;
   1624   v16u8 dst0, dst1, dst2, dst3;
   1625   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   1626   v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};
   1627 
   1628   for (x = 0; x < width; x += 16) {
   1629     src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
   1630     src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
   1631     src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
   1632     vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
   1633     vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
   1634     vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
   1635     dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
   1636     dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
   1637     dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
   1638     dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
   1639     ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
   1640     src_raw += 48;
   1641     dst_argb += 64;
   1642   }
   1643 }
   1644 
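         // Convert ARGB1555 to luma: Y = (66*R + 129*G + 25*B + 0x1080) >> 8,
         // 16 pixels per loop.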
   1645 void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) {
   1646   int x;
   1647   v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
   1648   v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
   1649   v16u8 dst0;
   1650   v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
   1651   v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
   1652   v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
   1653   v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
   1654   v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
   1655 
   1656   for (x = 0; x < width; x += 16) {
   1657     src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0);
   1658     src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16);
   1659     vec0 = src0 & const_0x1F;
   1660     vec1 = src1 & const_0x1F;
   1661     src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
   1662     src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
   1663     vec2 = src0 & const_0x1F;
   1664     vec3 = src1 & const_0x1F;
   1665     src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
   1666     src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
   1667     vec4 = src0 & const_0x1F;
   1668     vec5 = src1 & const_0x1F;
   1669     reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
   1670     reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
   1671     reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
   1672     reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
   1673     reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
   1674     reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
   1675     reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
   1676     reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
   1677     reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
   1678     reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
   1679     reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
   1680     reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
   1681     reg0 *= const_0x19;
   1682     reg1 *= const_0x19;
   1683     reg2 *= const_0x81;
   1684     reg3 *= const_0x81;
   1685     reg4 *= const_0x42;
   1686     reg5 *= const_0x42;
   1687     reg0 += reg2;
   1688     reg1 += reg3;
   1689     reg0 += reg4;
   1690     reg1 += reg5;
   1691     reg0 += const_0x1080;
   1692     reg1 += const_0x1080;
   1693     reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
   1694     reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
   1695     dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
   1696     ST_UB(dst0, dst_y);
   1697     src_argb1555 += 32;
   1698     dst_y += 16;
   1699   }
   1700 }
   1701 
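         // Convert RGB565 to luma: Y = (66*R + 129*G + 25*B + 0x1080) >> 8,
         // 16 pixels per loop.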
   1702 void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) {
   1703   int x;
   1704   v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
   1705   v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
   1706   v4u32 res0, res1, res2, res3;
   1707   v16u8 dst0;
   1708   v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
   1709   v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
   1710   v8i16 const_0x1080 = __msa_fill_h(0x1080);
   1711   v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
   1712   v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
   1713   v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
   1714 
   1715   for (x = 0; x < width; x += 16) {
   1716     src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0);
   1717     src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16);
   1718     vec0 = src0 & const_0x1F;
   1719     vec1 = src0 & const_0x7E0;
   1720     vec2 = src0 & const_0xF800;
   1721     vec3 = src1 & const_0x1F;
   1722     vec4 = src1 & const_0x7E0;
   1723     vec5 = src1 & const_0xF800;
   1724     reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
   1725     reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
   1726     reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
   1727     reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
   1728     reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
   1729     reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
   1730     reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
   1731     reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
   1732     reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
   1733     reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
   1734     reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
   1735     reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
   1736     vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
   1737     vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
   1738     vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
   1739     vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
   1740     vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
   1741     vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
   1742     vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
   1743     vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
   1744     res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
   1745     res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
   1746     res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
   1747     res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
   1748     res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
   1749     res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
   1750     res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
   1751     res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
   1752     res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
   1753     res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
   1754     res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
   1755     res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
   1756     vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
   1757     vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
   1758     dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   1759     ST_UB(dst0, dst_y);
   1760     src_rgb565 += 32;
   1761     dst_y += 16;
   1762   }
   1763 }
   1764 
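         // Convert RGB24 to luma: Y = (66*R + 129*G + 25*B + 0x1080) >> 8,
         // 16 pixels per loop.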
   1765 void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
   1766   int x;
   1767   v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
   1768   v8u16 vec0, vec1, vec2, vec3;
   1769   v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
   1770   v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
   1771   v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
   1772   v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
   1773   v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
   1774                  18, 19, 20, 21, 21, 22, 23, 24};
   1775   v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
   1776   v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
   1777   v16i8 zero = {0};
   1778 
   1779   for (x = 0; x < width; x += 16) {
   1780     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
   1781     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
   1782     src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
   1783     reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
   1784     reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
   1785     reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
   1786     reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
   1787     vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
   1788     vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
   1789     vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
   1790     vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
   1791     vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
   1792     vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
   1793     vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
   1794     vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
   1795     vec0 += const_0x1080;
   1796     vec1 += const_0x1080;
   1797     vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
   1798     vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
   1799     dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   1800     ST_UB(dst0, dst_y);
   1801     src_argb0 += 48;
   1802     dst_y += 16;
   1803   }
   1804 }
   1805 
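         // Convert RAW to luma with the same coefficients as RGB24ToYRow_MSA,
         // 16 pixels per loop.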
   1806 void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
   1807   int x;
   1808   v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
   1809   v8u16 vec0, vec1, vec2, vec3;
   1810   v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
   1811   v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
   1812   v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
   1813   v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
   1814   v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
   1815                  18, 19, 20, 21, 21, 22, 23, 24};
   1816   v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
   1817   v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
   1818   v16i8 zero = {0};
   1819 
   1820   for (x = 0; x < width; x += 16) {
   1821     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
   1822     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
   1823     src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
   1824     reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
   1825     reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
   1826     reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
   1827     reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
   1828     vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
   1829     vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
   1830     vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
   1831     vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
   1832     vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
   1833     vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
   1834     vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
   1835     vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
   1836     vec0 += const_0x1080;
   1837     vec1 += const_0x1080;
   1838     vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
   1839     vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
   1840     dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   1841     ST_UB(dst0, dst_y);
   1842     src_argb0 += 48;
   1843     dst_y += 16;
   1844   }
   1845 }
   1846 
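         // Compute 2x2-subsampled U and V from two rows of ARGB1555 using the same
         // U/V coefficients as ARGBToUV444Row_MSA on the 2x2 averages; 16 pixels
         // per loop (8 U and 8 V bytes written per iteration).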
   1847 void ARGB1555ToUVRow_MSA(const uint8* src_argb1555,
   1848                          int src_stride_argb1555,
   1849                          uint8* dst_u,
   1850                          uint8* dst_v,
   1851                          int width) {
   1852   int x;
   1853   const uint16* s = (const uint16*)src_argb1555;
   1854   const uint16* t = (const uint16*)(src_argb1555 + src_stride_argb1555);
   1855   int64_t res0, res1;
   1856   v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
   1857   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
   1858   v16u8 dst0;
   1859   v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
   1860   v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
   1861   v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
   1862   v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
   1863   v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
   1864   v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
   1865   v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
   1866 
   1867   for (x = 0; x < width; x += 16) {
   1868     src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
   1869     src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
   1870     src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
   1871     src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
   1872     vec0 = src0 & const_0x1F;
   1873     vec1 = src1 & const_0x1F;
   1874     vec0 += src2 & const_0x1F;
   1875     vec1 += src3 & const_0x1F;
   1876     vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   1877     src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
   1878     src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
   1879     src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
   1880     src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
   1881     vec2 = src0 & const_0x1F;
   1882     vec3 = src1 & const_0x1F;
   1883     vec2 += src2 & const_0x1F;
   1884     vec3 += src3 & const_0x1F;
   1885     vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
   1886     src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
   1887     src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
   1888     src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
   1889     src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
   1890     vec4 = src0 & const_0x1F;
   1891     vec5 = src1 & const_0x1F;
   1892     vec4 += src2 & const_0x1F;
   1893     vec5 += src3 & const_0x1F;
   1894     vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
   1895     vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
   1896     vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
   1897     vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
   1898     vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
   1899     vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
   1900     vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
   1901     vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
   1902     vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
   1903     vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
   1904     reg0 = vec6 * const_0x70;
   1905     reg1 = vec0 * const_0x4A;
   1906     reg2 = vec2 * const_0x70;
   1907     reg3 = vec0 * const_0x5E;
   1908     reg0 += const_0x8080;
   1909     reg1 += vec2 * const_0x26;
   1910     reg2 += const_0x8080;
   1911     reg3 += vec6 * const_0x12;
   1912     reg0 -= reg1;
   1913     reg2 -= reg3;
   1914     reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
   1915     reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
   1916     dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
   1917     res0 = __msa_copy_u_d((v2i64)dst0, 0);
   1918     res1 = __msa_copy_u_d((v2i64)dst0, 1);
   1919     SD(res0, dst_u);
   1920     SD(res1, dst_v);
   1921     s += 16;
   1922     t += 16;
   1923     dst_u += 8;
   1924     dst_v += 8;
   1925   }
   1926 }
   1927 
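         // Compute 2x2-subsampled U and V from two rows of RGB565, 16 pixels per loop.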
   1928 void RGB565ToUVRow_MSA(const uint8* src_rgb565,
   1929                        int src_stride_rgb565,
   1930                        uint8* dst_u,
   1931                        uint8* dst_v,
   1932                        int width) {
   1933   int x;
   1934   const uint16* s = (const uint16*)src_rgb565;
   1935   const uint16* t = (const uint16*)(src_rgb565 + src_stride_rgb565);
   1936   int64_t res0, res1;
   1937   v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
   1938   v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
   1939   v16u8 dst0;
   1940   v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
   1941   v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
   1942   v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
   1943   v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
   1944   v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
   1945   v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080);
   1946   v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
   1947   v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
   1948 
   1949   for (x = 0; x < width; x += 16) {
   1950     src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
   1951     src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
   1952     src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
   1953     src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
   1954     vec0 = src0 & const_0x1F;
   1955     vec1 = src1 & const_0x1F;
   1956     vec0 += src2 & const_0x1F;
   1957     vec1 += src3 & const_0x1F;
   1958     vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   1959     src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
   1960     src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
   1961     src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
   1962     src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
   1963     vec2 = src0 & const_0x3F;
   1964     vec3 = src1 & const_0x3F;
   1965     vec2 += src2 & const_0x3F;
   1966     vec3 += src3 & const_0x3F;
   1967     vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
   1968     src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
   1969     src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
   1970     src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
   1971     src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
   1972     vec4 = src0 & const_0x1F;
   1973     vec5 = src1 & const_0x1F;
   1974     vec4 += src2 & const_0x1F;
   1975     vec5 += src3 & const_0x1F;
   1976     vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
   1977     vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
   1978     vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
   1979     vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
   1980     vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
   1981     vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
   1982     vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
   1983     vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
   1984     reg0 = vec3 * const_0x70;
   1985     reg1 = vec1 * const_0x4A;
   1986     reg2 = vec4 * const_0x70;
   1987     reg3 = vec1 * const_0x5E;
   1988     reg0 += const_32896;
   1989     reg1 += vec4 * const_0x26;
   1990     reg2 += const_32896;
   1991     reg3 += vec3 * const_0x12;
   1992     reg0 -= reg1;
   1993     reg2 -= reg3;
   1994     reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
   1995     reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
   1996     dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
   1997     res0 = __msa_copy_u_d((v2i64)dst0, 0);
   1998     res1 = __msa_copy_u_d((v2i64)dst0, 1);
   1999     SD(res0, dst_u);
   2000     SD(res1, dst_v);
   2001     s += 16;
   2002     t += 16;
   2003     dst_u += 8;
   2004     dst_v += 8;
   2005   }
   2006 }
   2007 
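         // Compute 2x2-subsampled U and V from two rows of RGB24, 16 pixels per loop.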
   2008 void RGB24ToUVRow_MSA(const uint8* src_rgb0,
   2009                       int src_stride_rgb,
   2010                       uint8* dst_u,
   2011                       uint8* dst_v,
   2012                       int width) {
   2013   int x;
   2014   const uint8* s = src_rgb0;
   2015   const uint8* t = src_rgb0 + src_stride_rgb;
   2016   int64 res0, res1;
   2017   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   2018   v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
   2019   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
   2020   v8i16 reg0, reg1, reg2, reg3;
   2021   v16u8 dst0;
   2022   v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
   2023   v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
   2024   v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
   2025   v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
   2026   v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
   2027   v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
   2028   v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
   2029   v16i8 zero = {0};
   2030 
   2031   for (x = 0; x < width; x += 16) {
   2032     inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
   2033     inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
   2034     inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
   2035     inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
   2036     inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
   2037     inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
   2038     src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
   2039     src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
   2040     src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
   2041     src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
   2042     src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
   2043     src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
   2044     src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
   2045     src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
   2046     src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
   2047     src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
   2048     src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
   2049     src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
   2050     src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
   2051     src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
   2052     vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
   2053     vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
   2054     vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
   2055     vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
   2056     vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
   2057     vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
   2058     vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
   2059     vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
   2060     vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
   2061     vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
   2062     vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
   2063     vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
   2064     vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
   2065     vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
   2066     vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
   2067     vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
   2068     reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
   2069     reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
   2070     reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
   2071     reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
   2072     reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
   2073     reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
   2074     reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
   2075     reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
   2076     reg0 = __msa_srai_h((v8i16)reg0, 2);
   2077     reg1 = __msa_srai_h((v8i16)reg1, 2);
   2078     reg2 = __msa_srai_h((v8i16)reg2, 2);
   2079     reg3 = __msa_srai_h((v8i16)reg3, 2);
   2080     vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
   2081     vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
   2082     vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
   2083     vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
   2084     vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
   2085     vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
   2086     vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
   2087     vec3 = vec0 * const_0x70;
   2088     vec4 = vec1 * const_0x4A;
   2089     vec5 = vec2 * const_0x26;
   2090     vec2 *= const_0x70;
   2091     vec1 *= const_0x5E;
   2092     vec0 *= const_0x12;
   2093     reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
   2094     reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
   2095     reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
   2096     reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
   2097     reg0 += reg1;
   2098     reg2 += reg3;
   2099     reg0 = __msa_srai_h(reg0, 8);
   2100     reg2 = __msa_srai_h(reg2, 8);
   2101     dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
   2102     res0 = __msa_copy_u_d((v2i64)dst0, 0);
   2103     res1 = __msa_copy_u_d((v2i64)dst0, 1);
   2104     SD(res0, dst_u);
   2105     SD(res1, dst_v);
   2106     t += 48;
   2107     s += 48;
   2108     dst_u += 8;
   2109     dst_v += 8;
   2110   }
   2111 }
   2112 
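         // Compute 2x2-subsampled U and V from two rows of RAW, 16 pixels per loop.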
   2113 void RAWToUVRow_MSA(const uint8* src_rgb0,
   2114                     int src_stride_rgb,
   2115                     uint8* dst_u,
   2116                     uint8* dst_v,
   2117                     int width) {
   2118   int x;
   2119   const uint8* s = src_rgb0;
   2120   const uint8* t = src_rgb0 + src_stride_rgb;
   2121   int64 res0, res1;
   2122   v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
   2123   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   2124   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
   2125   v8i16 reg0, reg1, reg2, reg3;
   2126   v16u8 dst0;
   2127   v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
   2128   v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
   2129   v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
   2130   v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
   2131   v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
   2132   v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
   2133   v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
   2134   v16i8 zero = {0};
   2135 
   2136   for (x = 0; x < width; x += 16) {
   2137     inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
   2138     inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
   2139     inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
   2140     inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0);
   2141     inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16);
   2142     inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32);
   2143     src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
   2144     src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
   2145     src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
   2146     src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
   2147     src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
   2148     src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
   2149     src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
   2150     src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
   2151     src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
   2152     src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
   2153     src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
   2154     src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
   2155     src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
   2156     src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
   2157     vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
   2158     vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
   2159     vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
   2160     vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
   2161     vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
   2162     vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
   2163     vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
   2164     vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
   2165     vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
   2166     vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
   2167     vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
   2168     vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
   2169     vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
   2170     vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
   2171     vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
   2172     vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
   2173     reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
   2174     reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
   2175     reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
   2176     reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
   2177     reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
   2178     reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
   2179     reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
   2180     reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
   2181     reg0 = __msa_srai_h(reg0, 2);
   2182     reg1 = __msa_srai_h(reg1, 2);
   2183     reg2 = __msa_srai_h(reg2, 2);
   2184     reg3 = __msa_srai_h(reg3, 2);
   2185     vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
   2186     vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
   2187     vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
   2188     vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
   2189     vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
   2190     vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
   2191     vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
   2192     vec3 = vec0 * const_0x70;
   2193     vec4 = vec1 * const_0x4A;
   2194     vec5 = vec2 * const_0x26;
   2195     vec2 *= const_0x70;
   2196     vec1 *= const_0x5E;
   2197     vec0 *= const_0x12;
   2198     reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
   2199     reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
   2200     reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
   2201     reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
   2202     reg0 += reg1;
   2203     reg2 += reg3;
   2204     reg0 = __msa_srai_h(reg0, 8);
   2205     reg2 = __msa_srai_h(reg2, 8);
   2206     dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
   2207     res0 = __msa_copy_u_d((v2i64)dst0, 0);
   2208     res1 = __msa_copy_u_d((v2i64)dst0, 1);
   2209     SD(res0, dst_u);
   2210     SD(res1, dst_v);
   2211     t += 48;
   2212     s += 48;
   2213     dst_u += 8;
   2214     dst_v += 8;
   2215   }
   2216 }
   2217 
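         // Convert NV12 (Y plane plus interleaved UV plane) to ARGB, 8 pixels per loop.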
   2218 void NV12ToARGBRow_MSA(const uint8* src_y,
   2219                        const uint8* src_uv,
   2220                        uint8* rgb_buf,
   2221                        const struct YuvConstants* yuvconstants,
   2222                        int width) {
   2223   int x;
   2224   uint64 val0, val1;
   2225   v16u8 src0, src1, res0, res1, dst0, dst1;
   2226   v8i16 vec0, vec1, vec2;
   2227   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
   2228   v4i32 vec_ubvr, vec_ugvg;
   2229   v16u8 zero = {0};
   2230   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   2231 
   2232   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
   2233                  vec_br, vec_yg);
   2234   vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
   2235   vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   2236 
   2237   for (x = 0; x < width; x += 8) {
   2238     val0 = LD(src_y);
   2239     val1 = LD(src_uv);
   2240     src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
   2241     src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
   2242     YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
   2243              vec0, vec1, vec2);
   2244     res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
   2245     res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
   2246     dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
   2247     dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
   2248     ST_UB2(dst0, dst1, rgb_buf, 16);
   2249     src_y += 8;
   2250     src_uv += 8;
   2251     rgb_buf += 32;
   2252   }
   2253 }
   2254 
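         // Convert NV12 to packed RGB565, 8 pixels per loop.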
   2255 void NV12ToRGB565Row_MSA(const uint8* src_y,
   2256                          const uint8* src_uv,
   2257                          uint8* rgb_buf,
   2258                          const struct YuvConstants* yuvconstants,
   2259                          int width) {
   2260   int x;
   2261   uint64 val0, val1;
   2262   v16u8 src0, src1, dst0;
   2263   v8i16 vec0, vec1, vec2;
   2264   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
   2265   v4i32 vec_ubvr, vec_ugvg;
   2266   v16u8 zero = {0};
   2267 
   2268   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
   2269                  vec_br, vec_yg);
   2270   vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
   2271   vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   2272 
   2273   for (x = 0; x < width; x += 8) {
   2274     val0 = LD(src_y);
   2275     val1 = LD(src_uv);
   2276     src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
   2277     src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
   2278     YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
   2279              vec0, vec1, vec2);
   2280     vec0 = vec0 >> 3;
   2281     vec1 = (vec1 >> 2) << 5;
   2282     vec2 = (vec2 >> 3) << 11;
   2283     dst0 = (v16u8)(vec0 | vec1 | vec2);
   2284     ST_UB(dst0, rgb_buf);
   2285     src_y += 8;
   2286     src_uv += 8;
   2287     rgb_buf += 16;
   2288   }
   2289 }
   2290 
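// Convert 8 NV21 pixels to ARGB per iteration; the shuffler swaps the
// interleaved VU bytes to UV before the shared YUV -> RGB path.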
   2291 void NV21ToARGBRow_MSA(const uint8* src_y,
   2292                        const uint8* src_vu,
   2293                        uint8* rgb_buf,
   2294                        const struct YuvConstants* yuvconstants,
   2295                        int width) {
   2296   int x;
   2297   uint64 val0, val1;
   2298   v16u8 src0, src1, res0, res1, dst0, dst1;
   2299   v8i16 vec0, vec1, vec2;
   2300   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
   2301   v4i32 vec_ubvr, vec_ugvg;
   2302   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   2303   v16u8 zero = {0};
   2304   v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
   2305 
   2306   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
   2307                  vec_br, vec_yg);
   2308   vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
   2309   vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   2310 
   2311   for (x = 0; x < width; x += 8) {
   2312     val0 = LD(src_y);
   2313     val1 = LD(src_vu);
   2314     src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
   2315     src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
   2316     src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
   2317     YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
   2318              vec0, vec1, vec2);
   2319     res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
   2320     res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
   2321     dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
   2322     dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
   2323     ST_UB2(dst0, dst1, rgb_buf, 16);
   2324     src_y += 8;
   2325     src_vu += 8;
   2326     rgb_buf += 32;
   2327   }
   2328 }
   2329 
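// Saturating add of the Sobel X and Sobel Y planes, replicated into the B, G
// and R channels with opaque alpha; 16 pixels per iteration.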
   2330 void SobelRow_MSA(const uint8* src_sobelx,
   2331                   const uint8* src_sobely,
   2332                   uint8* dst_argb,
   2333                   int width) {
   2334   int x;
   2335   v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
   2336   v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
   2337   v16i8 const_0x4 = __msa_ldi_b(0x4);
   2338   v16i8 mask1 = mask0 + const_0x4;
   2339   v16i8 mask2 = mask1 + const_0x4;
   2340   v16i8 mask3 = mask2 + const_0x4;
   2341   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   2342 
   2343   for (x = 0; x < width; x += 16) {
   2344     src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
   2345     src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
   2346     vec0 = __msa_adds_u_b(src0, src1);
   2347     dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
   2348     dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
   2349     dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
   2350     dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
   2351     ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
   2352     src_sobelx += 16;
   2353     src_sobely += 16;
   2354     dst_argb += 64;
   2355   }
   2356 }
   2357 
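// Saturating add of the Sobel X and Sobel Y planes into a single gray plane;
// 32 pixels per iteration.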
   2358 void SobelToPlaneRow_MSA(const uint8* src_sobelx,
   2359                          const uint8* src_sobely,
   2360                          uint8* dst_y,
   2361                          int width) {
   2362   int x;
   2363   v16u8 src0, src1, src2, src3, dst0, dst1;
   2364 
   2365   for (x = 0; x < width; x += 32) {
   2366     src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
   2367     src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16);
   2368     src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
   2369     src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16);
   2370     dst0 = __msa_adds_u_b(src0, src2);
   2371     dst1 = __msa_adds_u_b(src1, src3);
   2372     ST_UB2(dst0, dst1, dst_y, 16);
   2373     src_sobelx += 32;
   2374     src_sobely += 32;
   2375     dst_y += 32;
   2376   }
   2377 }
   2378 
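// Mix Sobel X, Sobel Y and their saturated sum into ARGB
// (B = Sobel Y, G = Sobel X + Y, R = Sobel X, A = 255); 16 pixels per iteration.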
   2379 void SobelXYRow_MSA(const uint8* src_sobelx,
   2380                     const uint8* src_sobely,
   2381                     uint8* dst_argb,
   2382                     int width) {
   2383   int x;
   2384   v16u8 src0, src1, vec0, vec1, vec2;
   2385   v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
   2386   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   2387 
   2388   for (x = 0; x < width; x += 16) {
   2389     src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0);
   2390     src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0);
   2391     vec0 = __msa_adds_u_b(src0, src1);
   2392     vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
   2393     vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
   2394     reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
   2395     reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
   2396     dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
   2397     dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
   2398     dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
   2399     dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
   2400     ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
   2401     src_sobelx += 16;
   2402     src_sobely += 16;
   2403     dst_argb += 64;
   2404   }
   2405 }
   2406 
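// Convert 16 ARGB pixels to full-range (JPEG style) luma per iteration.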
   2407 void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
   2408   int x;
   2409   v16u8 src0, src1, src2, src3, dst0;
   2410   v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
   2411   v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
   2412   v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
   2413 
   2414   for (x = 0; x < width; x += 16) {
   2415     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
   2416     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
   2417     src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
   2418     src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
   2419     ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
   2420             dst0);
   2421     ST_UB(dst0, dst_y);
   2422     src_argb0 += 64;
   2423     dst_y += 16;
   2424   }
   2425 }
   2426 
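// Convert 16 BGRA pixels to Y per iteration.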
   2427 void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
   2428   int x;
   2429   v16u8 src0, src1, src2, src3, dst0;
   2430   v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
   2431   v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
   2432   v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
   2433 
   2434   for (x = 0; x < width; x += 16) {
   2435     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
   2436     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
   2437     src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
   2438     src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
   2439     ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
   2440             dst0);
   2441     ST_UB(dst0, dst_y);
   2442     src_argb0 += 64;
   2443     dst_y += 16;
   2444   }
   2445 }
   2446 
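// Convert 16 ABGR pixels to Y per iteration.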
   2447 void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
   2448   int x;
   2449   v16u8 src0, src1, src2, src3, dst0;
   2450   v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
   2451   v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
   2452   v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
   2453 
   2454   for (x = 0; x < width; x += 16) {
   2455     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
   2456     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
   2457     src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
   2458     src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
   2459     ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
   2460             dst0);
   2461     ST_UB(dst0, dst_y);
   2462     src_argb0 += 64;
   2463     dst_y += 16;
   2464   }
   2465 }
   2466 
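// Convert 16 RGBA pixels to Y per iteration.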
   2467 void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
   2468   int x;
   2469   v16u8 src0, src1, src2, src3, dst0;
   2470   v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
   2471   v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
   2472   v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
   2473 
   2474   for (x = 0; x < width; x += 16) {
   2475     src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
   2476     src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
   2477     src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32);
   2478     src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48);
   2479     ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
   2480             dst0);
   2481     ST_UB(dst0, dst_y);
   2482     src_argb0 += 64;
   2483     dst_y += 16;
   2484   }
   2485 }
   2486 
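// Convert 32 ARGB pixels per iteration to 16 full-range (JPEG style) U and V
// values, averaging 2x2 blocks across two rows.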
   2487 void ARGBToUVJRow_MSA(const uint8* src_rgb0,
   2488                       int src_stride_rgb,
   2489                       uint8* dst_u,
   2490                       uint8* dst_v,
   2491                       int width) {
   2492   int x;
   2493   const uint8* s = src_rgb0;
   2494   const uint8* t = src_rgb0 + src_stride_rgb;
   2495   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
   2496   v16u8 vec0, vec1, vec2, vec3;
   2497   v16u8 dst0, dst1;
   2498   v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
   2499   v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
   2500                      18, 19, 22, 23, 26, 27, 30, 31};
   2501   v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
   2502   v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
   2503   v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
   2504   v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
   2505   v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
   2506   v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
   2507 
   2508   for (x = 0; x < width; x += 32) {
   2509     src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
   2510     src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
   2511     src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
   2512     src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
   2513     src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
   2514     src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
   2515     src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
   2516     src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
   2517     src0 = __msa_aver_u_b(src0, src4);
   2518     src1 = __msa_aver_u_b(src1, src5);
   2519     src2 = __msa_aver_u_b(src2, src6);
   2520     src3 = __msa_aver_u_b(src3, src7);
   2521     src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
   2522     src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
   2523     src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
   2524     src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
   2525     vec0 = __msa_aver_u_b(src4, src6);
   2526     vec1 = __msa_aver_u_b(src5, src7);
   2527     src0 = (v16u8)__msa_ld_b((v16i8*)s, 64);
   2528     src1 = (v16u8)__msa_ld_b((v16i8*)s, 80);
   2529     src2 = (v16u8)__msa_ld_b((v16i8*)s, 96);
   2530     src3 = (v16u8)__msa_ld_b((v16i8*)s, 112);
   2531     src4 = (v16u8)__msa_ld_b((v16i8*)t, 64);
   2532     src5 = (v16u8)__msa_ld_b((v16i8*)t, 80);
   2533     src6 = (v16u8)__msa_ld_b((v16i8*)t, 96);
   2534     src7 = (v16u8)__msa_ld_b((v16i8*)t, 112);
   2535     src0 = __msa_aver_u_b(src0, src4);
   2536     src1 = __msa_aver_u_b(src1, src5);
   2537     src2 = __msa_aver_u_b(src2, src6);
   2538     src3 = __msa_aver_u_b(src3, src7);
   2539     src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
   2540     src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
   2541     src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
   2542     src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
   2543     vec2 = __msa_aver_u_b(src4, src6);
   2544     vec3 = __msa_aver_u_b(src5, src7);
   2545     ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
   2546              const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
   2547              dst1);
   2548     ST_UB(dst0, dst_v);
   2549     ST_UB(dst1, dst_u);
   2550     s += 128;
   2551     t += 128;
   2552     dst_v += 16;
   2553     dst_u += 16;
   2554   }
   2555 }
   2556 
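// Convert 32 BGRA pixels per iteration to 16 U and 16 V values, averaging
// 2x2 blocks across two rows via READ_ARGB.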
   2557 void BGRAToUVRow_MSA(const uint8* src_rgb0,
   2558                      int src_stride_rgb,
   2559                      uint8* dst_u,
   2560                      uint8* dst_v,
   2561                      int width) {
   2562   int x;
   2563   const uint8* s = src_rgb0;
   2564   const uint8* t = src_rgb0 + src_stride_rgb;
   2565   v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
   2566   v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
   2567   v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
   2568                      18, 19, 22, 23, 26, 27, 30, 31};
   2569   v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
   2570   v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
   2571   v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
   2572   v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
   2573   v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
   2574   v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
   2575 
   2576   for (x = 0; x < width; x += 32) {
   2577     READ_ARGB(s, t, vec0, vec1, vec2, vec3);
   2578     ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
   2579              const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
   2580              dst1);
   2581     ST_UB(dst0, dst_v);
   2582     ST_UB(dst1, dst_u);
   2583     s += 128;
   2584     t += 128;
   2585     dst_v += 16;
   2586     dst_u += 16;
   2587   }
   2588 }
   2589 
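// Convert 32 ABGR pixels per iteration to 16 U and 16 V values
// (2x2 subsampled across two rows).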
   2590 void ABGRToUVRow_MSA(const uint8* src_rgb0,
   2591                      int src_stride_rgb,
   2592                      uint8* dst_u,
   2593                      uint8* dst_v,
   2594                      int width) {
   2595   int x;
   2596   const uint8* s = src_rgb0;
   2597   const uint8* t = src_rgb0 + src_stride_rgb;
   2598   v16u8 src0, src1, src2, src3;
   2599   v16u8 dst0, dst1;
   2600   v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
   2601   v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
   2602                      18, 19, 22, 23, 26, 27, 30, 31};
   2603   v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
   2604   v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
   2605   v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
   2606   v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
   2607   v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
   2608   v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
   2609 
   2610   for (x = 0; x < width; x += 32) {
   2611     READ_ARGB(s, t, src0, src1, src2, src3);
   2612     ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
   2613              const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
   2614              dst1);
   2615     ST_UB(dst0, dst_u);
   2616     ST_UB(dst1, dst_v);
   2617     s += 128;
   2618     t += 128;
   2619     dst_u += 16;
   2620     dst_v += 16;
   2621   }
   2622 }
   2623 
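// Convert 32 RGBA pixels per iteration to 16 U and 16 V values
// (2x2 subsampled across two rows). Note: the fill values of const_0x125E and
// const_0x264A are crossed relative to their names; this appears intentional,
// compensating for the R/B byte swap versus BGRAToUVRow_MSA.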
   2624 void RGBAToUVRow_MSA(const uint8* src_rgb0,
   2625                      int src_stride_rgb,
   2626                      uint8* dst_u,
   2627                      uint8* dst_v,
   2628                      int width) {
   2629   int x;
   2630   const uint8* s = src_rgb0;
   2631   const uint8* t = src_rgb0 + src_stride_rgb;
   2632   v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
   2633   v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
   2634   v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
   2635                      18, 19, 22, 23, 26, 27, 30, 31};
   2636   v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
   2637   v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
   2638   v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A);
   2639   v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
   2640   v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E);
   2641   v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
   2642 
   2643   for (x = 0; x < width; x += 32) {
   2644     READ_ARGB(s, t, vec0, vec1, vec2, vec3);
   2645     ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
   2646              const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
   2647              dst1);
   2648     ST_UB(dst0, dst_u);
   2649     ST_UB(dst1, dst_v);
   2650     s += 128;
   2651     t += 128;
   2652     dst_u += 16;
   2653     dst_v += 16;
   2654   }
   2655 }
   2656 
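// Convert 8 I444 (non-subsampled YUV) pixels to ARGB per iteration using
// widened 32-bit arithmetic.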
   2657 void I444ToARGBRow_MSA(const uint8* src_y,
   2658                        const uint8* src_u,
   2659                        const uint8* src_v,
   2660                        uint8* rgb_buf,
   2661                        const struct YuvConstants* yuvconstants,
   2662                        int width) {
   2663   int x;
   2664   v16u8 src0, src1, src2, dst0, dst1;
   2665   v8u16 vec0, vec1, vec2;
   2666   v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
   2667   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
   2668   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   2669   v8i16 zero = {0};
   2670 
   2671   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
   2672                  vec_br, vec_yg);
   2673 
   2674   for (x = 0; x < width; x += 8) {
   2675     READI444(src_y, src_u, src_v, src0, src1, src2);
   2676     vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
   2677     reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
   2678     reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
   2679     reg0 *= vec_yg;
   2680     reg1 *= vec_yg;
   2681     reg0 = __msa_srai_w(reg0, 16);
   2682     reg1 = __msa_srai_w(reg1, 16);
   2683     reg4 = reg0 + vec_br;
   2684     reg5 = reg1 + vec_br;
   2685     reg2 = reg0 + vec_bg;
   2686     reg3 = reg1 + vec_bg;
   2687     reg0 += vec_bb;
   2688     reg1 += vec_bb;
   2689     vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
   2690     vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
   2691     reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
   2692     reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
   2693     reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
   2694     reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
   2695     reg0 -= reg6 * vec_ub;
   2696     reg1 -= reg7 * vec_ub;
   2697     reg2 -= reg6 * vec_ug;
   2698     reg3 -= reg7 * vec_ug;
   2699     reg4 -= reg8 * vec_vr;
   2700     reg5 -= reg9 * vec_vr;
   2701     reg2 -= reg8 * vec_vg;
   2702     reg3 -= reg9 * vec_vg;
   2703     reg0 = __msa_srai_w(reg0, 6);
   2704     reg1 = __msa_srai_w(reg1, 6);
   2705     reg2 = __msa_srai_w(reg2, 6);
   2706     reg3 = __msa_srai_w(reg3, 6);
   2707     reg4 = __msa_srai_w(reg4, 6);
   2708     reg5 = __msa_srai_w(reg5, 6);
   2709     CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
   2710     vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
   2711     vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
   2712     vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
   2713     vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
   2714     vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
   2715     dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
   2716     dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
   2717     ST_UB2(dst0, dst1, rgb_buf, 16);
   2718     src_y += 8;
   2719     src_u += 8;
   2720     src_v += 8;
   2721     rgb_buf += 32;
   2722   }
   2723 }
   2724 
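// Convert 16 luma-only (I400) pixels to gray ARGB per iteration; 0x4A35 and
// 0xFB78 act as the fixed Y scale and bias.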
   2725 void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) {
   2726   int x;
   2727   v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
   2728   v8i16 vec0, vec1;
   2729   v4i32 reg0, reg1, reg2, reg3;
   2730   v4i32 vec_yg = __msa_fill_w(0x4A35);
   2731   v8i16 vec_ygb = __msa_fill_h(0xFB78);
   2732   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   2733   v8i16 max = __msa_ldi_h(0xFF);
   2734   v8i16 zero = {0};
   2735 
   2736   for (x = 0; x < width; x += 16) {
   2737     src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
   2738     vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
   2739     vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
   2740     reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
   2741     reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
   2742     reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
   2743     reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
   2744     reg0 *= vec_yg;
   2745     reg1 *= vec_yg;
   2746     reg2 *= vec_yg;
   2747     reg3 *= vec_yg;
   2748     reg0 = __msa_srai_w(reg0, 16);
   2749     reg1 = __msa_srai_w(reg1, 16);
   2750     reg2 = __msa_srai_w(reg2, 16);
   2751     reg3 = __msa_srai_w(reg3, 16);
   2752     vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
   2753     vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
   2754     vec0 += vec_ygb;
   2755     vec1 += vec_ygb;
   2756     vec0 = __msa_srai_h(vec0, 6);
   2757     vec1 = __msa_srai_h(vec1, 6);
   2758     vec0 = __msa_maxi_s_h(vec0, 0);
   2759     vec1 = __msa_maxi_s_h(vec1, 0);
   2760     vec0 = __msa_min_s_h(max, vec0);
   2761     vec1 = __msa_min_s_h(max, vec1);
   2762     res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   2763     res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
   2764     res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
   2765     res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
   2766     res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
   2767     dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
   2768     dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
   2769     dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
   2770     dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
   2771     ST_UB4(dst0, dst1, dst2, dst3, rgb_buf, 16);
   2772     src_y += 16;
   2773     rgb_buf += 64;
   2774   }
   2775 }
   2776 
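// Convert 16 full-range gray (J400) pixels to ARGB per iteration by
// replicating Y into B, G and R with opaque alpha.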
   2777 void J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) {
   2778   int x;
   2779   v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
   2780   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   2781 
   2782   for (x = 0; x < width; x += 16) {
   2783     src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0);
   2784     vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
   2785     vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
   2786     vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
   2787     vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
   2788     dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
   2789     dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
   2790     dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
   2791     dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
   2792     ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
   2793     src_y += 16;
   2794     dst_argb += 64;
   2795   }
   2796 }
   2797 
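// Convert 8 YUY2 (Y0 U Y1 V) pixels to ARGB per iteration; even bytes carry Y,
// odd bytes carry the interleaved UV.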
   2798 void YUY2ToARGBRow_MSA(const uint8* src_yuy2,
   2799                        uint8* rgb_buf,
   2800                        const struct YuvConstants* yuvconstants,
   2801                        int width) {
   2802   int x;
   2803   v16u8 src0, src1, src2;
   2804   v8i16 vec0, vec1, vec2;
   2805   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
   2806   v4i32 vec_ubvr, vec_ugvg;
   2807   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   2808 
   2809   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
   2810                  vec_br, vec_yg);
   2811   vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
   2812   vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   2813 
   2814   for (x = 0; x < width; x += 8) {
   2815     src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0);
   2816     src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
   2817     src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
   2818     YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
   2819              vec0, vec1, vec2);
   2820     STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
   2821     src_yuy2 += 16;
   2822     rgb_buf += 32;
   2823   }
   2824 }
   2825 
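// Convert 8 UYVY (U Y0 V Y1) pixels to ARGB per iteration; odd bytes carry Y,
// even bytes carry the interleaved UV.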
   2826 void UYVYToARGBRow_MSA(const uint8* src_uyvy,
   2827                        uint8* rgb_buf,
   2828                        const struct YuvConstants* yuvconstants,
   2829                        int width) {
   2830   int x;
   2831   v16u8 src0, src1, src2;
   2832   v8i16 vec0, vec1, vec2;
   2833   v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
   2834   v4i32 vec_ubvr, vec_ugvg;
   2835   v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
   2836 
   2837   YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
   2838                  vec_br, vec_yg);
   2839   vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
   2840   vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
   2841 
   2842   for (x = 0; x < width; x += 8) {
   2843     src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0);
   2844     src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
   2845     src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
   2846     YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
   2847              vec0, vec1, vec2);
   2848     STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
   2849     src_uyvy += 16;
   2850     rgb_buf += 32;
   2851   }
   2852 }
   2853 
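// Blend two rows vertically: dst = (s * (256 - f) + t * f + 128) >> 8, with
// fast paths for f == 0 (copy row) and f == 128 (average rows);
// 32 bytes per iteration.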
   2854 void InterpolateRow_MSA(uint8* dst_ptr,
   2855                         const uint8* src_ptr,
   2856                         ptrdiff_t src_stride,
   2857                         int width,
   2858                         int32 source_y_fraction) {
   2859   int32 y1_fraction = source_y_fraction;
   2860   int32 y0_fraction = 256 - y1_fraction;
   2861   uint16 y_fractions;
   2862   const uint8* s = src_ptr;
   2863   const uint8* t = src_ptr + src_stride;
   2864   int x;
   2865   v16u8 src0, src1, src2, src3, dst0, dst1;
   2866   v8u16 vec0, vec1, vec2, vec3, y_frac;
   2867 
   2868   if (0 == y1_fraction) {
   2869     memcpy(dst_ptr, src_ptr, width);
   2870     return;
   2871   }
   2872 
   2873   if (128 == y1_fraction) {
   2874     for (x = 0; x < width; x += 32) {
   2875       src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
   2876       src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
   2877       src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
   2878       src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
   2879       dst0 = __msa_aver_u_b(src0, src2);
   2880       dst1 = __msa_aver_u_b(src1, src3);
   2881       ST_UB2(dst0, dst1, dst_ptr, 16);
   2882       s += 32;
   2883       t += 32;
   2884       dst_ptr += 32;
   2885     }
   2886     return;
   2887   }
   2888 
   2889   y_fractions = (uint16)(y0_fraction + (y1_fraction << 8));
   2890   y_frac = (v8u16)__msa_fill_h(y_fractions);
   2891 
   2892   for (x = 0; x < width; x += 32) {
   2893     src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
   2894     src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
   2895     src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
   2896     src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
   2897     vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
   2898     vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
   2899     vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
   2900     vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
   2901     vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
   2902     vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
   2903     vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
   2904     vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
   2905     vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
   2906     vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
   2907     vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
   2908     vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
   2909     dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
   2910     dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
   2911     ST_UB2(dst0, dst1, dst_ptr, 16);
   2912     s += 32;
   2913     t += 32;
   2914     dst_ptr += 32;
   2915   }
   2916 }
   2917 
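// Fill dst_argb with the 32-bit ARGB value v32, 4 pixels per 16-byte store.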
   2918 void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) {
   2919   int x;
   2920   v16u8 dst0 = (v16u8)__msa_fill_w(v32);
   2921 
   2922   for (x = 0; x < width; x += 4) {
   2923     ST_UB(dst0, dst_argb);
   2924     dst_argb += 16;
   2925   }
   2926 }
   2927 
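// Convert 16 RAW (3 bytes per pixel) pixels to RGB24 per iteration by
// reversing the byte order within each pixel.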
   2928 void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) {
   2929   int x;
   2930   v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
   2931   v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
   2932   v16i8 shuffler1 = {8,  7,  12, 11, 10, 15, 14, 13,
   2933                      18, 17, 16, 21, 20, 19, 24, 23};
   2934   v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
   2935                      24, 23, 28, 27, 26, 31, 30, 29};
   2936 
   2937   for (x = 0; x < width; x += 16) {
   2938     src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0);
   2939     src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16);
   2940     src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32);
   2941     src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
   2942     src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
   2943     dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
   2944     dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
   2945     dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
   2946     ST_UB2(dst0, dst1, dst_rgb24, 16);
   2947     ST_UB(dst2, (dst_rgb24 + 32));
   2948     src_raw += 48;
   2949     dst_rgb24 += 48;
   2950   }
   2951 }
   2952 
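// Interleave 16 U and 16 V bytes into 16 UV pairs per iteration.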
   2953 void MergeUVRow_MSA(const uint8* src_u,
   2954                     const uint8* src_v,
   2955                     uint8* dst_uv,
   2956                     int width) {
   2957   int x;
   2958   v16u8 src0, src1, dst0, dst1;
   2959 
   2960   for (x = 0; x < width; x += 16) {
   2961     src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0);
   2962     src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0);
   2963     dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
   2964     dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
   2965     ST_UB2(dst0, dst1, dst_uv, 16);
   2966     src_u += 16;
   2967     src_v += 16;
   2968     dst_uv += 32;
   2969   }
   2970 }
   2971 
   2972 #ifdef __cplusplus
   2973 }  // extern "C"
   2974 }  // namespace libyuv
   2975 #endif
   2976 
   2977 #endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
   2978