/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "libyuv/scale_row.h"

// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

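// Point-sample ARGB down by 2: __msa_pckod_w keeps the second (odd-indexed)
// pixel of each horizontal pair, producing 4 output pixels per iteration.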
void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
                           ptrdiff_t src_stride,
                           uint8_t* dst_argb,
                           int dst_width) {
  int x;
  v16u8 src0, src1, dst0;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    ST_UB(dst0, dst_argb);
    src_argb += 32;
    dst_argb += 16;
  }
}

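// Down by 2 with horizontal linear filtering: split pixels into even/odd
// lanes and average each pair with rounding (__msa_aver_u_b).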
void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  int x;
  v16u8 src0, src1, vec0, vec1, dst0;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
    vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);
    ST_UB(dst0, dst_argb);
    src_argb += 32;
    dst_argb += 16;
  }
}

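// 2x2 box filter: the shuffle places matching channels of adjacent pixels
// side by side so __msa_hadd_u_h can widen and sum them; the row below is
// added, then __msa_srari_h(..., 2) divides the 4-sample sum by 4 with
// rounding.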
void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              uint8_t* dst_argb,
                              int dst_width) {
  int x;
  const uint8_t* s = src_argb;
  const uint8_t* t = src_argb + src_stride;
  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
  v8u16 reg0, reg1, reg2, reg3;
  v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};

  for (x = 0; x < dst_width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0);
    vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
    vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2);
    vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3);
    reg0 = __msa_hadd_u_h(vec0, vec0);
    reg1 = __msa_hadd_u_h(vec1, vec1);
    reg2 = __msa_hadd_u_h(vec2, vec2);
    reg3 = __msa_hadd_u_h(vec3, vec3);
    reg0 += reg2;
    reg1 += reg3;
    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);
    reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    ST_UB(dst0, dst_argb);
    s += 32;
    t += 32;
    dst_argb += 16;
  }
}

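// Point-sample one ARGB pixel every src_stepx pixels, using the scalar
// 32-bit load/store macros LW/SW from macros_msa.h.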
void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              int32_t src_stepx,
                              uint8_t* dst_argb,
                              int dst_width) {
  int x;
  int32_t stepx = src_stepx * 4;
  int32_t data0, data1, data2, data3;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 4) {
    data0 = LW(src_argb);
    data1 = LW(src_argb + stepx);
    data2 = LW(src_argb + stepx * 2);
    data3 = LW(src_argb + stepx * 3);
    SW(data0, dst_argb);
    SW(data1, dst_argb + 4);
    SW(data2, dst_argb + 8);
    SW(data3, dst_argb + 12);
    src_argb += stepx * 4;
    dst_argb += 16;
  }
}

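// 2x2 box filter at every src_stepx-th pixel: 8-byte pixel pairs from two
// rows are interleaved per channel, summed with __msa_hadd_u_h, and the two
// pixels of each pair combined and rounded (divide by 4).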
void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 int src_stepx,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  int x;
  const uint8_t* nxt_argb = src_argb + src_stride;
  int32_t stepx = src_stepx * 4;
  int64_t data0, data1, data2, data3;
  v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
  v16u8 vec0, vec1, vec2, vec3;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v16u8 dst0;

  for (x = 0; x < dst_width; x += 4) {
    data0 = LD(src_argb);
    data1 = LD(src_argb + stepx);
    data2 = LD(src_argb + stepx * 2);
    data3 = LD(src_argb + stepx * 3);
    src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);
    src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);
    src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);
    src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);
    data0 = LD(nxt_argb);
    data1 = LD(nxt_argb + stepx);
    data2 = LD(nxt_argb + stepx * 2);
    data3 = LD(nxt_argb + stepx * 3);
    src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);
    src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);
    src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);
    src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);
    vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    reg0 = __msa_hadd_u_h(vec0, vec0);
    reg1 = __msa_hadd_u_h(vec1, vec1);
    reg2 = __msa_hadd_u_h(vec2, vec2);
    reg3 = __msa_hadd_u_h(vec3, vec3);
    reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);
    reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);
    reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);
    reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);
    reg4 += reg6;
    reg5 += reg7;
    reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
    reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
    ST_UB(dst0, dst_argb);
    src_argb += stepx * 4;
    nxt_argb += stepx * 4;
    dst_argb += 16;
  }
}

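// Point-sample down by 2: __msa_pckod_b keeps every odd-indexed byte,
// emitting 32 output pixels per iteration.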
void ScaleRowDown2_MSA(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst,
                       int dst_width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst, 16);
    src_ptr += 64;
    dst += 32;
  }
}

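// Down by 2 with linear filtering: average each horizontal byte pair with
// rounding.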
void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  int x;
  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    dst0 = __msa_aver_u_b(vec1, vec0);
    dst1 = __msa_aver_u_b(vec3, vec2);
    ST_UB2(dst0, dst1, dst, 16);
    src_ptr += 64;
    dst += 32;
  }
}

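// 2x2 box filter: widen and sum horizontal pairs from both rows, then
// divide by 4 with rounding via __msa_srari_h.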
void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  int x;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3;

  for (x = 0; x < dst_width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
    vec0 = __msa_hadd_u_h(src0, src0);
    vec1 = __msa_hadd_u_h(src1, src1);
    vec2 = __msa_hadd_u_h(src2, src2);
    vec3 = __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    ST_UB2(dst0, dst1, dst, 16);
    s += 64;
    t += 64;
    dst += 32;
  }
}

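// Point-sample down by 4: the even/odd packs select the byte at offset 2 of
// every 4-byte group, matching ScaleRowDown4_C.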
void ScaleRowDown4_MSA(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst,
                       int dst_width) {
  int x;
  v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst);
    src_ptr += 64;
    dst += 16;
  }
}

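// 4x4 box filter: horizontal pair sums from four rows are accumulated,
// widened to 32 bits, and the 16-sample sum divided by 16 with rounding
// (__msa_srari_w by 4).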
void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  int x;
  const uint8_t* s = src_ptr;
  const uint8_t* t0 = s + src_stride;
  const uint8_t* t1 = s + src_stride * 2;
  const uint8_t* t2 = s + src_stride * 3;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
  v8u16 vec0, vec1, vec2, vec3;
  v4u32 reg0, reg1, reg2, reg3;

  for (x = 0; x < dst_width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
    vec0 = __msa_hadd_u_h(src0, src0);
    vec1 = __msa_hadd_u_h(src1, src1);
    vec2 = __msa_hadd_u_h(src2, src2);
    vec3 = __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
    vec0 += __msa_hadd_u_h(src0, src0);
    vec1 += __msa_hadd_u_h(src1, src1);
    vec2 += __msa_hadd_u_h(src2, src2);
    vec3 += __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    reg0 = __msa_hadd_u_w(vec0, vec0);
    reg1 = __msa_hadd_u_w(vec1, vec1);
    reg2 = __msa_hadd_u_w(vec2, vec2);
    reg3 = __msa_hadd_u_w(vec3, vec3);
    reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
    reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
    reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
    reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst);
    s += 64;
    t0 += 64;
    t1 += 64;
    t2 += 64;
    dst += 16;
  }
}

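// 3/8 point-sample: the shuffle mask gathers 12 of every 32 source bytes;
// the 12 results are stored as an 8-byte chunk plus a 4-byte chunk.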
void ScaleRowDown38_MSA(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  int x, width;
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, vec0;
  v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
  (void)src_stride;

  assert(dst_width % 3 == 0);
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
    dst0 = __msa_copy_u_d((v2i64)vec0, 0);
    dst1 = __msa_copy_u_w((v4i32)vec0, 2);
    SD(dst0, dst);
    SW(dst1, dst + 8);
    src_ptr += 32;
    dst += 12;
  }
}

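// 3/8 downscale averaging 2 rows. Box sums are scaled by fixed-point
// reciprocals: 0x2AAA ~= 65536/6 for the two 3x2 boxes, 0x4000 = 65536/4
// for the trailing 2x2 box of each output triple.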
void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  int x, width;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, src2, src3, out;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
  v8i16 zero = {0};
  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
  v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);

  assert((dst_width % 3 == 0) && (dst_width > 0));
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
    vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
    vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
    vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    tmp0 = __msa_hadd_u_w(vec4, vec4);
    tmp1 = __msa_hadd_u_w(vec5, vec5);
    tmp2 = __msa_hadd_u_w(vec6, vec6);
    tmp3 = __msa_hadd_u_w(vec7, vec7);
    tmp4 = __msa_hadd_u_w(vec0, vec0);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    tmp0 = __msa_hadd_u_w(vec0, vec0);
    tmp1 = __msa_hadd_u_w(vec1, vec1);
    tmp0 *= const_0x2AAA;
    tmp1 *= const_0x2AAA;
    tmp4 *= const_0x4000;
    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
    dst0 = __msa_copy_u_d((v2i64)out, 0);
    dst1 = __msa_copy_u_w((v4i32)out, 2);
    SD(dst0, dst_ptr);
    SW(dst1, dst_ptr + 8);
    s += 32;
    t += 32;
    dst_ptr += 12;
  }
}

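// 3/8 downscale averaging 3 rows: 0x1C71 ~= 65536/9 scales the 3x3 box
// sums, 0x2AAA ~= 65536/6 the trailing 2x3 box sum of each output triple.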
void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  int x, width;
  const uint8_t* s = src_ptr;
  const uint8_t* t0 = s + src_stride;
  const uint8_t* t1 = s + src_stride * 2;
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, src2, src3, src4, src5, out;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
  v8u16 zero = {0};
  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
  v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);

  assert((dst_width % 3 == 0) && (dst_width > 0));
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
    src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
    vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
    vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
    vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
    vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
    vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
    vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
    vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
    vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
    vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
    vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    tmp0 = __msa_hadd_u_w(vec4, vec4);
    tmp1 = __msa_hadd_u_w(vec5, vec5);
    tmp2 = __msa_hadd_u_w(vec6, vec6);
    tmp3 = __msa_hadd_u_w(vec7, vec7);
    tmp4 = __msa_hadd_u_w(vec0, vec0);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    tmp0 = __msa_hadd_u_w(vec0, vec0);
    tmp1 = __msa_hadd_u_w(vec1, vec1);
    tmp0 *= const_0x1C71;
    tmp1 *= const_0x1C71;
    tmp4 *= const_0x2AAA;
    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
    dst0 = __msa_copy_u_d((v2i64)out, 0);
    dst1 = __msa_copy_u_w((v4i32)out, 2);
    SD(dst0, dst_ptr);
    SW(dst1, dst_ptr + 8);
    s += 32;
    t0 += 32;
    t1 += 32;
    dst_ptr += 12;
  }
}

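// Widen 16 source bytes to 16 bits and add them to the accumulator row.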
void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
  int x;
  v16u8 src0;
  v8u16 dst0, dst1;
  v16i8 zero = {0};

  assert(src_width > 0);

  for (x = 0; x < src_width; x += 16) {
    src0 = LD_UB(src_ptr);
    dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
    dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
    dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
    dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
    ST_UH2(dst0, dst1, dst_ptr, 8);
    src_ptr += 16;
    dst_ptr += 16;
  }
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)