/*
 * Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <string.h>

#include "libyuv/row.h"

// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#define ALPHA_VAL (-1)

// Fill YUV -> RGB conversion constants into vectors
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
  { \
    ub = __msa_fill_w(yuvconst->kUVToB[0]); \
    vr = __msa_fill_w(yuvconst->kUVToR[1]); \
    ug = __msa_fill_w(yuvconst->kUVToG[0]); \
    vg = __msa_fill_w(yuvconst->kUVToG[1]); \
    bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \
    bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \
    br = __msa_fill_w(yuvconst->kUVBiasR[0]); \
    yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
  }

// Load YUV 422 pixel data
#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
  { \
    uint64 y_m; \
    uint32 u_m, v_m; \
    v4i32 zero_m = {0}; \
    y_m = LD(psrc_y); \
    u_m = LW(psrc_u); \
    v_m = LW(psrc_v); \
    out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64)y_m); \
    out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32)u_m); \
    out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32)v_m); \
  }

// Clip input vector elements between 0 and 255
#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
  { \
    v4i32 max_m = __msa_ldi_w(0xFF); \
    \
    in0 = __msa_maxi_s_w(in0, 0); \
    in1 = __msa_maxi_s_w(in1, 0); \
    in2 = __msa_maxi_s_w(in2, 0); \
    in3 = __msa_maxi_s_w(in3, 0); \
    in4 = __msa_maxi_s_w(in4, 0); \
    in5 = __msa_maxi_s_w(in5, 0); \
    in0 = __msa_min_s_w(max_m, in0); \
    in1 = __msa_min_s_w(max_m, in1); \
    in2 = __msa_min_s_w(max_m, in2); \
    in3 = __msa_min_s_w(max_m, in3); \
    in4 = __msa_min_s_w(max_m, in4); \
    in5 = __msa_min_s_w(max_m, in5); \
  }

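// Note (illustrative comment only): per 32-bit lane, CLIP_0TO255 above is
// simply the scalar clamp
//   v = v < 0 ? 0 : (v > 255 ? 255 : v);
// applied to all six packed vectors at once.
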
// Convert 8 pixels of YUV 420 to RGB.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
  { \
    v8i16 vec0_m, vec1_m; \
    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
    v4i32 reg5_m, reg6_m, reg7_m; \
    v16i8 zero_m = {0}; \
    \
    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
    reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
    reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
    reg0_m *= yg; \
    reg1_m *= yg; \
    reg2_m *= ubvr; \
    reg3_m *= ubvr; \
    reg0_m = __msa_srai_w(reg0_m, 16); \
    reg1_m = __msa_srai_w(reg1_m, 16); \
    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
    reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
    reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
    reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
    reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
    reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
    reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
    reg5_m = reg0_m - reg5_m; \
    reg6_m = reg1_m - reg6_m; \
    reg2_m = reg0_m - reg2_m; \
    reg3_m = reg1_m - reg3_m; \
    reg7_m = reg0_m - reg7_m; \
    reg4_m = reg1_m - reg4_m; \
    reg5_m += bb; \
    reg6_m += bb; \
    reg7_m += bg; \
    reg4_m += bg; \
    reg2_m += br; \
    reg3_m += br; \
    reg5_m = __msa_srai_w(reg5_m, 6); \
    reg6_m = __msa_srai_w(reg6_m, 6); \
    reg7_m = __msa_srai_w(reg7_m, 6); \
    reg4_m = __msa_srai_w(reg4_m, 6); \
    reg2_m = __msa_srai_w(reg2_m, 6); \
    reg3_m = __msa_srai_w(reg3_m, 6); \
    CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
    out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
  }

// Pack and Store 8 ARGB values.
#define STOREARGB(in0, in1, in2, in3, pdst_argb) \
  { \
    v8i16 vec0_m, vec1_m; \
    v16u8 dst0_m, dst1_m; \
    vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
    vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
    dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \
    dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \
    ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \
  }

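// For reference, a scalar sketch of what YUVTORGB + STOREARGB compute per
// pixel (illustrative comment only; clamp8() is a hypothetical helper for
// the CLIP_0TO255 step and the constants come from struct YuvConstants):
//   y1 = ((uint32)(y * 0x0101) * yg) >> 16;
//   b = clamp8((y1 - u * ub + bb) >> 6);
//   g = clamp8((y1 - (u * ug + v * vg) + bg) >> 6);
//   r = clamp8((y1 - v * vr + br) >> 6);
//   dst bytes are stored as B, G, R, A (libyuv ARGB memory order).
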
// Takes ARGB input and calculates Y.
#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
                y_out) \
  { \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \
    v8u16 reg0_m, reg1_m; \
    \
    vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \
    vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \
    vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \
    vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \
    reg0_m = __msa_dotp_u_h(vec0_m, const0); \
    reg1_m = __msa_dotp_u_h(vec1_m, const0); \
    reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \
    reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \
    reg0_m += const2; \
    reg1_m += const2; \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \
    y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
  }

// Loads the current and next row of ARGB input and averages them to
// calculate U and V.
#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \
  { \
    v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
    v16u8 vec8_m, vec9_m; \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
    v8u16 reg8_m, reg9_m; \
    \
    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 0); \
    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 16); \
    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 32); \
    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 48); \
    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 0); \
    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 16); \
    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 32); \
    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 48); \
    vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
    vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
    vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
    vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
    reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \
    reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \
    reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \
    reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \
    reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \
    reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \
    reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \
    reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
    argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
    argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 64); \
    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 80); \
    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 96); \
    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 112); \
    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 64); \
    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 80); \
    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 96); \
    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 112); \
    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \
    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \
    vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \
    vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \
    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \
    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \
    vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \
    vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \
    reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \
    reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \
    reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \
    reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \
    reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \
    reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \
    reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \
    reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \
    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \
    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \
    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \
    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \
    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \
    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \
    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \
    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \
    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \
    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \
    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \
    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \
    argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \
    argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \
  }

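// READ_ARGB above effectively box-filters a 2x2 block from the two rows; per
// channel the scalar equivalent is roughly
//   avg = (cur[i] + cur[i + 4] + next[i] + next[i + 4]) >> 2;
// where the +4 offset is the same channel of the horizontally adjacent ARGB
// pixel (illustrative comment only).
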
// Takes ARGB input and calculates U and V.
#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
                 shf0, shf1, shf2, shf3, v_out, u_out) \
  { \
    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
    v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \
    \
    vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \
    vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \
    vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \
    vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \
    vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \
    vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \
    vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \
    vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \
    reg0_m = __msa_dotp_u_h(vec0_m, const1); \
    reg1_m = __msa_dotp_u_h(vec1_m, const1); \
    reg2_m = __msa_dotp_u_h(vec4_m, const1); \
    reg3_m = __msa_dotp_u_h(vec5_m, const1); \
    reg0_m += const3; \
    reg1_m += const3; \
    reg2_m += const3; \
    reg3_m += const3; \
    reg0_m -= __msa_dotp_u_h(vec2_m, const0); \
    reg1_m -= __msa_dotp_u_h(vec3_m, const0); \
    reg2_m -= __msa_dotp_u_h(vec6_m, const2); \
    reg3_m -= __msa_dotp_u_h(vec7_m, const2); \
    v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \
    u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \
  }

// Load I444 pixel data
#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
  { \
    uint64 y_m, u_m, v_m; \
    v2i64 zero_m = {0}; \
    y_m = LD(psrc_y); \
    u_m = LD(psrc_u); \
    v_m = LD(psrc_v); \
    out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64)y_m); \
    out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64)u_m); \
    out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64)v_m); \
  }

void MirrorRow_MSA(const uint8* src, uint8* dst, int width) {
  int x;
  v16u8 src0, src1, src2, src3;
  v16u8 dst0, dst1, dst2, dst3;
  v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  src += width - 64;

  for (x = 0; x < width; x += 64) {
    LD_UB4(src, 16, src3, src2, src1, src0);
    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
    dst += 64;
    src -= 64;
  }
}

void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
  int x;
  v16u8 src0, src1, src2, src3;
  v16u8 dst0, dst1, dst2, dst3;
  v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
  src += width * 4 - 64;

  for (x = 0; x < width; x += 16) {
    LD_UB4(src, 16, src3, src2, src1, src0);
    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
    dst += 64;
    src -= 64;
  }
}

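// The two packed-4:2:2 writers below emit, per pair of pixels, the byte
// sequences Y0 U Y1 V (YUY2) and U Y0 V Y1 (UYVY) respectively
// (illustrative comment only).
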
void I422ToYUY2Row_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_yuy2,
                       int width) {
  int x;
  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
  v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;

  for (x = 0; x < width; x += 32) {
    src_u0 = LD_UB(src_u);
    src_v0 = LD_UB(src_v);
    LD_UB2(src_y, 16, src_y0, src_y1);
    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
    ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
    ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
    ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
    src_u += 16;
    src_v += 16;
    src_y += 32;
    dst_yuy2 += 64;
  }
}

void I422ToUYVYRow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_uyvy,
                       int width) {
  int x;
  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
  v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;

  for (x = 0; x < width; x += 32) {
    src_u0 = LD_UB(src_u);
    src_v0 = LD_UB(src_v);
    LD_UB2(src_y, 16, src_y0, src_y1);
    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
    ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
    ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
    ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
    src_u += 16;
    src_v += 16;
    src_y += 32;
    dst_uyvy += 64;
  }
}

void I422ToARGBRow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(vec0, vec1, vec2, alpha, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    rgb_buf += 32;
  }
}

void I422ToRGBARow_MSA(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  int x;
  v16u8 src0, src1, src2;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    STOREARGB(alpha, vec0, vec1, vec2, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    rgb_buf += 32;
  }
}

void I422AlphaToARGBRow_MSA(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            const uint8* src_a,
                            uint8* rgb_buf,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  int x;
  int64 data_a;
  v16u8 src0, src1, src2, src3;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v4i32 zero = {0};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    data_a = LD(src_a);
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
    STOREARGB(vec0, vec1, vec2, src3, rgb_buf);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    src_a += 8;
    rgb_buf += 32;
  }
}

void I422ToRGB24Row_MSA(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* rgb_buf,
                        const struct YuvConstants* yuvconstants,
                        int32 width) {
  int x;
  int64 data_u, data_v;
  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v16u8 reg0, reg1, reg2, reg3;
  v2i64 zero = {0};
  v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
  v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
  v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10,
                     11, 29, 12, 13, 30, 14, 15, 31};

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
    data_u = LD(src_u);
    data_v = LD(src_v);
    src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
    src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec3, vec4, vec5);
    reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
    reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
    reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
    ST_UB2(dst0, dst1, rgb_buf, 16);
    ST_UB(dst2, (rgb_buf + 32));
    src_y += 16;
    src_u += 8;
    src_v += 8;
    rgb_buf += 48;
  }
}

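// The RGB565 / ARGB4444 / ARGB1555 writers below pack each pixel into 16
// bits. For RGB565 the scalar equivalent of the shifts used is roughly
//   rgb565 = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);
// where b, g, r are the 8-bit values produced by YUVTORGB (illustrative
// comment only).
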
// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
void I422ToRGB565Row_MSA(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb565,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec2, vec1);
    vec0 = __msa_srai_h(vec0, 3);
    vec1 = __msa_srai_h(vec1, 3);
    vec2 = __msa_srai_h(vec2, 2);
    vec1 = __msa_slli_h(vec1, 11);
    vec2 = __msa_slli_h(vec2, 5);
    vec0 |= vec1;
    dst0 = (v16u8)(vec2 | vec0);
    ST_UB(dst0, dst_rgb565);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_rgb565 += 16;
  }
}

// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
void I422ToARGB4444Row_MSA(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_argb4444,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v8u16 reg0, reg1, reg2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    reg0 = (v8u16)__msa_srai_h(vec0, 4);
    reg1 = (v8u16)__msa_srai_h(vec1, 4);
    reg2 = (v8u16)__msa_srai_h(vec2, 4);
    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
    reg1 |= const_0xF000;
    reg0 |= reg2;
    dst0 = (v16u8)(reg1 | reg0);
    ST_UB(dst0, dst_argb4444);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb4444 += 16;
  }
}

void I422ToARGB1555Row_MSA(const uint8* src_y,
                           const uint8* src_u,
                           const uint8* src_v,
                           uint8* dst_argb1555,
                           const struct YuvConstants* yuvconstants,
                           int width) {
  int x;
  v16u8 src0, src1, src2, dst0;
  v8i16 vec0, vec1, vec2;
  v8u16 reg0, reg1, reg2;
  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
  v4i32 vec_ubvr, vec_ugvg;
  v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);

  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
                 vec_br, vec_yg);
  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);

  for (x = 0; x < width; x += 8) {
    READYUV422(src_y, src_u, src_v, src0, src1, src2);
    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
             vec0, vec1, vec2);
    reg0 = (v8u16)__msa_srai_h(vec0, 3);
    reg1 = (v8u16)__msa_srai_h(vec1, 3);
    reg2 = (v8u16)__msa_srai_h(vec2, 3);
    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
    reg1 |= const_0x8000;
    reg0 |= reg2;
    dst0 = (v16u8)(reg1 | reg0);
    ST_UB(dst0, dst_argb1555);
    src_y += 8;
    src_u += 4;
    src_v += 4;
    dst_argb1555 += 16;
  }
}

void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_y, 16);
    src_yuy2 += 64;
    dst_y += 32;
  }
}

void YUY2ToUVRow_MSA(const uint8* src_yuy2,
                     int src_stride_yuy2,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2;
  int x;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
    LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    vec0 = __msa_aver_u_b(src0, src2);
    vec1 = __msa_aver_u_b(src1, src3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_yuy2 += 64;
    src_yuy2_next += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_yuy2 += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_y, 16);
    src_uyvy += 64;
    dst_y += 32;
  }
}

void UYVYToUVRow_MSA(const uint8* src_uyvy,
                     int src_stride_uyvy,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  const uint8* src_uyvy_next = src_uyvy + src_stride_uyvy;
  int x;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
    LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec0 = __msa_aver_u_b(src0, src2);
    vec1 = __msa_aver_u_b(src1, src3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_uyvy += 64;
    src_uyvy_next += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void UYVYToUV422Row_MSA(const uint8* src_uyvy,
                        uint8* dst_u,
                        uint8* dst_v,
                        int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 32) {
    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_uyvy += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void ARGBToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) {
  int x;
  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
  v16i8 zero = {0};
  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
    reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
    reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
    reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
    reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
    reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
    reg0 *= const_0x19;
    reg1 *= const_0x19;
    reg2 *= const_0x81;
    reg3 *= const_0x81;
    reg4 *= const_0x42;
    reg5 *= const_0x42;
    reg0 += reg2;
    reg1 += reg3;
    reg0 += reg4;
    reg1 += reg5;
    reg0 += const_0x1080;
    reg1 += const_0x1080;
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    ST_UB(dst0, dst_y);
    src_argb0 += 64;
    dst_y += 16;
  }
}

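// For reference (illustrative comment only): the fixed-point luma evaluated
// above is
//   y = (25 * b + 129 * g + 66 * r + 0x1080) >> 8;
// i.e. the usual BT.601 studio-range weights with rounding and a +16 offset,
// and the chroma path below evaluates, on 2x2-averaged pixels,
//   u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
//   v = (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
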
void ARGBToUVRow_MSA(const uint8* src_argb0,
                     int src_stride_argb,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  int x;
  const uint8* src_argb0_next = src_argb0 + src_stride_argb;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
  v16u8 dst0, dst1;
  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);

  for (x = 0; x < width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
    reg0 = __msa_hadd_u_h(vec8, vec8);
    reg1 = __msa_hadd_u_h(vec9, vec9);
    reg2 = __msa_hadd_u_h(vec4, vec4);
    reg3 = __msa_hadd_u_h(vec5, vec5);
    reg4 = __msa_hadd_u_h(vec0, vec0);
    reg5 = __msa_hadd_u_h(vec1, vec1);
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
    reg0 += __msa_hadd_u_h(vec8, vec8);
    reg1 += __msa_hadd_u_h(vec9, vec9);
    reg2 += __msa_hadd_u_h(vec4, vec4);
    reg3 += __msa_hadd_u_h(vec5, vec5);
    reg4 += __msa_hadd_u_h(vec0, vec0);
    reg5 += __msa_hadd_u_h(vec1, vec1);
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
    reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
    reg6 = reg0 * const_0x70;
    reg7 = reg1 * const_0x70;
    reg8 = reg2 * const_0x4A;
    reg9 = reg3 * const_0x4A;
    reg6 += const_0x8080;
    reg7 += const_0x8080;
    reg8 += reg4 * const_0x26;
    reg9 += reg5 * const_0x26;
    reg0 *= const_0x12;
    reg1 *= const_0x12;
    reg2 *= const_0x5E;
    reg3 *= const_0x5E;
    reg4 *= const_0x70;
    reg5 *= const_0x70;
    reg2 += reg0;
    reg3 += reg1;
    reg4 += const_0x8080;
    reg5 += const_0x8080;
    reg6 -= reg8;
    reg7 -= reg9;
    reg4 -= reg2;
    reg5 -= reg3;
    reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
    reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
    dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_argb0 += 128;
    src_argb0_next += 128;
    dst_u += 16;
    dst_v += 16;
  }
}

void ARGBToRGB24Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
  v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
  v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14,
                     16, 17, 18, 20, 21, 22, 24, 25};
  v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
                     21, 22, 24, 25, 26, 28, 29, 30};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_rgb, 16);
    ST_UB(dst2, (dst_rgb + 32));
    src_argb += 64;
    dst_rgb += 48;
  }
}

void ARGBToRAWRow_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
  v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
  v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12,
                     18, 17, 16, 22, 21, 20, 26, 25};
  v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22,
                     21, 20, 26, 25, 24, 30, 29, 28};

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst_rgb, 16);
    ST_UB(dst2, (dst_rgb + 32));
    src_argb += 64;
    dst_rgb += 48;
  }
}

void ARGBToRGB565Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, dst0;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
    vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
    vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
    vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
    vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
    vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
    vec0 = __msa_binsli_b(vec0, vec1, 2);
    vec1 = __msa_binsli_b(vec2, vec3, 4);
    vec4 = __msa_binsli_b(vec4, vec5, 2);
    vec5 = __msa_binsli_b(vec6, vec7, 4);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
    dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

void ARGBToARGB1555Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1, dst0;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
    vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
    vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
    vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
    vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
    vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
    vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
    vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
    vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
    vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
    vec0 = __msa_binsli_b(vec0, vec1, 2);
    vec5 = __msa_binsli_b(vec5, vec6, 2);
    vec1 = __msa_binsli_b(vec2, vec3, 5);
    vec6 = __msa_binsli_b(vec7, vec8, 5);
    vec1 = __msa_binsli_b(vec1, vec4, 0);
    vec6 = __msa_binsli_b(vec6, vec9, 0);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
    vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
    dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

void ARGBToARGB4444Row_MSA(const uint8* src_argb, uint8* dst_rgb, int width) {
  int x;
  v16u8 src0, src1;
  v16u8 vec0, vec1;
  v16u8 dst0;
  v16i8 zero = {0};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
    vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
    src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
    src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
    vec0 = __msa_binsli_b(vec0, src0, 3);
    vec1 = __msa_binsli_b(vec1, src1, 3);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

void ARGBToUV444Row_MSA(const uint8* src_argb,
                        uint8* dst_u,
                        uint8* dst_v,
                        int32 width) {
  int32 x;
  v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 vec8, vec9, vec10, vec11;
  v8u16 const_112 = (v8u16)__msa_ldi_h(112);
  v8u16 const_74 = (v8u16)__msa_ldi_h(74);
  v8u16 const_38 = (v8u16)__msa_ldi_h(38);
  v8u16 const_94 = (v8u16)__msa_ldi_h(94);
  v8u16 const_18 = (v8u16)__msa_ldi_h(18);
  v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
  v16i8 zero = {0};

  for (x = width; x > 0; x -= 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb, 48);
    reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
    src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
    vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
    vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
    vec10 = vec0 * const_18;
    vec11 = vec1 * const_18;
    vec8 = vec2 * const_94;
    vec9 = vec3 * const_94;
    vec6 = vec4 * const_112;
    vec7 = vec5 * const_112;
    vec0 *= const_112;
    vec1 *= const_112;
    vec2 *= const_74;
    vec3 *= const_74;
    vec4 *= const_38;
    vec5 *= const_38;
    vec8 += vec10;
    vec9 += vec11;
    vec6 += const_32896;
    vec7 += const_32896;
    vec0 += const_32896;
    vec1 += const_32896;
    vec2 += vec4;
    vec3 += vec5;
    vec0 -= vec2;
    vec1 -= vec3;
    vec6 -= vec8;
    vec7 -= vec9;
    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
    vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
    vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
    ST_UB(dst0, dst_u);
    ST_UB(dst1, dst_v);
    src_argb += 64;
    dst_u += 16;
    dst_v += 16;
  }
}

void ARGBMultiplyRow_MSA(const uint8* src_argb0,
                         const uint8* src_argb1,
                         uint8* dst_argb,
                         int width) {
  int x;
  v16u8 src0, src1, dst0;
  v8u16 vec0, vec1, vec2, vec3;
  v4u32 reg0, reg1, reg2, reg3;
  v8i16 zero = {0};

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_argb);
    src_argb0 += 16;
    src_argb1 += 16;
    dst_argb += 16;
  }
}

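// ARGBMultiplyRow above duplicates each source byte before the widening
// multiply, so per channel the result is approximately (a * b) / 255,
// i.e. roughly dst = (uint8)(((a * 0x0101) * b) >> 16)
// (illustrative comment only).
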
void ARGBAddRow_MSA(const uint8* src_argb0,
                    const uint8* src_argb1,
                    uint8* dst_argb,
                    int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
    dst0 = __msa_adds_u_b(src0, src2);
    dst1 = __msa_adds_u_b(src1, src3);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

void ARGBSubtractRow_MSA(const uint8* src_argb0,
                         const uint8* src_argb1,
                         uint8* dst_argb,
                         int width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_argb1, 16);
    dst0 = __msa_subs_u_b(src0, src2);
    dst1 = __msa_subs_u_b(src1, src3);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb0 += 32;
    src_argb1 += 32;
    dst_argb += 32;
  }
}

void ARGBAttenuateRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
  int x;
  v16u8 src0, src1, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
  v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v8i16 zero = {0};
  v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
    vec4 = (v8u16)__msa_fill_h(vec0[3]);
    vec5 = (v8u16)__msa_fill_h(vec0[7]);
    vec6 = (v8u16)__msa_fill_h(vec1[3]);
    vec7 = (v8u16)__msa_fill_h(vec1[7]);
    vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
    vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
    vec6 = (v8u16)__msa_fill_h(vec2[3]);
    vec7 = (v8u16)__msa_fill_h(vec2[7]);
    vec8 = (v8u16)__msa_fill_h(vec3[3]);
    vec9 = (v8u16)__msa_fill_h(vec3[7]);
    vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
    vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
    reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
    reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
    reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
    reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
    reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
    reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
    reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
    reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
    reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
    reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
    reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
    reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
    vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    dst0 = __msa_bmnz_v(dst0, src0, mask);
    dst1 = __msa_bmnz_v(dst1, src1, mask);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}

void ARGBToRGB565DitherRow_MSA(const uint8* src_argb,
                               uint8* dst_rgb,
                               uint32 dither4,
                               int width) {
  int x;
  v16u8 src0, src1, dst0, vec0, vec1;
  v8i16 vec_d0;
  v8i16 reg0, reg1, reg2;
  v16i8 zero = {0};
  v8i16 max = __msa_ldi_h(0xFF);

  vec_d0 = (v8i16)__msa_fill_w(dither4);
  vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
    reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
    reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
    reg0 += vec_d0;
    reg1 += vec_d0;
    reg2 += vec_d0;
    reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
    reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
    reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
    reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
    reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
    reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
    reg0 = __msa_srai_h(reg0, 3);
    reg2 = __msa_srai_h(reg2, 3);
    reg1 = __msa_srai_h(reg1, 2);
    reg2 = __msa_slli_h(reg2, 11);
    reg1 = __msa_slli_h(reg1, 5);
    reg0 |= reg1;
    dst0 = (v16u8)(reg0 | reg2);
    ST_UB(dst0, dst_rgb);
    src_argb += 32;
    dst_rgb += 16;
  }
}

void ARGBShuffleRow_MSA(const uint8* src_argb,
                        uint8* dst_argb,
                        const uint8* shuffler,
                        int width) {
  int x;
  v16u8 src0, src1, dst0, dst1;
  v16i8 vec0;
  v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  int32 val = LW((int32*)shuffler);

  vec0 = (v16i8)__msa_fill_w(val);
  shuffler_vec += vec0;

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
    dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
    dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}

void ARGBShadeRow_MSA(const uint8* src_argb,
                      uint8* dst_argb,
                      int width,
                      uint32 value) {
  int x;
  v16u8 src0, dst0;
  v8u16 vec0, vec1;
  v4u32 reg0, reg1, reg2, reg3, rgba_scale;
  v8i16 zero = {0};

  rgba_scale[0] = value;
  rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
  rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
    reg0 *= rgba_scale;
    reg1 *= rgba_scale;
    reg2 *= rgba_scale;
    reg3 *= rgba_scale;
    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst_argb);
    src_argb += 16;
    dst_argb += 16;
  }
}

void ARGBGrayRow_MSA(const uint8* src_argb, uint8* dst_argb, int width) {
  int x;
  v16u8 src0, src1, vec0, vec1, dst0, dst1;
  v8u16 reg0;
  v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb, 16);
    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
    reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
    reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
    vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
    ST_UB2(dst0, dst1, dst_argb, 16);
    src_argb += 32;
    dst_argb += 32;
  }
}

void ARGBSepiaRow_MSA(uint8* dst_argb, int width) {
  int x;
  v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
  v8u16 reg0, reg1, reg2;
  v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
  v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
  v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
  v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
  v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
  v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
  v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);

  for (x = 0; x < width; x += 8) {
    src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
    vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
    reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
    reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
    reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
    reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
    reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
    reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
    reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
    reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
    vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
    vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
    vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
    vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
    ST_UB2(dst0, dst1, dst_argb, 16);
    dst_argb += 32;
  }
}

void ARGB4444ToARGBRow_MSA(const uint8* src_argb4444,
                           uint8* dst_argb,
                           int width) {
  int x;
  v16u8 src0, src1;
  v8u16 vec0, vec1, vec2, vec3;
  v16u8 dst0, dst1, dst2, dst3;

  for (x = 0; x < width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 0);
    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb4444, 16);
    vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
    vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
    vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
    vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
    vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
    vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
    vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
    vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_argb4444 += 32;
    dst_argb += 64;
  }
}

void ARGB1555ToARGBRow_MSA(const uint8* src_argb1555,
                           uint8* dst_argb,
                           int width) {
  int x;
  v8u16 src0, src1;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
  v16u8 dst0, dst1, dst2, dst3;
  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);

  for (x = 0; x < width; x += 16) {
    src0 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 0);
    src1 = (v8u16)__msa_ld_h((v8u16*)src_argb1555, 16);
    vec0 = src0 & const_0x1F;
    vec1 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
    vec2 = src0 & const_0x1F;
    vec3 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
    vec4 = src0 & const_0x1F;
    vec5 = src1 & const_0x1F;
    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
    reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
    reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
    reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
    reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
    reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
    reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
    reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
    reg3 = -reg3;
    reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
    reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
    reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
    reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
    src_argb1555 += 32;
    dst_argb += 64;
  }
}

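// RGB565ToARGBRow below widens the 5/6-bit fields to 8 bits by replicating
// their top bits, i.e. roughly
//   b8 = (b5 << 3) | (b5 >> 2);
//   g8 = (g6 << 2) | (g6 >> 4);
//   r8 = (r5 << 3) | (r5 >> 2);
// with alpha forced to 0xFF (illustrative comment only).
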
src_rgb565, uint8* dst_argb, int width) { 1551 int x; 1552 v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; 1553 v8u16 reg0, reg1, reg2, reg3, reg4, reg5; 1554 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; 1555 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 1556 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); 1557 v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); 1558 v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); 1559 1560 for (x = 0; x < width; x += 16) { 1561 src0 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 0); 1562 src1 = (v8u16)__msa_ld_h((v8u16*)src_rgb565, 16); 1563 vec0 = src0 & const_0x1F; 1564 vec1 = src0 & const_0x7E0; 1565 vec2 = src0 & const_0xF800; 1566 vec3 = src1 & const_0x1F; 1567 vec4 = src1 & const_0x7E0; 1568 vec5 = src1 & const_0xF800; 1569 reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); 1570 reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); 1571 reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); 1572 reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); 1573 reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); 1574 reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); 1575 reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); 1576 reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); 1577 reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); 1578 reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); 1579 reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); 1580 reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); 1581 res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); 1582 res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); 1583 res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); 1584 res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); 1585 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); 1586 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); 1587 dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); 1588 dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); 1589 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); 1590 src_rgb565 += 32; 1591 dst_argb += 64; 1592 } 1593 } 1594 1595 void RGB24ToARGBRow_MSA(const uint8* src_rgb24, uint8* dst_argb, int width) { 1596 int x; 1597 v16u8 src0, src1, src2; 1598 v16u8 vec0, vec1, vec2; 1599 v16u8 dst0, dst1, dst2, dst3; 1600 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 1601 v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; 1602 1603 for (x = 0; x < width; x += 16) { 1604 src0 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 0); 1605 src1 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 16); 1606 src2 = (v16u8)__msa_ld_b((v16i8*)src_rgb24, 32); 1607 vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); 1608 vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); 1609 vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); 1610 dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); 1611 dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); 1612 dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); 1613 dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); 1614 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); 1615 src_rgb24 += 48; 1616 dst_argb += 64; 1617 } 1618 } 1619 1620 void RAWToARGBRow_MSA(const uint8* src_raw, uint8* dst_argb, int width) { 1621 int x; 1622 v16u8 src0, src1, src2; 1623 v16u8 vec0, vec1, vec2; 1624 v16u8 dst0, dst1, dst2, dst3; 1625 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 1626 v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; 1627 1628 for (x = 0; x < width; x += 16) { 1629 src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0); 1630 src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16); 1631 src2 = 
(v16u8)__msa_ld_b((v16i8*)src_raw, 32); 1632 vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); 1633 vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); 1634 vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); 1635 dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); 1636 dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); 1637 dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); 1638 dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); 1639 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); 1640 src_raw += 48; 1641 dst_argb += 64; 1642 } 1643 } 1644 1645 void ARGB1555ToYRow_MSA(const uint8* src_argb1555, uint8* dst_y, int width) { 1646 int x; 1647 v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; 1648 v8u16 reg0, reg1, reg2, reg3, reg4, reg5; 1649 v16u8 dst0; 1650 v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); 1651 v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); 1652 v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); 1653 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); 1654 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); 1655 1656 for (x = 0; x < width; x += 16) { 1657 src0 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 0); 1658 src1 = (v8u16)__msa_ld_b((v8i16*)src_argb1555, 16); 1659 vec0 = src0 & const_0x1F; 1660 vec1 = src1 & const_0x1F; 1661 src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); 1662 src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); 1663 vec2 = src0 & const_0x1F; 1664 vec3 = src1 & const_0x1F; 1665 src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); 1666 src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); 1667 vec4 = src0 & const_0x1F; 1668 vec5 = src1 & const_0x1F; 1669 reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); 1670 reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); 1671 reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); 1672 reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); 1673 reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); 1674 reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); 1675 reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); 1676 reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); 1677 reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); 1678 reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); 1679 reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); 1680 reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2); 1681 reg0 *= const_0x19; 1682 reg1 *= const_0x19; 1683 reg2 *= const_0x81; 1684 reg3 *= const_0x81; 1685 reg4 *= const_0x42; 1686 reg5 *= const_0x42; 1687 reg0 += reg2; 1688 reg1 += reg3; 1689 reg0 += reg4; 1690 reg1 += reg5; 1691 reg0 += const_0x1080; 1692 reg1 += const_0x1080; 1693 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); 1694 reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); 1695 dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); 1696 ST_UB(dst0, dst_y); 1697 src_argb1555 += 32; 1698 dst_y += 16; 1699 } 1700 } 1701 1702 void RGB565ToYRow_MSA(const uint8* src_rgb565, uint8* dst_y, int width) { 1703 int x; 1704 v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 1705 v8u16 reg0, reg1, reg2, reg3, reg4, reg5; 1706 v4u32 res0, res1, res2, res3; 1707 v16u8 dst0; 1708 v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); 1709 v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); 1710 v8i16 const_0x1080 = __msa_fill_h(0x1080); 1711 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); 1712 v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); 1713 v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); 1714 1715 for (x = 0; x < width; x += 16) { 1716 src0 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 0); 1717 src1 = (v8u16)__msa_ld_b((v8i16*)src_rgb565, 16); 1718 vec0 = src0 & const_0x1F; 1719 vec1 = src0 & const_0x7E0; 1720 vec2 = src0 & 
const_0xF800; 1721 vec3 = src1 & const_0x1F; 1722 vec4 = src1 & const_0x7E0; 1723 vec5 = src1 & const_0xF800; 1724 reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); 1725 reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); 1726 reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); 1727 reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); 1728 reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); 1729 reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); 1730 reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); 1731 reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); 1732 reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); 1733 reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); 1734 reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); 1735 reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); 1736 vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); 1737 vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); 1738 vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); 1739 vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); 1740 vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); 1741 vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); 1742 vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); 1743 vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); 1744 res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); 1745 res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); 1746 res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); 1747 res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); 1748 res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); 1749 res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); 1750 res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); 1751 res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); 1752 res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); 1753 res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); 1754 res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); 1755 res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); 1756 vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); 1757 vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); 1758 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 1759 ST_UB(dst0, dst_y); 1760 src_rgb565 += 32; 1761 dst_y += 16; 1762 } 1763 } 1764 1765 void RGB24ToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { 1766 int x; 1767 v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; 1768 v8u16 vec0, vec1, vec2, vec3; 1769 v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); 1770 v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); 1771 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); 1772 v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; 1773 v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, 1774 18, 19, 20, 21, 21, 22, 23, 24}; 1775 v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; 1776 v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; 1777 v16i8 zero = {0}; 1778 1779 for (x = 0; x < width; x += 16) { 1780 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); 1781 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); 1782 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); 1783 reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); 1784 reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); 1785 reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); 1786 reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); 1787 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); 1788 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); 1789 vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); 1790 vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); 1791 vec0 = __msa_dotp_u_h((v16u8)vec0, 
(v16u8)const_0x8119); 1792 vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); 1793 vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); 1794 vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); 1795 vec0 += const_0x1080; 1796 vec1 += const_0x1080; 1797 vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); 1798 vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); 1799 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 1800 ST_UB(dst0, dst_y); 1801 src_argb0 += 48; 1802 dst_y += 16; 1803 } 1804 } 1805 1806 void RAWToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { 1807 int x; 1808 v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; 1809 v8u16 vec0, vec1, vec2, vec3; 1810 v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); 1811 v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); 1812 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); 1813 v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; 1814 v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, 1815 18, 19, 20, 21, 21, 22, 23, 24}; 1816 v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; 1817 v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; 1818 v16i8 zero = {0}; 1819 1820 for (x = 0; x < width; x += 16) { 1821 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); 1822 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); 1823 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); 1824 reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); 1825 reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); 1826 reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); 1827 reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); 1828 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); 1829 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); 1830 vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); 1831 vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); 1832 vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); 1833 vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); 1834 vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); 1835 vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); 1836 vec0 += const_0x1080; 1837 vec1 += const_0x1080; 1838 vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); 1839 vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); 1840 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 1841 ST_UB(dst0, dst_y); 1842 src_argb0 += 48; 1843 dst_y += 16; 1844 } 1845 } 1846 1847 void ARGB1555ToUVRow_MSA(const uint8* src_argb1555, 1848 int src_stride_argb1555, 1849 uint8* dst_u, 1850 uint8* dst_v, 1851 int width) { 1852 int x; 1853 const uint16* s = (const uint16*)src_argb1555; 1854 const uint16* t = (const uint16*)(src_argb1555 + src_stride_argb1555); 1855 int64_t res0, res1; 1856 v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; 1857 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; 1858 v16u8 dst0; 1859 v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); 1860 v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); 1861 v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); 1862 v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); 1863 v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); 1864 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); 1865 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); 1866 1867 for (x = 0; x < width; x += 16) { 1868 src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); 1869 src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); 1870 src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); 1871 src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); 1872 vec0 = src0 & const_0x1F; 1873 vec1 = src1 & const_0x1F; 1874 vec0 += src2 & 
const_0x1F; 1875 vec1 += src3 & const_0x1F; 1876 vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 1877 src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); 1878 src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); 1879 src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); 1880 src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); 1881 vec2 = src0 & const_0x1F; 1882 vec3 = src1 & const_0x1F; 1883 vec2 += src2 & const_0x1F; 1884 vec3 += src3 & const_0x1F; 1885 vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); 1886 src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); 1887 src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); 1888 src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); 1889 src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); 1890 vec4 = src0 & const_0x1F; 1891 vec5 = src1 & const_0x1F; 1892 vec4 += src2 & const_0x1F; 1893 vec5 += src3 & const_0x1F; 1894 vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); 1895 vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); 1896 vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); 1897 vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); 1898 vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); 1899 vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); 1900 vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); 1901 vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); 1902 vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); 1903 vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); 1904 reg0 = vec6 * const_0x70; 1905 reg1 = vec0 * const_0x4A; 1906 reg2 = vec2 * const_0x70; 1907 reg3 = vec0 * const_0x5E; 1908 reg0 += const_0x8080; 1909 reg1 += vec2 * const_0x26; 1910 reg2 += const_0x8080; 1911 reg3 += vec6 * const_0x12; 1912 reg0 -= reg1; 1913 reg2 -= reg3; 1914 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); 1915 reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); 1916 dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); 1917 res0 = __msa_copy_u_d((v2i64)dst0, 0); 1918 res1 = __msa_copy_u_d((v2i64)dst0, 1); 1919 SD(res0, dst_u); 1920 SD(res1, dst_v); 1921 s += 16; 1922 t += 16; 1923 dst_u += 8; 1924 dst_v += 8; 1925 } 1926 } 1927 1928 void RGB565ToUVRow_MSA(const uint8* src_rgb565, 1929 int src_stride_rgb565, 1930 uint8* dst_u, 1931 uint8* dst_v, 1932 int width) { 1933 int x; 1934 const uint16* s = (const uint16*)src_rgb565; 1935 const uint16* t = (const uint16*)(src_rgb565 + src_stride_rgb565); 1936 int64_t res0, res1; 1937 v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; 1938 v8u16 vec0, vec1, vec2, vec3, vec4, vec5; 1939 v16u8 dst0; 1940 v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); 1941 v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); 1942 v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); 1943 v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); 1944 v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); 1945 v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); 1946 v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); 1947 v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); 1948 1949 for (x = 0; x < width; x += 16) { 1950 src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); 1951 src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); 1952 src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); 1953 src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); 1954 vec0 = src0 & const_0x1F; 1955 vec1 = src1 & const_0x1F; 1956 vec0 += src2 & const_0x1F; 1957 vec1 += src3 & const_0x1F; 1958 vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 1959 src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); 1960 src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); 1961 src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); 1962 src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); 1963 vec2 = src0 & const_0x3F; 1964 vec3 = src1 & const_0x3F; 1965 vec2 += src2 & const_0x3F; 1966 vec3 += src3 & const_0x3F; 1967 vec1 = 
(v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); 1968 src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); 1969 src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); 1970 src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); 1971 src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); 1972 vec4 = src0 & const_0x1F; 1973 vec5 = src1 & const_0x1F; 1974 vec4 += src2 & const_0x1F; 1975 vec5 += src3 & const_0x1F; 1976 vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); 1977 vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); 1978 vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); 1979 vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); 1980 vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); 1981 vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); 1982 vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); 1983 vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); 1984 reg0 = vec3 * const_0x70; 1985 reg1 = vec1 * const_0x4A; 1986 reg2 = vec4 * const_0x70; 1987 reg3 = vec1 * const_0x5E; 1988 reg0 += const_32896; 1989 reg1 += vec4 * const_0x26; 1990 reg2 += const_32896; 1991 reg3 += vec3 * const_0x12; 1992 reg0 -= reg1; 1993 reg2 -= reg3; 1994 reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); 1995 reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); 1996 dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); 1997 res0 = __msa_copy_u_d((v2i64)dst0, 0); 1998 res1 = __msa_copy_u_d((v2i64)dst0, 1); 1999 SD(res0, dst_u); 2000 SD(res1, dst_v); 2001 s += 16; 2002 t += 16; 2003 dst_u += 8; 2004 dst_v += 8; 2005 } 2006 } 2007 2008 void RGB24ToUVRow_MSA(const uint8* src_rgb0, 2009 int src_stride_rgb, 2010 uint8* dst_u, 2011 uint8* dst_v, 2012 int width) { 2013 int x; 2014 const uint8* s = src_rgb0; 2015 const uint8* t = src_rgb0 + src_stride_rgb; 2016 int64 res0, res1; 2017 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 2018 v16u8 inp0, inp1, inp2, inp3, inp4, inp5; 2019 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2020 v8i16 reg0, reg1, reg2, reg3; 2021 v16u8 dst0; 2022 v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); 2023 v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); 2024 v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); 2025 v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); 2026 v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); 2027 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); 2028 v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; 2029 v16i8 zero = {0}; 2030 2031 for (x = 0; x < width; x += 16) { 2032 inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0); 2033 inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16); 2034 inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32); 2035 inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0); 2036 inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16); 2037 inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32); 2038 src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); 2039 src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); 2040 src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); 2041 src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); 2042 src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); 2043 src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); 2044 src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); 2045 src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); 2046 src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); 2047 src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); 2048 src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); 2049 src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); 2050 src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); 2051 src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); 2052 vec0 = 
(v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); 2053 vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); 2054 vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); 2055 vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); 2056 vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); 2057 vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); 2058 vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); 2059 vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); 2060 vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); 2061 vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); 2062 vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); 2063 vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); 2064 vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); 2065 vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); 2066 vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); 2067 vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); 2068 reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); 2069 reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); 2070 reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); 2071 reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); 2072 reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); 2073 reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); 2074 reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); 2075 reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); 2076 reg0 = __msa_srai_h((v8i16)reg0, 2); 2077 reg1 = __msa_srai_h((v8i16)reg1, 2); 2078 reg2 = __msa_srai_h((v8i16)reg2, 2); 2079 reg3 = __msa_srai_h((v8i16)reg3, 2); 2080 vec4 = (v8u16)__msa_pckev_h(reg1, reg0); 2081 vec5 = (v8u16)__msa_pckev_h(reg3, reg2); 2082 vec6 = (v8u16)__msa_pckod_h(reg1, reg0); 2083 vec7 = (v8u16)__msa_pckod_h(reg3, reg2); 2084 vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); 2085 vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); 2086 vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); 2087 vec3 = vec0 * const_0x70; 2088 vec4 = vec1 * const_0x4A; 2089 vec5 = vec2 * const_0x26; 2090 vec2 *= const_0x70; 2091 vec1 *= const_0x5E; 2092 vec0 *= const_0x12; 2093 reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); 2094 reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); 2095 reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); 2096 reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); 2097 reg0 += reg1; 2098 reg2 += reg3; 2099 reg0 = __msa_srai_h(reg0, 8); 2100 reg2 = __msa_srai_h(reg2, 8); 2101 dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); 2102 res0 = __msa_copy_u_d((v2i64)dst0, 0); 2103 res1 = __msa_copy_u_d((v2i64)dst0, 1); 2104 SD(res0, dst_u); 2105 SD(res1, dst_v); 2106 t += 48; 2107 s += 48; 2108 dst_u += 8; 2109 dst_v += 8; 2110 } 2111 } 2112 2113 void RAWToUVRow_MSA(const uint8* src_rgb0, 2114 int src_stride_rgb, 2115 uint8* dst_u, 2116 uint8* dst_v, 2117 int width) { 2118 int x; 2119 const uint8* s = src_rgb0; 2120 const uint8* t = src_rgb0 + src_stride_rgb; 2121 int64 res0, res1; 2122 v16u8 inp0, inp1, inp2, inp3, inp4, inp5; 2123 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 2124 v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; 2125 v8i16 reg0, reg1, reg2, reg3; 2126 v16u8 dst0; 2127 v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); 2128 v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); 2129 v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); 2130 v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); 2131 v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); 2132 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); 2133 v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 
6, 7, 8, 18, 9, 10, 11, 19}; 2134 v16i8 zero = {0}; 2135 2136 for (x = 0; x < width; x += 16) { 2137 inp0 = (v16u8)__msa_ld_b((v16i8*)s, 0); 2138 inp1 = (v16u8)__msa_ld_b((v16i8*)s, 16); 2139 inp2 = (v16u8)__msa_ld_b((v16i8*)s, 32); 2140 inp3 = (v16u8)__msa_ld_b((v16i8*)t, 0); 2141 inp4 = (v16u8)__msa_ld_b((v16i8*)t, 16); 2142 inp5 = (v16u8)__msa_ld_b((v16i8*)t, 32); 2143 src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); 2144 src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); 2145 src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); 2146 src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); 2147 src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); 2148 src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); 2149 src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); 2150 src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); 2151 src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); 2152 src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); 2153 src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); 2154 src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); 2155 src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); 2156 src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); 2157 vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); 2158 vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); 2159 vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); 2160 vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); 2161 vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); 2162 vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); 2163 vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); 2164 vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); 2165 vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); 2166 vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); 2167 vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); 2168 vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); 2169 vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); 2170 vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); 2171 vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); 2172 vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); 2173 reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); 2174 reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); 2175 reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); 2176 reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); 2177 reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); 2178 reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); 2179 reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); 2180 reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); 2181 reg0 = __msa_srai_h(reg0, 2); 2182 reg1 = __msa_srai_h(reg1, 2); 2183 reg2 = __msa_srai_h(reg2, 2); 2184 reg3 = __msa_srai_h(reg3, 2); 2185 vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); 2186 vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); 2187 vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); 2188 vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); 2189 vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); 2190 vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); 2191 vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); 2192 vec3 = vec0 * const_0x70; 2193 vec4 = vec1 * const_0x4A; 2194 vec5 = vec2 * const_0x26; 2195 vec2 *= const_0x70; 2196 vec1 *= const_0x5E; 2197 vec0 *= const_0x12; 2198 reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); 2199 reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); 2200 
reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); 2201 reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); 2202 reg0 += reg1; 2203 reg2 += reg3; 2204 reg0 = __msa_srai_h(reg0, 8); 2205 reg2 = __msa_srai_h(reg2, 8); 2206 dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); 2207 res0 = __msa_copy_u_d((v2i64)dst0, 0); 2208 res1 = __msa_copy_u_d((v2i64)dst0, 1); 2209 SD(res0, dst_u); 2210 SD(res1, dst_v); 2211 t += 48; 2212 s += 48; 2213 dst_u += 8; 2214 dst_v += 8; 2215 } 2216 } 2217 2218 void NV12ToARGBRow_MSA(const uint8* src_y, 2219 const uint8* src_uv, 2220 uint8* rgb_buf, 2221 const struct YuvConstants* yuvconstants, 2222 int width) { 2223 int x; 2224 uint64 val0, val1; 2225 v16u8 src0, src1, res0, res1, dst0, dst1; 2226 v8i16 vec0, vec1, vec2; 2227 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 2228 v4i32 vec_ubvr, vec_ugvg; 2229 v16u8 zero = {0}; 2230 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 2231 2232 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 2233 vec_br, vec_yg); 2234 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); 2235 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); 2236 2237 for (x = 0; x < width; x += 8) { 2238 val0 = LD(src_y); 2239 val1 = LD(src_uv); 2240 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); 2241 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); 2242 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, 2243 vec0, vec1, vec2); 2244 res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); 2245 res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); 2246 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); 2247 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); 2248 ST_UB2(dst0, dst1, rgb_buf, 16); 2249 src_y += 8; 2250 src_uv += 8; 2251 rgb_buf += 32; 2252 } 2253 } 2254 2255 void NV12ToRGB565Row_MSA(const uint8* src_y, 2256 const uint8* src_uv, 2257 uint8* rgb_buf, 2258 const struct YuvConstants* yuvconstants, 2259 int width) { 2260 int x; 2261 uint64 val0, val1; 2262 v16u8 src0, src1, dst0; 2263 v8i16 vec0, vec1, vec2; 2264 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 2265 v4i32 vec_ubvr, vec_ugvg; 2266 v16u8 zero = {0}; 2267 2268 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 2269 vec_br, vec_yg); 2270 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); 2271 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); 2272 2273 for (x = 0; x < width; x += 8) { 2274 val0 = LD(src_y); 2275 val1 = LD(src_uv); 2276 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); 2277 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); 2278 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, 2279 vec0, vec1, vec2); 2280 vec0 = vec0 >> 3; 2281 vec1 = (vec1 >> 2) << 5; 2282 vec2 = (vec2 >> 3) << 11; 2283 dst0 = (v16u8)(vec0 | vec1 | vec2); 2284 ST_UB(dst0, rgb_buf); 2285 src_y += 8; 2286 src_uv += 8; 2287 rgb_buf += 16; 2288 } 2289 } 2290 2291 void NV21ToARGBRow_MSA(const uint8* src_y, 2292 const uint8* src_vu, 2293 uint8* rgb_buf, 2294 const struct YuvConstants* yuvconstants, 2295 int width) { 2296 int x; 2297 uint64 val0, val1; 2298 v16u8 src0, src1, res0, res1, dst0, dst1; 2299 v8i16 vec0, vec1, vec2; 2300 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 2301 v4i32 vec_ubvr, vec_ugvg; 2302 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 2303 v16u8 zero = {0}; 2304 v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; 2305 2306 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, 
vec_vg, vec_bb, vec_bg, 2307 vec_br, vec_yg); 2308 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); 2309 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); 2310 2311 for (x = 0; x < width; x += 8) { 2312 val0 = LD(src_y); 2313 val1 = LD(src_vu); 2314 src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); 2315 src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); 2316 src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); 2317 YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, 2318 vec0, vec1, vec2); 2319 res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); 2320 res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); 2321 dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); 2322 dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); 2323 ST_UB2(dst0, dst1, rgb_buf, 16); 2324 src_y += 8; 2325 src_vu += 8; 2326 rgb_buf += 32; 2327 } 2328 } 2329 2330 void SobelRow_MSA(const uint8* src_sobelx, 2331 const uint8* src_sobely, 2332 uint8* dst_argb, 2333 int width) { 2334 int x; 2335 v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; 2336 v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; 2337 v16i8 const_0x4 = __msa_ldi_b(0x4); 2338 v16i8 mask1 = mask0 + const_0x4; 2339 v16i8 mask2 = mask1 + const_0x4; 2340 v16i8 mask3 = mask2 + const_0x4; 2341 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 2342 2343 for (x = 0; x < width; x += 16) { 2344 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); 2345 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); 2346 vec0 = __msa_adds_u_b(src0, src1); 2347 dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); 2348 dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); 2349 dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); 2350 dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); 2351 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); 2352 src_sobelx += 16; 2353 src_sobely += 16; 2354 dst_argb += 64; 2355 } 2356 } 2357 2358 void SobelToPlaneRow_MSA(const uint8* src_sobelx, 2359 const uint8* src_sobely, 2360 uint8* dst_y, 2361 int width) { 2362 int x; 2363 v16u8 src0, src1, src2, src3, dst0, dst1; 2364 2365 for (x = 0; x < width; x += 32) { 2366 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); 2367 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 16); 2368 src2 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); 2369 src3 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 16); 2370 dst0 = __msa_adds_u_b(src0, src2); 2371 dst1 = __msa_adds_u_b(src1, src3); 2372 ST_UB2(dst0, dst1, dst_y, 16); 2373 src_sobelx += 32; 2374 src_sobely += 32; 2375 dst_y += 32; 2376 } 2377 } 2378 2379 void SobelXYRow_MSA(const uint8* src_sobelx, 2380 const uint8* src_sobely, 2381 uint8* dst_argb, 2382 int width) { 2383 int x; 2384 v16u8 src0, src1, vec0, vec1, vec2; 2385 v16u8 reg0, reg1, dst0, dst1, dst2, dst3; 2386 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 2387 2388 for (x = 0; x < width; x += 16) { 2389 src0 = (v16u8)__msa_ld_b((v16i8*)src_sobelx, 0); 2390 src1 = (v16u8)__msa_ld_b((v16i8*)src_sobely, 0); 2391 vec0 = __msa_adds_u_b(src0, src1); 2392 vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); 2393 vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); 2394 reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); 2395 reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); 2396 dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); 2397 dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); 2398 dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); 2399 dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); 2400 ST_UB4(dst0, dst1, dst2, 
dst3, dst_argb, 16); 2401 src_sobelx += 16; 2402 src_sobely += 16; 2403 dst_argb += 64; 2404 } 2405 } 2406 2407 void ARGBToYJRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { 2408 int x; 2409 v16u8 src0, src1, src2, src3, dst0; 2410 v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); 2411 v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26); 2412 v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); 2413 2414 for (x = 0; x < width; x += 16) { 2415 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); 2416 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); 2417 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); 2418 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); 2419 ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, 2420 dst0); 2421 ST_UB(dst0, dst_y); 2422 src_argb0 += 64; 2423 dst_y += 16; 2424 } 2425 } 2426 2427 void BGRAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { 2428 int x; 2429 v16u8 src0, src1, src2, src3, dst0; 2430 v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); 2431 v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); 2432 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); 2433 2434 for (x = 0; x < width; x += 16) { 2435 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); 2436 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); 2437 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); 2438 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); 2439 ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, 2440 dst0); 2441 ST_UB(dst0, dst_y); 2442 src_argb0 += 64; 2443 dst_y += 16; 2444 } 2445 } 2446 2447 void ABGRToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { 2448 int x; 2449 v16u8 src0, src1, src2, src3, dst0; 2450 v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); 2451 v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); 2452 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); 2453 2454 for (x = 0; x < width; x += 16) { 2455 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); 2456 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); 2457 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); 2458 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); 2459 ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, 2460 dst0); 2461 ST_UB(dst0, dst_y); 2462 src_argb0 += 64; 2463 dst_y += 16; 2464 } 2465 } 2466 2467 void RGBAToYRow_MSA(const uint8* src_argb0, uint8* dst_y, int width) { 2468 int x; 2469 v16u8 src0, src1, src2, src3, dst0; 2470 v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); 2471 v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); 2472 v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); 2473 2474 for (x = 0; x < width; x += 16) { 2475 src0 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 0); 2476 src1 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 16); 2477 src2 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 32); 2478 src3 = (v16u8)__msa_ld_b((v16i8*)src_argb0, 48); 2479 ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, 2480 dst0); 2481 ST_UB(dst0, dst_y); 2482 src_argb0 += 64; 2483 dst_y += 16; 2484 } 2485 } 2486 2487 void ARGBToUVJRow_MSA(const uint8* src_rgb0, 2488 int src_stride_rgb, 2489 uint8* dst_u, 2490 uint8* dst_v, 2491 int width) { 2492 int x; 2493 const uint8* s = src_rgb0; 2494 const uint8* t = src_rgb0 + src_stride_rgb; 2495 v16u8 src0, src1, src2, src3, src4, src5, src6, src7; 2496 v16u8 vec0, vec1, vec2, vec3; 2497 v16u8 dst0, dst1; 2498 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; 2499 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, 2500 18, 19, 22, 23, 26, 27, 30, 31}; 2501 v16i8 shuffler2 = {0, 
3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; 2502 v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; 2503 v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); 2504 v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); 2505 v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); 2506 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); 2507 2508 for (x = 0; x < width; x += 32) { 2509 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); 2510 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); 2511 src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); 2512 src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); 2513 src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); 2514 src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); 2515 src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); 2516 src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); 2517 src0 = __msa_aver_u_b(src0, src4); 2518 src1 = __msa_aver_u_b(src1, src5); 2519 src2 = __msa_aver_u_b(src2, src6); 2520 src3 = __msa_aver_u_b(src3, src7); 2521 src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); 2522 src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); 2523 src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); 2524 src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); 2525 vec0 = __msa_aver_u_b(src4, src6); 2526 vec1 = __msa_aver_u_b(src5, src7); 2527 src0 = (v16u8)__msa_ld_b((v16i8*)s, 64); 2528 src1 = (v16u8)__msa_ld_b((v16i8*)s, 80); 2529 src2 = (v16u8)__msa_ld_b((v16i8*)s, 96); 2530 src3 = (v16u8)__msa_ld_b((v16i8*)s, 112); 2531 src4 = (v16u8)__msa_ld_b((v16i8*)t, 64); 2532 src5 = (v16u8)__msa_ld_b((v16i8*)t, 80); 2533 src6 = (v16u8)__msa_ld_b((v16i8*)t, 96); 2534 src7 = (v16u8)__msa_ld_b((v16i8*)t, 112); 2535 src0 = __msa_aver_u_b(src0, src4); 2536 src1 = __msa_aver_u_b(src1, src5); 2537 src2 = __msa_aver_u_b(src2, src6); 2538 src3 = __msa_aver_u_b(src3, src7); 2539 src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); 2540 src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); 2541 src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); 2542 src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); 2543 vec2 = __msa_aver_u_b(src4, src6); 2544 vec3 = __msa_aver_u_b(src5, src7); 2545 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, 2546 const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, 2547 dst1); 2548 ST_UB(dst0, dst_v); 2549 ST_UB(dst1, dst_u); 2550 s += 128; 2551 t += 128; 2552 dst_v += 16; 2553 dst_u += 16; 2554 } 2555 } 2556 2557 void BGRAToUVRow_MSA(const uint8* src_rgb0, 2558 int src_stride_rgb, 2559 uint8* dst_u, 2560 uint8* dst_v, 2561 int width) { 2562 int x; 2563 const uint8* s = src_rgb0; 2564 const uint8* t = src_rgb0 + src_stride_rgb; 2565 v16u8 dst0, dst1, vec0, vec1, vec2, vec3; 2566 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; 2567 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, 2568 18, 19, 22, 23, 26, 27, 30, 31}; 2569 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; 2570 v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; 2571 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); 2572 v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); 2573 v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); 2574 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); 2575 2576 for (x = 0; x < width; x += 32) { 2577 READ_ARGB(s, t, vec0, vec1, vec2, vec3); 2578 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, 2579 const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, 2580 dst1); 2581 ST_UB(dst0, dst_v); 2582 ST_UB(dst1, dst_u); 2583 s += 128; 2584 t += 128; 
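// Descriptive note (added): each iteration reads 32 BGRA pixels (128 bytes) from both rows
// and stores 16 bytes each of U and V, so the pointers advance accordingly.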
2585 dst_v += 16; 2586 dst_u += 16; 2587 } 2588 } 2589 2590 void ABGRToUVRow_MSA(const uint8* src_rgb0, 2591 int src_stride_rgb, 2592 uint8* dst_u, 2593 uint8* dst_v, 2594 int width) { 2595 int x; 2596 const uint8* s = src_rgb0; 2597 const uint8* t = src_rgb0 + src_stride_rgb; 2598 v16u8 src0, src1, src2, src3; 2599 v16u8 dst0, dst1; 2600 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; 2601 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, 2602 18, 19, 22, 23, 26, 27, 30, 31}; 2603 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; 2604 v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; 2605 v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); 2606 v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); 2607 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); 2608 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); 2609 2610 for (x = 0; x < width; x += 32) { 2611 READ_ARGB(s, t, src0, src1, src2, src3); 2612 ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, 2613 const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, 2614 dst1); 2615 ST_UB(dst0, dst_u); 2616 ST_UB(dst1, dst_v); 2617 s += 128; 2618 t += 128; 2619 dst_u += 16; 2620 dst_v += 16; 2621 } 2622 } 2623 2624 void RGBAToUVRow_MSA(const uint8* src_rgb0, 2625 int src_stride_rgb, 2626 uint8* dst_u, 2627 uint8* dst_v, 2628 int width) { 2629 int x; 2630 const uint8* s = src_rgb0; 2631 const uint8* t = src_rgb0 + src_stride_rgb; 2632 v16u8 dst0, dst1, vec0, vec1, vec2, vec3; 2633 v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; 2634 v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, 2635 18, 19, 22, 23, 26, 27, 30, 31}; 2636 v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; 2637 v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; 2638 v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A); 2639 v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); 2640 v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E); 2641 v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); 2642 2643 for (x = 0; x < width; x += 32) { 2644 READ_ARGB(s, t, vec0, vec1, vec2, vec3); 2645 ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, 2646 const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, 2647 dst1); 2648 ST_UB(dst0, dst_u); 2649 ST_UB(dst1, dst_v); 2650 s += 128; 2651 t += 128; 2652 dst_u += 16; 2653 dst_v += 16; 2654 } 2655 } 2656 2657 void I444ToARGBRow_MSA(const uint8* src_y, 2658 const uint8* src_u, 2659 const uint8* src_v, 2660 uint8* rgb_buf, 2661 const struct YuvConstants* yuvconstants, 2662 int width) { 2663 int x; 2664 v16u8 src0, src1, src2, dst0, dst1; 2665 v8u16 vec0, vec1, vec2; 2666 v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; 2667 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 2668 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 2669 v8i16 zero = {0}; 2670 2671 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 2672 vec_br, vec_yg); 2673 2674 for (x = 0; x < width; x += 8) { 2675 READI444(src_y, src_u, src_v, src0, src1, src2); 2676 vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); 2677 reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); 2678 reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); 2679 reg0 *= vec_yg; 2680 reg1 *= vec_yg; 2681 reg0 = __msa_srai_w(reg0, 16); 2682 reg1 = __msa_srai_w(reg1, 16); 2683 reg4 = reg0 + vec_br; 2684 reg5 = reg1 + vec_br; 2685 reg2 = reg0 + 
vec_bg; 2686 reg3 = reg1 + vec_bg; 2687 reg0 += vec_bb; 2688 reg1 += vec_bb; 2689 vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); 2690 vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); 2691 reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); 2692 reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); 2693 reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); 2694 reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); 2695 reg0 -= reg6 * vec_ub; 2696 reg1 -= reg7 * vec_ub; 2697 reg2 -= reg6 * vec_ug; 2698 reg3 -= reg7 * vec_ug; 2699 reg4 -= reg8 * vec_vr; 2700 reg5 -= reg9 * vec_vr; 2701 reg2 -= reg8 * vec_vg; 2702 reg3 -= reg9 * vec_vg; 2703 reg0 = __msa_srai_w(reg0, 6); 2704 reg1 = __msa_srai_w(reg1, 6); 2705 reg2 = __msa_srai_w(reg2, 6); 2706 reg3 = __msa_srai_w(reg3, 6); 2707 reg4 = __msa_srai_w(reg4, 6); 2708 reg5 = __msa_srai_w(reg5, 6); 2709 CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); 2710 vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); 2711 vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); 2712 vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); 2713 vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); 2714 vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); 2715 dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); 2716 dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); 2717 ST_UB2(dst0, dst1, rgb_buf, 16); 2718 src_y += 8; 2719 src_u += 8; 2720 src_v += 8; 2721 rgb_buf += 32; 2722 } 2723 } 2724 2725 void I400ToARGBRow_MSA(const uint8* src_y, uint8* rgb_buf, int width) { 2726 int x; 2727 v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; 2728 v8i16 vec0, vec1; 2729 v4i32 reg0, reg1, reg2, reg3; 2730 v4i32 vec_yg = __msa_fill_w(0x4A35); 2731 v8i16 vec_ygb = __msa_fill_h(0xFB78); 2732 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 2733 v8i16 max = __msa_ldi_h(0xFF); 2734 v8i16 zero = {0}; 2735 2736 for (x = 0; x < width; x += 16) { 2737 src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0); 2738 vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); 2739 vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); 2740 reg0 = (v4i32)__msa_ilvr_h(zero, vec0); 2741 reg1 = (v4i32)__msa_ilvl_h(zero, vec0); 2742 reg2 = (v4i32)__msa_ilvr_h(zero, vec1); 2743 reg3 = (v4i32)__msa_ilvl_h(zero, vec1); 2744 reg0 *= vec_yg; 2745 reg1 *= vec_yg; 2746 reg2 *= vec_yg; 2747 reg3 *= vec_yg; 2748 reg0 = __msa_srai_w(reg0, 16); 2749 reg1 = __msa_srai_w(reg1, 16); 2750 reg2 = __msa_srai_w(reg2, 16); 2751 reg3 = __msa_srai_w(reg3, 16); 2752 vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); 2753 vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); 2754 vec0 += vec_ygb; 2755 vec1 += vec_ygb; 2756 vec0 = __msa_srai_h(vec0, 6); 2757 vec1 = __msa_srai_h(vec1, 6); 2758 vec0 = __msa_maxi_s_h(vec0, 0); 2759 vec1 = __msa_maxi_s_h(vec1, 0); 2760 vec0 = __msa_min_s_h(max, vec0); 2761 vec1 = __msa_min_s_h(max, vec1); 2762 res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 2763 res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); 2764 res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); 2765 res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); 2766 res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); 2767 dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); 2768 dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); 2769 dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); 2770 dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); 2771 ST_UB4(dst0, dst1, dst2, dst3, rgb_buf, 16); 2772 src_y += 16; 2773 rgb_buf += 64; 2774 } 2775 } 2776 2777 void 
J400ToARGBRow_MSA(const uint8* src_y, uint8* dst_argb, int width) { 2778 int x; 2779 v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; 2780 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 2781 2782 for (x = 0; x < width; x += 16) { 2783 src0 = (v16u8)__msa_ld_b((v16i8*)src_y, 0); 2784 vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); 2785 vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); 2786 vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); 2787 vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); 2788 dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); 2789 dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); 2790 dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); 2791 dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); 2792 ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); 2793 src_y += 16; 2794 dst_argb += 64; 2795 } 2796 } 2797 2798 void YUY2ToARGBRow_MSA(const uint8* src_yuy2, 2799 uint8* rgb_buf, 2800 const struct YuvConstants* yuvconstants, 2801 int width) { 2802 int x; 2803 v16u8 src0, src1, src2; 2804 v8i16 vec0, vec1, vec2; 2805 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 2806 v4i32 vec_ubvr, vec_ugvg; 2807 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 2808 2809 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 2810 vec_br, vec_yg); 2811 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); 2812 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); 2813 2814 for (x = 0; x < width; x += 8) { 2815 src0 = (v16u8)__msa_ld_b((v16i8*)src_yuy2, 0); 2816 src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); 2817 src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); 2818 YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, 2819 vec0, vec1, vec2); 2820 STOREARGB(vec0, vec1, vec2, alpha, rgb_buf); 2821 src_yuy2 += 16; 2822 rgb_buf += 32; 2823 } 2824 } 2825 2826 void UYVYToARGBRow_MSA(const uint8* src_uyvy, 2827 uint8* rgb_buf, 2828 const struct YuvConstants* yuvconstants, 2829 int width) { 2830 int x; 2831 v16u8 src0, src1, src2; 2832 v8i16 vec0, vec1, vec2; 2833 v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; 2834 v4i32 vec_ubvr, vec_ugvg; 2835 v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); 2836 2837 YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, 2838 vec_br, vec_yg); 2839 vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); 2840 vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); 2841 2842 for (x = 0; x < width; x += 8) { 2843 src0 = (v16u8)__msa_ld_b((v16i8*)src_uyvy, 0); 2844 src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); 2845 src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); 2846 YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, 2847 vec0, vec1, vec2); 2848 STOREARGB(vec0, vec1, vec2, alpha, rgb_buf); 2849 src_uyvy += 16; 2850 rgb_buf += 32; 2851 } 2852 } 2853 2854 void InterpolateRow_MSA(uint8* dst_ptr, 2855 const uint8* src_ptr, 2856 ptrdiff_t src_stride, 2857 int width, 2858 int32 source_y_fraction) { 2859 int32 y1_fraction = source_y_fraction; 2860 int32 y0_fraction = 256 - y1_fraction; 2861 uint16 y_fractions; 2862 const uint8* s = src_ptr; 2863 const uint8* t = src_ptr + src_stride; 2864 int x; 2865 v16u8 src0, src1, src2, src3, dst0, dst1; 2866 v8u16 vec0, vec1, vec2, vec3, y_frac; 2867 2868 if (0 == y1_fraction) { 2869 memcpy(dst_ptr, src_ptr, width); 2870 return; 2871 } 2872 2873 if (128 == y1_fraction) { 2874 for (x = 0; x < width; x += 32) { 2875 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); 2876 src1 
= (v16u8)__msa_ld_b((v16i8*)s, 16); 2877 src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); 2878 src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); 2879 dst0 = __msa_aver_u_b(src0, src2); 2880 dst1 = __msa_aver_u_b(src1, src3); 2881 ST_UB2(dst0, dst1, dst_ptr, 16); 2882 s += 32; 2883 t += 32; 2884 dst_ptr += 32; 2885 } 2886 return; 2887 } 2888 2889 y_fractions = (uint16)(y0_fraction + (y1_fraction << 8)); 2890 y_frac = (v8u16)__msa_fill_h(y_fractions); 2891 2892 for (x = 0; x < width; x += 32) { 2893 src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); 2894 src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); 2895 src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); 2896 src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); 2897 vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); 2898 vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); 2899 vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); 2900 vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); 2901 vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); 2902 vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); 2903 vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); 2904 vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); 2905 vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); 2906 vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); 2907 vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); 2908 vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); 2909 dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); 2910 dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); 2911 ST_UB2(dst0, dst1, dst_ptr, 16); 2912 s += 32; 2913 t += 32; 2914 dst_ptr += 32; 2915 } 2916 } 2917 2918 void ARGBSetRow_MSA(uint8* dst_argb, uint32 v32, int width) { 2919 int x; 2920 v16u8 dst0 = (v16u8)__msa_fill_w(v32); 2921 2922 for (x = 0; x < width; x += 4) { 2923 ST_UB(dst0, dst_argb); 2924 dst_argb += 16; 2925 } 2926 } 2927 2928 void RAWToRGB24Row_MSA(const uint8* src_raw, uint8* dst_rgb24, int width) { 2929 int x; 2930 v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; 2931 v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; 2932 v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, 2933 18, 17, 16, 21, 20, 19, 24, 23}; 2934 v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, 2935 24, 23, 28, 27, 26, 31, 30, 29}; 2936 2937 for (x = 0; x < width; x += 16) { 2938 src0 = (v16u8)__msa_ld_b((v16i8*)src_raw, 0); 2939 src1 = (v16u8)__msa_ld_b((v16i8*)src_raw, 16); 2940 src2 = (v16u8)__msa_ld_b((v16i8*)src_raw, 32); 2941 src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); 2942 src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); 2943 dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); 2944 dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); 2945 dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); 2946 ST_UB2(dst0, dst1, dst_rgb24, 16); 2947 ST_UB(dst2, (dst_rgb24 + 32)); 2948 src_raw += 48; 2949 dst_rgb24 += 48; 2950 } 2951 } 2952 2953 void MergeUVRow_MSA(const uint8* src_u, 2954 const uint8* src_v, 2955 uint8* dst_uv, 2956 int width) { 2957 int x; 2958 v16u8 src0, src1, dst0, dst1; 2959 2960 for (x = 0; x < width; x += 16) { 2961 src0 = (v16u8)__msa_ld_b((v16i8*)src_u, 0); 2962 src1 = (v16u8)__msa_ld_b((v16i8*)src_v, 0); 2963 dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); 2964 dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); 2965 ST_UB2(dst0, dst1, dst_uv, 16); 2966 src_u += 16; 2967 src_v += 16; 2968 dst_uv += 32; 2969 } 2970 } 2971 2972 #ifdef __cplusplus 2973 } // extern "C" 2974 } // namespace libyuv 2975 #endif 2976 2977 #endif // !defined(LIBYUV_DISABLE_MSA) 
&& defined(__mips_msa) 2978