/*
 *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "libyuv/scale_row.h"

// This module is for GCC MSA
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
#include "libyuv/macros_msa.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
                           ptrdiff_t src_stride,
                           uint8_t* dst_argb,
                           int dst_width) {
  int x;
  v16u8 src0, src1, dst0;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    ST_UB(dst0, dst_argb);
    src_argb += 32;
    dst_argb += 16;
  }
}

void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  int x;
  v16u8 src0, src1, vec0, vec1, dst0;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
    vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
    vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
    dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);
    ST_UB(dst0, dst_argb);
    src_argb += 32;
    dst_argb += 16;
  }
}

void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              uint8_t* dst_argb,
                              int dst_width) {
  int x;
  const uint8_t* s = src_argb;
  const uint8_t* t = src_argb + src_stride;
  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
  v8u16 reg0, reg1, reg2, reg3;
  v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};

  for (x = 0; x < dst_width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0);
    vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
    vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2);
    vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3);
    reg0 = __msa_hadd_u_h(vec0, vec0);
    reg1 = __msa_hadd_u_h(vec1, vec1);
    reg2 = __msa_hadd_u_h(vec2, vec2);
    reg3 = __msa_hadd_u_h(vec3, vec3);
    reg0 += reg2;
    reg1 += reg3;
    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);
    reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
    ST_UB(dst0, dst_argb);
    s += 32;
    t += 32;
    dst_argb += 16;
  }
}

void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              int32_t src_stepx,
                              uint8_t* dst_argb,
                              int dst_width) {
  int x;
  int32_t stepx = src_stepx * 4;
  int32_t data0, data1, data2, data3;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 4) {
    data0 = LW(src_argb);
    data1 = LW(src_argb + stepx);
    data2 = LW(src_argb + stepx * 2);
    data3 = LW(src_argb + stepx * 3);
    SW(data0, dst_argb);
    SW(data1, dst_argb + 4);
    SW(data2, dst_argb + 8);
    SW(data3, dst_argb + 12);
    src_argb += stepx * 4;
    dst_argb += 16;
  }
}
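
// Illustrative scalar sketch (an assumption-labeled reference, not part of the
// MSA path and not compiled): ScaleARGBRowDownEven above copies every
// src_stepx-th ARGB pixel as a 32-bit unit via LW/SW. The hypothetical helper
// below spells out the same per-channel copy.
#if 0
static void ScaleARGBRowDownEven_Sketch(const uint8_t* src_argb,
                                        int src_stepx,
                                        uint8_t* dst_argb,
                                        int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A bytes of one sampled pixel.
      dst_argb[x * 4 + c] = src_argb[x * src_stepx * 4 + c];
    }
  }
}
#endif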

void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 int src_stepx,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  int x;
  const uint8_t* nxt_argb = src_argb + src_stride;
  int32_t stepx = src_stepx * 4;
  int64_t data0, data1, data2, data3;
  v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
  v16u8 vec0, vec1, vec2, vec3;
  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
  v16u8 dst0;

  for (x = 0; x < dst_width; x += 4) {
    data0 = LD(src_argb);
    data1 = LD(src_argb + stepx);
    data2 = LD(src_argb + stepx * 2);
    data3 = LD(src_argb + stepx * 3);
    src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);
    src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);
    src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);
    src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);
    data0 = LD(nxt_argb);
    data1 = LD(nxt_argb + stepx);
    data2 = LD(nxt_argb + stepx * 2);
    data3 = LD(nxt_argb + stepx * 3);
    src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);
    src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);
    src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);
    src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);
    vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    reg0 = __msa_hadd_u_h(vec0, vec0);
    reg1 = __msa_hadd_u_h(vec1, vec1);
    reg2 = __msa_hadd_u_h(vec2, vec2);
    reg3 = __msa_hadd_u_h(vec3, vec3);
    reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);
    reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);
    reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);
    reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);
    reg4 += reg6;
    reg5 += reg7;
    reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
    reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
    ST_UB(dst0, dst_argb);
    src_argb += stepx * 4;
    nxt_argb += stepx * 4;
    dst_argb += 16;
  }
}

void ScaleRowDown2_MSA(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst,
                       int dst_width) {
  int x;
  v16u8 src0, src1, src2, src3, dst0, dst1;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    ST_UB2(dst0, dst1, dst, 16);
    src_ptr += 64;
    dst += 32;
  }
}

void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  int x;
  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
    dst0 = __msa_aver_u_b(vec1, vec0);
    dst1 = __msa_aver_u_b(vec3, vec2);
    ST_UB2(dst0, dst1, dst, 16);
    src_ptr += 64;
    dst += 32;
  }
}

void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  int x;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
  v8u16 vec0, vec1, vec2, vec3;

  for (x = 0; x < dst_width; x += 32) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
    vec0 = __msa_hadd_u_h(src0, src0);
    vec1 = __msa_hadd_u_h(src1, src1);
    vec2 = __msa_hadd_u_h(src2, src2);
    vec3 = __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
    ST_UB2(dst0, dst1, dst, 16);
    s += 64;
    t += 64;
    dst += 32;
  }
}
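
// Illustrative scalar sketch (an assumption-labeled reference, not part of the
// MSA path and not compiled): ScaleRowDown2Box above produces each output byte
// as the rounded average of a 2x2 block; hadd_u_h sums horizontal pairs, the
// two rows are added, and srari_h(..., 2) divides by 4 with rounding. The
// hypothetical helper below shows the same arithmetic per pixel.
#if 0
static void ScaleRowDown2Box_Sketch(const uint8_t* s,  // top source row
                                    const uint8_t* t,  // bottom source row
                                    uint8_t* dst,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}
#endif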

void ScaleRowDown4_MSA(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst,
                       int dst_width) {
  int x;
  v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
  (void)src_stride;

  for (x = 0; x < dst_width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
    dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst);
    src_ptr += 64;
    dst += 16;
  }
}

void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  int x;
  const uint8_t* s = src_ptr;
  const uint8_t* t0 = s + src_stride;
  const uint8_t* t1 = s + src_stride * 2;
  const uint8_t* t2 = s + src_stride * 3;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
  v8u16 vec0, vec1, vec2, vec3;
  v4u32 reg0, reg1, reg2, reg3;

  for (x = 0; x < dst_width; x += 16) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
    vec0 = __msa_hadd_u_h(src0, src0);
    vec1 = __msa_hadd_u_h(src1, src1);
    vec2 = __msa_hadd_u_h(src2, src2);
    vec3 = __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
    src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
    src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
    src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
    src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
    vec0 += __msa_hadd_u_h(src0, src0);
    vec1 += __msa_hadd_u_h(src1, src1);
    vec2 += __msa_hadd_u_h(src2, src2);
    vec3 += __msa_hadd_u_h(src3, src3);
    vec0 += __msa_hadd_u_h(src4, src4);
    vec1 += __msa_hadd_u_h(src5, src5);
    vec2 += __msa_hadd_u_h(src6, src6);
    vec3 += __msa_hadd_u_h(src7, src7);
    reg0 = __msa_hadd_u_w(vec0, vec0);
    reg1 = __msa_hadd_u_w(vec1, vec1);
    reg2 = __msa_hadd_u_w(vec2, vec2);
    reg3 = __msa_hadd_u_w(vec3, vec3);
    reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
    reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
    reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
    reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
    ST_UB(dst0, dst);
    s += 64;
    t0 += 64;
    t1 += 64;
    t2 += 64;
    dst += 16;
  }
}
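
// Illustrative scalar sketch (an assumption-labeled reference, not part of the
// MSA path and not compiled): ScaleRowDown4Box above reduces each 4x4 block of
// source bytes to one output byte, summing 16 samples and dividing by 16 with
// rounding (the final srari_w(..., 4)). The hypothetical helper below shows
// the same reduction per pixel.
#if 0
static void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8_t* dst,
                                    int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    uint32_t sum = 0;
    for (i = 0; i < 4; ++i) {    // 4 source rows.
      for (j = 0; j < 4; ++j) {  // 4 source columns.
        sum += src_ptr[i * src_stride + 4 * x + j];
      }
    }
    dst[x] = (uint8_t)((sum + 8) >> 4);  // Rounded average of 16 samples.
  }
}
#endif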

void ScaleRowDown38_MSA(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  int x, width;
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, vec0;
  v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
  (void)src_stride;

  assert(dst_width % 3 == 0);
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
    vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
    dst0 = __msa_copy_u_d((v2i64)vec0, 0);
    dst1 = __msa_copy_u_w((v4i32)vec0, 2);
    SD(dst0, dst);
    SW(dst1, dst + 8);
    src_ptr += 32;
    dst += 12;
  }
}

void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  int x, width;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, src2, src3, out;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
  v8i16 zero = {0};
  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
  v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);

  assert((dst_width % 3 == 0) && (dst_width > 0));
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
    vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
    vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
    vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    tmp0 = __msa_hadd_u_w(vec4, vec4);
    tmp1 = __msa_hadd_u_w(vec5, vec5);
    tmp2 = __msa_hadd_u_w(vec6, vec6);
    tmp3 = __msa_hadd_u_w(vec7, vec7);
    tmp4 = __msa_hadd_u_w(vec0, vec0);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    tmp0 = __msa_hadd_u_w(vec0, vec0);
    tmp1 = __msa_hadd_u_w(vec1, vec1);
    tmp0 *= const_0x2AAA;
    tmp1 *= const_0x2AAA;
    tmp4 *= const_0x4000;
    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
    dst0 = __msa_copy_u_d((v2i64)out, 0);
    dst1 = __msa_copy_u_w((v4i32)out, 2);
    SD(dst0, dst_ptr);
    SW(dst1, dst_ptr + 8);
    s += 32;
    t += 32;
    dst_ptr += 12;
  }
}
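
// Illustrative scalar sketch (an assumption-labeled reference, not part of the
// MSA path and not compiled): ScaleRowDown38_2_Box above maps each group of 8
// source bytes from two rows to 3 output bytes. The first two outputs average
// a 3x2 block (6 samples, scaled by 0x2AAA ~= 65536/6 then shifted right by
// 16) and the third averages the remaining 2x2 block (4 samples, scaled by
// 0x4000 = 65536/4). One output group could be written as below.
#if 0
static void ScaleRowDown38_2_Box_Group_Sketch(const uint8_t* s,  // top row
                                              const uint8_t* t,  // bottom row
                                              uint8_t* d) {
  d[0] = (uint8_t)(((s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) * 0x2AAA) >> 16);
  d[1] = (uint8_t)(((s[3] + s[4] + s[5] + t[3] + t[4] + t[5]) * 0x2AAA) >> 16);
  d[2] = (uint8_t)(((s[6] + s[7] + t[6] + t[7]) * 0x4000) >> 16);
}
#endif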

void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  int x, width;
  const uint8_t* s = src_ptr;
  const uint8_t* t0 = s + src_stride;
  const uint8_t* t1 = s + src_stride * 2;
  uint64_t dst0;
  uint32_t dst1;
  v16u8 src0, src1, src2, src3, src4, src5, out;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
  v8u16 zero = {0};
  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
  v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);

  assert((dst_width % 3 == 0) && (dst_width > 0));
  width = dst_width / 3;

  for (x = 0; x < width; x += 4) {
    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
    src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
    src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
    src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
    src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
    vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
    vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
    vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
    vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
    vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
    vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
    vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
    vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
    vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
    vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
    vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
    vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
    tmp0 = __msa_hadd_u_w(vec4, vec4);
    tmp1 = __msa_hadd_u_w(vec5, vec5);
    tmp2 = __msa_hadd_u_w(vec6, vec6);
    tmp3 = __msa_hadd_u_w(vec7, vec7);
    tmp4 = __msa_hadd_u_w(vec0, vec0);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
    tmp0 = __msa_hadd_u_w(vec0, vec0);
    tmp1 = __msa_hadd_u_w(vec1, vec1);
    tmp0 *= const_0x1C71;
    tmp1 *= const_0x1C71;
    tmp4 *= const_0x2AAA;
    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
    dst0 = __msa_copy_u_d((v2i64)out, 0);
    dst1 = __msa_copy_u_w((v4i32)out, 2);
    SD(dst0, dst_ptr);
    SW(dst1, dst_ptr + 8);
    s += 32;
    t0 += 32;
    t1 += 32;
    dst_ptr += 12;
  }
}

void ScaleAddRow_MSA(const uint8_t* src_ptr,
                     uint16_t* dst_ptr,
                     int src_width) {
  int x;
  v16u8 src0;
  v8u16 dst0, dst1;
  v16i8 zero = {0};

  assert(src_width > 0);

  for (x = 0; x < src_width; x += 16) {
    src0 = LD_UB(src_ptr);
    dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
    dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
    dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
    dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
    ST_UH2(dst0, dst1, dst_ptr, 8);
    src_ptr += 16;
    dst_ptr += 16;
  }
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)