1 // Copyright 2016 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // MSA version of rescaling functions 11 // 12 // Author: Prashant Patil (prashant.patil (at) imgtec.com) 13 14 #include "src/dsp/dsp.h" 15 16 #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE) 17 18 #include <assert.h> 19 20 #include "src/utils/rescaler_utils.h" 21 #include "src/dsp/msa_macro.h" 22 23 #define ROUNDER (WEBP_RESCALER_ONE >> 1) 24 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) 25 26 #define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \ 27 v4u32 tmp0, tmp1, tmp2, tmp3; \ 28 v16u8 t0, t1, t2, t3, t4, t5; \ 29 v2u64 out0, out1, out2, out3; \ 30 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 31 ILVRL_W2_UW(zero, in1, tmp2, tmp3); \ 32 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 33 DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \ 34 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 35 PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \ 36 ILVRL_W2_UW(zero, in2, tmp0, tmp1); \ 37 ILVRL_W2_UW(zero, in3, tmp2, tmp3); \ 38 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 39 DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \ 40 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 41 PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \ 42 PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \ 43 dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \ 44 } while (0) 45 46 #define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \ 47 v4u32 tmp0, tmp1; \ 48 v16i8 t0, t1; \ 49 v2u64 out0, out1; \ 50 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 51 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 52 SRAR_D2_UD(out0, out1, shift); \ 53 t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \ 54 t1 = __msa_pckev_b(t0, t0); \ 55 t0 = __msa_pckev_b(t1, t1); \ 56 dst = __msa_copy_s_w((v4i32)t0, 0); \ 57 } while (0) 58 59 #define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \ 60 dst0, dst1, dst2, dst3) do { \ 61 v4u32 tmp0, tmp1, tmp2, tmp3; \ 62 v2u64 out0, out1, out2, out3; \ 63 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 64 ILVRL_W2_UW(zero, in1, tmp2, tmp3); \ 65 DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \ 66 DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \ 67 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 68 PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \ 69 ILVRL_W2_UW(zero, in2, tmp0, tmp1); \ 70 ILVRL_W2_UW(zero, in3, tmp2, tmp3); \ 71 DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \ 72 DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \ 73 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 74 PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \ 75 } while (0) 76 77 #define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \ 78 v4u32 tmp0, tmp1; \ 79 v2u64 out0, out1; \ 80 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 81 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 82 SRAR_D2_UD(out0, out1, shift); \ 83 dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \ 84 } while (0) 85 86 #define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \ 87 dst0, dst1) do { \ 88 v4u32 tmp0, tmp1, tmp2, tmp3; \ 89 v2u64 out0, out1, out2, out3; \ 90 ILVRL_W2_UW(in0, in2, tmp0, tmp1); \ 91 ILVRL_W2_UW(in1, in3, tmp2, tmp3); \ 92 DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \ 93 DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \ 94 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 95 DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \ 96 DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \ 97 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 98 PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \ 99 } while (0) 100 101 #define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \ 102 v4u32 tmp0, tmp1; \ 103 v2u64 out0, out1; \ 104 v16i8 t0, t1; \ 105 ILVRL_W2_UW(in0, in1, tmp0, tmp1); \ 106 DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \ 107 SRAR_D2_UD(out0, out1, shift); \ 108 DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \ 109 SRAR_D2_UD(out0, out1, shift); \ 110 t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \ 111 t1 = __msa_pckev_b(t0, t0); \ 112 t0 = __msa_pckev_b(t1, t1); \ 113 dst = __msa_copy_s_w((v4i32)t0, 0); \ 114 } while (0) 115 116 static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst, 117 int length, 118 WebPRescaler* const wrk) { 119 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale); 120 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 121 const v4i32 zero = { 0 }; 122 123 while (length >= 16) { 124 v4u32 src0, src1, src2, src3; 125 v16u8 out; 126 LD_UW4(frow, 4, src0, src1, src2, src3); 127 CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out); 128 ST_UB(out, dst); 129 length -= 16; 130 frow += 16; 131 dst += 16; 132 } 133 if (length > 0) { 134 int x_out; 135 if (length >= 12) { 136 uint32_t val0_m, val1_m, val2_m; 137 v4u32 src0, src1, src2; 138 LD_UW3(frow, 4, src0, src1, src2); 139 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 140 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 141 CALC_MULT_FIX_4(src2, scale, shift, val2_m); 142 SW3(val0_m, val1_m, val2_m, dst, 4); 143 length -= 12; 144 frow += 12; 145 dst += 12; 146 } else if (length >= 8) { 147 uint32_t val0_m, val1_m; 148 v4u32 src0, src1; 149 LD_UW2(frow, 4, src0, src1); 150 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 151 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 152 SW2(val0_m, val1_m, dst, 4); 153 length -= 8; 154 frow += 8; 155 dst += 8; 156 } else if (length >= 4) { 157 uint32_t val0_m; 158 const v4u32 src0 = LD_UW(frow); 159 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 160 SW(val0_m, dst); 161 length -= 4; 162 frow += 4; 163 dst += 4; 164 } 165 for (x_out = 0; x_out < length; ++x_out) { 166 const uint32_t J = frow[x_out]; 167 const int v = (int)MULT_FIX(J, wrk->fy_scale); 168 assert(v >= 0 && v <= 255); 169 dst[x_out] = v; 170 } 171 } 172 } 173 174 static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow, 175 uint8_t* dst, int length, 176 WebPRescaler* const wrk) { 177 const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub); 178 const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B); 179 const v4i32 B1 = __msa_fill_w(B); 180 const v4i32 A1 = __msa_fill_w(A); 181 const v4i32 AB = __msa_ilvr_w(A1, B1); 182 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale); 183 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 184 185 while (length >= 16) { 186 v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3; 187 v16u8 t0, t1, t2, t3, t4, t5; 188 LD_UW4(frow, 4, frow0, frow1, frow2, frow3); 189 LD_UW4(irow, 4, irow0, irow1, irow2, irow3); 190 CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1); 191 CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3); 192 PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); 193 t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); 194 ST_UB(t0, dst); 195 frow += 16; 196 irow += 16; 197 dst += 16; 198 length -= 16; 199 } 200 if (length > 0) { 201 int x_out; 202 if (length >= 12) { 203 uint32_t val0_m, val1_m, val2_m; 204 v4u32 frow0, frow1, frow2, irow0, irow1, irow2; 205 LD_UW3(frow, 4, frow0, frow1, frow2); 206 LD_UW3(irow, 4, irow0, irow1, irow2); 207 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); 208 CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m); 209 CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m); 210 SW3(val0_m, val1_m, val2_m, dst, 4); 211 frow += 12; 212 irow += 12; 213 dst += 12; 214 length -= 12; 215 } else if (length >= 8) { 216 uint32_t val0_m, val1_m; 217 v4u32 frow0, frow1, irow0, irow1; 218 LD_UW2(frow, 4, frow0, frow1); 219 LD_UW2(irow, 4, irow0, irow1); 220 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); 221 CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m); 222 SW2(val0_m, val1_m, dst, 4); 223 frow += 4; 224 irow += 4; 225 dst += 4; 226 length -= 4; 227 } else if (length >= 4) { 228 uint32_t val0_m; 229 const v4u32 frow0 = LD_UW(frow + 0); 230 const v4u32 irow0 = LD_UW(irow + 0); 231 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); 232 SW(val0_m, dst); 233 frow += 4; 234 irow += 4; 235 dst += 4; 236 length -= 4; 237 } 238 for (x_out = 0; x_out < length; ++x_out) { 239 const uint64_t I = (uint64_t)A * frow[x_out] 240 + (uint64_t)B * irow[x_out]; 241 const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX); 242 const int v = (int)MULT_FIX(J, wrk->fy_scale); 243 assert(v >= 0 && v <= 255); 244 dst[x_out] = v; 245 } 246 } 247 } 248 249 static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { 250 uint8_t* dst = wrk->dst; 251 rescaler_t* irow = wrk->irow; 252 const int x_out_max = wrk->dst_width * wrk->num_channels; 253 const rescaler_t* frow = wrk->frow; 254 assert(!WebPRescalerOutputDone(wrk)); 255 assert(wrk->y_accum <= 0); 256 assert(wrk->y_expand); 257 assert(wrk->y_sub != 0); 258 if (wrk->y_accum == 0) { 259 ExportRowExpand_0(frow, dst, x_out_max, wrk); 260 } else { 261 ExportRowExpand_1(frow, irow, dst, x_out_max, wrk); 262 } 263 } 264 265 static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, 266 uint8_t* dst, int length, 267 const uint32_t yscale, 268 WebPRescaler* const wrk) { 269 const v4u32 y_scale = (v4u32)__msa_fill_w(yscale); 270 const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale); 271 const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 272 const v4i32 zero = { 0 }; 273 274 while (length >= 16) { 275 v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3; 276 v16u8 out; 277 LD_UW4(frow, 4, src0, src1, src2, src3); 278 CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval, 279 frac0, frac1, frac2, frac3); 280 LD_UW4(irow, 4, src0, src1, src2, src3); 281 SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3, 282 src0, src1, src2, src3); 283 CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out); 284 ST_UB(out, dst); 285 ST_UW4(frac0, frac1, frac2, frac3, irow, 4); 286 frow += 16; 287 irow += 16; 288 dst += 16; 289 length -= 16; 290 } 291 if (length > 0) { 292 int x_out; 293 if (length >= 12) { 294 uint32_t val0_m, val1_m, val2_m; 295 v4u32 src0, src1, src2, frac0, frac1, frac2; 296 LD_UW3(frow, 4, src0, src1, src2); 297 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); 298 CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1); 299 CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2); 300 LD_UW3(irow, 4, src0, src1, src2); 301 SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2); 302 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); 303 CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m); 304 CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m); 305 SW3(val0_m, val1_m, val2_m, dst, 4); 306 ST_UW3(frac0, frac1, frac2, irow, 4); 307 frow += 12; 308 irow += 12; 309 dst += 12; 310 length -= 12; 311 } else if (length >= 8) { 312 uint32_t val0_m, val1_m; 313 v4u32 src0, src1, frac0, frac1; 314 LD_UW2(frow, 4, src0, src1); 315 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); 316 CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1); 317 LD_UW2(irow, 4, src0, src1); 318 SUB2(src0, frac0, src1, frac1, src0, src1); 319 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); 320 CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m); 321 SW2(val0_m, val1_m, dst, 4); 322 ST_UW2(frac0, frac1, irow, 4); 323 frow += 8; 324 irow += 8; 325 dst += 8; 326 length -= 8; 327 } else if (length >= 4) { 328 uint32_t val0_m; 329 v4u32 frac0; 330 v4u32 src0 = LD_UW(frow); 331 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); 332 src0 = LD_UW(irow); 333 src0 = src0 - frac0; 334 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); 335 SW(val0_m, dst); 336 ST_UW(frac0, irow); 337 frow += 4; 338 irow += 4; 339 dst += 4; 340 length -= 4; 341 } 342 for (x_out = 0; x_out < length; ++x_out) { 343 const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale); 344 const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); 345 assert(v >= 0 && v <= 255); 346 dst[x_out] = v; 347 irow[x_out] = frac; 348 } 349 } 350 } 351 352 static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst, 353 int length, 354 WebPRescaler* const wrk) { 355 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale); 356 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 357 const v4i32 zero = { 0 }; 358 359 while (length >= 16) { 360 v4u32 src0, src1, src2, src3; 361 v16u8 dst0; 362 LD_UW4(irow, 4, src0, src1, src2, src3); 363 CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0); 364 ST_UB(dst0, dst); 365 ST_SW4(zero, zero, zero, zero, irow, 4); 366 length -= 16; 367 irow += 16; 368 dst += 16; 369 } 370 if (length > 0) { 371 int x_out; 372 if (length >= 12) { 373 uint32_t val0_m, val1_m, val2_m; 374 v4u32 src0, src1, src2; 375 LD_UW3(irow, 4, src0, src1, src2); 376 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 377 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 378 CALC_MULT_FIX_4(src2, scale, shift, val2_m); 379 SW3(val0_m, val1_m, val2_m, dst, 4); 380 ST_SW3(zero, zero, zero, irow, 4); 381 length -= 12; 382 irow += 12; 383 dst += 12; 384 } else if (length >= 8) { 385 uint32_t val0_m, val1_m; 386 v4u32 src0, src1; 387 LD_UW2(irow, 4, src0, src1); 388 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 389 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 390 SW2(val0_m, val1_m, dst, 4); 391 ST_SW2(zero, zero, irow, 4); 392 length -= 8; 393 irow += 8; 394 dst += 8; 395 } else if (length >= 4) { 396 uint32_t val0_m; 397 const v4u32 src0 = LD_UW(irow + 0); 398 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 399 SW(val0_m, dst); 400 ST_SW(zero, irow); 401 length -= 4; 402 irow += 4; 403 dst += 4; 404 } 405 for (x_out = 0; x_out < length; ++x_out) { 406 const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale); 407 assert(v >= 0 && v <= 255); 408 dst[x_out] = v; 409 irow[x_out] = 0; 410 } 411 } 412 } 413 414 static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { 415 uint8_t* dst = wrk->dst; 416 rescaler_t* irow = wrk->irow; 417 const int x_out_max = wrk->dst_width * wrk->num_channels; 418 const rescaler_t* frow = wrk->frow; 419 const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum); 420 assert(!WebPRescalerOutputDone(wrk)); 421 assert(wrk->y_accum <= 0); 422 assert(!wrk->y_expand); 423 if (yscale) { 424 ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk); 425 } else { 426 ExportRowShrink_1(irow, dst, x_out_max, wrk); 427 } 428 } 429 430 //------------------------------------------------------------------------------ 431 // Entry point 432 433 extern void WebPRescalerDspInitMSA(void); 434 435 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) { 436 WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2; 437 WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2; 438 } 439 440 #else // !WEBP_USE_MSA 441 442 WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA) 443 444 #endif // WEBP_USE_MSA 445