1 // Copyright 2016 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // MSA version of rescaling functions 11 // 12 // Author: Prashant Patil (prashant.patil (at) imgtec.com) 13 14 #include "src/dsp/dsp.h" 15 16 #if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE) 17 18 #include <assert.h> 19 20 #include "src/utils/rescaler_utils.h" 21 #include "src/dsp/msa_macro.h" 22 23 #define ROUNDER (WEBP_RESCALER_ONE >> 1) 24 #define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX) 25 #define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX) 26 27 #define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \ 28 v4u32 tmp0, tmp1, tmp2, tmp3; \ 29 v16u8 t0, t1, t2, t3, t4, t5; \ 30 v2u64 out0, out1, out2, out3; \ 31 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 32 ILVRL_W2_UW(zero, in1, tmp2, tmp3); \ 33 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 34 DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \ 35 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 36 PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \ 37 ILVRL_W2_UW(zero, in2, tmp0, tmp1); \ 38 ILVRL_W2_UW(zero, in3, tmp2, tmp3); \ 39 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 40 DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \ 41 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 42 PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \ 43 PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \ 44 dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \ 45 } while (0) 46 47 #define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \ 48 v4u32 tmp0, tmp1; \ 49 v16i8 t0, t1; \ 50 v2u64 out0, out1; \ 51 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 52 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 53 SRAR_D2_UD(out0, out1, shift); \ 54 t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \ 55 t1 = __msa_pckev_b(t0, t0); \ 56 t0 = __msa_pckev_b(t1, t1); \ 57 dst = __msa_copy_s_w((v4i32)t0, 0); \ 58 } while (0) 59 60 #define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \ 61 dst0, dst1, dst2, dst3) do { \ 62 v4u32 tmp0, tmp1, tmp2, tmp3; \ 63 v2u64 out0, out1, out2, out3; \ 64 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 65 ILVRL_W2_UW(zero, in1, tmp2, tmp3); \ 66 DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \ 67 DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \ 68 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 69 PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \ 70 ILVRL_W2_UW(zero, in2, tmp0, tmp1); \ 71 ILVRL_W2_UW(zero, in3, tmp2, tmp3); \ 72 DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \ 73 DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \ 74 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 75 PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \ 76 } while (0) 77 78 #define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \ 79 v4u32 tmp0, tmp1; \ 80 v2u64 out0, out1; \ 81 ILVRL_W2_UW(zero, in0, tmp0, tmp1); \ 82 DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \ 83 SRAR_D2_UD(out0, out1, shift); \ 84 dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \ 85 } while (0) 86 87 #define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \ 88 dst0, dst1) do { \ 89 v4u32 tmp0, tmp1, tmp2, tmp3; \ 90 v2u64 out0, out1, out2, out3; \ 91 ILVRL_W2_UW(in0, in2, tmp0, tmp1); \ 92 ILVRL_W2_UW(in1, in3, tmp2, tmp3); \ 93 DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \ 94 DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \ 95 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 96 DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \ 97 DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \ 98 SRAR_D4_UD(out0, out1, out2, out3, shift); \ 99 PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \ 100 } while (0) 101 102 #define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \ 103 v4u32 tmp0, tmp1; \ 104 v2u64 out0, out1; \ 105 v16i8 t0, t1; \ 106 ILVRL_W2_UW(in0, in1, tmp0, tmp1); \ 107 DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \ 108 SRAR_D2_UD(out0, out1, shift); \ 109 DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \ 110 SRAR_D2_UD(out0, out1, shift); \ 111 t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \ 112 t1 = __msa_pckev_b(t0, t0); \ 113 t0 = __msa_pckev_b(t1, t1); \ 114 dst = __msa_copy_s_w((v4i32)t0, 0); \ 115 } while (0) 116 117 static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst, 118 int length, 119 WebPRescaler* const wrk) { 120 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale); 121 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 122 const v4i32 zero = { 0 }; 123 124 while (length >= 16) { 125 v4u32 src0, src1, src2, src3; 126 v16u8 out; 127 LD_UW4(frow, 4, src0, src1, src2, src3); 128 CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out); 129 ST_UB(out, dst); 130 length -= 16; 131 frow += 16; 132 dst += 16; 133 } 134 if (length > 0) { 135 int x_out; 136 if (length >= 12) { 137 uint32_t val0_m, val1_m, val2_m; 138 v4u32 src0, src1, src2; 139 LD_UW3(frow, 4, src0, src1, src2); 140 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 141 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 142 CALC_MULT_FIX_4(src2, scale, shift, val2_m); 143 SW3(val0_m, val1_m, val2_m, dst, 4); 144 length -= 12; 145 frow += 12; 146 dst += 12; 147 } else if (length >= 8) { 148 uint32_t val0_m, val1_m; 149 v4u32 src0, src1; 150 LD_UW2(frow, 4, src0, src1); 151 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 152 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 153 SW2(val0_m, val1_m, dst, 4); 154 length -= 8; 155 frow += 8; 156 dst += 8; 157 } else if (length >= 4) { 158 uint32_t val0_m; 159 const v4u32 src0 = LD_UW(frow); 160 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 161 SW(val0_m, dst); 162 length -= 4; 163 frow += 4; 164 dst += 4; 165 } 166 for (x_out = 0; x_out < length; ++x_out) { 167 const uint32_t J = frow[x_out]; 168 const int v = (int)MULT_FIX(J, wrk->fy_scale); 169 dst[x_out] = (v > 255) ? 255u : (uint8_t)v; 170 } 171 } 172 } 173 174 static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow, 175 uint8_t* dst, int length, 176 WebPRescaler* const wrk) { 177 const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub); 178 const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B); 179 const v4i32 B1 = __msa_fill_w(B); 180 const v4i32 A1 = __msa_fill_w(A); 181 const v4i32 AB = __msa_ilvr_w(A1, B1); 182 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale); 183 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 184 185 while (length >= 16) { 186 v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3; 187 v16u8 t0, t1, t2, t3, t4, t5; 188 LD_UW4(frow, 4, frow0, frow1, frow2, frow3); 189 LD_UW4(irow, 4, irow0, irow1, irow2, irow3); 190 CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1); 191 CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3); 192 PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); 193 t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); 194 ST_UB(t0, dst); 195 frow += 16; 196 irow += 16; 197 dst += 16; 198 length -= 16; 199 } 200 if (length > 0) { 201 int x_out; 202 if (length >= 12) { 203 uint32_t val0_m, val1_m, val2_m; 204 v4u32 frow0, frow1, frow2, irow0, irow1, irow2; 205 LD_UW3(frow, 4, frow0, frow1, frow2); 206 LD_UW3(irow, 4, irow0, irow1, irow2); 207 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); 208 CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m); 209 CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m); 210 SW3(val0_m, val1_m, val2_m, dst, 4); 211 frow += 12; 212 irow += 12; 213 dst += 12; 214 length -= 12; 215 } else if (length >= 8) { 216 uint32_t val0_m, val1_m; 217 v4u32 frow0, frow1, irow0, irow1; 218 LD_UW2(frow, 4, frow0, frow1); 219 LD_UW2(irow, 4, irow0, irow1); 220 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); 221 CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m); 222 SW2(val0_m, val1_m, dst, 4); 223 frow += 4; 224 irow += 4; 225 dst += 4; 226 length -= 4; 227 } else if (length >= 4) { 228 uint32_t val0_m; 229 const v4u32 frow0 = LD_UW(frow + 0); 230 const v4u32 irow0 = LD_UW(irow + 0); 231 CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m); 232 SW(val0_m, dst); 233 frow += 4; 234 irow += 4; 235 dst += 4; 236 length -= 4; 237 } 238 for (x_out = 0; x_out < length; ++x_out) { 239 const uint64_t I = (uint64_t)A * frow[x_out] 240 + (uint64_t)B * irow[x_out]; 241 const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX); 242 const int v = (int)MULT_FIX(J, wrk->fy_scale); 243 dst[x_out] = (v > 255) ? 255u : (uint8_t)v; 244 } 245 } 246 } 247 248 static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) { 249 uint8_t* dst = wrk->dst; 250 rescaler_t* irow = wrk->irow; 251 const int x_out_max = wrk->dst_width * wrk->num_channels; 252 const rescaler_t* frow = wrk->frow; 253 assert(!WebPRescalerOutputDone(wrk)); 254 assert(wrk->y_accum <= 0); 255 assert(wrk->y_expand); 256 assert(wrk->y_sub != 0); 257 if (wrk->y_accum == 0) { 258 ExportRowExpand_0(frow, dst, x_out_max, wrk); 259 } else { 260 ExportRowExpand_1(frow, irow, dst, x_out_max, wrk); 261 } 262 } 263 264 #if 0 // disabled for now. TODO(skal): make match the C-code 265 static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow, 266 uint8_t* dst, int length, 267 const uint32_t yscale, 268 WebPRescaler* const wrk) { 269 const v4u32 y_scale = (v4u32)__msa_fill_w(yscale); 270 const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale); 271 const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 272 const v4i32 zero = { 0 }; 273 274 while (length >= 16) { 275 v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3; 276 v16u8 out; 277 LD_UW4(frow, 4, src0, src1, src2, src3); 278 CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval, 279 frac0, frac1, frac2, frac3); 280 LD_UW4(irow, 4, src0, src1, src2, src3); 281 SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3, 282 src0, src1, src2, src3); 283 CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out); 284 ST_UB(out, dst); 285 ST_UW4(frac0, frac1, frac2, frac3, irow, 4); 286 frow += 16; 287 irow += 16; 288 dst += 16; 289 length -= 16; 290 } 291 if (length > 0) { 292 int x_out; 293 if (length >= 12) { 294 uint32_t val0_m, val1_m, val2_m; 295 v4u32 src0, src1, src2, frac0, frac1, frac2; 296 LD_UW3(frow, 4, src0, src1, src2); 297 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); 298 CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1); 299 CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2); 300 LD_UW3(irow, 4, src0, src1, src2); 301 SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2); 302 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); 303 CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m); 304 CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m); 305 SW3(val0_m, val1_m, val2_m, dst, 4); 306 ST_UW3(frac0, frac1, frac2, irow, 4); 307 frow += 12; 308 irow += 12; 309 dst += 12; 310 length -= 12; 311 } else if (length >= 8) { 312 uint32_t val0_m, val1_m; 313 v4u32 src0, src1, frac0, frac1; 314 LD_UW2(frow, 4, src0, src1); 315 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); 316 CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1); 317 LD_UW2(irow, 4, src0, src1); 318 SUB2(src0, frac0, src1, frac1, src0, src1); 319 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); 320 CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m); 321 SW2(val0_m, val1_m, dst, 4); 322 ST_UW2(frac0, frac1, irow, 4); 323 frow += 8; 324 irow += 8; 325 dst += 8; 326 length -= 8; 327 } else if (length >= 4) { 328 uint32_t val0_m; 329 v4u32 frac0; 330 v4u32 src0 = LD_UW(frow); 331 CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0); 332 src0 = LD_UW(irow); 333 src0 = src0 - frac0; 334 CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m); 335 SW(val0_m, dst); 336 ST_UW(frac0, irow); 337 frow += 4; 338 irow += 4; 339 dst += 4; 340 length -= 4; 341 } 342 for (x_out = 0; x_out < length; ++x_out) { 343 const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale); 344 const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale); 345 dst[x_out] = (v > 255) ? 255u : (uint8_t)v; 346 irow[x_out] = frac; 347 } 348 } 349 } 350 351 static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst, 352 int length, 353 WebPRescaler* const wrk) { 354 const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale); 355 const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX); 356 const v4i32 zero = { 0 }; 357 358 while (length >= 16) { 359 v4u32 src0, src1, src2, src3; 360 v16u8 dst0; 361 LD_UW4(irow, 4, src0, src1, src2, src3); 362 CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0); 363 ST_UB(dst0, dst); 364 ST_SW4(zero, zero, zero, zero, irow, 4); 365 length -= 16; 366 irow += 16; 367 dst += 16; 368 } 369 if (length > 0) { 370 int x_out; 371 if (length >= 12) { 372 uint32_t val0_m, val1_m, val2_m; 373 v4u32 src0, src1, src2; 374 LD_UW3(irow, 4, src0, src1, src2); 375 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 376 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 377 CALC_MULT_FIX_4(src2, scale, shift, val2_m); 378 SW3(val0_m, val1_m, val2_m, dst, 4); 379 ST_SW3(zero, zero, zero, irow, 4); 380 length -= 12; 381 irow += 12; 382 dst += 12; 383 } else if (length >= 8) { 384 uint32_t val0_m, val1_m; 385 v4u32 src0, src1; 386 LD_UW2(irow, 4, src0, src1); 387 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 388 CALC_MULT_FIX_4(src1, scale, shift, val1_m); 389 SW2(val0_m, val1_m, dst, 4); 390 ST_SW2(zero, zero, irow, 4); 391 length -= 8; 392 irow += 8; 393 dst += 8; 394 } else if (length >= 4) { 395 uint32_t val0_m; 396 const v4u32 src0 = LD_UW(irow + 0); 397 CALC_MULT_FIX_4(src0, scale, shift, val0_m); 398 SW(val0_m, dst); 399 ST_SW(zero, irow); 400 length -= 4; 401 irow += 4; 402 dst += 4; 403 } 404 for (x_out = 0; x_out < length; ++x_out) { 405 const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale); 406 dst[x_out] = (v > 255) ? 255u : (uint8_t)v; 407 irow[x_out] = 0; 408 } 409 } 410 } 411 412 static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) { 413 uint8_t* dst = wrk->dst; 414 rescaler_t* irow = wrk->irow; 415 const int x_out_max = wrk->dst_width * wrk->num_channels; 416 const rescaler_t* frow = wrk->frow; 417 const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum); 418 assert(!WebPRescalerOutputDone(wrk)); 419 assert(wrk->y_accum <= 0); 420 assert(!wrk->y_expand); 421 if (yscale) { 422 ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk); 423 } else { 424 ExportRowShrink_1(irow, dst, x_out_max, wrk); 425 } 426 } 427 #endif // 0 428 429 //------------------------------------------------------------------------------ 430 // Entry point 431 432 extern void WebPRescalerDspInitMSA(void); 433 434 WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) { 435 WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2; 436 // WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2; 437 } 438 439 #else // !WEBP_USE_MSA 440 441 WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA) 442 443 #endif // WEBP_USE_MSA 444