1 // Copyright 2016 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // MSA variant of methods for lossless decoder 11 // 12 // Author: Prashant Patil (prashant.patil (at) imgtec.com) 13 14 #include "src/dsp/dsp.h" 15 16 #if defined(WEBP_USE_MSA) 17 18 #include "src/dsp/lossless.h" 19 #include "src/dsp/msa_macro.h" 20 21 //------------------------------------------------------------------------------ 22 // Colorspace conversion functions 23 24 #define CONVERT16_BGRA_XXX(psrc, pdst, m0, m1, m2) do { \ 25 v16u8 src0, src1, src2, src3, dst0, dst1, dst2; \ 26 LD_UB4(psrc, 16, src0, src1, src2, src3); \ 27 VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \ 28 dst2 = VSHF_UB(src2, src3, m2); \ 29 ST_UB2(dst0, dst1, pdst, 16); \ 30 ST_UB(dst2, pdst + 32); \ 31 } while (0) 32 33 #define CONVERT12_BGRA_XXX(psrc, pdst, m0, m1, m2) do { \ 34 uint32_t pix_w; \ 35 v16u8 src0, src1, src2, dst0, dst1, dst2; \ 36 LD_UB3(psrc, 16, src0, src1, src2); \ 37 VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \ 38 dst2 = VSHF_UB(src2, src2, m2); \ 39 ST_UB2(dst0, dst1, pdst, 16); \ 40 pix_w = __msa_copy_s_w((v4i32)dst2, 0); \ 41 SW(pix_w, pdst + 32); \ 42 } while (0) 43 44 #define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do { \ 45 uint64_t pix_d; \ 46 v16u8 src0, src1, src2 = { 0 }, dst0, dst1; \ 47 LD_UB2(psrc, 16, src0, src1); \ 48 VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \ 49 ST_UB(dst0, pdst); \ 50 pix_d = __msa_copy_s_d((v2i64)dst1, 0); \ 51 SD(pix_d, pdst + 16); \ 52 } while (0) 53 54 #define CONVERT4_BGRA_XXX(psrc, pdst, m) do { \ 55 const v16u8 src0 = LD_UB(psrc); \ 56 const v16u8 dst0 = VSHF_UB(src0, src0, m); \ 57 uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); \ 58 uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2); \ 59 SD(pix_d, pdst + 0); \ 60 SW(pix_w, pdst + 8); \ 61 } while (0) 62 63 #define CONVERT1_BGRA_BGR(psrc, pdst) do { \ 64 const int32_t b = (psrc)[0]; \ 65 const int32_t g = (psrc)[1]; \ 66 const int32_t r = (psrc)[2]; \ 67 (pdst)[0] = b; \ 68 (pdst)[1] = g; \ 69 (pdst)[2] = r; \ 70 } while (0) 71 72 #define CONVERT1_BGRA_RGB(psrc, pdst) do { \ 73 const int32_t b = (psrc)[0]; \ 74 const int32_t g = (psrc)[1]; \ 75 const int32_t r = (psrc)[2]; \ 76 (pdst)[0] = r; \ 77 (pdst)[1] = g; \ 78 (pdst)[2] = b; \ 79 } while (0) 80 81 #define TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, \ 82 c0, c1, mask0, mask1) do { \ 83 v8i16 g0, g1, t0, t1, t2, t3; \ 84 v4i32 t4, t5; \ 85 VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1); \ 86 DOTP_SB2_SH(g0, g1, c0, c0, t0, t1); \ 87 SRAI_H2_SH(t0, t1, 5); \ 88 t0 = __msa_addv_h(t0, (v8i16)src0); \ 89 t1 = __msa_addv_h(t1, (v8i16)src1); \ 90 t4 = __msa_srli_w((v4i32)t0, 16); \ 91 t5 = __msa_srli_w((v4i32)t1, 16); \ 92 DOTP_SB2_SH(t4, t5, c1, c1, t2, t3); \ 93 SRAI_H2_SH(t2, t3, 5); \ 94 ADD2(t0, t2, t1, t3, t0, t1); \ 95 VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1); \ 96 } while (0) 97 98 #define TRANSFORM_COLOR_INVERSE_4(src, dst, c0, c1, mask0, mask1) do { \ 99 const v16i8 g0 = VSHF_SB(src, src, mask0); \ 100 v8i16 t0 = __msa_dotp_s_h(c0, g0); \ 101 v8i16 t1; \ 102 v4i32 t2; \ 103 t0 = SRAI_H(t0, 5); \ 104 t0 = __msa_addv_h(t0, (v8i16)src); \ 105 t2 = __msa_srli_w((v4i32)t0, 16); \ 106 t1 = __msa_dotp_s_h(c1, (v16i8)t2); \ 107 t1 = SRAI_H(t1, 5); \ 108 t0 = t0 + t1; \ 109 dst = VSHF_UB(src, t0, mask1); \ 110 } while (0) 111 112 static void ConvertBGRAToRGBA_MSA(const uint32_t* src, 113 int num_pixels, uint8_t* dst) { 114 int i; 115 const uint8_t* ptemp_src = (const uint8_t*)src; 116 uint8_t* ptemp_dst = (uint8_t*)dst; 117 v16u8 src0, dst0; 118 const v16u8 mask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 }; 119 120 while (num_pixels >= 8) { 121 v16u8 src1, dst1; 122 LD_UB2(ptemp_src, 16, src0, src1); 123 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, dst0, dst1); 124 ST_UB2(dst0, dst1, ptemp_dst, 16); 125 ptemp_src += 32; 126 ptemp_dst += 32; 127 num_pixels -= 8; 128 } 129 if (num_pixels > 0) { 130 if (num_pixels >= 4) { 131 src0 = LD_UB(ptemp_src); 132 dst0 = VSHF_UB(src0, src0, mask); 133 ST_UB(dst0, ptemp_dst); 134 ptemp_src += 16; 135 ptemp_dst += 16; 136 num_pixels -= 4; 137 } 138 for (i = 0; i < num_pixels; i++) { 139 const uint8_t b = ptemp_src[2]; 140 const uint8_t g = ptemp_src[1]; 141 const uint8_t r = ptemp_src[0]; 142 const uint8_t a = ptemp_src[3]; 143 ptemp_dst[0] = b; 144 ptemp_dst[1] = g; 145 ptemp_dst[2] = r; 146 ptemp_dst[3] = a; 147 ptemp_src += 4; 148 ptemp_dst += 4; 149 } 150 } 151 } 152 153 static void ConvertBGRAToBGR_MSA(const uint32_t* src, 154 int num_pixels, uint8_t* dst) { 155 const uint8_t* ptemp_src = (const uint8_t*)src; 156 uint8_t* ptemp_dst = (uint8_t*)dst; 157 const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 158 16, 17, 18, 20 }; 159 const v16u8 mask1 = { 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20, 160 21, 22, 24, 25 }; 161 const v16u8 mask2 = { 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25, 162 26, 28, 29, 30 }; 163 164 while (num_pixels >= 16) { 165 CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2); 166 ptemp_src += 64; 167 ptemp_dst += 48; 168 num_pixels -= 16; 169 } 170 if (num_pixels > 0) { 171 if (num_pixels >= 12) { 172 CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2); 173 ptemp_src += 48; 174 ptemp_dst += 36; 175 num_pixels -= 12; 176 } else if (num_pixels >= 8) { 177 CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1); 178 ptemp_src += 32; 179 ptemp_dst += 24; 180 num_pixels -= 8; 181 } else if (num_pixels >= 4) { 182 CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0); 183 ptemp_src += 16; 184 ptemp_dst += 12; 185 num_pixels -= 4; 186 } 187 if (num_pixels == 3) { 188 CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0); 189 CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3); 190 CONVERT1_BGRA_BGR(ptemp_src + 8, ptemp_dst + 6); 191 } else if (num_pixels == 2) { 192 CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0); 193 CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3); 194 } else if (num_pixels == 1) { 195 CONVERT1_BGRA_BGR(ptemp_src, ptemp_dst); 196 } 197 } 198 } 199 200 static void ConvertBGRAToRGB_MSA(const uint32_t* src, 201 int num_pixels, uint8_t* dst) { 202 const uint8_t* ptemp_src = (const uint8_t*)src; 203 uint8_t* ptemp_dst = (uint8_t*)dst; 204 const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 205 18, 17, 16, 22 }; 206 const v16u8 mask1 = { 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22, 207 21, 20, 26, 25 }; 208 const v16u8 mask2 = { 8, 14, 13, 12, 18, 17, 16, 22, 21, 20, 26, 25, 209 24, 30, 29, 28 }; 210 211 while (num_pixels >= 16) { 212 CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2); 213 ptemp_src += 64; 214 ptemp_dst += 48; 215 num_pixels -= 16; 216 } 217 if (num_pixels) { 218 if (num_pixels >= 12) { 219 CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2); 220 ptemp_src += 48; 221 ptemp_dst += 36; 222 num_pixels -= 12; 223 } else if (num_pixels >= 8) { 224 CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1); 225 ptemp_src += 32; 226 ptemp_dst += 24; 227 num_pixels -= 8; 228 } else if (num_pixels >= 4) { 229 CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0); 230 ptemp_src += 16; 231 ptemp_dst += 12; 232 num_pixels -= 4; 233 } 234 if (num_pixels == 3) { 235 CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0); 236 CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3); 237 CONVERT1_BGRA_RGB(ptemp_src + 8, ptemp_dst + 6); 238 } else if (num_pixels == 2) { 239 CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0); 240 CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3); 241 } else if (num_pixels == 1) { 242 CONVERT1_BGRA_RGB(ptemp_src, ptemp_dst); 243 } 244 } 245 } 246 247 static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels, 248 uint32_t* dst) { 249 int i; 250 const uint8_t* in = (const uint8_t*)src; 251 uint8_t* out = (uint8_t*)dst; 252 v16u8 src0, dst0, tmp0; 253 const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 254 13, 255, 13, 255 }; 255 256 while (num_pixels >= 8) { 257 v16u8 src1, dst1, tmp1; 258 LD_UB2(in, 16, src0, src1); 259 VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1); 260 ADD2(src0, tmp0, src1, tmp1, dst0, dst1); 261 ST_UB2(dst0, dst1, out, 16); 262 in += 32; 263 out += 32; 264 num_pixels -= 8; 265 } 266 if (num_pixels > 0) { 267 if (num_pixels >= 4) { 268 src0 = LD_UB(in); 269 tmp0 = VSHF_UB(src0, src0, mask); 270 dst0 = src0 + tmp0; 271 ST_UB(dst0, out); 272 in += 16; 273 out += 16; 274 num_pixels -= 4; 275 } 276 for (i = 0; i < num_pixels; i++) { 277 const uint8_t b = in[0]; 278 const uint8_t g = in[1]; 279 const uint8_t r = in[2]; 280 out[0] = (b + g) & 0xff; 281 out[1] = g; 282 out[2] = (r + g) & 0xff; 283 out[4] = in[4]; 284 out += 4; 285 } 286 } 287 } 288 289 static void TransformColorInverse_MSA(const VP8LMultipliers* const m, 290 const uint32_t* src, int num_pixels, 291 uint32_t* dst) { 292 v16u8 src0, dst0; 293 const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ | 294 (m->green_to_red_ << 16)); 295 const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_); 296 const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 297 13, 255, 13, 255 }; 298 const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 299 28, 13, 30, 15 }; 300 301 while (num_pixels >= 8) { 302 v16u8 src1, dst1; 303 LD_UB2(src, 4, src0, src1); 304 TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1); 305 ST_UB2(dst0, dst1, dst, 4); 306 src += 8; 307 dst += 8; 308 num_pixels -= 8; 309 } 310 if (num_pixels > 0) { 311 if (num_pixels >= 4) { 312 src0 = LD_UB(src); 313 TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1); 314 ST_UB(dst0, dst); 315 src += 4; 316 dst += 4; 317 num_pixels -= 4; 318 } 319 if (num_pixels > 0) { 320 src0 = LD_UB(src); 321 TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1); 322 if (num_pixels == 3) { 323 const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); 324 const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2); 325 SD(pix_d, dst + 0); 326 SW(pix_w, dst + 2); 327 } else if (num_pixels == 2) { 328 const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); 329 SD(pix_d, dst); 330 } else { 331 const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0); 332 SW(pix_w, dst); 333 } 334 } 335 } 336 } 337 338 //------------------------------------------------------------------------------ 339 // Entry point 340 341 extern void VP8LDspInitMSA(void); 342 343 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) { 344 VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA; 345 VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA; 346 VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA; 347 348 VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA; 349 VP8LTransformColorInverse = TransformColorInverse_MSA; 350 } 351 352 #else // !WEBP_USE_MSA 353 354 WEBP_DSP_INIT_STUB(VP8LDspInitMSA) 355 356 #endif // WEBP_USE_MSA 357