1 /* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include <emmintrin.h> 9 #include "SkBitmapProcState_opts_SSE2.h" 10 #include "SkBlitRow_opts_SSE2.h" 11 #include "SkColorPriv.h" 12 #include "SkColor_opts_SSE2.h" 13 #include "SkDither.h" 14 #include "SkUtils.h" 15 16 /* SSE2 version of S32_Blend_BlitRow32() 17 * portable version is in core/SkBlitRow_D32.cpp 18 */ 19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 20 const SkPMColor* SK_RESTRICT src, 21 int count, U8CPU alpha) { 22 SkASSERT(alpha <= 255); 23 if (count <= 0) { 24 return; 25 } 26 27 uint32_t src_scale = SkAlpha255To256(alpha); 28 uint32_t dst_scale = 256 - src_scale; 29 30 if (count >= 4) { 31 SkASSERT(((size_t)dst & 0x03) == 0); 32 while (((size_t)dst & 0x0F) != 0) { 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 34 src++; 35 dst++; 36 count--; 37 } 38 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); 40 __m128i *d = reinterpret_cast<__m128i*>(dst); 41 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 42 __m128i ag_mask = _mm_set1_epi32(0xFF00FF00); 43 44 // Move scale factors to upper byte of word 45 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 46 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8); 47 while (count >= 4) { 48 // Load 4 pixels each of src and dest. 49 __m128i src_pixel = _mm_loadu_si128(s); 50 __m128i dst_pixel = _mm_load_si128(d); 51 52 // Interleave Atom port 0/1 operations based on the execution port 53 // constraints that multiply can only be executed on port 0 (while 54 // boolean operations can be executed on either port 0 or port 1) 55 // because GCC currently doesn't do a good job scheduling 56 // instructions based on these constraints. 57 58 // Get red and blue pixels into lower byte of each word. 
59 // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b) 60 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 61 62 // Multiply by scale. 63 // (4 x (0, rs.h, 0, bs.h)) 64 // where rs.h stands for the higher byte of r * scale, and 65 // bs.h the higher byte of b * scale. 66 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 67 68 // Get alpha and green pixels into higher byte of each word. 69 // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0) 70 __m128i src_ag = _mm_and_si128(ag_mask, src_pixel); 71 72 // Multiply by scale. 73 // (4 x (as.h, as.l, gs.h, gs.l)) 74 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 75 76 // Clear the lower byte of the a*scale and g*scale results 77 // (4 x (as.h, 0, gs.h, 0)) 78 src_ag = _mm_and_si128(src_ag, ag_mask); 79 80 // Operations the destination pixels are the same as on the 81 // source pixels. See the comments above. 82 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 83 dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide); 84 __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel); 85 dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide); 86 dst_ag = _mm_and_si128(dst_ag, ag_mask); 87 88 // Combine back into RGBA. 
89 // (4 x (as.h, rs.h, gs.h, bs.h)) 90 src_pixel = _mm_or_si128(src_rb, src_ag); 91 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 92 93 // Add result 94 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 95 _mm_store_si128(d, result); 96 s++; 97 d++; 98 count -= 4; 99 } 100 src = reinterpret_cast<const SkPMColor*>(s); 101 dst = reinterpret_cast<SkPMColor*>(d); 102 } 103 104 while (count > 0) { 105 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 106 src++; 107 dst++; 108 count--; 109 } 110 } 111 112 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 113 const SkPMColor* SK_RESTRICT src, 114 int count, U8CPU alpha) { 115 SkASSERT(alpha == 255); 116 if (count <= 0) { 117 return; 118 } 119 120 if (count >= 4) { 121 SkASSERT(((size_t)dst & 0x03) == 0); 122 while (((size_t)dst & 0x0F) != 0) { 123 *dst = SkPMSrcOver(*src, *dst); 124 src++; 125 dst++; 126 count--; 127 } 128 129 const __m128i *s = reinterpret_cast<const __m128i*>(src); 130 __m128i *d = reinterpret_cast<__m128i*>(dst); 131 #ifdef SK_USE_ACCURATE_BLENDING 132 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 133 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 134 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 135 while (count >= 4) { 136 // Load 4 pixels 137 __m128i src_pixel = _mm_loadu_si128(s); 138 __m128i dst_pixel = _mm_load_si128(d); 139 140 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 141 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 142 // Shift alphas down to lower 8 bits of each quad. 143 __m128i alpha = _mm_srli_epi32(src_pixel, 24); 144 145 // Copy alpha to upper 3rd byte of each quad 146 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 147 148 // Subtract alphas from 255, to get 0..255 149 alpha = _mm_sub_epi16(c_255, alpha); 150 151 // Multiply by red and blue by src alpha. 152 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 153 // Multiply by alpha and green by src alpha. 
154 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 155 156 // dst_rb_low = (dst_rb >> 8) 157 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 158 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 159 160 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 161 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 162 dst_rb = _mm_add_epi16(dst_rb, c_128); 163 dst_rb = _mm_srli_epi16(dst_rb, 8); 164 165 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 166 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 167 dst_ag = _mm_add_epi16(dst_ag, c_128); 168 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 169 170 // Combine back into RGBA. 171 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 172 173 // Add result 174 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 175 _mm_store_si128(d, result); 176 s++; 177 d++; 178 count -= 4; 179 } 180 #else 181 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 182 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 183 while (count >= 4) { 184 // Load 4 pixels 185 __m128i src_pixel = _mm_loadu_si128(s); 186 __m128i dst_pixel = _mm_load_si128(d); 187 188 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 189 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 190 191 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 192 __m128i alpha = _mm_srli_epi16(src_pixel, 8); 193 194 // (a0, a0, a1, a1, a2, g2, a3, g3) 195 alpha = _mm_shufflehi_epi16(alpha, 0xF5); 196 197 // (a0, a0, a1, a1, a2, a2, a3, a3) 198 alpha = _mm_shufflelo_epi16(alpha, 0xF5); 199 200 // Subtract alphas from 256, to get 1..256 201 alpha = _mm_sub_epi16(c_256, alpha); 202 203 // Multiply by red and blue by src alpha. 204 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 205 // Multiply by alpha and green by src alpha. 206 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 207 208 // Divide by 256. 209 dst_rb = _mm_srli_epi16(dst_rb, 8); 210 211 // Mask out high bits (already in the right place) 212 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 213 214 // Combine back into RGBA. 
215 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 216 217 // Add result 218 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 219 _mm_store_si128(d, result); 220 s++; 221 d++; 222 count -= 4; 223 } 224 #endif 225 src = reinterpret_cast<const SkPMColor*>(s); 226 dst = reinterpret_cast<SkPMColor*>(d); 227 } 228 229 while (count > 0) { 230 *dst = SkPMSrcOver(*src, *dst); 231 src++; 232 dst++; 233 count--; 234 } 235 } 236 237 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 238 const SkPMColor* SK_RESTRICT src, 239 int count, U8CPU alpha) { 240 SkASSERT(alpha <= 255); 241 if (count <= 0) { 242 return; 243 } 244 245 if (count >= 4) { 246 while (((size_t)dst & 0x0F) != 0) { 247 *dst = SkBlendARGB32(*src, *dst, alpha); 248 src++; 249 dst++; 250 count--; 251 } 252 253 uint32_t src_scale = SkAlpha255To256(alpha); 254 255 const __m128i *s = reinterpret_cast<const __m128i*>(src); 256 __m128i *d = reinterpret_cast<__m128i*>(dst); 257 __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8); 258 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 259 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 260 while (count >= 4) { 261 // Load 4 pixels each of src and dest. 262 __m128i src_pixel = _mm_loadu_si128(s); 263 __m128i dst_pixel = _mm_load_si128(d); 264 265 // Get red and blue pixels into lower byte of each word. 266 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 267 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 268 269 // Get alpha and green into lower byte of each word. 270 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 271 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 272 273 // Put per-pixel alpha in low byte of each word. 
274 // After the following two statements, the dst_alpha looks like 275 // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3) 276 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 277 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 278 279 // dst_alpha = dst_alpha * src_scale 280 // Because src_scales are in the higher byte of each word and 281 // we use mulhi here, the resulting alpha values are already 282 // in the right place and don't need to be divided by 256. 283 // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3) 284 dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide); 285 286 // Subtract alphas from 256, to get 1..256 287 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 288 289 // Multiply red and blue by dst pixel alpha. 290 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 291 // Multiply alpha and green by dst pixel alpha. 292 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 293 294 // Multiply red and blue by global alpha. 295 // (4 x (0, rs.h, 0, bs.h)) 296 // where rs.h stands for the higher byte of r * src_scale, 297 // and bs.h the higher byte of b * src_scale. 298 // Again, because we use mulhi, the resuling red and blue 299 // values are already in the right place and don't need to 300 // be divided by 256. 301 src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide); 302 // Multiply alpha and green by global alpha. 303 // (4 x (0, as.h, 0, gs.h)) 304 src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide); 305 306 // Divide by 256. 307 dst_rb = _mm_srli_epi16(dst_rb, 8); 308 309 // Mask out low bits (goodies already in the right place; no need to divide) 310 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 311 // Shift alpha and green to higher byte of each word. 312 // (4 x (as.h, 0, gs.h, 0)) 313 src_ag = _mm_slli_epi16(src_ag, 8); 314 315 // Combine back into RGBA. 316 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 317 src_pixel = _mm_or_si128(src_rb, src_ag); 318 319 // Add two pixels into result. 
320 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 321 _mm_store_si128(d, result); 322 s++; 323 d++; 324 count -= 4; 325 } 326 src = reinterpret_cast<const SkPMColor*>(s); 327 dst = reinterpret_cast<SkPMColor*>(d); 328 } 329 330 while (count > 0) { 331 *dst = SkBlendARGB32(*src, *dst, alpha); 332 src++; 333 dst++; 334 count--; 335 } 336 } 337 338 /* SSE2 version of Color32() 339 * portable version is in core/SkBlitRow_D32.cpp 340 */ 341 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 342 SkPMColor color) { 343 if (count <= 0) { 344 return; 345 } 346 347 if (0 == color) { 348 if (src != dst) { 349 memcpy(dst, src, count * sizeof(SkPMColor)); 350 } 351 return; 352 } 353 354 unsigned colorA = SkGetPackedA32(color); 355 if (255 == colorA) { 356 sk_memset32(dst, color, count); 357 } else { 358 unsigned scale = 256 - SkAlpha255To256(colorA); 359 360 if (count >= 4) { 361 SkASSERT(((size_t)dst & 0x03) == 0); 362 while (((size_t)dst & 0x0F) != 0) { 363 *dst = color + SkAlphaMulQ(*src, scale); 364 src++; 365 dst++; 366 count--; 367 } 368 369 const __m128i *s = reinterpret_cast<const __m128i*>(src); 370 __m128i *d = reinterpret_cast<__m128i*>(dst); 371 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 372 __m128i src_scale_wide = _mm_set1_epi16(scale); 373 __m128i color_wide = _mm_set1_epi32(color); 374 while (count >= 4) { 375 // Load 4 pixels each of src and dest. 376 __m128i src_pixel = _mm_loadu_si128(s); 377 378 // Get red and blue pixels into lower byte of each word. 379 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 380 381 // Get alpha and green into lower byte of each word. 382 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 383 384 // Multiply by scale. 385 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 386 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 387 388 // Divide by 256. 389 src_rb = _mm_srli_epi16(src_rb, 8); 390 src_ag = _mm_andnot_si128(rb_mask, src_ag); 391 392 // Combine back into RGBA. 
393 src_pixel = _mm_or_si128(src_rb, src_ag); 394 395 // Add color to result. 396 __m128i result = _mm_add_epi8(color_wide, src_pixel); 397 398 // Store result. 399 _mm_store_si128(d, result); 400 s++; 401 d++; 402 count -= 4; 403 } 404 src = reinterpret_cast<const SkPMColor*>(s); 405 dst = reinterpret_cast<SkPMColor*>(d); 406 } 407 408 while (count > 0) { 409 *dst = color + SkAlphaMulQ(*src, scale); 410 src += 1; 411 dst += 1; 412 count--; 413 } 414 } 415 } 416 417 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, 418 size_t maskRB, SkColor origColor, 419 int width, int height) { 420 SkPMColor color = SkPreMultiplyColor(origColor); 421 size_t dstOffset = dstRB - (width << 2); 422 size_t maskOffset = maskRB - width; 423 SkPMColor* dst = (SkPMColor *)device; 424 const uint8_t* mask = (const uint8_t*)maskPtr; 425 do { 426 int count = width; 427 if (count >= 4) { 428 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 429 *dst = SkBlendARGB32(color, *dst, *mask); 430 mask++; 431 dst++; 432 count--; 433 } 434 __m128i *d = reinterpret_cast<__m128i*>(dst); 435 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 436 __m128i c_256 = _mm_set1_epi16(256); 437 __m128i c_1 = _mm_set1_epi16(1); 438 __m128i src_pixel = _mm_set1_epi32(color); 439 while (count >= 4) { 440 // Load 4 pixels each of src and dest. 441 __m128i dst_pixel = _mm_load_si128(d); 442 443 //set the aphla value 444 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ 445 0, *(mask+3),0, \ 446 *(mask+2),0, *(mask+2),\ 447 0,*(mask+1), 0,*(mask+1),\ 448 0, *mask,0,*mask); 449 450 //call SkAlpha255To256() 451 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); 452 453 // Get red and blue pixels into lower byte of each word. 454 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 455 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 456 457 // Get alpha and green into lower byte of each word. 
458 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 459 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 460 461 // Put per-pixel alpha in low byte of each word. 462 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 463 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 464 465 // dst_alpha = dst_alpha * src_scale 466 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 467 468 // Divide by 256. 469 dst_alpha = _mm_srli_epi16(dst_alpha, 8); 470 471 // Subtract alphas from 256, to get 1..256 472 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 473 // Multiply red and blue by dst pixel alpha. 474 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 475 // Multiply alpha and green by dst pixel alpha. 476 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 477 478 // Multiply red and blue by global alpha. 479 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 480 // Multiply alpha and green by global alpha. 481 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 482 // Divide by 256. 483 dst_rb = _mm_srli_epi16(dst_rb, 8); 484 src_rb = _mm_srli_epi16(src_rb, 8); 485 486 // Mask out low bits (goodies already in the right place; no need to divide) 487 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 488 src_ag = _mm_andnot_si128(rb_mask, src_ag); 489 490 // Combine back into RGBA. 491 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 492 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); 493 494 // Add two pixels into result. 
495 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); 496 _mm_store_si128(d, result); 497 // load the next 4 pixel 498 mask = mask + 4; 499 d++; 500 count -= 4; 501 } 502 dst = reinterpret_cast<SkPMColor *>(d); 503 } 504 while (count > 0) { 505 *dst= SkBlendARGB32(color, *dst, *mask); 506 dst += 1; 507 mask++; 508 count --; 509 } 510 dst = (SkPMColor *)((char*)dst + dstOffset); 511 mask += maskOffset; 512 } while (--height != 0); 513 } 514 515 // The following (left) shifts cause the top 5 bits of the mask components to 516 // line up with the corresponding components in an SkPMColor. 517 // Note that the mask's RGB16 order may differ from the SkPMColor order. 518 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 519 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 520 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 521 522 #if SK_R16x5_R32x5_SHIFT == 0 523 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 524 #elif SK_R16x5_R32x5_SHIFT > 0 525 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 526 #else 527 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 528 #endif 529 530 #if SK_G16x5_G32x5_SHIFT == 0 531 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 532 #elif SK_G16x5_G32x5_SHIFT > 0 533 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 534 #else 535 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 536 #endif 537 538 #if SK_B16x5_B32x5_SHIFT == 0 539 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 540 #elif SK_B16x5_B32x5_SHIFT > 0 541 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 542 #else 543 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 544 #endif 545 546 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 547 __m128i &mask, 
__m128i &srcA) { 548 // In the following comments, the components of src, dst and mask are 549 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 550 // by an R, G, B, or A suffix. Components of one of the four pixels that 551 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 552 // example is the blue channel of the second destination pixel. Memory 553 // layout is shown for an ARGB byte order in a color value. 554 555 // src and srcA store 8-bit values interleaved with zeros. 556 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 557 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 558 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 559 // mask stores 16-bit values (compressed three channels) interleaved with zeros. 560 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 561 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 562 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 563 564 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 565 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 566 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 567 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 568 569 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 570 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 571 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 572 573 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 574 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 575 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 576 577 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 578 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 579 // 8-bit position 580 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 581 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 582 mask = _mm_or_si128(_mm_or_si128(r, g), b); 583 584 // Interleave R,G,B into the lower byte of word. 585 // i.e. 
split the sixteen 8-bit values from mask into two sets of eight 586 // 16-bit values, padded by zero. 587 __m128i maskLo, maskHi; 588 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 589 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 590 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 591 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 592 593 // Upscale from 0..31 to 0..32 594 // (allows to replace division by left-shift further down) 595 // Left-shift each component by 4 and add the result back to that component, 596 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 597 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 598 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 599 600 // Multiply each component of maskLo and maskHi by srcA 601 maskLo = _mm_mullo_epi16(maskLo, srcA); 602 maskHi = _mm_mullo_epi16(maskHi, srcA); 603 604 // Left shift mask components by 8 (divide by 256) 605 maskLo = _mm_srli_epi16(maskLo, 8); 606 maskHi = _mm_srli_epi16(maskHi, 8); 607 608 // Interleave R,G,B into the lower byte of the word 609 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 610 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 611 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 612 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 613 614 // mask = (src - dst) * mask 615 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 616 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 617 618 // mask = (src - dst) * mask >> 5 619 maskLo = _mm_srai_epi16(maskLo, 5); 620 maskHi = _mm_srai_epi16(maskHi, 5); 621 622 // Add two pixels into result. 623 // result = dst + ((src - dst) * mask >> 5) 624 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 625 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 626 627 // Pack into 4 32bit dst pixels. 
628 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 629 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 630 // clamping to 255 if necessary. 631 return _mm_packus_epi16(resultLo, resultHi); 632 } 633 634 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 635 __m128i &mask) { 636 // In the following comments, the components of src, dst and mask are 637 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 638 // by an R, G, B, or A suffix. Components of one of the four pixels that 639 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 640 // example is the blue channel of the second destination pixel. Memory 641 // layout is shown for an ARGB byte order in a color value. 642 643 // src and srcA store 8-bit values interleaved with zeros. 644 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 645 // mask stores 16-bit values (shown as high and low bytes) interleaved with 646 // zeros 647 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 648 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 649 650 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 651 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 652 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 653 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 654 655 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 656 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 657 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 658 659 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 660 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 661 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 662 663 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 664 // Each component (m0R, m0G, etc.) 
is then a 5-bit value aligned to an 665 // 8-bit position 666 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 667 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 668 mask = _mm_or_si128(_mm_or_si128(r, g), b); 669 670 // Interleave R,G,B into the lower byte of word. 671 // i.e. split the sixteen 8-bit values from mask into two sets of eight 672 // 16-bit values, padded by zero. 673 __m128i maskLo, maskHi; 674 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 675 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 676 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 677 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 678 679 // Upscale from 0..31 to 0..32 680 // (allows to replace division by left-shift further down) 681 // Left-shift each component by 4 and add the result back to that component, 682 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 683 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 684 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 685 686 // Interleave R,G,B into the lower byte of the word 687 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 688 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 689 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 690 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 691 692 // mask = (src - dst) * mask 693 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 694 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 695 696 // mask = (src - dst) * mask >> 5 697 maskLo = _mm_srai_epi16(maskLo, 5); 698 maskHi = _mm_srai_epi16(maskHi, 5); 699 700 // Add two pixels into result. 701 // result = dst + ((src - dst) * mask >> 5) 702 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 703 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 704 705 // Pack into 4 32bit dst pixels and force opaque. 
706 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 707 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 708 // clamping to 255 if necessary. Set alpha components to 0xFF. 709 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 710 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 711 } 712 713 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 714 SkColor src, int width, SkPMColor) { 715 if (width <= 0) { 716 return; 717 } 718 719 int srcA = SkColorGetA(src); 720 int srcR = SkColorGetR(src); 721 int srcG = SkColorGetG(src); 722 int srcB = SkColorGetB(src); 723 724 srcA = SkAlpha255To256(srcA); 725 726 if (width >= 4) { 727 SkASSERT(((size_t)dst & 0x03) == 0); 728 while (((size_t)dst & 0x0F) != 0) { 729 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 730 mask++; 731 dst++; 732 width--; 733 } 734 735 __m128i *d = reinterpret_cast<__m128i*>(dst); 736 // Set alpha to 0xFF and replicate source four times in SSE register. 737 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 738 // Interleave with zeros to get two sets of four 16-bit values. 739 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 740 // Set srcA_sse to contain eight copies of srcA, padded with zero. 741 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 742 __m128i srcA_sse = _mm_set1_epi16(srcA); 743 while (width >= 4) { 744 // Load four destination pixels into dst_sse. 745 __m128i dst_sse = _mm_load_si128(d); 746 // Load four 16-bit masks into lower half of mask_sse. 
747 __m128i mask_sse = _mm_loadl_epi64( 748 reinterpret_cast<const __m128i*>(mask)); 749 750 // Check whether masks are equal to 0 and get the highest bit 751 // of each byte of result, if masks are all zero, we will get 752 // pack_cmp to 0xFFFF 753 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 754 _mm_setzero_si128())); 755 756 // if mask pixels are not all zero, we will blend the dst pixels 757 if (pack_cmp != 0xFFFF) { 758 // Unpack 4 16bit mask pixels to 759 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 760 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 761 mask_sse = _mm_unpacklo_epi16(mask_sse, 762 _mm_setzero_si128()); 763 764 // Process 4 32bit dst pixels 765 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 766 mask_sse, srcA_sse); 767 _mm_store_si128(d, result); 768 } 769 770 d++; 771 mask += 4; 772 width -= 4; 773 } 774 775 dst = reinterpret_cast<SkPMColor*>(d); 776 } 777 778 while (width > 0) { 779 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 780 mask++; 781 dst++; 782 width--; 783 } 784 } 785 786 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 787 SkColor src, int width, SkPMColor opaqueDst) { 788 if (width <= 0) { 789 return; 790 } 791 792 int srcR = SkColorGetR(src); 793 int srcG = SkColorGetG(src); 794 int srcB = SkColorGetB(src); 795 796 if (width >= 4) { 797 SkASSERT(((size_t)dst & 0x03) == 0); 798 while (((size_t)dst & 0x0F) != 0) { 799 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 800 mask++; 801 dst++; 802 width--; 803 } 804 805 __m128i *d = reinterpret_cast<__m128i*>(dst); 806 // Set alpha to 0xFF and replicate source four times in SSE register. 807 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 808 // Set srcA_sse to contain eight copies of srcA, padded with zero. 
// src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        while (width >= 4) {
            // Load four destination pixels into dst_sse.
            __m128i dst_sse = _mm_load_si128(d);
            // Load four 16-bit masks into lower half of mask_sse.
            __m128i mask_sse = _mm_loadl_epi64(
                                   reinterpret_cast<const __m128i*>(mask));

            // Check whether masks are equal to 0 and get the highest bit
            // of each byte of result, if masks are all zero, we will get
            // pack_cmp to 0xFFFF
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            // if mask pixels are not all zero, we will blend the dst pixels
            if (pack_cmp != 0xFFFF) {
                // Unpack 4 16bit mask pixels to
                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());

                // Process 4 32bit dst pixels
                __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
                                                         mask_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    while (width > 0) {
        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
        mask++;
        dst++;
        width--;
    }
}

/* SSE2 version of S32_D565_Opaque()
 * portable version is in core/SkBlitRow_D16.cpp
 *
 * Converts `count` 32-bit source pixels to 16-bit destination pixels.
 * `alpha` is asserted to be 255; the x/y parameters are unused.
 * Rows of at least 8 pixels take the SSE2 path (8 pixels per iteration);
 * shorter rows and any leftover pixels are converted one at a time with
 * SkPixel32ToPixel16_ToU16().
 */
void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Convert single pixels until dst is 16-byte aligned, because the
        // vector loop below writes dst with the aligned _mm_store_si128().
        while (((size_t)dst & 0x0F) != 0) {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            *dst++ = SkPixel32ToPixel16_ToU16(c);
            count--;
        }

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        // Channel masks replicated into each 32-bit lane; they are applied
        // before the 32-bit results are packed down to 16-bit lanes.
        __m128i r16_mask = _mm_set1_epi32(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi32(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi32(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src (unaligned loads; only dst was aligned).
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Calculate result r: shift the red channel down so that only
            // its top SK_R16_BITS bits remain, mask, then pack the two
            // groups of four 32-bit lanes into eight 16-bit lanes.
            __m128i r1 = _mm_srli_epi32(src_pixel1,
                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
            r1 = _mm_and_si128(r1, r16_mask);
            __m128i r2 = _mm_srli_epi32(src_pixel2,
                                        SK_R32_SHIFT + (8 - SK_R16_BITS));
            r2 = _mm_and_si128(r2, r16_mask);
            __m128i r = _mm_packs_epi32(r1, r2);

            // Calculate result g (same steps as for r).
            __m128i g1 = _mm_srli_epi32(src_pixel1,
                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
            g1 = _mm_and_si128(g1, g16_mask);
            __m128i g2 = _mm_srli_epi32(src_pixel2,
                                        SK_G32_SHIFT + (8 - SK_G16_BITS));
            g2 = _mm_and_si128(g2, g16_mask);
            __m128i g = _mm_packs_epi32(g1, g2);

            // Calculate result b (same steps as for r).
            __m128i b1 = _mm_srli_epi32(src_pixel1,
                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
            b1 = _mm_and_si128(b1, b16_mask);
            __m128i b2 = _mm_srli_epi32(src_pixel2,
                                        SK_B32_SHIFT + (8 - SK_B16_BITS));
            b2 = _mm_and_si128(b2, b16_mask);
            __m128i b = _mm_packs_epi32(b1, b2);

            // Store 8 16-bit colors in dst.
            __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);
            _mm_store_si128(d++, d_pixel);
            count -= 8;
        }
        // Hand the advanced positions back to the scalar tail loop.
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Convert the remaining (fewer than 8, or all if count < 8) pixels.
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            *dst++ = SkPixel32ToPixel16_ToU16(c);
        } while (--count != 0);
    }
}

/* SSE2 version of S32A_D565_Opaque()
 * portable version is in core/SkBlitRow_D16.cpp
 *
 * Blends `count` 32-bit source pixels over 16-bit destination pixels; the
 * scalar paths use SkSrcOver32To16() and the vector path reproduces it.
 * `alpha` is asserted to be 255; the x/y parameters are unused.
 * Source pixels equal to zero leave the destination untouched.
 */
void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src,
                           int count, U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Blend single pixels until dst is 16-byte aligned, as required by
        // the aligned load/store of dst in the vector loop below.
        while (((size_t)dst & 0x0F) != 0) {
            SkPMColor c = *src++;
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
            count--;
        }

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        __m128i var255 = _mm_set1_epi16(255);
        // Channel masks replicated into each 16-bit lane (dst is handled
        // as eight 16-bit pixels per register).
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Check whether src pixels are equal to 0 and get the highest bit
            // of each byte of result, if src pixels are all zero, src_cmp1 and
            // src_cmp2 will be 0xFFFF.
            int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1,
                                             _mm_setzero_si128()));
            int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2,
                                             _mm_setzero_si128()));
            // All 8 source pixels are zero: skip the group without touching
            // dst (s was already advanced by the two loads above).
            if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) {
                d++;
                count -= 8;
                continue;
            }

            // Load 8 pixels of dst.
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src: shifting left by (24 - shift) and then
            // right by 24 isolates the 8-bit channel in each 32-bit lane;
            // the pack then yields eight 16-bit lanes.
            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            __m128i sa = _mm_packs_epi32(sa1, sa2);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // Extract R G B from dst.
            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            __m128i isa = _mm_sub_epi16(var255, sa); // 255 - sa

            // Calculate R G B of result.
            // Original algorithm is in SkSrcOver32To16().
            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);

            // Pack R G B into 16-bit color.
            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);

            // Store 8 16-bit colors in dst.
            _mm_store_si128(d++, d_pixel);
            count -= 8;
        }

        // Hand the advanced positions back to the scalar tail loop.
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Blend the remaining (fewer than 8, or all if count < 8) pixels.
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}

/* SSE2 version of S32_D565_Opaque_Dither()
 * NOTE(review): the portable version is presumably in
 * core/SkBlitRow_D16.cpp like its siblings in this file -- confirm.
 *
 * Converts `count` 32-bit source pixels to dithered 16-bit destination
 * pixels. `alpha` is asserted to be 255. `x` and `y` select the dither
 * entry: the row is chosen by (y & 3) and the column by (x & 3).
 */
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Convert single pixels until dst is 16-byte aligned, as required
        // by the aligned store in the vector loop below.
        while (((size_t)dst & 0x0F) != 0) {
            // DITHER_565_SCAN / DITHER_VALUE / DITHER_INC_X are defined
            // outside this chunk; presumably they set up the dither row
            // for y, read the entry for x, and advance x -- verify.
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
            count--;
        }

        // Build an 8-lane dither vector: lanes k and k + 4 both hold the
        // dither entry for column (x + k) & 3, for k = 0..3.
        unsigned short dither_value[8];
        __m128i dither;
#ifdef ENABLE_DITHER_MATRIX_4X4
        // One byte per dither entry.
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        // Four 4-bit dither entries per 16-bit row word.
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Extract R from src (isolate the 8-bit channel in each 32-bit
            // lane, then pack to eight 16-bit lanes).
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // Dither red:
            // (sr + dither - (sr >> 5)) >> (SK_R32_BITS - SK_R16_BITS)
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither);
            sr = _mm_sub_epi16(sr, sr_offset);
            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // Dither green (uses half the dither value):
            // (sg + (dither >> 1) - (sg >> 6)) >> (SK_G32_BITS - SK_G16_BITS)
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
            sg = _mm_sub_epi16(sg, sg_offset);
            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // Dither blue:
            // (sb + dither - (sb >> 5)) >> (SK_B32_BITS - SK_B16_BITS)
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither);
            sb = _mm_sub_epi16(sb, sb_offset);
            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);

            // Pack and store 16-bit dst pixel.
            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            // Advancing x by 8 leaves (x & 3) unchanged, so the dither
            // vector built above stays valid for every iteration.
            x += 8;
        }

        // Hand the advanced positions back to the scalar tail loop.
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Convert the remaining (fewer than 8, or all if count < 8) pixels.
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

/* SSE2 version of S32A_D565_Opaque_Dither()
 * portable version is in core/SkBlitRow_D16.cpp
 *
 * Blends `count` 32-bit source pixels over 16-bit destination pixels with
 * dithering. `alpha` is asserted to be 255. `x` and `y` select the dither
 * entry: the row is chosen by (y & 3) and the column by (x & 3).
 * In the scalar paths, source pixels equal to zero leave dst untouched.
 */
void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Blend single pixels until dst is 16-byte aligned, as required by
        // the aligned load/store of dst in the vector loop below.
        while (((size_t)dst & 0x0F) != 0) {
            // DITHER_565_SCAN / DITHER_VALUE / DITHER_INC_X are defined
            // outside this chunk; presumably they set up the dither row
            // for y, read the entry for x, and advance x -- verify.
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // Scale the dither value by the source alpha.
                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
            count--;
        }

        // Build an 8-lane dither vector: lanes k and k + 4 both hold the
        // dither entry for column (x + k) & 3, for k = 0..3.
        unsigned short dither_value[8];
        __m128i dither, dither_cur;
#ifdef ENABLE_DITHER_MATRIX_4X4
        // One byte per dither entry.
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        // Four 4-bit dither entries per 16-bit row word.
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        __m128i var256 = _mm_set1_epi16(256);
        // Channel masks replicated into each 16-bit lane.
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src and dst.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src (isolate the 8-bit channel in each 32-bit
            // lane, then pack to eight 16-bit lanes).
            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            __m128i sa = _mm_packs_epi32(sa1, sa2);

            // Calculate current dither value: (dither * (sa + 1)) >> 8,
            // the vector form of the scalar path's
            // SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)).
            dither_cur = _mm_mullo_epi16(dither,
                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
            dither_cur = _mm_srli_epi16(dither_cur, 8);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // SkDITHER_R32_FOR_565(sr, d)
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither_cur);
            sr = _mm_sub_epi16(sr, sr_offset);

            // Expand sr.
            // NOTE(review): presumably the per-channel counterpart of the
            // scalar path's (sr << 13) in the expanded layout -- confirm.
            sr = _mm_slli_epi16(sr, 2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // sg = SkDITHER_G32_FOR_565(sg, d).
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
            sg = _mm_sub_epi16(sg, sg_offset);

            // Expand sg.
            // NOTE(review): presumably the per-channel counterpart of the
            // scalar path's (sg << 24) in the expanded layout -- confirm.
            sg = _mm_slli_epi16(sg, 3);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // sb = SkDITHER_B32_FOR_565(sb, d).
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither_cur);
            sb = _mm_sub_epi16(sb, sb_offset);

            // Expand sb (matches the scalar path's (sb << 2)).
            sb = _mm_slli_epi16(sb, 2);

            // Extract R G B from dst.
            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            // SkAlpha255To256(255 - a) >> 3, computed as (256 - sa) >> 3.
            __m128i isa = _mm_sub_epi16(var256, sa);
            isa = _mm_srli_epi16(isa, 3);

            // Per channel: (dst * isa + expanded src) >> 5, as in the
            // scalar path's (src_expanded + dst_expanded) >> 5.
            dr = _mm_mullo_epi16(dr, isa);
            dr = _mm_add_epi16(dr, sr);
            dr = _mm_srli_epi16(dr, 5);

            dg = _mm_mullo_epi16(dg, isa);
            dg = _mm_add_epi16(dg, sg);
            dg = _mm_srli_epi16(dg, 5);

            db = _mm_mullo_epi16(db, isa);
            db = _mm_add_epi16(db, sb);
            db = _mm_srli_epi16(db, 5);

            // Package and store dst pixel.
            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            // Advancing x by 8 leaves (x & 3) unchanged, so the dither
            // vector built above stays valid for every iteration.
            x += 8;
        }

        // Hand the advanced positions back to the scalar tail loop.
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Blend the remaining (fewer than 8, or all if count < 8) pixels.
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // Scale the dither value by the source alpha.
                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}