1 /* 2 * Copyright 2012 The Android Open Source Project 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include <emmintrin.h> 9 #include "SkBitmapProcState_opts_SSE2.h" 10 #include "SkBlitRow_opts_SSE2.h" 11 #include "SkColorPriv.h" 12 #include "SkColor_opts_SSE2.h" 13 #include "SkDither.h" 14 #include "SkUtils.h" 15 16 /* SSE2 version of S32_Blend_BlitRow32() 17 * portable version is in core/SkBlitRow_D32.cpp 18 */ 19 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 20 const SkPMColor* SK_RESTRICT src, 21 int count, U8CPU alpha) { 22 SkASSERT(alpha <= 255); 23 if (count <= 0) { 24 return; 25 } 26 27 uint32_t src_scale = SkAlpha255To256(alpha); 28 uint32_t dst_scale = 256 - src_scale; 29 30 if (count >= 4) { 31 SkASSERT(((size_t)dst & 0x03) == 0); 32 while (((size_t)dst & 0x0F) != 0) { 33 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 34 src++; 35 dst++; 36 count--; 37 } 38 39 const __m128i *s = reinterpret_cast<const __m128i*>(src); 40 __m128i *d = reinterpret_cast<__m128i*>(dst); 41 42 while (count >= 4) { 43 // Load 4 pixels each of src and dest. 
44 __m128i src_pixel = _mm_loadu_si128(s); 45 __m128i dst_pixel = _mm_load_si128(d); 46 47 src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale); 48 dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale); 49 50 // Add result 51 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 52 _mm_store_si128(d, result); 53 s++; 54 d++; 55 count -= 4; 56 } 57 src = reinterpret_cast<const SkPMColor*>(s); 58 dst = reinterpret_cast<SkPMColor*>(d); 59 } 60 61 while (count > 0) { 62 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 63 src++; 64 dst++; 65 count--; 66 } 67 } 68 69 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 70 const SkPMColor* SK_RESTRICT src, 71 int count, U8CPU alpha) { 72 SkASSERT(alpha == 255); 73 if (count <= 0) { 74 return; 75 } 76 77 #ifdef SK_USE_ACCURATE_BLENDING 78 if (count >= 4) { 79 SkASSERT(((size_t)dst & 0x03) == 0); 80 while (((size_t)dst & 0x0F) != 0) { 81 *dst = SkPMSrcOver(*src, *dst); 82 src++; 83 dst++; 84 count--; 85 } 86 87 const __m128i *s = reinterpret_cast<const __m128i*>(src); 88 __m128i *d = reinterpret_cast<__m128i*>(dst); 89 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 90 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 91 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 92 while (count >= 4) { 93 // Load 4 pixels 94 __m128i src_pixel = _mm_loadu_si128(s); 95 __m128i dst_pixel = _mm_load_si128(d); 96 97 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 98 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 99 // Shift alphas down to lower 8 bits of each quad. 100 __m128i alpha = _mm_srli_epi32(src_pixel, 24); 101 102 // Copy alpha to upper 3rd byte of each quad 103 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 104 105 // Subtract alphas from 255, to get 0..255 106 alpha = _mm_sub_epi16(c_255, alpha); 107 108 // Multiply by red and blue by src alpha. 109 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 110 // Multiply by alpha and green by src alpha. 
111 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 112 113 // dst_rb_low = (dst_rb >> 8) 114 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 115 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 116 117 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 118 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 119 dst_rb = _mm_add_epi16(dst_rb, c_128); 120 dst_rb = _mm_srli_epi16(dst_rb, 8); 121 122 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 123 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 124 dst_ag = _mm_add_epi16(dst_ag, c_128); 125 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 126 127 // Combine back into RGBA. 128 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 129 130 // Add result 131 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 132 _mm_store_si128(d, result); 133 s++; 134 d++; 135 count -= 4; 136 } 137 src = reinterpret_cast<const SkPMColor*>(s); 138 dst = reinterpret_cast<SkPMColor*>(d); 139 } 140 141 while (count > 0) { 142 *dst = SkPMSrcOver(*src, *dst); 143 src++; 144 dst++; 145 count--; 146 } 147 #else 148 int count16 = count / 16; 149 __m128i* dst4 = (__m128i*)dst; 150 const __m128i* src4 = (const __m128i*)src; 151 152 for (int i = 0; i < count16 * 4; i += 4) { 153 // Load 16 source pixels. 154 __m128i s0 = _mm_loadu_si128(src4+i+0), 155 s1 = _mm_loadu_si128(src4+i+1), 156 s2 = _mm_loadu_si128(src4+i+2), 157 s3 = _mm_loadu_si128(src4+i+3); 158 159 const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT); 160 const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0))); 161 __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128()); 162 if (0xffff == _mm_movemask_epi8(cmp)) { 163 // All 16 source pixels are fully transparent. There's nothing to do! 164 continue; 165 } 166 const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0))); 167 cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask); 168 if (0xffff == _mm_movemask_epi8(cmp)) { 169 // All 16 source pixels are fully opaque. 
There's no need to read dst or blend it. 170 _mm_storeu_si128(dst4+i+0, s0); 171 _mm_storeu_si128(dst4+i+1, s1); 172 _mm_storeu_si128(dst4+i+2, s2); 173 _mm_storeu_si128(dst4+i+3, s3); 174 continue; 175 } 176 // The general slow case: do the blend for all 16 pixels. 177 _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0))); 178 _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1))); 179 _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2))); 180 _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3))); 181 } 182 183 // Wrap up the last <= 15 pixels. 184 SkASSERT(count - (count16*16) <= 15); 185 for (int i = count16*16; i < count; i++) { 186 // This check is not really necessarily, but it prevents pointless autovectorization. 187 if (src[i] & 0xFF000000) { 188 dst[i] = SkPMSrcOver(src[i], dst[i]); 189 } 190 } 191 #endif 192 } 193 194 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 195 const SkPMColor* SK_RESTRICT src, 196 int count, U8CPU alpha) { 197 SkASSERT(alpha <= 255); 198 if (count <= 0) { 199 return; 200 } 201 202 if (count >= 4) { 203 while (((size_t)dst & 0x0F) != 0) { 204 *dst = SkBlendARGB32(*src, *dst, alpha); 205 src++; 206 dst++; 207 count--; 208 } 209 210 const __m128i *s = reinterpret_cast<const __m128i*>(src); 211 __m128i *d = reinterpret_cast<__m128i*>(dst); 212 while (count >= 4) { 213 // Load 4 pixels each of src and dest. 
214 __m128i src_pixel = _mm_loadu_si128(s); 215 __m128i dst_pixel = _mm_load_si128(d); 216 217 __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha); 218 _mm_store_si128(d, result); 219 s++; 220 d++; 221 count -= 4; 222 } 223 src = reinterpret_cast<const SkPMColor*>(s); 224 dst = reinterpret_cast<SkPMColor*>(d); 225 } 226 227 while (count > 0) { 228 *dst = SkBlendARGB32(*src, *dst, alpha); 229 src++; 230 dst++; 231 count--; 232 } 233 } 234 235 void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) { 236 SkASSERT(count > 0); 237 238 uint32_t src_expand = (SkGetPackedG32(src) << 24) | 239 (SkGetPackedR32(src) << 13) | 240 (SkGetPackedB32(src) << 2); 241 unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; 242 243 // Check if we have enough pixels to run SIMD 244 if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) { 245 __m128i* dst_wide; 246 const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2); 247 const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3); 248 const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2); 249 const __m128i scale_wide = _mm_set1_epi16(scale); 250 const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK); 251 const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT); 252 253 // Align dst to an even 16 byte address (0-7 pixels) 254 while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) { 255 *dst = SkBlend32_RGB16(src_expand, *dst, scale); 256 dst += 1; 257 count--; 258 } 259 260 dst_wide = reinterpret_cast<__m128i*>(dst); 261 do { 262 // Load eight RGB565 pixels 263 __m128i pixels = _mm_load_si128(dst_wide); 264 265 // Mask out sub-pixels 266 __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT); 267 __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS); 268 pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS); 269 __m128i pixel_B = _mm_and_si128(pixels, mask_blue); 270 271 // Scale with alpha 272 pixel_R = _mm_mullo_epi16(pixel_R, 
scale_wide); 273 pixel_G = _mm_mullo_epi16(pixel_G, scale_wide); 274 pixel_B = _mm_mullo_epi16(pixel_B, scale_wide); 275 276 // Add src_X_wide and shift down again 277 pixel_R = _mm_add_epi16(pixel_R, src_R_wide); 278 pixel_R = _mm_srli_epi16(pixel_R, 5); 279 pixel_G = _mm_add_epi16(pixel_G, src_G_wide); 280 pixel_B = _mm_add_epi16(pixel_B, src_B_wide); 281 pixel_B = _mm_srli_epi16(pixel_B, 5); 282 283 // Combine into RGB565 and store 284 pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT); 285 pixel_G = _mm_and_si128(pixel_G, mask_green); 286 pixels = _mm_or_si128(pixel_R, pixel_G); 287 pixels = _mm_or_si128(pixels, pixel_B); 288 _mm_store_si128(dst_wide, pixels); 289 count -= 8; 290 dst_wide++; 291 } while (count >= 8); 292 293 dst = reinterpret_cast<uint16_t*>(dst_wide); 294 } 295 296 // Small loop to handle remaining pixels. 297 while (count > 0) { 298 *dst = SkBlend32_RGB16(src_expand, *dst, scale); 299 dst += 1; 300 count--; 301 } 302 } 303 304 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr, 305 size_t maskRB, SkColor origColor, 306 int width, int height) { 307 SkPMColor color = SkPreMultiplyColor(origColor); 308 size_t dstOffset = dstRB - (width << 2); 309 size_t maskOffset = maskRB - width; 310 SkPMColor* dst = (SkPMColor *)device; 311 const uint8_t* mask = (const uint8_t*)maskPtr; 312 do { 313 int count = width; 314 if (count >= 4) { 315 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 316 *dst = SkBlendARGB32(color, *dst, *mask); 317 mask++; 318 dst++; 319 count--; 320 } 321 __m128i *d = reinterpret_cast<__m128i*>(dst); 322 __m128i src_pixel = _mm_set1_epi32(color); 323 while (count >= 4) { 324 // Load 4 dst pixels 325 __m128i dst_pixel = _mm_load_si128(d); 326 327 // Set the alpha value 328 __m128i alpha_wide = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(mask)); 329 alpha_wide = _mm_unpacklo_epi8(alpha_wide, _mm_setzero_si128()); 330 alpha_wide = _mm_unpacklo_epi16(alpha_wide, _mm_setzero_si128()); 331 332 __m128i 
result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha_wide); 333 _mm_store_si128(d, result); 334 // Load the next 4 dst pixels and alphas 335 mask = mask + 4; 336 d++; 337 count -= 4; 338 } 339 dst = reinterpret_cast<SkPMColor*>(d); 340 } 341 while (count > 0) { 342 *dst= SkBlendARGB32(color, *dst, *mask); 343 dst += 1; 344 mask++; 345 count --; 346 } 347 dst = (SkPMColor *)((char*)dst + dstOffset); 348 mask += maskOffset; 349 } while (--height != 0); 350 } 351 352 // The following (left) shifts cause the top 5 bits of the mask components to 353 // line up with the corresponding components in an SkPMColor. 354 // Note that the mask's RGB16 order may differ from the SkPMColor order. 355 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 356 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 357 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 358 359 #if SK_R16x5_R32x5_SHIFT == 0 360 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 361 #elif SK_R16x5_R32x5_SHIFT > 0 362 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 363 #else 364 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 365 #endif 366 367 #if SK_G16x5_G32x5_SHIFT == 0 368 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 369 #elif SK_G16x5_G32x5_SHIFT > 0 370 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 371 #else 372 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 373 #endif 374 375 #if SK_B16x5_B32x5_SHIFT == 0 376 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 377 #elif SK_B16x5_B32x5_SHIFT > 0 378 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 379 #else 380 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 381 #endif 382 383 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 384 
__m128i &mask, __m128i &srcA) { 385 // In the following comments, the components of src, dst and mask are 386 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 387 // by an R, G, B, or A suffix. Components of one of the four pixels that 388 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 389 // example is the blue channel of the second destination pixel. Memory 390 // layout is shown for an ARGB byte order in a color value. 391 392 // src and srcA store 8-bit values interleaved with zeros. 393 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 394 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 395 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 396 // mask stores 16-bit values (compressed three channels) interleaved with zeros. 397 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 398 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 399 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 400 401 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 402 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 403 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 404 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 405 406 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 407 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 408 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 409 410 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 411 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 412 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 413 414 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 415 // Each component (m0R, m0G, etc.) 
is then a 5-bit value aligned to an 416 // 8-bit position 417 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 418 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 419 mask = _mm_or_si128(_mm_or_si128(r, g), b); 420 421 // Interleave R,G,B into the lower byte of word. 422 // i.e. split the sixteen 8-bit values from mask into two sets of eight 423 // 16-bit values, padded by zero. 424 __m128i maskLo, maskHi; 425 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 426 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 427 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 428 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 429 430 // Upscale from 0..31 to 0..32 431 // (allows to replace division by left-shift further down) 432 // Left-shift each component by 4 and add the result back to that component, 433 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 434 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 435 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 436 437 // Multiply each component of maskLo and maskHi by srcA 438 maskLo = _mm_mullo_epi16(maskLo, srcA); 439 maskHi = _mm_mullo_epi16(maskHi, srcA); 440 441 // Left shift mask components by 8 (divide by 256) 442 maskLo = _mm_srli_epi16(maskLo, 8); 443 maskHi = _mm_srli_epi16(maskHi, 8); 444 445 // Interleave R,G,B into the lower byte of the word 446 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 447 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 448 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 449 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 450 451 // mask = (src - dst) * mask 452 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 453 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 454 455 // mask = (src - dst) * mask >> 5 456 maskLo = _mm_srai_epi16(maskLo, 5); 457 maskHi = _mm_srai_epi16(maskHi, 5); 458 459 // Add two 
pixels into result. 460 // result = dst + ((src - dst) * mask >> 5) 461 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 462 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 463 464 // Pack into 4 32bit dst pixels. 465 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 466 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 467 // clamping to 255 if necessary. 468 return _mm_packus_epi16(resultLo, resultHi); 469 } 470 471 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 472 __m128i &mask) { 473 // In the following comments, the components of src, dst and mask are 474 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 475 // by an R, G, B, or A suffix. Components of one of the four pixels that 476 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 477 // example is the blue channel of the second destination pixel. Memory 478 // layout is shown for an ARGB byte order in a color value. 479 480 // src and srcA store 8-bit values interleaved with zeros. 481 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 482 // mask stores 16-bit values (shown as high and low bytes) interleaved with 483 // zeros 484 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 485 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 486 487 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 
488 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 489 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 490 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 491 492 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 493 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 494 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 495 496 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 497 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 498 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 499 500 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 501 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 502 // 8-bit position 503 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 504 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 505 mask = _mm_or_si128(_mm_or_si128(r, g), b); 506 507 // Interleave R,G,B into the lower byte of word. 508 // i.e. split the sixteen 8-bit values from mask into two sets of eight 509 // 16-bit values, padded by zero. 
510 __m128i maskLo, maskHi; 511 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 512 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 513 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 514 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 515 516 // Upscale from 0..31 to 0..32 517 // (allows to replace division by left-shift further down) 518 // Left-shift each component by 4 and add the result back to that component, 519 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 520 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 521 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 522 523 // Interleave R,G,B into the lower byte of the word 524 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 525 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 526 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 527 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 528 529 // mask = (src - dst) * mask 530 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 531 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 532 533 // mask = (src - dst) * mask >> 5 534 maskLo = _mm_srai_epi16(maskLo, 5); 535 maskHi = _mm_srai_epi16(maskHi, 5); 536 537 // Add two pixels into result. 538 // result = dst + ((src - dst) * mask >> 5) 539 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 540 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 541 542 // Pack into 4 32bit dst pixels and force opaque. 543 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 544 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 545 // clamping to 255 if necessary. Set alpha components to 0xFF. 
546 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 547 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 548 } 549 550 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 551 SkColor src, int width, SkPMColor) { 552 if (width <= 0) { 553 return; 554 } 555 556 int srcA = SkColorGetA(src); 557 int srcR = SkColorGetR(src); 558 int srcG = SkColorGetG(src); 559 int srcB = SkColorGetB(src); 560 561 srcA = SkAlpha255To256(srcA); 562 563 if (width >= 4) { 564 SkASSERT(((size_t)dst & 0x03) == 0); 565 while (((size_t)dst & 0x0F) != 0) { 566 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 567 mask++; 568 dst++; 569 width--; 570 } 571 572 __m128i *d = reinterpret_cast<__m128i*>(dst); 573 // Set alpha to 0xFF and replicate source four times in SSE register. 574 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 575 // Interleave with zeros to get two sets of four 16-bit values. 576 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 577 // Set srcA_sse to contain eight copies of srcA, padded with zero. 578 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 579 __m128i srcA_sse = _mm_set1_epi16(srcA); 580 while (width >= 4) { 581 // Load four destination pixels into dst_sse. 582 __m128i dst_sse = _mm_load_si128(d); 583 // Load four 16-bit masks into lower half of mask_sse. 
584 __m128i mask_sse = _mm_loadl_epi64( 585 reinterpret_cast<const __m128i*>(mask)); 586 587 // Check whether masks are equal to 0 and get the highest bit 588 // of each byte of result, if masks are all zero, we will get 589 // pack_cmp to 0xFFFF 590 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 591 _mm_setzero_si128())); 592 593 // if mask pixels are not all zero, we will blend the dst pixels 594 if (pack_cmp != 0xFFFF) { 595 // Unpack 4 16bit mask pixels to 596 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 597 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 598 mask_sse = _mm_unpacklo_epi16(mask_sse, 599 _mm_setzero_si128()); 600 601 // Process 4 32bit dst pixels 602 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 603 mask_sse, srcA_sse); 604 _mm_store_si128(d, result); 605 } 606 607 d++; 608 mask += 4; 609 width -= 4; 610 } 611 612 dst = reinterpret_cast<SkPMColor*>(d); 613 } 614 615 while (width > 0) { 616 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 617 mask++; 618 dst++; 619 width--; 620 } 621 } 622 623 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 624 SkColor src, int width, SkPMColor opaqueDst) { 625 if (width <= 0) { 626 return; 627 } 628 629 int srcR = SkColorGetR(src); 630 int srcG = SkColorGetG(src); 631 int srcB = SkColorGetB(src); 632 633 if (width >= 4) { 634 SkASSERT(((size_t)dst & 0x03) == 0); 635 while (((size_t)dst & 0x0F) != 0) { 636 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 637 mask++; 638 dst++; 639 width--; 640 } 641 642 __m128i *d = reinterpret_cast<__m128i*>(dst); 643 // Set alpha to 0xFF and replicate source four times in SSE register. 644 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 645 // Set srcA_sse to contain eight copies of srcA, padded with zero. 
646 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 647 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 648 while (width >= 4) { 649 // Load four destination pixels into dst_sse. 650 __m128i dst_sse = _mm_load_si128(d); 651 // Load four 16-bit masks into lower half of mask_sse. 652 __m128i mask_sse = _mm_loadl_epi64( 653 reinterpret_cast<const __m128i*>(mask)); 654 655 // Check whether masks are equal to 0 and get the highest bit 656 // of each byte of result, if masks are all zero, we will get 657 // pack_cmp to 0xFFFF 658 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 659 _mm_setzero_si128())); 660 661 // if mask pixels are not all zero, we will blend the dst pixels 662 if (pack_cmp != 0xFFFF) { 663 // Unpack 4 16bit mask pixels to 664 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 665 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 666 mask_sse = _mm_unpacklo_epi16(mask_sse, 667 _mm_setzero_si128()); 668 669 // Process 4 32bit dst pixels 670 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, 671 mask_sse); 672 _mm_store_si128(d, result); 673 } 674 675 d++; 676 mask += 4; 677 width -= 4; 678 } 679 680 dst = reinterpret_cast<SkPMColor*>(d); 681 } 682 683 while (width > 0) { 684 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 685 mask++; 686 dst++; 687 width--; 688 } 689 } 690 691 /* SSE2 version of S32_D565_Opaque() 692 * portable version is in core/SkBlitRow_D16.cpp 693 */ 694 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 695 const SkPMColor* SK_RESTRICT src, int count, 696 U8CPU alpha, int /*x*/, int /*y*/) { 697 SkASSERT(255 == alpha); 698 699 if (count <= 0) { 700 return; 701 } 702 703 if (count >= 8) { 704 while (((size_t)dst & 0x0F) != 0) { 705 SkPMColor c = *src++; 706 SkPMColorAssert(c); 707 708 *dst++ = SkPixel32ToPixel16_ToU16(c); 709 count--; 710 } 711 712 const __m128i* s = reinterpret_cast<const __m128i*>(src); 713 __m128i* d = 
reinterpret_cast<__m128i*>(dst); 714 715 while (count >= 8) { 716 // Load 8 pixels of src. 717 __m128i src_pixel1 = _mm_loadu_si128(s++); 718 __m128i src_pixel2 = _mm_loadu_si128(s++); 719 720 __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2); 721 _mm_store_si128(d++, d_pixel); 722 count -= 8; 723 } 724 src = reinterpret_cast<const SkPMColor*>(s); 725 dst = reinterpret_cast<uint16_t*>(d); 726 } 727 728 if (count > 0) { 729 do { 730 SkPMColor c = *src++; 731 SkPMColorAssert(c); 732 *dst++ = SkPixel32ToPixel16_ToU16(c); 733 } while (--count != 0); 734 } 735 } 736 737 /* SSE2 version of S32A_D565_Opaque() 738 * portable version is in core/SkBlitRow_D16.cpp 739 */ 740 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 741 const SkPMColor* SK_RESTRICT src, 742 int count, U8CPU alpha, int /*x*/, int /*y*/) { 743 SkASSERT(255 == alpha); 744 745 if (count <= 0) { 746 return; 747 } 748 749 if (count >= 8) { 750 // Make dst 16 bytes alignment 751 while (((size_t)dst & 0x0F) != 0) { 752 SkPMColor c = *src++; 753 if (c) { 754 *dst = SkSrcOver32To16(c, *dst); 755 } 756 dst += 1; 757 count--; 758 } 759 760 const __m128i* s = reinterpret_cast<const __m128i*>(src); 761 __m128i* d = reinterpret_cast<__m128i*>(dst); 762 __m128i var255 = _mm_set1_epi16(255); 763 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); 764 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); 765 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); 766 767 while (count >= 8) { 768 // Load 8 pixels of src. 769 __m128i src_pixel1 = _mm_loadu_si128(s++); 770 __m128i src_pixel2 = _mm_loadu_si128(s++); 771 772 // Check whether src pixels are equal to 0 and get the highest bit 773 // of each byte of result, if src pixels are all zero, src_cmp1 and 774 // src_cmp2 will be 0xFFFF. 
775 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, 776 _mm_setzero_si128())); 777 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, 778 _mm_setzero_si128())); 779 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { 780 d++; 781 count -= 8; 782 continue; 783 } 784 785 // Load 8 pixels of dst. 786 __m128i dst_pixel = _mm_load_si128(d); 787 788 // Extract A from src. 789 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); 790 sa1 = _mm_srli_epi32(sa1, 24); 791 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); 792 sa2 = _mm_srli_epi32(sa2, 24); 793 __m128i sa = _mm_packs_epi32(sa1, sa2); 794 795 // Extract R from src. 796 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); 797 sr1 = _mm_srli_epi32(sr1, 24); 798 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); 799 sr2 = _mm_srli_epi32(sr2, 24); 800 __m128i sr = _mm_packs_epi32(sr1, sr2); 801 802 // Extract G from src. 803 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); 804 sg1 = _mm_srli_epi32(sg1, 24); 805 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); 806 sg2 = _mm_srli_epi32(sg2, 24); 807 __m128i sg = _mm_packs_epi32(sg1, sg2); 808 809 // Extract B from src. 810 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); 811 sb1 = _mm_srli_epi32(sb1, 24); 812 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); 813 sb2 = _mm_srli_epi32(sb2, 24); 814 __m128i sb = _mm_packs_epi32(sb1, sb2); 815 816 // Extract R G B from dst. 817 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); 818 dr = _mm_and_si128(dr, r16_mask); 819 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); 820 dg = _mm_and_si128(dg, g16_mask); 821 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); 822 db = _mm_and_si128(db, b16_mask); 823 824 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa 825 826 // Calculate R G B of result. 827 // Original algorithm is in SkSrcOver32To16(). 
            // Per-channel src-over in 16-bit lanes:
            //   dst = src + round(dst * isa), then shift back down to the
            // 565 channel width. ('isa' and the sr/sg/sb, dr/dg/db lanes are
            // set up earlier in this function, above this excerpt;
            // SkMul16ShiftRound_SSE2 presumably mirrors the scalar rounded
            // multiply helper -- confirm against SkColor_opts_SSE2.h.)
            dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS));
            dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS);
            dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS));
            dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS);
            db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS));
            db = _mm_srli_epi16(db, 8 - SK_B16_BITS);

            // Pack R G B into 16-bit color.
            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);

            // Store 8 16-bit colors in dst.
            _mm_store_si128(d++, d_pixel);
            count -= 8;
        }

        // Hand the advanced vector cursors back to the scalar pointers.
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail: src-over the remaining (< 8) pixels. A fully transparent
    // source pixel (c == 0) leaves dst untouched.
    if (count > 0) {
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                *dst = SkSrcOver32To16(c, *dst);
            }
            dst += 1;
        } while (--count != 0);
    }
}

/* SSE2 version of S32_D565_Opaque_Dither()
 * portable version is in core/SkBlitRow_D16.cpp
 */
void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Advance dst one pixel at a time until it is 16-byte aligned, so the
        // vector loop below can use aligned stores.
        while (((size_t)dst & 0x0F) != 0) {
            DITHER_565_SCAN(y);   // sets up per-scanline dither lookup state
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
            count--;
        }

        // Build 8 x 16-bit dither lanes for this scanline: the 4-entry matrix
        // row is replicated into lanes 0-3 and 4-7 (x advances by 8 per
        // vector iteration, and 8 is a multiple of the 4-wide matrix, so the
        // lanes stay valid for the whole row).
        unsigned short dither_value[8];
        __m128i dither;
#ifdef ENABLE_DITHER_MATRIX_4X4
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        // Packed variant: one scanline is four 4-bit values in a uint16_t.
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);

        while (count >= 8) {
            // Load 8 pixels of src.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);

            // Extract R from src: shift the channel to the top byte of each
            // 32-bit lane and back down, then pack the two 4-lane vectors to
            // 8 x 16-bit lanes (values are 0..255, so the signed-saturating
            // pack never clamps).
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // SkDITHER_R32To565(sr, dither): r + d - (r >> 5), then keep the
            // top 5 bits.
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither);
            sr = _mm_sub_epi16(sr, sr_offset);
            sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // SkDITHER_G32To565(sg, dither): green keeps 6 bits, so it uses
            // half the dither value and a >> 6 correction.
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1));
            sg = _mm_sub_epi16(sg, sg_offset);
            sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // SkDITHER_B32To565(sb, dither): same shape as red.
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither);
            sb = _mm_sub_epi16(sb, sb_offset);
            sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS);

            // Pack and store 16-bit dst pixel.
            __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            x += 8;   // keep x in step so DITHER_VALUE(x) is correct below
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail for the remaining (< 8) pixels.
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

/* SSE2 version of S32A_D565_Opaque_Dither()
 * portable version is in core/SkBlitRow_D16.cpp
 */
void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst,
                                  const SkPMColor* SK_RESTRICT src,
                                  int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

    if (count <= 0) {
        return;
    }

    if (count >= 8) {
        // Align dst to 16 bytes, blending one pixel at a time with the same
        // scalar math as the tail loop below.
        while (((size_t)dst & 0x0F) != 0) {
            DITHER_565_SCAN(y);
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // Scale the dither value by the source alpha:
                // d = (dither * (a + 1)) >> 8.
                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
            count--;
        }

        // Dither lane setup, identical to S32_D565_Opaque_Dither_SSE2 above.
        unsigned short dither_value[8];
        __m128i dither, dither_cur;
#ifdef ENABLE_DITHER_MATRIX_4X4
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        __m128i var256 = _mm_set1_epi16(256);
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src and dst.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src (same shift-up/shift-down/pack scheme as the
            // color channels; values are 0..255 so the pack never saturates).
            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            __m128i sa = _mm_packs_epi32(sa1, sa2);

            // Calculate current dither value, i.e. the vector form of
            // SkAlphaMul(dither, SkAlpha255To256(a)) == (dither*(a+1)) >> 8.
            dither_cur = _mm_mullo_epi16(dither,
                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
            dither_cur = _mm_srli_epi16(dither_cur, 8);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // SkDITHER_R32_FOR_565(sr, d): r + d - (r >> 5), kept at 8 bits.
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither_cur);
            sr = _mm_sub_epi16(sr, sr_offset);

            // Expand sr to the r:10 position (scalar code's 'sr << 2').
            sr = _mm_slli_epi16(sr, 2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // sg = SkDITHER_G32_FOR_565(sg, d): green keeps 6 bits, so it
            // uses half the dither value and a >> 6 correction.
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
            sg = _mm_sub_epi16(sg, sg_offset);

            // Expand sg to the g:11 position (scalar code's 'sg << 3').
            sg = _mm_slli_epi16(sg, 3);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // sb = SkDITHER_B32_FOR_565(sb, d).
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither_cur);
            sb = _mm_sub_epi16(sb, sb_offset);

            // Expand sb to the b:10 position.
            sb = _mm_slli_epi16(sb, 2);

            // Extract R G B from dst.
            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            // SkAlpha255To256(255 - a) >> 3
            __m128i isa = _mm_sub_epi16(var256, sa);
            isa = _mm_srli_epi16(isa, 3);

            // Blend in expanded space, then drop the 5 fractional bits:
            // channel = (src_expanded + dst * isa) >> 5, matching the scalar
            // SkCompact path above.
            dr = _mm_mullo_epi16(dr, isa);
            dr = _mm_add_epi16(dr, sr);
            dr = _mm_srli_epi16(dr, 5);

            dg = _mm_mullo_epi16(dg, isa);
            dg = _mm_add_epi16(dg, sg);
            dg = _mm_srli_epi16(dg, 5);

            db = _mm_mullo_epi16(db, isa);
            db = _mm_add_epi16(db, sb);
            db = _mm_srli_epi16(db, 5);

            // Package and store dst pixel.
            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            x += 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail for the remaining (< 8) pixels.
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}