/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include <emmintrin.h>
#include "SkBitmapProcState_opts_SSE2.h"
#include "SkBlitRow_opts_SSE2.h"
#include "SkColorPriv.h"
#include "SkColor_opts_SSE2.h"
#include "SkDither.h"
#include "SkMSAN.h"
#include "SkUtils.h"

/* SSE2 version of S32_Blend_BlitRow32()
 * portable version is in core/SkBlitRow_D32.cpp
 *
 * Blends count 32-bit premultiplied src pixels onto dst with a constant
 * global alpha: dst = src*src_scale + dst*dst_scale, four pixels per
 * SSE2 iteration. dst is aligned up front so the vector loop can use
 * aligned loads/stores on dst; src stays potentially unaligned.
 */
void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count <= 0) {
        return;
    }

    // Map alpha from 0..255 to 0..256 so src_scale + dst_scale == 256 and
    // the blend can divide by shifting right 8.
    uint32_t src_scale = SkAlpha255To256(alpha);
    uint32_t dst_scale = 256 - src_scale;

    if (count >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);
        // Blend one pixel at a time until dst reaches a 16-byte boundary.
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
            src++;
            dst++;
            count--;
        }

        const __m128i *s = reinterpret_cast<const __m128i*>(src);
        __m128i *d = reinterpret_cast<__m128i*>(dst);

        while (count >= 4) {
            // Load 4 pixels each of src and dest.
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            src_pixel = SkAlphaMulQ_SSE2(src_pixel, src_scale);
            dst_pixel = SkAlphaMulQ_SSE2(dst_pixel, dst_scale);

            // Add result (per-byte; the scales sum to 256 so this cannot
            // overflow a channel).
            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail for the final <= 3 pixels.
    while (count > 0) {
        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
        src++;
        dst++;
        count--;
    }
}

/* SSE2 version of S32A_Opaque_BlitRow32(): per-pixel src-over of
 * premultiplied 32-bit src onto dst (global alpha must be 255).
 */
void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    sk_msan_assert_initialized(src, src+count);

    SkASSERT(alpha == 255);
    if (count <= 0) {
        return;
    }

#ifdef SK_USE_ACCURATE_BLENDING
    if (count >= 4) {
        SkASSERT(((size_t)dst & 0x03) == 0);
        // Scalar blend until dst is 16-byte aligned.
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src++;
            dst++;
            count--;
        }

        const __m128i *s = reinterpret_cast<const __m128i*>(src);
        __m128i *d = reinterpret_cast<__m128i*>(dst);
        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
        while (count >= 4) {
            // Load 4 pixels
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            // Split dst into R/B and A/G 16-bit lanes.
            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
            // Shift alphas down to lower 8 bits of each quad.
            __m128i alpha = _mm_srli_epi32(src_pixel, 24);

            // Copy alpha to upper 3rd byte of each quad
            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));

            // Subtract alphas from 255, to get 0..255 (inverse src alpha).
            alpha = _mm_sub_epi16(c_255, alpha);

            // Multiply dst red and blue by (255 - src alpha).
            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
            // Multiply dst alpha and green by (255 - src alpha).
            dst_ag = _mm_mullo_epi16(dst_ag, alpha);

            // dst_rb_low = (dst_rb >> 8)
            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);

            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
            // (accurate divide-by-255 via x/255 ~= (x + x/256 + 128) / 256)
            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
            dst_rb = _mm_add_epi16(dst_rb, c_128);
            dst_rb = _mm_srli_epi16(dst_rb, 8);

            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
            dst_ag = _mm_add_epi16(dst_ag, c_128);
            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

            // Combine back into RGBA.
            dst_pixel = _mm_or_si128(dst_rb, dst_ag);

            // Add result
            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail.
    while (count > 0) {
        *dst = SkPMSrcOver(*src, *dst);
        src++;
        dst++;
        count--;
    }
#else
    // Fast path: process 16 pixels per iteration with early-outs for
    // all-transparent and all-opaque groups (unaligned loads/stores
    // throughout, so no alignment prologue is needed).
    int count16 = count / 16;
    __m128i* dst4 = (__m128i*)dst;
    const __m128i* src4 = (const __m128i*)src;

    for (int i = 0; i < count16 * 4; i += 4) {
        // Load 16 source pixels.
        __m128i s0 = _mm_loadu_si128(src4+i+0),
                s1 = _mm_loadu_si128(src4+i+1),
                s2 = _mm_loadu_si128(src4+i+2),
                s3 = _mm_loadu_si128(src4+i+3);

        const __m128i alphaMask = _mm_set1_epi32(0xFF << SK_A32_SHIFT);
        // OR of all alphas is zero iff every alpha is zero.
        const __m128i ORed = _mm_or_si128(s3, _mm_or_si128(s2, _mm_or_si128(s1, s0)));
        __m128i cmp = _mm_cmpeq_epi8(_mm_and_si128(ORed, alphaMask), _mm_setzero_si128());
        if (0xffff == _mm_movemask_epi8(cmp)) {
            // All 16 source pixels are fully transparent. There's nothing to do!
            continue;
        }
        // AND of all alphas is 0xFF iff every alpha is 0xFF.
        const __m128i ANDed = _mm_and_si128(s3, _mm_and_si128(s2, _mm_and_si128(s1, s0)));
        cmp = _mm_cmpeq_epi8(_mm_and_si128(ANDed, alphaMask), alphaMask);
        if (0xffff == _mm_movemask_epi8(cmp)) {
            // All 16 source pixels are fully opaque. There's no need to read dst or blend it.
            _mm_storeu_si128(dst4+i+0, s0);
            _mm_storeu_si128(dst4+i+1, s1);
            _mm_storeu_si128(dst4+i+2, s2);
            _mm_storeu_si128(dst4+i+3, s3);
            continue;
        }
        // The general slow case: do the blend for all 16 pixels.
        _mm_storeu_si128(dst4+i+0, SkPMSrcOver_SSE2(s0, _mm_loadu_si128(dst4+i+0)));
        _mm_storeu_si128(dst4+i+1, SkPMSrcOver_SSE2(s1, _mm_loadu_si128(dst4+i+1)));
        _mm_storeu_si128(dst4+i+2, SkPMSrcOver_SSE2(s2, _mm_loadu_si128(dst4+i+2)));
        _mm_storeu_si128(dst4+i+3, SkPMSrcOver_SSE2(s3, _mm_loadu_si128(dst4+i+3)));
    }

    // Wrap up the last <= 15 pixels.
    SkASSERT(count - (count16*16) <= 15);
    for (int i = count16*16; i < count; i++) {
        // This check is not strictly necessary, but it prevents pointless autovectorization.
        if (src[i] & 0xFF000000) {
            dst[i] = SkPMSrcOver(src[i], dst[i]);
        }
    }
#endif
}

/* SSE2 version of S32A_Blend_BlitRow32(): per-pixel src-over with an
 * additional constant global alpha applied to src.
 */
void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);
    if (count <= 0) {
        return;
    }

    if (count >= 4) {
        // Scalar blend until dst is 16-byte aligned.
        while (((size_t)dst & 0x0F) != 0) {
            *dst = SkBlendARGB32(*src, *dst, alpha);
            src++;
            dst++;
            count--;
        }

        const __m128i *s = reinterpret_cast<const __m128i*>(src);
        __m128i *d = reinterpret_cast<__m128i*>(dst);
        while (count >= 4) {
            // Load 4 pixels each of src and dest.
            __m128i src_pixel = _mm_loadu_si128(s);
            __m128i dst_pixel = _mm_load_si128(d);

            __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
            _mm_store_si128(d, result);
            s++;
            d++;
            count -= 4;
        }
        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail.
    while (count > 0) {
        *dst = SkBlendARGB32(*src, *dst, alpha);
        src++;
        dst++;
        count--;
    }
}

/* Blend a constant 32-bit color (with alpha) onto a row of RGB565 pixels,
 * eight dst pixels per SSE2 iteration. x and y are unused here (no dither).
 */
void Color32A_D565_SSE2(uint16_t dst[], SkPMColor src, int count, int x, int y) {
    SkASSERT(count > 0);

    // Pack src into the expanded g:11 r:10 x:1 b:10 layout used by
    // SkBlend32_RGB16 for the scalar pixels.
    uint32_t src_expand = (SkGetPackedG32(src) << 24) |
                          (SkGetPackedR32(src) << 13) |
                          (SkGetPackedB32(src) << 2);
    // Inverse src alpha mapped to 0..32 (5-bit scale).
    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;

    // Check if we have enough pixels to run SIMD (8 plus whatever the
    // alignment prologue below will consume).
    if (count >= (int)(8 + (((16 - (size_t)dst) & 0x0F) >> 1))) {
        __m128i* dst_wide;
        const __m128i src_R_wide = _mm_set1_epi16(SkGetPackedR32(src) << 2);
        const __m128i src_G_wide = _mm_set1_epi16(SkGetPackedG32(src) << 3);
        const __m128i src_B_wide = _mm_set1_epi16(SkGetPackedB32(src) << 2);
        const __m128i scale_wide = _mm_set1_epi16(scale);
        const __m128i mask_blue = _mm_set1_epi16(SK_B16_MASK);
        const __m128i mask_green = _mm_set1_epi16(SK_G16_MASK << SK_G16_SHIFT);

        // Align dst to an even 16 byte address (0-7 pixels)
        while (((((size_t)dst) & 0x0F) != 0) && (count > 0)) {
            *dst = SkBlend32_RGB16(src_expand, *dst, scale);
            dst += 1;
            count--;
        }

        dst_wide = reinterpret_cast<__m128i*>(dst);
        do {
            // Load eight RGB565 pixels
            __m128i pixels = _mm_load_si128(dst_wide);

            // Mask out sub-pixels
            __m128i pixel_R = _mm_srli_epi16(pixels, SK_R16_SHIFT);
            __m128i pixel_G = _mm_slli_epi16(pixels, SK_R16_BITS);
            pixel_G = _mm_srli_epi16(pixel_G, SK_R16_BITS + SK_B16_BITS);
            __m128i pixel_B = _mm_and_si128(pixels, mask_blue);

            // Scale with alpha
            pixel_R = _mm_mullo_epi16(pixel_R, scale_wide);
            pixel_G = _mm_mullo_epi16(pixel_G, scale_wide);
            pixel_B = _mm_mullo_epi16(pixel_B, scale_wide);

            // Add src_X_wide and shift down again
            pixel_R = _mm_add_epi16(pixel_R, src_R_wide);
            pixel_R = _mm_srli_epi16(pixel_R, 5);
            pixel_G = _mm_add_epi16(pixel_G, src_G_wide);
            pixel_B = _mm_add_epi16(pixel_B, src_B_wide);
            pixel_B = _mm_srli_epi16(pixel_B, 5);

            // Combine into RGB565 and store
            pixel_R = _mm_slli_epi16(pixel_R, SK_R16_SHIFT);
            pixel_G = _mm_and_si128(pixel_G, mask_green);
            pixels = _mm_or_si128(pixel_R, pixel_G);
            pixels = _mm_or_si128(pixels, pixel_B);
            _mm_store_si128(dst_wide, pixels);
            count -= 8;
            dst_wide++;
        } while (count >= 8);

        dst = reinterpret_cast<uint16_t*>(dst_wide);
    }

    // Small loop to handle remaining pixels.
    while (count > 0) {
        *dst = SkBlend32_RGB16(src_expand, *dst, scale);
        dst += 1;
        count--;
    }
}

// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
310 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5) 311 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5) 312 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5) 313 314 #if SK_R16x5_R32x5_SHIFT == 0 315 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x) 316 #elif SK_R16x5_R32x5_SHIFT > 0 317 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT)) 318 #else 319 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT)) 320 #endif 321 322 #if SK_G16x5_G32x5_SHIFT == 0 323 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x) 324 #elif SK_G16x5_G32x5_SHIFT > 0 325 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT)) 326 #else 327 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT)) 328 #endif 329 330 #if SK_B16x5_B32x5_SHIFT == 0 331 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x) 332 #elif SK_B16x5_B32x5_SHIFT > 0 333 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT)) 334 #else 335 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT)) 336 #endif 337 338 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst, 339 __m128i &mask, __m128i &srcA) { 340 // In the following comments, the components of src, dst and mask are 341 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 342 // by an R, G, B, or A suffix. Components of one of the four pixels that 343 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 344 // example is the blue channel of the second destination pixel. Memory 345 // layout is shown for an ARGB byte order in a color value. 346 347 // src and srcA store 8-bit values interleaved with zeros. 
348 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 349 // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0, 350 // srcA, 0, srcA, 0, srcA, 0, srcA, 0) 351 // mask stores 16-bit values (compressed three channels) interleaved with zeros. 352 // Lo and Hi denote the low and high bytes of a 16-bit value, respectively. 353 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 354 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 355 356 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 357 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 358 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 359 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 360 361 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 362 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 363 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 364 365 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 366 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 367 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 368 369 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 370 // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an 371 // 8-bit position 372 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 373 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 374 mask = _mm_or_si128(_mm_or_si128(r, g), b); 375 376 // Interleave R,G,B into the lower byte of word. 377 // i.e. split the sixteen 8-bit values from mask into two sets of eight 378 // 16-bit values, padded by zero. 
379 __m128i maskLo, maskHi; 380 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 381 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 382 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 383 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 384 385 // Upscale from 0..31 to 0..32 386 // (allows to replace division by left-shift further down) 387 // Left-shift each component by 4 and add the result back to that component, 388 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 389 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 390 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 391 392 // Multiply each component of maskLo and maskHi by srcA 393 maskLo = _mm_mullo_epi16(maskLo, srcA); 394 maskHi = _mm_mullo_epi16(maskHi, srcA); 395 396 // Left shift mask components by 8 (divide by 256) 397 maskLo = _mm_srli_epi16(maskLo, 8); 398 maskHi = _mm_srli_epi16(maskHi, 8); 399 400 // Interleave R,G,B into the lower byte of the word 401 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 402 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 403 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 404 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 405 406 // mask = (src - dst) * mask 407 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 408 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 409 410 // mask = (src - dst) * mask >> 5 411 maskLo = _mm_srai_epi16(maskLo, 5); 412 maskHi = _mm_srai_epi16(maskHi, 5); 413 414 // Add two pixels into result. 415 // result = dst + ((src - dst) * mask >> 5) 416 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 417 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 418 419 // Pack into 4 32bit dst pixels. 420 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 
421 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 422 // clamping to 255 if necessary. 423 return _mm_packus_epi16(resultLo, resultHi); 424 } 425 426 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst, 427 __m128i &mask) { 428 // In the following comments, the components of src, dst and mask are 429 // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked 430 // by an R, G, B, or A suffix. Components of one of the four pixels that 431 // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for 432 // example is the blue channel of the second destination pixel. Memory 433 // layout is shown for an ARGB byte order in a color value. 434 435 // src and srcA store 8-bit values interleaved with zeros. 436 // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 437 // mask stores 16-bit values (shown as high and low bytes) interleaved with 438 // zeros 439 // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 440 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 441 442 // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits. 443 // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0) 444 __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask), 445 _mm_set1_epi32(0x1F << SK_R32_SHIFT)); 446 447 // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0) 448 __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask), 449 _mm_set1_epi32(0x1F << SK_G32_SHIFT)); 450 451 // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B) 452 __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask), 453 _mm_set1_epi32(0x1F << SK_B32_SHIFT)); 454 455 // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3) 456 // Each component (m0R, m0G, etc.) 
is then a 5-bit value aligned to an 457 // 8-bit position 458 // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B, 459 // 0, m2R, m2G, m2B, 0, m3R, m3G, m3B) 460 mask = _mm_or_si128(_mm_or_si128(r, g), b); 461 462 // Interleave R,G,B into the lower byte of word. 463 // i.e. split the sixteen 8-bit values from mask into two sets of eight 464 // 16-bit values, padded by zero. 465 __m128i maskLo, maskHi; 466 // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0) 467 maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128()); 468 // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0) 469 maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128()); 470 471 // Upscale from 0..31 to 0..32 472 // (allows to replace division by left-shift further down) 473 // Left-shift each component by 4 and add the result back to that component, 474 // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32 475 maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4)); 476 maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4)); 477 478 // Interleave R,G,B into the lower byte of the word 479 // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0) 480 __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128()); 481 // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0) 482 __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128()); 483 484 // mask = (src - dst) * mask 485 maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo)); 486 maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi)); 487 488 // mask = (src - dst) * mask >> 5 489 maskLo = _mm_srai_epi16(maskLo, 5); 490 maskHi = _mm_srai_epi16(maskHi, 5); 491 492 // Add two pixels into result. 493 // result = dst + ((src - dst) * mask >> 5) 494 __m128i resultLo = _mm_add_epi16(dstLo, maskLo); 495 __m128i resultHi = _mm_add_epi16(dstHi, maskHi); 496 497 // Pack into 4 32bit dst pixels and force opaque. 
498 // resultLo and resultHi contain eight 16-bit components (two pixels) each. 499 // Merge into one SSE regsiter with sixteen 8-bit values (four pixels), 500 // clamping to 255 if necessary. Set alpha components to 0xFF. 501 return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi), 502 _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT)); 503 } 504 505 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[], 506 SkColor src, int width, SkPMColor) { 507 if (width <= 0) { 508 return; 509 } 510 511 int srcA = SkColorGetA(src); 512 int srcR = SkColorGetR(src); 513 int srcG = SkColorGetG(src); 514 int srcB = SkColorGetB(src); 515 516 srcA = SkAlpha255To256(srcA); 517 518 if (width >= 4) { 519 SkASSERT(((size_t)dst & 0x03) == 0); 520 while (((size_t)dst & 0x0F) != 0) { 521 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 522 mask++; 523 dst++; 524 width--; 525 } 526 527 __m128i *d = reinterpret_cast<__m128i*>(dst); 528 // Set alpha to 0xFF and replicate source four times in SSE register. 529 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 530 // Interleave with zeros to get two sets of four 16-bit values. 531 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 532 // Set srcA_sse to contain eight copies of srcA, padded with zero. 533 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 534 __m128i srcA_sse = _mm_set1_epi16(srcA); 535 while (width >= 4) { 536 // Load four destination pixels into dst_sse. 537 __m128i dst_sse = _mm_load_si128(d); 538 // Load four 16-bit masks into lower half of mask_sse. 
539 __m128i mask_sse = _mm_loadl_epi64( 540 reinterpret_cast<const __m128i*>(mask)); 541 542 // Check whether masks are equal to 0 and get the highest bit 543 // of each byte of result, if masks are all zero, we will get 544 // pack_cmp to 0xFFFF 545 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 546 _mm_setzero_si128())); 547 548 // if mask pixels are not all zero, we will blend the dst pixels 549 if (pack_cmp != 0xFFFF) { 550 // Unpack 4 16bit mask pixels to 551 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 552 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 553 mask_sse = _mm_unpacklo_epi16(mask_sse, 554 _mm_setzero_si128()); 555 556 // Process 4 32bit dst pixels 557 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse, 558 mask_sse, srcA_sse); 559 _mm_store_si128(d, result); 560 } 561 562 d++; 563 mask += 4; 564 width -= 4; 565 } 566 567 dst = reinterpret_cast<SkPMColor*>(d); 568 } 569 570 while (width > 0) { 571 *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask); 572 mask++; 573 dst++; 574 width--; 575 } 576 } 577 578 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[], 579 SkColor src, int width, SkPMColor opaqueDst) { 580 if (width <= 0) { 581 return; 582 } 583 584 int srcR = SkColorGetR(src); 585 int srcG = SkColorGetG(src); 586 int srcB = SkColorGetB(src); 587 588 if (width >= 4) { 589 SkASSERT(((size_t)dst & 0x03) == 0); 590 while (((size_t)dst & 0x0F) != 0) { 591 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 592 mask++; 593 dst++; 594 width--; 595 } 596 597 __m128i *d = reinterpret_cast<__m128i*>(dst); 598 // Set alpha to 0xFF and replicate source four times in SSE register. 599 __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)); 600 // Set srcA_sse to contain eight copies of srcA, padded with zero. 
601 // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0) 602 src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128()); 603 while (width >= 4) { 604 // Load four destination pixels into dst_sse. 605 __m128i dst_sse = _mm_load_si128(d); 606 // Load four 16-bit masks into lower half of mask_sse. 607 __m128i mask_sse = _mm_loadl_epi64( 608 reinterpret_cast<const __m128i*>(mask)); 609 610 // Check whether masks are equal to 0 and get the highest bit 611 // of each byte of result, if masks are all zero, we will get 612 // pack_cmp to 0xFFFF 613 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse, 614 _mm_setzero_si128())); 615 616 // if mask pixels are not all zero, we will blend the dst pixels 617 if (pack_cmp != 0xFFFF) { 618 // Unpack 4 16bit mask pixels to 619 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0, 620 // m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0) 621 mask_sse = _mm_unpacklo_epi16(mask_sse, 622 _mm_setzero_si128()); 623 624 // Process 4 32bit dst pixels 625 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse, 626 mask_sse); 627 _mm_store_si128(d, result); 628 } 629 630 d++; 631 mask += 4; 632 width -= 4; 633 } 634 635 dst = reinterpret_cast<SkPMColor*>(d); 636 } 637 638 while (width > 0) { 639 *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst); 640 mask++; 641 dst++; 642 width--; 643 } 644 } 645 646 /* SSE2 version of S32_D565_Opaque() 647 * portable version is in core/SkBlitRow_D16.cpp 648 */ 649 void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 650 const SkPMColor* SK_RESTRICT src, int count, 651 U8CPU alpha, int /*x*/, int /*y*/) { 652 SkASSERT(255 == alpha); 653 654 if (count <= 0) { 655 return; 656 } 657 658 if (count >= 8) { 659 while (((size_t)dst & 0x0F) != 0) { 660 SkPMColor c = *src++; 661 SkPMColorAssert(c); 662 663 *dst++ = SkPixel32ToPixel16_ToU16(c); 664 count--; 665 } 666 667 const __m128i* s = reinterpret_cast<const __m128i*>(src); 668 __m128i* d = 
reinterpret_cast<__m128i*>(dst); 669 670 while (count >= 8) { 671 // Load 8 pixels of src. 672 __m128i src_pixel1 = _mm_loadu_si128(s++); 673 __m128i src_pixel2 = _mm_loadu_si128(s++); 674 675 __m128i d_pixel = SkPixel32ToPixel16_ToU16_SSE2(src_pixel1, src_pixel2); 676 _mm_store_si128(d++, d_pixel); 677 count -= 8; 678 } 679 src = reinterpret_cast<const SkPMColor*>(s); 680 dst = reinterpret_cast<uint16_t*>(d); 681 } 682 683 if (count > 0) { 684 do { 685 SkPMColor c = *src++; 686 SkPMColorAssert(c); 687 *dst++ = SkPixel32ToPixel16_ToU16(c); 688 } while (--count != 0); 689 } 690 } 691 692 /* SSE2 version of S32A_D565_Opaque() 693 * portable version is in core/SkBlitRow_D16.cpp 694 */ 695 void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, 696 const SkPMColor* SK_RESTRICT src, 697 int count, U8CPU alpha, int /*x*/, int /*y*/) { 698 SkASSERT(255 == alpha); 699 700 if (count <= 0) { 701 return; 702 } 703 704 if (count >= 8) { 705 // Make dst 16 bytes alignment 706 while (((size_t)dst & 0x0F) != 0) { 707 SkPMColor c = *src++; 708 if (c) { 709 *dst = SkSrcOver32To16(c, *dst); 710 } 711 dst += 1; 712 count--; 713 } 714 715 const __m128i* s = reinterpret_cast<const __m128i*>(src); 716 __m128i* d = reinterpret_cast<__m128i*>(dst); 717 __m128i var255 = _mm_set1_epi16(255); 718 __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK); 719 __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK); 720 __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK); 721 722 while (count >= 8) { 723 // Load 8 pixels of src. 724 __m128i src_pixel1 = _mm_loadu_si128(s++); 725 __m128i src_pixel2 = _mm_loadu_si128(s++); 726 727 // Check whether src pixels are equal to 0 and get the highest bit 728 // of each byte of result, if src pixels are all zero, src_cmp1 and 729 // src_cmp2 will be 0xFFFF. 
730 int src_cmp1 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel1, 731 _mm_setzero_si128())); 732 int src_cmp2 = _mm_movemask_epi8(_mm_cmpeq_epi16(src_pixel2, 733 _mm_setzero_si128())); 734 if (src_cmp1 == 0xFFFF && src_cmp2 == 0xFFFF) { 735 d++; 736 count -= 8; 737 continue; 738 } 739 740 // Load 8 pixels of dst. 741 __m128i dst_pixel = _mm_load_si128(d); 742 743 // Extract A from src. 744 __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT)); 745 sa1 = _mm_srli_epi32(sa1, 24); 746 __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT)); 747 sa2 = _mm_srli_epi32(sa2, 24); 748 __m128i sa = _mm_packs_epi32(sa1, sa2); 749 750 // Extract R from src. 751 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); 752 sr1 = _mm_srli_epi32(sr1, 24); 753 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); 754 sr2 = _mm_srli_epi32(sr2, 24); 755 __m128i sr = _mm_packs_epi32(sr1, sr2); 756 757 // Extract G from src. 758 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); 759 sg1 = _mm_srli_epi32(sg1, 24); 760 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); 761 sg2 = _mm_srli_epi32(sg2, 24); 762 __m128i sg = _mm_packs_epi32(sg1, sg2); 763 764 // Extract B from src. 765 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); 766 sb1 = _mm_srli_epi32(sb1, 24); 767 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); 768 sb2 = _mm_srli_epi32(sb2, 24); 769 __m128i sb = _mm_packs_epi32(sb1, sb2); 770 771 // Extract R G B from dst. 772 __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); 773 dr = _mm_and_si128(dr, r16_mask); 774 __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); 775 dg = _mm_and_si128(dg, g16_mask); 776 __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); 777 db = _mm_and_si128(db, b16_mask); 778 779 __m128i isa = _mm_sub_epi16(var255, sa); // 255 -sa 780 781 // Calculate R G B of result. 782 // Original algorithm is in SkSrcOver32To16(). 
783 dr = _mm_add_epi16(sr, SkMul16ShiftRound_SSE2(dr, isa, SK_R16_BITS)); 784 dr = _mm_srli_epi16(dr, 8 - SK_R16_BITS); 785 dg = _mm_add_epi16(sg, SkMul16ShiftRound_SSE2(dg, isa, SK_G16_BITS)); 786 dg = _mm_srli_epi16(dg, 8 - SK_G16_BITS); 787 db = _mm_add_epi16(sb, SkMul16ShiftRound_SSE2(db, isa, SK_B16_BITS)); 788 db = _mm_srli_epi16(db, 8 - SK_B16_BITS); 789 790 // Pack R G B into 16-bit color. 791 __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db); 792 793 // Store 8 16-bit colors in dst. 794 _mm_store_si128(d++, d_pixel); 795 count -= 8; 796 } 797 798 src = reinterpret_cast<const SkPMColor*>(s); 799 dst = reinterpret_cast<uint16_t*>(d); 800 } 801 802 if (count > 0) { 803 do { 804 SkPMColor c = *src++; 805 SkPMColorAssert(c); 806 if (c) { 807 *dst = SkSrcOver32To16(c, *dst); 808 } 809 dst += 1; 810 } while (--count != 0); 811 } 812 } 813 814 void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, 815 const SkPMColor* SK_RESTRICT src, 816 int count, U8CPU alpha, int x, int y) { 817 SkASSERT(255 == alpha); 818 819 if (count <= 0) { 820 return; 821 } 822 823 if (count >= 8) { 824 while (((size_t)dst & 0x0F) != 0) { 825 DITHER_565_SCAN(y); 826 SkPMColor c = *src++; 827 SkPMColorAssert(c); 828 829 unsigned dither = DITHER_VALUE(x); 830 *dst++ = SkDitherRGB32To565(c, dither); 831 DITHER_INC_X(x); 832 count--; 833 } 834 835 unsigned short dither_value[8]; 836 __m128i dither; 837 #ifdef ENABLE_DITHER_MATRIX_4X4 838 const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; 839 dither_value[0] = dither_value[4] = dither_scan[(x) & 3]; 840 dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3]; 841 dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3]; 842 dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3]; 843 #else 844 const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; 845 dither_value[0] = dither_value[4] = (dither_scan 846 >> (((x) & 3) << 2)) & 0xF; 847 dither_value[1] = dither_value[5] = (dither_scan 848 >> (((x + 1) & 3) << 2)) 
& 0xF; 849 dither_value[2] = dither_value[6] = (dither_scan 850 >> (((x + 2) & 3) << 2)) & 0xF; 851 dither_value[3] = dither_value[7] = (dither_scan 852 >> (((x + 3) & 3) << 2)) & 0xF; 853 #endif 854 dither = _mm_loadu_si128((__m128i*) dither_value); 855 856 const __m128i* s = reinterpret_cast<const __m128i*>(src); 857 __m128i* d = reinterpret_cast<__m128i*>(dst); 858 859 while (count >= 8) { 860 // Load 8 pixels of src. 861 __m128i src_pixel1 = _mm_loadu_si128(s++); 862 __m128i src_pixel2 = _mm_loadu_si128(s++); 863 864 // Extract R from src. 865 __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT)); 866 sr1 = _mm_srli_epi32(sr1, 24); 867 __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT)); 868 sr2 = _mm_srli_epi32(sr2, 24); 869 __m128i sr = _mm_packs_epi32(sr1, sr2); 870 871 // SkDITHER_R32To565(sr, dither) 872 __m128i sr_offset = _mm_srli_epi16(sr, 5); 873 sr = _mm_add_epi16(sr, dither); 874 sr = _mm_sub_epi16(sr, sr_offset); 875 sr = _mm_srli_epi16(sr, SK_R32_BITS - SK_R16_BITS); 876 877 // Extract G from src. 878 __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT)); 879 sg1 = _mm_srli_epi32(sg1, 24); 880 __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT)); 881 sg2 = _mm_srli_epi32(sg2, 24); 882 __m128i sg = _mm_packs_epi32(sg1, sg2); 883 884 // SkDITHER_R32To565(sg, dither) 885 __m128i sg_offset = _mm_srli_epi16(sg, 6); 886 sg = _mm_add_epi16(sg, _mm_srli_epi16(dither, 1)); 887 sg = _mm_sub_epi16(sg, sg_offset); 888 sg = _mm_srli_epi16(sg, SK_G32_BITS - SK_G16_BITS); 889 890 // Extract B from src. 
891 __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT)); 892 sb1 = _mm_srli_epi32(sb1, 24); 893 __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT)); 894 sb2 = _mm_srli_epi32(sb2, 24); 895 __m128i sb = _mm_packs_epi32(sb1, sb2); 896 897 // SkDITHER_R32To565(sb, dither) 898 __m128i sb_offset = _mm_srli_epi16(sb, 5); 899 sb = _mm_add_epi16(sb, dither); 900 sb = _mm_sub_epi16(sb, sb_offset); 901 sb = _mm_srli_epi16(sb, SK_B32_BITS - SK_B16_BITS); 902 903 // Pack and store 16-bit dst pixel. 904 __m128i d_pixel = SkPackRGB16_SSE2(sr, sg, sb); 905 _mm_store_si128(d++, d_pixel); 906 907 count -= 8; 908 x += 8; 909 } 910 911 src = reinterpret_cast<const SkPMColor*>(s); 912 dst = reinterpret_cast<uint16_t*>(d); 913 } 914 915 if (count > 0) { 916 DITHER_565_SCAN(y); 917 do { 918 SkPMColor c = *src++; 919 SkPMColorAssert(c); 920 921 unsigned dither = DITHER_VALUE(x); 922 *dst++ = SkDitherRGB32To565(c, dither); 923 DITHER_INC_X(x); 924 } while (--count != 0); 925 } 926 } 927 928 /* SSE2 version of S32A_D565_Opaque_Dither() 929 * portable version is in core/SkBlitRow_D16.cpp 930 */ 931 void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, 932 const SkPMColor* SK_RESTRICT src, 933 int count, U8CPU alpha, int x, int y) { 934 SkASSERT(255 == alpha); 935 936 if (count <= 0) { 937 return; 938 } 939 940 if (count >= 8) { 941 while (((size_t)dst & 0x0F) != 0) { 942 DITHER_565_SCAN(y); 943 SkPMColor c = *src++; 944 SkPMColorAssert(c); 945 if (c) { 946 unsigned a = SkGetPackedA32(c); 947 948 int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); 949 950 unsigned sr = SkGetPackedR32(c); 951 unsigned sg = SkGetPackedG32(c); 952 unsigned sb = SkGetPackedB32(c); 953 sr = SkDITHER_R32_FOR_565(sr, d); 954 sg = SkDITHER_G32_FOR_565(sg, d); 955 sb = SkDITHER_B32_FOR_565(sb, d); 956 957 uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 958 uint32_t dst_expanded = SkExpand_rgb_16(*dst); 959 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 960 
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
            count--;
        }

        // Build the 8-lane dither vector for this row: the 4-entry dither
        // row repeated twice, one 16-bit lane per pixel.
        unsigned short dither_value[8];
        __m128i dither, dither_cur;
#ifdef ENABLE_DITHER_MATRIX_4X4
        const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3];
        dither_value[0] = dither_value[4] = dither_scan[(x) & 3];
        dither_value[1] = dither_value[5] = dither_scan[(x + 1) & 3];
        dither_value[2] = dither_value[6] = dither_scan[(x + 2) & 3];
        dither_value[3] = dither_value[7] = dither_scan[(x + 3) & 3];
#else
        // Each matrix row packs four 4-bit dither entries into a uint16_t.
        const uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3];
        dither_value[0] = dither_value[4] = (dither_scan
                                             >> (((x) & 3) << 2)) & 0xF;
        dither_value[1] = dither_value[5] = (dither_scan
                                             >> (((x + 1) & 3) << 2)) & 0xF;
        dither_value[2] = dither_value[6] = (dither_scan
                                             >> (((x + 2) & 3) << 2)) & 0xF;
        dither_value[3] = dither_value[7] = (dither_scan
                                             >> (((x + 3) & 3) << 2)) & 0xF;
#endif
        dither = _mm_loadu_si128((__m128i*) dither_value);

        const __m128i* s = reinterpret_cast<const __m128i*>(src);
        __m128i* d = reinterpret_cast<__m128i*>(dst);
        __m128i var256 = _mm_set1_epi16(256);
        __m128i r16_mask = _mm_set1_epi16(SK_R16_MASK);
        __m128i g16_mask = _mm_set1_epi16(SK_G16_MASK);
        __m128i b16_mask = _mm_set1_epi16(SK_B16_MASK);

        while (count >= 8) {
            // Load 8 pixels of src and dst.
            __m128i src_pixel1 = _mm_loadu_si128(s++);
            __m128i src_pixel2 = _mm_loadu_si128(s++);
            __m128i dst_pixel = _mm_load_si128(d);

            // Extract A from src into eight 16-bit lanes.
            __m128i sa1 = _mm_slli_epi32(src_pixel1, (24 - SK_A32_SHIFT));
            sa1 = _mm_srli_epi32(sa1, 24);
            __m128i sa2 = _mm_slli_epi32(src_pixel2, (24 - SK_A32_SHIFT));
            sa2 = _mm_srli_epi32(sa2, 24);
            __m128i sa = _mm_packs_epi32(sa1, sa2);

            // Calculate current dither value, scaled by src alpha:
            // dither * (a + 1) >> 8 per lane (vector form of SkAlphaMul with
            // SkAlpha255To256(a)).
            dither_cur = _mm_mullo_epi16(dither,
                                         _mm_add_epi16(sa, _mm_set1_epi16(1)));
            dither_cur = _mm_srli_epi16(dither_cur, 8);

            // Extract R from src.
            __m128i sr1 = _mm_slli_epi32(src_pixel1, (24 - SK_R32_SHIFT));
            sr1 = _mm_srli_epi32(sr1, 24);
            __m128i sr2 = _mm_slli_epi32(src_pixel2, (24 - SK_R32_SHIFT));
            sr2 = _mm_srli_epi32(sr2, 24);
            __m128i sr = _mm_packs_epi32(sr1, sr2);

            // SkDITHER_R32_FOR_565(sr, d): sr + d - (sr >> 5).
            __m128i sr_offset = _mm_srli_epi16(sr, 5);
            sr = _mm_add_epi16(sr, dither_cur);
            sr = _mm_sub_epi16(sr, sr_offset);

            // Expand sr to the 10-bit scale used by the blend below
            // (matches the 5-bit dst red times the 5-bit inverse alpha).
            sr = _mm_slli_epi16(sr, 2);

            // Extract G from src.
            __m128i sg1 = _mm_slli_epi32(src_pixel1, (24 - SK_G32_SHIFT));
            sg1 = _mm_srli_epi32(sg1, 24);
            __m128i sg2 = _mm_slli_epi32(src_pixel2, (24 - SK_G32_SHIFT));
            sg2 = _mm_srli_epi32(sg2, 24);
            __m128i sg = _mm_packs_epi32(sg1, sg2);

            // sg = SkDITHER_G32_FOR_565(sg, d): green keeps 6 bits, so it
            // uses half the dither value and a 6-bit self-correction.
            __m128i sg_offset = _mm_srli_epi16(sg, 6);
            sg = _mm_add_epi16(sg, _mm_srli_epi16(dither_cur, 1));
            sg = _mm_sub_epi16(sg, sg_offset);

            // Expand sg to the 11-bit scale (6-bit dst green * 5-bit isa).
            sg = _mm_slli_epi16(sg, 3);

            // Extract B from src.
            __m128i sb1 = _mm_slli_epi32(src_pixel1, (24 - SK_B32_SHIFT));
            sb1 = _mm_srli_epi32(sb1, 24);
            __m128i sb2 = _mm_slli_epi32(src_pixel2, (24 - SK_B32_SHIFT));
            sb2 = _mm_srli_epi32(sb2, 24);
            __m128i sb = _mm_packs_epi32(sb1, sb2);

            // sb = SkDITHER_B32_FOR_565(sb, d).
            __m128i sb_offset = _mm_srli_epi16(sb, 5);
            sb = _mm_add_epi16(sb, dither_cur);
            sb = _mm_sub_epi16(sb, sb_offset);

            // Expand sb (same 10-bit scale as red).
            sb = _mm_slli_epi16(sb, 2);

            // Extract R G B from dst (each already at 565 bit width).
            __m128i dr = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT);
            dr = _mm_and_si128(dr, r16_mask);
            __m128i dg = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT);
            dg = _mm_and_si128(dg, g16_mask);
            __m128i db = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT);
            db = _mm_and_si128(db, b16_mask);

            // SkAlpha255To256(255 - a) >> 3: inverse src alpha scaled to 0..32.
            __m128i isa = _mm_sub_epi16(var256, sa);
            isa = _mm_srli_epi16(isa, 3);

            // Blend each channel: (dst * isa + expanded_src) >> 5.
            dr = _mm_mullo_epi16(dr, isa);
            dr = _mm_add_epi16(dr, sr);
            dr = _mm_srli_epi16(dr, 5);

            dg = _mm_mullo_epi16(dg, isa);
            dg = _mm_add_epi16(dg, sg);
            dg = _mm_srli_epi16(dg, 5);

            db = _mm_mullo_epi16(db, isa);
            db = _mm_add_epi16(db, sb);
            db = _mm_srli_epi16(db, 5);

            // Package and store dst pixel.
            __m128i d_pixel = SkPackRGB16_SSE2(dr, dg, db);
            _mm_store_si128(d++, d_pixel);

            count -= 8;
            x += 8;
        }

        src = reinterpret_cast<const SkPMColor*>(s);
        dst = reinterpret_cast<uint16_t*>(d);
    }

    // Scalar tail for the remaining (< 8) pixels; mirrors the portable
    // version in core/SkBlitRow_D16.cpp.
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // Scale the dither value by the src alpha.
                int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}