1 /* 2 ** 3 ** Copyright 2009, The Android Open Source Project 4 ** 5 ** Licensed under the Apache License, Version 2.0 (the "License"); 6 ** you may not use this file except in compliance with the License. 7 ** You may obtain a copy of the License at 8 ** 9 ** http://www.apache.org/licenses/LICENSE-2.0 10 ** 11 ** Unless required by applicable law or agreed to in writing, software 12 ** distributed under the License is distributed on an "AS IS" BASIS, 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 ** See the License for the specific language governing permissions and 15 ** limitations under the License. 16 */ 17 18 #include "SkBlitRow_opts_SSE2.h" 19 #include "SkColorPriv.h" 20 #include "SkUtils.h" 21 22 #include <emmintrin.h> 23 24 /* SSE2 version of S32_Blend_BlitRow32() 25 * portable version is in core/SkBlitRow_D32.cpp 26 */ 27 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 28 const SkPMColor* SK_RESTRICT src, 29 int count, U8CPU alpha) { 30 SkASSERT(alpha <= 255); 31 if (count <= 0) { 32 return; 33 } 34 35 uint32_t src_scale = SkAlpha255To256(alpha); 36 uint32_t dst_scale = 256 - src_scale; 37 38 if (count >= 4) { 39 SkASSERT(((size_t)dst & 0x03) == 0); 40 while (((size_t)dst & 0x0F) != 0) { 41 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 42 src++; 43 dst++; 44 count--; 45 } 46 47 const __m128i *s = reinterpret_cast<const __m128i*>(src); 48 __m128i *d = reinterpret_cast<__m128i*>(dst); 49 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 50 __m128i src_scale_wide = _mm_set1_epi16(src_scale); 51 __m128i dst_scale_wide = _mm_set1_epi16(dst_scale); 52 while (count >= 4) { 53 // Load 4 pixels each of src and dest. 54 __m128i src_pixel = _mm_loadu_si128(s); 55 __m128i dst_pixel = _mm_load_si128(d); 56 57 // Get red and blue pixels into lower byte of each word. 58 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 59 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 60 61 // Get alpha and green into lower byte of each word. 62 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 63 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 64 65 // Multiply by scale. 66 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 67 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 68 dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide); 69 dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide); 70 71 // Divide by 256. 72 src_rb = _mm_srli_epi16(src_rb, 8); 73 dst_rb = _mm_srli_epi16(dst_rb, 8); 74 src_ag = _mm_andnot_si128(rb_mask, src_ag); 75 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 76 77 // Combine back into RGBA. 78 src_pixel = _mm_or_si128(src_rb, src_ag); 79 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 80 81 // Add result 82 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 83 _mm_store_si128(d, result); 84 s++; 85 d++; 86 count -= 4; 87 } 88 src = reinterpret_cast<const SkPMColor*>(s); 89 dst = reinterpret_cast<SkPMColor*>(d); 90 } 91 92 while (count > 0) { 93 *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); 94 src++; 95 dst++; 96 count--; 97 } 98 } 99 100 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 101 const SkPMColor* SK_RESTRICT src, 102 int count, U8CPU alpha) { 103 SkASSERT(alpha == 255); 104 if (count <= 0) { 105 return; 106 } 107 108 if (count >= 4) { 109 SkASSERT(((size_t)dst & 0x03) == 0); 110 while (((size_t)dst & 0x0F) != 0) { 111 *dst = SkPMSrcOver(*src, *dst); 112 src++; 113 dst++; 114 count--; 115 } 116 117 const __m128i *s = reinterpret_cast<const __m128i*>(src); 118 __m128i *d = reinterpret_cast<__m128i*>(dst); 119 #ifdef SK_USE_ACCURATE_BLENDING 120 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 121 __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) 122 __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) 123 while (count >= 4) { 124 // Load 4 pixels 125 __m128i src_pixel = _mm_loadu_si128(s); 126 __m128i dst_pixel = _mm_load_si128(d); 127 128 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 129 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 130 // Shift alphas down to lower 8 bits of each quad. 131 __m128i alpha = _mm_srli_epi32(src_pixel, 24); 132 133 // Copy alpha to upper 3rd byte of each quad 134 alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); 135 136 // Subtract alphas from 255, to get 0..255 137 alpha = _mm_sub_epi16(c_255, alpha); 138 139 // Multiply by red and blue by src alpha. 140 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 141 // Multiply by alpha and green by src alpha. 142 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 143 144 // dst_rb_low = (dst_rb >> 8) 145 __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); 146 __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); 147 148 // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 149 dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); 150 dst_rb = _mm_add_epi16(dst_rb, c_128); 151 dst_rb = _mm_srli_epi16(dst_rb, 8); 152 153 // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask 154 dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); 155 dst_ag = _mm_add_epi16(dst_ag, c_128); 156 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 157 158 // Combine back into RGBA. 159 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 160 161 // Add result 162 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 163 _mm_store_si128(d, result); 164 s++; 165 d++; 166 count -= 4; 167 } 168 #else 169 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 170 __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) 171 while (count >= 4) { 172 // Load 4 pixels 173 __m128i src_pixel = _mm_loadu_si128(s); 174 __m128i dst_pixel = _mm_load_si128(d); 175 176 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 177 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 178 179 // (a0, g0, a1, g1, a2, g2, a3, g3) (low byte of each word) 180 __m128i alpha = _mm_srli_epi16(src_pixel, 8); 181 182 // (a0, a0, a1, a1, a2, g2, a3, g3) 183 alpha = _mm_shufflehi_epi16(alpha, 0xF5); 184 185 // (a0, a0, a1, a1, a2, a2, a3, a3) 186 alpha = _mm_shufflelo_epi16(alpha, 0xF5); 187 188 // Subtract alphas from 256, to get 1..256 189 alpha = _mm_sub_epi16(c_256, alpha); 190 191 // Multiply by red and blue by src alpha. 192 dst_rb = _mm_mullo_epi16(dst_rb, alpha); 193 // Multiply by alpha and green by src alpha. 194 dst_ag = _mm_mullo_epi16(dst_ag, alpha); 195 196 // Divide by 256. 197 dst_rb = _mm_srli_epi16(dst_rb, 8); 198 199 // Mask out high bits (already in the right place) 200 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 201 202 // Combine back into RGBA. 203 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 204 205 // Add result 206 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 207 _mm_store_si128(d, result); 208 s++; 209 d++; 210 count -= 4; 211 } 212 #endif 213 src = reinterpret_cast<const SkPMColor*>(s); 214 dst = reinterpret_cast<SkPMColor*>(d); 215 } 216 217 while (count > 0) { 218 *dst = SkPMSrcOver(*src, *dst); 219 src++; 220 dst++; 221 count--; 222 } 223 } 224 225 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, 226 const SkPMColor* SK_RESTRICT src, 227 int count, U8CPU alpha) { 228 SkASSERT(alpha <= 255); 229 if (count <= 0) { 230 return; 231 } 232 233 if (count >= 4) { 234 while (((size_t)dst & 0x0F) != 0) { 235 *dst = SkBlendARGB32(*src, *dst, alpha); 236 src++; 237 dst++; 238 count--; 239 } 240 241 uint32_t src_scale = SkAlpha255To256(alpha); 242 243 const __m128i *s = reinterpret_cast<const __m128i*>(src); 244 __m128i *d = reinterpret_cast<__m128i*>(dst); 245 __m128i src_scale_wide = _mm_set1_epi16(src_scale); 246 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 247 __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) 248 while (count >= 4) { 249 // Load 4 pixels each of src and dest. 250 __m128i src_pixel = _mm_loadu_si128(s); 251 __m128i dst_pixel = _mm_load_si128(d); 252 253 // Get red and blue pixels into lower byte of each word. 254 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 255 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 256 257 // Get alpha and green into lower byte of each word. 258 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 259 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 260 261 // Put per-pixel alpha in low byte of each word. 262 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 263 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 264 265 // dst_alpha = dst_alpha * src_scale 266 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 267 268 // Divide by 256. 269 dst_alpha = _mm_srli_epi16(dst_alpha, 8); 270 271 // Subtract alphas from 256, to get 1..256 272 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 273 274 // Multiply red and blue by dst pixel alpha. 275 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 276 // Multiply alpha and green by dst pixel alpha. 277 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 278 279 // Multiply red and blue by global alpha. 280 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 281 // Multiply alpha and green by global alpha. 282 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 283 284 // Divide by 256. 285 dst_rb = _mm_srli_epi16(dst_rb, 8); 286 src_rb = _mm_srli_epi16(src_rb, 8); 287 288 // Mask out low bits (goodies already in the right place; no need to divide) 289 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 290 src_ag = _mm_andnot_si128(rb_mask, src_ag); 291 292 // Combine back into RGBA. 293 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 294 src_pixel = _mm_or_si128(src_rb, src_ag); 295 296 // Add two pixels into result. 297 __m128i result = _mm_add_epi8(src_pixel, dst_pixel); 298 _mm_store_si128(d, result); 299 s++; 300 d++; 301 count -= 4; 302 } 303 src = reinterpret_cast<const SkPMColor*>(s); 304 dst = reinterpret_cast<SkPMColor*>(d); 305 } 306 307 while (count > 0) { 308 *dst = SkBlendARGB32(*src, *dst, alpha); 309 src++; 310 dst++; 311 count--; 312 } 313 } 314 315 /* SSE2 version of Color32() 316 * portable version is in core/SkBlitRow_D32.cpp 317 */ 318 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count, 319 SkPMColor color) { 320 321 if (count <= 0) { 322 return; 323 } 324 325 if (0 == color) { 326 if (src != dst) { 327 memcpy(dst, src, count * sizeof(SkPMColor)); 328 } 329 } 330 331 unsigned colorA = SkGetPackedA32(color); 332 if (255 == colorA) { 333 sk_memset32(dst, color, count); 334 } else { 335 unsigned scale = 256 - SkAlpha255To256(colorA); 336 337 if (count >= 4) { 338 SkASSERT(((size_t)dst & 0x03) == 0); 339 while (((size_t)dst & 0x0F) != 0) { 340 *dst = color + SkAlphaMulQ(*src, scale); 341 src++; 342 dst++; 343 count--; 344 } 345 346 const __m128i *s = reinterpret_cast<const __m128i*>(src); 347 __m128i *d = reinterpret_cast<__m128i*>(dst); 348 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 349 __m128i src_scale_wide = _mm_set1_epi16(scale); 350 __m128i color_wide = _mm_set1_epi32(color); 351 while (count >= 4) { 352 // Load 4 pixels each of src and dest. 353 __m128i src_pixel = _mm_loadu_si128(s); 354 355 // Get red and blue pixels into lower byte of each word. 356 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 357 358 // Get alpha and green into lower byte of each word. 359 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 360 361 // Multiply by scale. 362 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 363 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 364 365 // Divide by 256. 366 src_rb = _mm_srli_epi16(src_rb, 8); 367 src_ag = _mm_andnot_si128(rb_mask, src_ag); 368 369 // Combine back into RGBA. 370 src_pixel = _mm_or_si128(src_rb, src_ag); 371 372 // Add color to result. 373 __m128i result = _mm_add_epi8(color_wide, src_pixel); 374 375 // Store result. 376 _mm_store_si128(d, result); 377 s++; 378 d++; 379 count -= 4; 380 } 381 src = reinterpret_cast<const SkPMColor*>(s); 382 dst = reinterpret_cast<SkPMColor*>(d); 383 } 384 385 while (count > 0) { 386 *dst = color + SkAlphaMulQ(*src, scale); 387 src += 1; 388 dst += 1; 389 count--; 390 } 391 } 392 } 393 394 void SkARGB32_BlitMask_SSE2(void* device, size_t dstRB, 395 SkBitmap::Config dstConfig, const uint8_t* mask, 396 size_t maskRB, SkColor origColor, 397 int width, int height) 398 { 399 SkPMColor color = SkPreMultiplyColor(origColor); 400 size_t dstOffset = dstRB - (width << 2); 401 size_t maskOffset = maskRB - width; 402 SkPMColor* dst = (SkPMColor *)device; 403 do { 404 int count = width; 405 if (count >= 4) { 406 while (((size_t)dst & 0x0F) != 0 && (count > 0)) { 407 *dst = SkBlendARGB32(color, *dst, *mask); 408 mask++; 409 dst++; 410 count--; 411 } 412 __m128i *d = reinterpret_cast<__m128i*>(dst); 413 __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); 414 __m128i c_256 = _mm_set1_epi16(256); 415 __m128i c_1 = _mm_set1_epi16(1); 416 __m128i src_pixel = _mm_set1_epi32(color); 417 while (count >= 4) { 418 // Load 4 pixels each of src and dest. 419 __m128i dst_pixel = _mm_load_si128(d); 420 421 //set the aphla value 422 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),\ 423 0, *(mask+3),0, \ 424 *(mask+2),0, *(mask+2),\ 425 0,*(mask+1), 0,*(mask+1),\ 426 0, *mask,0,*mask); 427 428 //call SkAlpha255To256() 429 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1); 430 431 // Get red and blue pixels into lower byte of each word. 432 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); 433 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); 434 435 // Get alpha and green into lower byte of each word. 436 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); 437 __m128i src_ag = _mm_srli_epi16(src_pixel, 8); 438 439 // Put per-pixel alpha in low byte of each word. 440 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); 441 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); 442 443 // dst_alpha = dst_alpha * src_scale 444 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); 445 446 // Divide by 256. 447 dst_alpha = _mm_srli_epi16(dst_alpha, 8); 448 449 // Subtract alphas from 256, to get 1..256 450 dst_alpha = _mm_sub_epi16(c_256, dst_alpha); 451 // Multiply red and blue by dst pixel alpha. 452 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); 453 // Multiply alpha and green by dst pixel alpha. 454 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); 455 456 // Multiply red and blue by global alpha. 457 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); 458 // Multiply alpha and green by global alpha. 459 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); 460 // Divide by 256. 461 dst_rb = _mm_srli_epi16(dst_rb, 8); 462 src_rb = _mm_srli_epi16(src_rb, 8); 463 464 // Mask out low bits (goodies already in the right place; no need to divide) 465 dst_ag = _mm_andnot_si128(rb_mask, dst_ag); 466 src_ag = _mm_andnot_si128(rb_mask, src_ag); 467 468 // Combine back into RGBA. 469 dst_pixel = _mm_or_si128(dst_rb, dst_ag); 470 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag); 471 472 // Add two pixels into result. 473 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel); 474 _mm_store_si128(d, result); 475 // load the next 4 pixel 476 mask = mask + 4; 477 d++; 478 count -= 4; 479 } 480 dst = reinterpret_cast<SkPMColor *>(d); 481 } 482 while(count > 0) { 483 *dst= SkBlendARGB32(color, *dst, *mask); 484 dst += 1; 485 mask++; 486 count --; 487 } 488 dst = (SkPMColor *)((char*)dst + dstOffset); 489 mask += maskOffset; 490 } while (--height != 0); 491 } 492