/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkSwizzler_opts_DEFINED
#define SkSwizzler_opts_DEFINED

#include "SkColorPriv.h"

// Pixel swizzling routines: portable reference implementations plus NEON and
// SSSE3 fast paths, selected by the #if block below.  Naming convention used
// throughout this file:
//   - uppercase channels (RGBA)  : unpremultiplied
//   - lowercase channels (rgbA)  : premultiplied by alpha
//   - a trailing '1' (RGB1/BGR1) : alpha forced to opaque (0xFF)
// Every fast path processes full vectors while enough pixels remain, then
// hands the tail of leftover pixels to the matching portable routine.
namespace SK_OPTS_NS {

// Premultiply each color channel by alpha with rounding: c = (c*a+127)/255.
// Channel order is unchanged (RGBA in, rgbA out).
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        // Rounded divide by 255.
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}

// Premultiply (as above) and also swap the R and B channels on store.
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

// Swap the R and B channels; alpha and green pass through unchanged.
static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

// Expand packed 3-byte RGB pixels to 4-byte pixels with an opaque alpha.
static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)r    <<  0;
    }
}

// As RGB_to_RGB1_portable, but with R and B swapped on store.
static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r    << 16
               | (uint32_t)g    <<  8
               | (uint32_t)b    <<  0;
    }
}

// Replicate each gray byte into all three color channels, with opaque alpha.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF   << 24
               | (uint32_t)src[i] << 16
               | (uint32_t)src[i] <<  8
               | (uint32_t)src[i] <<  0;
    }
}

// Expand 2-byte gray+alpha pixels to 4-byte (g,g,g,a), unpremultiplied.
static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

// Expand 2-byte gray+alpha pixels to 4-byte, premultiplying gray by alpha.
static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

// Convert inverted CMYK (as stored here: c,m,y,k in the low-to-high bytes) to
// opaque RGBA by scaling each of c,m,y by k with rounding.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) b   << 16
               | (uint32_t) g   <<  8
               | (uint32_t) r   <<  0;
    }
}

// Same conversion as above, but storing with R and B swapped.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) r   << 16
               | (uint32_t) g   <<  8
               | (uint32_t) b   <<  0;
    }
}

#if defined(SK_ARM_HAS_NEON)

// Rounded divide by 255, (x + 127) / 255
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}

// Scale a byte by another, (x * y + 127) / 255
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}

// Shared NEON body for RGBA_to_rgbA / RGBA_to_bgrA: premultiply 8 pixels at a
// time with de-interleaving loads/stores, optionally swapping R and B.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src   += 8;
        dst   += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

// NEON R/B swap: 16 pixels per iteration, then 8, then the portable tail.
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src   += 16;
        dst   += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src   += 8;
        dst   += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

// Shared NEON body for RGB_to_RGB1 / RGB_to_BGR1: widen 3-byte pixels to
// 4-byte pixels with opaque alpha, optionally swapping R and B.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src   += 16*3;
        dst   += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src   += 8*3;
        dst   += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

// NEON gray expansion: replicate gray into R, G, B with opaque alpha.
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src   += 16;
        dst   += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src   += 8;
        dst   += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}

// Shared NEON body for grayA_to_RGBA / grayA_to_rgbA: expand gray+alpha to
// four channels, optionally premultiplying gray by alpha first.
template <bool kPremul>
static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            // scale() works on 8-lane vectors, so process each half separately.
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src   += 16*2;
        dst   += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src   += 8*2;
        dst   += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}

static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    expand_grayA<false>(dst, src, count);
}

static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    expand_grayA<true>(dst, src, count);
}

enum Format { kRGB1, kBGR1 };

// Shared NEON body for the inverted-CMYK converters; the template parameter
// selects the output channel order.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src   += 8;
        dst   += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}

// Shared SSSE3 body for RGBA_to_rgbA / RGBA_to_bgrA: premultiply 8 pixels at
// a time (two 4-pixel vectors), optionally swapping R and B.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    // Premultiplies the 8 pixels held in *lo and *hi, in place.
    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);        // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),  // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);  // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),   // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),   // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),   // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);   // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg  = _mm_or_si128(r, _mm_slli_epi16(g, 8));  // rgrgrgrg RGRGRGRG
        ba  = _mm_or_si128(b, _mm_slli_epi16(a, 8));  // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);             // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);             // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src   += 8;
        dst   += 8;
        count -= 8;
    }

    if (count >= 4) {
        // Only 4 pixels left: process them in lo, with hi as a dummy.
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src   += 4;
        dst   += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

// SSSE3 R/B swap via a byte shuffle, 4 pixels per iteration.
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src   += 4;
        dst   += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

// Shared SSSE3 body for RGB_to_RGB1 / RGB_to_BGR1: widen 3-byte pixels to
// 4-byte pixels with opaque alpha, optionally swapping R and B.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    __m128i expand;
    const uint8_t X = 0xFF; // Used as a placeholder.  The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    // count >= 6 guarantees the 16-byte load below stays inside the source.
    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src   += 4*3;
        dst   += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

// SSSE3 gray expansion: interleave gray with itself and with 0xFF alphas,
// producing four gggA output vectors per 16 source bytes.
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);

        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);   // gg byte pairs
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);  // gA byte pairs
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        // Interleave 16-bit lanes: (g,g) + (g,A) -> (g,g,g,A) pixels.
        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src   += 16;
        dst   += 16;
        count -= 16;
    }

    gray_to_RGB1_portable(dst, src, count);
}

// SSSE3 gray+alpha expansion (unpremultiplied): 8 GA pixels per iteration.
static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        // Build (g,g) 16-bit lanes from the (g,a) lanes.
        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src   += 8*2;
        dst   += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

// SSSE3 gray+alpha expansion with premultiplication of gray by alpha.
static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        // Split the 16-bit lanes into gray (low byte) and alpha (high byte).
        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply
        g0 = _mm_set1_epi16(0) /*placeholder removed*/, g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src   += 8*2;
        dst   += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };

// Shared SSSE3 body for the inverted-CMYK converters; the template parameter
// selects the output channel order.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    // Converts the 8 inverted-CMYK pixels held in *lo and *hi, in place.
    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);        // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);        // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),  // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);  // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),   // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),   // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),   // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);   // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                     // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));        // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                       // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                                       // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src   += 8;
        dst   += 8;
        count -= 8;
    }

    if (count >= 4) {
        // Only 4 pixels left: process them in lo, with hi as a dummy.
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src   += 4;
        dst   += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#else

// No SIMD available: forward every entry point to its portable implementation.

static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}

static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}

static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}

#endif

}  // namespace SK_OPTS_NS

#endif // SkSwizzler_opts_DEFINED