/*
 * Copyright 2016 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef SkSwizzler_opts_DEFINED
#define SkSwizzler_opts_DEFINED

#include "SkColorPriv.h"

#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    #include <immintrin.h>
#elif defined(SK_ARM_HAS_NEON)
    #include <arm_neon.h>
#endif

namespace SK_OPTS_NS {

// Portable scalar reference implementations.
// The SIMD paths below call these to finish the tail of each row, so every
// SIMD routine must produce bit-identical results to its portable twin.
// All of them use the rounded premultiply (x*a + 127) / 255.

// Premultiply each RGBA pixel by its alpha, keeping channel order unchanged.
// Channels are unpacked by shift, so "r" is the low byte of the packed word
// and "a" the high byte.
static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)b << 16
               | (uint32_t)g <<  8
               | (uint32_t)r <<  0;
    }
}

// Premultiply and swap the r and b channels (RGBA -> bgrA).
static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        b = (b*a+127)/255;
        g = (g*a+127)/255;
        r = (r*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

// Swap the r and b channels without premultiplying.
static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t a = src[i] >> 24,
                b = src[i] >> 16,
                g = src[i] >>  8,
                r = src[i] >>  0;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)r << 16
               | (uint32_t)g <<  8
               | (uint32_t)b <<  0;
    }
}

// Expand packed 3-byte RGB pixels to 4-byte pixels with an opaque (0xFF) alpha.
static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)b   << 16
               | (uint32_t)g   <<  8
               | (uint32_t)r   <<  0;
    }
}

// As RGB_to_RGB1_portable, but swapping r and b in the output.
static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t r = src[0],
                g = src[1],
                b = src[2];
        src += 3;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t)r   << 16
               | (uint32_t)g   <<  8
               | (uint32_t)b   <<  0;
    }
}

// Replicate each 1-byte gray value into all three color channels, opaque alpha.
static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        dst[i] = (uint32_t)0xFF    << 24
               | (uint32_t)src[i]  << 16
               | (uint32_t)src[i]  <<  8
               | (uint32_t)src[i]  <<  0;
    }
}

// Expand 2-byte gray+alpha pixels to RGBA, leaving alpha unpremultiplied.
static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

// Expand 2-byte gray+alpha pixels to RGBA, premultiplying gray by alpha.
static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t g = src[0],
                a = src[1];
        src += 2;
        g = (g*a+127)/255;
        dst[i] = (uint32_t)a << 24
               | (uint32_t)g << 16
               | (uint32_t)g <<  8
               | (uint32_t)g <<  0;
    }
}

// Convert inverted CMYK (as stored by some JPEGs) to opaque RGBA.
static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        // See comments in SkSwizzler.cpp for details on the conversion formula.
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) b   << 16
               | (uint32_t) g   <<  8
               | (uint32_t) r   <<  0;
    }
}

// As inverted_CMYK_to_RGB1_portable, but with r and b swapped in the output.
static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    const uint32_t* src = (const uint32_t*)vsrc;
    for (int i = 0; i < count; i++) {
        uint8_t k = src[i] >> 24,
                y = src[i] >> 16,
                m = src[i] >>  8,
                c = src[i] >>  0;
        uint8_t b = (y*k+127)/255,
                g = (m*k+127)/255,
                r = (c*k+127)/255;
        dst[i] = (uint32_t)0xFF << 24
               | (uint32_t) r   << 16
               | (uint32_t) g   <<  8
               | (uint32_t) b   <<  0;
    }
}

#if defined(SK_ARM_HAS_NEON)

// Rounded divide by 255, (x + 127) / 255
static uint8x8_t div255_round(uint16x8_t x) {
    // result = (x + 127) / 255
    // result = (x + 127) / 256 + error1
    //
    // error1 = (x + 127) / (255 * 256)
    // error1 = (x + 127) / (256 * 256) + error2
    //
    // error2 = (x + 127) / (255 * 256 * 256)
    //
    // The maximum value of error2 is too small to matter.  Thus:
    // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    // result = ((x + 127) / 256 + x + 127) / 256
    // result = ((x + 127) >> 8 + x + 127) >> 8
    //
    // Use >>> to represent "rounded right shift" which, conveniently,
    // NEON supports in one instruction.
    // result = ((x >>> 8) + x) >>> 8
    //
    // Note that the second right shift is actually performed as an
    // "add, round, and narrow back to 8-bits" instruction.
    return vraddhn_u16(x, vrshrq_n_u16(x, 8));
}

// Scale a byte by another, (x * y + 127) / 255
static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    return div255_round(vmull_u8(x, y));
}

// Shared NEON body for RGBA_to_rgbA / RGBA_to_bgrA: premultiply 8 pixels per
// iteration, optionally swapping r and b, then hand the remainder to the
// matching portable routine.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        uint8x8_t a = rgba.val[3],
                  b = rgba.val[2],
                  g = rgba.val[1],
                  r = rgba.val[0];

        // Premultiply.
        b = scale(b, a);
        g = scale(g, a);
        r = scale(r, a);

        // Store 8 premultiplied pixels.
        if (kSwapRB) {
            rgba.val[2] = r;
            rgba.val[1] = g;
            rgba.val[0] = b;
        } else {
            rgba.val[2] = b;
            rgba.val[1] = g;
            rgba.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

// Swap r and b channels, 16 then 8 pixels at a time, tail handled portably.
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);

        // Swap r and b.
        SkTSwap(rgba.val[0], rgba.val[2]);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

// Shared NEON body for RGB_to_RGB1 / RGB_to_BGR1: widen 3-byte pixels to
// 4-byte pixels with opaque alpha, optionally swapping r and b.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x3_t rgb = vld3q_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x16x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*3;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x3_t rgb = vld3_u8(src);

        // Insert an opaque alpha channel and swap if needed.
        uint8x8x4_t rgba;
        if (kSwapRB) {
            rgba.val[0] = rgb.val[2];
            rgba.val[2] = rgb.val[0];
        } else {
            rgba.val[0] = rgb.val[0];
            rgba.val[2] = rgb.val[2];
        }
        rgba.val[1] = rgb.val[1];
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*3;
        dst += 8;
        count -= 8;
    }

    // Call portable code to finish up the tail of [0,8) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

// Replicate gray into r, g, b with opaque alpha, 16 then 8 pixels at a time.
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16_t gray = vld1q_u8(src);

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdupq_n_u8(0xFF);

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8_t gray = vld1_u8(src);

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = gray;
        rgba.val[1] = gray;
        rgba.val[2] = gray;
        rgba.val[3] = vdup_n_u8(0xFF);

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8;
        dst += 8;
        count -= 8;
    }

    gray_to_RGB1_portable(dst, src, count);
}

// Shared NEON body for grayA_to_RGBA / grayA_to_rgbA: expand gray+alpha to
// RGBA, optionally premultiplying gray by alpha.
template <bool kPremul>
static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 16) {
        // Load 16 pixels.
        uint8x16x2_t ga = vld2q_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            // scale() operates on 8-lane vectors, so premultiply the two
            // halves separately and recombine.
            ga.val[0] = vcombine_u8(
                    scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
                    scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
        }

        // Set each of the color channels.
        uint8x16x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 16 pixels.
        vst4q_u8((uint8_t*) dst, rgba);
        src += 16*2;
        dst += 16;
        count -= 16;
    }

    if (count >= 8) {
        // Load 8 pixels.
        uint8x8x2_t ga = vld2_u8(src);

        // Premultiply if requested.
        if (kPremul) {
            ga.val[0] = scale(ga.val[0], ga.val[1]);
        }

        // Set each of the color channels.
        uint8x8x4_t rgba;
        rgba.val[0] = ga.val[0];
        rgba.val[1] = ga.val[0];
        rgba.val[2] = ga.val[0];
        rgba.val[3] = ga.val[1];

        // Store 8 pixels.
        vst4_u8((uint8_t*) dst, rgba);
        src += 8*2;
        dst += 8;
        count -= 8;
    }

    auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    proc(dst, src, count);
}

static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    expand_grayA<false>(dst, src, count);
}

static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    expand_grayA<true>(dst, src, count);
}

enum Format { kRGB1, kBGR1 };

// Shared NEON body for the two inverted-CMYK conversions; the output alpha is
// always forced to 0xFF.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    while (count >= 8) {
        // Load 8 cmyk pixels.
        uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);

        uint8x8_t k = pixels.val[3],
                  y = pixels.val[2],
                  m = pixels.val[1],
                  c = pixels.val[0];

        // Scale to r, g, b.
        uint8x8_t b = scale(y, k);
        uint8x8_t g = scale(m, k);
        uint8x8_t r = scale(c, k);

        // Store 8 rgba pixels.
        if (kBGR1 == format) {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = r;
            pixels.val[1] = g;
            pixels.val[0] = b;
        } else {
            pixels.val[3] = vdup_n_u8(0xFF);
            pixels.val[2] = b;
            pixels.val[1] = g;
            pixels.val[0] = r;
        }
        vst4_u8((uint8_t*) dst, pixels);
        src += 8;
        dst += 8;
        count -= 8;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3

// Scale a byte by another.
// Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
static __m128i scale(__m128i x, __m128i y) {
    const __m128i _128 = _mm_set1_epi16(128);
    const __m128i _257 = _mm_set1_epi16(257);

    // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
}

// Shared SSSE3 body for RGBA_to_rgbA / RGBA_to_bgrA: premultiply 8 (then 4)
// pixels at a time, optionally swapping r and b; the tail of [0,4) pixels is
// finished by the matching portable routine.
template <bool kSwapRB>
static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    // Premultiplies the 8 pixels held in *lo and *hi in place.
    auto premul8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kSwapRB) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
        *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
        __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
                ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA

        // Unpack to 16-bit planar.
        __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
                g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
                b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
                a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_

        // Premultiply!
        r = scale(r, a);
        g = scale(g, a);
        b = scale(b, a);

        // Repack into interlaced pixels.
        rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
        ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
        *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        // Only 4 pixels left: run premul8 with the upper half zeroed and
        // store just the low register.
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        premul8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    proc(dst, src, count);
}

static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<false>(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    premul_should_swapRB<true>(dst, src, count);
}

// Swap r and b with a single byte shuffle, 4 pixels per iteration.
static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;
    const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);

    while (count >= 4) {
        __m128i rgba = _mm_loadu_si128((const __m128i*) src);
        __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
        _mm_storeu_si128((__m128i*) dst, bgra);

        src += 4;
        dst += 4;
        count -= 4;
    }

    RGBA_to_BGRA_portable(dst, src, count);
}

// Shared SSSE3 body for RGB_to_RGB1 / RGB_to_BGR1: widen 3-byte pixels to
// 4-byte pixels with opaque alpha, optionally swapping r and b.
template <bool kSwapRB>
static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphaMask = _mm_set1_epi32(0xFF000000);

    __m128i expand;
    const uint8_t X = 0xFF; // Used as a placeholder.  The value of X is irrelevant.
    if (kSwapRB) {
        expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    } else {
        expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    }

    // The loop consumes 4 pixels (12 bytes) per iteration but loads 16 bytes,
    // so it requires count >= 6 (18 bytes of input) to guarantee the load
    // never reads past the end of the source buffer.
    while (count >= 6) {
        // Load a vector.  While this actually contains 5 pixels plus an
        // extra component, we will discard all but the first four pixels on
        // this iteration.
        __m128i rgb = _mm_loadu_si128((const __m128i*) src);

        // Expand the first four pixels to RGBX and then mask to RGB(FF).
        __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);

        // Store 4 pixels.
        _mm_storeu_si128((__m128i*) dst, rgba);

        src += 4*3;
        dst += 4;
        count -= 4;
    }

    // Call portable code to finish up the tail of [0,4) pixels.
    auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    proc(dst, src, count);
}

static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<false>(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    insert_alpha_should_swaprb<true>(dst, src, count);
}

// Replicate gray into r, g, b with opaque alpha, 16 pixels per iteration.
static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;

    const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    while (count >= 16) {
        __m128i grays = _mm_loadu_si128((const __m128i*) src);

        // Interleave gray with gray (gg) and gray with alpha (ga), then
        // interleave those pairs to produce g,g,g,a pixels.
        __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
        __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
        __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
        __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);

        __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
        __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
        __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
        __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);

        _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
        _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
        _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
        _mm_storeu_si128((__m128i*) (dst + 12), ggga3);

        src += 16;
        dst += 16;
        count -= 16;
    }

    gray_to_RGB1_portable(dst, src, count);
}

// Expand gray+alpha to unpremultiplied RGBA, 8 pixels per iteration.
static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i ga = _mm_loadu_si128((const __m128i*) src);

        // gg holds gray in both bytes of each 16-bit lane; ga already holds
        // gray in the low byte and alpha in the high byte.
        __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
                                  _mm_slli_epi16(ga, 8));

        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_RGBA_portable(dst, src, count);
}

// Expand gray+alpha to premultiplied RGBA, 8 pixels per iteration.
static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
    const uint8_t* src = (const uint8_t*) vsrc;
    while (count >= 8) {
        __m128i grayA = _mm_loadu_si128((const __m128i*) src);

        // Split each 16-bit lane into gray (low byte) and alpha (high byte).
        __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
        __m128i a0 = _mm_srli_epi16(grayA, 8);

        // Premultiply
        g0 = scale(g0, a0);

        __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
        __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));


        __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
        __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);

        _mm_storeu_si128((__m128i*) (dst + 0), ggga_lo);
        _mm_storeu_si128((__m128i*) (dst + 4), ggga_hi);

        src += 8*2;
        dst += 8;
        count -= 8;
    }

    grayA_to_rgbA_portable(dst, src, count);
}

enum Format { kRGB1, kBGR1 };

// Shared SSSE3 body for the two inverted-CMYK conversions; the output alpha
// is always forced to 0xFF.
template <Format format>
static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    auto src = (const uint32_t*)vsrc;

    // Converts the 8 inverted-CMYK pixels held in *lo and *hi in place.
    auto convert8 = [](__m128i* lo, __m128i* hi) {
        const __m128i zeros = _mm_setzero_si128();
        __m128i planar;
        if (kBGR1 == format) {
            planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
        } else {
            planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
        }

        // Swizzle the pixels to 8-bit planar.
        *lo = _mm_shuffle_epi8(*lo, planar);                      // ccccmmmm yyyykkkk
        *hi = _mm_shuffle_epi8(*hi, planar);                      // CCCCMMMM YYYYKKKK
        __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                // ccccCCCC mmmmMMMM
                yk = _mm_unpackhi_epi32(*lo, *hi);                // yyyyYYYY kkkkKKKK

        // Unpack to 16-bit planar.
        __m128i c = _mm_unpacklo_epi8(cm, zeros),                 // c_c_c_c_ C_C_C_C_
                m = _mm_unpackhi_epi8(cm, zeros),                 // m_m_m_m_ M_M_M_M_
                y = _mm_unpacklo_epi8(yk, zeros),                 // y_y_y_y_ Y_Y_Y_Y_
                k = _mm_unpackhi_epi8(yk, zeros);                 // k_k_k_k_ K_K_K_K_

        // Scale to r, g, b.
        __m128i r = scale(c, k),
                g = scale(m, k),
                b = scale(y, k);

        // Repack into interlaced pixels.
        __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                     // rgrgrgrg RGRGRGRG
                ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));        // b1b1b1b1 B1B1B1B1
        *lo = _mm_unpacklo_epi16(rg, ba);                                       // rgbargba rgbargba
        *hi = _mm_unpackhi_epi16(rg, ba);                                       // RGB1RGB1 RGB1RGB1
    };

    while (count >= 8) {
        __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
                hi = _mm_loadu_si128((const __m128i*) (src + 4));

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) (dst + 0), lo);
        _mm_storeu_si128((__m128i*) (dst + 4), hi);

        src += 8;
        dst += 8;
        count -= 8;
    }

    if (count >= 4) {
        // Only 4 pixels left: convert with the upper half zeroed and store
        // just the low register.
        __m128i lo = _mm_loadu_si128((const __m128i*) src),
                hi = _mm_setzero_si128();

        convert8(&lo, &hi);

        _mm_storeu_si128((__m128i*) dst, lo);

        src += 4;
        dst += 4;
        count -= 4;
    }

    auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    proc(dst, src, count);
}

static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kRGB1>(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_cmyk_to<kBGR1>(dst, src, count);
}

#else

// No SIMD instructions available: every entry point simply forwards to its
// portable implementation.

static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    RGBA_to_rgbA_portable(dst, src, count);
}

static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    RGBA_to_bgrA_portable(dst, src, count);
}

static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
    RGBA_to_BGRA_portable(dst, src, count);
}

static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    RGB_to_RGB1_portable(dst, src, count);
}

static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    RGB_to_BGR1_portable(dst, src, count);
}

static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
    gray_to_RGB1_portable(dst, src, count);
}

static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    grayA_to_RGBA_portable(dst, src, count);
}

static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    grayA_to_rgbA_portable(dst, src, count);
}

static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    inverted_CMYK_to_RGB1_portable(dst, src, count);
}

static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    inverted_CMYK_to_BGR1_portable(dst, src, count);
}

#endif

}  // namespace SK_OPTS_NS

#endif // SkSwizzler_opts_DEFINED