Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2016 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #ifndef SkSwizzler_opts_DEFINED
      9 #define SkSwizzler_opts_DEFINED
     10 
     11 #include "SkColorPriv.h"
     12 
     13 namespace SK_OPTS_NS {
     14 
     15 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
     16     auto src = (const uint32_t*)vsrc;
     17     for (int i = 0; i < count; i++) {
     18         uint8_t a = src[i] >> 24,
     19                 b = src[i] >> 16,
     20                 g = src[i] >>  8,
     21                 r = src[i] >>  0;
     22         b = (b*a+127)/255;
     23         g = (g*a+127)/255;
     24         r = (r*a+127)/255;
     25         dst[i] = (uint32_t)a << 24
     26                | (uint32_t)b << 16
     27                | (uint32_t)g <<  8
     28                | (uint32_t)r <<  0;
     29     }
     30 }
     31 
     32 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
     33     auto src = (const uint32_t*)vsrc;
     34     for (int i = 0; i < count; i++) {
     35         uint8_t a = src[i] >> 24,
     36                 b = src[i] >> 16,
     37                 g = src[i] >>  8,
     38                 r = src[i] >>  0;
     39         b = (b*a+127)/255;
     40         g = (g*a+127)/255;
     41         r = (r*a+127)/255;
     42         dst[i] = (uint32_t)a << 24
     43                | (uint32_t)r << 16
     44                | (uint32_t)g <<  8
     45                | (uint32_t)b <<  0;
     46     }
     47 }
     48 
     49 static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
     50     auto src = (const uint32_t*)vsrc;
     51     for (int i = 0; i < count; i++) {
     52         uint8_t a = src[i] >> 24,
     53                 b = src[i] >> 16,
     54                 g = src[i] >>  8,
     55                 r = src[i] >>  0;
     56         dst[i] = (uint32_t)a << 24
     57                | (uint32_t)r << 16
     58                | (uint32_t)g <<  8
     59                | (uint32_t)b <<  0;
     60     }
     61 }
     62 
     63 static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
     64     const uint8_t* src = (const uint8_t*)vsrc;
     65     for (int i = 0; i < count; i++) {
     66         uint8_t r = src[0],
     67                 g = src[1],
     68                 b = src[2];
     69         src += 3;
     70         dst[i] = (uint32_t)0xFF << 24
     71                | (uint32_t)b    << 16
     72                | (uint32_t)g    <<  8
     73                | (uint32_t)r    <<  0;
     74     }
     75 }
     76 
     77 static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
     78     const uint8_t* src = (const uint8_t*)vsrc;
     79     for (int i = 0; i < count; i++) {
     80         uint8_t r = src[0],
     81                 g = src[1],
     82                 b = src[2];
     83         src += 3;
     84         dst[i] = (uint32_t)0xFF << 24
     85                | (uint32_t)r    << 16
     86                | (uint32_t)g    <<  8
     87                | (uint32_t)b    <<  0;
     88     }
     89 }
     90 
     91 static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
     92     const uint8_t* src = (const uint8_t*)vsrc;
     93     for (int i = 0; i < count; i++) {
     94         dst[i] = (uint32_t)0xFF   << 24
     95                | (uint32_t)src[i] << 16
     96                | (uint32_t)src[i] <<  8
     97                | (uint32_t)src[i] <<  0;
     98     }
     99 }
    100 
    101 static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    102     const uint8_t* src = (const uint8_t*)vsrc;
    103     for (int i = 0; i < count; i++) {
    104         uint8_t g = src[0],
    105                 a = src[1];
    106         src += 2;
    107         dst[i] = (uint32_t)a << 24
    108                | (uint32_t)g << 16
    109                | (uint32_t)g <<  8
    110                | (uint32_t)g <<  0;
    111     }
    112 }
    113 
    114 static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    115     const uint8_t* src = (const uint8_t*)vsrc;
    116     for (int i = 0; i < count; i++) {
    117         uint8_t g = src[0],
    118                 a = src[1];
    119         src += 2;
    120         g = (g*a+127)/255;
    121         dst[i] = (uint32_t)a << 24
    122                | (uint32_t)g << 16
    123                | (uint32_t)g <<  8
    124                | (uint32_t)g <<  0;
    125     }
    126 }
    127 
    128 static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    129     const uint32_t* src = (const uint32_t*)vsrc;
    130     for (int i = 0; i < count; i++) {
    131         uint8_t k = src[i] >> 24,
    132                 y = src[i] >> 16,
    133                 m = src[i] >>  8,
    134                 c = src[i] >>  0;
    135         // See comments in SkSwizzler.cpp for details on the conversion formula.
    136         uint8_t b = (y*k+127)/255,
    137                 g = (m*k+127)/255,
    138                 r = (c*k+127)/255;
    139         dst[i] = (uint32_t)0xFF << 24
    140                | (uint32_t)   b << 16
    141                | (uint32_t)   g <<  8
    142                | (uint32_t)   r <<  0;
    143     }
    144 }
    145 
    146 static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    147     const uint32_t* src = (const uint32_t*)vsrc;
    148     for (int i = 0; i < count; i++) {
    149         uint8_t k = src[i] >> 24,
    150                 y = src[i] >> 16,
    151                 m = src[i] >>  8,
    152                 c = src[i] >>  0;
    153         uint8_t b = (y*k+127)/255,
    154                 g = (m*k+127)/255,
    155                 r = (c*k+127)/255;
    156         dst[i] = (uint32_t)0xFF << 24
    157                | (uint32_t)   r << 16
    158                | (uint32_t)   g <<  8
    159                | (uint32_t)   b <<  0;
    160     }
    161 }
    162 
    163 #if defined(SK_ARM_HAS_NEON)
    164 
    165 // Rounded divide by 255, (x + 127) / 255
    166 static uint8x8_t div255_round(uint16x8_t x) {
    167     // result = (x + 127) / 255
    168     // result = (x + 127) / 256 + error1
    169     //
    170     // error1 = (x + 127) / (255 * 256)
    171     // error1 = (x + 127) / (256 * 256) + error2
    172     //
    173     // error2 = (x + 127) / (255 * 256 * 256)
    174     //
    175     // The maximum value of error2 is too small to matter.  Thus:
    176     // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    177     // result = ((x + 127) / 256 + x + 127) / 256
    178     // result = ((x + 127) >> 8 + x + 127) >> 8
    179     //
    180     // Use >>> to represent "rounded right shift" which, conveniently,
    181     // NEON supports in one instruction.
    182     // result = ((x >>> 8) + x) >>> 8
    183     //
    184     // Note that the second right shift is actually performed as an
    185     // "add, round, and narrow back to 8-bits" instruction.
    186     return vraddhn_u16(x, vrshrq_n_u16(x, 8));
    187 }
    188 
    189 // Scale a byte by another, (x * y + 127) / 255
    190 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    191     return div255_round(vmull_u8(x, y));
    192 }
    193 
    194 template <bool kSwapRB>
    195 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    196     auto src = (const uint32_t*)vsrc;
    197     while (count >= 8) {
    198         // Load 8 pixels.
    199         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
    200 
    201         uint8x8_t a = rgba.val[3],
    202                   b = rgba.val[2],
    203                   g = rgba.val[1],
    204                   r = rgba.val[0];
    205 
    206         // Premultiply.
    207         b = scale(b, a);
    208         g = scale(g, a);
    209         r = scale(r, a);
    210 
    211         // Store 8 premultiplied pixels.
    212         if (kSwapRB) {
    213             rgba.val[2] = r;
    214             rgba.val[1] = g;
    215             rgba.val[0] = b;
    216         } else {
    217             rgba.val[2] = b;
    218             rgba.val[1] = g;
    219             rgba.val[0] = r;
    220         }
    221         vst4_u8((uint8_t*) dst, rgba);
    222         src += 8;
    223         dst += 8;
    224         count -= 8;
    225     }
    226 
    227     // Call portable code to finish up the tail of [0,8) pixels.
    228     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    229     proc(dst, src, count);
    230 }
    231 
    232 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    233     premul_should_swapRB<false>(dst, src, count);
    234 }
    235 
    236 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    237     premul_should_swapRB<true>(dst, src, count);
    238 }
    239 
    240 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    241     auto src = (const uint32_t*)vsrc;
    242     while (count >= 16) {
    243         // Load 16 pixels.
    244         uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
    245 
    246         // Swap r and b.
    247         SkTSwap(rgba.val[0], rgba.val[2]);
    248 
    249         // Store 16 pixels.
    250         vst4q_u8((uint8_t*) dst, rgba);
    251         src += 16;
    252         dst += 16;
    253         count -= 16;
    254     }
    255 
    256     if (count >= 8) {
    257         // Load 8 pixels.
    258         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
    259 
    260         // Swap r and b.
    261         SkTSwap(rgba.val[0], rgba.val[2]);
    262 
    263         // Store 8 pixels.
    264         vst4_u8((uint8_t*) dst, rgba);
    265         src += 8;
    266         dst += 8;
    267         count -= 8;
    268     }
    269 
    270     RGBA_to_BGRA_portable(dst, src, count);
    271 }
    272 
    273 template <bool kSwapRB>
    274 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    275     const uint8_t* src = (const uint8_t*) vsrc;
    276     while (count >= 16) {
    277         // Load 16 pixels.
    278         uint8x16x3_t rgb = vld3q_u8(src);
    279 
    280         // Insert an opaque alpha channel and swap if needed.
    281         uint8x16x4_t rgba;
    282         if (kSwapRB) {
    283             rgba.val[0] = rgb.val[2];
    284             rgba.val[2] = rgb.val[0];
    285         } else {
    286             rgba.val[0] = rgb.val[0];
    287             rgba.val[2] = rgb.val[2];
    288         }
    289         rgba.val[1] = rgb.val[1];
    290         rgba.val[3] = vdupq_n_u8(0xFF);
    291 
    292         // Store 16 pixels.
    293         vst4q_u8((uint8_t*) dst, rgba);
    294         src += 16*3;
    295         dst += 16;
    296         count -= 16;
    297     }
    298 
    299     if (count >= 8) {
    300         // Load 8 pixels.
    301         uint8x8x3_t rgb = vld3_u8(src);
    302 
    303         // Insert an opaque alpha channel and swap if needed.
    304         uint8x8x4_t rgba;
    305         if (kSwapRB) {
    306             rgba.val[0] = rgb.val[2];
    307             rgba.val[2] = rgb.val[0];
    308         } else {
    309             rgba.val[0] = rgb.val[0];
    310             rgba.val[2] = rgb.val[2];
    311         }
    312         rgba.val[1] = rgb.val[1];
    313         rgba.val[3] = vdup_n_u8(0xFF);
    314 
    315         // Store 8 pixels.
    316         vst4_u8((uint8_t*) dst, rgba);
    317         src += 8*3;
    318         dst += 8;
    319         count -= 8;
    320     }
    321 
    322     // Call portable code to finish up the tail of [0,8) pixels.
    323     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    324     proc(dst, src, count);
    325 }
    326 
    327 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    328     insert_alpha_should_swaprb<false>(dst, src, count);
    329 }
    330 
    331 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    332     insert_alpha_should_swaprb<true>(dst, src, count);
    333 }
    334 
    335 static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    336     const uint8_t* src = (const uint8_t*) vsrc;
    337     while (count >= 16) {
    338         // Load 16 pixels.
    339         uint8x16_t gray = vld1q_u8(src);
    340 
    341         // Set each of the color channels.
    342         uint8x16x4_t rgba;
    343         rgba.val[0] = gray;
    344         rgba.val[1] = gray;
    345         rgba.val[2] = gray;
    346         rgba.val[3] = vdupq_n_u8(0xFF);
    347 
    348         // Store 16 pixels.
    349         vst4q_u8((uint8_t*) dst, rgba);
    350         src += 16;
    351         dst += 16;
    352         count -= 16;
    353     }
    354 
    355     if (count >= 8) {
    356         // Load 8 pixels.
    357         uint8x8_t gray = vld1_u8(src);
    358 
    359         // Set each of the color channels.
    360         uint8x8x4_t rgba;
    361         rgba.val[0] = gray;
    362         rgba.val[1] = gray;
    363         rgba.val[2] = gray;
    364         rgba.val[3] = vdup_n_u8(0xFF);
    365 
    366         // Store 8 pixels.
    367         vst4_u8((uint8_t*) dst, rgba);
    368         src += 8;
    369         dst += 8;
    370         count -= 8;
    371     }
    372 
    373     gray_to_RGB1_portable(dst, src, count);
    374 }
    375 
    376 template <bool kPremul>
    377 static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
    378     const uint8_t* src = (const uint8_t*) vsrc;
    379     while (count >= 16) {
    380         // Load 16 pixels.
    381         uint8x16x2_t ga = vld2q_u8(src);
    382 
    383         // Premultiply if requested.
    384         if (kPremul) {
    385             ga.val[0] = vcombine_u8(
    386                     scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
    387                     scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
    388         }
    389 
    390         // Set each of the color channels.
    391         uint8x16x4_t rgba;
    392         rgba.val[0] = ga.val[0];
    393         rgba.val[1] = ga.val[0];
    394         rgba.val[2] = ga.val[0];
    395         rgba.val[3] = ga.val[1];
    396 
    397         // Store 16 pixels.
    398         vst4q_u8((uint8_t*) dst, rgba);
    399         src += 16*2;
    400         dst += 16;
    401         count -= 16;
    402     }
    403 
    404     if (count >= 8) {
    405         // Load 8 pixels.
    406         uint8x8x2_t ga = vld2_u8(src);
    407 
    408         // Premultiply if requested.
    409         if (kPremul) {
    410             ga.val[0] = scale(ga.val[0], ga.val[1]);
    411         }
    412 
    413         // Set each of the color channels.
    414         uint8x8x4_t rgba;
    415         rgba.val[0] = ga.val[0];
    416         rgba.val[1] = ga.val[0];
    417         rgba.val[2] = ga.val[0];
    418         rgba.val[3] = ga.val[1];
    419 
    420         // Store 8 pixels.
    421         vst4_u8((uint8_t*) dst, rgba);
    422         src += 8*2;
    423         dst += 8;
    424         count -= 8;
    425     }
    426 
    427     auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    428     proc(dst, src, count);
    429 }
    430 
    431 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    432     expand_grayA<false>(dst, src, count);
    433 }
    434 
    435 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    436     expand_grayA<true>(dst, src, count);
    437 }
    438 
    439 enum Format { kRGB1, kBGR1 };
    440 template <Format format>
    441 static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    442     auto src = (const uint32_t*)vsrc;
    443     while (count >= 8) {
    444         // Load 8 cmyk pixels.
    445         uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
    446 
    447         uint8x8_t k = pixels.val[3],
    448                   y = pixels.val[2],
    449                   m = pixels.val[1],
    450                   c = pixels.val[0];
    451 
    452         // Scale to r, g, b.
    453         uint8x8_t b = scale(y, k);
    454         uint8x8_t g = scale(m, k);
    455         uint8x8_t r = scale(c, k);
    456 
    457         // Store 8 rgba pixels.
    458         if (kBGR1 == format) {
    459             pixels.val[3] = vdup_n_u8(0xFF);
    460             pixels.val[2] = r;
    461             pixels.val[1] = g;
    462             pixels.val[0] = b;
    463         } else {
    464             pixels.val[3] = vdup_n_u8(0xFF);
    465             pixels.val[2] = b;
    466             pixels.val[1] = g;
    467             pixels.val[0] = r;
    468         }
    469         vst4_u8((uint8_t*) dst, pixels);
    470         src += 8;
    471         dst += 8;
    472         count -= 8;
    473     }
    474 
    475     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    476     proc(dst, src, count);
    477 }
    478 
    479 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    480     inverted_cmyk_to<kRGB1>(dst, src, count);
    481 }
    482 
    483 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    484     inverted_cmyk_to<kBGR1>(dst, src, count);
    485 }
    486 
    487 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    488 
    489 // Scale a byte by another.
    490 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
    491 static __m128i scale(__m128i x, __m128i y) {
    492     const __m128i _128 = _mm_set1_epi16(128);
    493     const __m128i _257 = _mm_set1_epi16(257);
    494 
    495     // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    496     return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
    497 }
    498 
    499 template <bool kSwapRB>
    500 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    501     auto src = (const uint32_t*)vsrc;
    502 
    503     auto premul8 = [](__m128i* lo, __m128i* hi) {
    504         const __m128i zeros = _mm_setzero_si128();
    505         __m128i planar;
    506         if (kSwapRB) {
    507             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
    508         } else {
    509             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
    510         }
    511 
    512         // Swizzle the pixels to 8-bit planar.
    513         *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
    514         *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
    515         __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
    516                 ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA
    517 
    518         // Unpack to 16-bit planar.
    519         __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
    520                 g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
    521                 b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
    522                 a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_
    523 
    524         // Premultiply!
    525         r = scale(r, a);
    526         g = scale(g, a);
    527         b = scale(b, a);
    528 
    529         // Repack into interlaced pixels.
    530         rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
    531         ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
    532         *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
    533         *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
    534     };
    535 
    536     while (count >= 8) {
    537         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
    538                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
    539 
    540         premul8(&lo, &hi);
    541 
    542         _mm_storeu_si128((__m128i*) (dst + 0), lo);
    543         _mm_storeu_si128((__m128i*) (dst + 4), hi);
    544 
    545         src += 8;
    546         dst += 8;
    547         count -= 8;
    548     }
    549 
    550     if (count >= 4) {
    551         __m128i lo = _mm_loadu_si128((const __m128i*) src),
    552                 hi = _mm_setzero_si128();
    553 
    554         premul8(&lo, &hi);
    555 
    556         _mm_storeu_si128((__m128i*) dst, lo);
    557 
    558         src += 4;
    559         dst += 4;
    560         count -= 4;
    561     }
    562 
    563     // Call portable code to finish up the tail of [0,4) pixels.
    564     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    565     proc(dst, src, count);
    566 }
    567 
    568 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    569     premul_should_swapRB<false>(dst, src, count);
    570 }
    571 
    572 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    573     premul_should_swapRB<true>(dst, src, count);
    574 }
    575 
    576 static void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    577     auto src = (const uint32_t*)vsrc;
    578     const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
    579 
    580     while (count >= 4) {
    581         __m128i rgba = _mm_loadu_si128((const __m128i*) src);
    582         __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
    583         _mm_storeu_si128((__m128i*) dst, bgra);
    584 
    585         src += 4;
    586         dst += 4;
    587         count -= 4;
    588     }
    589 
    590     RGBA_to_BGRA_portable(dst, src, count);
    591 }
    592 
    593 template <bool kSwapRB>
    594 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    595     const uint8_t* src = (const uint8_t*) vsrc;
    596 
    597     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    598     __m128i expand;
    599     const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
    600     if (kSwapRB) {
    601         expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    602     } else {
    603         expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    604     }
    605 
    606     while (count >= 6) {
    607         // Load a vector.  While this actually contains 5 pixels plus an
    608         // extra component, we will discard all but the first four pixels on
    609         // this iteration.
    610         __m128i rgb = _mm_loadu_si128((const __m128i*) src);
    611 
    612         // Expand the first four pixels to RGBX and then mask to RGB(FF).
    613         __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
    614 
    615         // Store 4 pixels.
    616         _mm_storeu_si128((__m128i*) dst, rgba);
    617 
    618         src += 4*3;
    619         dst += 4;
    620         count -= 4;
    621     }
    622 
    623     // Call portable code to finish up the tail of [0,4) pixels.
    624     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    625     proc(dst, src, count);
    626 }
    627 
    628 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    629     insert_alpha_should_swaprb<false>(dst, src, count);
    630 }
    631 
    632 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    633     insert_alpha_should_swaprb<true>(dst, src, count);
    634 }
    635 
    636 static void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    637     const uint8_t* src = (const uint8_t*) vsrc;
    638 
    639     const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    640     while (count >= 16) {
    641         __m128i grays = _mm_loadu_si128((const __m128i*) src);
    642 
    643         __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
    644         __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
    645         __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
    646         __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
    647 
    648         __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
    649         __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
    650         __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
    651         __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
    652 
    653         _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
    654         _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
    655         _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
    656         _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
    657 
    658         src += 16;
    659         dst += 16;
    660         count -= 16;
    661     }
    662 
    663     gray_to_RGB1_portable(dst, src, count);
    664 }
    665 
    666 static void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
    667     const uint8_t* src = (const uint8_t*) vsrc;
    668     while (count >= 8) {
    669         __m128i ga = _mm_loadu_si128((const __m128i*) src);
    670 
    671         __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
    672                                   _mm_slli_epi16(ga, 8));
    673 
    674         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
    675         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
    676 
    677         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
    678         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
    679 
    680         src += 8*2;
    681         dst += 8;
    682         count -= 8;
    683     }
    684 
    685     grayA_to_RGBA_portable(dst, src, count);
    686 }
    687 
    688 static void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
    689     const uint8_t* src = (const uint8_t*) vsrc;
    690     while (count >= 8) {
    691         __m128i grayA = _mm_loadu_si128((const __m128i*) src);
    692 
    693         __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
    694         __m128i a0 = _mm_srli_epi16(grayA, 8);
    695 
    696         // Premultiply
    697         g0 = scale(g0, a0);
    698 
    699         __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
    700         __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
    701 
    702 
    703         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
    704         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
    705 
    706         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
    707         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
    708 
    709         src += 8*2;
    710         dst += 8;
    711         count -= 8;
    712     }
    713 
    714     grayA_to_rgbA_portable(dst, src, count);
    715 }
    716 
    717 enum Format { kRGB1, kBGR1 };
    718 template <Format format>
    719 static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    720     auto src = (const uint32_t*)vsrc;
    721 
    722     auto convert8 = [](__m128i* lo, __m128i* hi) {
    723         const __m128i zeros = _mm_setzero_si128();
    724         __m128i planar;
    725         if (kBGR1 == format) {
    726             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
    727         } else {
    728             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
    729         }
    730 
    731         // Swizzle the pixels to 8-bit planar.
    732         *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
    733         *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
    734         __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
    735                 yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK
    736 
    737         // Unpack to 16-bit planar.
    738         __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
    739                 m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
    740                 y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
    741                 k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_
    742 
    743         // Scale to r, g, b.
    744         __m128i r = scale(c, k),
    745                 g = scale(m, k),
    746                 b = scale(y, k);
    747 
    748         // Repack into interlaced pixels.
    749         __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
    750                 ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
    751         *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgbargba rgbargba
    752         *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
    753     };
    754 
    755     while (count >= 8) {
    756         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
    757                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
    758 
    759         convert8(&lo, &hi);
    760 
    761         _mm_storeu_si128((__m128i*) (dst + 0), lo);
    762         _mm_storeu_si128((__m128i*) (dst + 4), hi);
    763 
    764         src += 8;
    765         dst += 8;
    766         count -= 8;
    767     }
    768 
    769     if (count >= 4) {
    770         __m128i lo = _mm_loadu_si128((const __m128i*) src),
    771                 hi = _mm_setzero_si128();
    772 
    773         convert8(&lo, &hi);
    774 
    775         _mm_storeu_si128((__m128i*) dst, lo);
    776 
    777         src += 4;
    778         dst += 4;
    779         count -= 4;
    780     }
    781 
    782     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    783     proc(dst, src, count);
    784 }
    785 
    786 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    787     inverted_cmyk_to<kRGB1>(dst, src, count);
    788 }
    789 
    790 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    791     inverted_cmyk_to<kBGR1>(dst, src, count);
    792 }
    793 
    794 #else
    795 
    796 static void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    797     RGBA_to_rgbA_portable(dst, src, count);
    798 }
    799 
    800 static void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    801     RGBA_to_bgrA_portable(dst, src, count);
    802 }
    803 
    804 static void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
    805     RGBA_to_BGRA_portable(dst, src, count);
    806 }
    807 
    808 static void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    809     RGB_to_RGB1_portable(dst, src, count);
    810 }
    811 
    812 static void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    813     RGB_to_BGR1_portable(dst, src, count);
    814 }
    815 
    816 static void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
    817     gray_to_RGB1_portable(dst, src, count);
    818 }
    819 
    820 static void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    821     grayA_to_RGBA_portable(dst, src, count);
    822 }
    823 
    824 static void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    825     grayA_to_rgbA_portable(dst, src, count);
    826 }
    827 
    828 static void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    829     inverted_CMYK_to_RGB1_portable(dst, src, count);
    830 }
    831 
    832 static void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    833     inverted_CMYK_to_BGR1_portable(dst, src, count);
    834 }
    835 
    836 #endif
    837 
    838 }
    839 
    840 #endif // SkSwizzler_opts_DEFINED
    841