Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2016 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #ifndef SkSwizzler_opts_DEFINED
      9 #define SkSwizzler_opts_DEFINED
     10 
     11 #include "SkColorData.h"
     12 
     13 #include <utility>
     14 
     15 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
     16     #include <immintrin.h>
     17 #elif defined(SK_ARM_HAS_NEON)
     18     #include <arm_neon.h>
     19 #endif
     20 
     21 namespace SK_OPTS_NS {
     22 
     23 static void RGBA_to_rgbA_portable(uint32_t* dst, const uint32_t* src, int count) {
     24     for (int i = 0; i < count; i++) {
     25         uint8_t a = (src[i] >> 24) & 0xFF,
     26                 b = (src[i] >> 16) & 0xFF,
     27                 g = (src[i] >>  8) & 0xFF,
     28                 r = (src[i] >>  0) & 0xFF;
     29         b = (b*a+127)/255;
     30         g = (g*a+127)/255;
     31         r = (r*a+127)/255;
     32         dst[i] = (uint32_t)a << 24
     33                | (uint32_t)b << 16
     34                | (uint32_t)g <<  8
     35                | (uint32_t)r <<  0;
     36     }
     37 }
     38 
     39 static void RGBA_to_bgrA_portable(uint32_t* dst, const uint32_t* src, int count) {
     40     for (int i = 0; i < count; i++) {
     41         uint8_t a = (src[i] >> 24) & 0xFF,
     42                 b = (src[i] >> 16) & 0xFF,
     43                 g = (src[i] >>  8) & 0xFF,
     44                 r = (src[i] >>  0) & 0xFF;
     45         b = (b*a+127)/255;
     46         g = (g*a+127)/255;
     47         r = (r*a+127)/255;
     48         dst[i] = (uint32_t)a << 24
     49                | (uint32_t)r << 16
     50                | (uint32_t)g <<  8
     51                | (uint32_t)b <<  0;
     52     }
     53 }
     54 
     55 static void RGBA_to_BGRA_portable(uint32_t* dst, const uint32_t* src, int count) {
     56     for (int i = 0; i < count; i++) {
     57         uint8_t a = (src[i] >> 24) & 0xFF,
     58                 b = (src[i] >> 16) & 0xFF,
     59                 g = (src[i] >>  8) & 0xFF,
     60                 r = (src[i] >>  0) & 0xFF;
     61         dst[i] = (uint32_t)a << 24
     62                | (uint32_t)r << 16
     63                | (uint32_t)g <<  8
     64                | (uint32_t)b <<  0;
     65     }
     66 }
     67 
     68 static void RGB_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
     69     for (int i = 0; i < count; i++) {
     70         uint8_t r = src[0],
     71                 g = src[1],
     72                 b = src[2];
     73         src += 3;
     74         dst[i] = (uint32_t)0xFF << 24
     75                | (uint32_t)b    << 16
     76                | (uint32_t)g    <<  8
     77                | (uint32_t)r    <<  0;
     78     }
     79 }
     80 
     81 static void RGB_to_BGR1_portable(uint32_t dst[], const uint8_t* src, int count) {
     82     for (int i = 0; i < count; i++) {
     83         uint8_t r = src[0],
     84                 g = src[1],
     85                 b = src[2];
     86         src += 3;
     87         dst[i] = (uint32_t)0xFF << 24
     88                | (uint32_t)r    << 16
     89                | (uint32_t)g    <<  8
     90                | (uint32_t)b    <<  0;
     91     }
     92 }
     93 
     94 static void gray_to_RGB1_portable(uint32_t dst[], const uint8_t* src, int count) {
     95     for (int i = 0; i < count; i++) {
     96         dst[i] = (uint32_t)0xFF   << 24
     97                | (uint32_t)src[i] << 16
     98                | (uint32_t)src[i] <<  8
     99                | (uint32_t)src[i] <<  0;
    100     }
    101 }
    102 
    103 static void grayA_to_RGBA_portable(uint32_t dst[], const uint8_t* src, int count) {
    104     for (int i = 0; i < count; i++) {
    105         uint8_t g = src[0],
    106                 a = src[1];
    107         src += 2;
    108         dst[i] = (uint32_t)a << 24
    109                | (uint32_t)g << 16
    110                | (uint32_t)g <<  8
    111                | (uint32_t)g <<  0;
    112     }
    113 }
    114 
    115 static void grayA_to_rgbA_portable(uint32_t dst[], const uint8_t* src, int count) {
    116     for (int i = 0; i < count; i++) {
    117         uint8_t g = src[0],
    118                 a = src[1];
    119         src += 2;
    120         g = (g*a+127)/255;
    121         dst[i] = (uint32_t)a << 24
    122                | (uint32_t)g << 16
    123                | (uint32_t)g <<  8
    124                | (uint32_t)g <<  0;
    125     }
    126 }
    127 
    128 static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const uint32_t* src, int count) {
    129     for (int i = 0; i < count; i++) {
    130         uint8_t k = (src[i] >> 24) & 0xFF,
    131                 y = (src[i] >> 16) & 0xFF,
    132                 m = (src[i] >>  8) & 0xFF,
    133                 c = (src[i] >>  0) & 0xFF;
    134         // See comments in SkSwizzler.cpp for details on the conversion formula.
    135         uint8_t b = (y*k+127)/255,
    136                 g = (m*k+127)/255,
    137                 r = (c*k+127)/255;
    138         dst[i] = (uint32_t)0xFF << 24
    139                | (uint32_t)   b << 16
    140                | (uint32_t)   g <<  8
    141                | (uint32_t)   r <<  0;
    142     }
    143 }
    144 
    145 static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const uint32_t* src, int count) {
    146     for (int i = 0; i < count; i++) {
    147         uint8_t k = (src[i] >> 24) & 0xFF,
    148                 y = (src[i] >> 16) & 0xFF,
    149                 m = (src[i] >>  8) & 0xFF,
    150                 c = (src[i] >>  0) & 0xFF;
    151         uint8_t b = (y*k+127)/255,
    152                 g = (m*k+127)/255,
    153                 r = (c*k+127)/255;
    154         dst[i] = (uint32_t)0xFF << 24
    155                | (uint32_t)   r << 16
    156                | (uint32_t)   g <<  8
    157                | (uint32_t)   b <<  0;
    158     }
    159 }
    160 
    161 #if defined(SK_ARM_HAS_NEON)
    162 
    163 // Rounded divide by 255, (x + 127) / 255
    164 static uint8x8_t div255_round(uint16x8_t x) {
    165     // result = (x + 127) / 255
    166     // result = (x + 127) / 256 + error1
    167     //
    168     // error1 = (x + 127) / (255 * 256)
    169     // error1 = (x + 127) / (256 * 256) + error2
    170     //
    171     // error2 = (x + 127) / (255 * 256 * 256)
    172     //
    173     // The maximum value of error2 is too small to matter.  Thus:
    174     // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    175     // result = ((x + 127) / 256 + x + 127) / 256
    176     // result = ((x + 127) >> 8 + x + 127) >> 8
    177     //
    178     // Use >>> to represent "rounded right shift" which, conveniently,
    179     // NEON supports in one instruction.
    180     // result = ((x >>> 8) + x) >>> 8
    181     //
    182     // Note that the second right shift is actually performed as an
    183     // "add, round, and narrow back to 8-bits" instruction.
    184     return vraddhn_u16(x, vrshrq_n_u16(x, 8));
    185 }
    186 
    187 // Scale a byte by another, (x * y + 127) / 255
    188 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    189     return div255_round(vmull_u8(x, y));
    190 }
    191 
    192 template <bool kSwapRB>
    193 static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
    194     while (count >= 8) {
    195         // Load 8 pixels.
    196         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
    197 
    198         uint8x8_t a = rgba.val[3],
    199                   b = rgba.val[2],
    200                   g = rgba.val[1],
    201                   r = rgba.val[0];
    202 
    203         // Premultiply.
    204         b = scale(b, a);
    205         g = scale(g, a);
    206         r = scale(r, a);
    207 
    208         // Store 8 premultiplied pixels.
    209         if (kSwapRB) {
    210             rgba.val[2] = r;
    211             rgba.val[1] = g;
    212             rgba.val[0] = b;
    213         } else {
    214             rgba.val[2] = b;
    215             rgba.val[1] = g;
    216             rgba.val[0] = r;
    217         }
    218         vst4_u8((uint8_t*) dst, rgba);
    219         src += 8;
    220         dst += 8;
    221         count -= 8;
    222     }
    223 
    224     // Call portable code to finish up the tail of [0,8) pixels.
    225     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    226     proc(dst, src, count);
    227 }
    228 
    229 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    230     premul_should_swapRB<false>(dst, src, count);
    231 }
    232 
    233 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    234     premul_should_swapRB<true>(dst, src, count);
    235 }
    236 
    237 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    238     using std::swap;
    239     while (count >= 16) {
    240         // Load 16 pixels.
    241         uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
    242 
    243         // Swap r and b.
    244         swap(rgba.val[0], rgba.val[2]);
    245 
    246         // Store 16 pixels.
    247         vst4q_u8((uint8_t*) dst, rgba);
    248         src += 16;
    249         dst += 16;
    250         count -= 16;
    251     }
    252 
    253     if (count >= 8) {
    254         // Load 8 pixels.
    255         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
    256 
    257         // Swap r and b.
    258         swap(rgba.val[0], rgba.val[2]);
    259 
    260         // Store 8 pixels.
    261         vst4_u8((uint8_t*) dst, rgba);
    262         src += 8;
    263         dst += 8;
    264         count -= 8;
    265     }
    266 
    267     RGBA_to_BGRA_portable(dst, src, count);
    268 }
    269 
    270 template <bool kSwapRB>
    271 static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    272     while (count >= 16) {
    273         // Load 16 pixels.
    274         uint8x16x3_t rgb = vld3q_u8(src);
    275 
    276         // Insert an opaque alpha channel and swap if needed.
    277         uint8x16x4_t rgba;
    278         if (kSwapRB) {
    279             rgba.val[0] = rgb.val[2];
    280             rgba.val[2] = rgb.val[0];
    281         } else {
    282             rgba.val[0] = rgb.val[0];
    283             rgba.val[2] = rgb.val[2];
    284         }
    285         rgba.val[1] = rgb.val[1];
    286         rgba.val[3] = vdupq_n_u8(0xFF);
    287 
    288         // Store 16 pixels.
    289         vst4q_u8((uint8_t*) dst, rgba);
    290         src += 16*3;
    291         dst += 16;
    292         count -= 16;
    293     }
    294 
    295     if (count >= 8) {
    296         // Load 8 pixels.
    297         uint8x8x3_t rgb = vld3_u8(src);
    298 
    299         // Insert an opaque alpha channel and swap if needed.
    300         uint8x8x4_t rgba;
    301         if (kSwapRB) {
    302             rgba.val[0] = rgb.val[2];
    303             rgba.val[2] = rgb.val[0];
    304         } else {
    305             rgba.val[0] = rgb.val[0];
    306             rgba.val[2] = rgb.val[2];
    307         }
    308         rgba.val[1] = rgb.val[1];
    309         rgba.val[3] = vdup_n_u8(0xFF);
    310 
    311         // Store 8 pixels.
    312         vst4_u8((uint8_t*) dst, rgba);
    313         src += 8*3;
    314         dst += 8;
    315         count -= 8;
    316     }
    317 
    318     // Call portable code to finish up the tail of [0,8) pixels.
    319     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    320     proc(dst, src, count);
    321 }
    322 
    323 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    324     insert_alpha_should_swaprb<false>(dst, src, count);
    325 }
    326 
    327 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    328     insert_alpha_should_swaprb<true>(dst, src, count);
    329 }
    330 
    331 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    332     while (count >= 16) {
    333         // Load 16 pixels.
    334         uint8x16_t gray = vld1q_u8(src);
    335 
    336         // Set each of the color channels.
    337         uint8x16x4_t rgba;
    338         rgba.val[0] = gray;
    339         rgba.val[1] = gray;
    340         rgba.val[2] = gray;
    341         rgba.val[3] = vdupq_n_u8(0xFF);
    342 
    343         // Store 16 pixels.
    344         vst4q_u8((uint8_t*) dst, rgba);
    345         src += 16;
    346         dst += 16;
    347         count -= 16;
    348     }
    349 
    350     if (count >= 8) {
    351         // Load 8 pixels.
    352         uint8x8_t gray = vld1_u8(src);
    353 
    354         // Set each of the color channels.
    355         uint8x8x4_t rgba;
    356         rgba.val[0] = gray;
    357         rgba.val[1] = gray;
    358         rgba.val[2] = gray;
    359         rgba.val[3] = vdup_n_u8(0xFF);
    360 
    361         // Store 8 pixels.
    362         vst4_u8((uint8_t*) dst, rgba);
    363         src += 8;
    364         dst += 8;
    365         count -= 8;
    366     }
    367 
    368     gray_to_RGB1_portable(dst, src, count);
    369 }
    370 
    371 template <bool kPremul>
    372 static void expand_grayA(uint32_t dst[], const uint8_t* src, int count) {
    373     while (count >= 16) {
    374         // Load 16 pixels.
    375         uint8x16x2_t ga = vld2q_u8(src);
    376 
    377         // Premultiply if requested.
    378         if (kPremul) {
    379             ga.val[0] = vcombine_u8(
    380                     scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
    381                     scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
    382         }
    383 
    384         // Set each of the color channels.
    385         uint8x16x4_t rgba;
    386         rgba.val[0] = ga.val[0];
    387         rgba.val[1] = ga.val[0];
    388         rgba.val[2] = ga.val[0];
    389         rgba.val[3] = ga.val[1];
    390 
    391         // Store 16 pixels.
    392         vst4q_u8((uint8_t*) dst, rgba);
    393         src += 16*2;
    394         dst += 16;
    395         count -= 16;
    396     }
    397 
    398     if (count >= 8) {
    399         // Load 8 pixels.
    400         uint8x8x2_t ga = vld2_u8(src);
    401 
    402         // Premultiply if requested.
    403         if (kPremul) {
    404             ga.val[0] = scale(ga.val[0], ga.val[1]);
    405         }
    406 
    407         // Set each of the color channels.
    408         uint8x8x4_t rgba;
    409         rgba.val[0] = ga.val[0];
    410         rgba.val[1] = ga.val[0];
    411         rgba.val[2] = ga.val[0];
    412         rgba.val[3] = ga.val[1];
    413 
    414         // Store 8 pixels.
    415         vst4_u8((uint8_t*) dst, rgba);
    416         src += 8*2;
    417         dst += 8;
    418         count -= 8;
    419     }
    420 
    421     auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    422     proc(dst, src, count);
    423 }
    424 
    425 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    426     expand_grayA<false>(dst, src, count);
    427 }
    428 
    429 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    430     expand_grayA<true>(dst, src, count);
    431 }
    432 
    433 enum Format { kRGB1, kBGR1 };
    434 template <Format format>
    435 static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    436     while (count >= 8) {
    437         // Load 8 cmyk pixels.
    438         uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
    439 
    440         uint8x8_t k = pixels.val[3],
    441                   y = pixels.val[2],
    442                   m = pixels.val[1],
    443                   c = pixels.val[0];
    444 
    445         // Scale to r, g, b.
    446         uint8x8_t b = scale(y, k);
    447         uint8x8_t g = scale(m, k);
    448         uint8x8_t r = scale(c, k);
    449 
    450         // Store 8 rgba pixels.
    451         if (kBGR1 == format) {
    452             pixels.val[3] = vdup_n_u8(0xFF);
    453             pixels.val[2] = r;
    454             pixels.val[1] = g;
    455             pixels.val[0] = b;
    456         } else {
    457             pixels.val[3] = vdup_n_u8(0xFF);
    458             pixels.val[2] = b;
    459             pixels.val[1] = g;
    460             pixels.val[0] = r;
    461         }
    462         vst4_u8((uint8_t*) dst, pixels);
    463         src += 8;
    464         dst += 8;
    465         count -= 8;
    466     }
    467 
    468     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    469     proc(dst, src, count);
    470 }
    471 
    472 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    473     inverted_cmyk_to<kRGB1>(dst, src, count);
    474 }
    475 
    476 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    477     inverted_cmyk_to<kBGR1>(dst, src, count);
    478 }
    479 
    480 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    481 
    482 // Scale a byte by another.
    483 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
    484 static __m128i scale(__m128i x, __m128i y) {
    485     const __m128i _128 = _mm_set1_epi16(128);
    486     const __m128i _257 = _mm_set1_epi16(257);
    487 
    488     // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    489     return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
    490 }
    491 
    492 template <bool kSwapRB>
    493 static void premul_should_swapRB(uint32_t* dst, const uint32_t* src, int count) {
    494 
    495     auto premul8 = [](__m128i* lo, __m128i* hi) {
    496         const __m128i zeros = _mm_setzero_si128();
    497         __m128i planar;
    498         if (kSwapRB) {
    499             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
    500         } else {
    501             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
    502         }
    503 
    504         // Swizzle the pixels to 8-bit planar.
    505         *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
    506         *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
    507         __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
    508                 ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA
    509 
    510         // Unpack to 16-bit planar.
    511         __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
    512                 g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
    513                 b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
    514                 a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_
    515 
    516         // Premultiply!
    517         r = scale(r, a);
    518         g = scale(g, a);
    519         b = scale(b, a);
    520 
    521         // Repack into interlaced pixels.
    522         rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
    523         ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
    524         *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
    525         *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
    526     };
    527 
    528     while (count >= 8) {
    529         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
    530                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
    531 
    532         premul8(&lo, &hi);
    533 
    534         _mm_storeu_si128((__m128i*) (dst + 0), lo);
    535         _mm_storeu_si128((__m128i*) (dst + 4), hi);
    536 
    537         src += 8;
    538         dst += 8;
    539         count -= 8;
    540     }
    541 
    542     if (count >= 4) {
    543         __m128i lo = _mm_loadu_si128((const __m128i*) src),
    544                 hi = _mm_setzero_si128();
    545 
    546         premul8(&lo, &hi);
    547 
    548         _mm_storeu_si128((__m128i*) dst, lo);
    549 
    550         src += 4;
    551         dst += 4;
    552         count -= 4;
    553     }
    554 
    555     // Call portable code to finish up the tail of [0,4) pixels.
    556     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    557     proc(dst, src, count);
    558 }
    559 
    560 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    561     premul_should_swapRB<false>(dst, src, count);
    562 }
    563 
    564 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    565     premul_should_swapRB<true>(dst, src, count);
    566 }
    567 
    568 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    569     const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
    570 
    571     while (count >= 4) {
    572         __m128i rgba = _mm_loadu_si128((const __m128i*) src);
    573         __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
    574         _mm_storeu_si128((__m128i*) dst, bgra);
    575 
    576         src += 4;
    577         dst += 4;
    578         count -= 4;
    579     }
    580 
    581     RGBA_to_BGRA_portable(dst, src, count);
    582 }
    583 
    584 template <bool kSwapRB>
    585 static void insert_alpha_should_swaprb(uint32_t dst[], const uint8_t* src, int count) {
    586     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    587     __m128i expand;
    588     const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
    589     if (kSwapRB) {
    590         expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    591     } else {
    592         expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    593     }
    594 
    595     while (count >= 6) {
    596         // Load a vector.  While this actually contains 5 pixels plus an
    597         // extra component, we will discard all but the first four pixels on
    598         // this iteration.
    599         __m128i rgb = _mm_loadu_si128((const __m128i*) src);
    600 
    601         // Expand the first four pixels to RGBX and then mask to RGB(FF).
    602         __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
    603 
    604         // Store 4 pixels.
    605         _mm_storeu_si128((__m128i*) dst, rgba);
    606 
    607         src += 4*3;
    608         dst += 4;
    609         count -= 4;
    610     }
    611 
    612     // Call portable code to finish up the tail of [0,4) pixels.
    613     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    614     proc(dst, src, count);
    615 }
    616 
    617 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    618     insert_alpha_should_swaprb<false>(dst, src, count);
    619 }
    620 
    621 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    622     insert_alpha_should_swaprb<true>(dst, src, count);
    623 }
    624 
    625 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    626     const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    627     while (count >= 16) {
    628         __m128i grays = _mm_loadu_si128((const __m128i*) src);
    629 
    630         __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
    631         __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
    632         __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
    633         __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
    634 
    635         __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
    636         __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
    637         __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
    638         __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
    639 
    640         _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
    641         _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
    642         _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
    643         _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
    644 
    645         src += 16;
    646         dst += 16;
    647         count -= 16;
    648     }
    649 
    650     gray_to_RGB1_portable(dst, src, count);
    651 }
    652 
    653 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    654     while (count >= 8) {
    655         __m128i ga = _mm_loadu_si128((const __m128i*) src);
    656 
    657         __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
    658                                   _mm_slli_epi16(ga, 8));
    659 
    660         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
    661         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
    662 
    663         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
    664         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
    665 
    666         src += 8*2;
    667         dst += 8;
    668         count -= 8;
    669     }
    670 
    671     grayA_to_RGBA_portable(dst, src, count);
    672 }
    673 
    674 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    675     while (count >= 8) {
    676         __m128i grayA = _mm_loadu_si128((const __m128i*) src);
    677 
    678         __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
    679         __m128i a0 = _mm_srli_epi16(grayA, 8);
    680 
    681         // Premultiply
    682         g0 = scale(g0, a0);
    683 
    684         __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
    685         __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
    686 
    687 
    688         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
    689         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
    690 
    691         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
    692         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
    693 
    694         src += 8*2;
    695         dst += 8;
    696         count -= 8;
    697     }
    698 
    699     grayA_to_rgbA_portable(dst, src, count);
    700 }
    701 
    702 enum Format { kRGB1, kBGR1 };
    703 template <Format format>
    704 static void inverted_cmyk_to(uint32_t* dst, const uint32_t* src, int count) {
    705     auto convert8 = [](__m128i* lo, __m128i* hi) {
    706         const __m128i zeros = _mm_setzero_si128();
    707         __m128i planar;
    708         if (kBGR1 == format) {
    709             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
    710         } else {
    711             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
    712         }
    713 
    714         // Swizzle the pixels to 8-bit planar.
    715         *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
    716         *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
    717         __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
    718                 yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK
    719 
    720         // Unpack to 16-bit planar.
    721         __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
    722                 m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
    723                 y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
    724                 k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_
    725 
    726         // Scale to r, g, b.
    727         __m128i r = scale(c, k),
    728                 g = scale(m, k),
    729                 b = scale(y, k);
    730 
    731         // Repack into interlaced pixels.
    732         __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
    733                 ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
    734         *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgbargba rgbargba
    735         *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
    736     };
    737 
    738     while (count >= 8) {
    739         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
    740                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
    741 
    742         convert8(&lo, &hi);
    743 
    744         _mm_storeu_si128((__m128i*) (dst + 0), lo);
    745         _mm_storeu_si128((__m128i*) (dst + 4), hi);
    746 
    747         src += 8;
    748         dst += 8;
    749         count -= 8;
    750     }
    751 
    752     if (count >= 4) {
    753         __m128i lo = _mm_loadu_si128((const __m128i*) src),
    754                 hi = _mm_setzero_si128();
    755 
    756         convert8(&lo, &hi);
    757 
    758         _mm_storeu_si128((__m128i*) dst, lo);
    759 
    760         src += 4;
    761         dst += 4;
    762         count -= 4;
    763     }
    764 
    765     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    766     proc(dst, src, count);
    767 }
    768 
    769 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    770     inverted_cmyk_to<kRGB1>(dst, src, count);
    771 }
    772 
    773 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    774     inverted_cmyk_to<kBGR1>(dst, src, count);
    775 }
    776 
    777 #else
    778 
    779 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const uint32_t* src, int count) {
    780     RGBA_to_rgbA_portable(dst, src, count);
    781 }
    782 
    783 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const uint32_t* src, int count) {
    784     RGBA_to_bgrA_portable(dst, src, count);
    785 }
    786 
    787 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const uint32_t* src, int count) {
    788     RGBA_to_BGRA_portable(dst, src, count);
    789 }
    790 
    791 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    792     RGB_to_RGB1_portable(dst, src, count);
    793 }
    794 
    795 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const uint8_t* src, int count) {
    796     RGB_to_BGR1_portable(dst, src, count);
    797 }
    798 
    799 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const uint8_t* src, int count) {
    800     gray_to_RGB1_portable(dst, src, count);
    801 }
    802 
    803 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const uint8_t* src, int count) {
    804     grayA_to_RGBA_portable(dst, src, count);
    805 }
    806 
    807 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const uint8_t* src, int count) {
    808     grayA_to_rgbA_portable(dst, src, count);
    809 }
    810 
    811 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const uint32_t* src, int count) {
    812     inverted_CMYK_to_RGB1_portable(dst, src, count);
    813 }
    814 
    815 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const uint32_t* src, int count) {
    816     inverted_CMYK_to_BGR1_portable(dst, src, count);
    817 }
    818 
    819 #endif
    820 
    821 }
    822 
    823 #endif // SkSwizzler_opts_DEFINED
    824