Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2016 Google Inc.
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #ifndef SkSwizzler_opts_DEFINED
      9 #define SkSwizzler_opts_DEFINED
     10 
     11 #include "SkColorData.h"
     12 
     13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
     14     #include <immintrin.h>
     15 #elif defined(SK_ARM_HAS_NEON)
     16     #include <arm_neon.h>
     17 #endif
     18 
     19 namespace SK_OPTS_NS {
     20 
     21 static void RGBA_to_rgbA_portable(uint32_t* dst, const void* vsrc, int count) {
     22     auto src = (const uint32_t*)vsrc;
     23     for (int i = 0; i < count; i++) {
     24         uint8_t a = src[i] >> 24,
     25                 b = src[i] >> 16,
     26                 g = src[i] >>  8,
     27                 r = src[i] >>  0;
     28         b = (b*a+127)/255;
     29         g = (g*a+127)/255;
     30         r = (r*a+127)/255;
     31         dst[i] = (uint32_t)a << 24
     32                | (uint32_t)b << 16
     33                | (uint32_t)g <<  8
     34                | (uint32_t)r <<  0;
     35     }
     36 }
     37 
     38 static void RGBA_to_bgrA_portable(uint32_t* dst, const void* vsrc, int count) {
     39     auto src = (const uint32_t*)vsrc;
     40     for (int i = 0; i < count; i++) {
     41         uint8_t a = src[i] >> 24,
     42                 b = src[i] >> 16,
     43                 g = src[i] >>  8,
     44                 r = src[i] >>  0;
     45         b = (b*a+127)/255;
     46         g = (g*a+127)/255;
     47         r = (r*a+127)/255;
     48         dst[i] = (uint32_t)a << 24
     49                | (uint32_t)r << 16
     50                | (uint32_t)g <<  8
     51                | (uint32_t)b <<  0;
     52     }
     53 }
     54 
     55 static void RGBA_to_BGRA_portable(uint32_t* dst, const void* vsrc, int count) {
     56     auto src = (const uint32_t*)vsrc;
     57     for (int i = 0; i < count; i++) {
     58         uint8_t a = src[i] >> 24,
     59                 b = src[i] >> 16,
     60                 g = src[i] >>  8,
     61                 r = src[i] >>  0;
     62         dst[i] = (uint32_t)a << 24
     63                | (uint32_t)r << 16
     64                | (uint32_t)g <<  8
     65                | (uint32_t)b <<  0;
     66     }
     67 }
     68 
     69 static void RGB_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
     70     const uint8_t* src = (const uint8_t*)vsrc;
     71     for (int i = 0; i < count; i++) {
     72         uint8_t r = src[0],
     73                 g = src[1],
     74                 b = src[2];
     75         src += 3;
     76         dst[i] = (uint32_t)0xFF << 24
     77                | (uint32_t)b    << 16
     78                | (uint32_t)g    <<  8
     79                | (uint32_t)r    <<  0;
     80     }
     81 }
     82 
     83 static void RGB_to_BGR1_portable(uint32_t dst[], const void* vsrc, int count) {
     84     const uint8_t* src = (const uint8_t*)vsrc;
     85     for (int i = 0; i < count; i++) {
     86         uint8_t r = src[0],
     87                 g = src[1],
     88                 b = src[2];
     89         src += 3;
     90         dst[i] = (uint32_t)0xFF << 24
     91                | (uint32_t)r    << 16
     92                | (uint32_t)g    <<  8
     93                | (uint32_t)b    <<  0;
     94     }
     95 }
     96 
     97 static void gray_to_RGB1_portable(uint32_t dst[], const void* vsrc, int count) {
     98     const uint8_t* src = (const uint8_t*)vsrc;
     99     for (int i = 0; i < count; i++) {
    100         dst[i] = (uint32_t)0xFF   << 24
    101                | (uint32_t)src[i] << 16
    102                | (uint32_t)src[i] <<  8
    103                | (uint32_t)src[i] <<  0;
    104     }
    105 }
    106 
    107 static void grayA_to_RGBA_portable(uint32_t dst[], const void* vsrc, int count) {
    108     const uint8_t* src = (const uint8_t*)vsrc;
    109     for (int i = 0; i < count; i++) {
    110         uint8_t g = src[0],
    111                 a = src[1];
    112         src += 2;
    113         dst[i] = (uint32_t)a << 24
    114                | (uint32_t)g << 16
    115                | (uint32_t)g <<  8
    116                | (uint32_t)g <<  0;
    117     }
    118 }
    119 
    120 static void grayA_to_rgbA_portable(uint32_t dst[], const void* vsrc, int count) {
    121     const uint8_t* src = (const uint8_t*)vsrc;
    122     for (int i = 0; i < count; i++) {
    123         uint8_t g = src[0],
    124                 a = src[1];
    125         src += 2;
    126         g = (g*a+127)/255;
    127         dst[i] = (uint32_t)a << 24
    128                | (uint32_t)g << 16
    129                | (uint32_t)g <<  8
    130                | (uint32_t)g <<  0;
    131     }
    132 }
    133 
    134 static void inverted_CMYK_to_RGB1_portable(uint32_t* dst, const void* vsrc, int count) {
    135     const uint32_t* src = (const uint32_t*)vsrc;
    136     for (int i = 0; i < count; i++) {
    137         uint8_t k = src[i] >> 24,
    138                 y = src[i] >> 16,
    139                 m = src[i] >>  8,
    140                 c = src[i] >>  0;
    141         // See comments in SkSwizzler.cpp for details on the conversion formula.
    142         uint8_t b = (y*k+127)/255,
    143                 g = (m*k+127)/255,
    144                 r = (c*k+127)/255;
    145         dst[i] = (uint32_t)0xFF << 24
    146                | (uint32_t)   b << 16
    147                | (uint32_t)   g <<  8
    148                | (uint32_t)   r <<  0;
    149     }
    150 }
    151 
    152 static void inverted_CMYK_to_BGR1_portable(uint32_t* dst, const void* vsrc, int count) {
    153     const uint32_t* src = (const uint32_t*)vsrc;
    154     for (int i = 0; i < count; i++) {
    155         uint8_t k = src[i] >> 24,
    156                 y = src[i] >> 16,
    157                 m = src[i] >>  8,
    158                 c = src[i] >>  0;
    159         uint8_t b = (y*k+127)/255,
    160                 g = (m*k+127)/255,
    161                 r = (c*k+127)/255;
    162         dst[i] = (uint32_t)0xFF << 24
    163                | (uint32_t)   r << 16
    164                | (uint32_t)   g <<  8
    165                | (uint32_t)   b <<  0;
    166     }
    167 }
    168 
    169 #if defined(SK_ARM_HAS_NEON)
    170 
    171 // Rounded divide by 255, (x + 127) / 255
    172 static uint8x8_t div255_round(uint16x8_t x) {
    173     // result = (x + 127) / 255
    174     // result = (x + 127) / 256 + error1
    175     //
    176     // error1 = (x + 127) / (255 * 256)
    177     // error1 = (x + 127) / (256 * 256) + error2
    178     //
    179     // error2 = (x + 127) / (255 * 256 * 256)
    180     //
    181     // The maximum value of error2 is too small to matter.  Thus:
    182     // result = (x + 127) / 256 + (x + 127) / (256 * 256)
    183     // result = ((x + 127) / 256 + x + 127) / 256
    184     // result = ((x + 127) >> 8 + x + 127) >> 8
    185     //
    186     // Use >>> to represent "rounded right shift" which, conveniently,
    187     // NEON supports in one instruction.
    188     // result = ((x >>> 8) + x) >>> 8
    189     //
    190     // Note that the second right shift is actually performed as an
    191     // "add, round, and narrow back to 8-bits" instruction.
    192     return vraddhn_u16(x, vrshrq_n_u16(x, 8));
    193 }
    194 
    195 // Scale a byte by another, (x * y + 127) / 255
    196 static uint8x8_t scale(uint8x8_t x, uint8x8_t y) {
    197     return div255_round(vmull_u8(x, y));
    198 }
    199 
    200 template <bool kSwapRB>
    201 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    202     auto src = (const uint32_t*)vsrc;
    203     while (count >= 8) {
    204         // Load 8 pixels.
    205         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
    206 
    207         uint8x8_t a = rgba.val[3],
    208                   b = rgba.val[2],
    209                   g = rgba.val[1],
    210                   r = rgba.val[0];
    211 
    212         // Premultiply.
    213         b = scale(b, a);
    214         g = scale(g, a);
    215         r = scale(r, a);
    216 
    217         // Store 8 premultiplied pixels.
    218         if (kSwapRB) {
    219             rgba.val[2] = r;
    220             rgba.val[1] = g;
    221             rgba.val[0] = b;
    222         } else {
    223             rgba.val[2] = b;
    224             rgba.val[1] = g;
    225             rgba.val[0] = r;
    226         }
    227         vst4_u8((uint8_t*) dst, rgba);
    228         src += 8;
    229         dst += 8;
    230         count -= 8;
    231     }
    232 
    233     // Call portable code to finish up the tail of [0,8) pixels.
    234     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    235     proc(dst, src, count);
    236 }
    237 
    238 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    239     premul_should_swapRB<false>(dst, src, count);
    240 }
    241 
    242 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    243     premul_should_swapRB<true>(dst, src, count);
    244 }
    245 
    246 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    247     auto src = (const uint32_t*)vsrc;
    248     while (count >= 16) {
    249         // Load 16 pixels.
    250         uint8x16x4_t rgba = vld4q_u8((const uint8_t*) src);
    251 
    252         // Swap r and b.
    253         SkTSwap(rgba.val[0], rgba.val[2]);
    254 
    255         // Store 16 pixels.
    256         vst4q_u8((uint8_t*) dst, rgba);
    257         src += 16;
    258         dst += 16;
    259         count -= 16;
    260     }
    261 
    262     if (count >= 8) {
    263         // Load 8 pixels.
    264         uint8x8x4_t rgba = vld4_u8((const uint8_t*) src);
    265 
    266         // Swap r and b.
    267         SkTSwap(rgba.val[0], rgba.val[2]);
    268 
    269         // Store 8 pixels.
    270         vst4_u8((uint8_t*) dst, rgba);
    271         src += 8;
    272         dst += 8;
    273         count -= 8;
    274     }
    275 
    276     RGBA_to_BGRA_portable(dst, src, count);
    277 }
    278 
    279 template <bool kSwapRB>
    280 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    281     const uint8_t* src = (const uint8_t*) vsrc;
    282     while (count >= 16) {
    283         // Load 16 pixels.
    284         uint8x16x3_t rgb = vld3q_u8(src);
    285 
    286         // Insert an opaque alpha channel and swap if needed.
    287         uint8x16x4_t rgba;
    288         if (kSwapRB) {
    289             rgba.val[0] = rgb.val[2];
    290             rgba.val[2] = rgb.val[0];
    291         } else {
    292             rgba.val[0] = rgb.val[0];
    293             rgba.val[2] = rgb.val[2];
    294         }
    295         rgba.val[1] = rgb.val[1];
    296         rgba.val[3] = vdupq_n_u8(0xFF);
    297 
    298         // Store 16 pixels.
    299         vst4q_u8((uint8_t*) dst, rgba);
    300         src += 16*3;
    301         dst += 16;
    302         count -= 16;
    303     }
    304 
    305     if (count >= 8) {
    306         // Load 8 pixels.
    307         uint8x8x3_t rgb = vld3_u8(src);
    308 
    309         // Insert an opaque alpha channel and swap if needed.
    310         uint8x8x4_t rgba;
    311         if (kSwapRB) {
    312             rgba.val[0] = rgb.val[2];
    313             rgba.val[2] = rgb.val[0];
    314         } else {
    315             rgba.val[0] = rgb.val[0];
    316             rgba.val[2] = rgb.val[2];
    317         }
    318         rgba.val[1] = rgb.val[1];
    319         rgba.val[3] = vdup_n_u8(0xFF);
    320 
    321         // Store 8 pixels.
    322         vst4_u8((uint8_t*) dst, rgba);
    323         src += 8*3;
    324         dst += 8;
    325         count -= 8;
    326     }
    327 
    328     // Call portable code to finish up the tail of [0,8) pixels.
    329     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    330     proc(dst, src, count);
    331 }
    332 
    333 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    334     insert_alpha_should_swaprb<false>(dst, src, count);
    335 }
    336 
    337 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    338     insert_alpha_should_swaprb<true>(dst, src, count);
    339 }
    340 
    341 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    342     const uint8_t* src = (const uint8_t*) vsrc;
    343     while (count >= 16) {
    344         // Load 16 pixels.
    345         uint8x16_t gray = vld1q_u8(src);
    346 
    347         // Set each of the color channels.
    348         uint8x16x4_t rgba;
    349         rgba.val[0] = gray;
    350         rgba.val[1] = gray;
    351         rgba.val[2] = gray;
    352         rgba.val[3] = vdupq_n_u8(0xFF);
    353 
    354         // Store 16 pixels.
    355         vst4q_u8((uint8_t*) dst, rgba);
    356         src += 16;
    357         dst += 16;
    358         count -= 16;
    359     }
    360 
    361     if (count >= 8) {
    362         // Load 8 pixels.
    363         uint8x8_t gray = vld1_u8(src);
    364 
    365         // Set each of the color channels.
    366         uint8x8x4_t rgba;
    367         rgba.val[0] = gray;
    368         rgba.val[1] = gray;
    369         rgba.val[2] = gray;
    370         rgba.val[3] = vdup_n_u8(0xFF);
    371 
    372         // Store 8 pixels.
    373         vst4_u8((uint8_t*) dst, rgba);
    374         src += 8;
    375         dst += 8;
    376         count -= 8;
    377     }
    378 
    379     gray_to_RGB1_portable(dst, src, count);
    380 }
    381 
    382 template <bool kPremul>
    383 static void expand_grayA(uint32_t dst[], const void* vsrc, int count) {
    384     const uint8_t* src = (const uint8_t*) vsrc;
    385     while (count >= 16) {
    386         // Load 16 pixels.
    387         uint8x16x2_t ga = vld2q_u8(src);
    388 
    389         // Premultiply if requested.
    390         if (kPremul) {
    391             ga.val[0] = vcombine_u8(
    392                     scale(vget_low_u8(ga.val[0]),  vget_low_u8(ga.val[1])),
    393                     scale(vget_high_u8(ga.val[0]), vget_high_u8(ga.val[1])));
    394         }
    395 
    396         // Set each of the color channels.
    397         uint8x16x4_t rgba;
    398         rgba.val[0] = ga.val[0];
    399         rgba.val[1] = ga.val[0];
    400         rgba.val[2] = ga.val[0];
    401         rgba.val[3] = ga.val[1];
    402 
    403         // Store 16 pixels.
    404         vst4q_u8((uint8_t*) dst, rgba);
    405         src += 16*2;
    406         dst += 16;
    407         count -= 16;
    408     }
    409 
    410     if (count >= 8) {
    411         // Load 8 pixels.
    412         uint8x8x2_t ga = vld2_u8(src);
    413 
    414         // Premultiply if requested.
    415         if (kPremul) {
    416             ga.val[0] = scale(ga.val[0], ga.val[1]);
    417         }
    418 
    419         // Set each of the color channels.
    420         uint8x8x4_t rgba;
    421         rgba.val[0] = ga.val[0];
    422         rgba.val[1] = ga.val[0];
    423         rgba.val[2] = ga.val[0];
    424         rgba.val[3] = ga.val[1];
    425 
    426         // Store 8 pixels.
    427         vst4_u8((uint8_t*) dst, rgba);
    428         src += 8*2;
    429         dst += 8;
    430         count -= 8;
    431     }
    432 
    433     auto proc = kPremul ? grayA_to_rgbA_portable : grayA_to_RGBA_portable;
    434     proc(dst, src, count);
    435 }
    436 
    437 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    438     expand_grayA<false>(dst, src, count);
    439 }
    440 
    441 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    442     expand_grayA<true>(dst, src, count);
    443 }
    444 
    445 enum Format { kRGB1, kBGR1 };
    446 template <Format format>
    447 static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    448     auto src = (const uint32_t*)vsrc;
    449     while (count >= 8) {
    450         // Load 8 cmyk pixels.
    451         uint8x8x4_t pixels = vld4_u8((const uint8_t*) src);
    452 
    453         uint8x8_t k = pixels.val[3],
    454                   y = pixels.val[2],
    455                   m = pixels.val[1],
    456                   c = pixels.val[0];
    457 
    458         // Scale to r, g, b.
    459         uint8x8_t b = scale(y, k);
    460         uint8x8_t g = scale(m, k);
    461         uint8x8_t r = scale(c, k);
    462 
    463         // Store 8 rgba pixels.
    464         if (kBGR1 == format) {
    465             pixels.val[3] = vdup_n_u8(0xFF);
    466             pixels.val[2] = r;
    467             pixels.val[1] = g;
    468             pixels.val[0] = b;
    469         } else {
    470             pixels.val[3] = vdup_n_u8(0xFF);
    471             pixels.val[2] = b;
    472             pixels.val[1] = g;
    473             pixels.val[0] = r;
    474         }
    475         vst4_u8((uint8_t*) dst, pixels);
    476         src += 8;
    477         dst += 8;
    478         count -= 8;
    479     }
    480 
    481     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    482     proc(dst, src, count);
    483 }
    484 
    485 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    486     inverted_cmyk_to<kRGB1>(dst, src, count);
    487 }
    488 
    489 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    490     inverted_cmyk_to<kBGR1>(dst, src, count);
    491 }
    492 
    493 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3
    494 
    495 // Scale a byte by another.
    496 // Inputs are stored in 16-bit lanes, but are not larger than 8-bits.
    497 static __m128i scale(__m128i x, __m128i y) {
    498     const __m128i _128 = _mm_set1_epi16(128);
    499     const __m128i _257 = _mm_set1_epi16(257);
    500 
    501     // (x+127)/255 == ((x+128)*257)>>16 for 0 <= x <= 255*255.
    502     return _mm_mulhi_epu16(_mm_add_epi16(_mm_mullo_epi16(x, y), _128), _257);
    503 }
    504 
    505 template <bool kSwapRB>
    506 static void premul_should_swapRB(uint32_t* dst, const void* vsrc, int count) {
    507     auto src = (const uint32_t*)vsrc;
    508 
    509     auto premul8 = [](__m128i* lo, __m128i* hi) {
    510         const __m128i zeros = _mm_setzero_si128();
    511         __m128i planar;
    512         if (kSwapRB) {
    513             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
    514         } else {
    515             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
    516         }
    517 
    518         // Swizzle the pixels to 8-bit planar.
    519         *lo = _mm_shuffle_epi8(*lo, planar);                      // rrrrgggg bbbbaaaa
    520         *hi = _mm_shuffle_epi8(*hi, planar);                      // RRRRGGGG BBBBAAAA
    521         __m128i rg = _mm_unpacklo_epi32(*lo, *hi),                // rrrrRRRR ggggGGGG
    522                 ba = _mm_unpackhi_epi32(*lo, *hi);                // bbbbBBBB aaaaAAAA
    523 
    524         // Unpack to 16-bit planar.
    525         __m128i r = _mm_unpacklo_epi8(rg, zeros),                 // r_r_r_r_ R_R_R_R_
    526                 g = _mm_unpackhi_epi8(rg, zeros),                 // g_g_g_g_ G_G_G_G_
    527                 b = _mm_unpacklo_epi8(ba, zeros),                 // b_b_b_b_ B_B_B_B_
    528                 a = _mm_unpackhi_epi8(ba, zeros);                 // a_a_a_a_ A_A_A_A_
    529 
    530         // Premultiply!
    531         r = scale(r, a);
    532         g = scale(g, a);
    533         b = scale(b, a);
    534 
    535         // Repack into interlaced pixels.
    536         rg = _mm_or_si128(r, _mm_slli_epi16(g, 8));               // rgrgrgrg RGRGRGRG
    537         ba = _mm_or_si128(b, _mm_slli_epi16(a, 8));               // babababa BABABABA
    538         *lo = _mm_unpacklo_epi16(rg, ba);                         // rgbargba rgbargba
    539         *hi = _mm_unpackhi_epi16(rg, ba);                         // RGBARGBA RGBARGBA
    540     };
    541 
    542     while (count >= 8) {
    543         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
    544                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
    545 
    546         premul8(&lo, &hi);
    547 
    548         _mm_storeu_si128((__m128i*) (dst + 0), lo);
    549         _mm_storeu_si128((__m128i*) (dst + 4), hi);
    550 
    551         src += 8;
    552         dst += 8;
    553         count -= 8;
    554     }
    555 
    556     if (count >= 4) {
    557         __m128i lo = _mm_loadu_si128((const __m128i*) src),
    558                 hi = _mm_setzero_si128();
    559 
    560         premul8(&lo, &hi);
    561 
    562         _mm_storeu_si128((__m128i*) dst, lo);
    563 
    564         src += 4;
    565         dst += 4;
    566         count -= 4;
    567     }
    568 
    569     // Call portable code to finish up the tail of [0,4) pixels.
    570     auto proc = kSwapRB ? RGBA_to_bgrA_portable : RGBA_to_rgbA_portable;
    571     proc(dst, src, count);
    572 }
    573 
    574 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    575     premul_should_swapRB<false>(dst, src, count);
    576 }
    577 
    578 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    579     premul_should_swapRB<true>(dst, src, count);
    580 }
    581 
    582 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const void* vsrc, int count) {
    583     auto src = (const uint32_t*)vsrc;
    584     const __m128i swapRB = _mm_setr_epi8(2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15);
    585 
    586     while (count >= 4) {
    587         __m128i rgba = _mm_loadu_si128((const __m128i*) src);
    588         __m128i bgra = _mm_shuffle_epi8(rgba, swapRB);
    589         _mm_storeu_si128((__m128i*) dst, bgra);
    590 
    591         src += 4;
    592         dst += 4;
    593         count -= 4;
    594     }
    595 
    596     RGBA_to_BGRA_portable(dst, src, count);
    597 }
    598 
    599 template <bool kSwapRB>
    600 static void insert_alpha_should_swaprb(uint32_t dst[], const void* vsrc, int count) {
    601     const uint8_t* src = (const uint8_t*) vsrc;
    602 
    603     const __m128i alphaMask = _mm_set1_epi32(0xFF000000);
    604     __m128i expand;
    605     const uint8_t X = 0xFF; // Used a placeholder.  The value of X is irrelevant.
    606     if (kSwapRB) {
    607         expand = _mm_setr_epi8(2,1,0,X, 5,4,3,X, 8,7,6,X, 11,10,9,X);
    608     } else {
    609         expand = _mm_setr_epi8(0,1,2,X, 3,4,5,X, 6,7,8,X, 9,10,11,X);
    610     }
    611 
    612     while (count >= 6) {
    613         // Load a vector.  While this actually contains 5 pixels plus an
    614         // extra component, we will discard all but the first four pixels on
    615         // this iteration.
    616         __m128i rgb = _mm_loadu_si128((const __m128i*) src);
    617 
    618         // Expand the first four pixels to RGBX and then mask to RGB(FF).
    619         __m128i rgba = _mm_or_si128(_mm_shuffle_epi8(rgb, expand), alphaMask);
    620 
    621         // Store 4 pixels.
    622         _mm_storeu_si128((__m128i*) dst, rgba);
    623 
    624         src += 4*3;
    625         dst += 4;
    626         count -= 4;
    627     }
    628 
    629     // Call portable code to finish up the tail of [0,4) pixels.
    630     auto proc = kSwapRB ? RGB_to_BGR1_portable : RGB_to_RGB1_portable;
    631     proc(dst, src, count);
    632 }
    633 
    634 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    635     insert_alpha_should_swaprb<false>(dst, src, count);
    636 }
    637 
    638 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    639     insert_alpha_should_swaprb<true>(dst, src, count);
    640 }
    641 
    642 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const void* vsrc, int count) {
    643     const uint8_t* src = (const uint8_t*) vsrc;
    644 
    645     const __m128i alphas = _mm_set1_epi8((uint8_t) 0xFF);
    646     while (count >= 16) {
    647         __m128i grays = _mm_loadu_si128((const __m128i*) src);
    648 
    649         __m128i gg_lo = _mm_unpacklo_epi8(grays, grays);
    650         __m128i gg_hi = _mm_unpackhi_epi8(grays, grays);
    651         __m128i ga_lo = _mm_unpacklo_epi8(grays, alphas);
    652         __m128i ga_hi = _mm_unpackhi_epi8(grays, alphas);
    653 
    654         __m128i ggga0 = _mm_unpacklo_epi16(gg_lo, ga_lo);
    655         __m128i ggga1 = _mm_unpackhi_epi16(gg_lo, ga_lo);
    656         __m128i ggga2 = _mm_unpacklo_epi16(gg_hi, ga_hi);
    657         __m128i ggga3 = _mm_unpackhi_epi16(gg_hi, ga_hi);
    658 
    659         _mm_storeu_si128((__m128i*) (dst +  0), ggga0);
    660         _mm_storeu_si128((__m128i*) (dst +  4), ggga1);
    661         _mm_storeu_si128((__m128i*) (dst +  8), ggga2);
    662         _mm_storeu_si128((__m128i*) (dst + 12), ggga3);
    663 
    664         src += 16;
    665         dst += 16;
    666         count -= 16;
    667     }
    668 
    669     gray_to_RGB1_portable(dst, src, count);
    670 }
    671 
    672 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const void* vsrc, int count) {
    673     const uint8_t* src = (const uint8_t*) vsrc;
    674     while (count >= 8) {
    675         __m128i ga = _mm_loadu_si128((const __m128i*) src);
    676 
    677         __m128i gg = _mm_or_si128(_mm_and_si128(ga, _mm_set1_epi16(0x00FF)),
    678                                   _mm_slli_epi16(ga, 8));
    679 
    680         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
    681         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
    682 
    683         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
    684         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
    685 
    686         src += 8*2;
    687         dst += 8;
    688         count -= 8;
    689     }
    690 
    691     grayA_to_RGBA_portable(dst, src, count);
    692 }
    693 
    694 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const void* vsrc, int count) {
    695     const uint8_t* src = (const uint8_t*) vsrc;
    696     while (count >= 8) {
    697         __m128i grayA = _mm_loadu_si128((const __m128i*) src);
    698 
    699         __m128i g0 = _mm_and_si128(grayA, _mm_set1_epi16(0x00FF));
    700         __m128i a0 = _mm_srli_epi16(grayA, 8);
    701 
    702         // Premultiply
    703         g0 = scale(g0, a0);
    704 
    705         __m128i gg = _mm_or_si128(g0, _mm_slli_epi16(g0, 8));
    706         __m128i ga = _mm_or_si128(g0, _mm_slli_epi16(a0, 8));
    707 
    708 
    709         __m128i ggga_lo = _mm_unpacklo_epi16(gg, ga);
    710         __m128i ggga_hi = _mm_unpackhi_epi16(gg, ga);
    711 
    712         _mm_storeu_si128((__m128i*) (dst +  0), ggga_lo);
    713         _mm_storeu_si128((__m128i*) (dst +  4), ggga_hi);
    714 
    715         src += 8*2;
    716         dst += 8;
    717         count -= 8;
    718     }
    719 
    720     grayA_to_rgbA_portable(dst, src, count);
    721 }
    722 
    723 enum Format { kRGB1, kBGR1 };
    724 template <Format format>
    725 static void inverted_cmyk_to(uint32_t* dst, const void* vsrc, int count) {
    726     auto src = (const uint32_t*)vsrc;
    727 
    728     auto convert8 = [](__m128i* lo, __m128i* hi) {
    729         const __m128i zeros = _mm_setzero_si128();
    730         __m128i planar;
    731         if (kBGR1 == format) {
    732             planar = _mm_setr_epi8(2,6,10,14, 1,5,9,13, 0,4,8,12, 3,7,11,15);
    733         } else {
    734             planar = _mm_setr_epi8(0,4,8,12, 1,5,9,13, 2,6,10,14, 3,7,11,15);
    735         }
    736 
    737         // Swizzle the pixels to 8-bit planar.
    738         *lo = _mm_shuffle_epi8(*lo, planar);                                 // ccccmmmm yyyykkkk
    739         *hi = _mm_shuffle_epi8(*hi, planar);                                 // CCCCMMMM YYYYKKKK
    740         __m128i cm = _mm_unpacklo_epi32(*lo, *hi),                           // ccccCCCC mmmmMMMM
    741                 yk = _mm_unpackhi_epi32(*lo, *hi);                           // yyyyYYYY kkkkKKKK
    742 
    743         // Unpack to 16-bit planar.
    744         __m128i c = _mm_unpacklo_epi8(cm, zeros),                            // c_c_c_c_ C_C_C_C_
    745                 m = _mm_unpackhi_epi8(cm, zeros),                            // m_m_m_m_ M_M_M_M_
    746                 y = _mm_unpacklo_epi8(yk, zeros),                            // y_y_y_y_ Y_Y_Y_Y_
    747                 k = _mm_unpackhi_epi8(yk, zeros);                            // k_k_k_k_ K_K_K_K_
    748 
    749         // Scale to r, g, b.
    750         __m128i r = scale(c, k),
    751                 g = scale(m, k),
    752                 b = scale(y, k);
    753 
    754         // Repack into interlaced pixels.
    755         __m128i rg = _mm_or_si128(r, _mm_slli_epi16(g, 8)),                  // rgrgrgrg RGRGRGRG
    756                 ba = _mm_or_si128(b, _mm_set1_epi16((uint16_t) 0xFF00));     // b1b1b1b1 B1B1B1B1
    757         *lo = _mm_unpacklo_epi16(rg, ba);                                    // rgbargba rgbargba
    758         *hi = _mm_unpackhi_epi16(rg, ba);                                    // RGB1RGB1 RGB1RGB1
    759     };
    760 
    761     while (count >= 8) {
    762         __m128i lo = _mm_loadu_si128((const __m128i*) (src + 0)),
    763                 hi = _mm_loadu_si128((const __m128i*) (src + 4));
    764 
    765         convert8(&lo, &hi);
    766 
    767         _mm_storeu_si128((__m128i*) (dst + 0), lo);
    768         _mm_storeu_si128((__m128i*) (dst + 4), hi);
    769 
    770         src += 8;
    771         dst += 8;
    772         count -= 8;
    773     }
    774 
    775     if (count >= 4) {
    776         __m128i lo = _mm_loadu_si128((const __m128i*) src),
    777                 hi = _mm_setzero_si128();
    778 
    779         convert8(&lo, &hi);
    780 
    781         _mm_storeu_si128((__m128i*) dst, lo);
    782 
    783         src += 4;
    784         dst += 4;
    785         count -= 4;
    786     }
    787 
    788     auto proc = (kBGR1 == format) ? inverted_CMYK_to_BGR1_portable : inverted_CMYK_to_RGB1_portable;
    789     proc(dst, src, count);
    790 }
    791 
    792 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    793     inverted_cmyk_to<kRGB1>(dst, src, count);
    794 }
    795 
    796 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    797     inverted_cmyk_to<kBGR1>(dst, src, count);
    798 }
    799 
    800 #else
    801 
    802 /*not static*/ inline void RGBA_to_rgbA(uint32_t* dst, const void* src, int count) {
    803     RGBA_to_rgbA_portable(dst, src, count);
    804 }
    805 
    806 /*not static*/ inline void RGBA_to_bgrA(uint32_t* dst, const void* src, int count) {
    807     RGBA_to_bgrA_portable(dst, src, count);
    808 }
    809 
    810 /*not static*/ inline void RGBA_to_BGRA(uint32_t* dst, const void* src, int count) {
    811     RGBA_to_BGRA_portable(dst, src, count);
    812 }
    813 
    814 /*not static*/ inline void RGB_to_RGB1(uint32_t dst[], const void* src, int count) {
    815     RGB_to_RGB1_portable(dst, src, count);
    816 }
    817 
    818 /*not static*/ inline void RGB_to_BGR1(uint32_t dst[], const void* src, int count) {
    819     RGB_to_BGR1_portable(dst, src, count);
    820 }
    821 
    822 /*not static*/ inline void gray_to_RGB1(uint32_t dst[], const void* src, int count) {
    823     gray_to_RGB1_portable(dst, src, count);
    824 }
    825 
    826 /*not static*/ inline void grayA_to_RGBA(uint32_t dst[], const void* src, int count) {
    827     grayA_to_RGBA_portable(dst, src, count);
    828 }
    829 
    830 /*not static*/ inline void grayA_to_rgbA(uint32_t dst[], const void* src, int count) {
    831     grayA_to_rgbA_portable(dst, src, count);
    832 }
    833 
    834 /*not static*/ inline void inverted_CMYK_to_RGB1(uint32_t dst[], const void* src, int count) {
    835     inverted_CMYK_to_RGB1_portable(dst, src, count);
    836 }
    837 
    838 /*not static*/ inline void inverted_CMYK_to_BGR1(uint32_t dst[], const void* src, int count) {
    839     inverted_CMYK_to_BGR1_portable(dst, src, count);
    840 }
    841 
    842 #endif
    843 
    844 }
    845 
    846 #endif // SkSwizzler_opts_DEFINED
    847