Home | History | Annotate | Download | only in opts
      1 /*
      2  * Copyright 2012 The Android Open Source Project
      3  *
      4  * Use of this source code is governed by a BSD-style license that can be
      5  * found in the LICENSE file.
      6  */
      7 
      8 #include <emmintrin.h>
      9 #include "SkBitmapProcState_opts_SSE2.h"
     10 #include "SkBlitRow_opts_SSE2.h"
     11 #include "SkColorData.h"
     12 #include "SkColor_opts_SSE2.h"
     13 #include "SkDither.h"
     14 #include "SkMSAN.h"
     15 #include "SkUtils.h"
     16 
     17 /* SSE2 version of S32_Blend_BlitRow32()
     18  * portable version is in core/SkBlitRow_D32.cpp
     19  */
     20 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     21                               const SkPMColor* SK_RESTRICT src,
     22                               int count, U8CPU alpha) {
     23     SkASSERT(alpha <= 255);
     24     if (count <= 0) {
     25         return;
     26     }
     27 
     28     uint32_t src_scale = SkAlpha255To256(alpha);
     29 
     30     if (count >= 4) {
     31         SkASSERT(((size_t)dst & 0x03) == 0);
     32         while (((size_t)dst & 0x0F) != 0) {
     33             *dst = SkPMLerp(*src, *dst, src_scale);
     34             src++;
     35             dst++;
     36             count--;
     37         }
     38 
     39         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     40         __m128i *d = reinterpret_cast<__m128i*>(dst);
     41 
     42         while (count >= 4) {
     43             // Load 4 pixels each of src and dest.
     44             __m128i src_pixel = _mm_loadu_si128(s);
     45             __m128i dst_pixel = _mm_load_si128(d);
     46 
     47             __m128i result = SkPMLerp_SSE2(src_pixel, dst_pixel, src_scale);
     48             _mm_store_si128(d, result);
     49             s++;
     50             d++;
     51             count -= 4;
     52         }
     53         src = reinterpret_cast<const SkPMColor*>(s);
     54         dst = reinterpret_cast<SkPMColor*>(d);
     55     }
     56 
     57     while (count > 0) {
     58         *dst = SkPMLerp(*src, *dst, src_scale);
     59         src++;
     60         dst++;
     61         count--;
     62     }
     63 }
     64 
     65 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
     66                                const SkPMColor* SK_RESTRICT src,
     67                                int count, U8CPU alpha) {
     68     SkASSERT(alpha <= 255);
     69     if (count <= 0) {
     70         return;
     71     }
     72 
     73     if (count >= 4) {
     74         while (((size_t)dst & 0x0F) != 0) {
     75             *dst = SkBlendARGB32(*src, *dst, alpha);
     76             src++;
     77             dst++;
     78             count--;
     79         }
     80 
     81         const __m128i *s = reinterpret_cast<const __m128i*>(src);
     82         __m128i *d = reinterpret_cast<__m128i*>(dst);
     83         while (count >= 4) {
     84             // Load 4 pixels each of src and dest.
     85             __m128i src_pixel = _mm_loadu_si128(s);
     86             __m128i dst_pixel = _mm_load_si128(d);
     87 
     88             __m128i result = SkBlendARGB32_SSE2(src_pixel, dst_pixel, alpha);
     89             _mm_store_si128(d, result);
     90             s++;
     91             d++;
     92             count -= 4;
     93         }
     94         src = reinterpret_cast<const SkPMColor*>(s);
     95         dst = reinterpret_cast<SkPMColor*>(d);
     96     }
     97 
     98     while (count > 0) {
     99         *dst = SkBlendARGB32(*src, *dst, alpha);
    100         src++;
    101         dst++;
    102         count--;
    103     }
    104 }
    105 
    106 // The following (left) shifts cause the top 5 bits of the mask components to
    107 // line up with the corresponding components in an SkPMColor.
    108 // Note that the mask's RGB16 order may differ from the SkPMColor order.
    109 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
    110 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
    111 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
    112 
    113 #if SK_R16x5_R32x5_SHIFT == 0
    114     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
    115 #elif SK_R16x5_R32x5_SHIFT > 0
    116     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
    117 #else
    118     #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
    119 #endif
    120 
    121 #if SK_G16x5_G32x5_SHIFT == 0
    122     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
    123 #elif SK_G16x5_G32x5_SHIFT > 0
    124     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
    125 #else
    126     #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
    127 #endif
    128 
    129 #if SK_B16x5_B32x5_SHIFT == 0
    130     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
    131 #elif SK_B16x5_B32x5_SHIFT > 0
    132     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
    133 #else
    134     #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
    135 #endif
    136 
    137 static __m128i SkBlendLCD16_SSE2(__m128i &src, __m128i &dst,
    138                                  __m128i &mask, __m128i &srcA) {
    139     // In the following comments, the components of src, dst and mask are
    140     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    141     // by an R, G, B, or A suffix. Components of one of the four pixels that
    142     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    143     // example is the blue channel of the second destination pixel. Memory
    144     // layout is shown for an ARGB byte order in a color value.
    145 
    146     // src and srcA store 8-bit values interleaved with zeros.
    147     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    148     // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    149     //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    150     // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    151     // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    152     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    153     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    154 
    155     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    156     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    157     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    158                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    159 
    160     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    161     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    162                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    163 
    164     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    165     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    166                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    167 
    168     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    169     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    170     // 8-bit position
    171     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    172     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    173     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    174 
    175     // Interleave R,G,B into the lower byte of word.
    176     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    177     // 16-bit values, padded by zero.
    178     __m128i maskLo, maskHi;
    179     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    180     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    181     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    182     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    183 
    184     // Upscale from 0..31 to 0..32
    185     // (allows to replace division by left-shift further down)
    186     // Left-shift each component by 4 and add the result back to that component,
    187     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    188     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    189     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    190 
    191     // Multiply each component of maskLo and maskHi by srcA
    192     maskLo = _mm_mullo_epi16(maskLo, srcA);
    193     maskHi = _mm_mullo_epi16(maskHi, srcA);
    194 
    195     // Left shift mask components by 8 (divide by 256)
    196     maskLo = _mm_srli_epi16(maskLo, 8);
    197     maskHi = _mm_srli_epi16(maskHi, 8);
    198 
    199     // Interleave R,G,B into the lower byte of the word
    200     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    201     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    202     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    203     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    204 
    205     // mask = (src - dst) * mask
    206     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    207     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    208 
    209     // mask = (src - dst) * mask >> 5
    210     maskLo = _mm_srai_epi16(maskLo, 5);
    211     maskHi = _mm_srai_epi16(maskHi, 5);
    212 
    213     // Add two pixels into result.
    214     // result = dst + ((src - dst) * mask >> 5)
    215     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    216     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    217 
    218     // Pack into 4 32bit dst pixels.
    219     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    220     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    221     // clamping to 255 if necessary.
    222     return _mm_packus_epi16(resultLo, resultHi);
    223 }
    224 
    225 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &src, __m128i &dst,
    226                                        __m128i &mask) {
    227     // In the following comments, the components of src, dst and mask are
    228     // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    229     // by an R, G, B, or A suffix. Components of one of the four pixels that
    230     // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    231     // example is the blue channel of the second destination pixel. Memory
    232     // layout is shown for an ARGB byte order in a color value.
    233 
    234     // src and srcA store 8-bit values interleaved with zeros.
    235     // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    236     // mask stores 16-bit values (shown as high and low bytes) interleaved with
    237     // zeros
    238     // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    239     //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    240 
    241     // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    242     // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    243     __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
    244                               _mm_set1_epi32(0x1F << SK_R32_SHIFT));
    245 
    246     // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    247     __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
    248                               _mm_set1_epi32(0x1F << SK_G32_SHIFT));
    249 
    250     // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    251     __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
    252                               _mm_set1_epi32(0x1F << SK_B32_SHIFT));
    253 
    254     // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    255     // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    256     // 8-bit position
    257     // mask = (0, m0R, m0G, m0B, 0, m1R, m1G, m1B,
    258     //         0, m2R, m2G, m2B, 0, m3R, m3G, m3B)
    259     mask = _mm_or_si128(_mm_or_si128(r, g), b);
    260 
    261     // Interleave R,G,B into the lower byte of word.
    262     // i.e. split the sixteen 8-bit values from mask into two sets of eight
    263     // 16-bit values, padded by zero.
    264     __m128i maskLo, maskHi;
    265     // maskLo = (0, 0, m0R, 0, m0G, 0, m0B, 0, 0, 0, m1R, 0, m1G, 0, m1B, 0)
    266     maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    267     // maskHi = (0, 0, m2R, 0, m2G, 0, m2B, 0, 0, 0, m3R, 0, m3G, 0, m3B, 0)
    268     maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
    269 
    270     // Upscale from 0..31 to 0..32
    271     // (allows to replace division by left-shift further down)
    272     // Left-shift each component by 4 and add the result back to that component,
    273     // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    274     maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    275     maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
    276 
    277     // Interleave R,G,B into the lower byte of the word
    278     // dstLo = (0, 0, d0R, 0, d0G, 0, d0B, 0, 0, 0, d1R, 0, d1G, 0, d1B, 0)
    279     __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    280     // dstLo = (0, 0, d2R, 0, d2G, 0, d2B, 0, 0, 0, d3R, 0, d3G, 0, d3B, 0)
    281     __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
    282 
    283     // mask = (src - dst) * mask
    284     maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    285     maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
    286 
    287     // mask = (src - dst) * mask >> 5
    288     maskLo = _mm_srai_epi16(maskLo, 5);
    289     maskHi = _mm_srai_epi16(maskHi, 5);
    290 
    291     // Add two pixels into result.
    292     // result = dst + ((src - dst) * mask >> 5)
    293     __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    294     __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
    295 
    296     // Pack into 4 32bit dst pixels and force opaque.
    297     // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    298     // Merge into one SSE regsiter with sixteen 8-bit values (four pixels),
    299     // clamping to 255 if necessary. Set alpha components to 0xFF.
    300     return _mm_or_si128(_mm_packus_epi16(resultLo, resultHi),
    301                         _mm_set1_epi32(SK_A32_MASK << SK_A32_SHIFT));
    302 }
    303 
    304 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t mask[],
    305                          SkColor src, int width, SkPMColor) {
    306     if (width <= 0) {
    307         return;
    308     }
    309 
    310     int srcA = SkColorGetA(src);
    311     int srcR = SkColorGetR(src);
    312     int srcG = SkColorGetG(src);
    313     int srcB = SkColorGetB(src);
    314 
    315     srcA = SkAlpha255To256(srcA);
    316 
    317     if (width >= 4) {
    318         SkASSERT(((size_t)dst & 0x03) == 0);
    319         while (((size_t)dst & 0x0F) != 0) {
    320             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    321             mask++;
    322             dst++;
    323             width--;
    324         }
    325 
    326         __m128i *d = reinterpret_cast<__m128i*>(dst);
    327         // Set alpha to 0xFF and replicate source four times in SSE register.
    328         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    329         // Interleave with zeros to get two sets of four 16-bit values.
    330         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    331         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    332         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    333         __m128i srcA_sse = _mm_set1_epi16(srcA);
    334         while (width >= 4) {
    335             // Load four destination pixels into dst_sse.
    336             __m128i dst_sse = _mm_load_si128(d);
    337             // Load four 16-bit masks into lower half of mask_sse.
    338             __m128i mask_sse = _mm_loadl_epi64(
    339                                    reinterpret_cast<const __m128i*>(mask));
    340 
    341             // Check whether masks are equal to 0 and get the highest bit
    342             // of each byte of result, if masks are all zero, we will get
    343             // pack_cmp to 0xFFFF
    344             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    345                                              _mm_setzero_si128()));
    346 
    347             // if mask pixels are not all zero, we will blend the dst pixels
    348             if (pack_cmp != 0xFFFF) {
    349                 // Unpack 4 16bit mask pixels to
    350                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    351                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    352                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    353                                               _mm_setzero_si128());
    354 
    355                 // Process 4 32bit dst pixels
    356                 __m128i result = SkBlendLCD16_SSE2(src_sse, dst_sse,
    357                                                    mask_sse, srcA_sse);
    358                 _mm_store_si128(d, result);
    359             }
    360 
    361             d++;
    362             mask += 4;
    363             width -= 4;
    364         }
    365 
    366         dst = reinterpret_cast<SkPMColor*>(d);
    367     }
    368 
    369     while (width > 0) {
    370         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *mask);
    371         mask++;
    372         dst++;
    373         width--;
    374     }
    375 }
    376 
    377 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t mask[],
    378                                SkColor src, int width, SkPMColor opaqueDst) {
    379     if (width <= 0) {
    380         return;
    381     }
    382 
    383     int srcR = SkColorGetR(src);
    384     int srcG = SkColorGetG(src);
    385     int srcB = SkColorGetB(src);
    386 
    387     if (width >= 4) {
    388         SkASSERT(((size_t)dst & 0x03) == 0);
    389         while (((size_t)dst & 0x0F) != 0) {
    390             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    391             mask++;
    392             dst++;
    393             width--;
    394         }
    395 
    396         __m128i *d = reinterpret_cast<__m128i*>(dst);
    397         // Set alpha to 0xFF and replicate source four times in SSE register.
    398         __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
    399         // Set srcA_sse to contain eight copies of srcA, padded with zero.
    400         // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    401         src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
    402         while (width >= 4) {
    403             // Load four destination pixels into dst_sse.
    404             __m128i dst_sse = _mm_load_si128(d);
    405             // Load four 16-bit masks into lower half of mask_sse.
    406             __m128i mask_sse = _mm_loadl_epi64(
    407                                    reinterpret_cast<const __m128i*>(mask));
    408 
    409             // Check whether masks are equal to 0 and get the highest bit
    410             // of each byte of result, if masks are all zero, we will get
    411             // pack_cmp to 0xFFFF
    412             int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
    413                                              _mm_setzero_si128()));
    414 
    415             // if mask pixels are not all zero, we will blend the dst pixels
    416             if (pack_cmp != 0xFFFF) {
    417                 // Unpack 4 16bit mask pixels to
    418                 // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    419                 //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
    420                 mask_sse = _mm_unpacklo_epi16(mask_sse,
    421                                               _mm_setzero_si128());
    422 
    423                 // Process 4 32bit dst pixels
    424                 __m128i result = SkBlendLCD16Opaque_SSE2(src_sse, dst_sse,
    425                                                          mask_sse);
    426                 _mm_store_si128(d, result);
    427             }
    428 
    429             d++;
    430             mask += 4;
    431             width -= 4;
    432         }
    433 
    434         dst = reinterpret_cast<SkPMColor*>(d);
    435     }
    436 
    437     while (width > 0) {
    438         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
    439         mask++;
    440         dst++;
    441         width--;
    442     }
    443 }
    444