Home | History | Annotate | Download | only in intel
      1 
      2 /* filter_sse2_intrinsics.c - SSE2 optimized filter functions
      3  *
      4  * Copyright (c) 2016 Google, Inc.
      5  * Written by Mike Klein and Matt Sarett
      6  * Derived from arm/filter_neon_intrinsics.c, which was
      7  * Copyright (c) 2014,2016 Glenn Randers-Pehrson
      8  *
      9  * Last changed in libpng 1.6.22 [(PENDING RELEASE)]
     10  *
     11  * This code is released under the libpng license.
     12  * For conditions of distribution and use, see the disclaimer
     13  * and license in png.h
     14  */
     15 
     16 #include "../../pngpriv.h"
     17 
     18 #ifdef PNG_READ_SUPPORTED
     19 
     20 #if PNG_INTEL_SSE_IMPLEMENTATION > 0
     21 
     22 #include <immintrin.h>
     23 
     24 /* Functions in this file look at most 3 pixels (a,b,c) to predict the 4th (d).
     25  * They're positioned like this:
     26  *    prev:  c b
     27  *    row:   a d
     28  * The Sub filter predicts d=a, Avg d=(a+b)/2, and Paeth predicts d to be
     29  * whichever of a, b, or c is closest to p=a+b-c.
     30  */
     31 
     32 static __m128i load4(const void* p) {
     33    return _mm_cvtsi32_si128(*(const int*)p);
     34 }
     35 
     36 static void store4(void* p, __m128i v) {
     37    *(int*)p = _mm_cvtsi128_si32(v);
     38 }
     39 
     40 static __m128i load3(const void* p) {
     41    /* We'll load 2 bytes, then 1 byte,
     42     * then mask them together, and finally load into SSE.
     43     */
     44    const png_uint_16* p01 = p;
     45    const png_byte*    p2  = (const png_byte*)(p01+1);
     46 
     47    png_uint_32 v012 = (png_uint_32)(*p01)
     48                     | (png_uint_32)(*p2) << 16;
     49    return load4(&v012);
     50 }
     51 
     52 static void store3(void* p, __m128i v) {
     53    /* We'll pull from SSE as a 32-bit int, then write
     54     * its bottom two bytes, then its third byte.
     55     */
     56    png_uint_32 v012;
     57    store4(&v012, v);
     58 
     59    png_uint_16* p01 = p;
     60    png_byte*    p2  = (png_byte*)(p01+1);
     61    *p01 = v012;
     62    *p2  = v012 >> 16;
     63 }
     64 
     65 void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
     66    png_const_bytep prev)
     67 {
     68    /* The Sub filter predicts each pixel as the previous pixel, a.
     69     * There is no pixel to the left of the first pixel.  It's encoded directly.
     70     * That works with our main loop if we just say that left pixel was zero.
     71     */
     72    png_debug(1, "in png_read_filter_row_sub3_sse2");
     73    __m128i a, d = _mm_setzero_si128();
     74 
     75    int rb = row_info->rowbytes;
     76    while (rb >= 4) {
     77       a = d; d = load4(row);
     78       d = _mm_add_epi8(d, a);
     79       store3(row, d);
     80 
     81       row += 3;
     82       rb  -= 3;
     83    }
     84    if (rb > 0) {
     85       a = d; d = load3(row);
     86       d = _mm_add_epi8(d, a);
     87       store3(row, d);
     88 
     89       row += 3;
     90       rb  -= 3;
     91    }
     92 }
     93 
     94 void png_read_filter_row_sub4_sse2(png_row_infop row_info, png_bytep row,
     95    png_const_bytep prev)
     96 {
     97    /* The Sub filter predicts each pixel as the previous pixel, a.
     98     * There is no pixel to the left of the first pixel.  It's encoded directly.
     99     * That works with our main loop if we just say that left pixel was zero.
    100     */
    101    png_debug(1, "in png_read_filter_row_sub4_sse2");
    102    __m128i a, d = _mm_setzero_si128();
    103 
    104    int rb = row_info->rowbytes;
    105    while (rb > 0) {
    106       a = d; d = load4(row);
    107       d = _mm_add_epi8(d, a);
    108       store4(row, d);
    109 
    110       row += 4;
    111       rb  -= 4;
    112    }
    113 }
    114 
    115 void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
    116    png_const_bytep prev)
    117 {
    118    /* The Avg filter predicts each pixel as the (truncated) average of a and b.
    119     * There's no pixel to the left of the first pixel.  Luckily, it's
    120     * predicted to be half of the pixel above it.  So again, this works
    121     * perfectly with our loop if we make sure a starts at zero.
    122     */
    123    png_debug(1, "in png_read_filter_row_avg3_sse2");
    124    const __m128i zero = _mm_setzero_si128();
    125    __m128i    b;
    126    __m128i a, d = zero;
    127 
    128    int rb = row_info->rowbytes;
    129    while (rb >= 4) {
    130              b = load4(prev);
    131       a = d; d = load4(row );
    132 
    133       /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
    134       __m128i avg = _mm_avg_epu8(a,b);
    135       /* ...but we can fix it up by subtracting off 1 if it rounded up. */
    136       avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
    137                                             _mm_set1_epi8(1)));
    138       d = _mm_add_epi8(d, avg);
    139       store3(row, d);
    140 
    141       prev += 3;
    142       row  += 3;
    143       rb   -= 3;
    144    }
    145    if (rb > 0) {
    146              b = load3(prev);
    147       a = d; d = load3(row );
    148 
    149       /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
    150       __m128i avg = _mm_avg_epu8(a,b);
    151       /* ...but we can fix it up by subtracting off 1 if it rounded up. */
    152       avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
    153                                             _mm_set1_epi8(1)));
    154 
    155       d = _mm_add_epi8(d, avg);
    156       store3(row, d);
    157 
    158       prev += 3;
    159       row  += 3;
    160       rb   -= 3;
    161    }
    162 }
    163 
    164 void png_read_filter_row_avg4_sse2(png_row_infop row_info, png_bytep row,
    165    png_const_bytep prev)
    166 {
    167    /* The Avg filter predicts each pixel as the (truncated) average of a and b.
    168     * There's no pixel to the left of the first pixel.  Luckily, it's
    169     * predicted to be half of the pixel above it.  So again, this works
    170     * perfectly with our loop if we make sure a starts at zero.
    171     */
    172    png_debug(1, "in png_read_filter_row_avg4_sse2");
    173    const __m128i zero = _mm_setzero_si128();
    174    __m128i    b;
    175    __m128i a, d = zero;
    176 
    177    int rb = row_info->rowbytes;
    178    while (rb > 0) {
    179              b = load4(prev);
    180       a = d; d = load4(row );
    181 
    182       /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
    183       __m128i avg = _mm_avg_epu8(a,b);
    184       /* ...but we can fix it up by subtracting off 1 if it rounded up. */
    185       avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
    186                                             _mm_set1_epi8(1)));
    187 
    188       d = _mm_add_epi8(d, avg);
    189       store4(row, d);
    190 
    191       prev += 4;
    192       row  += 4;
    193       rb   -= 4;
    194    }
    195 }
    196 
    197 /* Returns |x| for 16-bit lanes. */
    198 static __m128i abs_i16(__m128i x) {
    199 #if PNG_INTEL_SSE_IMPLEMENTATION >= 2
    200    return _mm_abs_epi16(x);
    201 #else
    202    /* Read this all as, return x<0 ? -x : x.
    203    * To negate two's complement, you flip all the bits then add 1.
    204     */
    205    __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
    206 
    207    /* Flip negative lanes. */
    208    x = _mm_xor_si128(x, is_negative);
    209 
    210    /* +1 to negative lanes, else +0. */
    211    x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15));
    212    return x;
    213 #endif
    214 }
    215 
    216 /* Bytewise c ? t : e. */
    217 static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
    218 #if PNG_INTEL_SSE_IMPLEMENTATION >= 3
    219    return _mm_blendv_epi8(e,t,c);
    220 #else
    221    return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
    222 #endif
    223 }
    224 
    225 void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
    226    png_const_bytep prev)
    227 {
    228    /* Paeth tries to predict pixel d using the pixel to the left of it, a,
    229     * and two pixels from the previous row, b and c:
    230     *   prev: c b
    231     *   row:  a d
    232     * The Paeth function predicts d to be whichever of a, b, or c is nearest to
    233     * p=a+b-c.
    234     *
    235     * The first pixel has no left context, and so uses an Up filter, p = b.
    236     * This works naturally with our main loop's p = a+b-c if we force a and c
    237     * to zero.
    238     * Here we zero b and d, which become c and a respectively at the start of
    239     * the loop.
    240     */
    241    png_debug(1, "in png_read_filter_row_paeth3_sse2");
    242    const __m128i zero = _mm_setzero_si128();
    243    __m128i c, b = zero,
    244            a, d = zero;
    245 
    246    int rb = row_info->rowbytes;
    247    while (rb >= 4) {
    248       /* It's easiest to do this math (particularly, deal with pc) with 16-bit
    249        * intermediates.
    250        */
    251       c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
    252       a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
    253 
    254       /* (p-a) == (a+b-c - a) == (b-c) */
    255       __m128i pa = _mm_sub_epi16(b,c);
    256 
    257       /* (p-b) == (a+b-c - b) == (a-c) */
    258       __m128i pb = _mm_sub_epi16(a,c);
    259 
    260       /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
    261       __m128i pc = _mm_add_epi16(pa,pb);
    262 
    263       pa = abs_i16(pa);  /* |p-a| */
    264       pb = abs_i16(pb);  /* |p-b| */
    265       pc = abs_i16(pc);  /* |p-c| */
    266 
    267       __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
    268 
    269       /* Paeth breaks ties favoring a over b over c. */
    270       __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
    271                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
    272                                                                      c));
    273 
    274       /* Note `_epi8`: we need addition to wrap modulo 255. */
    275       d = _mm_add_epi8(d, nearest);
    276       store3(row, _mm_packus_epi16(d,d));
    277 
    278       prev += 3;
    279       row  += 3;
    280       rb   -= 3;
    281    }
    282    if (rb > 0) {
    283       /* It's easiest to do this math (particularly, deal with pc) with 16-bit
    284        * intermediates.
    285        */
    286       c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
    287       a = d; d = _mm_unpacklo_epi8(load3(row ), zero);
    288 
    289       /* (p-a) == (a+b-c - a) == (b-c) */
    290       __m128i pa = _mm_sub_epi16(b,c);
    291 
    292       /* (p-b) == (a+b-c - b) == (a-c) */
    293       __m128i pb = _mm_sub_epi16(a,c);
    294 
    295       /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
    296       __m128i pc = _mm_add_epi16(pa,pb);
    297 
    298       pa = abs_i16(pa);  /* |p-a| */
    299       pb = abs_i16(pb);  /* |p-b| */
    300       pc = abs_i16(pc);  /* |p-c| */
    301 
    302       __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
    303 
    304       /* Paeth breaks ties favoring a over b over c. */
    305       __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
    306                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
    307                                                                      c));
    308 
    309       /* Note `_epi8`: we need addition to wrap modulo 255. */
    310       d = _mm_add_epi8(d, nearest);
    311       store3(row, _mm_packus_epi16(d,d));
    312 
    313       prev += 3;
    314       row  += 3;
    315       rb   -= 3;
    316    }
    317 }
    318 
    319 void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
    320    png_const_bytep prev)
    321 {
    322    /* Paeth tries to predict pixel d using the pixel to the left of it, a,
    323     * and two pixels from the previous row, b and c:
    324     *   prev: c b
    325     *   row:  a d
    326     * The Paeth function predicts d to be whichever of a, b, or c is nearest to
    327     * p=a+b-c.
    328     *
    329     * The first pixel has no left context, and so uses an Up filter, p = b.
    330     * This works naturally with our main loop's p = a+b-c if we force a and c
    331     * to zero.
    332     * Here we zero b and d, which become c and a respectively at the start of
    333     * the loop.
    334     */
    335    png_debug(1, "in png_read_filter_row_paeth4_sse2");
    336    const __m128i zero = _mm_setzero_si128();
    337    __m128i c, b = zero,
    338            a, d = zero;
    339 
    340    int rb = row_info->rowbytes;
    341    while (rb > 0) {
    342       /* It's easiest to do this math (particularly, deal with pc) with 16-bit
    343        * intermediates.
    344        */
    345       c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
    346       a = d; d = _mm_unpacklo_epi8(load4(row ), zero);
    347 
    348       /* (p-a) == (a+b-c - a) == (b-c) */
    349       __m128i pa = _mm_sub_epi16(b,c);
    350 
    351       /* (p-b) == (a+b-c - b) == (a-c) */
    352       __m128i pb = _mm_sub_epi16(a,c);
    353 
    354       /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
    355       __m128i pc = _mm_add_epi16(pa,pb);
    356 
    357       pa = abs_i16(pa);  /* |p-a| */
    358       pb = abs_i16(pb);  /* |p-b| */
    359       pc = abs_i16(pc);  /* |p-c| */
    360 
    361       __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));
    362 
    363       /* Paeth breaks ties favoring a over b over c. */
    364       __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
    365                          if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
    366                                                                      c));
    367 
    368       /* Note `_epi8`: we need addition to wrap modulo 255. */
    369       d = _mm_add_epi8(d, nearest);
    370       store4(row, _mm_packus_epi16(d,d));
    371 
    372       prev += 4;
    373       row  += 4;
    374       rb   -= 4;
    375    }
    376 }
    377 
    378 #endif /* PNG_INTEL_SSE_IMPLEMENTATION > 0 */
    379 #endif /* READ */
    380