// Home | History | Annotate | Download | only in dsp  (code-browser navigation residue, not part of the source)
      1 // Copyright 2017 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // NEON variant of alpha filters
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include "src/dsp/dsp.h"
     15 
     16 #if defined(WEBP_USE_NEON)
     17 
     18 #include <assert.h>
     19 #include "src/dsp/neon.h"
     20 
     21 //------------------------------------------------------------------------------
     22 // Helpful macros.
     23 
     24 # define SANITY_CHECK(in, out)                                                 \
     25   assert(in != NULL);                                                          \
     26   assert(out != NULL);                                                         \
     27   assert(width > 0);                                                           \
     28   assert(height > 0);                                                          \
     29   assert(stride >= width);                                                     \
     30   assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
     31   (void)height;  // Silence unused warning.
     32 
     33 // load eight u8 and widen to s16
     34 #define U8_TO_S16(A) vreinterpretq_s16_u16(vmovl_u8(A))
     35 #define LOAD_U8_TO_S16(A) U8_TO_S16(vld1_u8(A))
     36 
     37 // shift left or right by N byte, inserting zeros
     38 #define SHIFT_RIGHT_N_Q(A, N) vextq_u8((A), zero, (N))
     39 #define SHIFT_LEFT_N_Q(A, N) vextq_u8(zero, (A), (16 - (N)) % 16)
     40 
     41 // rotate left by N bytes
     42 #define ROTATE_LEFT_N(A, N)   vext_u8((A), (A), (N))
     43 // rotate right by N bytes
     44 #define ROTATE_RIGHT_N(A, N)   vext_u8((A), (A), (8 - (N)) % 8)
     45 
     46 static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
     47                              uint8_t* dst, int length) {
     48   int i;
     49   assert(length >= 0);
     50   for (i = 0; i + 16 <= length; i += 16) {
     51     const uint8x16_t A = vld1q_u8(&src[i]);
     52     const uint8x16_t B = vld1q_u8(&pred[i]);
     53     const uint8x16_t C = vsubq_u8(A, B);
     54     vst1q_u8(&dst[i], C);
     55   }
     56   for (; i < length; ++i) dst[i] = src[i] - pred[i];
     57 }
     58 
     59 // Special case for left-based prediction (when preds==dst-1 or preds==src-1).
     60 static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) {
     61   PredictLine_NEON(src, src - 1, dst, length);
     62 }
     63 
     64 //------------------------------------------------------------------------------
     65 // Horizontal filter.
     66 
     67 static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
     68                                                 int width, int height,
     69                                                 int stride,
     70                                                 int row, int num_rows,
     71                                                 uint8_t* out) {
     72   const size_t start_offset = row * stride;
     73   const int last_row = row + num_rows;
     74   SANITY_CHECK(in, out);
     75   in += start_offset;
     76   out += start_offset;
     77 
     78   if (row == 0) {
     79     // Leftmost pixel is the same as input for topmost scanline.
     80     out[0] = in[0];
     81     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
     82     row = 1;
     83     in += stride;
     84     out += stride;
     85   }
     86 
     87   // Filter line-by-line.
     88   while (row < last_row) {
     89     // Leftmost pixel is predicted from above.
     90     out[0] = in[0] - in[-stride];
     91     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
     92     ++row;
     93     in += stride;
     94     out += stride;
     95   }
     96 }
     97 
     98 static void HorizontalFilter_NEON(const uint8_t* data, int width, int height,
     99                                   int stride, uint8_t* filtered_data) {
    100   DoHorizontalFilter_NEON(data, width, height, stride, 0, height,
    101                           filtered_data);
    102 }
    103 
    104 //------------------------------------------------------------------------------
    105 // Vertical filter.
    106 
    107 static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
    108                                               int width, int height, int stride,
    109                                               int row, int num_rows,
    110                                               uint8_t* out) {
    111   const size_t start_offset = row * stride;
    112   const int last_row = row + num_rows;
    113   SANITY_CHECK(in, out);
    114   in += start_offset;
    115   out += start_offset;
    116 
    117   if (row == 0) {
    118     // Very first top-left pixel is copied.
    119     out[0] = in[0];
    120     // Rest of top scan-line is left-predicted.
    121     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
    122     row = 1;
    123     in += stride;
    124     out += stride;
    125   }
    126 
    127   // Filter line-by-line.
    128   while (row < last_row) {
    129     PredictLine_NEON(in, in - stride, out, width);
    130     ++row;
    131     in += stride;
    132     out += stride;
    133   }
    134 }
    135 
    136 static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
    137                                 int stride, uint8_t* filtered_data) {
    138   DoVerticalFilter_NEON(data, width, height, stride, 0, height,
    139                         filtered_data);
    140 }
    141 
    142 //------------------------------------------------------------------------------
    143 // Gradient filter.
    144 
    145 static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
    146   const int g = a + b - c;
    147   return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
    148 }
    149 
    150 static void GradientPredictDirect_NEON(const uint8_t* const row,
    151                                        const uint8_t* const top,
    152                                        uint8_t* const out, int length) {
    153   int i;
    154   for (i = 0; i + 8 <= length; i += 8) {
    155     const uint8x8_t A = vld1_u8(&row[i - 1]);
    156     const uint8x8_t B = vld1_u8(&top[i + 0]);
    157     const int16x8_t C = vreinterpretq_s16_u16(vaddl_u8(A, B));
    158     const int16x8_t D = LOAD_U8_TO_S16(&top[i - 1]);
    159     const uint8x8_t E = vqmovun_s16(vsubq_s16(C, D));
    160     const uint8x8_t F = vld1_u8(&row[i + 0]);
    161     vst1_u8(&out[i], vsub_u8(F, E));
    162   }
    163   for (; i < length; ++i) {
    164     out[i] = row[i] - GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
    165   }
    166 }
    167 
    168 static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
    169                                               int width, int height,
    170                                               int stride,
    171                                               int row, int num_rows,
    172                                               uint8_t* out) {
    173   const size_t start_offset = row * stride;
    174   const int last_row = row + num_rows;
    175   SANITY_CHECK(in, out);
    176   in += start_offset;
    177   out += start_offset;
    178 
    179   // left prediction for top scan-line
    180   if (row == 0) {
    181     out[0] = in[0];
    182     PredictLineLeft_NEON(in + 1, out + 1, width - 1);
    183     row = 1;
    184     in += stride;
    185     out += stride;
    186   }
    187 
    188   // Filter line-by-line.
    189   while (row < last_row) {
    190     out[0] = in[0] - in[-stride];
    191     GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1);
    192     ++row;
    193     in += stride;
    194     out += stride;
    195   }
    196 }
    197 
    198 static void GradientFilter_NEON(const uint8_t* data, int width, int height,
    199                                 int stride, uint8_t* filtered_data) {
    200   DoGradientFilter_NEON(data, width, height, stride, 0, height,
    201                         filtered_data);
    202 }
    203 
    204 #undef SANITY_CHECK
    205 
    206 //------------------------------------------------------------------------------
    207 // Inverse transforms
    208 
    209 static void HorizontalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
    210                                     uint8_t* out, int width) {
    211   int i;
    212   const uint8x16_t zero = vdupq_n_u8(0);
    213   uint8x16_t last;
    214   out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
    215   if (width <= 1) return;
    216   last = vsetq_lane_u8(out[0], zero, 0);
    217   for (i = 1; i + 16 <= width; i += 16) {
    218     const uint8x16_t A0 = vld1q_u8(&in[i]);
    219     const uint8x16_t A1 = vaddq_u8(A0, last);
    220     const uint8x16_t A2 = SHIFT_LEFT_N_Q(A1, 1);
    221     const uint8x16_t A3 = vaddq_u8(A1, A2);
    222     const uint8x16_t A4 = SHIFT_LEFT_N_Q(A3, 2);
    223     const uint8x16_t A5 = vaddq_u8(A3, A4);
    224     const uint8x16_t A6 = SHIFT_LEFT_N_Q(A5, 4);
    225     const uint8x16_t A7 = vaddq_u8(A5, A6);
    226     const uint8x16_t A8 = SHIFT_LEFT_N_Q(A7, 8);
    227     const uint8x16_t A9 = vaddq_u8(A7, A8);
    228     vst1q_u8(&out[i], A9);
    229     last = SHIFT_RIGHT_N_Q(A9, 15);
    230   }
    231   for (; i < width; ++i) out[i] = in[i] + out[i - 1];
    232 }
    233 
    234 static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
    235                                   uint8_t* out, int width) {
    236   if (prev == NULL) {
    237     HorizontalUnfilter_NEON(NULL, in, out, width);
    238   } else {
    239     int i;
    240     assert(width >= 0);
    241     for (i = 0; i + 16 <= width; i += 16) {
    242       const uint8x16_t A = vld1q_u8(&in[i]);
    243       const uint8x16_t B = vld1q_u8(&prev[i]);
    244       const uint8x16_t C = vaddq_u8(A, B);
    245       vst1q_u8(&out[i], C);
    246     }
    247     for (; i < width; ++i) out[i] = in[i] + prev[i];
    248   }
    249 }
    250 
    251 // GradientUnfilter_NEON is correct but slower than the C-version,
    252 // at least on ARM64. For armv7, it's a wash.
    253 // So best is to disable it for now, but keep the idea around...
    254 #if !defined(USE_GRADIENT_UNFILTER)
    255 #define USE_GRADIENT_UNFILTER 0   // ALTERNATE_CODE
    256 #endif
    257 
    258 #if (USE_GRADIENT_UNFILTER == 1)
    259 #define GRAD_PROCESS_LANE(L)  do {                                             \
    260   const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1);  /* rotate predictor in */   \
    261   const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1));                       \
    262   const uint8x8_t delta = vqmovun_s16(tmp2);                                   \
    263   pred = vadd_u8(D, delta);                                                    \
    264   out = vext_u8(out, ROTATE_LEFT_N(pred, (L)), 1);                             \
    265 } while (0)
    266 
    267 static void GradientPredictInverse_NEON(const uint8_t* const in,
    268                                         const uint8_t* const top,
    269                                         uint8_t* const row, int length) {
    270   if (length > 0) {
    271     int i;
    272     uint8x8_t pred = vdup_n_u8(row[-1]);   // left sample
    273     uint8x8_t out = vdup_n_u8(0);
    274     for (i = 0; i + 8 <= length; i += 8) {
    275       const int16x8_t B = LOAD_U8_TO_S16(&top[i + 0]);
    276       const int16x8_t C = LOAD_U8_TO_S16(&top[i - 1]);
    277       const int16x8_t BC = vsubq_s16(B, C);  // unclipped gradient basis B - C
    278       const uint8x8_t D = vld1_u8(&in[i]);   // base input
    279       GRAD_PROCESS_LANE(0);
    280       GRAD_PROCESS_LANE(1);
    281       GRAD_PROCESS_LANE(2);
    282       GRAD_PROCESS_LANE(3);
    283       GRAD_PROCESS_LANE(4);
    284       GRAD_PROCESS_LANE(5);
    285       GRAD_PROCESS_LANE(6);
    286       GRAD_PROCESS_LANE(7);
    287       vst1_u8(&row[i], out);
    288     }
    289     for (; i < length; ++i) {
    290       row[i] = in[i] + GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
    291     }
    292   }
    293 }
    294 #undef GRAD_PROCESS_LANE
    295 
    296 static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
    297                                   uint8_t* out, int width) {
    298   if (prev == NULL) {
    299     HorizontalUnfilter_NEON(NULL, in, out, width);
    300   } else {
    301     out[0] = in[0] + prev[0];  // predict from above
    302     GradientPredictInverse_NEON(in + 1, prev + 1, out + 1, width - 1);
    303   }
    304 }
    305 
    306 #endif   // USE_GRADIENT_UNFILTER
    307 
    308 //------------------------------------------------------------------------------
    309 // Entry point
    310 
    311 extern void VP8FiltersInitNEON(void);
    312 
    313 WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
    314   WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
    315   WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
    316 #if (USE_GRADIENT_UNFILTER == 1)
    317   WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
    318 #endif
    319 
    320   WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_NEON;
    321   WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_NEON;
    322   WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_NEON;
    323 }
    324 
    325 #else  // !WEBP_USE_NEON
    326 
    327 WEBP_DSP_INIT_STUB(VP8FiltersInitNEON)
    328 
    329 #endif  // WEBP_USE_NEON
    330