Home | History | Annotate | Download | only in dsp
      1 // Copyright 2017 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // YUV->RGB conversion functions
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include "src/dsp/yuv.h"
     15 
     16 #if defined(WEBP_USE_NEON)
     17 
     18 #include <assert.h>
     19 #include <stdlib.h>
     20 
     21 #include "src/dsp/neon.h"
     22 
     23 //-----------------------------------------------------------------------------
     24 
     25 static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
     26                                     const uint8x8_t G,
     27                                     const uint8x8_t B) {
     28   const uint16x8_t r = vmovl_u8(R);
     29   const uint16x8_t g = vmovl_u8(G);
     30   const uint16x8_t b = vmovl_u8(B);
     31   const uint16x4_t r_lo = vget_low_u16(r);
     32   const uint16x4_t r_hi = vget_high_u16(r);
     33   const uint16x4_t g_lo = vget_low_u16(g);
     34   const uint16x4_t g_hi = vget_high_u16(g);
     35   const uint16x4_t b_lo = vget_low_u16(b);
     36   const uint16x4_t b_hi = vget_high_u16(b);
     37   const uint32x4_t tmp0_lo = vmull_n_u16(         r_lo, 16839u);
     38   const uint32x4_t tmp0_hi = vmull_n_u16(         r_hi, 16839u);
     39   const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);
     40   const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
     41   const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);
     42   const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
     43   const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
     44                                      vrshrn_n_u32(tmp2_hi, 16));
     45   const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));
     46   return vqmovn_u16(Y2);
     47 }
     48 
     49 static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
     50   int i;
     51   for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
     52     const uint8x8x3_t RGB = vld3_u8(rgb);
     53     const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
     54     vst1_u8(y + i, Y);
     55   }
     56   for (; i < width; ++i, rgb += 3) {   // left-over
     57     y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
     58   }
     59 }
     60 
     61 static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
     62   int i;
     63   for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
     64     const uint8x8x3_t BGR = vld3_u8(bgr);
     65     const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
     66     vst1_u8(y + i, Y);
     67   }
     68   for (; i < width; ++i, bgr += 3) {  // left-over
     69     y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
     70   }
     71 }
     72 
     73 static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
     74   int i;
     75   for (i = 0; i + 8 <= width; i += 8) {
     76     const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
     77     const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
     78     vst1_u8(y + i, Y);
     79   }
     80   for (; i < width; ++i) {   // left-over
     81     const uint32_t p = argb[i];
     82     y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
     83                      YUV_HALF);
     84   }
     85 }
     86 
     87 //-----------------------------------------------------------------------------
     88 
     89 // computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
     90 #define MULTIPLY_16b_PREAMBLE(r, g, b)                           \
     91   const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r));  \
     92   const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
     93   const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g));  \
     94   const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
     95   const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b));  \
     96   const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
     97 
     98 #define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do {              \
     99   const int32x4_t tmp0_lo = vmull_n_s16(         r_lo, C0);      \
    100   const int32x4_t tmp0_hi = vmull_n_s16(         r_hi, C0);      \
    101   const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1);      \
    102   const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1);      \
    103   const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2);      \
    104   const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2);      \
    105   const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16),  \
    106                                       vshrn_n_s32(tmp2_hi, 16)); \
    107   DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST));                   \
    108 } while (0)
    109 
    110 // This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
    111 #define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do {     \
    112   MULTIPLY_16b_PREAMBLE(r, g, b);                                \
    113   MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST);       \
    114   MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST);       \
    115 } while (0)
    116 
    117 static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
    118                                    uint8_t* u, uint8_t* v, int width) {
    119   int i;
    120   for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
    121     const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
    122     int16x8_t U, V;
    123     CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V);
    124     vst1_u8(u + i, vqrshrun_n_s16(U, 2));
    125     vst1_u8(v + i, vqrshrun_n_s16(V, 2));
    126   }
    127   for (; i < width; i += 1, rgb += 4) {
    128     const int r = rgb[0], g = rgb[1], b = rgb[2];
    129     u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
    130     v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
    131   }
    132 }
    133 
    134 static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
    135                                  int src_width, int do_store) {
    136   int i;
    137   for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
    138     const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
    139     const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
    140     const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
    141     const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
    142     int16x8_t U_tmp, V_tmp;
    143     CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
    144     {
    145       const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
    146       const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
    147       if (do_store) {
    148         vst1_u8(u, U);
    149         vst1_u8(v, V);
    150       } else {
    151         const uint8x8_t prev_u = vld1_u8(u);
    152         const uint8x8_t prev_v = vld1_u8(v);
    153         vst1_u8(u, vrhadd_u8(U, prev_u));
    154         vst1_u8(v, vrhadd_u8(V, prev_v));
    155       }
    156     }
    157   }
    158   if (i < src_width) {  // left-over
    159     WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
    160   }
    161 }
    162 
    163 
    164 //------------------------------------------------------------------------------
    165 
    166 extern void WebPInitConvertARGBToYUVNEON(void);
    167 
    168 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
    169   WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
    170   WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
    171   WebPConvertARGBToY = ConvertARGBToY_NEON;
    172   WebPConvertARGBToUV = ConvertARGBToUV_NEON;
    173   WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
    174 }
    175 
    176 //------------------------------------------------------------------------------
    177 
    178 #define MAX_Y ((1 << 10) - 1)    // 10b precision over 16b-arithmetic
    179 static uint16_t clip_y_NEON(int v) {
    180   return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
    181 }
    182 
    183 static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
    184                                      uint16_t* dst, int len) {
    185   int i;
    186   const int16x8_t zero = vdupq_n_s16(0);
    187   const int16x8_t max = vdupq_n_s16(MAX_Y);
    188   uint64x2_t sum = vdupq_n_u64(0);
    189   uint64_t diff;
    190 
    191   for (i = 0; i + 8 <= len; i += 8) {
    192     const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    193     const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    194     const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    195     const int16x8_t D = vsubq_s16(A, B);       // diff_y
    196     const int16x8_t F = vaddq_s16(C, D);       // new_y
    197     const uint16x8_t H =
    198         vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    199     const int16x8_t I = vabsq_s16(D);          // abs(diff_y)
    200     vst1q_u16(dst + i, H);
    201     sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
    202   }
    203   diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
    204   for (; i < len; ++i) {
    205     const int diff_y = ref[i] - src[i];
    206     const int new_y = (int)(dst[i]) + diff_y;
    207     dst[i] = clip_y_NEON(new_y);
    208     diff += (uint64_t)(abs(diff_y));
    209   }
    210   return diff;
    211 }
    212 
    213 static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
    214                                    int16_t* dst, int len) {
    215   int i;
    216   for (i = 0; i + 8 <= len; i += 8) {
    217     const int16x8_t A = vld1q_s16(ref + i);
    218     const int16x8_t B = vld1q_s16(src + i);
    219     const int16x8_t C = vld1q_s16(dst + i);
    220     const int16x8_t D = vsubq_s16(A, B);   // diff_uv
    221     const int16x8_t E = vaddq_s16(C, D);   // new_uv
    222     vst1q_s16(dst + i, E);
    223   }
    224   for (; i < len; ++i) {
    225     const int diff_uv = ref[i] - src[i];
    226     dst[i] += diff_uv;
    227   }
    228 }
    229 
    230 static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
    231                                    const uint16_t* best_y, uint16_t* out) {
    232   int i;
    233   const int16x8_t max = vdupq_n_s16(MAX_Y);
    234   const int16x8_t zero = vdupq_n_s16(0);
    235   for (i = 0; i + 8 <= len; i += 8) {
    236     const int16x8_t a0 = vld1q_s16(A + i + 0);
    237     const int16x8_t a1 = vld1q_s16(A + i + 1);
    238     const int16x8_t b0 = vld1q_s16(B + i + 0);
    239     const int16x8_t b1 = vld1q_s16(B + i + 1);
    240     const int16x8_t a0b1 = vaddq_s16(a0, b1);
    241     const int16x8_t a1b0 = vaddq_s16(a1, b0);
    242     const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0);  // A0+A1+B0+B1
    243     const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1);    // 2*(A0+B1)
    244     const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0);    // 2*(A1+B0)
    245     const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
    246     const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
    247     const int16x8_t d0 = vaddq_s16(c1, a0);
    248     const int16x8_t d1 = vaddq_s16(c0, a1);
    249     const int16x8_t e0 = vrshrq_n_s16(d0, 1);
    250     const int16x8_t e1 = vrshrq_n_s16(d1, 1);
    251     const int16x8x2_t f = vzipq_s16(e0, e1);
    252     const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
    253     const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
    254     const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
    255     const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
    256     const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
    257     const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
    258     vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
    259     vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
    260   }
    261   for (; i < len; ++i) {
    262     const int a0b1 = A[i + 0] + B[i + 1];
    263     const int a1b0 = A[i + 1] + B[i + 0];
    264     const int a0a1b0b1 = a0b1 + a1b0 + 8;
    265     const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
    266     const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
    267     out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
    268     out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
    269   }
    270 }
    271 #undef MAX_Y
    272 
    273 //------------------------------------------------------------------------------
    274 
    275 extern void WebPInitSharpYUVNEON(void);
    276 
    277 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
    278   WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
    279   WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
    280   WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
    281 }
    282 
    283 #else  // !WEBP_USE_NEON
    284 
    285 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)  // build without WEBP_USE_NEON. NOTE(review): the macro is defined outside this file; presumably it emits a no-op definition of the named init function -- confirm.
    286 WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)  // same as above, for the SharpYUV init entry point.
    287 
    288 #endif  // WEBP_USE_NEON
    289