Home | History | Annotate | Download | only in simd
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "build/build_config.h"
      6 #include "media/base/simd/convert_rgb_to_yuv.h"
      7 #include "media/base/simd/yuv_to_rgb_table.h"
      8 
      9 #if defined(COMPILER_MSVC)
     10 #include <intrin.h>
     11 #else
     12 #include <mmintrin.h>
     13 #include <emmintrin.h>
     14 #endif
     15 
     16 namespace media {
     17 
     18 #define FIX_SHIFT 12
     19 #define FIX(x) ((x) * (1 << FIX_SHIFT))
     20 
     21 // Define a convenient macro to do static cast.
     22 #define INT16_FIX(x) static_cast<int16>(FIX(x))
     23 
     24 // Android's pixel layout is RGBA, while other platforms
     25 // are BGRA.
     26 #if defined(OS_ANDROID)
     27 SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
     28   INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
     29   INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
     30   -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
     31   -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
     32   INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
     33   INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
     34 };
     35 #else
     36 SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
     37   INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
     38   INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
     39   INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
     40   INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
     41   -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
     42   -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
     43 };
     44 #endif
     45 
     46 #undef INT16_FIX
     47 
     48 // This is the final offset for the conversion from signed yuv values to
     49 // unsigned values. It is arranged so that offset of 16 is applied to Y
     50 // components and 128 is added to UV components for 2 pixels.
     51 SIMD_ALIGNED(const int32 kYOffset[4]) = {16, 16, 16, 16};
     52 
     53 static inline int Clamp(int value) {
     54   if (value < 0)
     55     return 0;
     56   if (value > 255)
     57     return 255;
     58   return value;
     59 }
     60 
     61 static inline int RGBToY(int r, int g, int b) {
     62   int y = ConvertRGBAToYUV_kTable[0] * b +
     63       ConvertRGBAToYUV_kTable[1] * g +
     64       ConvertRGBAToYUV_kTable[2] * r;
     65   y >>= FIX_SHIFT;
     66   return Clamp(y + 16);
     67 }
     68 
     69 static inline int RGBToU(int r, int g, int b, int shift) {
     70   int u = ConvertRGBAToYUV_kTable[8] * b +
     71       ConvertRGBAToYUV_kTable[9] * g +
     72       ConvertRGBAToYUV_kTable[10] * r;
     73   u >>= FIX_SHIFT + shift;
     74   return Clamp(u + 128);
     75 }
     76 
     77 static inline int RGBToV(int r, int g, int b, int shift) {
     78   int v = ConvertRGBAToYUV_kTable[16] * b +
     79       ConvertRGBAToYUV_kTable[17] * g +
     80       ConvertRGBAToYUV_kTable[18] * r;
     81   v >>= FIX_SHIFT + shift;
     82   return Clamp(v + 128);
     83 }
     84 
     85 #define CONVERT_Y(rgb_buf, y_buf) \
     86   b = *rgb_buf++; \
     87   g = *rgb_buf++; \
     88   r = *rgb_buf++; \
     89   ++rgb_buf;      \
     90   sum_b += b;     \
     91   sum_g += g;     \
     92   sum_r += r;     \
     93   *y_buf++ = RGBToY(r, g, b);
     94 
     95 static inline void ConvertRGBToYUV_V2H2(const uint8* rgb_buf_1,
     96                                         const uint8* rgb_buf_2,
     97                                         uint8* y_buf_1,
     98                                         uint8* y_buf_2,
     99                                         uint8* u_buf,
    100                                         uint8* v_buf) {
    101   int sum_b = 0;
    102   int sum_g = 0;
    103   int sum_r = 0;
    104   int r, g, b;
    105 
    106 
    107 
    108   CONVERT_Y(rgb_buf_1, y_buf_1);
    109   CONVERT_Y(rgb_buf_1, y_buf_1);
    110   CONVERT_Y(rgb_buf_2, y_buf_2);
    111   CONVERT_Y(rgb_buf_2, y_buf_2);
    112   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 2);
    113   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 2);
    114 }
    115 
    116 static inline void ConvertRGBToYUV_V2H1(const uint8* rgb_buf_1,
    117                                         const uint8* rgb_buf_2,
    118                                         uint8* y_buf_1,
    119                                         uint8* y_buf_2,
    120                                         uint8* u_buf,
    121                                         uint8* v_buf) {
    122   int sum_b = 0;
    123   int sum_g = 0;
    124   int sum_r = 0;
    125   int r, g, b;
    126 
    127   CONVERT_Y(rgb_buf_1, y_buf_1);
    128   CONVERT_Y(rgb_buf_2, y_buf_2);
    129   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
    130   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
    131 }
    132 
    133 static inline void ConvertRGBToYUV_V1H2(const uint8* rgb_buf,
    134                                        uint8* y_buf,
    135                                        uint8* u_buf,
    136                                        uint8* v_buf) {
    137   int sum_b = 0;
    138   int sum_g = 0;
    139   int sum_r = 0;
    140   int r, g, b;
    141 
    142   CONVERT_Y(rgb_buf, y_buf);
    143   CONVERT_Y(rgb_buf, y_buf);
    144   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
    145   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
    146 }
    147 
    148 static inline void ConvertRGBToYUV_V1H1(const uint8* rgb_buf,
    149                                        uint8* y_buf,
    150                                        uint8* u_buf,
    151                                        uint8* v_buf) {
    152   int sum_b = 0;
    153   int sum_g = 0;
    154   int sum_r = 0;
    155   int r, g, b;
    156 
    157   CONVERT_Y(rgb_buf, y_buf);
    158   *u_buf++ = RGBToU(r, g, b, 0);
    159   *v_buf++ = RGBToV(r, g, b, 0);
    160 }
    161 
    162 static void ConvertRGB32ToYUVRow_SSE2(const uint8* rgb_buf_1,
    163                                       const uint8* rgb_buf_2,
    164                                       uint8* y_buf_1,
    165                                       uint8* y_buf_2,
    166                                       uint8* u_buf,
    167                                       uint8* v_buf,
    168                                       int width) {
    169   while (width >= 4) {
    170     // Name for the Y pixels:
    171     // Row 1: a b c d
    172     // Row 2: e f g h
    173     //
    174     // First row 4 pixels.
    175     __m128i rgb_row_1 = _mm_loadu_si128(
    176         reinterpret_cast<const __m128i*>(rgb_buf_1));
    177     __m128i zero_1 = _mm_xor_si128(rgb_row_1, rgb_row_1);
    178 
    179     __m128i y_table = _mm_load_si128(
    180         reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable));
    181 
    182     __m128i rgb_a_b = _mm_unpackhi_epi8(rgb_row_1, zero_1);
    183     rgb_a_b = _mm_madd_epi16(rgb_a_b, y_table);
    184 
    185     __m128i rgb_c_d = _mm_unpacklo_epi8(rgb_row_1, zero_1);
    186     rgb_c_d = _mm_madd_epi16(rgb_c_d, y_table);
    187 
    188     // Do a crazh shuffle so that we get:
    189     //  v------------ Multiply Add
    190     // BG: a b c d
    191     // A0: a b c d
    192     __m128i bg_abcd = _mm_castps_si128(
    193         _mm_shuffle_ps(
    194             _mm_castsi128_ps(rgb_c_d),
    195             _mm_castsi128_ps(rgb_a_b),
    196             (3 << 6) | (1 << 4) | (3 << 2) | 1));
    197     __m128i r_abcd = _mm_castps_si128(
    198         _mm_shuffle_ps(
    199             _mm_castsi128_ps(rgb_c_d),
    200             _mm_castsi128_ps(rgb_a_b),
    201             (2 << 6) | (2 << 2)));
    202     __m128i y_abcd = _mm_add_epi32(bg_abcd, r_abcd);
    203 
    204     // Down shift back to 8bits range.
    205     __m128i y_offset = _mm_load_si128(
    206         reinterpret_cast<const __m128i*>(kYOffset));
    207     y_abcd = _mm_srai_epi32(y_abcd, FIX_SHIFT);
    208     y_abcd = _mm_add_epi32(y_abcd, y_offset);
    209     y_abcd = _mm_packs_epi32(y_abcd, y_abcd);
    210     y_abcd = _mm_packus_epi16(y_abcd, y_abcd);
    211     *reinterpret_cast<uint32*>(y_buf_1) = _mm_cvtsi128_si32(y_abcd);
    212     y_buf_1 += 4;
    213 
    214     // Second row 4 pixels.
    215     __m128i rgb_row_2 = _mm_loadu_si128(
    216         reinterpret_cast<const __m128i*>(rgb_buf_2));
    217     __m128i zero_2 = _mm_xor_si128(rgb_row_2, rgb_row_2);
    218     __m128i rgb_e_f = _mm_unpackhi_epi8(rgb_row_2, zero_2);
    219     __m128i rgb_g_h = _mm_unpacklo_epi8(rgb_row_2, zero_2);
    220 
    221     // Add two rows together.
    222     __m128i rgb_ae_bf =
    223         _mm_add_epi16(_mm_unpackhi_epi8(rgb_row_1, zero_2), rgb_e_f);
    224     __m128i rgb_cg_dh =
    225         _mm_add_epi16(_mm_unpacklo_epi8(rgb_row_1, zero_2), rgb_g_h);
    226 
    227     // Multiply add like the previous row.
    228     rgb_e_f = _mm_madd_epi16(rgb_e_f, y_table);
    229     rgb_g_h = _mm_madd_epi16(rgb_g_h, y_table);
    230 
    231     __m128i bg_efgh = _mm_castps_si128(
    232         _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
    233                        _mm_castsi128_ps(rgb_e_f),
    234                        (3 << 6) | (1 << 4) | (3 << 2) | 1));
    235     __m128i r_efgh = _mm_castps_si128(
    236         _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
    237                        _mm_castsi128_ps(rgb_e_f),
    238                        (2 << 6) | (2 << 2)));
    239     __m128i y_efgh = _mm_add_epi32(bg_efgh, r_efgh);
    240     y_efgh = _mm_srai_epi32(y_efgh, FIX_SHIFT);
    241     y_efgh = _mm_add_epi32(y_efgh, y_offset);
    242     y_efgh = _mm_packs_epi32(y_efgh, y_efgh);
    243     y_efgh = _mm_packus_epi16(y_efgh, y_efgh);
    244     *reinterpret_cast<uint32*>(y_buf_2) = _mm_cvtsi128_si32(y_efgh);
    245     y_buf_2 += 4;
    246 
    247     __m128i rgb_ae_cg = _mm_castps_si128(
    248         _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
    249                        _mm_castsi128_ps(rgb_ae_bf),
    250                        (3 << 6) | (2 << 4) | (3 << 2) | 2));
    251     __m128i rgb_bf_dh = _mm_castps_si128(
    252         _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
    253                        _mm_castsi128_ps(rgb_ae_bf),
    254                        (1 << 6) | (1 << 2)));
    255 
    256     // This is a 2x2 subsampling for 2 pixels.
    257     __m128i rgb_abef_cdgh = _mm_add_epi16(rgb_ae_cg, rgb_bf_dh);
    258 
    259     // Do a multiply add with U table.
    260     __m128i u_a_b = _mm_madd_epi16(
    261         rgb_abef_cdgh,
    262         _mm_load_si128(
    263             reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 8)));
    264     u_a_b = _mm_add_epi32(_mm_shuffle_epi32(u_a_b, ((3 << 2) | 1)),
    265                           _mm_shuffle_epi32(u_a_b, (2 << 2)));
    266     // Right shift 14 because of 12 from fixed point and 2 from subsampling.
    267     u_a_b = _mm_srai_epi32(u_a_b, FIX_SHIFT + 2);
    268     __m128i uv_offset = _mm_slli_epi32(y_offset, 3);
    269     u_a_b = _mm_add_epi32(u_a_b, uv_offset);
    270     u_a_b = _mm_packs_epi32(u_a_b, u_a_b);
    271     u_a_b = _mm_packus_epi16(u_a_b, u_a_b);
    272     *reinterpret_cast<uint16*>(u_buf) = _mm_extract_epi16(u_a_b, 0);
    273     u_buf += 2;
    274 
    275     __m128i v_a_b = _mm_madd_epi16(
    276         rgb_abef_cdgh,
    277         _mm_load_si128(
    278             reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 16)));
    279     v_a_b = _mm_add_epi32(_mm_shuffle_epi32(v_a_b, ((3 << 2) | 1)),
    280                           _mm_shuffle_epi32(v_a_b, (2 << 2)));
    281     v_a_b = _mm_srai_epi32(v_a_b, FIX_SHIFT + 2);
    282     v_a_b = _mm_add_epi32(v_a_b, uv_offset);
    283     v_a_b = _mm_packs_epi32(v_a_b, v_a_b);
    284     v_a_b = _mm_packus_epi16(v_a_b, v_a_b);
    285     *reinterpret_cast<uint16*>(v_buf) = _mm_extract_epi16(v_a_b, 0);
    286     v_buf += 2;
    287 
    288     rgb_buf_1 += 16;
    289     rgb_buf_2 += 16;
    290 
    291     // Move forward by 4 pixels.
    292     width -= 4;
    293   }
    294 
    295   // Just use C code to convert the remaining pixels.
    296   if (width >= 2) {
    297     ConvertRGBToYUV_V2H2(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
    298     rgb_buf_1 += 8;
    299     rgb_buf_2 += 8;
    300     y_buf_1 += 2;
    301     y_buf_2 += 2;
    302     ++u_buf;
    303     ++v_buf;
    304     width -= 2;
    305   }
    306 
    307   if (width)
    308     ConvertRGBToYUV_V2H1(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
    309 }
    310 
    311 extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe,
    312                                    uint8* yplane,
    313                                    uint8* uplane,
    314                                    uint8* vplane,
    315                                    int width,
    316                                    int height,
    317                                    int rgbstride,
    318                                    int ystride,
    319                                    int uvstride) {
    320   while (height >= 2) {
    321     ConvertRGB32ToYUVRow_SSE2(rgbframe,
    322                               rgbframe + rgbstride,
    323                               yplane,
    324                               yplane + ystride,
    325                               uplane,
    326                               vplane,
    327                               width);
    328     rgbframe += 2 * rgbstride;
    329     yplane += 2 * ystride;
    330     uplane += uvstride;
    331     vplane += uvstride;
    332     height -= 2;
    333   }
    334 
    335   if (!height)
    336     return;
    337 
    338   // Handle the last row.
    339   while (width >= 2) {
    340     ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
    341     rgbframe += 8;
    342     yplane += 2;
    343     ++uplane;
    344     ++vplane;
    345     width -= 2;
    346   }
    347 
    348   if (width)
    349     ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
    350 }
    351 
    352 void ConvertRGB32ToYUV_SSE2_Reference(const uint8* rgbframe,
    353                                       uint8* yplane,
    354                                       uint8* uplane,
    355                                       uint8* vplane,
    356                                       int width,
    357                                       int height,
    358                                       int rgbstride,
    359                                       int ystride,
    360                                       int uvstride) {
    361   while (height >= 2) {
    362     int i = 0;
    363 
    364     // Convert a 2x2 block.
    365     while (i + 2 <= width) {
    366       ConvertRGBToYUV_V2H2(rgbframe + i * 4,
    367                            rgbframe + rgbstride + i * 4,
    368                            yplane + i,
    369                            yplane + ystride + i,
    370                            uplane + i / 2,
    371                            vplane + i / 2);
    372       i += 2;
    373     }
    374 
    375     // Convert the last pixel of two rows.
    376     if (i < width) {
    377       ConvertRGBToYUV_V2H1(rgbframe + i * 4,
    378                            rgbframe + rgbstride + i * 4,
    379                            yplane + i,
    380                            yplane + ystride + i,
    381                            uplane + i / 2,
    382                            vplane + i / 2);
    383     }
    384 
    385     rgbframe += 2 * rgbstride;
    386     yplane += 2 * ystride;
    387     uplane += uvstride;
    388     vplane += uvstride;
    389     height -= 2;
    390   }
    391 
    392   if (!height)
    393     return;
    394 
    395   // Handle the last row.
    396   while (width >= 2) {
    397     ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
    398     rgbframe += 8;
    399     yplane += 2;
    400     ++uplane;
    401     ++vplane;
    402     width -= 2;
    403   }
    404 
    405   // Handle the last pixel in the last row.
    406   if (width)
    407     ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
    408 }
    409 
    410 }  // namespace media
    411