Home | History | Annotate | Download | only in simd
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
#include <string.h>

#include "build/build_config.h"
#include "media/base/simd/convert_rgb_to_yuv.h"
#include "media/base/simd/yuv_to_rgb_table.h"

#if defined(COMPILER_MSVC)
#include <intrin.h>
#else
#include <mmintrin.h>
#include <emmintrin.h>
#endif
     15 
     16 namespace media {
     17 
     18 #define FIX_SHIFT 12
     19 #define FIX(x) ((x) * (1 << FIX_SHIFT))
     20 
     21 // Define a convenient macro to do static cast.
     22 #define INT16_FIX(x) static_cast<int16>(FIX(x))
     23 
     24 SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
     25   INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
     26   INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
     27   INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
     28   INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
     29   -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
     30   -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
     31 };
     32 
     33 #undef INT16_FIX
     34 
     35 // This is the final offset for the conversion from signed yuv values to
     36 // unsigned values. It is arranged so that offset of 16 is applied to Y
     37 // components and 128 is added to UV components for 2 pixels.
     38 SIMD_ALIGNED(const int32 kYOffset[4]) = {16, 16, 16, 16};
     39 
     40 static inline int Clamp(int value) {
     41   if (value < 0)
     42     return 0;
     43   if (value > 255)
     44     return 255;
     45   return value;
     46 }
     47 
     48 static inline int RGBToY(int r, int g, int b) {
     49   int y = ConvertRGBAToYUV_kTable[0] * b +
     50       ConvertRGBAToYUV_kTable[1] * g +
     51       ConvertRGBAToYUV_kTable[2] * r;
     52   y >>= FIX_SHIFT;
     53   return Clamp(y + 16);
     54 }
     55 
     56 static inline int RGBToU(int r, int g, int b, int shift) {
     57   int u = ConvertRGBAToYUV_kTable[8] * b +
     58       ConvertRGBAToYUV_kTable[9] * g +
     59       ConvertRGBAToYUV_kTable[10] * r;
     60   u >>= FIX_SHIFT + shift;
     61   return Clamp(u + 128);
     62 }
     63 
     64 static inline int RGBToV(int r, int g, int b, int shift) {
     65   int v = ConvertRGBAToYUV_kTable[16] * b +
     66       ConvertRGBAToYUV_kTable[17] * g +
     67       ConvertRGBAToYUV_kTable[18] * r;
     68   v >>= FIX_SHIFT + shift;
     69   return Clamp(v + 128);
     70 }
     71 
     72 #define CONVERT_Y(rgb_buf, y_buf) \
     73   b = *rgb_buf++; \
     74   g = *rgb_buf++; \
     75   r = *rgb_buf++; \
     76   ++rgb_buf;      \
     77   sum_b += b;     \
     78   sum_g += g;     \
     79   sum_r += r;     \
     80   *y_buf++ = RGBToY(r, g, b);
     81 
     82 static inline void ConvertRGBToYUV_V2H2(const uint8* rgb_buf_1,
     83                                         const uint8* rgb_buf_2,
     84                                         uint8* y_buf_1,
     85                                         uint8* y_buf_2,
     86                                         uint8* u_buf,
     87                                         uint8* v_buf) {
     88   int sum_b = 0;
     89   int sum_g = 0;
     90   int sum_r = 0;
     91   int r, g, b;
     92 
     93 
     94 
     95   CONVERT_Y(rgb_buf_1, y_buf_1);
     96   CONVERT_Y(rgb_buf_1, y_buf_1);
     97   CONVERT_Y(rgb_buf_2, y_buf_2);
     98   CONVERT_Y(rgb_buf_2, y_buf_2);
     99   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 2);
    100   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 2);
    101 }
    102 
    103 static inline void ConvertRGBToYUV_V2H1(const uint8* rgb_buf_1,
    104                                         const uint8* rgb_buf_2,
    105                                         uint8* y_buf_1,
    106                                         uint8* y_buf_2,
    107                                         uint8* u_buf,
    108                                         uint8* v_buf) {
    109   int sum_b = 0;
    110   int sum_g = 0;
    111   int sum_r = 0;
    112   int r, g, b;
    113 
    114   CONVERT_Y(rgb_buf_1, y_buf_1);
    115   CONVERT_Y(rgb_buf_2, y_buf_2);
    116   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
    117   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
    118 }
    119 
    120 static inline void ConvertRGBToYUV_V1H2(const uint8* rgb_buf,
    121                                        uint8* y_buf,
    122                                        uint8* u_buf,
    123                                        uint8* v_buf) {
    124   int sum_b = 0;
    125   int sum_g = 0;
    126   int sum_r = 0;
    127   int r, g, b;
    128 
    129   CONVERT_Y(rgb_buf, y_buf);
    130   CONVERT_Y(rgb_buf, y_buf);
    131   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
    132   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
    133 }
    134 
    135 static inline void ConvertRGBToYUV_V1H1(const uint8* rgb_buf,
    136                                        uint8* y_buf,
    137                                        uint8* u_buf,
    138                                        uint8* v_buf) {
    139   int sum_b = 0;
    140   int sum_g = 0;
    141   int sum_r = 0;
    142   int r, g, b;
    143 
    144   CONVERT_Y(rgb_buf, y_buf);
    145   *u_buf++ = RGBToU(r, g, b, 0);
    146   *v_buf++ = RGBToV(r, g, b, 0);
    147 }
    148 
    149 static void ConvertRGB32ToYUVRow_SSE2(const uint8* rgb_buf_1,
    150                                       const uint8* rgb_buf_2,
    151                                       uint8* y_buf_1,
    152                                       uint8* y_buf_2,
    153                                       uint8* u_buf,
    154                                       uint8* v_buf,
    155                                       int width) {
    156   while (width >= 4) {
    157     // Name for the Y pixels:
    158     // Row 1: a b c d
    159     // Row 2: e f g h
    160     //
    161     // First row 4 pixels.
    162     __m128i rgb_row_1 = _mm_loadu_si128(
    163         reinterpret_cast<const __m128i*>(rgb_buf_1));
    164     __m128i zero_1 = _mm_xor_si128(rgb_row_1, rgb_row_1);
    165 
    166     __m128i y_table = _mm_load_si128(
    167         reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable));
    168 
    169     __m128i rgb_a_b = _mm_unpackhi_epi8(rgb_row_1, zero_1);
    170     rgb_a_b = _mm_madd_epi16(rgb_a_b, y_table);
    171 
    172     __m128i rgb_c_d = _mm_unpacklo_epi8(rgb_row_1, zero_1);
    173     rgb_c_d = _mm_madd_epi16(rgb_c_d, y_table);
    174 
    175     // Do a crazh shuffle so that we get:
    176     //  v------------ Multiply Add
    177     // BG: a b c d
    178     // A0: a b c d
    179     __m128i bg_abcd = _mm_castps_si128(
    180         _mm_shuffle_ps(
    181             _mm_castsi128_ps(rgb_c_d),
    182             _mm_castsi128_ps(rgb_a_b),
    183             (3 << 6) | (1 << 4) | (3 << 2) | 1));
    184     __m128i r_abcd = _mm_castps_si128(
    185         _mm_shuffle_ps(
    186             _mm_castsi128_ps(rgb_c_d),
    187             _mm_castsi128_ps(rgb_a_b),
    188             (2 << 6) | (2 << 2)));
    189     __m128i y_abcd = _mm_add_epi32(bg_abcd, r_abcd);
    190 
    191     // Down shift back to 8bits range.
    192     __m128i y_offset = _mm_load_si128(
    193         reinterpret_cast<const __m128i*>(kYOffset));
    194     y_abcd = _mm_srai_epi32(y_abcd, FIX_SHIFT);
    195     y_abcd = _mm_add_epi32(y_abcd, y_offset);
    196     y_abcd = _mm_packs_epi32(y_abcd, y_abcd);
    197     y_abcd = _mm_packus_epi16(y_abcd, y_abcd);
    198     *reinterpret_cast<uint32*>(y_buf_1) = _mm_cvtsi128_si32(y_abcd);
    199     y_buf_1 += 4;
    200 
    201     // Second row 4 pixels.
    202     __m128i rgb_row_2 = _mm_loadu_si128(
    203         reinterpret_cast<const __m128i*>(rgb_buf_2));
    204     __m128i zero_2 = _mm_xor_si128(rgb_row_2, rgb_row_2);
    205     __m128i rgb_e_f = _mm_unpackhi_epi8(rgb_row_2, zero_2);
    206     __m128i rgb_g_h = _mm_unpacklo_epi8(rgb_row_2, zero_2);
    207 
    208     // Add two rows together.
    209     __m128i rgb_ae_bf =
    210         _mm_add_epi16(_mm_unpackhi_epi8(rgb_row_1, zero_2), rgb_e_f);
    211     __m128i rgb_cg_dh =
    212         _mm_add_epi16(_mm_unpacklo_epi8(rgb_row_1, zero_2), rgb_g_h);
    213 
    214     // Multiply add like the previous row.
    215     rgb_e_f = _mm_madd_epi16(rgb_e_f, y_table);
    216     rgb_g_h = _mm_madd_epi16(rgb_g_h, y_table);
    217 
    218     __m128i bg_efgh = _mm_castps_si128(
    219         _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
    220                        _mm_castsi128_ps(rgb_e_f),
    221                        (3 << 6) | (1 << 4) | (3 << 2) | 1));
    222     __m128i r_efgh = _mm_castps_si128(
    223         _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
    224                        _mm_castsi128_ps(rgb_e_f),
    225                        (2 << 6) | (2 << 2)));
    226     __m128i y_efgh = _mm_add_epi32(bg_efgh, r_efgh);
    227     y_efgh = _mm_srai_epi32(y_efgh, FIX_SHIFT);
    228     y_efgh = _mm_add_epi32(y_efgh, y_offset);
    229     y_efgh = _mm_packs_epi32(y_efgh, y_efgh);
    230     y_efgh = _mm_packus_epi16(y_efgh, y_efgh);
    231     *reinterpret_cast<uint32*>(y_buf_2) = _mm_cvtsi128_si32(y_efgh);
    232     y_buf_2 += 4;
    233 
    234     __m128i rgb_ae_cg = _mm_castps_si128(
    235         _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
    236                        _mm_castsi128_ps(rgb_ae_bf),
    237                        (3 << 6) | (2 << 4) | (3 << 2) | 2));
    238     __m128i rgb_bf_dh = _mm_castps_si128(
    239         _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
    240                        _mm_castsi128_ps(rgb_ae_bf),
    241                        (1 << 6) | (1 << 2)));
    242 
    243     // This is a 2x2 subsampling for 2 pixels.
    244     __m128i rgb_abef_cdgh = _mm_add_epi16(rgb_ae_cg, rgb_bf_dh);
    245 
    246     // Do a multiply add with U table.
    247     __m128i u_a_b = _mm_madd_epi16(
    248         rgb_abef_cdgh,
    249         _mm_load_si128(
    250             reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 8)));
    251     u_a_b = _mm_add_epi32(_mm_shuffle_epi32(u_a_b, ((3 << 2) | 1)),
    252                           _mm_shuffle_epi32(u_a_b, (2 << 2)));
    253     // Right shift 14 because of 12 from fixed point and 2 from subsampling.
    254     u_a_b = _mm_srai_epi32(u_a_b, FIX_SHIFT + 2);
    255     __m128i uv_offset = _mm_slli_epi32(y_offset, 3);
    256     u_a_b = _mm_add_epi32(u_a_b, uv_offset);
    257     u_a_b = _mm_packs_epi32(u_a_b, u_a_b);
    258     u_a_b = _mm_packus_epi16(u_a_b, u_a_b);
    259     *reinterpret_cast<uint16*>(u_buf) = _mm_extract_epi16(u_a_b, 0);
    260     u_buf += 2;
    261 
    262     __m128i v_a_b = _mm_madd_epi16(
    263         rgb_abef_cdgh,
    264         _mm_load_si128(
    265             reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 16)));
    266     v_a_b = _mm_add_epi32(_mm_shuffle_epi32(v_a_b, ((3 << 2) | 1)),
    267                           _mm_shuffle_epi32(v_a_b, (2 << 2)));
    268     v_a_b = _mm_srai_epi32(v_a_b, FIX_SHIFT + 2);
    269     v_a_b = _mm_add_epi32(v_a_b, uv_offset);
    270     v_a_b = _mm_packs_epi32(v_a_b, v_a_b);
    271     v_a_b = _mm_packus_epi16(v_a_b, v_a_b);
    272     *reinterpret_cast<uint16*>(v_buf) = _mm_extract_epi16(v_a_b, 0);
    273     v_buf += 2;
    274 
    275     rgb_buf_1 += 16;
    276     rgb_buf_2 += 16;
    277 
    278     // Move forward by 4 pixels.
    279     width -= 4;
    280   }
    281 
    282   // Just use C code to convert the remaining pixels.
    283   if (width >= 2) {
    284     ConvertRGBToYUV_V2H2(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
    285     rgb_buf_1 += 8;
    286     rgb_buf_2 += 8;
    287     y_buf_1 += 2;
    288     y_buf_2 += 2;
    289     ++u_buf;
    290     ++v_buf;
    291     width -= 2;
    292   }
    293 
    294   if (width)
    295     ConvertRGBToYUV_V2H1(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
    296 }
    297 
    298 extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe,
    299                                    uint8* yplane,
    300                                    uint8* uplane,
    301                                    uint8* vplane,
    302                                    int width,
    303                                    int height,
    304                                    int rgbstride,
    305                                    int ystride,
    306                                    int uvstride) {
    307   while (height >= 2) {
    308     ConvertRGB32ToYUVRow_SSE2(rgbframe,
    309                               rgbframe + rgbstride,
    310                               yplane,
    311                               yplane + ystride,
    312                               uplane,
    313                               vplane,
    314                               width);
    315     rgbframe += 2 * rgbstride;
    316     yplane += 2 * ystride;
    317     uplane += uvstride;
    318     vplane += uvstride;
    319     height -= 2;
    320   }
    321 
    322   if (!height)
    323     return;
    324 
    325   // Handle the last row.
    326   while (width >= 2) {
    327     ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
    328     rgbframe += 8;
    329     yplane += 2;
    330     ++uplane;
    331     ++vplane;
    332     width -= 2;
    333   }
    334 
    335   if (width)
    336     ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
    337 }
    338 
    339 void ConvertRGB32ToYUV_SSE2_Reference(const uint8* rgbframe,
    340                                       uint8* yplane,
    341                                       uint8* uplane,
    342                                       uint8* vplane,
    343                                       int width,
    344                                       int height,
    345                                       int rgbstride,
    346                                       int ystride,
    347                                       int uvstride) {
    348   while (height >= 2) {
    349     int i = 0;
    350 
    351     // Convert a 2x2 block.
    352     while (i + 2 <= width) {
    353       ConvertRGBToYUV_V2H2(rgbframe + i * 4,
    354                            rgbframe + rgbstride + i * 4,
    355                            yplane + i,
    356                            yplane + ystride + i,
    357                            uplane + i / 2,
    358                            vplane + i / 2);
    359       i += 2;
    360     }
    361 
    362     // Convert the last pixel of two rows.
    363     if (i < width) {
    364       ConvertRGBToYUV_V2H1(rgbframe + i * 4,
    365                            rgbframe + rgbstride + i * 4,
    366                            yplane + i,
    367                            yplane + ystride + i,
    368                            uplane + i / 2,
    369                            vplane + i / 2);
    370     }
    371 
    372     rgbframe += 2 * rgbstride;
    373     yplane += 2 * ystride;
    374     uplane += uvstride;
    375     vplane += uvstride;
    376     height -= 2;
    377   }
    378 
    379   if (!height)
    380     return;
    381 
    382   // Handle the last row.
    383   while (width >= 2) {
    384     ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
    385     rgbframe += 8;
    386     yplane += 2;
    387     ++uplane;
    388     ++vplane;
    389     width -= 2;
    390   }
    391 
    392   // Handle the last pixel in the last row.
    393   if (width)
    394     ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
    395 }
    396 
    397 }  // namespace media
    398