Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "row.h"
     12 
     13 extern "C" {
     14 
     15 #ifdef HAS_ARGBTOYROW_SSSE3
     16 
     17 // Constant multiplication table for converting ARGB to I400.
     18 extern "C" TALIGN16(const uint8, kMultiplyMaskARGBToI400[16]) = {
     19   13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u, 13u, 64u, 33u, 0u
     20 };
     21 
     22 extern "C" TALIGN16(const uint8, kAdd16[16]) = {
     23   1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
     24 };
     25 
     26 // Shuffle table for converting BG24 to ARGB.
     27 extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
     28   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
     29 };
     30 
     31 // Shuffle table for converting RAW to ARGB.
     32 extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
     33   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
     34 };
     35 
     36 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
     37   asm volatile(
     38   "movdqa     (%3),%%xmm7\n"
     39   "movdqa     (%4),%%xmm6\n"
     40   "movdqa     %%xmm6,%%xmm5\n"
     41   "psllw      $0x4,%%xmm5\n"  // Generate a mask of 0x10 on each byte.
     42 "1:"
     43   "movdqa     (%0),%%xmm0\n"
     44   "pmaddubsw  %%xmm7,%%xmm0\n"
     45   "movdqa     0x10(%0),%%xmm1\n"
     46   "psrlw      $0x7,%%xmm0\n"
     47   "pmaddubsw  %%xmm7,%%xmm1\n"
     48   "lea        0x20(%0),%0\n"
     49   "psrlw      $0x7,%%xmm1\n"
     50   "packuswb   %%xmm1,%%xmm0\n"
     51   "pmaddubsw  %%xmm6,%%xmm0\n"
     52   "packuswb   %%xmm0,%%xmm0\n"
     53   "paddb      %%xmm5,%%xmm0\n"
     54   "movq       %%xmm0,(%1)\n"
     55   "lea        0x8(%1),%1\n"
     56   "sub        $0x8,%2\n"
     57   "ja         1b\n"
     58   : "+r"(src_argb),   // %0
     59     "+r"(dst_y),      // %1
     60     "+r"(pix)         // %2
     61   : "r"(kMultiplyMaskARGBToI400),    // %3
     62     "r"(kAdd16)   // %4
     63   : "memory"
     64 );
     65 }
     66 #endif
     67 
     68 #ifdef  HAS_BG24TOARGBROW_SSSE3
     69 void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
     70   asm volatile(
     71   "pcmpeqb    %%xmm7,%%xmm7\n"  // generate mask 0xff000000
     72   "pslld      $0x18,%%xmm7\n"
     73   "movdqa     (%3),%%xmm6\n"
     74 "1:"
     75   "movdqa     (%0),%%xmm0\n"
     76   "movdqa     0x10(%0),%%xmm1\n"
     77   "movdqa     0x20(%0),%%xmm3\n"
     78   "lea        0x30(%0),%0\n"
     79   "movdqa     %%xmm3,%%xmm2\n"
     80   "palignr    $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
     81   "pshufb     %%xmm6,%%xmm2\n"
     82   "por        %%xmm7,%%xmm2\n"
     83   "palignr    $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
     84   "pshufb     %%xmm6,%%xmm0\n"
     85   "movdqa     %%xmm2,0x20(%1)\n"
     86   "por        %%xmm7,%%xmm0\n"
     87   "pshufb     %%xmm6,%%xmm1\n"
     88   "movdqa     %%xmm0,(%1)\n"
     89   "por        %%xmm7,%%xmm1\n"
     90   "palignr    $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
     91   "pshufb     %%xmm6,%%xmm3\n"
     92   "movdqa     %%xmm1,0x10(%1)\n"
     93   "por        %%xmm7,%%xmm3\n"
     94   "movdqa     %%xmm3,0x30(%1)\n"
     95   "lea        0x40(%1),%1\n"
     96   "sub        $0x10,%2\n"
     97   "ja         1b\n"
     98   : "+r"(src_bg24),  // %0
     99     "+r"(dst_argb),  // %1
    100     "+r"(pix)        // %2
    101   : "r"(kShuffleMaskBG24ToARGB)  // %3
    102   : "memory"
    103 );
    104 }
    105 
    106 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
    107   asm volatile(
    108   "pcmpeqb    %%xmm7,%%xmm7\n"  // generate mask 0xff000000
    109   "pslld      $0x18,%%xmm7\n"
    110   "movdqa     (%3),%%xmm6\n"
    111 "1:"
    112   "movdqa     (%0),%%xmm0\n"
    113   "movdqa     0x10(%0),%%xmm1\n"
    114   "movdqa     0x20(%0),%%xmm3\n"
    115   "lea        0x30(%0),%0\n"
    116   "movdqa     %%xmm3,%%xmm2\n"
    117   "palignr    $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
    118   "pshufb     %%xmm6,%%xmm2\n"
    119   "por        %%xmm7,%%xmm2\n"
    120   "palignr    $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
    121   "pshufb     %%xmm6,%%xmm0\n"
    122   "movdqa     %%xmm2,0x20(%1)\n"
    123   "por        %%xmm7,%%xmm0\n"
    124   "pshufb     %%xmm6,%%xmm1\n"
    125   "movdqa     %%xmm0,(%1)\n"
    126   "por        %%xmm7,%%xmm1\n"
    127   "palignr    $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
    128   "pshufb     %%xmm6,%%xmm3\n"
    129   "movdqa     %%xmm1,0x10(%1)\n"
    130   "por        %%xmm7,%%xmm3\n"
    131   "movdqa     %%xmm3,0x30(%1)\n"
    132   "lea        0x40(%1),%1\n"
    133   "sub        $0x10,%2\n"
    134   "ja         1b\n"
    135   : "+r"(src_raw),   // %0
    136     "+r"(dst_argb),  // %1
    137     "+r"(pix)        // %2
    138   : "r"(kShuffleMaskRAWToARGB)  // %3
    139   : "memory"
    140 );
    141 }
    142 #endif
    143 
    144 #if defined(__x86_64__)
    145 
    146 // 64 bit linux gcc version
    147 
// Converts one row of planar YUV to 32-bit pixels, two pixels per loop pass,
// using the lookup table _kCoefficientsRgbY.
//
// Each pass reads one byte from u_buf, one byte from v_buf and two bytes from
// y_buf. Every byte selects an 8-byte table entry (Y entries start at byte
// offset 0, U entries at 2048, V entries at 4096). The U and V entries are
// added with signed saturation, that sum is added to each Y entry, the
// 16-bit results are shifted right arithmetically by 6 and packed to bytes
// with unsigned saturation, and 8 bytes are stored to rgb_buf.
//
// width is decremented by 2 after each pass and the loop repeats while the
// result is above zero; the body therefore always runs at least once and
// always produces two pixels per pass.
void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
                              const uint8* u_buf,  // rsi
                              const uint8* v_buf,  // rdx
                              uint8* rgb_buf,      // rcx
                              int width) {         // r8
  asm volatile(
"1:"
  "movzb  (%1),%%r10\n"                 // r10 = U
  "lea    1(%1),%1\n"
  "movzb  (%2),%%r11\n"                 // r11 = V
  "lea    1(%2),%2\n"
  "movq   2048(%5,%%r10,8),%%xmm0\n"    // U table entry
  "movzb  (%0),%%r10\n"                 // r10 = first Y
  "movq   4096(%5,%%r11,8),%%xmm1\n"    // V table entry
  "movzb  0x1(%0),%%r11\n"              // r11 = second Y
  "paddsw %%xmm1,%%xmm0\n"              // xmm0 = U + V contribution
  "movq   (%5,%%r10,8),%%xmm2\n"
  "lea    2(%0),%0\n"
  "movq   (%5,%%r11,8),%%xmm3\n"
  "paddsw %%xmm0,%%xmm2\n"
  "paddsw %%xmm0,%%xmm3\n"
  "shufps $0x44,%%xmm3,%%xmm2\n"        // low halves of xmm2 and xmm3 -> xmm2
  "psraw  $0x6,%%xmm2\n"
  "packuswb %%xmm2,%%xmm2\n"
  "movq   %%xmm2,0x0(%3)\n"
  "lea    8(%3),%3\n"
  "sub    $0x2,%4\n"
  "ja     1b\n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+r"(width)     // %4
  : "r" (_kCoefficientsRgbY)  // %5
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
);
}
    185 
// Converts one row of planar YUV to 32-bit pixels, two pixels per loop pass,
// using the lookup table _kCoefficientsBgraY. The instruction sequence is the
// same as in FastConvertYUVToRGB32Row; only the table differs.
//
// Each pass reads one U byte, one V byte and two Y bytes, adds the selected
// 8-byte table entries (Y at byte offset 0, U at 2048, V at 4096) with signed
// saturation, shifts right by 6, packs to bytes with unsigned saturation and
// stores 8 bytes to rgb_buf.
//
// width is decremented by 2 after each pass and the loop repeats while the
// result is above zero; the body therefore always runs at least once and
// always produces two pixels per pass.
void FastConvertYUVToBGRARow(const uint8* y_buf,  // rdi
                             const uint8* u_buf,  // rsi
                             const uint8* v_buf,  // rdx
                             uint8* rgb_buf,      // rcx
                             int width) {         // r8
  asm volatile(
"1:"
  "movzb  (%1),%%r10\n"                 // r10 = U
  "lea    1(%1),%1\n"
  "movzb  (%2),%%r11\n"                 // r11 = V
  "lea    1(%2),%2\n"
  "movq   2048(%5,%%r10,8),%%xmm0\n"    // U table entry
  "movzb  (%0),%%r10\n"                 // r10 = first Y
  "movq   4096(%5,%%r11,8),%%xmm1\n"    // V table entry
  "movzb  0x1(%0),%%r11\n"              // r11 = second Y
  "paddsw %%xmm1,%%xmm0\n"              // xmm0 = U + V contribution
  "movq   (%5,%%r10,8),%%xmm2\n"
  "lea    2(%0),%0\n"
  "movq   (%5,%%r11,8),%%xmm3\n"
  "paddsw %%xmm0,%%xmm2\n"
  "paddsw %%xmm0,%%xmm3\n"
  "shufps $0x44,%%xmm3,%%xmm2\n"        // low halves of xmm2 and xmm3 -> xmm2
  "psraw  $0x6,%%xmm2\n"
  "packuswb %%xmm2,%%xmm2\n"
  "movq   %%xmm2,0x0(%3)\n"
  "lea    8(%3),%3\n"
  "sub    $0x2,%4\n"
  "ja     1b\n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+r"(width)     // %4
  : "r" (_kCoefficientsBgraY)  // %5
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
);
}
    223 
// Converts one row of planar YUV to 32-bit pixels, two pixels per loop pass,
// using the lookup table _kCoefficientsAbgrY. The instruction sequence is the
// same as in FastConvertYUVToRGB32Row; only the table differs.
//
// Each pass reads one U byte, one V byte and two Y bytes, adds the selected
// 8-byte table entries (Y at byte offset 0, U at 2048, V at 4096) with signed
// saturation, shifts right by 6, packs to bytes with unsigned saturation and
// stores 8 bytes to rgb_buf.
//
// width is decremented by 2 after each pass and the loop repeats while the
// result is above zero; the body therefore always runs at least once and
// always produces two pixels per pass.
void FastConvertYUVToABGRRow(const uint8* y_buf,  // rdi
                             const uint8* u_buf,  // rsi
                             const uint8* v_buf,  // rdx
                             uint8* rgb_buf,      // rcx
                             int width) {         // r8
  asm volatile(
"1:"
  "movzb  (%1),%%r10\n"                 // r10 = U
  "lea    1(%1),%1\n"
  "movzb  (%2),%%r11\n"                 // r11 = V
  "lea    1(%2),%2\n"
  "movq   2048(%5,%%r10,8),%%xmm0\n"    // U table entry
  "movzb  (%0),%%r10\n"                 // r10 = first Y
  "movq   4096(%5,%%r11,8),%%xmm1\n"    // V table entry
  "movzb  0x1(%0),%%r11\n"              // r11 = second Y
  "paddsw %%xmm1,%%xmm0\n"              // xmm0 = U + V contribution
  "movq   (%5,%%r10,8),%%xmm2\n"
  "lea    2(%0),%0\n"
  "movq   (%5,%%r11,8),%%xmm3\n"
  "paddsw %%xmm0,%%xmm2\n"
  "paddsw %%xmm0,%%xmm3\n"
  "shufps $0x44,%%xmm3,%%xmm2\n"        // low halves of xmm2 and xmm3 -> xmm2
  "psraw  $0x6,%%xmm2\n"
  "packuswb %%xmm2,%%xmm2\n"
  "movq   %%xmm2,0x0(%3)\n"
  "lea    8(%3),%3\n"
  "sub    $0x2,%4\n"
  "ja     1b\n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+r"(width)     // %4
  : "r" (_kCoefficientsAbgrY)  // %5
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
);
}
    261 
// Converts one row of YUV with one U and one V sample per pixel to 32-bit
// pixels, one pixel per loop pass, using the lookup table _kCoefficientsRgbY.
//
// Each pass reads one byte from each of y_buf, u_buf and v_buf, adds the
// selected 8-byte table entries (Y at byte offset 0, U at 2048, V at 4096)
// with signed saturation, shifts right by 6, packs to bytes with unsigned
// saturation and stores 4 bytes to rgb_buf.
//
// width is decremented by 1 after each pass and the loop repeats while the
// result is above zero; the body therefore always runs at least once.
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,  // rdi
                                 const uint8* u_buf,  // rsi
                                 const uint8* v_buf,  // rdx
                                 uint8* rgb_buf,      // rcx
                                 int width) {         // r8
  asm volatile(
"1:"
  "movzb  (%1),%%r10\n"                 // r10 = U
  "lea    1(%1),%1\n"
  "movzb  (%2),%%r11\n"                 // r11 = V
  "lea    1(%2),%2\n"
  "movq   2048(%5,%%r10,8),%%xmm0\n"    // U table entry
  "movzb  (%0),%%r10\n"                 // r10 = Y
  "movq   4096(%5,%%r11,8),%%xmm1\n"    // V table entry
  "paddsw %%xmm1,%%xmm0\n"
  "movq   (%5,%%r10,8),%%xmm2\n"
  "lea    1(%0),%0\n"
  "paddsw %%xmm0,%%xmm2\n"
  "shufps $0x44,%%xmm2,%%xmm2\n"        // duplicate the low 64 bits
  "psraw  $0x6,%%xmm2\n"
  "packuswb %%xmm2,%%xmm2\n"
  "movd   %%xmm2,0x0(%3)\n"             // store a single 4-byte pixel
  "lea    4(%3),%3\n"
  "sub    $0x1,%4\n"
  "ja     1b\n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+r"(width)     // %4
  : "r" (_kCoefficientsRgbY)  // %5
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
);
}
    296 
// Converts one row of Y-only samples to 32-bit pixels, two pixels per loop
// pass, using only the Y entries (byte offset 0) of _kCoefficientsRgbY.
//
// Each pass reads two bytes from y_buf, loads the two selected 8-byte table
// entries, shifts right by 6, packs to bytes with unsigned saturation and
// stores 8 bytes to rgb_buf. No U or V table entries are added here.
//
// width is decremented by 2 after each pass and the loop repeats while the
// result is above zero; the body therefore always runs at least once and
// always produces two pixels per pass.
//
// The trailing comments on the parameters give the asm operand number; the
// actual registers are chosen by the compiler through the "r" constraints.
void FastConvertYToRGB32Row(const uint8* y_buf,  // %0
                            uint8* rgb_buf,      // %1
                            int width) {         // %2
  asm volatile(
"1:"
  "movzb  (%0),%%r10\n"                 // r10 = first Y
  "movzb  0x1(%0),%%r11\n"              // r11 = second Y
  "movq   (%3,%%r10,8),%%xmm2\n"
  "lea    2(%0),%0\n"
  "movq   (%3,%%r11,8),%%xmm3\n"
  "shufps $0x44,%%xmm3,%%xmm2\n"        // low halves of xmm2 and xmm3 -> xmm2
  "psraw  $0x6,%%xmm2\n"
  "packuswb %%xmm2,%%xmm2\n"
  "movq   %%xmm2,0x0(%1)\n"
  "lea    8(%1),%1\n"
  "sub    $0x2,%2\n"
  "ja     1b\n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+r"(width)     // %2
  : "r" (_kCoefficientsRgbY)  // %3
  // xmm0 and xmm1 are listed as clobbered although the code above only
  // touches xmm2 and xmm3.
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
);
}
    321 
    322 #elif defined(__i386__)
    323 // 32 bit gcc version
    324 
// 32-bit x86 MMX implementation. The prototype below only declares the
// function; its body is the file-scope asm that follows, which emits the
// global label itself (with a leading underscore when OSX or IOS is defined).
//
// pusha saves the eight general registers (32 bytes), so the arguments are
// read from 0x24(%esp) upward: y_buf -> edx, u_buf -> edi, v_buf -> esi,
// rgb_buf -> ebp, width -> ecx.
//
// Each loop pass reads one U byte, one V byte and two Y bytes, adds the
// selected 8-byte entries of the table (Y at byte offset 0, U at 2048, V at
// 4096) with signed saturation, shifts right by 6, packs to bytes with
// unsigned saturation and writes 8 bytes with a non-temporal store (movntq).
// width is decremented by 2 per pass and the loop repeats while the result is
// above zero, so the body always runs at least once.
//
// The table is addressed by the assembler symbol _kCoefficientsRgbY.
// NOTE(review): no emms is executed before ret; callers presumably have to
// clear the MMX state before using x87 floating point - confirm.
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width);
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToRGB32Row\n"
"_FastConvertYUVToRGB32Row:\n"
#else
  ".global FastConvertYUVToRGB32Row\n"
"FastConvertYUVToRGB32Row:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%edx\n"   // y_buf
  "mov    0x28(%esp),%edi\n"   // u_buf
  "mov    0x2c(%esp),%esi\n"   // v_buf
  "mov    0x30(%esp),%ebp\n"   // rgb_buf
  "mov    0x34(%esp),%ecx\n"   // width

"1:"
  "movzbl (%edi),%eax\n"
  "lea    1(%edi),%edi\n"
  "movzbl (%esi),%ebx\n"
  "lea    1(%esi),%esi\n"
  "movq   _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax\n"
  "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  "movzbl 0x1(%edx),%ebx\n"
  "movq   _kCoefficientsRgbY(,%eax,8),%mm1\n"
  "lea    2(%edx),%edx\n"
  "movq   _kCoefficientsRgbY(,%ebx,8),%mm2\n"
  "paddsw %mm0,%mm1\n"
  "paddsw %mm0,%mm2\n"
  "psraw  $0x6,%mm1\n"
  "psraw  $0x6,%mm2\n"
  "packuswb %mm2,%mm1\n"
  "movntq %mm1,0x0(%ebp)\n"
  "lea    8(%ebp),%ebp\n"
  "sub    $0x2,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
    370 
// 32-bit x86 MMX implementation. The prototype below only declares the
// function; its body is the file-scope asm that follows, which emits the
// global label itself (with a leading underscore when OSX or IOS is defined).
//
// pusha saves the eight general registers (32 bytes), so the arguments are
// read from 0x24(%esp) upward: y_buf -> edx, u_buf -> edi, v_buf -> esi,
// rgb_buf -> ebp, width -> ecx.
//
// Each loop pass reads one U byte, one V byte and two Y bytes, adds the
// selected 8-byte entries of the table (Y at byte offset 0, U at 2048, V at
// 4096) with signed saturation, shifts right by 6, packs to bytes with
// unsigned saturation and writes 8 bytes with a non-temporal store (movntq).
// width is decremented by 2 per pass and the loop repeats while the result is
// above zero, so the body always runs at least once.
//
// The table is addressed by the assembler symbol _kCoefficientsBgraY.
// NOTE(review): no emms is executed before ret; callers presumably have to
// clear the MMX state before using x87 floating point - confirm.
void FastConvertYUVToBGRARow(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width);
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToBGRARow\n"
"_FastConvertYUVToBGRARow:\n"
#else
  ".global FastConvertYUVToBGRARow\n"
"FastConvertYUVToBGRARow:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%edx\n"   // y_buf
  "mov    0x28(%esp),%edi\n"   // u_buf
  "mov    0x2c(%esp),%esi\n"   // v_buf
  "mov    0x30(%esp),%ebp\n"   // rgb_buf
  "mov    0x34(%esp),%ecx\n"   // width

"1:"
  "movzbl (%edi),%eax\n"
  "lea    1(%edi),%edi\n"
  "movzbl (%esi),%ebx\n"
  "lea    1(%esi),%esi\n"
  "movq   _kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax\n"
  "paddsw _kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
  "movzbl 0x1(%edx),%ebx\n"
  "movq   _kCoefficientsBgraY(,%eax,8),%mm1\n"
  "lea    2(%edx),%edx\n"
  "movq   _kCoefficientsBgraY(,%ebx,8),%mm2\n"
  "paddsw %mm0,%mm1\n"
  "paddsw %mm0,%mm2\n"
  "psraw  $0x6,%mm1\n"
  "psraw  $0x6,%mm2\n"
  "packuswb %mm2,%mm1\n"
  "movntq %mm1,0x0(%ebp)\n"
  "lea    8(%ebp),%ebp\n"
  "sub    $0x2,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
    416 
// 32-bit x86 MMX implementation. The prototype below only declares the
// function; its body is the file-scope asm that follows, which emits the
// global label itself (with a leading underscore when OSX or IOS is defined).
//
// pusha saves the eight general registers (32 bytes), so the arguments are
// read from 0x24(%esp) upward: y_buf -> edx, u_buf -> edi, v_buf -> esi,
// rgb_buf -> ebp, width -> ecx.
//
// Each loop pass reads one U byte, one V byte and two Y bytes, adds the
// selected 8-byte entries of the table (Y at byte offset 0, U at 2048, V at
// 4096) with signed saturation, shifts right by 6, packs to bytes with
// unsigned saturation and writes 8 bytes with a non-temporal store (movntq).
// width is decremented by 2 per pass and the loop repeats while the result is
// above zero, so the body always runs at least once.
//
// The table is addressed by the assembler symbol _kCoefficientsAbgrY.
// NOTE(review): no emms is executed before ret; callers presumably have to
// clear the MMX state before using x87 floating point - confirm.
void FastConvertYUVToABGRRow(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width);
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToABGRRow\n"
"_FastConvertYUVToABGRRow:\n"
#else
  ".global FastConvertYUVToABGRRow\n"
"FastConvertYUVToABGRRow:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%edx\n"   // y_buf
  "mov    0x28(%esp),%edi\n"   // u_buf
  "mov    0x2c(%esp),%esi\n"   // v_buf
  "mov    0x30(%esp),%ebp\n"   // rgb_buf
  "mov    0x34(%esp),%ecx\n"   // width

"1:"
  "movzbl (%edi),%eax\n"
  "lea    1(%edi),%edi\n"
  "movzbl (%esi),%ebx\n"
  "lea    1(%esi),%esi\n"
  "movq   _kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax\n"
  "paddsw _kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
  "movzbl 0x1(%edx),%ebx\n"
  "movq   _kCoefficientsAbgrY(,%eax,8),%mm1\n"
  "lea    2(%edx),%edx\n"
  "movq   _kCoefficientsAbgrY(,%ebx,8),%mm2\n"
  "paddsw %mm0,%mm1\n"
  "paddsw %mm0,%mm2\n"
  "psraw  $0x6,%mm1\n"
  "psraw  $0x6,%mm2\n"
  "packuswb %mm2,%mm1\n"
  "movntq %mm1,0x0(%ebp)\n"
  "lea    8(%ebp),%ebp\n"
  "sub    $0x2,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
    462 
// 32-bit x86 MMX implementation for input with one U and one V sample per
// pixel. The prototype below only declares the function; its body is the
// file-scope asm that follows, which emits the global label itself (with a
// leading underscore when OSX or IOS is defined).
//
// pusha saves the eight general registers (32 bytes), so the arguments are
// read from 0x24(%esp) upward: y_buf -> edx, u_buf -> edi, v_buf -> esi,
// rgb_buf -> ebp, width -> ecx.
//
// Each loop pass reads one byte from each input, adds the selected 8-byte
// entries of the table (Y at byte offset 0, U at 2048, V at 4096) with signed
// saturation, shifts right by 6, packs to bytes with unsigned saturation and
// stores 4 bytes with movd. width is decremented by 1 per pass and the loop
// repeats while the result is above zero, so the body always runs at least
// once.
//
// The table is addressed by the assembler symbol _kCoefficientsRgbY.
// NOTE(review): no emms is executed before ret; callers presumably have to
// clear the MMX state before using x87 floating point - confirm.
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUV444ToRGB32Row\n"
"_FastConvertYUV444ToRGB32Row:\n"
#else
  ".global FastConvertYUV444ToRGB32Row\n"
"FastConvertYUV444ToRGB32Row:\n"
#endif
  "pusha\n"
  "mov    0x24(%esp),%edx\n"   // y_buf
  "mov    0x28(%esp),%edi\n"   // u_buf
  "mov    0x2c(%esp),%esi\n"   // v_buf
  "mov    0x30(%esp),%ebp\n"   // rgb_buf
  "mov    0x34(%esp),%ecx\n"   // width

"1:"
  "movzbl (%edi),%eax\n"
  "lea    1(%edi),%edi\n"
  "movzbl (%esi),%ebx\n"
  "lea    1(%esi),%esi\n"
  "movq   _kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax\n"
  "paddsw _kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  "lea    1(%edx),%edx\n"
  "paddsw _kCoefficientsRgbY(,%eax,8),%mm0\n"
  "psraw  $0x6,%mm0\n"
  "packuswb %mm0,%mm0\n"
  "movd   %mm0,0x0(%ebp)\n"
  "lea    4(%ebp),%ebp\n"
  "sub    $0x1,%ecx\n"
  "ja     1b\n"
  "popa\n"
  "ret\n"
);
    503 
// 32-bit x86 MMX implementation for Y-only input. The prototype below only
// declares the function; its body is the file-scope asm that follows, which
// emits the global label itself (with a leading underscore when OSX or IOS is
// defined).
//
// Only ebx is saved (4 bytes), so the arguments are read from 0x8(%esp)
// upward: y_buf -> eax, rgb_buf -> edx, width -> ecx.
//
// Each loop pass reads two Y bytes, loads the two selected 8-byte entries
// from the start of the table, shifts each right by 6, packs them to bytes
// with unsigned saturation and stores 8 bytes with movq. No U or V table
// entries are added. width is decremented by 2 per pass and the loop repeats
// while the result is above zero, so the body always runs at least once.
//
// The table is addressed by the assembler symbol _kCoefficientsRgbY.
// NOTE(review): no emms is executed before ret; callers presumably have to
// clear the MMX state before using x87 floating point - confirm.
void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width);
  asm(
  ".text\n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYToRGB32Row\n"
"_FastConvertYToRGB32Row:\n"
#else
  ".global FastConvertYToRGB32Row\n"
"FastConvertYToRGB32Row:\n"
#endif
  "push   %ebx\n"
  "mov    0x8(%esp),%eax\n"    // y_buf
  "mov    0xc(%esp),%edx\n"    // rgb_buf
  "mov    0x10(%esp),%ecx\n"   // width

"1:"
  "movzbl (%eax),%ebx\n"
  "movq   _kCoefficientsRgbY(,%ebx,8),%mm0\n"
  "psraw  $0x6,%mm0\n"
  "movzbl 0x1(%eax),%ebx\n"
  "movq   _kCoefficientsRgbY(,%ebx,8),%mm1\n"
  "psraw  $0x6,%mm1\n"
  "packuswb %mm1,%mm0\n"
  "lea    0x2(%eax),%eax\n"
  "movq   %mm0,(%edx)\n"
  "lea    0x8(%edx),%edx\n"
  "sub    $0x2,%ecx\n"
  "ja     1b\n"
  "pop    %ebx\n"
  "ret\n"
);
    537 
    538 #else
    539 // C reference code that mimic the YUV assembly.
    540 #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
    541 #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
    542     (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
    543 
    544 static inline void YuvPixel(uint8 y,
    545                             uint8 u,
    546                             uint8 v,
    547                             uint8* rgb_buf,
    548                             int ashift,
    549                             int rshift,
    550                             int gshift,
    551                             int bshift) {
    552 
    553   int b = _kCoefficientsRgbY[256+u][0];
    554   int g = _kCoefficientsRgbY[256+u][1];
    555   int r = _kCoefficientsRgbY[256+u][2];
    556   int a = _kCoefficientsRgbY[256+u][3];
    557 
    558   b = paddsw(b, _kCoefficientsRgbY[512+v][0]);
    559   g = paddsw(g, _kCoefficientsRgbY[512+v][1]);
    560   r = paddsw(r, _kCoefficientsRgbY[512+v][2]);
    561   a = paddsw(a, _kCoefficientsRgbY[512+v][3]);
    562 
    563   b = paddsw(b, _kCoefficientsRgbY[y][0]);
    564   g = paddsw(g, _kCoefficientsRgbY[y][1]);
    565   r = paddsw(r, _kCoefficientsRgbY[y][2]);
    566   a = paddsw(a, _kCoefficientsRgbY[y][3]);
    567 
    568   b >>= 6;
    569   g >>= 6;
    570   r >>= 6;
    571   a >>= 6;
    572 
    573   *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
    574                                         (packuswb(g) << gshift) |
    575                                         (packuswb(r) << rshift) |
    576                                         (packuswb(a) << ashift);
    577 }
    578 
    579 void FastConvertYUVToRGB32Row(const uint8* y_buf,
    580                               const uint8* u_buf,
    581                               const uint8* v_buf,
    582                               uint8* rgb_buf,
    583                               int width) {
    584   for (int x = 0; x < width; x += 2) {
    585     uint8 u = u_buf[x >> 1];
    586     uint8 v = v_buf[x >> 1];
    587     uint8 y0 = y_buf[x];
    588     YuvPixel(y0, u, v, rgb_buf, 24, 16, 8, 0);
    589     if ((x + 1) < width) {
    590       uint8 y1 = y_buf[x + 1];
    591       YuvPixel(y1, u, v, rgb_buf + 4, 24, 16, 8, 0);
    592     }
    593     rgb_buf += 8;  // Advance 2 pixels.
    594   }
    595 }
    596 
    597 void FastConvertYUVToBGRARow(const uint8* y_buf,
    598                              const uint8* u_buf,
    599                              const uint8* v_buf,
    600                              uint8* rgb_buf,
    601                              int width) {
    602   for (int x = 0; x < width; x += 2) {
    603     uint8 u = u_buf[x >> 1];
    604     uint8 v = v_buf[x >> 1];
    605     uint8 y0 = y_buf[x];
    606     YuvPixel(y0, u, v, rgb_buf, 0, 8, 16, 24);
    607     if ((x + 1) < width) {
    608       uint8 y1 = y_buf[x + 1];
    609       YuvPixel(y1, u, v, rgb_buf + 4, 0, 8, 16, 24);
    610     }
    611     rgb_buf += 8;  // Advance 2 pixels.
    612   }
    613 }
    614 
    615 void FastConvertYUVToABGRRow(const uint8* y_buf,
    616                              const uint8* u_buf,
    617                              const uint8* v_buf,
    618                              uint8* rgb_buf,
    619                              int width) {
    620   for (int x = 0; x < width; x += 2) {
    621     uint8 u = u_buf[x >> 1];
    622     uint8 v = v_buf[x >> 1];
    623     uint8 y0 = y_buf[x];
    624     YuvPixel(y0, u, v, rgb_buf, 24, 0, 8, 16);
    625     if ((x + 1) < width) {
    626       uint8 y1 = y_buf[x + 1];
    627       YuvPixel(y1, u, v, rgb_buf + 4, 24, 0, 8, 16);
    628     }
    629     rgb_buf += 8;  // Advance 2 pixels.
    630   }
    631 }
    632 
    633 void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
    634                                  const uint8* u_buf,
    635                                  const uint8* v_buf,
    636                                  uint8* rgb_buf,
    637                                  int width) {
    638   for (int x = 0; x < width; ++x) {
    639     uint8 u = u_buf[x];
    640     uint8 v = v_buf[x];
    641     uint8 y = y_buf[x];
    642     YuvPixel(y, u, v, rgb_buf, 24, 16, 8, 0);
    643     rgb_buf += 4;  // Advance 1 pixel.
    644   }
    645 }
    646 
    647 void FastConvertYToRGB32Row(const uint8* y_buf,
    648                             uint8* rgb_buf,
    649                             int width) {
    650   for (int x = 0; x < width; ++x) {
    651     uint8 y = y_buf[x];
    652     YuvPixel(y, 128, 128, rgb_buf, 24, 16, 8, 0);
    653     rgb_buf += 4;  // Advance 1 pixel.
    654   }
    655 }
    656 
    657 #endif
    658 
    659 }  // extern "C"
    660