// (code-viewer residue) Home | History | Annotate | Download | only in source
/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
     10 
     11 #include "row.h"
     12 
     13 extern "C" {
     14 
#ifdef HAS_ARGBTOYROW_SSSE3
// Declares a 16-byte-aligned static table.  Note the pasted leading
// underscore: the inline asm below refers to these tables as _kARGBToY etc.
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var

// Constant multiplication table for converting ARGB to I400 (luma).
// Byte order within each group of 4 is the ARGB pixel's memory order
// (B, G, R, A); the group repeats 4x so pmaddubsw processes 4 pixels per
// register.  Values appear to be BT.601 luma coefficients scaled so the
// row code can descale with a right shift of 7 -- TODO confirm standard.
extern "C" TALIGN16(const int8, kARGBToY[16]) = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// U coefficients for ARGB (descale in the UV row code is psraw 8).
extern "C" TALIGN16(const int8, kARGBToU[16]) = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// V coefficients for ARGB.
extern "C" TALIGN16(const int8, kARGBToV[16]) = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Constants for BGRA (same coefficients, permuted to match the BGRA
// memory byte order: A, R, G, B).
extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR (memory byte order: R, G, B, A).
extern "C" TALIGN16(const int8, kABGRToY[16]) = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

extern "C" TALIGN16(const int8, kABGRToU[16]) = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

extern "C" TALIGN16(const int8, kABGRToV[16]) = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// +16 bias added to every luma byte after descaling (studio swing).
extern "C" TALIGN16(const uint8, kAddY16[16]) = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
};

// +128 bias that recenters signed chroma results to unsigned bytes.
extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting BG24 to ARGB.  Source bytes 12..15 land in
// the alpha lanes and are then overwritten by the 0xff000000 mask in the
// row function.
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB (same as BG24 but R and B
// channels swapped).
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
     76 
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// In:  src_argb - 16-byte aligned source (movdqa faults otherwise)
//      dst_y    - 16-byte aligned destination
//      pix      - pixel count; assumed a positive multiple of 16 (the loop
//                 has no tail handling and exits via unsigned sub/ja)
// Naked function: no compiler prologue; only caller-volatile registers
// (eax, ecx, edx, xmm0-7) are used, so nothing needs saving in the
// 32-bit cdecl ABI.
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm7, _kARGBToY  // per-byte luma coefficients
    movdqa     xmm6, _kAddY16   // +16 bias

 convertloop :
    movdqa     xmm0, [eax]      // load 16 pixels, 4 per register
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7       // u8*s8 madd: partial sums per channel pair
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1       // complete per-pixel sums: 8 words each
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7          // descale (coefficients sum to 128)
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2       // 16 words -> 16 bytes, unsigned saturate
    paddb      xmm0, xmm6       // apply +16 bias
    movdqa     [edx], xmm0      // store 16 Y values
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         convertloop      // unsigned: continue while pixels remain
    ret
  }
}
    110 
// Convert 16 BGRA pixels (64 bytes) to 16 Y values.
// Identical to ARGBToYRow_SSSE3 except the coefficient table is permuted
// for the BGRA byte order.  Same preconditions: 16-byte aligned src/dst,
// pix a positive multiple of 16.
__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm7, _kBGRAToY  // luma coefficients in BGRA byte order
    movdqa     xmm6, _kAddY16   // +16 bias

 convertloop :
    movdqa     xmm0, [eax]      // load 16 pixels, 4 per register
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7       // weighted channel sums (pairs)
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1       // finish per-pixel sums
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7          // descale
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2       // saturate to 16 bytes
    paddb      xmm0, xmm6       // +16 bias
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         convertloop      // unsigned loop on remaining pixels
    ret
  }
}
    143 
// Convert 16 ABGR pixels (64 bytes) to 16 Y values.
// Identical to ARGBToYRow_SSSE3 except the coefficient table is permuted
// for the ABGR byte order.  Same preconditions: 16-byte aligned src/dst,
// pix a positive multiple of 16.
__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm7, _kABGRToY  // luma coefficients in ABGR byte order
    movdqa     xmm6, _kAddY16   // +16 bias

 convertloop :
    movdqa     xmm0, [eax]      // load 16 pixels, 4 per register
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7       // weighted channel sums (pairs)
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1       // finish per-pixel sums
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7          // descale
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2       // saturate to 16 bytes
    paddb      xmm0, xmm6       // +16 bias
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         convertloop      // unsigned loop on remaining pixels
    ret
  }
}
    176 
// Convert a 16x2 block of ARGB pixels to 8 U and 8 V values (2x2 box
// subsample, then color conversion).
// In:  src_argb0       - 16-byte aligned top row
//      src_stride_argb - byte offset from top row to the row below
//                        (that row must also be 16-byte aligned)
//      dst_u, dst_v    - output rows, 8 bytes written per 16 input pixels
//      width           - assumed a positive multiple of 16; no tail handling
// esi/edi are callee-saved in the 32-bit ABI, hence the push/pop pair;
// argument offsets include the 8 bytes of saved registers.
__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
__asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, _kARGBToU
    movdqa     xmm6, _kARGBToV
    movdqa     xmm5, _kAddUV128
    sub        edi, edx             // edi = dst_v - dst_u; lets one pointer
                                    // (edx) address both outputs

 convertloop :
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]        // average vertically with next row
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88         // even-index pixels
    shufps     xmm4, xmm1, 0xdd         // odd-index pixels
    pavgb      xmm0, xmm4               // average horizontally
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8     // signed descale (coefficients scaled by 256)
    psraw      xmm1, 8
    packsswb   xmm0, xmm1  // low half = 8 U, high half = 8 V (signed)
    paddb      xmm0, xmm5            // +128 bias -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16               // consumed 16 source pixels
    ja         convertloop
    pop        edi
    pop        esi
    ret
  }
}
    240 
// Convert a 16x2 block of BGRA pixels to 8 U and 8 V values.
// Identical to ARGBToUVRow_SSSE3 except the coefficient tables match the
// BGRA byte order.  Same preconditions: both source rows 16-byte aligned,
// width a positive multiple of 16.
__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
__asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, _kBGRAToU
    movdqa     xmm6, _kBGRAToV
    movdqa     xmm5, _kAddUV128
    sub        edi, edx             // edi = dst_v - dst_u offset

 convertloop :
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]        // average with the row below
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88         // even pixels
    shufps     xmm4, xmm1, 0xdd         // odd pixels
    pavgb      xmm0, xmm4               // horizontal average
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8     // signed descale
    psraw      xmm1, 8
    packsswb   xmm0, xmm1  // low = 8 U, high = 8 V
    paddb      xmm0, xmm5            // +128 bias -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         convertloop
    pop        edi
    pop        esi
    ret
  }
}
    304 
// Convert a 16x2 block of ABGR pixels to 8 U and 8 V values.
// Identical to ARGBToUVRow_SSSE3 except the coefficient tables match the
// ABGR byte order.  Same preconditions: both source rows 16-byte aligned,
// width a positive multiple of 16.
__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
__asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, _kABGRToU
    movdqa     xmm6, _kABGRToV
    movdqa     xmm5, _kAddUV128
    sub        edi, edx             // edi = dst_v - dst_u offset

 convertloop :
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]        // average with the row below
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax,  [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88         // even pixels
    shufps     xmm4, xmm1, 0xdd         // odd pixels
    pavgb      xmm0, xmm4               // horizontal average
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8     // signed descale
    psraw      xmm1, 8
    packsswb   xmm0, xmm1  // low = 8 U, high = 8 V
    paddb      xmm0, xmm5            // +128 bias -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps     qword ptr [edx], xmm0 // U
    movhps     qword ptr [edx + edi], xmm0 // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         convertloop
    pop        edi
    pop        esi
    ret
  }
}
    368 
// Expand 16 BG24 pixels (48 bytes, 3 bytes/pixel) to 16 ARGB pixels
// (64 bytes), forcing alpha to 0xff.
// In:  src_bg24 - 16-byte aligned; reads 48 bytes per iteration
//      dst_argb - 16-byte aligned; writes 64 bytes per iteration
//      pix      - assumed a positive multiple of 16; no tail handling
// Stores are interleaved with the shuffles to hide latency, so the four
// output writes are not in address order.
__declspec(naked)
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
__asm {
    mov       eax, [esp + 4]   // src_bg24
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
    pslld     xmm7, 24
    movdqa    xmm6, _kShuffleMaskBG24ToARGB

 convertloop :
    movdqa    xmm0, [eax]      // 48 bytes = 16 packed 3-byte pixels
    movdqa    xmm1, [eax + 16]
    movdqa    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15] } = pixels 8-11
    pshufb    xmm2, xmm6       // expand 3->4 bytes per pixel
    por       xmm2, xmm7       // set alpha = 0xff
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15] } = pixels 4-7
    pshufb    xmm0, xmm6       // pixels 0-3
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm7
    pshufb    xmm1, xmm6
    movdqa    [edx], xmm0
    por       xmm1, xmm7
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15] } = pixels 12-15
    pshufb    xmm3, xmm6
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm7
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    ja        convertloop
    ret
  }
}
    406 
// Expand 16 RAW pixels (48 bytes, 3 bytes/pixel, R and B swapped relative
// to BG24) to 16 ARGB pixels (64 bytes), forcing alpha to 0xff.
// Identical to BG24ToARGBRow_SSSE3 except for the shuffle table.
// Preconditions: 16-byte aligned src/dst, pix a positive multiple of 16.
__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
__asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
    pslld     xmm7, 24
    movdqa    xmm6, _kShuffleMaskRAWToARGB

 convertloop :
    movdqa    xmm0, [eax]      // 48 bytes = 16 packed 3-byte pixels
    movdqa    xmm1, [eax + 16]
    movdqa    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15] } = pixels 8-11
    pshufb    xmm2, xmm6       // expand 3->4 bytes per pixel, swap R/B
    por       xmm2, xmm7       // set alpha = 0xff
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15] } = pixels 4-7
    pshufb    xmm0, xmm6       // pixels 0-3
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm7
    pshufb    xmm1, xmm6
    movdqa    [edx], xmm0
    por       xmm1, xmm7
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15] } = pixels 12-15
    pshufb    xmm3, xmm6
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm7
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    ja        convertloop
    ret
  }
}
    445 
// Convert a row of 4:2:2-subsampled YUV to RGB32 (2 pixels per iteration:
// 2 Y samples share one U and one V) using MMX table lookups.
// _kCoefficientsRgbY is defined elsewhere; from the offsets used it holds
// 256 8-byte Y entries at +0, U entries at +2048, V entries at +4096 --
// TODO confirm against the table's definition.
// width is assumed a positive multiple of 2; no tail handling.
// Register roles: edx=Y ptr, edi=U ptr, esi=V ptr, ebp=output, ecx=count.
// pushad saves all GPRs (hence the +32 in argument offsets), freeing
// ebp/ebx/esi/edi for use.
// NOTE(review): movntq (non-temporal) stores are not followed by sfence,
// and the MMX state is not cleared with emms before ret -- presumably the
// caller handles both; confirm at call sites.
__declspec(naked)
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // y_buf
    mov       edi, [esp + 32 + 8]   // u_buf
    mov       esi, [esp + 32 + 12]  // v_buf
    mov       ebp, [esp + 32 + 16]  // rgb_buf
    mov       ecx, [esp + 32 + 20]  // width

 convertloop :
    movzx     eax, byte ptr [edi]   // U sample
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]   // V sample
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]  // chroma (U part)
    movzx     eax, byte ptr [edx]   // Y0
    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]  // + V part
    movzx     ebx, byte ptr [edx + 1]                     // Y1
    movq      mm1, [_kCoefficientsRgbY + 8 * eax]         // luma for pixel 0
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsRgbY + 8 * ebx]         // luma for pixel 1
    paddsw    mm1, mm0              // combine luma + chroma, saturating
    paddsw    mm2, mm0
    psraw     mm1, 6                // descale fixed-point channels
    psraw     mm2, 6
    packuswb  mm1, mm2              // 2 pixels -> 8 bytes, unsigned saturate
    movntq    [ebp], mm1            // non-temporal store, bypasses cache
    lea       ebp, [ebp + 8]
    sub       ecx, 2                // consumed 2 pixels
    ja        convertloop

    popad
    ret
  }
}
    486 
// Convert a row of 4:2:2-subsampled YUV to BGRA, 2 pixels per iteration.
// Identical to FastConvertYUVToRGB32Row except it indexes the BGRA-ordered
// table _kCoefficientsBgraY (defined elsewhere; Y at +0, U at +2048,
// V at +4096, inferred from the offsets -- TODO confirm).
// width assumed a positive even value; no tail handling.
// NOTE(review): no sfence after movntq and no emms before ret -- presumably
// handled by the caller; confirm.
__declspec(naked)
void FastConvertYUVToBGRARow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // y_buf
    mov       edi, [esp + 32 + 8]   // u_buf
    mov       esi, [esp + 32 + 12]  // v_buf
    mov       ebp, [esp + 32 + 16]  // rgb_buf
    mov       ecx, [esp + 32 + 20]  // width

 convertloop :
    movzx     eax, byte ptr [edi]   // U sample
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]   // V sample
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]  // chroma (U part)
    movzx     eax, byte ptr [edx]   // Y0
    paddsw    mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]  // + V part
    movzx     ebx, byte ptr [edx + 1]                      // Y1
    movq      mm1, [_kCoefficientsBgraY + 8 * eax]         // luma pixel 0
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsBgraY + 8 * ebx]         // luma pixel 1
    paddsw    mm1, mm0              // combine, saturating
    paddsw    mm2, mm0
    psraw     mm1, 6                // descale
    psraw     mm2, 6
    packuswb  mm1, mm2              // pack 2 pixels
    movntq    [ebp], mm1            // non-temporal store
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop

    popad
    ret
  }
}
    527 
// Convert a row of 4:2:2-subsampled YUV to ABGR, 2 pixels per iteration.
// Identical to FastConvertYUVToRGB32Row except it indexes the ABGR-ordered
// table _kCoefficientsAbgrY (defined elsewhere; Y at +0, U at +2048,
// V at +4096, inferred from the offsets -- TODO confirm).
// width assumed a positive even value; no tail handling.
// NOTE(review): no sfence after movntq and no emms before ret -- presumably
// handled by the caller; confirm.
__declspec(naked)
void FastConvertYUVToABGRRow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // y_buf
    mov       edi, [esp + 32 + 8]   // u_buf
    mov       esi, [esp + 32 + 12]  // v_buf
    mov       ebp, [esp + 32 + 16]  // rgb_buf
    mov       ecx, [esp + 32 + 20]  // width

 convertloop :
    movzx     eax, byte ptr [edi]   // U sample
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]   // V sample
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]  // chroma (U part)
    movzx     eax, byte ptr [edx]   // Y0
    paddsw    mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]  // + V part
    movzx     ebx, byte ptr [edx + 1]                      // Y1
    movq      mm1, [_kCoefficientsAbgrY + 8 * eax]         // luma pixel 0
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsAbgrY + 8 * ebx]         // luma pixel 1
    paddsw    mm1, mm0              // combine, saturating
    paddsw    mm2, mm0
    psraw     mm1, 6                // descale
    psraw     mm2, 6
    packuswb  mm1, mm2              // pack 2 pixels
    movntq    [ebp], mm1            // non-temporal store
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop

    popad
    ret
  }
}
    568 
// Convert a row of 4:4:4 YUV (one U and V per Y) to RGB32, 1 pixel per
// iteration, using the same _kCoefficientsRgbY lookup table as the 4:2:2
// variant (Y at +0, U at +2048, V at +4096, inferred from the offsets).
// width assumed positive; no special tail handling needed (1 pixel/step).
// NOTE(review): unlike the 4:2:2 variant this uses a plain movd store, but
// it still leaves MMX state live (no emms) -- presumably handled by the
// caller; confirm.
__declspec(naked)
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // Y
    mov       edi, [esp + 32 + 8]   // U
    mov       esi, [esp + 32 + 12]  // V
    mov       ebp, [esp + 32 + 16]  // rgb
    mov       ecx, [esp + 32 + 20]  // width

 convertloop :
    movzx     eax, byte ptr [edi]   // U sample
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]   // V sample
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]  // chroma (U part)
    movzx     eax, byte ptr [edx]   // Y sample
    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]  // + V part
    lea       edx, [edx + 1]
    paddsw    mm0, [_kCoefficientsRgbY + 8 * eax]         // + luma
    psraw     mm0, 6                // descale
    packuswb  mm0, mm0              // 1 pixel -> 4 bytes (duplicated)
    movd      [ebp], mm0            // store low 4 bytes
    lea       ebp, [ebp + 4]
    sub       ecx, 1
    ja        convertloop

    popad
    ret
  }
}
    604 
// Convert a row of Y (luma only, no chroma) to gray RGB32, 2 pixels per
// iteration via the _kCoefficientsRgbY table's Y entries.
// width assumed a positive even value; no tail handling.
// Only ebx is callee-saved among the registers used, hence the single
// push/pop; argument offsets include that 4-byte save.
// NOTE(review): MMX state left live (no emms) before ret -- presumably
// handled by the caller; confirm.
__declspec(naked)
void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width) {
  __asm {
    push      ebx
    mov       eax, [esp + 4 + 4]   // Y
    mov       edx, [esp + 4 + 8]   // rgb
    mov       ecx, [esp + 4 + 12]  // width

 convertloop :
    movzx     ebx, byte ptr [eax]   // Y0
    movq      mm0, [_kCoefficientsRgbY + 8 * ebx]
    psraw     mm0, 6                // descale pixel 0
    movzx     ebx, byte ptr [eax + 1]  // Y1
    movq      mm1, [_kCoefficientsRgbY + 8 * ebx]
    psraw     mm1, 6                // descale pixel 1
    packuswb  mm0, mm1              // 2 pixels -> 8 bytes
    lea       eax, [eax + 2]
    movq      [edx], mm0
    lea       edx, [edx + 8]
    sub       ecx, 2
    ja        convertloop

    pop       ebx
    ret
  }
}
    633 
    634 #endif
    635 
    636 }  // extern "C"
    637