// source/planar_functions.cc (libyuv) — planar copy/convert functions.
      1 /*
      2  *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/planar_functions.h"
     12 
     13 #include <string.h>
     14 
     15 #include "libyuv/cpu_id.h"
     16 #include "row.h"
     17 
     18 namespace libyuv {
     19 
     20 #if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
     21 #define HAS_SPLITUV_NEON
     22 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
     23 // Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
     24 static void SplitUV_NEON(const uint8* src_uv,
     25                          uint8* dst_u, uint8* dst_v, int pix) {
     26   __asm__ volatile
     27   (
     28     "1:\n"
     29     "vld2.u8    {q0,q1}, [%0]!    \n"  // load 16 pairs of UV
     30     "vst1.u8    {q0}, [%1]!       \n"  // store U
     31     "vst1.u8    {q1}, [%2]!       \n"  // Store V
     32     "subs       %3, %3, #16       \n"  // 16 processed per loop
     33     "bhi        1b                \n"
     34     : "+r"(src_uv),
     35       "+r"(dst_u),
     36       "+r"(dst_v),
     37       "+r"(pix)             // Output registers
     38     :                       // Input registers
     39     : "q0", "q1"            // Clobber List
     40   );
     41 }
     42 
     43 #elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
     44     && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
     45 #if defined(_MSC_VER)
     46 #define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
     47 #else
     48 #define TALIGN16(t, var) t var __attribute__((aligned(16)))
     49 #endif
     50 
// Shuffle table for converting ABGR to ARGB (pshufb-style byte indices,
// one 4-byte pixel per group of 4 entries).
// NOTE(review): with _MSC_VER, TALIGN16 expands to 'static __declspec(...)'
// and prepends '_' to the name, which conflicts with the extern "C" here —
// confirm the MSVC build path actually compiles these definitions.
extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
     60 
     61 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
     62 #define HAS_SPLITUV_SSE2
// De-interleave one row of packed UV into separate U and V rows using SSE2.
// Naked cdecl function: arguments read relative to esp after the edi push.
// Requires 16-byte aligned pointers (movdqa) and pix a multiple of 16.
__declspec(naked)
static void SplitUV_SSE2(const uint8* src_uv,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm7   // even bytes (U channel of the UV pairs)
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    psrlw      xmm2, 8      // odd bytes (V channel)
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqa     [edi], xmm2
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         wloop
    pop        edi
    ret
  }
}
     97 
     98 #elif (defined(__x86_64__) || defined(__i386__)) && \
     99     !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
    100 #define HAS_SPLITUV_SSE2
// De-interleave one row of packed UV into separate U and V rows using SSE2
// (GCC inline asm).  Requires 16-byte aligned pointers (movdqa) and pix a
// multiple of 16.
// NOTE(review): xmm0-xmm3/xmm7 are modified but not declared in the clobber
// list — confirm this is safe for all targeted compilers/ABIs.
static void SplitUV_SSE2(const uint8* src_uv,
                         uint8* dst_u, uint8* dst_v, int pix) {
 asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "psrlw      $0x8,%%xmm2\n"
  "psrlw      $0x8,%%xmm3\n"
  "packuswb   %%xmm3,%%xmm2\n"
  "movdqa     %%xmm2,(%2)\n"
  "lea        0x10(%2),%2\n"
  "sub        $0x10,%3\n"
  "ja         1b\n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(pix)         // %3
  :
  : "memory"
);
}
    132 #endif
    133 #endif
    134 
    135 static void SplitUV_C(const uint8* src_uv,
    136                       uint8* dst_u, uint8* dst_v, int pix) {
    137   // Copy a row of UV.
    138   for (int x = 0; x < pix; ++x) {
    139     dst_u[0] = src_uv[0];
    140     dst_v[0] = src_uv[1];
    141     src_uv += 2;
    142     dst_u += 1;
    143     dst_v += 1;
    144   }
    145 }
    146 
    147 static void I420CopyPlane(const uint8* src_y, int src_stride_y,
    148                           uint8* dst_y, int dst_stride_y,
    149                           int width, int height) {
    150   // Copy plane
    151   for (int y = 0; y < height; ++y) {
    152     memcpy(dst_y, src_y, width);
    153     src_y += src_stride_y;
    154     dst_y += dst_stride_y;
    155   }
    156 }
    157 
    158 // Copy I420 with optional flipping
    159 int I420Copy(const uint8* src_y, int src_stride_y,
    160              const uint8* src_u, int src_stride_u,
    161              const uint8* src_v, int src_stride_v,
    162              uint8* dst_y, int dst_stride_y,
    163              uint8* dst_u, int dst_stride_u,
    164              uint8* dst_v, int dst_stride_v,
    165              int width, int height) {
    166   if (!src_y || !src_u || !src_v ||
    167       !dst_y || !dst_u || !dst_v ||
    168       width <= 0 || height == 0) {
    169     return -1;
    170   }
    171 
    172   // Negative height means invert the image.
    173   if (height < 0) {
    174     height = -height;
    175     int halfheight = (height + 1) >> 1;
    176     src_y = src_y + (height - 1) * src_stride_y;
    177     src_u = src_u + (halfheight - 1) * src_stride_u;
    178     src_v = src_v + (halfheight - 1) * src_stride_v;
    179     src_stride_y = -src_stride_y;
    180     src_stride_u = -src_stride_u;
    181     src_stride_v = -src_stride_v;
    182   }
    183 
    184   int halfwidth = (width + 1) >> 1;
    185   int halfheight = (height + 1) >> 1;
    186   I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
    187   I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
    188   I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
    189   return 0;
    190 }
    191 
    192 // SetRows32 writes 'count' bytes using a 32 bit value repeated
    193 
    194 #if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
    195 #define HAS_SETROW_NEON
// Fill 'count' bytes at dst with the 32-bit value v32 repeated, using NEON.
// count must be a multiple of 16; dst should satisfy the alignment checks
// done by the caller (I420SetPlane).
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
  __asm__ volatile
  (
    "vdup.u32   q0, %2            \n"  // duplicate 4 ints
    "1:\n"
    "vst1.u32   {q0}, [%0]!       \n"  // store
    "subs       %1, %1, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
  : "+r"(dst),  // %0
    "+r"(count) // %1
  : "r"(v32)    // %2
  : "q0", "memory"
  );
}
    210 
    211 #elif defined(WIN32) && !defined(COVERAGE_ENABLED)
    212 #define HAS_SETROW_SSE2
// Fill 'count' bytes at dst with the 32-bit value v32 repeated, using SSE2.
// Naked cdecl; count must be a multiple of 16 and dst 16-byte aligned
// (movdqa store).
__declspec(naked)
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
  __asm {
    mov        eax, [esp + 4]    // dst
    movd       xmm7, [esp + 8]   // v32
    mov        ecx, [esp + 12]   // count
    pshufd     xmm7, xmm7, 0     // broadcast v32 to all 4 lanes

  wloop:
    movdqa     [eax], xmm7
    lea        eax, [eax + 16]
    sub        ecx, 16
    ja         wloop
    ret
  }
}
    229 
    230 #elif (defined(__x86_64__) || defined(__i386__)) && \
    231     !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
    232 
    233 #define HAS_SETROW_SSE2
// Fill 'count' bytes at dst with the 32-bit value v32 repeated, using SSE2
// (GCC inline asm).  count must be a multiple of 16 and dst 16-byte aligned.
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
  asm volatile(
  "movd       %2, %%xmm7\n"
  "pshufd     $0x0,%%xmm7,%%xmm7\n"
"1:"
  "movdqa     %%xmm7,(%0)\n"
  "lea        0x10(%0),%0\n"
  "sub        $0x10,%1\n"
  "ja         1b\n"
  : "+r"(dst),  // %0
    "+r"(count) // %1
  : "r"(v32)    // %2
  : "memory"
);
}
    249 #endif
    250 
    251 static void SetRow8_C(uint8* dst, uint32 v8, int count) {
    252   memset(dst, v8, count);
    253 }
    254 
    255 static void I420SetPlane(uint8* dst_y, int dst_stride_y,
    256                          int width, int height,
    257                          int value) {
    258   void (*SetRow)(uint8* dst, uint32 value, int pix);
    259 #if defined(HAS_SETROW_NEON)
    260   if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
    261       (width % 16 == 0) &&
    262       IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    263     SetRow = SetRow32_NEON;
    264   } else
    265 #elif defined(HAS_SETROW_SSE2)
    266   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
    267       (width % 16 == 0) &&
    268       IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    269     SetRow = SetRow32_SSE2;
    270   } else
    271 #endif
    272   {
    273     SetRow = SetRow8_C;
    274   }
    275 
    276   uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
    277   // Set plane
    278   for (int y = 0; y < height; ++y) {
    279     SetRow(dst_y, v32, width);
    280     dst_y += dst_stride_y;
    281   }
    282 }
    283 
    284 // Draw a rectangle into I420
    285 int I420Rect(uint8* dst_y, int dst_stride_y,
    286              uint8* dst_u, int dst_stride_u,
    287              uint8* dst_v, int dst_stride_v,
    288              int x, int y,
    289              int width, int height,
    290              int value_y, int value_u, int value_v) {
    291   if (!dst_y || !dst_u || !dst_v ||
    292       width <= 0 || height == 0 ||
    293       x < 0 || y < 0 ||
    294       value_y < 0 || value_y > 255 ||
    295       value_u < 0 || value_u > 255 ||
    296       value_v < 0 || value_v > 255) {
    297     return -1;
    298   }
    299   // Negative height means invert the image.
    300   if (height < 0) {
    301     height = -height;
    302     int halfheight = (height + 1) >> 1;
    303     dst_y = dst_y + (height - 1) * dst_stride_y;
    304     dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    305     dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    306     dst_stride_y = -dst_stride_y;
    307     dst_stride_u = -dst_stride_u;
    308     dst_stride_v = -dst_stride_v;
    309   }
    310 
    311   int halfwidth = (width + 1) >> 1;
    312   int halfheight = (height + 1) >> 1;
    313   uint8* start_y = dst_y + y * dst_stride_y + x;
    314   uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
    315   uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
    316 
    317   I420SetPlane(start_y, dst_stride_y, width, height, value_y);
    318   I420SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
    319   I420SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
    320   return 0;
    321 }
    322 
    323 // Helper function to copy yuv data without scaling.  Used
    324 // by our jpeg conversion callbacks to incrementally fill a yuv image.
    325 int I422ToI420(const uint8* src_y, int src_stride_y,
    326                const uint8* src_u, int src_stride_u,
    327                const uint8* src_v, int src_stride_v,
    328                uint8* dst_y, int dst_stride_y,
    329                uint8* dst_u, int dst_stride_u,
    330                uint8* dst_v, int dst_stride_v,
    331                int width, int height) {
    332   // Negative height means invert the image.
    333   if (height < 0) {
    334     height = -height;
    335     src_y = src_y + (height - 1) * src_stride_y;
    336     src_u = src_u + (height - 1) * src_stride_u;
    337     src_v = src_v + (height - 1) * src_stride_v;
    338     src_stride_y = -src_stride_y;
    339     src_stride_u = -src_stride_u;
    340     src_stride_v = -src_stride_v;
    341   }
    342 
    343   // Copy Y plane
    344   I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
    345 
    346   // SubSample UV planes.
    347   int x, y;
    348   int halfwidth = (width + 1) >> 1;
    349   for (y = 0; y < height; y += 2) {
    350     const uint8* u0 = src_u;
    351     const uint8* u1 = src_u + src_stride_u;
    352     if ((y + 1) >= height) {
    353       u1 = u0;
    354     }
    355     for (x = 0; x < halfwidth; ++x) {
    356       dst_u[x] = (u0[x] + u1[x] + 1) >> 1;
    357     }
    358     src_u += src_stride_u * 2;
    359     dst_u += dst_stride_u;
    360   }
    361   for (y = 0; y < height; y += 2) {
    362     const uint8* v0 = src_v;
    363     const uint8* v1 = src_v + src_stride_v;
    364     if ((y + 1) >= height) {
    365       v1 = v0;
    366     }
    367     for (x = 0; x < halfwidth; ++x) {
    368       dst_v[x] = (v0[x] + v1[x] + 1) >> 1;
    369     }
    370     src_v += src_stride_v * 2;
    371     dst_v += dst_stride_v;
    372   }
    373   return 0;
    374 }
    375 
    376 static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
    377                            uint8* dst, int dst_stride_frame,
    378                            int width, int height) {
    379   // Copy plane
    380   for (int y = 0; y < height; y += 2) {
    381     memcpy(dst, src, width);
    382     src += src_stride_0;
    383     dst += dst_stride_frame;
    384     memcpy(dst, src, width);
    385     src += src_stride_1;
    386     dst += dst_stride_frame;
    387   }
    388 }
    389 
    390 // Support converting from FOURCC_M420
    391 // Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
    392 // easy conversion to I420.
    393 // M420 format description:
    394 // M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
    395 // Chroma is half width / half height. (420)
    396 // src_stride_m420 is row planar.  Normally this will be the width in pixels.
    397 //   The UV plane is half width, but 2 values, so src_stride_m420 applies to
    398 //   this as well as the two Y planes.
// Shared implementation for biplanar (Y + interleaved UV) formats to I420.
// src_stride_y0/src_stride_y1 alternate per pair of Y rows: M420 passes two
// different strides, NV12 passes the same stride twice.  Always returns 0.
static int X420ToI420(const uint8* src_y,
                      int src_stride_y0, int src_stride_y1,
                      const uint8* src_uv, int src_stride_uv,
                      uint8* dst_y, int dst_stride_y,
                      uint8* dst_u, int dst_stride_u,
                      uint8* dst_v, int dst_stride_v,
                      int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }

  int halfwidth = (width + 1) >> 1;
  // Pick a SIMD UV splitter when CPU support and alignment requirements
  // are met; otherwise fall back to the C implementation.
  void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITUV_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (halfwidth % 16 == 0) &&
      IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
      IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
      IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
    SplitUV = SplitUV_NEON;
  } else
#elif defined(HAS_SPLITUV_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (halfwidth % 16 == 0) &&
      IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
      IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
      IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
    SplitUV = SplitUV_SSE2;
  } else
#endif
  {
    SplitUV = SplitUV_C;
  }

  // Copy the Y plane, alternating between the two source strides.
  I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
                 width, height);

  // De-interleave one UV row per pair of Y rows.
  int halfheight = (height + 1) >> 1;
  for (int y = 0; y < halfheight; ++y) {
    // Copy a row of UV.
    SplitUV(src_uv, dst_u, dst_v, halfwidth);
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
    src_uv += src_stride_uv;
  }
  return 0;
}
    454 
    455 // Convert M420 to I420.
    456 int M420ToI420(const uint8* src_m420, int src_stride_m420,
    457                uint8* dst_y, int dst_stride_y,
    458                uint8* dst_u, int dst_stride_u,
    459                uint8* dst_v, int dst_stride_v,
    460                int width, int height) {
    461   return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
    462                     src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
    463                     dst_y, dst_stride_y,
    464                     dst_u, dst_stride_u,
    465                     dst_v, dst_stride_v,
    466                     width, height);
    467 }
    468 
    469 // Convert NV12 to I420.
    470 int NV12ToI420(const uint8* src_y, int src_stride_y,
    471                const uint8* src_uv, int src_stride_uv,
    472                uint8* dst_y, int dst_stride_y,
    473                uint8* dst_u, int dst_stride_u,
    474                uint8* dst_v, int dst_stride_v,
    475                int width, int height) {
    476   return X420ToI420(src_y, src_stride_y, src_stride_y,
    477                     src_uv, src_stride_uv,
    478                     dst_y, dst_stride_y,
    479                     dst_u, dst_stride_u,
    480                     dst_v, dst_stride_v,
    481                     width, height);
    482 }
    483 
    484 // Convert NV12 to I420.  Deprecated.
    485 int NV12ToI420(const uint8* src_y,
    486                const uint8* src_uv,
    487                int src_stride_frame,
    488                uint8* dst_y, int dst_stride_y,
    489                uint8* dst_u, int dst_stride_u,
    490                uint8* dst_v, int dst_stride_v,
    491                int width, int height) {
    492   return X420ToI420(src_y, src_stride_frame, src_stride_frame,
    493                     src_uv, src_stride_frame,
    494                     dst_y, dst_stride_y,
    495                     dst_u, dst_stride_u,
    496                     dst_v, dst_stride_v,
    497                     width, height);
    498 }
    499 
    500 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
    501 #define HAS_SPLITYUY2_SSE2
// Split a row of YUY2 into a full row of Y (pix bytes) plus half-width
// rows of U and V (pix/2 bytes each) using SSE2.  Naked cdecl; pix must be
// a multiple of 16 with alignment per the movdqa/movq stores.
__declspec(naked)
static void SplitYUY2_SSE2(const uint8* src_yuy2,
                           uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        edx, [esp + 8 + 8]    // dst_y
    mov        esi, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm2, xmm7   // even bytes are Y
    pand       xmm3, xmm7
    packuswb   xmm2, xmm3
    movdqa     [edx], xmm2
    lea        edx, [edx + 16]
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm7  // U
    packuswb   xmm0, xmm0
    movq       qword ptr [esi], xmm0
    lea        esi, [esi + 8]
    psrlw      xmm1, 8     // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edi], xmm1
    lea        edi, [edi + 8]
    sub        ecx, 16
    ja         wloop

    pop        edi
    pop        esi
    ret
  }
}
    547 
    548 #elif (defined(__x86_64__) || defined(__i386__)) && \
    549     !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
    550 #define HAS_SPLITYUY2_SSE2
// Split a row of YUY2 into a full row of Y plus half-width rows of U and V
// using SSE2 (GCC inline asm).  pix must be a multiple of 16 with
// alignment per the movdqa/movq stores.
// NOTE(review): xmm0-xmm3/xmm7 are modified but not in the clobber list —
// confirm this is safe for the targeted compilers.
static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
                           uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "pand       %%xmm7,%%xmm2\n"
  "pand       %%xmm7,%%xmm3\n"
  "packuswb   %%xmm3,%%xmm2\n"
  "movdqa     %%xmm2,(%1)\n"
  "lea        0x10(%1),%1\n"
  "psrlw      $0x8,%%xmm0\n"
  "psrlw      $0x8,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,%%xmm1\n"
  "pand       %%xmm7,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%2)\n"
  "lea        0x8(%2),%2\n"
  "psrlw      $0x8,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm1\n"
  "movq       %%xmm1,(%3)\n"
  "lea        0x8(%3),%3\n"
  "sub        $0x10,%4\n"
  "ja         1b\n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_y),       // %1
    "+r"(dst_u),       // %2
    "+r"(dst_v),       // %3
    "+r"(pix)          // %4
  :
  : "memory"
);
}
    590 #endif
    591 
    592 static void SplitYUY2_C(const uint8* src_yuy2,
    593                         uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
    594   // Copy a row of YUY2.
    595   for (int x = 0; x < pix; x += 2) {
    596     dst_y[0] = src_yuy2[0];
    597     dst_y[1] = src_yuy2[2];
    598     dst_u[0] = src_yuy2[1];
    599     dst_v[0] = src_yuy2[3];
    600     src_yuy2 += 4;
    601     dst_y += 2;
    602     dst_u += 1;
    603     dst_v += 1;
    604   }
    605 }
    606 
    607 // Convert Q420 to I420.
    608 // Format is rows of YY/YUYV
// Convert Q420 (alternating rows of plain Y and YUYV) to I420.
// Each loop iteration consumes one Y-only row plus one YUYV row and emits
// two Y rows and one row each of U and V.  Always returns 0.
int Q420ToI420(const uint8* src_y, int src_stride_y,
               const uint8* src_yuy2, int src_stride_yuy2,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }
  // Select the SSE2 YUY2 splitter when its width and alignment
  // requirements hold; otherwise use the C fallback.
  void (*SplitYUY2)(const uint8* src_yuy2,
                    uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITYUY2_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    SplitYUY2 = SplitYUY2_SSE2;
  } else
#endif
  {
    SplitYUY2 = SplitYUY2_C;
  }
  for (int y = 0; y < height; y += 2) {
    // Plain Y row: straight copy.
    memcpy(dst_y, src_y, width);
    dst_y += dst_stride_y;
    src_y += src_stride_y;

    // Copy a row of YUY2.
    SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width);
    dst_y += dst_stride_y;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
    src_yuy2 += src_stride_yuy2;
  }
  return 0;
}
    655 
    656 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
    657 #define HAS_YUY2TOI420ROW_SSE2
// Extract the Y channel (even bytes) from one row of YUY2 using SSE2.
// Naked cdecl; pix must be a multiple of 16 and pointers 16-byte aligned
// (movdqa).
__declspec(naked)
void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
                         uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix
    pcmpeqb    xmm7, xmm7        // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm7   // even bytes are Y
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop
    ret
  }
}
    682 
    683 __declspec(naked)
    684 void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
    685                           uint8* dst_u, uint8* dst_y, int pix) {
    686   __asm {
    687     push       esi
    688     push       edi
    689     mov        eax, [esp + 8 + 4]    // src_yuy2
    690     mov        esi, [esp + 8 + 8]    // stride_yuy2
    691     mov        edx, [esp + 8 + 12]   // dst_u
    692     mov        edi, [esp + 8 + 16]   // dst_v
    693     mov        ecx, [esp + 8 + 20]   // pix
    694     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    695     psrlw      xmm7, 8
    696 
    697   wloop:
    698     movdqa     xmm0, [eax]
    699     movdqa     xmm1, [eax + 16]
    700     movdqa     xmm2, [eax + esi]
    701     movdqa     xmm3, [eax + esi + 16]
    702     lea        eax,  [eax + 32]
    703     pavgb      xmm0, xmm2
    704     pavgb      xmm1, xmm3
    705     psrlw      xmm0, 8      // YUYV -> UVUV
    706     psrlw      xmm1, 8
    707     packuswb   xmm0, xmm1
    708     movdqa     xmm1, xmm0
    709     pand       xmm0, xmm7  // U
    710     packuswb   xmm0, xmm0
    711     movq       qword ptr [edx], xmm0
    712     lea        edx, [edx + 8]
    713     psrlw      xmm1, 8     // V
    714     packuswb   xmm1, xmm1
    715     movq       qword ptr [edi], xmm1
    716     lea        edi, [edi + 8]
    717     sub        ecx, 16
    718     ja         wloop
    719 
    720     pop        edi
    721     pop        esi
    722     ret
    723   }
    724 }
    725 
    726 #define HAS_UYVYTOI420ROW_SSE2
// Extract the Y channel (odd bytes) from one row of UYVY using SSE2.
// Naked cdecl; pix must be a multiple of 16 and pointers 16-byte aligned
// (movdqa).
__declspec(naked)
void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
                         uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_uyvy
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8    // odd bytes are Y
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop
    ret
  }
}
    749 
    750 __declspec(naked)
    751 void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
    752                           uint8* dst_u, uint8* dst_y, int pix) {
    753   __asm {
    754     push       esi
    755     push       edi
    756     mov        eax, [esp + 8 + 4]    // src_yuy2
    757     mov        esi, [esp + 8 + 8]    // stride_yuy2
    758     mov        edx, [esp + 8 + 12]   // dst_u
    759     mov        edi, [esp + 8 + 16]   // dst_v
    760     mov        ecx, [esp + 8 + 20]   // pix
    761     pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    762     psrlw      xmm7, 8
    763 
    764   wloop:
    765     movdqa     xmm0, [eax]
    766     movdqa     xmm1, [eax + 16]
    767     movdqa     xmm2, [eax + esi]
    768     movdqa     xmm3, [eax + esi + 16]
    769     lea        eax,  [eax + 32]
    770     pavgb      xmm0, xmm2
    771     pavgb      xmm1, xmm3
    772     pand       xmm0, xmm7   // UYVY -> UVUV
    773     pand       xmm1, xmm7
    774     packuswb   xmm0, xmm1
    775     movdqa     xmm1, xmm0
    776     pand       xmm0, xmm7  // U
    777     packuswb   xmm0, xmm0
    778     movq       qword ptr [edx], xmm0
    779     lea        edx, [edx + 8]
    780     psrlw      xmm1, 8     // V
    781     packuswb   xmm1, xmm1
    782     movq       qword ptr [edi], xmm1
    783     lea        edi, [edi + 8]
    784     sub        ecx, 16
    785     ja         wloop
    786 
    787     pop        edi
    788     pop        esi
    789     ret
    790   }
    791 }
    792 
    793 #elif (defined(__x86_64__) || defined(__i386__)) && \
    794     !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
    795 
    796 #define HAS_YUY2TOI420ROW_SSE2
// Extract the Y channel (even bytes) from one row of YUY2 using SSE2
// (GCC inline asm).  pix must be a multiple of 16 and pointers 16-byte
// aligned (movdqa).
// NOTE(review): xmm0/xmm1/xmm7 are modified but not in the clobber list —
// confirm this is safe for the targeted compilers.
static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
                                uint8* dst_y, int pix) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory"
);
}
    820 
    821 static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
    822                                  uint8* dst_u, uint8* dst_y, int pix) {
    823   asm volatile(
    824   "pcmpeqb    %%xmm7,%%xmm7\n"
    825   "psrlw      $0x8,%%xmm7\n"
    826 "1:"
    827   "movdqa     (%0),%%xmm0\n"
    828   "movdqa     0x10(%0),%%xmm1\n"
    829   "movdqa     (%0,%4,1),%%xmm2\n"
    830   "movdqa     0x10(%0,%4,1),%%xmm3\n"
    831   "lea        0x20(%0),%0\n"
    832   "pavgb      %%xmm2,%%xmm0\n"
    833   "pavgb      %%xmm3,%%xmm1\n"
    834   "psrlw      $0x8,%%xmm0\n"
    835   "psrlw      $0x8,%%xmm1\n"
    836   "packuswb   %%xmm1,%%xmm0\n"
    837   "movdqa     %%xmm0,%%xmm1\n"
    838   "pand       %%xmm7,%%xmm0\n"
    839   "packuswb   %%xmm0,%%xmm0\n"
    840   "movq       %%xmm0,(%1)\n"
    841   "lea        0x8(%1),%1\n"
    842   "psrlw      $0x8,%%xmm1\n"
    843   "packuswb   %%xmm1,%%xmm1\n"
    844   "movq       %%xmm1,(%2)\n"
    845   "lea        0x8(%2),%2\n"
    846   "sub        $0x10,%3\n"
    847   "ja         1b\n"
    848   : "+r"(src_yuy2),    // %0
    849     "+r"(dst_u),       // %1
    850     "+r"(dst_y),       // %2
    851     "+r"(pix)          // %3
    852   : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
    853   : "memory"
    854 );
    855 }
    856 #define HAS_UYVYTOI420ROW_SSE2
// Copies the Y (luma) bytes of a UYVY row into a packed luma row.
// UYVY stores Y in the odd bytes, so an 8-bit right shift of each 16-bit
// lane keeps luma; packuswb repacks 16 pixels per loop iteration.
// Requires: pix a multiple of 16; src/dst 16-byte aligned (movdqa).
static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
                                uint8* dst_y, int pix) {
  asm volatile(
"1:"
  "movdqa     (%0),%%xmm0\n"   // load 16 UYVY pixels (32 bytes)
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "psrlw      $0x8,%%xmm0\n"   // keep the odd (Y) bytes
  "psrlw      $0x8,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n" // pack 16 luma bytes
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"      // 16 pixels per iteration
  "ja         1b\n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory"
);
}
    878 
    879 static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
    880                                  uint8* dst_u, uint8* dst_y, int pix) {
    881   asm volatile(
    882   "pcmpeqb    %%xmm7,%%xmm7\n"
    883   "psrlw      $0x8,%%xmm7\n"
    884 "1:"
    885   "movdqa     (%0),%%xmm0\n"
    886   "movdqa     0x10(%0),%%xmm1\n"
    887   "movdqa     (%0,%4,1),%%xmm2\n"
    888   "movdqa     0x10(%0,%4,1),%%xmm3\n"
    889   "lea        0x20(%0),%0\n"
    890   "pavgb      %%xmm2,%%xmm0\n"
    891   "pavgb      %%xmm3,%%xmm1\n"
    892   "pand       %%xmm7,%%xmm0\n"
    893   "pand       %%xmm7,%%xmm1\n"
    894   "packuswb   %%xmm1,%%xmm0\n"
    895   "movdqa     %%xmm0,%%xmm1\n"
    896   "pand       %%xmm7,%%xmm0\n"
    897   "packuswb   %%xmm0,%%xmm0\n"
    898   "movq       %%xmm0,(%1)\n"
    899   "lea        0x8(%1),%1\n"
    900   "psrlw      $0x8,%%xmm1\n"
    901   "packuswb   %%xmm1,%%xmm1\n"
    902   "movq       %%xmm1,(%2)\n"
    903   "lea        0x8(%2),%2\n"
    904   "sub        $0x10,%3\n"
    905   "ja         1b\n"
    906   : "+r"(src_uyvy),    // %0
    907     "+r"(dst_u),       // %1
    908     "+r"(dst_y),       // %2
    909     "+r"(pix)          // %3
    910   : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
    911   : "memory"
    912 );
    913 }
    914 #endif
    915 
    916 // Filter 2 rows of YUY2 UV's (422) into U and V (420)
    917 void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2,
    918                        uint8* dst_u, uint8* dst_v, int pix) {
    919   // Output a row of UV values, filtering 2 rows of YUY2
    920   for (int x = 0; x < pix; x += 2) {
    921     dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
    922     dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
    923     src_yuy2 += 4;
    924     dst_u += 1;
    925     dst_v += 1;
    926   }
    927 }
    928 
    929 void YUY2ToI420RowY_C(const uint8* src_yuy2,
    930                       uint8* dst_y, int pix) {
    931   // Copy a row of yuy2 Y values
    932   for (int x = 0; x < pix; ++x) {
    933     dst_y[0] = src_yuy2[0];
    934     src_yuy2 += 2;
    935     dst_y += 1;
    936   }
    937 }
    938 
    939 void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy,
    940                        uint8* dst_u, uint8* dst_v, int pix) {
    941   // Copy a row of uyvy UV values
    942   for (int x = 0; x < pix; x += 2) {
    943     dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
    944     dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
    945     src_uyvy += 4;
    946     dst_u += 1;
    947     dst_v += 1;
    948   }
    949 }
    950 
    951 void UYVYToI420RowY_C(const uint8* src_uyvy,
    952                       uint8* dst_y, int pix) {
    953   // Copy a row of uyvy Y values
    954   for (int x = 0; x < pix; ++x) {
    955     dst_y[0] = src_uyvy[1];
    956     src_uyvy += 2;
    957     dst_y += 1;
    958   }
    959 }
    960 
    961 // Convert YUY2 to I420.
    962 int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
    963                uint8* dst_y, int dst_stride_y,
    964                uint8* dst_u, int dst_stride_u,
    965                uint8* dst_v, int dst_stride_v,
    966                int width, int height) {
    967   // Negative height means invert the image.
    968   if (height < 0) {
    969     height = -height;
    970     src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
    971     src_stride_yuy2 = -src_stride_yuy2;
    972   }
    973   void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2,
    974                           uint8* dst_u, uint8* dst_v, int pix);
    975   void (*YUY2ToI420RowY)(const uint8* src_yuy2,
    976                          uint8* dst_y, int pix);
    977 #if defined(HAS_YUY2TOI420ROW_SSE2)
    978   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
    979       (width % 16 == 0) &&
    980       IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
    981       IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
    982       IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
    983       IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    984     YUY2ToI420RowY = YUY2ToI420RowY_SSE2;
    985     YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2;
    986   } else
    987 #endif
    988   {
    989     YUY2ToI420RowY = YUY2ToI420RowY_C;
    990     YUY2ToI420RowUV = YUY2ToI420RowUV_C;
    991   }
    992   for (int y = 0; y < height; ++y) {
    993     if ((y & 1) == 0) {
    994       if (y >= (height - 1) ) {  // last chroma on odd height clamp height
    995         src_stride_yuy2 = 0;
    996       }
    997       YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
    998       dst_u += dst_stride_u;
    999       dst_v += dst_stride_v;
   1000     }
   1001     YUY2ToI420RowY(src_yuy2, dst_y, width);
   1002     dst_y += dst_stride_y;
   1003     src_yuy2 += src_stride_yuy2;
   1004   }
   1005   return 0;
   1006 }
   1007 
   1008 // Convert UYVY to I420.
   1009 int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
   1010                uint8* dst_y, int dst_stride_y,
   1011                uint8* dst_u, int dst_stride_u,
   1012                uint8* dst_v, int dst_stride_v,
   1013                int width, int height) {
   1014   // Negative height means invert the image.
   1015   if (height < 0) {
   1016     height = -height;
   1017     src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
   1018     src_stride_uyvy = -src_stride_uyvy;
   1019   }
   1020   void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy,
   1021                           uint8* dst_u, uint8* dst_v, int pix);
   1022   void (*UYVYToI420RowY)(const uint8* src_uyvy,
   1023                          uint8* dst_y, int pix);
   1024 #if defined(HAS_UYVYTOI420ROW_SSE2)
   1025   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
   1026       (width % 16 == 0) &&
   1027       IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) &&
   1028       IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
   1029       IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
   1030       IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
   1031     UYVYToI420RowY = UYVYToI420RowY_SSE2;
   1032     UYVYToI420RowUV = UYVYToI420RowUV_SSE2;
   1033   } else
   1034 #endif
   1035   {
   1036     UYVYToI420RowY = UYVYToI420RowY_C;
   1037     UYVYToI420RowUV = UYVYToI420RowUV_C;
   1038   }
   1039   for (int y = 0; y < height; ++y) {
   1040     if ((y & 1) == 0) {
   1041       if (y >= (height - 1) ) {  // last chroma on odd height clamp height
   1042         src_stride_uyvy = 0;
   1043       }
   1044       UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
   1045       dst_u += dst_stride_u;
   1046       dst_v += dst_stride_v;
   1047     }
   1048     UYVYToI420RowY(src_uyvy, dst_y, width);
   1049     dst_y += dst_stride_y;
   1050     src_uyvy += src_stride_uyvy;
   1051   }
   1052   return 0;
   1053 }
   1054 
   1055 // Convert I420 to ARGB.
   1056 // TODO(fbarchard): Add SSE2 version and supply C version for fallback.
   1057 int I420ToARGB(const uint8* src_y, int src_stride_y,
   1058                const uint8* src_u, int src_stride_u,
   1059                const uint8* src_v, int src_stride_v,
   1060                uint8* dst_argb, int dst_stride_argb,
   1061                int width, int height) {
   1062   // Negative height means invert the image.
   1063   if (height < 0) {
   1064     height = -height;
   1065     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
   1066     dst_stride_argb = -dst_stride_argb;
   1067   }
   1068   for (int y = 0; y < height; ++y) {
   1069     FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
   1070     dst_argb += dst_stride_argb;
   1071     src_y += src_stride_y;
   1072     if (y & 1) {
   1073       src_u += src_stride_u;
   1074       src_v += src_stride_v;
   1075     }
   1076   }
   1077   // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
   1078   EMMS();
   1079   return 0;
   1080 }
   1081 
   1082 // Convert I420 to BGRA.
   1083 int I420ToBGRA(const uint8* src_y, int src_stride_y,
   1084                const uint8* src_u, int src_stride_u,
   1085                const uint8* src_v, int src_stride_v,
   1086                uint8* dst_argb, int dst_stride_argb,
   1087                int width, int height) {
   1088   // Negative height means invert the image.
   1089   if (height < 0) {
   1090     height = -height;
   1091     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
   1092     dst_stride_argb = -dst_stride_argb;
   1093   }
   1094   for (int y = 0; y < height; ++y) {
   1095     FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
   1096     dst_argb += dst_stride_argb;
   1097     src_y += src_stride_y;
   1098     if (y & 1) {
   1099       src_u += src_stride_u;
   1100       src_v += src_stride_v;
   1101     }
   1102   }
   1103   EMMS();
   1104   return 0;
   1105 }
   1106 
// Convert I420 to ABGR.  (Comment fixed: the previous header said BGRA,
// but this uses FastConvertYUVToABGRRow and produces ABGR output.)
int I420ToABGR(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  // Negative height means invert the image (write the output bottom-up).
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  for (int y = 0; y < height; ++y) {
    FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    if (y & 1) {  // chroma planes are half height: step every other row
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
  }
  EMMS();  // the MMX row function requires emms before returning
  return 0;
}
   1131 
   1132 // Convert I422 to ARGB.
   1133 int I422ToARGB(const uint8* src_y, int src_stride_y,
   1134                const uint8* src_u, int src_stride_u,
   1135                const uint8* src_v, int src_stride_v,
   1136                uint8* dst_argb, int dst_stride_argb,
   1137                int width, int height) {
   1138   // Negative height means invert the image.
   1139   if (height < 0) {
   1140     height = -height;
   1141     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
   1142     dst_stride_argb = -dst_stride_argb;
   1143   }
   1144   for (int y = 0; y < height; ++y) {
   1145     FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
   1146     dst_argb += dst_stride_argb;
   1147     src_y += src_stride_y;
   1148     src_u += src_stride_u;
   1149     src_v += src_stride_v;
   1150   }
   1151   // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
   1152   EMMS();
   1153   return 0;
   1154 }
   1155 
   1156 // Convert I444 to ARGB.
   1157 int I444ToARGB(const uint8* src_y, int src_stride_y,
   1158                const uint8* src_u, int src_stride_u,
   1159                const uint8* src_v, int src_stride_v,
   1160                uint8* dst_argb, int dst_stride_argb,
   1161                int width, int height) {
   1162   // Negative height means invert the image.
   1163   if (height < 0) {
   1164     height = -height;
   1165     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
   1166     dst_stride_argb = -dst_stride_argb;
   1167   }
   1168   for (int y = 0; y < height; ++y) {
   1169     FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
   1170     dst_argb += dst_stride_argb;
   1171     src_y += src_stride_y;
   1172     src_u += src_stride_u;
   1173     src_v += src_stride_v;
   1174   }
   1175   // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
   1176   EMMS();
   1177   return 0;
   1178 }
   1179 
   1180 // Convert I400 to ARGB.
   1181 int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
   1182                          uint8* dst_argb, int dst_stride_argb,
   1183                          int width, int height) {
   1184   // Negative height means invert the image.
   1185   if (height < 0) {
   1186     height = -height;
   1187     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
   1188     dst_stride_argb = -dst_stride_argb;
   1189   }
   1190   for (int y = 0; y < height; ++y) {
   1191     FastConvertYToRGB32Row(src_y, dst_argb, width);
   1192     dst_argb += dst_stride_argb;
   1193     src_y += src_stride_y;
   1194   }
   1195   // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
   1196   EMMS();
   1197   return 0;
   1198 }
   1199 
   1200 // TODO(fbarchard): 64 bit version
   1201 #if defined(WIN32) && !defined(COVERAGE_ENABLED)
   1202 
   1203 #define HAS_I400TOARGBROW_SSE2
// Expands 8 luma bytes per iteration into 8 opaque gray ARGB pixels
// (B = G = R = Y, A = 0xff).  Requires: pix a multiple of 8 and a
// 16-byte aligned destination (movdqa stores).
__declspec(naked)
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]        // src_y
    mov        edx, [esp + 8]        // dst_argb
    mov        ecx, [esp + 12]       // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0xff000000
    pslld      xmm7, 24

  wloop:
    movq       xmm0, qword ptr [eax]
    lea        eax,  [eax + 8]
    punpcklbw  xmm0, xmm0            // Y -> YY
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0            // low 4 pixels:  YY -> YYYY
    punpckhwd  xmm1, xmm1            // high 4 pixels: YY -> YYYY
    por        xmm0, xmm7            // force alpha bytes to 0xff
    por        xmm1, xmm7
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8                // 8 pixels per iteration
    ja         wloop
    ret
  }
}
   1230 
   1231 #define HAS_ABGRTOARGBROW_SSSE3
// Shuffles ABGR pixels into ARGB order (swaps the R and B channels),
// 4 pixels per iteration.  Requires: pix a multiple of 4; src/dst
// 16-byte aligned (movdqa).
__declspec(naked)
static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
                                int pix) {
__asm {
    mov       eax, [esp + 4]   // src_abgr
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm7, _kShuffleMaskABGRToARGB   // pshufb control mask

 convertloop :
    movdqa    xmm0, [eax]
    lea       eax, [eax + 16]
    pshufb    xmm0, xmm7       // reorder the channel bytes
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 4           // 4 pixels per iteration
    ja        convertloop
    ret
  }
}
   1252 
   1253 #define HAS_BGRATOARGBROW_SSSE3
// Shuffles BGRA pixels into ARGB order (moves alpha from the first byte
// to the last), 4 pixels per iteration.  Requires: pix a multiple of 4;
// src/dst 16-byte aligned (movdqa).
__declspec(naked)
static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
                                int pix) {
__asm {
    mov       eax, [esp + 4]   // src_bgra
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    movdqa    xmm7, _kShuffleMaskBGRAToARGB   // pshufb control mask

 convertloop :
    movdqa    xmm0, [eax]
    lea       eax, [eax + 16]
    pshufb    xmm0, xmm7       // reorder the channel bytes
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 4           // 4 pixels per iteration
    ja        convertloop
    ret
  }
}
   1274 
   1275 
   1276 #elif (defined(__x86_64__) || defined(__i386__)) && \
   1277     !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
   1278 
   1279 // TODO(yuche): consider moving ARGB related codes to a separate file.
   1280 #define HAS_I400TOARGBROW_SSE2
// Expands 8 luma bytes per iteration into 8 opaque gray ARGB pixels
// (B = G = R = Y, A = 0xff).  Requires: pix a multiple of 8 and a
// 16-byte aligned destination (movdqa stores).
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"  // xmm7 = 0xff000000 alpha mask
  "pslld      $0x18,%%xmm7\n"
"1:"
  "movq       (%0),%%xmm0\n"    // load 8 Y bytes
  "lea        0x8(%0),%0\n"
  "punpcklbw  %%xmm0,%%xmm0\n"  // Y -> YY
  "movdqa     %%xmm0,%%xmm1\n"
  "punpcklwd  %%xmm0,%%xmm0\n"  // low 4 pixels:  YY -> YYYY
  "punpckhwd  %%xmm1,%%xmm1\n"  // high 4 pixels: YY -> YYYY
  "por        %%xmm7,%%xmm0\n"  // force alpha bytes to 0xff
  "por        %%xmm7,%%xmm1\n"
  "movdqa     %%xmm0,(%1)\n"
  "movdqa     %%xmm1,0x10(%1)\n"
  "lea        0x20(%1),%1\n"
  "sub        $0x8,%2\n"        // 8 pixels per iteration
  "ja         1b\n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory"
);
}
   1306 
   1307 #define HAS_ABGRTOARGBROW_SSSE3
// Shuffles ABGR pixels into ARGB order (swaps the R and B channels),
// 4 pixels per iteration.  Requires: pix a multiple of 4; src/dst
// 16-byte aligned (movdqa).
static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
                                int pix) {
  asm volatile(
  "movdqa     (%3),%%xmm7\n"  // load the pshufb control mask
"1:"
  "movdqa     (%0),%%xmm0\n"
  "lea        0x10(%0),%0\n"
  "pshufb     %%xmm7,%%xmm0\n"  // reorder the channel bytes
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x4,%2\n"        // 4 pixels per iteration
  "ja         1b\n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(kShuffleMaskABGRToARGB)  // %3
  : "memory"
);
}
   1327 
   1328 #define HAS_BGRATOARGBROW_SSSE3
// Shuffles BGRA pixels into ARGB order (moves alpha from the first byte
// to the last), 4 pixels per iteration.  Requires: pix a multiple of 4;
// src/dst 16-byte aligned (movdqa).
static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
                                int pix) {
  asm volatile(
  "movdqa     (%3),%%xmm7\n"  // load the pshufb control mask
"1:"
  "movdqa     (%0),%%xmm0\n"
  "lea        0x10(%0),%0\n"
  "pshufb     %%xmm7,%%xmm0\n"  // reorder the channel bytes
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x4,%2\n"        // 4 pixels per iteration
  "ja         1b\n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(kShuffleMaskBGRAToARGB)  // %3
  : "memory"
);
}
   1348 
   1349 #endif
   1350 
   1351 static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
   1352   // Copy a Y to RGB.
   1353   for (int x = 0; x < pix; ++x) {
   1354     uint8 y = src_y[0];
   1355     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
   1356     dst_argb[3] = 255u;
   1357     dst_argb += 4;
   1358     ++src_y;
   1359   }
   1360 }
   1361 
   1362 // Convert I400 to ARGB.
   1363 int I400ToARGB(const uint8* src_y, int src_stride_y,
   1364                uint8* dst_argb, int dst_stride_argb,
   1365                int width, int height) {
   1366   if (height < 0) {
   1367     height = -height;
   1368     src_y = src_y + (height - 1) * src_stride_y;
   1369     src_stride_y = -src_stride_y;
   1370   }
   1371   void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix);
   1372 #if defined(HAS_I400TOARGBROW_SSE2)
   1373   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
   1374       (width % 8 == 0) &&
   1375       IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) &&
   1376       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
   1377     I400ToARGBRow = I400ToARGBRow_SSE2;
   1378   } else
   1379 #endif
   1380   {
   1381     I400ToARGBRow = I400ToARGBRow_C;
   1382   }
   1383 
   1384   for (int y = 0; y < height; ++y) {
   1385     I400ToARGBRow(src_y, dst_argb, width);
   1386     src_y += src_stride_y;
   1387     dst_argb += dst_stride_argb;
   1388   }
   1389   return 0;
   1390 }
   1391 
   1392 static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
   1393   for (int x = 0; x < pix; ++x) {
   1394     // To support in-place conversion.
   1395     uint8 r = src_abgr[0];
   1396     uint8 g = src_abgr[1];
   1397     uint8 b = src_abgr[2];
   1398     uint8 a = src_abgr[3];
   1399     dst_argb[0] = b;
   1400     dst_argb[1] = g;
   1401     dst_argb[2] = r;
   1402     dst_argb[3] = a;
   1403     dst_argb += 4;
   1404     src_abgr += 4;
   1405   }
   1406 }
   1407 
   1408 int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
   1409                uint8* dst_argb, int dst_stride_argb,
   1410                int width, int height) {
   1411   if (height < 0) {
   1412     height = -height;
   1413     src_abgr = src_abgr + (height - 1) * src_stride_abgr;
   1414     src_stride_abgr = -src_stride_abgr;
   1415   }
   1416 void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
   1417 #if defined(HAS_ABGRTOARGBROW_SSSE3)
   1418   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
   1419       (width % 4 == 0) &&
   1420       IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) &&
   1421       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
   1422     ABGRToARGBRow = ABGRToARGBRow_SSSE3;
   1423   } else
   1424 #endif
   1425   {
   1426     ABGRToARGBRow = ABGRToARGBRow_C;
   1427   }
   1428 
   1429   for (int y = 0; y < height; ++y) {
   1430     ABGRToARGBRow(src_abgr, dst_argb, width);
   1431     src_abgr += src_stride_abgr;
   1432     dst_argb += dst_stride_argb;
   1433   }
   1434   return 0;
   1435 }
   1436 
   1437 static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
   1438   for (int x = 0; x < pix; ++x) {
   1439     // To support in-place conversion.
   1440     uint8 a = src_bgra[0];
   1441     uint8 r = src_bgra[1];
   1442     uint8 g = src_bgra[2];
   1443     uint8 b = src_bgra[3];
   1444     dst_argb[0] = b;
   1445     dst_argb[1] = g;
   1446     dst_argb[2] = r;
   1447     dst_argb[3] = a;
   1448     dst_argb += 4;
   1449     src_bgra += 4;
   1450   }
   1451 }
   1452 
   1453 // Convert BGRA to ARGB.
   1454 int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
   1455                uint8* dst_argb, int dst_stride_argb,
   1456                int width, int height) {
   1457   if (height < 0) {
   1458     height = -height;
   1459     src_bgra = src_bgra + (height - 1) * src_stride_bgra;
   1460     src_stride_bgra = -src_stride_bgra;
   1461   }
   1462   void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix);
   1463 #if defined(HAS_BGRATOARGBROW_SSSE3)
   1464   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
   1465       (width % 4 == 0) &&
   1466       IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) &&
   1467       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
   1468     BGRAToARGBRow = BGRAToARGBRow_SSSE3;
   1469   } else
   1470 #endif
   1471   {
   1472     BGRAToARGBRow = BGRAToARGBRow_C;
   1473   }
   1474 
   1475   for (int y = 0; y < height; ++y) {
   1476     BGRAToARGBRow(src_bgra, dst_argb, width);
   1477     src_bgra += src_stride_bgra;
   1478     dst_argb += dst_stride_argb;
   1479   }
   1480   return 0;
   1481 }
   1482 
   1483 // Convert ARGB to I400.
   1484 int ARGBToI400(const uint8* src_argb, int src_stride_argb,
   1485                uint8* dst_y, int dst_stride_y,
   1486                int width, int height) {
   1487   if (height < 0) {
   1488     height = -height;
   1489     src_argb = src_argb + (height - 1) * src_stride_argb;
   1490     src_stride_argb = -src_stride_argb;
   1491   }
   1492 void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
   1493 #if defined(HAS_ARGBTOYROW_SSSE3)
   1494   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
   1495       (width % 4 == 0) &&
   1496       IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) &&
   1497       IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
   1498     ARGBToYRow = ARGBToYRow_SSSE3;
   1499   } else
   1500 #endif
   1501   {
   1502     ARGBToYRow = ARGBToYRow_C;
   1503   }
   1504 
   1505   for (int y = 0; y < height; ++y) {
   1506     ARGBToYRow(src_argb, dst_y, width);
   1507     src_argb += src_stride_argb;
   1508     dst_y += dst_stride_y;
   1509   }
   1510   return 0;
   1511 }
   1512 
   1513 
   1514 // Convert RAW to ARGB.
   1515 int RAWToARGB(const uint8* src_raw, int src_stride_raw,
   1516               uint8* dst_argb, int dst_stride_argb,
   1517               int width, int height) {
   1518   if (height < 0) {
   1519     height = -height;
   1520     src_raw = src_raw + (height - 1) * src_stride_raw;
   1521     src_stride_raw = -src_stride_raw;
   1522   }
   1523   void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
   1524 #if defined(HAS_RAWTOARGBROW_SSSE3)
   1525   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
   1526       (width % 16 == 0) &&
   1527       IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
   1528       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
   1529     RAWToARGBRow = RAWToARGBRow_SSSE3;
   1530   } else
   1531 #endif
   1532   {
   1533     RAWToARGBRow = RAWToARGBRow_C;
   1534   }
   1535 
   1536   for (int y = 0; y < height; ++y) {
   1537     RAWToARGBRow(src_raw, dst_argb, width);
   1538     src_raw += src_stride_raw;
   1539     dst_argb += dst_stride_argb;
   1540   }
   1541   return 0;
   1542 }
   1543 
   1544 // Convert BG24 to ARGB.
   1545 int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
   1546                uint8* dst_argb, int dst_stride_argb,
   1547                int width, int height) {
   1548   if (height < 0) {
   1549     height = -height;
   1550     src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
   1551     src_stride_bg24 = -src_stride_bg24;
   1552   }
   1553   void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
   1554 #if defined(HAS_BG24TOARGBROW_SSSE3)
   1555   if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
   1556       (width % 16 == 0) &&
   1557       IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
   1558       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
   1559     BG24ToARGBRow = BG24ToARGBRow_SSSE3;
   1560   } else
   1561 #endif
   1562   {
   1563     BG24ToARGBRow = BG24ToARGBRow_C;
   1564   }
   1565 
   1566   for (int y = 0; y < height; ++y) {
   1567     BG24ToARGBRow(src_bg24, dst_argb, width);
   1568     src_bg24 += src_stride_bg24;
   1569     dst_argb += dst_stride_argb;
   1570   }
   1571   return 0;
   1572 }
   1573 
   1574 }  // namespace libyuv
   1575 
   1576