/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "rotate_priv.h"

#include "libyuv/cpu_id.h"

namespace libyuv {

#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#if defined(_MSC_VER)
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif
// Shuffle table for reversing the bytes.
extern "C" TALIGN16(const uint8, kShuffleReverse[16]) =
  { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u };
// Shuffle table for reversing the bytes of UV channels.
extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) =
  { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u };
#endif
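
// How the tables above are used: pshufb fills destination lane i with
// source byte table[i], so kShuffleReverse reverses all 16 bytes in one
// instruction, and kShuffleReverseUV reverses 8 UV pairs while splitting
// the U bytes into the low half of the register and the V bytes into the
// high half.
#if 0  // Scalar model of that shuffle, for reference only (not built;
       // the names here are illustrative).
static void PShufB_Model(const uint8* src, const uint8* shuffle_table,
                         uint8* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = src[shuffle_table[i]];  // Lane i takes src[table[i]].
  }
}
#endif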

typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int);
typedef void (*reverse_func)(const uint8*, uint8*, int);
typedef void (*rotate_uv_wx8_func)(const uint8*, int,
                                   uint8*, int,
                                   uint8*, int, int);
typedef void (*rotate_uv_wxh_func)(const uint8*, int,
                                   uint8*, int,
                                   uint8*, int, int, int);
typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);
typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int);

#if 0  // Need to add rotate_neon.s to the build to enable this
#ifdef __ARM_NEON__
extern "C" {
void RestoreRegisters_NEON(unsigned long long *restore);
void SaveRegisters_NEON(unsigned long long *store);
#define HAS_REVERSE_LINE_NEON
void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
#define HAS_REVERSE_LINE_UV_NEON
void ReverseLineUV_NEON(const uint8* src,
                        uint8* dst_a, uint8* dst_b,
                        int width);
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width);
}  // extern "C"
#endif
#endif

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked)
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
__asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    sub       ecx, 8
    ja        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}
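
// The three "bit swap" rounds above are element interleaves rather than
// bit operations: punpcklbw interleaves bytes, punpcklwd 16-bit pairs and
// punpckldq 32-bit quads.  Three such pairwise rounds (log2 of 8) amount
// to an 8x8 byte transpose.  A scalar sketch of what one tile computes
// (reference only, not built):
#if 0
static void TransposeTile8x8_Model(const uint8* src, int src_stride,
                                   uint8* dst, int dst_stride) {
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}
#endif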

#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked)
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
__asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqa    xmm2, [eax]
    movdqa    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqa    xmm4, [eax]
    movdqa    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqa    xmm6, [eax]
    movdqa    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqa    xmm5, [esp]  // restore xmm5
    movdqa    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5   // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqa    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    sub       ecx, 8
    ja        convertloop

    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
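
// Why the esp gymnastics above: 32-bit x86 has only eight xmm registers,
// so this transpose needs one 16-byte spill slot on the stack.  The
// prologue saves esp, carves out 4 + 16 bytes, rounds esp down to a
// 16-byte boundary so the movdqa spills to [esp] are aligned, and parks
// the caller's esp in the slot above the spill area for the epilogue.
#if 0  // The same prologue in C-like pseudocode (sketch only; "esp"
       // stands for the stack pointer register).
  char* caller_esp = esp;
  esp = (char*)(((uintptr_t)(esp - (4 + 16))) & ~(uintptr_t)15);
  *(char**)(esp + 16) = caller_esp;  // restored by "mov esp, [esp + 16]"
#endif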
#elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  asm volatile(
"1:"
  // Read in the data from the source pointer.
  // First round of bit swap.
  "movq       (%0),%%xmm0\n"
  "movq       (%0,%3),%%xmm1\n"
  "lea        (%0,%3,2),%0\n"
  "punpcklbw  %%xmm1,%%xmm0\n"
  "movq       (%0),%%xmm2\n"
  "movdqa     %%xmm0,%%xmm1\n"
  "palignr    $0x8,%%xmm1,%%xmm1\n"
  "movq       (%0,%3),%%xmm3\n"
  "lea        (%0,%3,2),%0\n"
  "punpcklbw  %%xmm3,%%xmm2\n"
  "movdqa     %%xmm2,%%xmm3\n"
  "movq       (%0),%%xmm4\n"
  "palignr    $0x8,%%xmm3,%%xmm3\n"
  "movq       (%0,%3),%%xmm5\n"
  "lea        (%0,%3,2),%0\n"
  "punpcklbw  %%xmm5,%%xmm4\n"
  "movdqa     %%xmm4,%%xmm5\n"
  "movq       (%0),%%xmm6\n"
  "palignr    $0x8,%%xmm5,%%xmm5\n"
  "movq       (%0,%3),%%xmm7\n"
  "lea        (%0,%3,2),%0\n"
  "punpcklbw  %%xmm7,%%xmm6\n"
  "neg        %3\n"
  "movdqa     %%xmm6,%%xmm7\n"
  "lea        0x8(%0,%3,8),%0\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  "neg        %3\n"
  // Second round of bit swap.
  "punpcklwd  %%xmm2,%%xmm0\n"
  "punpcklwd  %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "palignr    $0x8,%%xmm2,%%xmm2\n"
  "palignr    $0x8,%%xmm3,%%xmm3\n"
  "punpcklwd  %%xmm6,%%xmm4\n"
  "punpcklwd  %%xmm7,%%xmm5\n"
  "movdqa     %%xmm4,%%xmm6\n"
  "movdqa     %%xmm5,%%xmm7\n"
  "palignr    $0x8,%%xmm6,%%xmm6\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "punpckldq  %%xmm4,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "movdqa     %%xmm0,%%xmm4\n"
  "palignr    $0x8,%%xmm4,%%xmm4\n"
  "movq       %%xmm4,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm6,%%xmm2\n"
  "movdqa     %%xmm2,%%xmm6\n"
  "movq       %%xmm2,(%1)\n"
  "palignr    $0x8,%%xmm6,%%xmm6\n"
  "punpckldq  %%xmm5,%%xmm1\n"
  "movq       %%xmm6,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "movdqa     %%xmm1,%%xmm5\n"
  "movq       %%xmm1,(%1)\n"
  "palignr    $0x8,%%xmm5,%%xmm5\n"
  "movq       %%xmm5,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm7,%%xmm3\n"
  "movq       %%xmm3,(%1)\n"
  "movdqa     %%xmm3,%%xmm7\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  "movq       %%xmm7,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "sub        $0x8,%2\n"
  "ja         1b\n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"(static_cast<intptr_t>(src_stride)),  // %3
    "r"(static_cast<intptr_t>(dst_stride))   // %4
  : "memory"
);
}

#if defined (__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                    uint8* dst_a, int dst_stride_a,
                                    uint8* dst_b, int dst_stride_b,
                                    int w);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _TransposeUVWx8_SSE2\n"
"_TransposeUVWx8_SSE2:\n"
#else
    ".global TransposeUVWx8_SSE2\n"
"TransposeUVWx8_SSE2:\n"
#endif
    "push   %ebx\n"
    "push   %esi\n"
    "push   %edi\n"
    "push   %ebp\n"
    "mov    0x14(%esp),%eax\n"
    "mov    0x18(%esp),%edi\n"
    "mov    0x1c(%esp),%edx\n"
    "mov    0x20(%esp),%esi\n"
    "mov    0x24(%esp),%ebx\n"
    "mov    0x28(%esp),%ebp\n"
    "mov    %esp,%ecx\n"
    "sub    $0x14,%esp\n"
    "and    $0xfffffff0,%esp\n"
    "mov    %ecx,0x10(%esp)\n"
    "mov    0x2c(%ecx),%ecx\n"

"1:"
    "movdqa (%eax),%xmm0\n"
    "movdqa (%eax,%edi,1),%xmm1\n"
    "lea    (%eax,%edi,2),%eax\n"
    "movdqa %xmm0,%xmm7\n"
    "punpcklbw %xmm1,%xmm0\n"
    "punpckhbw %xmm1,%xmm7\n"
    "movdqa %xmm7,%xmm1\n"
    "movdqa (%eax),%xmm2\n"
    "movdqa (%eax,%edi,1),%xmm3\n"
    "lea    (%eax,%edi,2),%eax\n"
    "movdqa %xmm2,%xmm7\n"
    "punpcklbw %xmm3,%xmm2\n"
    "punpckhbw %xmm3,%xmm7\n"
    "movdqa %xmm7,%xmm3\n"
    "movdqa (%eax),%xmm4\n"
    "movdqa (%eax,%edi,1),%xmm5\n"
    "lea    (%eax,%edi,2),%eax\n"
    "movdqa %xmm4,%xmm7\n"
    "punpcklbw %xmm5,%xmm4\n"
    "punpckhbw %xmm5,%xmm7\n"
    "movdqa %xmm7,%xmm5\n"
    "movdqa (%eax),%xmm6\n"
    "movdqa (%eax,%edi,1),%xmm7\n"
    "lea    (%eax,%edi,2),%eax\n"
    "movdqa %xmm5,(%esp)\n"
    "neg    %edi\n"
    "movdqa %xmm6,%xmm5\n"
    "punpcklbw %xmm7,%xmm6\n"
    "punpckhbw %xmm7,%xmm5\n"
    "movdqa %xmm5,%xmm7\n"
    "lea    0x10(%eax,%edi,8),%eax\n"
    "neg    %edi\n"
    "movdqa %xmm0,%xmm5\n"
    "punpcklwd %xmm2,%xmm0\n"
    "punpckhwd %xmm2,%xmm5\n"
    "movdqa %xmm5,%xmm2\n"
    "movdqa %xmm1,%xmm5\n"
    "punpcklwd %xmm3,%xmm1\n"
    "punpckhwd %xmm3,%xmm5\n"
    "movdqa %xmm5,%xmm3\n"
    "movdqa %xmm4,%xmm5\n"
    "punpcklwd %xmm6,%xmm4\n"
    "punpckhwd %xmm6,%xmm5\n"
    "movdqa %xmm5,%xmm6\n"
    "movdqa (%esp),%xmm5\n"
    "movdqa %xmm6,(%esp)\n"
    "movdqa %xmm5,%xmm6\n"
    "punpcklwd %xmm7,%xmm5\n"
    "punpckhwd %xmm7,%xmm6\n"
    "movdqa %xmm6,%xmm7\n"
    "movdqa %xmm0,%xmm6\n"
    "punpckldq %xmm4,%xmm0\n"
    "punpckhdq %xmm4,%xmm6\n"
    "movdqa %xmm6,%xmm4\n"
    "movdqa (%esp),%xmm6\n"
    "movlpd %xmm0,(%edx)\n"
    "movhpd %xmm0,(%ebx)\n"
    "movlpd %xmm4,(%edx,%esi,1)\n"
    "lea    (%edx,%esi,2),%edx\n"
    "movhpd %xmm4,(%ebx,%ebp,1)\n"
    "lea    (%ebx,%ebp,2),%ebx\n"
    "movdqa %xmm2,%xmm0\n"
    "punpckldq %xmm6,%xmm2\n"
    "movlpd %xmm2,(%edx)\n"
    "movhpd %xmm2,(%ebx)\n"
    "punpckhdq %xmm6,%xmm0\n"
    "movlpd %xmm0,(%edx,%esi,1)\n"
    "lea    (%edx,%esi,2),%edx\n"
    "movhpd %xmm0,(%ebx,%ebp,1)\n"
    "lea    (%ebx,%ebp,2),%ebx\n"
    "movdqa %xmm1,%xmm0\n"
    "punpckldq %xmm5,%xmm1\n"
    "movlpd %xmm1,(%edx)\n"
    "movhpd %xmm1,(%ebx)\n"
    "punpckhdq %xmm5,%xmm0\n"
    "movlpd %xmm0,(%edx,%esi,1)\n"
    "lea    (%edx,%esi,2),%edx\n"
    "movhpd %xmm0,(%ebx,%ebp,1)\n"
    "lea    (%ebx,%ebp,2),%ebx\n"
    "movdqa %xmm3,%xmm0\n"
    "punpckldq %xmm7,%xmm3\n"
    "movlpd %xmm3,(%edx)\n"
    "movhpd %xmm3,(%ebx)\n"
    "punpckhdq %xmm7,%xmm0\n"
    "movlpd %xmm0,(%edx,%esi,1)\n"
    "lea    (%edx,%esi,2),%edx\n"
    "movhpd %xmm0,(%ebx,%ebp,1)\n"
    "lea    (%ebx,%ebp,2),%ebx\n"
    "sub    $0x8,%ecx\n"
    "ja     1b\n"
    "mov    0x10(%esp),%esp\n"
    "pop    %ebp\n"
    "pop    %edi\n"
    "pop    %esi\n"
    "pop    %ebx\n"
    "ret\n"
);
#elif defined (__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride, int width) {
  asm volatile(
"1:"
  // Read in the data from the source pointer.
  // First round of bit swap.
  "movdqa     (%0),%%xmm0\n"
  "movdqa     (%0,%3),%%xmm1\n"
  "lea        (%0,%3,2),%0\n"
  "movdqa     %%xmm0,%%xmm8\n"
  "punpcklbw  %%xmm1,%%xmm0\n"
  "punpckhbw  %%xmm1,%%xmm8\n"
  "movdqa     (%0),%%xmm2\n"
  "movdqa     %%xmm0,%%xmm1\n"
  "movdqa     %%xmm8,%%xmm9\n"
  "palignr    $0x8,%%xmm1,%%xmm1\n"
  "palignr    $0x8,%%xmm9,%%xmm9\n"
  "movdqa     (%0,%3),%%xmm3\n"
  "lea        (%0,%3,2),%0\n"
  "movdqa     %%xmm2,%%xmm10\n"
  "punpcklbw  %%xmm3,%%xmm2\n"
  "punpckhbw  %%xmm3,%%xmm10\n"
  "movdqa     %%xmm2,%%xmm3\n"
  "movdqa     %%xmm10,%%xmm11\n"
  "movdqa     (%0),%%xmm4\n"
  "palignr    $0x8,%%xmm3,%%xmm3\n"
  "palignr    $0x8,%%xmm11,%%xmm11\n"
  "movdqa     (%0,%3),%%xmm5\n"
  "lea        (%0,%3,2),%0\n"
  "movdqa     %%xmm4,%%xmm12\n"
  "punpcklbw  %%xmm5,%%xmm4\n"
  "punpckhbw  %%xmm5,%%xmm12\n"
  "movdqa     %%xmm4,%%xmm5\n"
  "movdqa     %%xmm12,%%xmm13\n"
  "movdqa     (%0),%%xmm6\n"
  "palignr    $0x8,%%xmm5,%%xmm5\n"
  "palignr    $0x8,%%xmm13,%%xmm13\n"
  "movdqa     (%0,%3),%%xmm7\n"
  "lea        (%0,%3,2),%0\n"
  "movdqa     %%xmm6,%%xmm14\n"
  "punpcklbw  %%xmm7,%%xmm6\n"
  "punpckhbw  %%xmm7,%%xmm14\n"
  "neg        %3\n"
  "movdqa     %%xmm6,%%xmm7\n"
  "movdqa     %%xmm14,%%xmm15\n"
  "lea        0x10(%0,%3,8),%0\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  "palignr    $0x8,%%xmm15,%%xmm15\n"
  "neg        %3\n"
  // Second round of bit swap.
  "punpcklwd  %%xmm2,%%xmm0\n"
  "punpcklwd  %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "palignr    $0x8,%%xmm2,%%xmm2\n"
  "palignr    $0x8,%%xmm3,%%xmm3\n"
  "punpcklwd  %%xmm6,%%xmm4\n"
  "punpcklwd  %%xmm7,%%xmm5\n"
  "movdqa     %%xmm4,%%xmm6\n"
  "movdqa     %%xmm5,%%xmm7\n"
  "palignr    $0x8,%%xmm6,%%xmm6\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  "punpcklwd  %%xmm10,%%xmm8\n"
  "punpcklwd  %%xmm11,%%xmm9\n"
  "movdqa     %%xmm8,%%xmm10\n"
  "movdqa     %%xmm9,%%xmm11\n"
  "palignr    $0x8,%%xmm10,%%xmm10\n"
  "palignr    $0x8,%%xmm11,%%xmm11\n"
  "punpcklwd  %%xmm14,%%xmm12\n"
  "punpcklwd  %%xmm15,%%xmm13\n"
  "movdqa     %%xmm12,%%xmm14\n"
  "movdqa     %%xmm13,%%xmm15\n"
  "palignr    $0x8,%%xmm14,%%xmm14\n"
  "palignr    $0x8,%%xmm15,%%xmm15\n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "punpckldq  %%xmm4,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "movdqa     %%xmm0,%%xmm4\n"
  "palignr    $0x8,%%xmm4,%%xmm4\n"
  "movq       %%xmm4,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm6,%%xmm2\n"
  "movdqa     %%xmm2,%%xmm6\n"
  "movq       %%xmm2,(%1)\n"
  "palignr    $0x8,%%xmm6,%%xmm6\n"
  "punpckldq  %%xmm5,%%xmm1\n"
  "movq       %%xmm6,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "movdqa     %%xmm1,%%xmm5\n"
  "movq       %%xmm1,(%1)\n"
  "palignr    $0x8,%%xmm5,%%xmm5\n"
  "movq       %%xmm5,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm7,%%xmm3\n"
  "movq       %%xmm3,(%1)\n"
  "movdqa     %%xmm3,%%xmm7\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  "movq       %%xmm7,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm12,%%xmm8\n"
  "movq       %%xmm8,(%1)\n"
  "movdqa     %%xmm8,%%xmm12\n"
  "palignr    $0x8,%%xmm12,%%xmm12\n"
  "movq       %%xmm12,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm14,%%xmm10\n"
  "movdqa     %%xmm10,%%xmm14\n"
  "movq       %%xmm10,(%1)\n"
  "palignr    $0x8,%%xmm14,%%xmm14\n"
  "punpckldq  %%xmm13,%%xmm9\n"
  "movq       %%xmm14,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "movdqa     %%xmm9,%%xmm13\n"
  "movq       %%xmm9,(%1)\n"
  "palignr    $0x8,%%xmm13,%%xmm13\n"
  "movq       %%xmm13,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm15,%%xmm11\n"
  "movq       %%xmm11,(%1)\n"
  "movdqa     %%xmm11,%%xmm15\n"
  "palignr    $0x8,%%xmm15,%%xmm15\n"
  "movq       %%xmm15,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"(static_cast<intptr_t>(src_stride)),  // %3
    "r"(static_cast<intptr_t>(dst_stride))   // %4
  : "memory"
);
}

#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  asm volatile(
"1:"
  // Read in the data from the source pointer.
  // First round of bit swap.
  "movdqa     (%0),%%xmm0\n"
  "movdqa     (%0,%4),%%xmm1\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm0,%%xmm8\n"
  "punpcklbw  %%xmm1,%%xmm0\n"
  "punpckhbw  %%xmm1,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm1\n"
  "movdqa     (%0),%%xmm2\n"
  "movdqa     (%0,%4),%%xmm3\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm2,%%xmm8\n"
  "punpcklbw  %%xmm3,%%xmm2\n"
  "punpckhbw  %%xmm3,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm3\n"
  "movdqa     (%0),%%xmm4\n"
  "movdqa     (%0,%4),%%xmm5\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm4,%%xmm8\n"
  "punpcklbw  %%xmm5,%%xmm4\n"
  "punpckhbw  %%xmm5,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm5\n"
  "movdqa     (%0),%%xmm6\n"
  "movdqa     (%0,%4),%%xmm7\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm6,%%xmm8\n"
  "punpcklbw  %%xmm7,%%xmm6\n"
  "neg        %4\n"
  "lea        0x10(%0,%4,8),%0\n"
  "punpckhbw  %%xmm7,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm7\n"
  "neg        %4\n"
  // Second round of bit swap.
  "movdqa     %%xmm0,%%xmm8\n"
  "movdqa     %%xmm1,%%xmm9\n"
  "punpckhwd  %%xmm2,%%xmm8\n"
  "punpckhwd  %%xmm3,%%xmm9\n"
  "punpcklwd  %%xmm2,%%xmm0\n"
  "punpcklwd  %%xmm3,%%xmm1\n"
  "movdqa     %%xmm8,%%xmm2\n"
  "movdqa     %%xmm9,%%xmm3\n"
  "movdqa     %%xmm4,%%xmm8\n"
  "movdqa     %%xmm5,%%xmm9\n"
  "punpckhwd  %%xmm6,%%xmm8\n"
  "punpckhwd  %%xmm7,%%xmm9\n"
  "punpcklwd  %%xmm6,%%xmm4\n"
  "punpcklwd  %%xmm7,%%xmm5\n"
  "movdqa     %%xmm8,%%xmm6\n"
  "movdqa     %%xmm9,%%xmm7\n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "movdqa     %%xmm0,%%xmm8\n"
  "punpckldq  %%xmm4,%%xmm0\n"
  "movlpd     %%xmm0,(%1)\n"  // Write back U channel
  "movhpd     %%xmm0,(%2)\n"  // Write back V channel
  "punpckhdq  %%xmm4,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "movdqa     %%xmm2,%%xmm8\n"
  "punpckldq  %%xmm6,%%xmm2\n"
  "movlpd     %%xmm2,(%1)\n"
  "movhpd     %%xmm2,(%2)\n"
  "punpckhdq  %%xmm6,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "movdqa     %%xmm1,%%xmm8\n"
  "punpckldq  %%xmm5,%%xmm1\n"
  "movlpd     %%xmm1,(%1)\n"
  "movhpd     %%xmm1,(%2)\n"
  "punpckhdq  %%xmm5,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "movdqa     %%xmm3,%%xmm8\n"
  "punpckldq  %%xmm7,%%xmm3\n"
  "movlpd     %%xmm3,(%1)\n"
  "movhpd     %%xmm3,(%2)\n"
  "punpckhdq  %%xmm7,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "sub        $0x8,%3\n"
  "ja         1b\n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
    "+r"(w)       // %3
  : "r"(static_cast<intptr_t>(src_stride)),    // %4
    "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
    "r"(static_cast<intptr_t>(dst_stride_b))   // %6
  : "memory"
);
}
#endif
#endif

static void TransposeWx8_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int w) {
  int i;
  for (i = 0; i < w; ++i) {
    dst[0] = src[0 * src_stride];
    dst[1] = src[1 * src_stride];
    dst[2] = src[2 * src_stride];
    dst[3] = src[3 * src_stride];
    dst[4] = src[4 * src_stride];
    dst[5] = src[5 * src_stride];
    dst[6] = src[6 * src_stride];
    dst[7] = src[7 * src_stride];
    ++src;
    dst += dst_stride;
  }
}

static void TransposeWxH_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  int i, j;
  for (i = 0; i < width; ++i)
    for (j = 0; j < height; ++j)
      dst[i * dst_stride + j] = src[j * src_stride + i];
}

void TransposePlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  int i = height;
  rotate_wx8_func TransposeWx8;
  rotate_wxh_func TransposeWxH;

#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_NEON;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_FAST_SSSE3;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_SSSE3;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
  {
    TransposeWx8 = TransposeWx8_C;
    TransposeWxH = TransposeWxH_C;
  }

  // Work across the source in 8x8 tiles.
  while (i >= 8) {
    TransposeWx8(src, src_stride, dst, dst_stride, width);

    src += 8 * src_stride;    // go down 8 rows
    dst += 8;                 // move over 8 columns
    i   -= 8;
  }

  TransposeWxH(src, src_stride, dst, dst_stride, width, i);
}
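
// Example: a 13-row plane runs the tile loop above once (one 8-row strip,
// filling the first 8 columns of dst), leaving i == 5; the trailing
// TransposeWxH call then transposes the remaining 5 x width strip.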

void RotatePlane90(const uint8* src, int src_stride,
                   uint8* dst, int dst_stride,
                   int width, int height) {
  // Rotate by 90 is a transpose with the source read
  // from bottom to top.  So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
  src += src_stride * (height - 1);
  src_stride = -src_stride;

  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

void RotatePlane270(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  // Rotate by 270 is a transpose with the destination written
  // from bottom to top.  So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;

  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}
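
// Worked example of the two pointer flips above, on a 2x3 plane:
//               | a b c |
//   src plane:  | d e f |
//   plain transpose:  | a d |   rotate 90:  | d a |   rotate 270:  | c f |
//                     | b e |               | e b |                | b e |
//                     | c f |               | f c |                | a d |
// Reading the source bottom-up (RotatePlane90) or writing the destination
// bottom-up (RotatePlane270) turns the transpose into the rotation.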

static void ReverseLine_C(const uint8* src, uint8* dst, int width) {
  int i;
  src += width - 1;
  for (i = 0; i < width; ++i) {
    dst[i] = src[0];
    --src;
  }
}

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_REVERSE_LINE_SSSE3
__declspec(naked)
static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
__asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    movdqa    xmm7, _kShuffleReverse
    lea       eax, [eax + ecx - 16]
 convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm7
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 16
    ja        convertloop
    ret
  }
}

#elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_REVERSE_LINE_SSSE3
static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile(
  "movdqa     (%3),%%xmm7\n"
  "lea        -0x10(%0,%2,1),%0\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "lea        -0x10(%0),%0\n"
  "pshufb     %%xmm7,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src),           // %0
    "+r"(dst),           // %1
    "+r"(temp_width)     // %2
  : "r"(kShuffleReverse)  // %3
  : "memory"
);
}
#endif

void RotatePlane180(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  int i;
  reverse_func ReverseLine;

#if defined(HAS_REVERSE_LINE_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
    ReverseLine = ReverseLine_NEON;
  } else
#endif
#if defined(HAS_REVERSE_LINE_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
    ReverseLine = ReverseLine_SSSE3;
  } else
#endif
  {
    ReverseLine = ReverseLine_C;
  }
  // Rotate by 180 is a horizontal mirror combined with a vertical flip:
  // read rows from the bottom up and reverse each row as it is copied.
  src += src_stride * (height - 1);

  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst, width);
    src -= src_stride;
    dst += dst_stride;
  }
}

static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int w) {
  int i;
  for (i = 0; i < w; ++i) {
    dst_a[0] = src[0 * src_stride + 0];
    dst_b[0] = src[0 * src_stride + 1];
    dst_a[1] = src[1 * src_stride + 0];
    dst_b[1] = src[1 * src_stride + 1];
    dst_a[2] = src[2 * src_stride + 0];
    dst_b[2] = src[2 * src_stride + 1];
    dst_a[3] = src[3 * src_stride + 0];
    dst_b[3] = src[3 * src_stride + 1];
    dst_a[4] = src[4 * src_stride + 0];
    dst_b[4] = src[4 * src_stride + 1];
    dst_a[5] = src[5 * src_stride + 0];
    dst_b[5] = src[5 * src_stride + 1];
    dst_a[6] = src[6 * src_stride + 0];
    dst_b[6] = src[6 * src_stride + 1];
    dst_a[7] = src[7 * src_stride + 0];
    dst_b[7] = src[7 * src_stride + 1];
    src += 2;
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}
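
// The source here is an interleaved UV plane (UVUVUV...), so each pass
// over a column pair sends one byte to dst_a (U) and one to dst_b (V):
// the transpose deinterleaves the two channels as it goes.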

static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int w, int h) {
  int i, j;
  for (i = 0; i < w * 2; i += 2)
    for (j = 0; j < h; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    }
}

void TransposeUV(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i = height;
  rotate_uv_wx8_func TransposeWx8;
  rotate_uv_wxh_func TransposeWxH;

#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  unsigned long long store_reg[8];
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    SaveRegisters_NEON(store_reg);
    TransposeWx8 = TransposeUVWx8_NEON;
    TransposeWxH = TransposeUVWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    TransposeWx8 = TransposeUVWx8_SSE2;
    TransposeWxH = TransposeUVWxH_C;
  } else
#endif
  {
    TransposeWx8 = TransposeUVWx8_C;
    TransposeWxH = TransposeUVWxH_C;
  }

  // Work through the source in 8x8 tiles.
  while (i >= 8) {
    TransposeWx8(src, src_stride,
                 dst_a, dst_stride_a,
                 dst_b, dst_stride_b,
                 width);

    src   += 8 * src_stride;    // go down 8 rows
    dst_a += 8;                 // move over 8 columns
    dst_b += 8;                 // move over 8 columns
    i     -= 8;
  }

  TransposeWxH(src, src_stride,
               dst_a, dst_stride_a,
               dst_b, dst_stride_b,
               width, i);

#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    RestoreRegisters_NEON(store_reg);
  }
#endif
}

void RotateUV90(const uint8* src, int src_stride,
                uint8* dst_a, int dst_stride_a,
                uint8* dst_b, int dst_stride_b,
                int width, int height) {
  src += src_stride * (height - 1);
  src_stride = -src_stride;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

void RotateUV270(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  dst_a += dst_stride_a * (width - 1);
  dst_b += dst_stride_b * (width - 1);
  dst_stride_a = -dst_stride_a;
  dst_stride_b = -dst_stride_b;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_REVERSE_LINE_UV_SSSE3
__declspec(naked)
void ReverseLineUV_SSSE3(const uint8* src,
                         uint8* dst_a, uint8* dst_b,
                         int width) {
__asm {
    push      edi
    mov       eax, [esp + 4 + 4]   // src
    mov       edx, [esp + 4 + 8]   // dst_a
    mov       edi, [esp + 4 + 12]  // dst_b
    mov       ecx, [esp + 4 + 16]  // width
    movdqa    xmm7, _kShuffleReverseUV
    lea       eax, [eax + ecx * 2 - 16]

 convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm7
    movlpd    qword ptr [edx], xmm0
    lea       edx, [edx + 8]
    movhpd    qword ptr [edi], xmm0
    lea       edi, [edi + 8]
    sub       ecx, 8
    ja        convertloop
    pop       edi
    ret
  }
}

#elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_REVERSE_LINE_UV_SSSE3
void ReverseLineUV_SSSE3(const uint8* src,
                         uint8* dst_a, uint8* dst_b,
                         int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile(
  "movdqa     (%4),%%xmm7\n"
  "lea        -0x10(%0,%3,2),%0\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "lea        -0x10(%0),%0\n"
  "pshufb     %%xmm7,%%xmm0\n"
  "movlpd     %%xmm0,(%1)\n"
  "lea        0x8(%1),%1\n"
  "movhpd     %%xmm0,(%2)\n"
  "lea        0x8(%2),%2\n"
  "sub        $0x8,%3\n"
  "ja         1b\n"
  : "+r"(src),           // %0
    "+r"(dst_a),         // %1
    "+r"(dst_b),         // %2
    "+r"(temp_width)     // %3
  : "r"(kShuffleReverseUV)  // %4
  : "memory"
);
}
#endif

static void ReverseLineUV_C(const uint8* src,
                            uint8* dst_a, uint8* dst_b,
                            int width) {
  int i;
  src += width << 1;
  for (i = 0; i < width; ++i) {
    src -= 2;
    dst_a[i] = src[0];
    dst_b[i] = src[1];
  }
}

void RotateUV180(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i;
  reverse_uv_func ReverseLine;

#if defined(HAS_REVERSE_LINE_UV_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    ReverseLine = ReverseLineUV_NEON;
  } else
#endif
#if defined(HAS_REVERSE_LINE_UV_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    ReverseLine = ReverseLineUV_SSSE3;
  } else
#endif
  {
    ReverseLine = ReverseLineUV_C;
  }

  dst_a += dst_stride_a * (height - 1);
  dst_b += dst_stride_b * (height - 1);

  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst_a, dst_b, width);

    src   += src_stride;      // down one line at a time
    dst_a -= dst_stride_a;    // nominally up one line at a time
    dst_b -= dst_stride_b;    // nominally up one line at a time
  }
}

int I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height,
               RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  switch (mode) {
    case kRotate0:
      // Copy frame.
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
                      src_v, src_stride_v,
                      dst_y, dst_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotatePlane90(src_u, src_stride_u,
                    dst_u, dst_stride_u,
                    halfwidth, halfheight);
      RotatePlane90(src_v, src_stride_v,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane270(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane270(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane180(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane180(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}
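
#if 0  // Typical call, for reference only (buffer pointers are
       // illustrative; destination planes must be allocated for the
       // rotated geometry, e.g. height x width for Y after kRotate90).
  int rv = I420Rotate(src_y, width,            // Y stride == width
                      src_u, (width + 1) / 2,  // U stride == halfwidth
                      src_v, (width + 1) / 2,  // V stride == halfwidth
                      dst_y, height,           // rotated Y stride
                      dst_u, (height + 1) / 2,
                      dst_v, (height + 1) / 2,
                      width, height, kRotate90);
#endif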

int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,
                     uint8* dst_v, int dst_stride_v,
                     int width, int height,
                     RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  switch (mode) {
    case kRotate0:
      // Copy frame.
      return NV12ToI420(src_y, src_uv, src_stride_y,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotateUV90(src_uv, src_stride_uv,
                 dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV270(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV180(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

}  // namespace libyuv