      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/rotate.h"
     12 
     13 #include "libyuv/cpu_id.h"
     14 #include "libyuv/convert.h"
     15 #include "libyuv/planar_functions.h"
     16 #include "libyuv/row.h"
     17 
     18 #ifdef __cplusplus
     19 namespace libyuv {
     20 extern "C" {
     21 #endif
     22 
     23 #if !defined(YUV_DISABLE_ASM) && \
     24     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
     25 #if defined(__APPLE__) && defined(__i386__)
     26 #define DECLARE_FUNCTION(name)                                                 \
     27     ".text                                     \n"                             \
     28     ".private_extern _" #name "                \n"                             \
     29     ".align 4,0x90                             \n"                             \
     30 "_" #name ":                                   \n"
      31 #elif (defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
     32 #define DECLARE_FUNCTION(name)                                                 \
     33     ".text                                     \n"                             \
     34     ".align 4,0x90                             \n"                             \
     35 "_" #name ":                                   \n"
     36 #else
     37 #define DECLARE_FUNCTION(name)                                                 \
     38     ".text                                     \n"                             \
     39     ".align 4,0x90                             \n"                             \
     40 #name ":                                       \n"
     41 #endif
     42 #endif
     43 
     44 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
     45 #define HAS_MIRRORROW_NEON
     46 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
     47 #define HAS_MIRRORROW_UV_NEON
     48 void MirrorRowUV_NEON(const uint8* src,
     49                         uint8* dst_a, uint8* dst_b,
     50                         int width);
     51 #define HAS_TRANSPOSE_WX8_NEON
     52 void TransposeWx8_NEON(const uint8* src, int src_stride,
     53                        uint8* dst, int dst_stride, int width);
     54 #define HAS_TRANSPOSE_UVWX8_NEON
     55 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
     56                          uint8* dst_a, int dst_stride_a,
     57                          uint8* dst_b, int dst_stride_b,
     58                          int width);
     59 #endif  // defined(__ARM_NEON__)
     60 
     61 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
     62 #define HAS_TRANSPOSE_WX8_SSSE3
     63 __declspec(naked) __declspec(align(16))
     64 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
     65                                uint8* dst, int dst_stride, int width) {
     66   __asm {
     67     push      edi
     68     push      esi
     69     push      ebp
     70     mov       eax, [esp + 12 + 4]   // src
     71     mov       edi, [esp + 12 + 8]   // src_stride
     72     mov       edx, [esp + 12 + 12]  // dst
     73     mov       esi, [esp + 12 + 16]  // dst_stride
     74     mov       ecx, [esp + 12 + 20]  // width
     75 
     76     // Read in the data from the source pointer.
     77     // First round of bit swap.
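             // Each "round of bit swap" is really an interleave: bytes in the
             // first round, words in the second, dwords in the third, which
             // together realize the 8x8 byte transpose.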
     78     align      16
     79  convertloop:
     80     movq      xmm0, qword ptr [eax]
     81     lea       ebp, [eax + 8]
     82     movq      xmm1, qword ptr [eax + edi]
     83     lea       eax, [eax + 2 * edi]
     84     punpcklbw xmm0, xmm1
     85     movq      xmm2, qword ptr [eax]
     86     movdqa    xmm1, xmm0
     87     palignr   xmm1, xmm1, 8
     88     movq      xmm3, qword ptr [eax + edi]
     89     lea       eax, [eax + 2 * edi]
     90     punpcklbw xmm2, xmm3
     91     movdqa    xmm3, xmm2
     92     movq      xmm4, qword ptr [eax]
     93     palignr   xmm3, xmm3, 8
     94     movq      xmm5, qword ptr [eax + edi]
     95     punpcklbw xmm4, xmm5
     96     lea       eax, [eax + 2 * edi]
     97     movdqa    xmm5, xmm4
     98     movq      xmm6, qword ptr [eax]
     99     palignr   xmm5, xmm5, 8
    100     movq      xmm7, qword ptr [eax + edi]
    101     punpcklbw xmm6, xmm7
    102     mov       eax, ebp
    103     movdqa    xmm7, xmm6
    104     palignr   xmm7, xmm7, 8
    105     // Second round of bit swap.
    106     punpcklwd xmm0, xmm2
    107     punpcklwd xmm1, xmm3
    108     movdqa    xmm2, xmm0
    109     movdqa    xmm3, xmm1
    110     palignr   xmm2, xmm2, 8
    111     palignr   xmm3, xmm3, 8
    112     punpcklwd xmm4, xmm6
    113     punpcklwd xmm5, xmm7
    114     movdqa    xmm6, xmm4
    115     movdqa    xmm7, xmm5
    116     palignr   xmm6, xmm6, 8
    117     palignr   xmm7, xmm7, 8
    118     // Third round of bit swap.
    119     // Write to the destination pointer.
    120     punpckldq xmm0, xmm4
    121     movq      qword ptr [edx], xmm0
    122     movdqa    xmm4, xmm0
    123     palignr   xmm4, xmm4, 8
    124     movq      qword ptr [edx + esi], xmm4
    125     lea       edx, [edx + 2 * esi]
    126     punpckldq xmm2, xmm6
    127     movdqa    xmm6, xmm2
    128     palignr   xmm6, xmm6, 8
    129     movq      qword ptr [edx], xmm2
    130     punpckldq xmm1, xmm5
    131     movq      qword ptr [edx + esi], xmm6
    132     lea       edx, [edx + 2 * esi]
    133     movdqa    xmm5, xmm1
    134     movq      qword ptr [edx], xmm1
    135     palignr   xmm5, xmm5, 8
    136     punpckldq xmm3, xmm7
    137     movq      qword ptr [edx + esi], xmm5
    138     lea       edx, [edx + 2 * esi]
    139     movq      qword ptr [edx], xmm3
    140     movdqa    xmm7, xmm3
    141     palignr   xmm7, xmm7, 8
    142     sub       ecx, 8
    143     movq      qword ptr [edx + esi], xmm7
    144     lea       edx, [edx + 2 * esi]
    145     jg        convertloop
    146 
    147     pop       ebp
    148     pop       esi
    149     pop       edi
    150     ret
    151   }
    152 }
    153 
    154 #define HAS_TRANSPOSE_UVWX8_SSE2
    155 __declspec(naked) __declspec(align(16))
    156 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
    157                                 uint8* dst_a, int dst_stride_a,
    158                                 uint8* dst_b, int dst_stride_b,
    159                                 int w) {
    160   __asm {
    161     push      ebx
    162     push      esi
    163     push      edi
    164     push      ebp
    165     mov       eax, [esp + 16 + 4]   // src
    166     mov       edi, [esp + 16 + 8]   // src_stride
    167     mov       edx, [esp + 16 + 12]  // dst_a
    168     mov       esi, [esp + 16 + 16]  // dst_stride_a
    169     mov       ebx, [esp + 16 + 20]  // dst_b
    170     mov       ebp, [esp + 16 + 24]  // dst_stride_b
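             // Save esp and carve out a 16-byte-aligned scratch slot on the
             // stack; it is used below to spill xmm registers. The original
             // esp is kept at [esp + 16] so it can be restored and so the
             // last argument (w) can still be read from the caller's frame.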
    171     mov       ecx, esp
    172     sub       esp, 4 + 16
    173     and       esp, ~15
    174     mov       [esp + 16], ecx
    175     mov       ecx, [ecx + 16 + 28]  // w
    176 
    177     align      16
    178  convertloop:
    179     // Read in the data from the source pointer.
    180     // First round of bit swap.
    181     movdqa    xmm0, [eax]
    182     movdqa    xmm1, [eax + edi]
    183     lea       eax, [eax + 2 * edi]
    184     movdqa    xmm7, xmm0  // use xmm7 as temp register.
    185     punpcklbw xmm0, xmm1
    186     punpckhbw xmm7, xmm1
    187     movdqa    xmm1, xmm7
    188     movdqa    xmm2, [eax]
    189     movdqa    xmm3, [eax + edi]
    190     lea       eax, [eax + 2 * edi]
    191     movdqa    xmm7, xmm2
    192     punpcklbw xmm2, xmm3
    193     punpckhbw xmm7, xmm3
    194     movdqa    xmm3, xmm7
    195     movdqa    xmm4, [eax]
    196     movdqa    xmm5, [eax + edi]
    197     lea       eax, [eax + 2 * edi]
    198     movdqa    xmm7, xmm4
    199     punpcklbw xmm4, xmm5
    200     punpckhbw xmm7, xmm5
    201     movdqa    xmm5, xmm7
    202     movdqa    xmm6, [eax]
    203     movdqa    xmm7, [eax + edi]
    204     lea       eax, [eax + 2 * edi]
    205     movdqa    [esp], xmm5  // backup xmm5
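             // The stride is negated around the lea below so that src is
             // rewound to the top of the 8 rows just read and then advanced
             // 16 bytes to the next 8x16 block.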
    206     neg       edi
    207     movdqa    xmm5, xmm6   // use xmm5 as temp register.
    208     punpcklbw xmm6, xmm7
    209     punpckhbw xmm5, xmm7
    210     movdqa    xmm7, xmm5
    211     lea       eax, [eax + 8 * edi + 16]
    212     neg       edi
    213     // Second round of bit swap.
    214     movdqa    xmm5, xmm0
    215     punpcklwd xmm0, xmm2
    216     punpckhwd xmm5, xmm2
    217     movdqa    xmm2, xmm5
    218     movdqa    xmm5, xmm1
    219     punpcklwd xmm1, xmm3
    220     punpckhwd xmm5, xmm3
    221     movdqa    xmm3, xmm5
    222     movdqa    xmm5, xmm4
    223     punpcklwd xmm4, xmm6
    224     punpckhwd xmm5, xmm6
    225     movdqa    xmm6, xmm5
    226     movdqa    xmm5, [esp]  // restore xmm5
    227     movdqa    [esp], xmm6  // backup xmm6
    228     movdqa    xmm6, xmm5    // use xmm6 as temp register.
    229     punpcklwd xmm5, xmm7
    230     punpckhwd xmm6, xmm7
    231     movdqa    xmm7, xmm6
    232     // Third round of bit swap.
    233     // Write to the destination pointer.
    234     movdqa    xmm6, xmm0
    235     punpckldq xmm0, xmm4
    236     punpckhdq xmm6, xmm4
    237     movdqa    xmm4, xmm6
    238     movdqa    xmm6, [esp]  // restore xmm6
    239     movlpd    qword ptr [edx], xmm0
    240     movhpd    qword ptr [ebx], xmm0
    241     movlpd    qword ptr [edx + esi], xmm4
    242     lea       edx, [edx + 2 * esi]
    243     movhpd    qword ptr [ebx + ebp], xmm4
    244     lea       ebx, [ebx + 2 * ebp]
    245     movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    246     punpckldq xmm2, xmm6
    247     movlpd    qword ptr [edx], xmm2
    248     movhpd    qword ptr [ebx], xmm2
    249     punpckhdq xmm0, xmm6
    250     movlpd    qword ptr [edx + esi], xmm0
    251     lea       edx, [edx + 2 * esi]
    252     movhpd    qword ptr [ebx + ebp], xmm0
    253     lea       ebx, [ebx + 2 * ebp]
    254     movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    255     punpckldq xmm1, xmm5
    256     movlpd    qword ptr [edx], xmm1
    257     movhpd    qword ptr [ebx], xmm1
    258     punpckhdq xmm0, xmm5
    259     movlpd    qword ptr [edx + esi], xmm0
    260     lea       edx, [edx + 2 * esi]
    261     movhpd    qword ptr [ebx + ebp], xmm0
    262     lea       ebx, [ebx + 2 * ebp]
    263     movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    264     punpckldq xmm3, xmm7
    265     movlpd    qword ptr [edx], xmm3
    266     movhpd    qword ptr [ebx], xmm3
    267     punpckhdq xmm0, xmm7
    268     sub       ecx, 8
    269     movlpd    qword ptr [edx + esi], xmm0
    270     lea       edx, [edx + 2 * esi]
    271     movhpd    qword ptr [ebx + ebp], xmm0
    272     lea       ebx, [ebx + 2 * ebp]
    273     jg        convertloop
    274 
    275     mov       esp, [esp + 16]
    276     pop       ebp
    277     pop       edi
    278     pop       esi
    279     pop       ebx
    280     ret
    281   }
    282 }
    283 #elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
    284 #define HAS_TRANSPOSE_WX8_SSSE3
    285 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
    286                                uint8* dst, int dst_stride, int width) {
    287   asm volatile (
    288     // Read in the data from the source pointer.
    289     // First round of bit swap.
    290     ".p2align  4                                 \n"
    291   "1:                                            \n"
    292     "movq       (%0),%%xmm0                      \n"
    293     "movq       (%0,%3),%%xmm1                   \n"
    294     "lea        (%0,%3,2),%0                     \n"
    295     "punpcklbw  %%xmm1,%%xmm0                    \n"
    296     "movq       (%0),%%xmm2                      \n"
    297     "movdqa     %%xmm0,%%xmm1                    \n"
    298     "palignr    $0x8,%%xmm1,%%xmm1               \n"
    299     "movq       (%0,%3),%%xmm3                   \n"
    300     "lea        (%0,%3,2),%0                     \n"
    301     "punpcklbw  %%xmm3,%%xmm2                    \n"
    302     "movdqa     %%xmm2,%%xmm3                    \n"
    303     "movq       (%0),%%xmm4                      \n"
    304     "palignr    $0x8,%%xmm3,%%xmm3               \n"
    305     "movq       (%0,%3),%%xmm5                   \n"
    306     "lea        (%0,%3,2),%0                     \n"
    307     "punpcklbw  %%xmm5,%%xmm4                    \n"
    308     "movdqa     %%xmm4,%%xmm5                    \n"
    309     "movq       (%0),%%xmm6                      \n"
    310     "palignr    $0x8,%%xmm5,%%xmm5               \n"
    311     "movq       (%0,%3),%%xmm7                   \n"
    312     "lea        (%0,%3,2),%0                     \n"
    313     "punpcklbw  %%xmm7,%%xmm6                    \n"
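             // The stride (%3) is negated around the lea below so that src is
             // rewound by the 8 rows just read and advanced 8 bytes to the
             // next 8x8 tile.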
    314     "neg        %3                               \n"
    315     "movdqa     %%xmm6,%%xmm7                    \n"
    316     "lea        0x8(%0,%3,8),%0                  \n"
    317     "palignr    $0x8,%%xmm7,%%xmm7               \n"
    318     "neg        %3                               \n"
    319      // Second round of bit swap.
    320     "punpcklwd  %%xmm2,%%xmm0                    \n"
    321     "punpcklwd  %%xmm3,%%xmm1                    \n"
    322     "movdqa     %%xmm0,%%xmm2                    \n"
    323     "movdqa     %%xmm1,%%xmm3                    \n"
    324     "palignr    $0x8,%%xmm2,%%xmm2               \n"
    325     "palignr    $0x8,%%xmm3,%%xmm3               \n"
    326     "punpcklwd  %%xmm6,%%xmm4                    \n"
    327     "punpcklwd  %%xmm7,%%xmm5                    \n"
    328     "movdqa     %%xmm4,%%xmm6                    \n"
    329     "movdqa     %%xmm5,%%xmm7                    \n"
    330     "palignr    $0x8,%%xmm6,%%xmm6               \n"
    331     "palignr    $0x8,%%xmm7,%%xmm7               \n"
    332     // Third round of bit swap.
    333     // Write to the destination pointer.
    334     "punpckldq  %%xmm4,%%xmm0                    \n"
    335     "movq       %%xmm0,(%1)                      \n"
    336     "movdqa     %%xmm0,%%xmm4                    \n"
    337     "palignr    $0x8,%%xmm4,%%xmm4               \n"
    338     "movq       %%xmm4,(%1,%4)                   \n"
    339     "lea        (%1,%4,2),%1                     \n"
    340     "punpckldq  %%xmm6,%%xmm2                    \n"
    341     "movdqa     %%xmm2,%%xmm6                    \n"
    342     "movq       %%xmm2,(%1)                      \n"
    343     "palignr    $0x8,%%xmm6,%%xmm6               \n"
    344     "punpckldq  %%xmm5,%%xmm1                    \n"
    345     "movq       %%xmm6,(%1,%4)                   \n"
    346     "lea        (%1,%4,2),%1                     \n"
    347     "movdqa     %%xmm1,%%xmm5                    \n"
    348     "movq       %%xmm1,(%1)                      \n"
    349     "palignr    $0x8,%%xmm5,%%xmm5               \n"
    350     "movq       %%xmm5,(%1,%4)                   \n"
    351     "lea        (%1,%4,2),%1                     \n"
    352     "punpckldq  %%xmm7,%%xmm3                    \n"
    353     "movq       %%xmm3,(%1)                      \n"
    354     "movdqa     %%xmm3,%%xmm7                    \n"
    355     "palignr    $0x8,%%xmm7,%%xmm7               \n"
    356     "sub        $0x8,%2                          \n"
    357     "movq       %%xmm7,(%1,%4)                   \n"
    358     "lea        (%1,%4,2),%1                     \n"
    359     "jg         1b                               \n"
    360     : "+r"(src),    // %0
    361       "+r"(dst),    // %1
    362       "+r"(width)   // %2
    363     : "r"(static_cast<intptr_t>(src_stride)),  // %3
    364       "r"(static_cast<intptr_t>(dst_stride))   // %4
    365     : "memory", "cc"
    366   #if defined(__SSE2__)
    367       , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
    368   #endif
    369   );
    370 }
    371 
     372 #if !defined(YUV_DISABLE_ASM) && defined(__i386__)
    373 #define HAS_TRANSPOSE_UVWX8_SSE2
    374 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
    375                                     uint8* dst_a, int dst_stride_a,
    376                                     uint8* dst_b, int dst_stride_b,
    377                                     int w);
    378   asm (
    379     DECLARE_FUNCTION(TransposeUVWx8_SSE2)
    380     "push   %ebx                               \n"
    381     "push   %esi                               \n"
    382     "push   %edi                               \n"
    383     "push   %ebp                               \n"
    384     "mov    0x14(%esp),%eax                    \n"
    385     "mov    0x18(%esp),%edi                    \n"
    386     "mov    0x1c(%esp),%edx                    \n"
    387     "mov    0x20(%esp),%esi                    \n"
    388     "mov    0x24(%esp),%ebx                    \n"
    389     "mov    0x28(%esp),%ebp                    \n"
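             // Save esp, reserve a 16-byte-aligned scratch slot for spilling
             // an xmm register, and keep the original esp at 0x10(%esp); the
             // last argument (w) is then loaded through the saved pointer.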
    390     "mov    %esp,%ecx                          \n"
    391     "sub    $0x14,%esp                         \n"
    392     "and    $0xfffffff0,%esp                   \n"
    393     "mov    %ecx,0x10(%esp)                    \n"
    394     "mov    0x2c(%ecx),%ecx                    \n"
    395 
    396 "1:                                            \n"
    397     "movdqa (%eax),%xmm0                       \n"
    398     "movdqa (%eax,%edi,1),%xmm1                \n"
    399     "lea    (%eax,%edi,2),%eax                 \n"
    400     "movdqa %xmm0,%xmm7                        \n"
    401     "punpcklbw %xmm1,%xmm0                     \n"
    402     "punpckhbw %xmm1,%xmm7                     \n"
    403     "movdqa %xmm7,%xmm1                        \n"
    404     "movdqa (%eax),%xmm2                       \n"
    405     "movdqa (%eax,%edi,1),%xmm3                \n"
    406     "lea    (%eax,%edi,2),%eax                 \n"
    407     "movdqa %xmm2,%xmm7                        \n"
    408     "punpcklbw %xmm3,%xmm2                     \n"
    409     "punpckhbw %xmm3,%xmm7                     \n"
    410     "movdqa %xmm7,%xmm3                        \n"
    411     "movdqa (%eax),%xmm4                       \n"
    412     "movdqa (%eax,%edi,1),%xmm5                \n"
    413     "lea    (%eax,%edi,2),%eax                 \n"
    414     "movdqa %xmm4,%xmm7                        \n"
    415     "punpcklbw %xmm5,%xmm4                     \n"
    416     "punpckhbw %xmm5,%xmm7                     \n"
    417     "movdqa %xmm7,%xmm5                        \n"
    418     "movdqa (%eax),%xmm6                       \n"
    419     "movdqa (%eax,%edi,1),%xmm7                \n"
    420     "lea    (%eax,%edi,2),%eax                 \n"
    421     "movdqa %xmm5,(%esp)                       \n"
    422     "neg    %edi                               \n"
    423     "movdqa %xmm6,%xmm5                        \n"
    424     "punpcklbw %xmm7,%xmm6                     \n"
    425     "punpckhbw %xmm7,%xmm5                     \n"
    426     "movdqa %xmm5,%xmm7                        \n"
    427     "lea    0x10(%eax,%edi,8),%eax             \n"
    428     "neg    %edi                               \n"
    429     "movdqa %xmm0,%xmm5                        \n"
    430     "punpcklwd %xmm2,%xmm0                     \n"
    431     "punpckhwd %xmm2,%xmm5                     \n"
    432     "movdqa %xmm5,%xmm2                        \n"
    433     "movdqa %xmm1,%xmm5                        \n"
    434     "punpcklwd %xmm3,%xmm1                     \n"
    435     "punpckhwd %xmm3,%xmm5                     \n"
    436     "movdqa %xmm5,%xmm3                        \n"
    437     "movdqa %xmm4,%xmm5                        \n"
    438     "punpcklwd %xmm6,%xmm4                     \n"
    439     "punpckhwd %xmm6,%xmm5                     \n"
    440     "movdqa %xmm5,%xmm6                        \n"
    441     "movdqa (%esp),%xmm5                       \n"
    442     "movdqa %xmm6,(%esp)                       \n"
    443     "movdqa %xmm5,%xmm6                        \n"
    444     "punpcklwd %xmm7,%xmm5                     \n"
    445     "punpckhwd %xmm7,%xmm6                     \n"
    446     "movdqa %xmm6,%xmm7                        \n"
    447     "movdqa %xmm0,%xmm6                        \n"
    448     "punpckldq %xmm4,%xmm0                     \n"
    449     "punpckhdq %xmm4,%xmm6                     \n"
    450     "movdqa %xmm6,%xmm4                        \n"
    451     "movdqa (%esp),%xmm6                       \n"
    452     "movlpd %xmm0,(%edx)                       \n"
    453     "movhpd %xmm0,(%ebx)                       \n"
    454     "movlpd %xmm4,(%edx,%esi,1)                \n"
    455     "lea    (%edx,%esi,2),%edx                 \n"
    456     "movhpd %xmm4,(%ebx,%ebp,1)                \n"
    457     "lea    (%ebx,%ebp,2),%ebx                 \n"
    458     "movdqa %xmm2,%xmm0                        \n"
    459     "punpckldq %xmm6,%xmm2                     \n"
    460     "movlpd %xmm2,(%edx)                       \n"
    461     "movhpd %xmm2,(%ebx)                       \n"
    462     "punpckhdq %xmm6,%xmm0                     \n"
    463     "movlpd %xmm0,(%edx,%esi,1)                \n"
    464     "lea    (%edx,%esi,2),%edx                 \n"
    465     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
    466     "lea    (%ebx,%ebp,2),%ebx                 \n"
    467     "movdqa %xmm1,%xmm0                        \n"
    468     "punpckldq %xmm5,%xmm1                     \n"
    469     "movlpd %xmm1,(%edx)                       \n"
    470     "movhpd %xmm1,(%ebx)                       \n"
    471     "punpckhdq %xmm5,%xmm0                     \n"
    472     "movlpd %xmm0,(%edx,%esi,1)                \n"
    473     "lea    (%edx,%esi,2),%edx                 \n"
    474     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
    475     "lea    (%ebx,%ebp,2),%ebx                 \n"
    476     "movdqa %xmm3,%xmm0                        \n"
    477     "punpckldq %xmm7,%xmm3                     \n"
    478     "movlpd %xmm3,(%edx)                       \n"
    479     "movhpd %xmm3,(%ebx)                       \n"
    480     "punpckhdq %xmm7,%xmm0                     \n"
    481     "sub    $0x8,%ecx                          \n"
    482     "movlpd %xmm0,(%edx,%esi,1)                \n"
    483     "lea    (%edx,%esi,2),%edx                 \n"
    484     "movhpd %xmm0,(%ebx,%ebp,1)                \n"
    485     "lea    (%ebx,%ebp,2),%ebx                 \n"
    486     "jg     1b                                 \n"
    487     "mov    0x10(%esp),%esp                    \n"
    488     "pop    %ebp                               \n"
    489     "pop    %edi                               \n"
    490     "pop    %esi                               \n"
    491     "pop    %ebx                               \n"
    492     "ret                                       \n"
    493 );
    494 #elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
    495 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
    496 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
    497 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
    498                                     uint8* dst, int dst_stride, int width) {
    499   asm volatile (
    500   // Read in the data from the source pointer.
    501   // First round of bit swap.
    502   ".p2align  4                                 \n"
    503 "1:                                            \n"
    504   "movdqa     (%0),%%xmm0                      \n"
    505   "movdqa     (%0,%3),%%xmm1                   \n"
    506   "lea        (%0,%3,2),%0                     \n"
    507   "movdqa     %%xmm0,%%xmm8                    \n"
    508   "punpcklbw  %%xmm1,%%xmm0                    \n"
    509   "punpckhbw  %%xmm1,%%xmm8                    \n"
    510   "movdqa     (%0),%%xmm2                      \n"
    511   "movdqa     %%xmm0,%%xmm1                    \n"
    512   "movdqa     %%xmm8,%%xmm9                    \n"
    513   "palignr    $0x8,%%xmm1,%%xmm1               \n"
    514   "palignr    $0x8,%%xmm9,%%xmm9               \n"
    515   "movdqa     (%0,%3),%%xmm3                   \n"
    516   "lea        (%0,%3,2),%0                     \n"
    517   "movdqa     %%xmm2,%%xmm10                   \n"
    518   "punpcklbw  %%xmm3,%%xmm2                    \n"
    519   "punpckhbw  %%xmm3,%%xmm10                   \n"
    520   "movdqa     %%xmm2,%%xmm3                    \n"
    521   "movdqa     %%xmm10,%%xmm11                  \n"
    522   "movdqa     (%0),%%xmm4                      \n"
    523   "palignr    $0x8,%%xmm3,%%xmm3               \n"
    524   "palignr    $0x8,%%xmm11,%%xmm11             \n"
    525   "movdqa     (%0,%3),%%xmm5                   \n"
    526   "lea        (%0,%3,2),%0                     \n"
    527   "movdqa     %%xmm4,%%xmm12                   \n"
    528   "punpcklbw  %%xmm5,%%xmm4                    \n"
    529   "punpckhbw  %%xmm5,%%xmm12                   \n"
    530   "movdqa     %%xmm4,%%xmm5                    \n"
    531   "movdqa     %%xmm12,%%xmm13                  \n"
    532   "movdqa     (%0),%%xmm6                      \n"
    533   "palignr    $0x8,%%xmm5,%%xmm5               \n"
    534   "palignr    $0x8,%%xmm13,%%xmm13             \n"
    535   "movdqa     (%0,%3),%%xmm7                   \n"
    536   "lea        (%0,%3,2),%0                     \n"
    537   "movdqa     %%xmm6,%%xmm14                   \n"
    538   "punpcklbw  %%xmm7,%%xmm6                    \n"
    539   "punpckhbw  %%xmm7,%%xmm14                   \n"
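           // The stride (%3) is negated around the lea below so that src is
           // rewound by the 8 rows just read and advanced 16 bytes; this
           // variant transposes 16 columns per pass.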
    540   "neg        %3                               \n"
    541   "movdqa     %%xmm6,%%xmm7                    \n"
    542   "movdqa     %%xmm14,%%xmm15                  \n"
    543   "lea        0x10(%0,%3,8),%0                 \n"
    544   "palignr    $0x8,%%xmm7,%%xmm7               \n"
    545   "palignr    $0x8,%%xmm15,%%xmm15             \n"
    546   "neg        %3                               \n"
    547    // Second round of bit swap.
    548   "punpcklwd  %%xmm2,%%xmm0                    \n"
    549   "punpcklwd  %%xmm3,%%xmm1                    \n"
    550   "movdqa     %%xmm0,%%xmm2                    \n"
    551   "movdqa     %%xmm1,%%xmm3                    \n"
    552   "palignr    $0x8,%%xmm2,%%xmm2               \n"
    553   "palignr    $0x8,%%xmm3,%%xmm3               \n"
    554   "punpcklwd  %%xmm6,%%xmm4                    \n"
    555   "punpcklwd  %%xmm7,%%xmm5                    \n"
    556   "movdqa     %%xmm4,%%xmm6                    \n"
    557   "movdqa     %%xmm5,%%xmm7                    \n"
    558   "palignr    $0x8,%%xmm6,%%xmm6               \n"
    559   "palignr    $0x8,%%xmm7,%%xmm7               \n"
    560   "punpcklwd  %%xmm10,%%xmm8                   \n"
    561   "punpcklwd  %%xmm11,%%xmm9                   \n"
    562   "movdqa     %%xmm8,%%xmm10                   \n"
    563   "movdqa     %%xmm9,%%xmm11                   \n"
    564   "palignr    $0x8,%%xmm10,%%xmm10             \n"
    565   "palignr    $0x8,%%xmm11,%%xmm11             \n"
    566   "punpcklwd  %%xmm14,%%xmm12                  \n"
    567   "punpcklwd  %%xmm15,%%xmm13                  \n"
    568   "movdqa     %%xmm12,%%xmm14                  \n"
    569   "movdqa     %%xmm13,%%xmm15                  \n"
    570   "palignr    $0x8,%%xmm14,%%xmm14             \n"
    571   "palignr    $0x8,%%xmm15,%%xmm15             \n"
    572   // Third round of bit swap.
    573   // Write to the destination pointer.
    574   "punpckldq  %%xmm4,%%xmm0                    \n"
    575   "movq       %%xmm0,(%1)                      \n"
    576   "movdqa     %%xmm0,%%xmm4                    \n"
    577   "palignr    $0x8,%%xmm4,%%xmm4               \n"
    578   "movq       %%xmm4,(%1,%4)                   \n"
    579   "lea        (%1,%4,2),%1                     \n"
    580   "punpckldq  %%xmm6,%%xmm2                    \n"
    581   "movdqa     %%xmm2,%%xmm6                    \n"
    582   "movq       %%xmm2,(%1)                      \n"
    583   "palignr    $0x8,%%xmm6,%%xmm6               \n"
    584   "punpckldq  %%xmm5,%%xmm1                    \n"
    585   "movq       %%xmm6,(%1,%4)                   \n"
    586   "lea        (%1,%4,2),%1                     \n"
    587   "movdqa     %%xmm1,%%xmm5                    \n"
    588   "movq       %%xmm1,(%1)                      \n"
    589   "palignr    $0x8,%%xmm5,%%xmm5               \n"
    590   "movq       %%xmm5,(%1,%4)                   \n"
    591   "lea        (%1,%4,2),%1                     \n"
    592   "punpckldq  %%xmm7,%%xmm3                    \n"
    593   "movq       %%xmm3,(%1)                      \n"
    594   "movdqa     %%xmm3,%%xmm7                    \n"
    595   "palignr    $0x8,%%xmm7,%%xmm7               \n"
    596   "movq       %%xmm7,(%1,%4)                   \n"
    597   "lea        (%1,%4,2),%1                     \n"
    598   "punpckldq  %%xmm12,%%xmm8                   \n"
    599   "movq       %%xmm8,(%1)                      \n"
    600   "movdqa     %%xmm8,%%xmm12                   \n"
    601   "palignr    $0x8,%%xmm12,%%xmm12             \n"
    602   "movq       %%xmm12,(%1,%4)                  \n"
    603   "lea        (%1,%4,2),%1                     \n"
    604   "punpckldq  %%xmm14,%%xmm10                  \n"
    605   "movdqa     %%xmm10,%%xmm14                  \n"
    606   "movq       %%xmm10,(%1)                     \n"
    607   "palignr    $0x8,%%xmm14,%%xmm14             \n"
    608   "punpckldq  %%xmm13,%%xmm9                   \n"
    609   "movq       %%xmm14,(%1,%4)                  \n"
    610   "lea        (%1,%4,2),%1                     \n"
    611   "movdqa     %%xmm9,%%xmm13                   \n"
    612   "movq       %%xmm9,(%1)                      \n"
    613   "palignr    $0x8,%%xmm13,%%xmm13             \n"
    614   "movq       %%xmm13,(%1,%4)                  \n"
    615   "lea        (%1,%4,2),%1                     \n"
    616   "punpckldq  %%xmm15,%%xmm11                  \n"
    617   "movq       %%xmm11,(%1)                     \n"
    618   "movdqa     %%xmm11,%%xmm15                  \n"
    619   "palignr    $0x8,%%xmm15,%%xmm15             \n"
    620   "sub        $0x10,%2                         \n"
    621   "movq       %%xmm15,(%1,%4)                  \n"
    622   "lea        (%1,%4,2),%1                     \n"
    623   "jg         1b                               \n"
    624   : "+r"(src),    // %0
    625     "+r"(dst),    // %1
    626     "+r"(width)   // %2
    627   : "r"(static_cast<intptr_t>(src_stride)),  // %3
    628     "r"(static_cast<intptr_t>(dst_stride))   // %4
    629   : "memory", "cc",
    630     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    631     "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
    632 );
    633 }
    634 
    635 #define HAS_TRANSPOSE_UVWX8_SSE2
    636 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
    637                                 uint8* dst_a, int dst_stride_a,
    638                                 uint8* dst_b, int dst_stride_b,
    639                                 int w) {
    640   asm volatile (
    641   // Read in the data from the source pointer.
    642   // First round of bit swap.
    643   ".p2align  4                                 \n"
    644 "1:                                            \n"
    645   "movdqa     (%0),%%xmm0                      \n"
    646   "movdqa     (%0,%4),%%xmm1                   \n"
    647   "lea        (%0,%4,2),%0                     \n"
    648   "movdqa     %%xmm0,%%xmm8                    \n"
    649   "punpcklbw  %%xmm1,%%xmm0                    \n"
    650   "punpckhbw  %%xmm1,%%xmm8                    \n"
    651   "movdqa     %%xmm8,%%xmm1                    \n"
    652   "movdqa     (%0),%%xmm2                      \n"
    653   "movdqa     (%0,%4),%%xmm3                   \n"
    654   "lea        (%0,%4,2),%0                     \n"
    655   "movdqa     %%xmm2,%%xmm8                    \n"
    656   "punpcklbw  %%xmm3,%%xmm2                    \n"
    657   "punpckhbw  %%xmm3,%%xmm8                    \n"
    658   "movdqa     %%xmm8,%%xmm3                    \n"
    659   "movdqa     (%0),%%xmm4                      \n"
    660   "movdqa     (%0,%4),%%xmm5                   \n"
    661   "lea        (%0,%4,2),%0                     \n"
    662   "movdqa     %%xmm4,%%xmm8                    \n"
    663   "punpcklbw  %%xmm5,%%xmm4                    \n"
    664   "punpckhbw  %%xmm5,%%xmm8                    \n"
    665   "movdqa     %%xmm8,%%xmm5                    \n"
    666   "movdqa     (%0),%%xmm6                      \n"
    667   "movdqa     (%0,%4),%%xmm7                   \n"
    668   "lea        (%0,%4,2),%0                     \n"
    669   "movdqa     %%xmm6,%%xmm8                    \n"
    670   "punpcklbw  %%xmm7,%%xmm6                    \n"
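           // Negate the stride (%4) around the lea below to rewind src by the
           // 8 rows just read and advance 16 bytes (8 UV pairs) to the next
           // tile.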
    671   "neg        %4                               \n"
    672   "lea        0x10(%0,%4,8),%0                 \n"
    673   "punpckhbw  %%xmm7,%%xmm8                    \n"
    674   "movdqa     %%xmm8,%%xmm7                    \n"
    675   "neg        %4                               \n"
    676    // Second round of bit swap.
    677   "movdqa     %%xmm0,%%xmm8                    \n"
    678   "movdqa     %%xmm1,%%xmm9                    \n"
    679   "punpckhwd  %%xmm2,%%xmm8                    \n"
    680   "punpckhwd  %%xmm3,%%xmm9                    \n"
    681   "punpcklwd  %%xmm2,%%xmm0                    \n"
    682   "punpcklwd  %%xmm3,%%xmm1                    \n"
    683   "movdqa     %%xmm8,%%xmm2                    \n"
    684   "movdqa     %%xmm9,%%xmm3                    \n"
    685   "movdqa     %%xmm4,%%xmm8                    \n"
    686   "movdqa     %%xmm5,%%xmm9                    \n"
    687   "punpckhwd  %%xmm6,%%xmm8                    \n"
    688   "punpckhwd  %%xmm7,%%xmm9                    \n"
    689   "punpcklwd  %%xmm6,%%xmm4                    \n"
    690   "punpcklwd  %%xmm7,%%xmm5                    \n"
    691   "movdqa     %%xmm8,%%xmm6                    \n"
    692   "movdqa     %%xmm9,%%xmm7                    \n"
    693   // Third round of bit swap.
    694   // Write to the destination pointer.
    695   "movdqa     %%xmm0,%%xmm8                    \n"
    696   "punpckldq  %%xmm4,%%xmm0                    \n"
    697   "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
    698   "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
    699   "punpckhdq  %%xmm4,%%xmm8                    \n"
    700   "movlpd     %%xmm8,(%1,%5)                   \n"
    701   "lea        (%1,%5,2),%1                     \n"
    702   "movhpd     %%xmm8,(%2,%6)                   \n"
    703   "lea        (%2,%6,2),%2                     \n"
    704   "movdqa     %%xmm2,%%xmm8                    \n"
    705   "punpckldq  %%xmm6,%%xmm2                    \n"
    706   "movlpd     %%xmm2,(%1)                      \n"
    707   "movhpd     %%xmm2,(%2)                      \n"
    708   "punpckhdq  %%xmm6,%%xmm8                    \n"
    709   "movlpd     %%xmm8,(%1,%5)                   \n"
    710   "lea        (%1,%5,2),%1                     \n"
    711   "movhpd     %%xmm8,(%2,%6)                   \n"
    712   "lea        (%2,%6,2),%2                     \n"
    713   "movdqa     %%xmm1,%%xmm8                    \n"
    714   "punpckldq  %%xmm5,%%xmm1                    \n"
    715   "movlpd     %%xmm1,(%1)                      \n"
    716   "movhpd     %%xmm1,(%2)                      \n"
    717   "punpckhdq  %%xmm5,%%xmm8                    \n"
    718   "movlpd     %%xmm8,(%1,%5)                   \n"
    719   "lea        (%1,%5,2),%1                     \n"
    720   "movhpd     %%xmm8,(%2,%6)                   \n"
    721   "lea        (%2,%6,2),%2                     \n"
    722   "movdqa     %%xmm3,%%xmm8                    \n"
    723   "punpckldq  %%xmm7,%%xmm3                    \n"
    724   "movlpd     %%xmm3,(%1)                      \n"
    725   "movhpd     %%xmm3,(%2)                      \n"
    726   "punpckhdq  %%xmm7,%%xmm8                    \n"
    727   "sub        $0x8,%3                          \n"
    728   "movlpd     %%xmm8,(%1,%5)                   \n"
    729   "lea        (%1,%5,2),%1                     \n"
    730   "movhpd     %%xmm8,(%2,%6)                   \n"
    731   "lea        (%2,%6,2),%2                     \n"
    732   "jg         1b                               \n"
    733   : "+r"(src),    // %0
    734     "+r"(dst_a),  // %1
    735     "+r"(dst_b),  // %2
    736     "+r"(w)   // %3
    737   : "r"(static_cast<intptr_t>(src_stride)),    // %4
    738     "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
    739     "r"(static_cast<intptr_t>(dst_stride_b))   // %6
    740   : "memory", "cc",
    741     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    742     "xmm8", "xmm9"
    743 );
    744 }
    745 #endif
    746 #endif
    747 
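         // Portable reference transpose: each iteration copies one 8-pixel
         // source column into eight consecutive destination bytes.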
    748 static void TransposeWx8_C(const uint8* src, int src_stride,
    749                            uint8* dst, int dst_stride,
    750                            int width) {
    751   for (int i = 0; i < width; ++i) {
    752     dst[0] = src[0 * src_stride];
    753     dst[1] = src[1 * src_stride];
    754     dst[2] = src[2 * src_stride];
    755     dst[3] = src[3 * src_stride];
    756     dst[4] = src[4 * src_stride];
    757     dst[5] = src[5 * src_stride];
    758     dst[6] = src[6 * src_stride];
    759     dst[7] = src[7 * src_stride];
    760     ++src;
    761     dst += dst_stride;
    762   }
    763 }
    764 
    765 static void TransposeWxH_C(const uint8* src, int src_stride,
    766                            uint8* dst, int dst_stride,
    767                            int width, int height) {
    768   for (int i = 0; i < width; ++i) {
    769     for (int j = 0; j < height; ++j) {
    770       dst[i * dst_stride + j] = src[j * src_stride + i];
    771     }
    772   }
    773 }
    774 
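         // Transposes a width x height plane into a height x width plane:
         // dst must provide width rows of at least height bytes
         // (dst_stride >= height).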
    775 LIBYUV_API
    776 void TransposePlane(const uint8* src, int src_stride,
    777                     uint8* dst, int dst_stride,
    778                     int width, int height) {
    779   void (*TransposeWx8)(const uint8* src, int src_stride,
    780                        uint8* dst, int dst_stride,
    781                        int width) = TransposeWx8_C;
    782 #if defined(HAS_TRANSPOSE_WX8_NEON)
    783   if (TestCpuFlag(kCpuHasNEON)) {
    784     TransposeWx8 = TransposeWx8_NEON;
    785   }
    786 #endif
    787 #if defined(HAS_TRANSPOSE_WX8_SSSE3)
    788   if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
    789     TransposeWx8 = TransposeWx8_SSSE3;
    790   }
    791 #endif
    792 #if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
    793   if (TestCpuFlag(kCpuHasSSSE3) &&
    794       IS_ALIGNED(width, 16) &&
    795       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    796     TransposeWx8 = TransposeWx8_FAST_SSSE3;
    797   }
    798 #endif
    799 
     800   // Work across the source in 8x8 tiles.
    801   int i = height;
    802   while (i >= 8) {
    803     TransposeWx8(src, src_stride, dst, dst_stride, width);
    804     src += 8 * src_stride;    // Go down 8 rows.
    805     dst += 8;                 // Move over 8 columns.
    806     i -= 8;
    807   }
    808 
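           // Transpose any remaining rows (fewer than 8) with the portable C
           // version.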
    809   TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
    810 }
    811 
    812 LIBYUV_API
    813 void RotatePlane90(const uint8* src, int src_stride,
    814                    uint8* dst, int dst_stride,
    815                    int width, int height) {
    816   // Rotate by 90 is a transpose with the source read
    817   // from bottom to top. So set the source pointer to the end
    818   // of the buffer and flip the sign of the source stride.
    819   src += src_stride * (height - 1);
    820   src_stride = -src_stride;
    821   TransposePlane(src, src_stride, dst, dst_stride, width, height);
    822 }
    823 
    824 LIBYUV_API
    825 void RotatePlane270(const uint8* src, int src_stride,
    826                     uint8* dst, int dst_stride,
    827                     int width, int height) {
    828   // Rotate by 270 is a transpose with the destination written
    829   // from bottom to top. So set the destination pointer to the end
    830   // of the buffer and flip the sign of the destination stride.
    831   dst += dst_stride * (width - 1);
    832   dst_stride = -dst_stride;
    833   TransposePlane(src, src_stride, dst, dst_stride, width, height);
    834 }
    835 
    836 LIBYUV_API
    837 void RotatePlane180(const uint8* src, int src_stride,
    838                     uint8* dst, int dst_stride,
    839                     int width, int height) {
    840   void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
    841 #if defined(HAS_MIRRORROW_NEON)
    842   if (TestCpuFlag(kCpuHasNEON)) {
    843     MirrorRow = MirrorRow_NEON;
    844   }
    845 #endif
    846 #if defined(HAS_MIRRORROW_SSE2)
    847   if (TestCpuFlag(kCpuHasSSE2) &&
    848       IS_ALIGNED(width, 16) &&
    849       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
    850       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    851     MirrorRow = MirrorRow_SSE2;
    852   }
    853 #endif
    854 #if defined(HAS_MIRRORROW_SSSE3)
    855   if (TestCpuFlag(kCpuHasSSSE3) &&
    856       IS_ALIGNED(width, 16) &&
    857       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
    858       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    859     MirrorRow = MirrorRow_SSSE3;
    860   }
    861 #endif
    862   void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
    863 #if defined(HAS_COPYROW_NEON)
    864   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
    865     CopyRow = CopyRow_NEON;
    866   }
    867 #endif
    868 #if defined(HAS_COPYROW_X86)
    869   if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
    870     CopyRow = CopyRow_X86;
    871   }
    872 #endif
    873 #if defined(HAS_COPYROW_SSE2)
    874   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
    875       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
    876       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    877     CopyRow = CopyRow_SSE2;
    878   }
    879 #endif
    880   if (width > kMaxStride) {
    881     return;
    882   }
    883   // Swap first and last row and mirror the content. Uses a temporary row.
    884   SIMD_ALIGNED(uint8 row[kMaxStride]);
    885   const uint8* src_bot = src + src_stride * (height - 1);
    886   uint8* dst_bot = dst + dst_stride * (height - 1);
    887   int half_height = (height + 1) >> 1;
    888   // Odd height will harmlessly mirror the middle row twice.
    889   for (int y = 0; y < half_height; ++y) {
    890     MirrorRow(src, row, width);  // Mirror first row into a buffer
    891     src += src_stride;
    892     MirrorRow(src_bot, dst, width);  // Mirror last row into first row
    893     dst += dst_stride;
    894     CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
    895     src_bot -= src_stride;
    896     dst_bot -= dst_stride;
    897   }
    898 }
    899 
    900 static void TransposeUVWx8_C(const uint8* src, int src_stride,
    901                              uint8* dst_a, int dst_stride_a,
    902                              uint8* dst_b, int dst_stride_b,
    903                              int width) {
    904   for (int i = 0; i < width; ++i) {
    905     dst_a[0] = src[0 * src_stride + 0];
    906     dst_b[0] = src[0 * src_stride + 1];
    907     dst_a[1] = src[1 * src_stride + 0];
    908     dst_b[1] = src[1 * src_stride + 1];
    909     dst_a[2] = src[2 * src_stride + 0];
    910     dst_b[2] = src[2 * src_stride + 1];
    911     dst_a[3] = src[3 * src_stride + 0];
    912     dst_b[3] = src[3 * src_stride + 1];
    913     dst_a[4] = src[4 * src_stride + 0];
    914     dst_b[4] = src[4 * src_stride + 1];
    915     dst_a[5] = src[5 * src_stride + 0];
    916     dst_b[5] = src[5 * src_stride + 1];
    917     dst_a[6] = src[6 * src_stride + 0];
    918     dst_b[6] = src[6 * src_stride + 1];
    919     dst_a[7] = src[7 * src_stride + 0];
    920     dst_b[7] = src[7 * src_stride + 1];
    921     src += 2;
    922     dst_a += dst_stride_a;
    923     dst_b += dst_stride_b;
    924   }
    925 }
    926 
    927 static void TransposeUVWxH_C(const uint8* src, int src_stride,
    928                              uint8* dst_a, int dst_stride_a,
    929                              uint8* dst_b, int dst_stride_b,
    930                              int width, int height) {
    931   for (int i = 0; i < width * 2; i += 2)
    932     for (int j = 0; j < height; ++j) {
    933       dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
    934       dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    935     }
    936 }
    937 
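         // Transposes a plane of interleaved two-byte elements (such as a UV
         // plane), splitting the channels into dst_a and dst_b. Each
         // destination needs width rows of at least height bytes.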
    938 LIBYUV_API
    939 void TransposeUV(const uint8* src, int src_stride,
    940                  uint8* dst_a, int dst_stride_a,
    941                  uint8* dst_b, int dst_stride_b,
    942                  int width, int height) {
    943   void (*TransposeUVWx8)(const uint8* src, int src_stride,
    944                          uint8* dst_a, int dst_stride_a,
    945                          uint8* dst_b, int dst_stride_b,
    946                          int width) = TransposeUVWx8_C;
    947 #if defined(HAS_TRANSPOSE_UVWX8_NEON)
    948   if (TestCpuFlag(kCpuHasNEON)) {
    949     TransposeUVWx8 = TransposeUVWx8_NEON;
    950   }
    951 #elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
    952   if (TestCpuFlag(kCpuHasSSE2) &&
    953       IS_ALIGNED(width, 8) &&
    954       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    955     TransposeUVWx8 = TransposeUVWx8_SSE2;
    956   }
    957 #endif
    958 
    959   // Work through the source in 8x8 tiles.
    960   int i = height;
    961   while (i >= 8) {
    962     TransposeUVWx8(src, src_stride,
    963                    dst_a, dst_stride_a,
    964                    dst_b, dst_stride_b,
    965                    width);
    966     src += 8 * src_stride;    // Go down 8 rows.
    967     dst_a += 8;               // Move over 8 columns.
    968     dst_b += 8;               // Move over 8 columns.
    969     i -= 8;
    970   }
    971 
    972   TransposeUVWxH_C(src, src_stride,
    973                    dst_a, dst_stride_a,
    974                    dst_b, dst_stride_b,
    975                    width, i);
    976 }
    977 
    978 LIBYUV_API
    979 void RotateUV90(const uint8* src, int src_stride,
    980                 uint8* dst_a, int dst_stride_a,
    981                 uint8* dst_b, int dst_stride_b,
    982                 int width, int height) {
    983   src += src_stride * (height - 1);
    984   src_stride = -src_stride;
    985 
    986   TransposeUV(src, src_stride,
    987               dst_a, dst_stride_a,
    988               dst_b, dst_stride_b,
    989               width, height);
    990 }
    991 
    992 LIBYUV_API
    993 void RotateUV270(const uint8* src, int src_stride,
    994                  uint8* dst_a, int dst_stride_a,
    995                  uint8* dst_b, int dst_stride_b,
    996                  int width, int height) {
    997   dst_a += dst_stride_a * (width - 1);
    998   dst_b += dst_stride_b * (width - 1);
    999   dst_stride_a = -dst_stride_a;
   1000   dst_stride_b = -dst_stride_b;
   1001 
   1002   TransposeUV(src, src_stride,
   1003               dst_a, dst_stride_a,
   1004               dst_b, dst_stride_b,
   1005               width, height);
   1006 }
   1007 
   1008 // Rotate 180 is a horizontal and vertical flip.
   1009 LIBYUV_API
   1010 void RotateUV180(const uint8* src, int src_stride,
   1011                  uint8* dst_a, int dst_stride_a,
   1012                  uint8* dst_b, int dst_stride_b,
   1013                  int width, int height) {
   1014   void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
   1015       MirrorRowUV_C;
   1016 #if defined(HAS_MIRRORROW_UV_NEON)
   1017   if (TestCpuFlag(kCpuHasNEON)) {
   1018     MirrorRowUV = MirrorRowUV_NEON;
   1019   }
   1020 #elif defined(HAS_MIRRORROW_UV_SSSE3)
   1021   if (TestCpuFlag(kCpuHasSSSE3) &&
   1022       IS_ALIGNED(width, 16) &&
   1023       IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
   1024     MirrorRowUV = MirrorRowUV_SSSE3;
   1025   }
   1026 #endif
   1027 
   1028   dst_a += dst_stride_a * (height - 1);
   1029   dst_b += dst_stride_b * (height - 1);
   1030 
   1031   for (int i = 0; i < height; ++i) {
   1032     MirrorRowUV(src, dst_a, dst_b, width);
   1033     src += src_stride;
   1034     dst_a -= dst_stride_a;
   1035     dst_b -= dst_stride_b;
   1036   }
   1037 }
   1038 
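         // Illustrative use (not part of the original source): rotating a
         // width x height I420 frame by 90 or 270 degrees yields a
         // height x width frame, so the destination planes and strides must
         // be sized for the transposed dimensions. Assuming even dimensions
         // and tightly packed planes, a call might look like:
         //   I420Rotate(src_y, width, src_u, width / 2, src_v, width / 2,
         //              dst_y, height, dst_u, height / 2, dst_v, height / 2,
         //              width, height, kRotate90);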
   1039 LIBYUV_API
   1040 int I420Rotate(const uint8* src_y, int src_stride_y,
   1041                const uint8* src_u, int src_stride_u,
   1042                const uint8* src_v, int src_stride_v,
   1043                uint8* dst_y, int dst_stride_y,
   1044                uint8* dst_u, int dst_stride_u,
   1045                uint8* dst_v, int dst_stride_v,
   1046                int width, int height,
   1047                RotationMode mode) {
   1048   if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
   1049       !dst_y || !dst_u || !dst_v) {
   1050     return -1;
   1051   }
   1052   int halfwidth = (width + 1) >> 1;
   1053   int halfheight = (height + 1) >> 1;
   1054 
   1055   // Negative height means invert the image.
   1056   if (height < 0) {
   1057     height = -height;
   1058     halfheight = (height + 1) >> 1;
   1059     src_y = src_y + (height - 1) * src_stride_y;
   1060     src_u = src_u + (halfheight - 1) * src_stride_u;
   1061     src_v = src_v + (halfheight - 1) * src_stride_v;
   1062     src_stride_y = -src_stride_y;
   1063     src_stride_u = -src_stride_u;
   1064     src_stride_v = -src_stride_v;
   1065   }
   1066 
   1067   switch (mode) {
   1068     case kRotate0:
   1069       // copy frame
   1070       return I420Copy(src_y, src_stride_y,
   1071                       src_u, src_stride_u,
   1072                       src_v, src_stride_v,
   1073                       dst_y, dst_stride_y,
   1074                       dst_u, dst_stride_u,
   1075                       dst_v, dst_stride_v,
   1076                       width, height);
   1077     case kRotate90:
   1078       RotatePlane90(src_y, src_stride_y,
   1079                     dst_y, dst_stride_y,
   1080                     width, height);
   1081       RotatePlane90(src_u, src_stride_u,
   1082                     dst_u, dst_stride_u,
   1083                     halfwidth, halfheight);
   1084       RotatePlane90(src_v, src_stride_v,
   1085                     dst_v, dst_stride_v,
   1086                     halfwidth, halfheight);
   1087       return 0;
   1088     case kRotate270:
   1089       RotatePlane270(src_y, src_stride_y,
   1090                      dst_y, dst_stride_y,
   1091                      width, height);
   1092       RotatePlane270(src_u, src_stride_u,
   1093                      dst_u, dst_stride_u,
   1094                      halfwidth, halfheight);
   1095       RotatePlane270(src_v, src_stride_v,
   1096                      dst_v, dst_stride_v,
   1097                      halfwidth, halfheight);
   1098       return 0;
   1099     case kRotate180:
   1100       RotatePlane180(src_y, src_stride_y,
   1101                      dst_y, dst_stride_y,
   1102                      width, height);
   1103       RotatePlane180(src_u, src_stride_u,
   1104                      dst_u, dst_stride_u,
   1105                      halfwidth, halfheight);
   1106       RotatePlane180(src_v, src_stride_v,
   1107                      dst_v, dst_stride_v,
   1108                      halfwidth, halfheight);
   1109       return 0;
   1110     default:
   1111       break;
   1112   }
   1113   return -1;
   1114 }
   1115 
   1116 LIBYUV_API
   1117 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
   1118                      const uint8* src_uv, int src_stride_uv,
   1119                      uint8* dst_y, int dst_stride_y,
   1120                      uint8* dst_u, int dst_stride_u,
   1121                      uint8* dst_v, int dst_stride_v,
   1122                      int width, int height,
   1123                      RotationMode mode) {
   1124   if (!src_y || !src_uv || width <= 0 || height == 0 ||
   1125       !dst_y || !dst_u || !dst_v) {
   1126     return -1;
   1127   }
   1128   int halfwidth = (width + 1) >> 1;
   1129   int halfheight = (height + 1) >> 1;
   1130 
   1131   // Negative height means invert the image.
   1132   if (height < 0) {
   1133     height = -height;
   1134     halfheight = (height + 1) >> 1;
   1135     src_y = src_y + (height - 1) * src_stride_y;
   1136     src_uv = src_uv + (halfheight - 1) * src_stride_uv;
   1137     src_stride_y = -src_stride_y;
   1138     src_stride_uv = -src_stride_uv;
   1139   }
   1140 
   1141   switch (mode) {
   1142     case kRotate0:
   1143       // copy frame
   1144       return NV12ToI420(src_y, src_stride_y,
   1145                         src_uv, src_stride_uv,
   1146                         dst_y, dst_stride_y,
   1147                         dst_u, dst_stride_u,
   1148                         dst_v, dst_stride_v,
   1149                         width, height);
   1150     case kRotate90:
   1151       RotatePlane90(src_y, src_stride_y,
   1152                     dst_y, dst_stride_y,
   1153                     width, height);
   1154       RotateUV90(src_uv, src_stride_uv,
   1155                  dst_u, dst_stride_u,
   1156                  dst_v, dst_stride_v,
   1157                  halfwidth, halfheight);
   1158       return 0;
   1159     case kRotate270:
   1160       RotatePlane270(src_y, src_stride_y,
   1161                      dst_y, dst_stride_y,
   1162                      width, height);
   1163       RotateUV270(src_uv, src_stride_uv,
   1164                   dst_u, dst_stride_u,
   1165                   dst_v, dst_stride_v,
   1166                   halfwidth, halfheight);
   1167       return 0;
   1168     case kRotate180:
   1169       RotatePlane180(src_y, src_stride_y,
   1170                      dst_y, dst_stride_y,
   1171                      width, height);
   1172       RotateUV180(src_uv, src_stride_uv,
   1173                   dst_u, dst_stride_u,
   1174                   dst_v, dst_stride_v,
   1175                   halfwidth, halfheight);
   1176       return 0;
   1177     default:
   1178       break;
   1179   }
   1180   return -1;
   1181 }
   1182 
   1183 #ifdef __cplusplus
   1184 }  // extern "C"
   1185 }  // namespace libyuv
   1186 #endif
   1187