Home | History | Annotate | Download | only in source
      1 /*
      2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/rotate.h"
     12 
     13 #include "libyuv/cpu_id.h"
     14 #include "libyuv/convert.h"
     15 #include "libyuv/planar_functions.h"
     16 #include "libyuv/row.h"
     17 
     18 #ifdef __cplusplus
     19 namespace libyuv {
     20 extern "C" {
     21 #endif
     22 
     23 #if !defined(LIBYUV_DISABLE_X86) && \
     24     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
     25 #if defined(__APPLE__) && defined(__i386__)
     26 #define DECLARE_FUNCTION(name)                                                 \
     27     ".text                                     \n"                             \
     28     ".private_extern _" #name "                \n"                             \
     29     ".align 4,0x90                             \n"                             \
     30 "_" #name ":                                   \n"
     31 #elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
     32 #define DECLARE_FUNCTION(name)                                                 \
     33     ".text                                     \n"                             \
     34     ".align 4,0x90                             \n"                             \
     35 "_" #name ":                                   \n"
     36 #else
     37 #define DECLARE_FUNCTION(name)                                                 \
     38     ".text                                     \n"                             \
     39     ".align 4,0x90                             \n"                             \
     40 #name ":                                       \n"
     41 #endif
     42 #endif
     43 
     44 #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
     45     (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
     46 #define HAS_MIRRORROW_NEON
     47 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
     48 #define HAS_MIRRORROW_UV_NEON
     49 void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
     50 #define HAS_TRANSPOSE_WX8_NEON
     51 void TransposeWx8_NEON(const uint8* src, int src_stride,
     52                        uint8* dst, int dst_stride, int width);
     53 #define HAS_TRANSPOSE_UVWX8_NEON
     54 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
     55                          uint8* dst_a, int dst_stride_a,
     56                          uint8* dst_b, int dst_stride_b,
     57                          int width);
     58 #endif  // defined(__ARM_NEON__)
     59 
     60 #if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
     61     defined(__mips__) && \
     62     defined(__mips_dsp) && (__mips_dsp_rev >= 2)
     63 #define HAS_TRANSPOSE_WX8_MIPS_DSPR2
     64 void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
     65                              uint8* dst, int dst_stride, int width);
     66 
     67 void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
     68                                   uint8* dst, int dst_stride, int width);
     69 #define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
     70 void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
     71                                uint8* dst_a, int dst_stride_a,
     72                                uint8* dst_b, int dst_stride_b,
     73                                int width);
     74 #endif  // defined(__mips__)
     75 
     76 #if !defined(LIBYUV_DISABLE_X86) && \
     77     defined(_M_IX86) && defined(_MSC_VER)
     78 #define HAS_TRANSPOSE_WX8_SSSE3
// Transposes an 8-row strip of bytes: reads 8 rows of |width| bytes from
// |src| (rows |src_stride| apart) and writes |width| rows of 8 bytes each to
// |dst| (rows |dst_stride| apart), 8 columns per loop iteration.
// MSVC x86 inline-asm version (SSSE3 for palignr). The function is 'naked',
// so the prologue/epilogue (push/pop/ret) is written by hand and argument
// offsets include the 12 bytes of saved registers.
// NOTE(review): |width| appears to be assumed a multiple of 8 (loop
// subtracts 8 per pass with no tail handling) — confirm with callers.
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]
    // ebp remembers the start of the next 8-column group; eax walks down
    // the 8 source rows and is restored from ebp after the last load.
    lea       ebp, [eax + 8]
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    // palignr by 8 copies the high qword into the low qword position.
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    // sub is placed early so its flags are still valid at the jg below.
    sub       ecx, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}
    169 
    170 #define HAS_TRANSPOSE_UVWX8_SSE2
// Transposes an 8-row strip of an interleaved UV plane: reads 8 rows of
// 2*|w| interleaved U/V bytes from |src| and writes the de-interleaved
// transpose, 8 bytes per output row, to |dst_a| (even bytes) and |dst_b|
// (odd bytes). MSVC x86 inline-asm version using SSE2.
// NOTE(review): the loads use movdqa, which faults on unaligned addresses —
// presumably |src| is 16-byte aligned and |src_stride| a multiple of 16;
// confirm with callers.
// NOTE(review): |w| appears to be assumed a multiple of 8 (loop subtracts 8
// with no tail handling) — confirm.
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // All eight GPRs are committed, so a 16-byte-aligned spill slot is
    // carved out of the stack to serve as a ninth xmm register ([esp]);
    // the original esp is stashed at [esp + 16] and restored before ret.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w

    align      4
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqa    xmm2, [eax]
    movdqa    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqa    xmm4, [eax]
    movdqa    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqa    xmm6, [eax]
    movdqa    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    [esp], xmm5  // backup xmm5
    // Rewind eax by 8 rows and advance 16 columns: negate the stride,
    // lea back 8*stride + 16, then restore the stride's sign.
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqa    xmm5, [esp]  // restore xmm5
    movdqa    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5    // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    // movlpd stores the low qword (U bytes -> dst_a), movhpd the high
    // qword (V bytes -> dst_b).
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqa    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    // sub is placed early so its flags are still valid at the jg below.
    sub       ecx, 8
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    // Restore the pre-alignment stack pointer saved above.
    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
    299 #elif !defined(LIBYUV_DISABLE_X86) && \
    300     (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
    301 #define HAS_TRANSPOSE_WX8_SSSE3
// Transposes an 8-row strip of bytes: reads 8 rows of |width| bytes from
// |src| (rows %3 = src_stride apart) and writes |width| rows of 8 bytes each
// to |dst| (rows %4 = dst_stride apart). GCC extended-asm version (SSSE3 for
// palignr); mirrors the MSVC implementation above.
// NOTE(review): |width| appears to be assumed a multiple of 8 (loop
// subtracts 8 per pass with no tail handling) — confirm with callers.
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align  2                                 \n"
  "1:                                            \n"
    "movq       (%0),%%xmm0                      \n"
    "movq       (%0,%3),%%xmm1                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm1,%%xmm0                    \n"
    "movq       (%0),%%xmm2                      \n"
    "movdqa     %%xmm0,%%xmm1                    \n"
    // palignr by 8 copies the high qword into the low qword position.
    "palignr    $0x8,%%xmm1,%%xmm1               \n"
    "movq       (%0,%3),%%xmm3                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm3,%%xmm2                    \n"
    "movdqa     %%xmm2,%%xmm3                    \n"
    "movq       (%0),%%xmm4                      \n"
    "palignr    $0x8,%%xmm3,%%xmm3               \n"
    "movq       (%0,%3),%%xmm5                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm5,%%xmm4                    \n"
    "movdqa     %%xmm4,%%xmm5                    \n"
    "movq       (%0),%%xmm6                      \n"
    "palignr    $0x8,%%xmm5,%%xmm5               \n"
    "movq       (%0,%3),%%xmm7                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm7,%%xmm6                    \n"
    // Rewind src by 8 rows and advance 8 columns: negate stride, lea back
    // 8*stride + 8, restore stride's sign.
    "neg        %3                               \n"
    "movdqa     %%xmm6,%%xmm7                    \n"
    "lea        0x8(%0,%3,8),%0                  \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    "neg        %3                               \n"
     // Second round of bit swap.
    "punpcklwd  %%xmm2,%%xmm0                    \n"
    "punpcklwd  %%xmm3,%%xmm1                    \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    "palignr    $0x8,%%xmm2,%%xmm2               \n"
    "palignr    $0x8,%%xmm3,%%xmm3               \n"
    "punpcklwd  %%xmm6,%%xmm4                    \n"
    "punpcklwd  %%xmm7,%%xmm5                    \n"
    "movdqa     %%xmm4,%%xmm6                    \n"
    "movdqa     %%xmm5,%%xmm7                    \n"
    "palignr    $0x8,%%xmm6,%%xmm6               \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq  %%xmm4,%%xmm0                    \n"
    "movq       %%xmm0,(%1)                      \n"
    "movdqa     %%xmm0,%%xmm4                    \n"
    "palignr    $0x8,%%xmm4,%%xmm4               \n"
    "movq       %%xmm4,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm6,%%xmm2                    \n"
    "movdqa     %%xmm2,%%xmm6                    \n"
    "movq       %%xmm2,(%1)                      \n"
    "palignr    $0x8,%%xmm6,%%xmm6               \n"
    "punpckldq  %%xmm5,%%xmm1                    \n"
    "movq       %%xmm6,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "movdqa     %%xmm1,%%xmm5                    \n"
    "movq       %%xmm1,(%1)                      \n"
    "palignr    $0x8,%%xmm5,%%xmm5               \n"
    "movq       %%xmm5,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm7,%%xmm3                    \n"
    "movq       %%xmm3,(%1)                      \n"
    "movdqa     %%xmm3,%%xmm7                    \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    // sub is placed early so its flags are still valid at the jg below.
    "sub        $0x8,%2                          \n"
    "movq       %%xmm7,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "jg         1b                               \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    : "r"((intptr_t)(src_stride)),  // %3
      "r"((intptr_t)(dst_stride))   // %4
    : "memory", "cc"
  #if defined(__SSE2__)
      , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  #endif
  );
}
    388 
    389 #if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
    390 #define HAS_TRANSPOSE_UVWX8_SSE2
// Transposes an 8-row strip of an interleaved UV plane on 32-bit x86:
// reads 8 rows of interleaved U/V bytes from |src| and writes the
// de-interleaved transpose to |dst_a| (even bytes) and |dst_b| (odd bytes).
// Implemented as a file-scope asm() statement (not inline asm inside a C
// body) because the routine needs all eight GPRs plus a manually aligned
// stack spill slot; DECLARE_FUNCTION emits the label matching this
// C prototype, and arguments are fetched from the stack by offset
// (0x14 = first arg after 4 saved registers + return address).
// NOTE(review): movdqa loads require |src| 16-byte aligned and
// |src_stride| a multiple of 16 — presumably guaranteed by callers; confirm.
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int w);
  asm (
    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
    "push   %ebx                               \n"
    "push   %esi                               \n"
    "push   %edi                               \n"
    "push   %ebp                               \n"
    "mov    0x14(%esp),%eax                    \n"  // src
    "mov    0x18(%esp),%edi                    \n"  // src_stride
    "mov    0x1c(%esp),%edx                    \n"  // dst_a
    "mov    0x20(%esp),%esi                    \n"  // dst_stride_a
    "mov    0x24(%esp),%ebx                    \n"  // dst_b
    "mov    0x28(%esp),%ebp                    \n"  // dst_stride_b
    // Carve out a 16-byte-aligned spill slot at (%esp) to act as a ninth
    // xmm register; the original esp is stashed at 0x10(%esp) and restored
    // before returning.
    "mov    %esp,%ecx                          \n"
    "sub    $0x14,%esp                         \n"
    "and    $0xfffffff0,%esp                   \n"
    "mov    %ecx,0x10(%esp)                    \n"
    "mov    0x2c(%ecx),%ecx                    \n"  // w

"1:                                            \n"
    // First round of bit swap (byte interleave of row pairs).
    "movdqa (%eax),%xmm0                       \n"
    "movdqa (%eax,%edi,1),%xmm1                \n"
    "lea    (%eax,%edi,2),%eax                 \n"
    "movdqa %xmm0,%xmm7                        \n"
    "punpcklbw %xmm1,%xmm0                     \n"
    "punpckhbw %xmm1,%xmm7                     \n"
    "movdqa %xmm7,%xmm1                        \n"
    "movdqa (%eax),%xmm2                       \n"
    "movdqa (%eax,%edi,1),%xmm3                \n"
    "lea    (%eax,%edi,2),%eax                 \n"
    "movdqa %xmm2,%xmm7                        \n"
    "punpcklbw %xmm3,%xmm2                     \n"
    "punpckhbw %xmm3,%xmm7                     \n"
    "movdqa %xmm7,%xmm3                        \n"
    "movdqa (%eax),%xmm4                       \n"
    "movdqa (%eax,%edi,1),%xmm5                \n"
    "lea    (%eax,%edi,2),%eax                 \n"
    "movdqa %xmm4,%xmm7                        \n"
    "punpcklbw %xmm5,%xmm4                     \n"
    "punpckhbw %xmm5,%xmm7                     \n"
    "movdqa %xmm7,%xmm5                        \n"
    "movdqa (%eax),%xmm6                       \n"
    "movdqa (%eax,%edi,1),%xmm7                \n"
    "lea    (%eax,%edi,2),%eax                 \n"
    "movdqa %xmm5,(%esp)                       \n"  // backup xmm5
    // Rewind src by 8 rows and advance 16 columns via negated stride.
    "neg    %edi                               \n"
    "movdqa %xmm6,%xmm5                        \n"
    "punpcklbw %xmm7,%xmm6                     \n"
    "punpckhbw %xmm7,%xmm5                     \n"
    "movdqa %xmm5,%xmm7                        \n"
    "lea    0x10(%eax,%edi,8),%eax             \n"
    "neg    %edi                               \n"
    // Second round of bit swap (word interleave).
    "movdqa %xmm0,%xmm5                        \n"
    "punpcklwd %xmm2,%xmm0                     \n"
    "punpckhwd %xmm2,%xmm5                     \n"
    "movdqa %xmm5,%xmm2                        \n"
    "movdqa %xmm1,%xmm5                        \n"
    "punpcklwd %xmm3,%xmm1                     \n"
    "punpckhwd %xmm3,%xmm5                     \n"
    "movdqa %xmm5,%xmm3                        \n"
    "movdqa %xmm4,%xmm5                        \n"
    "punpcklwd %xmm6,%xmm4                     \n"
    "punpckhwd %xmm6,%xmm5                     \n"
    "movdqa %xmm5,%xmm6                        \n"
    "movdqa (%esp),%xmm5                       \n"  // restore xmm5
    "movdqa %xmm6,(%esp)                       \n"  // backup xmm6
    "movdqa %xmm5,%xmm6                        \n"
    "punpcklwd %xmm7,%xmm5                     \n"
    "punpckhwd %xmm7,%xmm6                     \n"
    "movdqa %xmm6,%xmm7                        \n"
    // Third round of bit swap (dword interleave), then write out:
    // movlpd stores the low qword to dst_a, movhpd the high qword to dst_b.
    "movdqa %xmm0,%xmm6                        \n"
    "punpckldq %xmm4,%xmm0                     \n"
    "punpckhdq %xmm4,%xmm6                     \n"
    "movdqa %xmm6,%xmm4                        \n"
    "movdqa (%esp),%xmm6                       \n"  // restore xmm6
    "movlpd %xmm0,(%edx)                       \n"
    "movhpd %xmm0,(%ebx)                       \n"
    "movlpd %xmm4,(%edx,%esi,1)                \n"
    "lea    (%edx,%esi,2),%edx                 \n"
    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
    "lea    (%ebx,%ebp,2),%ebx                 \n"
    "movdqa %xmm2,%xmm0                        \n"
    "punpckldq %xmm6,%xmm2                     \n"
    "movlpd %xmm2,(%edx)                       \n"
    "movhpd %xmm2,(%ebx)                       \n"
    "punpckhdq %xmm6,%xmm0                     \n"
    "movlpd %xmm0,(%edx,%esi,1)                \n"
    "lea    (%edx,%esi,2),%edx                 \n"
    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
    "lea    (%ebx,%ebp,2),%ebx                 \n"
    "movdqa %xmm1,%xmm0                        \n"
    "punpckldq %xmm5,%xmm1                     \n"
    "movlpd %xmm1,(%edx)                       \n"
    "movhpd %xmm1,(%ebx)                       \n"
    "punpckhdq %xmm5,%xmm0                     \n"
    "movlpd %xmm0,(%edx,%esi,1)                \n"
    "lea    (%edx,%esi,2),%edx                 \n"
    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
    "lea    (%ebx,%ebp,2),%ebx                 \n"
    "movdqa %xmm3,%xmm0                        \n"
    "punpckldq %xmm7,%xmm3                     \n"
    "movlpd %xmm3,(%edx)                       \n"
    "movhpd %xmm3,(%ebx)                       \n"
    "punpckhdq %xmm7,%xmm0                     \n"
    // sub is placed early so its flags are still valid at the jg below.
    "sub    $0x8,%ecx                          \n"
    "movlpd %xmm0,(%edx,%esi,1)                \n"
    "lea    (%edx,%esi,2),%edx                 \n"
    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
    "lea    (%ebx,%ebp,2),%ebx                 \n"
    "jg     1b                                 \n"
    "mov    0x10(%esp),%esp                    \n"  // restore original esp
    "pop    %ebp                               \n"
    "pop    %edi                               \n"
    "pop    %esi                               \n"
    "pop    %ebx                               \n"
#if defined(__native_client__)
    // Native Client sandbox requires an aligned, masked indirect return.
    "pop    %ecx                               \n"
    "and    $0xffffffe0,%ecx                   \n"
    "jmp    *%ecx                              \n"
#else
    "ret                                       \n"
#endif
);
    517 #elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
    518     defined(__x86_64__)
    519 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
    520 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
// x86_64 version of TransposeWx8: with xmm8-xmm15 available, it processes a
// 16-byte-wide column group per pass (16x8 -> 8x16) instead of 8, so the loop
// decrements |width| by 16. Reads 8 rows of |width| bytes from |src| and
// writes |width| rows of 8 bytes each to |dst|.
// NOTE(review): movdqa loads require |src| 16-byte aligned and |src_stride|
// a multiple of 16 — presumably guaranteed by callers; confirm.
// NOTE(review): |width| appears to be assumed a multiple of 16 here —
// confirm with callers.
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride, int width) {
  asm volatile (
  // Read in the data from the source pointer.
  // First round of bit swap.
  ".p2align  2                                 \n"
"1:                                            \n"
  "movdqa     (%0),%%xmm0                      \n"
  "movdqa     (%0,%3),%%xmm1                   \n"
  "lea        (%0,%3,2),%0                     \n"
  // xmm0-7 hold the low halves; xmm8-15 mirror the same work on the
  // high halves of each 16-byte row.
  "movdqa     %%xmm0,%%xmm8                    \n"
  "punpcklbw  %%xmm1,%%xmm0                    \n"
  "punpckhbw  %%xmm1,%%xmm8                    \n"
  "movdqa     (%0),%%xmm2                      \n"
  "movdqa     %%xmm0,%%xmm1                    \n"
  "movdqa     %%xmm8,%%xmm9                    \n"
  // palignr by 8 copies the high qword into the low qword position.
  "palignr    $0x8,%%xmm1,%%xmm1               \n"
  "palignr    $0x8,%%xmm9,%%xmm9               \n"
  "movdqa     (%0,%3),%%xmm3                   \n"
  "lea        (%0,%3,2),%0                     \n"
  "movdqa     %%xmm2,%%xmm10                   \n"
  "punpcklbw  %%xmm3,%%xmm2                    \n"
  "punpckhbw  %%xmm3,%%xmm10                   \n"
  "movdqa     %%xmm2,%%xmm3                    \n"
  "movdqa     %%xmm10,%%xmm11                  \n"
  "movdqa     (%0),%%xmm4                      \n"
  "palignr    $0x8,%%xmm3,%%xmm3               \n"
  "palignr    $0x8,%%xmm11,%%xmm11             \n"
  "movdqa     (%0,%3),%%xmm5                   \n"
  "lea        (%0,%3,2),%0                     \n"
  "movdqa     %%xmm4,%%xmm12                   \n"
  "punpcklbw  %%xmm5,%%xmm4                    \n"
  "punpckhbw  %%xmm5,%%xmm12                   \n"
  "movdqa     %%xmm4,%%xmm5                    \n"
  "movdqa     %%xmm12,%%xmm13                  \n"
  "movdqa     (%0),%%xmm6                      \n"
  "palignr    $0x8,%%xmm5,%%xmm5               \n"
  "palignr    $0x8,%%xmm13,%%xmm13             \n"
  "movdqa     (%0,%3),%%xmm7                   \n"
  "lea        (%0,%3,2),%0                     \n"
  "movdqa     %%xmm6,%%xmm14                   \n"
  "punpcklbw  %%xmm7,%%xmm6                    \n"
  "punpckhbw  %%xmm7,%%xmm14                   \n"
  // Rewind src by 8 rows and advance 16 columns: negate stride, lea back
  // 8*stride + 16, restore stride's sign.
  "neg        %3                               \n"
  "movdqa     %%xmm6,%%xmm7                    \n"
  "movdqa     %%xmm14,%%xmm15                  \n"
  "lea        0x10(%0,%3,8),%0                 \n"
  "palignr    $0x8,%%xmm7,%%xmm7               \n"
  "palignr    $0x8,%%xmm15,%%xmm15             \n"
  "neg        %3                               \n"
   // Second round of bit swap.
  "punpcklwd  %%xmm2,%%xmm0                    \n"
  "punpcklwd  %%xmm3,%%xmm1                    \n"
  "movdqa     %%xmm0,%%xmm2                    \n"
  "movdqa     %%xmm1,%%xmm3                    \n"
  "palignr    $0x8,%%xmm2,%%xmm2               \n"
  "palignr    $0x8,%%xmm3,%%xmm3               \n"
  "punpcklwd  %%xmm6,%%xmm4                    \n"
  "punpcklwd  %%xmm7,%%xmm5                    \n"
  "movdqa     %%xmm4,%%xmm6                    \n"
  "movdqa     %%xmm5,%%xmm7                    \n"
  "palignr    $0x8,%%xmm6,%%xmm6               \n"
  "palignr    $0x8,%%xmm7,%%xmm7               \n"
  "punpcklwd  %%xmm10,%%xmm8                   \n"
  "punpcklwd  %%xmm11,%%xmm9                   \n"
  "movdqa     %%xmm8,%%xmm10                   \n"
  "movdqa     %%xmm9,%%xmm11                   \n"
  "palignr    $0x8,%%xmm10,%%xmm10             \n"
  "palignr    $0x8,%%xmm11,%%xmm11             \n"
  "punpcklwd  %%xmm14,%%xmm12                  \n"
  "punpcklwd  %%xmm15,%%xmm13                  \n"
  "movdqa     %%xmm12,%%xmm14                  \n"
  "movdqa     %%xmm13,%%xmm15                  \n"
  "palignr    $0x8,%%xmm14,%%xmm14             \n"
  "palignr    $0x8,%%xmm15,%%xmm15             \n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "punpckldq  %%xmm4,%%xmm0                    \n"
  "movq       %%xmm0,(%1)                      \n"
  "movdqa     %%xmm0,%%xmm4                    \n"
  "palignr    $0x8,%%xmm4,%%xmm4               \n"
  "movq       %%xmm4,(%1,%4)                   \n"
  "lea        (%1,%4,2),%1                     \n"
  "punpckldq  %%xmm6,%%xmm2                    \n"
  "movdqa     %%xmm2,%%xmm6                    \n"
  "movq       %%xmm2,(%1)                      \n"
  "palignr    $0x8,%%xmm6,%%xmm6               \n"
  "punpckldq  %%xmm5,%%xmm1                    \n"
  "movq       %%xmm6,(%1,%4)                   \n"
  "lea        (%1,%4,2),%1                     \n"
  "movdqa     %%xmm1,%%xmm5                    \n"
  "movq       %%xmm1,(%1)                      \n"
  "palignr    $0x8,%%xmm5,%%xmm5               \n"
  "movq       %%xmm5,(%1,%4)                   \n"
  "lea        (%1,%4,2),%1                     \n"
  "punpckldq  %%xmm7,%%xmm3                    \n"
  "movq       %%xmm3,(%1)                      \n"
  "movdqa     %%xmm3,%%xmm7                    \n"
  "palignr    $0x8,%%xmm7,%%xmm7               \n"
  "movq       %%xmm7,(%1,%4)                   \n"
  "lea        (%1,%4,2),%1                     \n"
  // Rows 9-16 of the output come from the high-half registers xmm8-15.
  "punpckldq  %%xmm12,%%xmm8                   \n"
  "movq       %%xmm8,(%1)                      \n"
  "movdqa     %%xmm8,%%xmm12                   \n"
  "palignr    $0x8,%%xmm12,%%xmm12             \n"
  "movq       %%xmm12,(%1,%4)                  \n"
  "lea        (%1,%4,2),%1                     \n"
  "punpckldq  %%xmm14,%%xmm10                  \n"
  "movdqa     %%xmm10,%%xmm14                  \n"
  "movq       %%xmm10,(%1)                     \n"
  "palignr    $0x8,%%xmm14,%%xmm14             \n"
  "punpckldq  %%xmm13,%%xmm9                   \n"
  "movq       %%xmm14,(%1,%4)                  \n"
  "lea        (%1,%4,2),%1                     \n"
  "movdqa     %%xmm9,%%xmm13                   \n"
  "movq       %%xmm9,(%1)                      \n"
  "palignr    $0x8,%%xmm13,%%xmm13             \n"
  "movq       %%xmm13,(%1,%4)                  \n"
  "lea        (%1,%4,2),%1                     \n"
  "punpckldq  %%xmm15,%%xmm11                  \n"
  "movq       %%xmm11,(%1)                     \n"
  "movdqa     %%xmm11,%%xmm15                  \n"
  "palignr    $0x8,%%xmm15,%%xmm15             \n"
  // 16 columns consumed per pass; sub placed early so its flags survive
  // to the jg below.
  "sub        $0x10,%2                         \n"
  "movq       %%xmm15,(%1,%4)                  \n"
  "lea        (%1,%4,2),%1                     \n"
  "jg         1b                               \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(dst_stride))   // %4
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
);
}
    658 
// Transposes an 8-row strip of interleaved UV pairs, writing the U bytes to
// dst_a and the V bytes to dst_b.  Uses xmm8-xmm15, so this path is only
// compiled for 64-bit x86 (the guard is outside this function).
// %0=src, %1=dst_a, %2=dst_b, %3=w, %4=src_stride, %5=dst_stride_a,
// %6=dst_stride_b.  Requires 16-byte aligned src and src_stride (movdqa).
#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  asm volatile (
  // Read in the data from the source pointer.
  // First round of bit swap.
  ".p2align  2                                 \n"
"1:                                            \n"
  "movdqa     (%0),%%xmm0                      \n"
  "movdqa     (%0,%4),%%xmm1                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm0,%%xmm8                    \n"
  "punpcklbw  %%xmm1,%%xmm0                    \n"
  "punpckhbw  %%xmm1,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm1                    \n"
  "movdqa     (%0),%%xmm2                      \n"
  "movdqa     (%0,%4),%%xmm3                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm2,%%xmm8                    \n"
  "punpcklbw  %%xmm3,%%xmm2                    \n"
  "punpckhbw  %%xmm3,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm3                    \n"
  "movdqa     (%0),%%xmm4                      \n"
  "movdqa     (%0,%4),%%xmm5                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm4,%%xmm8                    \n"
  "punpcklbw  %%xmm5,%%xmm4                    \n"
  "punpckhbw  %%xmm5,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm5                    \n"
  "movdqa     (%0),%%xmm6                      \n"
  "movdqa     (%0,%4),%%xmm7                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm6,%%xmm8                    \n"
  "punpcklbw  %%xmm7,%%xmm6                    \n"
  // Rewind src: negate the stride, step back 8 strides and advance 16
  // bytes so the next iteration reads the next 16-byte-wide column of
  // the same 8 rows.  The stride is restored by the second neg below.
  "neg        %4                               \n"
  "lea        0x10(%0,%4,8),%0                 \n"
  "punpckhbw  %%xmm7,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm7                    \n"
  "neg        %4                               \n"
   // Second round of bit swap.
  "movdqa     %%xmm0,%%xmm8                    \n"
  "movdqa     %%xmm1,%%xmm9                    \n"
  "punpckhwd  %%xmm2,%%xmm8                    \n"
  "punpckhwd  %%xmm3,%%xmm9                    \n"
  "punpcklwd  %%xmm2,%%xmm0                    \n"
  "punpcklwd  %%xmm3,%%xmm1                    \n"
  "movdqa     %%xmm8,%%xmm2                    \n"
  "movdqa     %%xmm9,%%xmm3                    \n"
  "movdqa     %%xmm4,%%xmm8                    \n"
  "movdqa     %%xmm5,%%xmm9                    \n"
  "punpckhwd  %%xmm6,%%xmm8                    \n"
  "punpckhwd  %%xmm7,%%xmm9                    \n"
  "punpcklwd  %%xmm6,%%xmm4                    \n"
  "punpcklwd  %%xmm7,%%xmm5                    \n"
  "movdqa     %%xmm8,%%xmm6                    \n"
  "movdqa     %%xmm9,%%xmm7                    \n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "movdqa     %%xmm0,%%xmm8                    \n"
  "punpckldq  %%xmm4,%%xmm0                    \n"
  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
  "punpckhdq  %%xmm4,%%xmm8                    \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "movdqa     %%xmm2,%%xmm8                    \n"
  "punpckldq  %%xmm6,%%xmm2                    \n"
  "movlpd     %%xmm2,(%1)                      \n"
  "movhpd     %%xmm2,(%2)                      \n"
  "punpckhdq  %%xmm6,%%xmm8                    \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "movdqa     %%xmm1,%%xmm8                    \n"
  "punpckldq  %%xmm5,%%xmm1                    \n"
  "movlpd     %%xmm1,(%1)                      \n"
  "movhpd     %%xmm1,(%2)                      \n"
  "punpckhdq  %%xmm5,%%xmm8                    \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "movdqa     %%xmm3,%%xmm8                    \n"
  "punpckldq  %%xmm7,%%xmm3                    \n"
  "movlpd     %%xmm3,(%1)                      \n"
  "movhpd     %%xmm3,(%2)                      \n"
  "punpckhdq  %%xmm7,%%xmm8                    \n"
  // Decrement the pair counter here (away from the stores) so the flags
  // are still valid for the jg at the bottom of the loop.
  "sub        $0x8,%3                          \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "jg         1b                               \n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
    "+r"(w)   // %3
  : "r"((intptr_t)(src_stride)),    // %4
    "r"((intptr_t)(dst_stride_a)),  // %5
    "r"((intptr_t)(dst_stride_b))   // %6
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9"
);
}
    769 #endif
    770 #endif
    771 
    772 static void TransposeWx8_C(const uint8* src, int src_stride,
    773                            uint8* dst, int dst_stride,
    774                            int width) {
    775   int i;
    776   for (i = 0; i < width; ++i) {
    777     dst[0] = src[0 * src_stride];
    778     dst[1] = src[1 * src_stride];
    779     dst[2] = src[2 * src_stride];
    780     dst[3] = src[3 * src_stride];
    781     dst[4] = src[4 * src_stride];
    782     dst[5] = src[5 * src_stride];
    783     dst[6] = src[6 * src_stride];
    784     dst[7] = src[7 * src_stride];
    785     ++src;
    786     dst += dst_stride;
    787   }
    788 }
    789 
    790 static void TransposeWxH_C(const uint8* src, int src_stride,
    791                            uint8* dst, int dst_stride,
    792                            int width, int height) {
    793   int i;
    794   for (i = 0; i < width; ++i) {
    795     int j;
    796     for (j = 0; j < height; ++j) {
    797       dst[i * dst_stride + j] = src[j * src_stride + i];
    798     }
    799   }
    800 }
    801 
LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  int i = height;  // Rows remaining to transpose.
  // Strip transposer for Wx8 tiles.  Start with the portable C version and
  // upgrade to a SIMD implementation when the CPU supports it; later checks
  // (with stricter alignment requirements) override earlier ones.
  void (*TransposeWx8)(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    TransposeWx8 = TransposeWx8_NEON;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
    TransposeWx8 = TransposeWx8_SSSE3;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
  // Fast path needs 16-byte aligned source rows as well as width % 16 == 0.
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    TransposeWx8 = TransposeWx8_FAST_SSSE3;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
    if (IS_ALIGNED(width, 4) &&
        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
      TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
    } else {
      TransposeWx8 = TransposeWx8_MIPS_DSPR2;
    }
  }
#endif

  // Work across the source in 8x8 tiles
  while (i >= 8) {
    TransposeWx8(src, src_stride, dst, dst_stride, width);
    src += 8 * src_stride;    // Go down 8 rows.
    dst += 8;                 // Move over 8 columns.
    i -= 8;
  }

  // Handle the remaining (height % 8) rows with the generic C transposer.
  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}
    848 
    849 LIBYUV_API
    850 void RotatePlane90(const uint8* src, int src_stride,
    851                    uint8* dst, int dst_stride,
    852                    int width, int height) {
    853   // Rotate by 90 is a transpose with the source read
    854   // from bottom to top. So set the source pointer to the end
    855   // of the buffer and flip the sign of the source stride.
    856   src += src_stride * (height - 1);
    857   src_stride = -src_stride;
    858   TransposePlane(src, src_stride, dst, dst_stride, width, height);
    859 }
    860 
    861 LIBYUV_API
    862 void RotatePlane270(const uint8* src, int src_stride,
    863                     uint8* dst, int dst_stride,
    864                     int width, int height) {
    865   // Rotate by 270 is a transpose with the destination written
    866   // from bottom to top. So set the destination pointer to the end
    867   // of the buffer and flip the sign of the destination stride.
    868   dst += dst_stride * (width - 1);
    869   dst_stride = -dst_stride;
    870   TransposePlane(src, src_stride, dst, dst_stride, width, height);
    871 }
    872 
LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  // Swap first and last row and mirror the content. Uses a temporary row.
  align_buffer_64(row, width);
  const uint8* src_bot = src + src_stride * (height - 1);  // Last source row.
  uint8* dst_bot = dst + dst_stride * (height - 1);        // Last dest row.
  int half_height = (height + 1) >> 1;  // Row pairs to swap, rounding up.
  int y;
  // Row helpers, defaulting to the portable C versions; upgraded below when
  // the CPU and buffer alignment permit.  Later checks override earlier ones.
  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_MIRRORROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
    MirrorRow = MirrorRow_NEON;
  }
#endif
#if defined(HAS_MIRRORROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    MirrorRow = MirrorRow_SSE2;
  }
#endif
#if defined(HAS_MIRRORROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    MirrorRow = MirrorRow_SSSE3;
  }
#endif
#if defined(HAS_MIRRORROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
    MirrorRow = MirrorRow_AVX2;
  }
#endif
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
    MirrorRow = MirrorRow_MIPS_DSPR2;
  }
#endif
#if defined(HAS_COPYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
    CopyRow = CopyRow_NEON;
  }
#endif
#if defined(HAS_COPYROW_X86)
  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
    CopyRow = CopyRow_X86;
  }
#endif
#if defined(HAS_COPYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    CopyRow = CopyRow_SSE2;
  }
#endif
#if defined(HAS_COPYROW_ERMS)
  if (TestCpuFlag(kCpuHasERMS)) {
    CopyRow = CopyRow_ERMS;
  }
#endif
#if defined(HAS_COPYROW_MIPS)
  if (TestCpuFlag(kCpuHasMIPS)) {
    CopyRow = CopyRow_MIPS;
  }
#endif

  // Odd height will harmlessly mirror the middle row twice.
  for (y = 0; y < half_height; ++y) {
    MirrorRow(src, row, width);  // Mirror first row into a buffer
    src += src_stride;
    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
    dst += dst_stride;
    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
    src_bot -= src_stride;
    dst_bot -= dst_stride;
  }
  free_aligned_buffer_64(row);
}
    956 
    957 static void TransposeUVWx8_C(const uint8* src, int src_stride,
    958                              uint8* dst_a, int dst_stride_a,
    959                              uint8* dst_b, int dst_stride_b,
    960                              int width) {
    961   int i;
    962   for (i = 0; i < width; ++i) {
    963     dst_a[0] = src[0 * src_stride + 0];
    964     dst_b[0] = src[0 * src_stride + 1];
    965     dst_a[1] = src[1 * src_stride + 0];
    966     dst_b[1] = src[1 * src_stride + 1];
    967     dst_a[2] = src[2 * src_stride + 0];
    968     dst_b[2] = src[2 * src_stride + 1];
    969     dst_a[3] = src[3 * src_stride + 0];
    970     dst_b[3] = src[3 * src_stride + 1];
    971     dst_a[4] = src[4 * src_stride + 0];
    972     dst_b[4] = src[4 * src_stride + 1];
    973     dst_a[5] = src[5 * src_stride + 0];
    974     dst_b[5] = src[5 * src_stride + 1];
    975     dst_a[6] = src[6 * src_stride + 0];
    976     dst_b[6] = src[6 * src_stride + 1];
    977     dst_a[7] = src[7 * src_stride + 0];
    978     dst_b[7] = src[7 * src_stride + 1];
    979     src += 2;
    980     dst_a += dst_stride_a;
    981     dst_b += dst_stride_b;
    982   }
    983 }
    984 
    985 static void TransposeUVWxH_C(const uint8* src, int src_stride,
    986                              uint8* dst_a, int dst_stride_a,
    987                              uint8* dst_b, int dst_stride_b,
    988                              int width, int height) {
    989   int i;
    990   for (i = 0; i < width * 2; i += 2) {
    991     int j;
    992     for (j = 0; j < height; ++j) {
    993       dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
    994       dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    995     }
    996   }
    997 }
    998 
LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i = height;  // Rows remaining to transpose.
  // Strip transposer for interleaved-UV Wx8 tiles; defaults to the portable
  // C version.  The #elif chain means at most one SIMD candidate is even
  // compiled in per target architecture.
  void (*TransposeUVWx8)(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    TransposeUVWx8 = TransposeUVWx8_NEON;
  }
#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
  // SSE2 path uses movdqa and needs 16-byte aligned source rows.
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 8) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    TransposeUVWx8 = TransposeUVWx8_SSE2;
  }
#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
    TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
  }
#endif

  // Work through the source in 8x8 tiles.
  while (i >= 8) {
    TransposeUVWx8(src, src_stride,
                   dst_a, dst_stride_a,
                   dst_b, dst_stride_b,
                   width);
    src += 8 * src_stride;    // Go down 8 rows.
    dst_a += 8;               // Move over 8 columns.
    dst_b += 8;               // Move over 8 columns.
    i -= 8;
  }

  // Handle the remaining (height % 8) rows with the generic C transposer.
  TransposeUVWxH_C(src, src_stride,
                   dst_a, dst_stride_a,
                   dst_b, dst_stride_b,
                   width, i);
}
   1043 
   1044 LIBYUV_API
   1045 void RotateUV90(const uint8* src, int src_stride,
   1046                 uint8* dst_a, int dst_stride_a,
   1047                 uint8* dst_b, int dst_stride_b,
   1048                 int width, int height) {
   1049   src += src_stride * (height - 1);
   1050   src_stride = -src_stride;
   1051 
   1052   TransposeUV(src, src_stride,
   1053               dst_a, dst_stride_a,
   1054               dst_b, dst_stride_b,
   1055               width, height);
   1056 }
   1057 
   1058 LIBYUV_API
   1059 void RotateUV270(const uint8* src, int src_stride,
   1060                  uint8* dst_a, int dst_stride_a,
   1061                  uint8* dst_b, int dst_stride_b,
   1062                  int width, int height) {
   1063   dst_a += dst_stride_a * (width - 1);
   1064   dst_b += dst_stride_b * (width - 1);
   1065   dst_stride_a = -dst_stride_a;
   1066   dst_stride_b = -dst_stride_b;
   1067 
   1068   TransposeUV(src, src_stride,
   1069               dst_a, dst_stride_a,
   1070               dst_b, dst_stride_b,
   1071               width, height);
   1072 }
   1073 
   1074 // Rotate 180 is a horizontal and vertical flip.
LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i;
  // Row mirrorer that also splits interleaved UV into separate U and V rows;
  // defaults to the portable C version.  The #elif chain compiles in at most
  // one SIMD candidate per target architecture.
  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
      MirrorUVRow_C;
#if defined(HAS_MIRRORUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
    MirrorRowUV = MirrorUVRow_NEON;
  }
#elif defined(HAS_MIRRORROW_UV_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    MirrorRowUV = MirrorUVRow_SSSE3;
  }
#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
    MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
  }
#endif

  // Vertical flip: write destination rows bottom-to-top while reading the
  // source top-to-bottom; MirrorRowUV supplies the horizontal flip.
  dst_a += dst_stride_a * (height - 1);
  dst_b += dst_stride_b * (height - 1);

  for (i = 0; i < height; ++i) {
    MirrorRowUV(src, dst_a, dst_b, width);
    src += src_stride;
    dst_a -= dst_stride_a;
    dst_b -= dst_stride_b;
  }
}
   1109 
   1110 LIBYUV_API
   1111 int RotatePlane(const uint8* src, int src_stride,
   1112                 uint8* dst, int dst_stride,
   1113                 int width, int height,
   1114                 enum RotationMode mode) {
   1115   if (!src || width <= 0 || height == 0 || !dst) {
   1116     return -1;
   1117   }
   1118 
   1119   // Negative height means invert the image.
   1120   if (height < 0) {
   1121     height = -height;
   1122     src = src + (height - 1) * src_stride;
   1123     src_stride = -src_stride;
   1124   }
   1125 
   1126   switch (mode) {
   1127     case kRotate0:
   1128       // copy frame
   1129       CopyPlane(src, src_stride,
   1130                 dst, dst_stride,
   1131                 width, height);
   1132       return 0;
   1133     case kRotate90:
   1134       RotatePlane90(src, src_stride,
   1135                     dst, dst_stride,
   1136                     width, height);
   1137       return 0;
   1138     case kRotate270:
   1139       RotatePlane270(src, src_stride,
   1140                      dst, dst_stride,
   1141                      width, height);
   1142       return 0;
   1143     case kRotate180:
   1144       RotatePlane180(src, src_stride,
   1145                      dst, dst_stride,
   1146                      width, height);
   1147       return 0;
   1148     default:
   1149       break;
   1150   }
   1151   return -1;
   1152 }
   1153 
   1154 LIBYUV_API
   1155 int I420Rotate(const uint8* src_y, int src_stride_y,
   1156                const uint8* src_u, int src_stride_u,
   1157                const uint8* src_v, int src_stride_v,
   1158                uint8* dst_y, int dst_stride_y,
   1159                uint8* dst_u, int dst_stride_u,
   1160                uint8* dst_v, int dst_stride_v,
   1161                int width, int height,
   1162                enum RotationMode mode) {
   1163   int halfwidth = (width + 1) >> 1;
   1164   int halfheight = (height + 1) >> 1;
   1165   if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
   1166       !dst_y || !dst_u || !dst_v) {
   1167     return -1;
   1168   }
   1169 
   1170   // Negative height means invert the image.
   1171   if (height < 0) {
   1172     height = -height;
   1173     halfheight = (height + 1) >> 1;
   1174     src_y = src_y + (height - 1) * src_stride_y;
   1175     src_u = src_u + (halfheight - 1) * src_stride_u;
   1176     src_v = src_v + (halfheight - 1) * src_stride_v;
   1177     src_stride_y = -src_stride_y;
   1178     src_stride_u = -src_stride_u;
   1179     src_stride_v = -src_stride_v;
   1180   }
   1181 
   1182   switch (mode) {
   1183     case kRotate0:
   1184       // copy frame
   1185       return I420Copy(src_y, src_stride_y,
   1186                       src_u, src_stride_u,
   1187                       src_v, src_stride_v,
   1188                       dst_y, dst_stride_y,
   1189                       dst_u, dst_stride_u,
   1190                       dst_v, dst_stride_v,
   1191                       width, height);
   1192     case kRotate90:
   1193       RotatePlane90(src_y, src_stride_y,
   1194                     dst_y, dst_stride_y,
   1195                     width, height);
   1196       RotatePlane90(src_u, src_stride_u,
   1197                     dst_u, dst_stride_u,
   1198                     halfwidth, halfheight);
   1199       RotatePlane90(src_v, src_stride_v,
   1200                     dst_v, dst_stride_v,
   1201                     halfwidth, halfheight);
   1202       return 0;
   1203     case kRotate270:
   1204       RotatePlane270(src_y, src_stride_y,
   1205                      dst_y, dst_stride_y,
   1206                      width, height);
   1207       RotatePlane270(src_u, src_stride_u,
   1208                      dst_u, dst_stride_u,
   1209                      halfwidth, halfheight);
   1210       RotatePlane270(src_v, src_stride_v,
   1211                      dst_v, dst_stride_v,
   1212                      halfwidth, halfheight);
   1213       return 0;
   1214     case kRotate180:
   1215       RotatePlane180(src_y, src_stride_y,
   1216                      dst_y, dst_stride_y,
   1217                      width, height);
   1218       RotatePlane180(src_u, src_stride_u,
   1219                      dst_u, dst_stride_u,
   1220                      halfwidth, halfheight);
   1221       RotatePlane180(src_v, src_stride_v,
   1222                      dst_v, dst_stride_v,
   1223                      halfwidth, halfheight);
   1224       return 0;
   1225     default:
   1226       break;
   1227   }
   1228   return -1;
   1229 }
   1230 
   1231 LIBYUV_API
   1232 int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
   1233                      const uint8* src_uv, int src_stride_uv,
   1234                      uint8* dst_y, int dst_stride_y,
   1235                      uint8* dst_u, int dst_stride_u,
   1236                      uint8* dst_v, int dst_stride_v,
   1237                      int width, int height,
   1238                      enum RotationMode mode) {
   1239   int halfwidth = (width + 1) >> 1;
   1240   int halfheight = (height + 1) >> 1;
   1241   if (!src_y || !src_uv || width <= 0 || height == 0 ||
   1242       !dst_y || !dst_u || !dst_v) {
   1243     return -1;
   1244   }
   1245 
   1246   // Negative height means invert the image.
   1247   if (height < 0) {
   1248     height = -height;
   1249     halfheight = (height + 1) >> 1;
   1250     src_y = src_y + (height - 1) * src_stride_y;
   1251     src_uv = src_uv + (halfheight - 1) * src_stride_uv;
   1252     src_stride_y = -src_stride_y;
   1253     src_stride_uv = -src_stride_uv;
   1254   }
   1255 
   1256   switch (mode) {
   1257     case kRotate0:
   1258       // copy frame
   1259       return NV12ToI420(src_y, src_stride_y,
   1260                         src_uv, src_stride_uv,
   1261                         dst_y, dst_stride_y,
   1262                         dst_u, dst_stride_u,
   1263                         dst_v, dst_stride_v,
   1264                         width, height);
   1265     case kRotate90:
   1266       RotatePlane90(src_y, src_stride_y,
   1267                     dst_y, dst_stride_y,
   1268                     width, height);
   1269       RotateUV90(src_uv, src_stride_uv,
   1270                  dst_u, dst_stride_u,
   1271                  dst_v, dst_stride_v,
   1272                  halfwidth, halfheight);
   1273       return 0;
   1274     case kRotate270:
   1275       RotatePlane270(src_y, src_stride_y,
   1276                      dst_y, dst_stride_y,
   1277                      width, height);
   1278       RotateUV270(src_uv, src_stride_uv,
   1279                   dst_u, dst_stride_u,
   1280                   dst_v, dst_stride_v,
   1281                   halfwidth, halfheight);
   1282       return 0;
   1283     case kRotate180:
   1284       RotatePlane180(src_y, src_stride_y,
   1285                      dst_y, dst_stride_y,
   1286                      width, height);
   1287       RotateUV180(src_uv, src_stride_uv,
   1288                   dst_u, dst_stride_u,
   1289                   dst_v, dst_stride_v,
   1290                   halfwidth, halfheight);
   1291       return 0;
   1292     default:
   1293       break;
   1294   }
   1295   return -1;
   1296 }
   1297 
   1298 #ifdef __cplusplus
   1299 }  // extern "C"
   1300 }  // namespace libyuv
   1301 #endif
   1302