/*
 *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
void TransposeWx8_SSSE3(const uint8* src,
                        int src_stride,
                        uint8* dst,
                        int dst_stride,
                        int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1:                                          \n"
      "movq       (%0),%%xmm0                      \n"
      "movq       (%0,%3),%%xmm1                   \n"
      "lea        (%0,%3,2),%0                     \n"
      "punpcklbw  %%xmm1,%%xmm0                    \n"
      "movq       (%0),%%xmm2                      \n"
      "movdqa     %%xmm0,%%xmm1                    \n"
      "palignr    $0x8,%%xmm1,%%xmm1               \n"
      "movq       (%0,%3),%%xmm3                   \n"
      "lea        (%0,%3,2),%0                     \n"
      "punpcklbw  %%xmm3,%%xmm2                    \n"
      "movdqa     %%xmm2,%%xmm3                    \n"
      "movq       (%0),%%xmm4                      \n"
      "palignr    $0x8,%%xmm3,%%xmm3               \n"
      "movq       (%0,%3),%%xmm5                   \n"
      "lea        (%0,%3,2),%0                     \n"
      "punpcklbw  %%xmm5,%%xmm4                    \n"
      "movdqa     %%xmm4,%%xmm5                    \n"
      "movq       (%0),%%xmm6                      \n"
      "palignr    $0x8,%%xmm5,%%xmm5               \n"
      "movq       (%0,%3),%%xmm7                   \n"
      "lea        (%0,%3,2),%0                     \n"
      "punpcklbw  %%xmm7,%%xmm6                    \n"
      "neg        %3                               \n"
      "movdqa     %%xmm6,%%xmm7                    \n"
      "lea        0x8(%0,%3,8),%0                  \n"
      "palignr    $0x8,%%xmm7,%%xmm7               \n"
      "neg        %3                               \n"
      // Second round of bit swap.
      "punpcklwd  %%xmm2,%%xmm0                    \n"
      "punpcklwd  %%xmm3,%%xmm1                    \n"
      "movdqa     %%xmm0,%%xmm2                    \n"
      "movdqa     %%xmm1,%%xmm3                    \n"
      "palignr    $0x8,%%xmm2,%%xmm2               \n"
      "palignr    $0x8,%%xmm3,%%xmm3               \n"
      "punpcklwd  %%xmm6,%%xmm4                    \n"
      "punpcklwd  %%xmm7,%%xmm5                    \n"
      "movdqa     %%xmm4,%%xmm6                    \n"
      "movdqa     %%xmm5,%%xmm7                    \n"
      "palignr    $0x8,%%xmm6,%%xmm6               \n"
      "palignr    $0x8,%%xmm7,%%xmm7               \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "punpckldq  %%xmm4,%%xmm0                    \n"
      "movq       %%xmm0,(%1)                      \n"
      "movdqa     %%xmm0,%%xmm4                    \n"
      "palignr    $0x8,%%xmm4,%%xmm4               \n"
      "movq       %%xmm4,(%1,%4)                   \n"
      "lea        (%1,%4,2),%1                     \n"
      "punpckldq  %%xmm6,%%xmm2                    \n"
      "movdqa     %%xmm2,%%xmm6                    \n"
      "movq       %%xmm2,(%1)                      \n"
      "palignr    $0x8,%%xmm6,%%xmm6               \n"
      "punpckldq  %%xmm5,%%xmm1                    \n"
      "movq       %%xmm6,(%1,%4)                   \n"
      "lea        (%1,%4,2),%1                     \n"
      "movdqa     %%xmm1,%%xmm5                    \n"
      "movq       %%xmm1,(%1)                      \n"
      "palignr    $0x8,%%xmm5,%%xmm5               \n"
      "movq       %%xmm5,(%1,%4)                   \n"
      "lea        (%1,%4,2),%1                     \n"
      "punpckldq  %%xmm7,%%xmm3                    \n"
      "movq       %%xmm3,(%1)                      \n"
      "movdqa     %%xmm3,%%xmm7                    \n"
      "palignr    $0x8,%%xmm7,%%xmm7               \n"
      "sub        $0x8,%2                          \n"
      "movq       %%xmm7,(%1,%4)                   \n"
      "lea        (%1,%4,2),%1                     \n"
      "jg         1b                               \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
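
// For reference: the kernel above transposes one 8x8 byte tile per loop
// iteration. The "rounds of bit swap" in the comments are byte, word and
// dword interleaves (punpcklbw / punpcklwd / punpckldq). A minimal scalar
// sketch of the same mapping, illustrative only (TransposeWx8_C_Sketch is a
// hypothetical name, not the library's portable path):
//
//   static void TransposeWx8_C_Sketch(const uint8* src, int src_stride,
//                                     uint8* dst, int dst_stride, int width) {
//     int i, j;
//     for (i = 0; i < width; ++i) {   // each source column becomes a dst row
//       for (j = 0; j < 8; ++j) {     // 8 source rows per tile
//         dst[i * dst_stride + j] = src[j * src_stride + i];
//       }
//     }
//   }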

// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
void TransposeWx8_Fast_SSSE3(const uint8* src,
                             int src_stride,
                             uint8* dst,
                             int dst_stride,
                             int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1:                                          \n"
      "movdqu     (%0),%%xmm0                      \n"
      "movdqu     (%0,%3),%%xmm1                   \n"
      "lea        (%0,%3,2),%0                     \n"
      "movdqa     %%xmm0,%%xmm8                    \n"
      "punpcklbw  %%xmm1,%%xmm0                    \n"
      "punpckhbw  %%xmm1,%%xmm8                    \n"
      "movdqu     (%0),%%xmm2                      \n"
      "movdqa     %%xmm0,%%xmm1                    \n"
      "movdqa     %%xmm8,%%xmm9                    \n"
      "palignr    $0x8,%%xmm1,%%xmm1               \n"
      "palignr    $0x8,%%xmm9,%%xmm9               \n"
      "movdqu     (%0,%3),%%xmm3                   \n"
      "lea        (%0,%3,2),%0                     \n"
      "movdqa     %%xmm2,%%xmm10                   \n"
      "punpcklbw  %%xmm3,%%xmm2                    \n"
      "punpckhbw  %%xmm3,%%xmm10                   \n"
      "movdqa     %%xmm2,%%xmm3                    \n"
      "movdqa     %%xmm10,%%xmm11                  \n"
      "movdqu     (%0),%%xmm4                      \n"
      "palignr    $0x8,%%xmm3,%%xmm3               \n"
      "palignr    $0x8,%%xmm11,%%xmm11             \n"
      "movdqu     (%0,%3),%%xmm5                   \n"
      "lea        (%0,%3,2),%0                     \n"
      "movdqa     %%xmm4,%%xmm12                   \n"
      "punpcklbw  %%xmm5,%%xmm4                    \n"
      "punpckhbw  %%xmm5,%%xmm12                   \n"
      "movdqa     %%xmm4,%%xmm5                    \n"
      "movdqa     %%xmm12,%%xmm13                  \n"
      "movdqu     (%0),%%xmm6                      \n"
      "palignr    $0x8,%%xmm5,%%xmm5               \n"
      "palignr    $0x8,%%xmm13,%%xmm13             \n"
      "movdqu     (%0,%3),%%xmm7                   \n"
      "lea        (%0,%3,2),%0                     \n"
      "movdqa     %%xmm6,%%xmm14                   \n"
      "punpcklbw  %%xmm7,%%xmm6                    \n"
      "punpckhbw  %%xmm7,%%xmm14                   \n"
      "neg        %3                               \n"
      "movdqa     %%xmm6,%%xmm7                    \n"
      "movdqa     %%xmm14,%%xmm15                  \n"
      "lea        0x10(%0,%3,8),%0                 \n"
      "palignr    $0x8,%%xmm7,%%xmm7               \n"
      "palignr    $0x8,%%xmm15,%%xmm15             \n"
      "neg        %3                               \n"
      // Second round of bit swap.
      "punpcklwd  %%xmm2,%%xmm0                    \n"
      "punpcklwd  %%xmm3,%%xmm1                    \n"
      "movdqa     %%xmm0,%%xmm2                    \n"
      "movdqa     %%xmm1,%%xmm3                    \n"
      "palignr    $0x8,%%xmm2,%%xmm2               \n"
      "palignr    $0x8,%%xmm3,%%xmm3               \n"
      "punpcklwd  %%xmm6,%%xmm4                    \n"
      "punpcklwd  %%xmm7,%%xmm5                    \n"
      "movdqa     %%xmm4,%%xmm6                    \n"
      "movdqa     %%xmm5,%%xmm7                    \n"
      "palignr    $0x8,%%xmm6,%%xmm6               \n"
      "palignr    $0x8,%%xmm7,%%xmm7               \n"
      "punpcklwd  %%xmm10,%%xmm8                   \n"
      "punpcklwd  %%xmm11,%%xmm9                   \n"
      "movdqa     %%xmm8,%%xmm10                   \n"
      "movdqa     %%xmm9,%%xmm11                   \n"
      "palignr    $0x8,%%xmm10,%%xmm10             \n"
      "palignr    $0x8,%%xmm11,%%xmm11             \n"
      "punpcklwd  %%xmm14,%%xmm12                  \n"
      "punpcklwd  %%xmm15,%%xmm13                  \n"
      "movdqa     %%xmm12,%%xmm14                  \n"
      "movdqa     %%xmm13,%%xmm15                  \n"
      "palignr    $0x8,%%xmm14,%%xmm14             \n"
      "palignr    $0x8,%%xmm15,%%xmm15             \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "punpckldq  %%xmm4,%%xmm0                    \n"
      "movq       %%xmm0,(%1)                      \n"
      "movdqa     %%xmm0,%%xmm4                    \n"
      "palignr    $0x8,%%xmm4,%%xmm4               \n"
      "movq       %%xmm4,(%1,%4)                   \n"
      "lea        (%1,%4,2),%1                     \n"
      "punpckldq  %%xmm6,%%xmm2                    \n"
      "movdqa     %%xmm2,%%xmm6                    \n"
      "movq       %%xmm2,(%1)                      \n"
      "palignr    $0x8,%%xmm6,%%xmm6               \n"
      "punpckldq  %%xmm5,%%xmm1                    \n"
      "movq       %%xmm6,(%1,%4)                   \n"
      "lea        (%1,%4,2),%1                     \n"
      "movdqa     %%xmm1,%%xmm5                    \n"
      "movq       %%xmm1,(%1)                      \n"
      "palignr    $0x8,%%xmm5,%%xmm5               \n"
      "movq       %%xmm5,(%1,%4)                   \n"
      "lea        (%1,%4,2),%1                     \n"
      "punpckldq  %%xmm7,%%xmm3                    \n"
      "movq       %%xmm3,(%1)                      \n"
      "movdqa     %%xmm3,%%xmm7                    \n"
      "palignr    $0x8,%%xmm7,%%xmm7               \n"
      "movq       %%xmm7,(%1,%4)                   \n"
      "lea        (%1,%4,2),%1                     \n"
      "punpckldq  %%xmm12,%%xmm8                   \n"
      "movq       %%xmm8,(%1)                      \n"
      "movdqa     %%xmm8,%%xmm12                   \n"
      "palignr    $0x8,%%xmm12,%%xmm12             \n"
      "movq       %%xmm12,(%1,%4)                  \n"
      "lea        (%1,%4,2),%1                     \n"
      "punpckldq  %%xmm14,%%xmm10                  \n"
      "movdqa     %%xmm10,%%xmm14                  \n"
      "movq       %%xmm10,(%1)                     \n"
      "palignr    $0x8,%%xmm14,%%xmm14             \n"
      "punpckldq  %%xmm13,%%xmm9                   \n"
      "movq       %%xmm14,(%1,%4)                  \n"
      "lea        (%1,%4,2),%1                     \n"
      "movdqa     %%xmm9,%%xmm13                   \n"
      "movq       %%xmm9,(%1)                      \n"
      "palignr    $0x8,%%xmm13,%%xmm13             \n"
      "movq       %%xmm13,(%1,%4)                  \n"
      "lea        (%1,%4,2),%1                     \n"
      "punpckldq  %%xmm15,%%xmm11                  \n"
      "movq       %%xmm11,(%1)                     \n"
      "movdqa     %%xmm11,%%xmm15                  \n"
      "palignr    $0x8,%%xmm15,%%xmm15             \n"
      "sub        $0x10,%2                         \n"
      "movq       %%xmm15,(%1,%4)                  \n"
      "lea        (%1,%4,2),%1                     \n"
      "jg         1b                               \n"
      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
      : "r"((intptr_t)(src_stride)),  // %3
        "r"((intptr_t)(dst_stride))   // %4
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
        "xmm15");
}
#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
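
// The _Fast_ variant consumes 16 source columns per loop iteration
// ("sub $0x10,%2") by using xmm8-xmm15 as a second register bank, which is
// why it is only built for 64-bit x86. Per iteration its effect is roughly
// that of transposing two adjacent 8x8 tiles, in terms of the illustrative
// sketch above (hypothetical helper, not library code):
//
//   TransposeWx8_C_Sketch(src, src_stride, dst, dst_stride, 8);
//   TransposeWx8_C_Sketch(src + 8, src_stride, dst + 8 * dst_stride,
//                         dst_stride, 8);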

// Transpose UV 8x8.  64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
void TransposeUVWx8_SSE2(const uint8* src,
                         int src_stride,
                         uint8* dst_a,
                         int dst_stride_a,
                         uint8* dst_b,
                         int dst_stride_b,
                         int width) {
  asm volatile(
      // Read in the data from the source pointer.
      // First round of bit swap.
      LABELALIGN
      "1:                                          \n"
      "movdqu     (%0),%%xmm0                      \n"
      "movdqu     (%0,%4),%%xmm1                   \n"
      "lea        (%0,%4,2),%0                     \n"
      "movdqa     %%xmm0,%%xmm8                    \n"
      "punpcklbw  %%xmm1,%%xmm0                    \n"
      "punpckhbw  %%xmm1,%%xmm8                    \n"
      "movdqa     %%xmm8,%%xmm1                    \n"
      "movdqu     (%0),%%xmm2                      \n"
      "movdqu     (%0,%4),%%xmm3                   \n"
      "lea        (%0,%4,2),%0                     \n"
      "movdqa     %%xmm2,%%xmm8                    \n"
      "punpcklbw  %%xmm3,%%xmm2                    \n"
      "punpckhbw  %%xmm3,%%xmm8                    \n"
      "movdqa     %%xmm8,%%xmm3                    \n"
      "movdqu     (%0),%%xmm4                      \n"
      "movdqu     (%0,%4),%%xmm5                   \n"
      "lea        (%0,%4,2),%0                     \n"
      "movdqa     %%xmm4,%%xmm8                    \n"
      "punpcklbw  %%xmm5,%%xmm4                    \n"
      "punpckhbw  %%xmm5,%%xmm8                    \n"
      "movdqa     %%xmm8,%%xmm5                    \n"
      "movdqu     (%0),%%xmm6                      \n"
      "movdqu     (%0,%4),%%xmm7                   \n"
      "lea        (%0,%4,2),%0                     \n"
      "movdqa     %%xmm6,%%xmm8                    \n"
      "punpcklbw  %%xmm7,%%xmm6                    \n"
      "neg        %4                               \n"
      "lea        0x10(%0,%4,8),%0                 \n"
      "punpckhbw  %%xmm7,%%xmm8                    \n"
      "movdqa     %%xmm8,%%xmm7                    \n"
      "neg        %4                               \n"
      // Second round of bit swap.
      "movdqa     %%xmm0,%%xmm8                    \n"
      "movdqa     %%xmm1,%%xmm9                    \n"
      "punpckhwd  %%xmm2,%%xmm8                    \n"
      "punpckhwd  %%xmm3,%%xmm9                    \n"
      "punpcklwd  %%xmm2,%%xmm0                    \n"
      "punpcklwd  %%xmm3,%%xmm1                    \n"
      "movdqa     %%xmm8,%%xmm2                    \n"
      "movdqa     %%xmm9,%%xmm3                    \n"
      "movdqa     %%xmm4,%%xmm8                    \n"
      "movdqa     %%xmm5,%%xmm9                    \n"
      "punpckhwd  %%xmm6,%%xmm8                    \n"
      "punpckhwd  %%xmm7,%%xmm9                    \n"
      "punpcklwd  %%xmm6,%%xmm4                    \n"
      "punpcklwd  %%xmm7,%%xmm5                    \n"
      "movdqa     %%xmm8,%%xmm6                    \n"
      "movdqa     %%xmm9,%%xmm7                    \n"
      // Third round of bit swap.
      // Write to the destination pointer.
      "movdqa     %%xmm0,%%xmm8                    \n"
      "punpckldq  %%xmm4,%%xmm0                    \n"
      "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
      "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
      "punpckhdq  %%xmm4,%%xmm8                    \n"
      "movlpd     %%xmm8,(%1,%5)                   \n"
      "lea        (%1,%5,2),%1                     \n"
      "movhpd     %%xmm8,(%2,%6)                   \n"
      "lea        (%2,%6,2),%2                     \n"
      "movdqa     %%xmm2,%%xmm8                    \n"
      "punpckldq  %%xmm6,%%xmm2                    \n"
      "movlpd     %%xmm2,(%1)                      \n"
      "movhpd     %%xmm2,(%2)                      \n"
      "punpckhdq  %%xmm6,%%xmm8                    \n"
      "movlpd     %%xmm8,(%1,%5)                   \n"
      "lea        (%1,%5,2),%1                     \n"
      "movhpd     %%xmm8,(%2,%6)                   \n"
      "lea        (%2,%6,2),%2                     \n"
      "movdqa     %%xmm1,%%xmm8                    \n"
      "punpckldq  %%xmm5,%%xmm1                    \n"
      "movlpd     %%xmm1,(%1)                      \n"
      "movhpd     %%xmm1,(%2)                      \n"
      "punpckhdq  %%xmm5,%%xmm8                    \n"
      "movlpd     %%xmm8,(%1,%5)                   \n"
      "lea        (%1,%5,2),%1                     \n"
      "movhpd     %%xmm8,(%2,%6)                   \n"
      "lea        (%2,%6,2),%2                     \n"
      "movdqa     %%xmm3,%%xmm8                    \n"
      "punpckldq  %%xmm7,%%xmm3                    \n"
      "movlpd     %%xmm3,(%1)                      \n"
      "movhpd     %%xmm3,(%2)                      \n"
      "punpckhdq  %%xmm7,%%xmm8                    \n"
      "sub        $0x8,%3                          \n"
      "movlpd     %%xmm8,(%1,%5)                   \n"
      "lea        (%1,%5,2),%1                     \n"
      "movhpd     %%xmm8,(%2,%6)                   \n"
      "lea        (%2,%6,2),%2                     \n"
      "jg         1b                               \n"
      : "+r"(src),                      // %0
        "+r"(dst_a),                    // %1
        "+r"(dst_b),                    // %2
        "+r"(width)                     // %3
      : "r"((intptr_t)(src_stride)),    // %4
        "r"((intptr_t)(dst_stride_a)),  // %5
        "r"((intptr_t)(dst_stride_b))   // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7", "xmm8", "xmm9");
}
#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
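
// The UV kernel above operates on rows of interleaved U/V byte pairs: it
// transposes 8 rows at a time while splitting the two channels into separate
// destination planes (movlpd writes the U half, movhpd the V half). A minimal
// scalar sketch of the same mapping, illustrative only (hypothetical name,
// not the library's portable path):
//
//   static void TransposeUVWx8_C_Sketch(const uint8* src, int src_stride,
//                                       uint8* dst_a, int dst_stride_a,
//                                       uint8* dst_b, int dst_stride_b,
//                                       int width) {
//     int i, j;
//     for (i = 0; i < width; ++i) {   // i-th U/V pair (source column)
//       for (j = 0; j < 8; ++j) {     // 8 source rows per tile
//         dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2 + 0];  // U
//         dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];  // V
//       }
//     }
//   }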
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif