// Home | History | Annotate | Download | only in source  (code-viewer navigation header, not part of the original file)
      1 /*
      2  *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS. All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "libyuv/rotate_row.h"
     12 #include "libyuv/row.h"
     13 
     14 #ifdef __cplusplus
     15 namespace libyuv {
     16 extern "C" {
     17 #endif
     18 
     19 // This module is for 32 bit Visual C x86 and clangcl
     20 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
     21 
     22 __declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
     23                                           int src_stride,
     24                                           uint8* dst,
     25                                           int dst_stride,
     26                                           int width) {
     27   __asm {
     28     push      edi
     29     push      esi
     30     push      ebp
     31     mov       eax, [esp + 12 + 4]  // src
     32     mov       edi, [esp + 12 + 8]  // src_stride
     33     mov       edx, [esp + 12 + 12]  // dst
     34     mov       esi, [esp + 12 + 16]  // dst_stride
     35     mov       ecx, [esp + 12 + 20]  // width
     36 
     37     // Read in the data from the source pointer.
     38     // First round of bit swap.
     39     align      4
     40  convertloop:
     41     movq      xmm0, qword ptr [eax]
     42     lea       ebp, [eax + 8]
     43     movq      xmm1, qword ptr [eax + edi]
     44     lea       eax, [eax + 2 * edi]
     45     punpcklbw xmm0, xmm1
     46     movq      xmm2, qword ptr [eax]
     47     movdqa    xmm1, xmm0
     48     palignr   xmm1, xmm1, 8
     49     movq      xmm3, qword ptr [eax + edi]
     50     lea       eax, [eax + 2 * edi]
     51     punpcklbw xmm2, xmm3
     52     movdqa    xmm3, xmm2
     53     movq      xmm4, qword ptr [eax]
     54     palignr   xmm3, xmm3, 8
     55     movq      xmm5, qword ptr [eax + edi]
     56     punpcklbw xmm4, xmm5
     57     lea       eax, [eax + 2 * edi]
     58     movdqa    xmm5, xmm4
     59     movq      xmm6, qword ptr [eax]
     60     palignr   xmm5, xmm5, 8
     61     movq      xmm7, qword ptr [eax + edi]
     62     punpcklbw xmm6, xmm7
     63     mov       eax, ebp
     64     movdqa    xmm7, xmm6
     65     palignr   xmm7, xmm7, 8
     66     // Second round of bit swap.
     67     punpcklwd xmm0, xmm2
     68     punpcklwd xmm1, xmm3
     69     movdqa    xmm2, xmm0
     70     movdqa    xmm3, xmm1
     71     palignr   xmm2, xmm2, 8
     72     palignr   xmm3, xmm3, 8
     73     punpcklwd xmm4, xmm6
     74     punpcklwd xmm5, xmm7
     75     movdqa    xmm6, xmm4
     76     movdqa    xmm7, xmm5
     77     palignr   xmm6, xmm6, 8
     78     palignr   xmm7, xmm7, 8
     79     // Third round of bit swap.
     80     // Write to the destination pointer.
     81     punpckldq xmm0, xmm4
     82     movq      qword ptr [edx], xmm0
     83     movdqa    xmm4, xmm0
     84     palignr   xmm4, xmm4, 8
     85     movq      qword ptr [edx + esi], xmm4
     86     lea       edx, [edx + 2 * esi]
     87     punpckldq xmm2, xmm6
     88     movdqa    xmm6, xmm2
     89     palignr   xmm6, xmm6, 8
     90     movq      qword ptr [edx], xmm2
     91     punpckldq xmm1, xmm5
     92     movq      qword ptr [edx + esi], xmm6
     93     lea       edx, [edx + 2 * esi]
     94     movdqa    xmm5, xmm1
     95     movq      qword ptr [edx], xmm1
     96     palignr   xmm5, xmm5, 8
     97     punpckldq xmm3, xmm7
     98     movq      qword ptr [edx + esi], xmm5
     99     lea       edx, [edx + 2 * esi]
    100     movq      qword ptr [edx], xmm3
    101     movdqa    xmm7, xmm3
    102     palignr   xmm7, xmm7, 8
    103     sub       ecx, 8
    104     movq      qword ptr [edx + esi], xmm7
    105     lea       edx, [edx + 2 * esi]
    106     jg        convertloop
    107 
    108     pop       ebp
    109     pop       esi
    110     pop       edi
    111     ret
    112   }
    113 }
    114 
    115 __declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
    116                                            int src_stride,
    117                                            uint8* dst_a,
    118                                            int dst_stride_a,
    119                                            uint8* dst_b,
    120                                            int dst_stride_b,
    121                                            int w) {
    122   __asm {
    123     push      ebx
    124     push      esi
    125     push      edi
    126     push      ebp
    127     mov       eax, [esp + 16 + 4]  // src
    128     mov       edi, [esp + 16 + 8]  // src_stride
    129     mov       edx, [esp + 16 + 12]  // dst_a
    130     mov       esi, [esp + 16 + 16]  // dst_stride_a
    131     mov       ebx, [esp + 16 + 20]  // dst_b
    132     mov       ebp, [esp + 16 + 24]  // dst_stride_b
    133     mov       ecx, esp
    134     sub       esp, 4 + 16
    135     and       esp, ~15
    136     mov       [esp + 16], ecx
    137     mov       ecx, [ecx + 16 + 28]  // w
    138 
    139     align      4
    140     // Read in the data from the source pointer.
    141     // First round of bit swap.
    142   convertloop:
    143     movdqu    xmm0, [eax]
    144     movdqu    xmm1, [eax + edi]
    145     lea       eax, [eax + 2 * edi]
    146     movdqa    xmm7, xmm0  // use xmm7 as temp register.
    147     punpcklbw xmm0, xmm1
    148     punpckhbw xmm7, xmm1
    149     movdqa    xmm1, xmm7
    150     movdqu    xmm2, [eax]
    151     movdqu    xmm3, [eax + edi]
    152     lea       eax, [eax + 2 * edi]
    153     movdqa    xmm7, xmm2
    154     punpcklbw xmm2, xmm3
    155     punpckhbw xmm7, xmm3
    156     movdqa    xmm3, xmm7
    157     movdqu    xmm4, [eax]
    158     movdqu    xmm5, [eax + edi]
    159     lea       eax, [eax + 2 * edi]
    160     movdqa    xmm7, xmm4
    161     punpcklbw xmm4, xmm5
    162     punpckhbw xmm7, xmm5
    163     movdqa    xmm5, xmm7
    164     movdqu    xmm6, [eax]
    165     movdqu    xmm7, [eax + edi]
    166     lea       eax, [eax + 2 * edi]
    167     movdqu    [esp], xmm5  // backup xmm5
    168     neg       edi
    169     movdqa    xmm5, xmm6  // use xmm5 as temp register.
    170     punpcklbw xmm6, xmm7
    171     punpckhbw xmm5, xmm7
    172     movdqa    xmm7, xmm5
    173     lea       eax, [eax + 8 * edi + 16]
    174     neg       edi
    175     // Second round of bit swap.
    176     movdqa    xmm5, xmm0
    177     punpcklwd xmm0, xmm2
    178     punpckhwd xmm5, xmm2
    179     movdqa    xmm2, xmm5
    180     movdqa    xmm5, xmm1
    181     punpcklwd xmm1, xmm3
    182     punpckhwd xmm5, xmm3
    183     movdqa    xmm3, xmm5
    184     movdqa    xmm5, xmm4
    185     punpcklwd xmm4, xmm6
    186     punpckhwd xmm5, xmm6
    187     movdqa    xmm6, xmm5
    188     movdqu    xmm5, [esp]  // restore xmm5
    189     movdqu    [esp], xmm6  // backup xmm6
    190     movdqa    xmm6, xmm5  // use xmm6 as temp register.
    191     punpcklwd xmm5, xmm7
    192     punpckhwd xmm6, xmm7
    193     movdqa    xmm7, xmm6
    194 
    195     // Third round of bit swap.
    196     // Write to the destination pointer.
    197     movdqa    xmm6, xmm0
    198     punpckldq xmm0, xmm4
    199     punpckhdq xmm6, xmm4
    200     movdqa    xmm4, xmm6
    201     movdqu    xmm6, [esp]  // restore xmm6
    202     movlpd    qword ptr [edx], xmm0
    203     movhpd    qword ptr [ebx], xmm0
    204     movlpd    qword ptr [edx + esi], xmm4
    205     lea       edx, [edx + 2 * esi]
    206     movhpd    qword ptr [ebx + ebp], xmm4
    207     lea       ebx, [ebx + 2 * ebp]
    208     movdqa    xmm0, xmm2  // use xmm0 as the temp register.
    209     punpckldq xmm2, xmm6
    210     movlpd    qword ptr [edx], xmm2
    211     movhpd    qword ptr [ebx], xmm2
    212     punpckhdq xmm0, xmm6
    213     movlpd    qword ptr [edx + esi], xmm0
    214     lea       edx, [edx + 2 * esi]
    215     movhpd    qword ptr [ebx + ebp], xmm0
    216     lea       ebx, [ebx + 2 * ebp]
    217     movdqa    xmm0, xmm1  // use xmm0 as the temp register.
    218     punpckldq xmm1, xmm5
    219     movlpd    qword ptr [edx], xmm1
    220     movhpd    qword ptr [ebx], xmm1
    221     punpckhdq xmm0, xmm5
    222     movlpd    qword ptr [edx + esi], xmm0
    223     lea       edx, [edx + 2 * esi]
    224     movhpd    qword ptr [ebx + ebp], xmm0
    225     lea       ebx, [ebx + 2 * ebp]
    226     movdqa    xmm0, xmm3  // use xmm0 as the temp register.
    227     punpckldq xmm3, xmm7
    228     movlpd    qword ptr [edx], xmm3
    229     movhpd    qword ptr [ebx], xmm3
    230     punpckhdq xmm0, xmm7
    231     sub       ecx, 8
    232     movlpd    qword ptr [edx + esi], xmm0
    233     lea       edx, [edx + 2 * esi]
    234     movhpd    qword ptr [ebx + ebp], xmm0
    235     lea       ebx, [ebx + 2 * ebp]
    236     jg        convertloop
    237 
    238     mov       esp, [esp + 16]
    239     pop       ebp
    240     pop       edi
    241     pop       esi
    242     pop       ebx
    243     ret
    244   }
    245 }
    246 
    247 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
    248 
    249 #ifdef __cplusplus
    250 }  // extern "C"
    251 }  // namespace libyuv
    252 #endif
    253