;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal's numeric arguments are: number of parameters, number of GPRs used,
; and number of vector (mm/xmm/ymm) registers used.
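; For example, "cglobal YUY2ToYRow, 3, 3, 3, src_yuy2, dst_y, pix" declares
; 3 parameters, 3 GPRs and 3 vector registers, and (per x86inc.asm's naming
; convention) exposes the parameters as src_yuy2q, dst_yq and pixd, where the
; q/d suffix selects pointer-width and dword access respectively.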

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)

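; YUY2 stores pixels as [Y0 U0 Y1 V0 ...] (Y in the even bytes); UYVY stores
; them as [U0 Y0 V0 Y1 ...] (Y in the odd bytes). A rough C sketch of the
; behavior (illustrative only, not the shipped reference implementation):
;   for (int x = 0; x < pix; ++x)
;     dst_y[x] = src_yuy2[x * 2];   // UYVY would read src[x * 2 + 1]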
; %1 = YUY2 or UYVY, %2 = a (aligned) or u (unaligned) moves, %3 = name suffix
%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2   ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8    ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8  ; packuswb packs within 128-bit lanes; fix order
%endif
    sub        pixd, mmsize  ; sets the flags tested by jg (mov/lea do not)
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
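; Each INIT_* below re-targets the macros: INIT_MMX/INIT_XMM/INIT_YMM set
; mmsize to 8/16/32 bytes and bind mova/movu to the matching aligned and
; unaligned move instructions, so one macro body yields the MMX, SSE2 and
; AVX2 variants. The a/u macro argument selects mova or movu.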
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)

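; Deinterleaves a packed UV plane into separate U and V planes. A rough C
; sketch of the behavior (illustrative only):
;   for (int x = 0; x < pix; ++x) {
;     dst_u[x] = src_uv[x * 2];
;     dst_v[x] = src_uv[x * 2 + 1];
;   }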
; %1 = a (aligned) or u (unaligned) moves, %2 = name suffix
%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq    ; keep dst_v as an offset from dst_u

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8         ; odd bytes
    psrlw      m3, m1, 8
    pand       m0, m0, m4        ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8      ; packuswb packs within 128-bit lanes; fix order
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);

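; Interleaves separate U and V planes into a packed UV plane. A rough C
; sketch of the behavior (illustrative only):
;   for (int x = 0; x < width; ++x) {
;     dst_uv[x * 2] = src_u[x];
;     dst_uv[x * 2 + 1] = src_v[x];
;   }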
; %1 = a (aligned) or u (unaligned) moves, %2 = name suffix
%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq    ; keep src_v as an offset from src_u

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_uq + src_vq]  ; src_vq holds src_v - src_u
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1        ; first 8 UV pairs
    punpckhbw  m0, m0, m1        ; next 8 UV pairs
%if cpuflag(AVX2)
    ; punpck* interleaves within 128-bit lanes; reassemble in linear order
    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,
