;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal numeric constants are parameters, gpr regs, mm regs

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
; Extracts the Y channel from packed YUY2 (Y in even bytes) or UYVY (Y in odd bytes).

%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2        ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2        ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8         ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
%endif
    sub        pixd, mmsize
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
; De-interleaves a packed UV row into separate U and V rows.

%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4        ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq    ; address dst_v relative to dst_u

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8         ; odd bytes
    psrlw      m3, m1, 8
    pand       m0, m0, m4        ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);
; Interleaves separate U and V rows into a packed UV row.

%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq    ; address src_v relative to src_u

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_uq + src_vq]
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1        ; first 8 UV pairs
    punpckhbw  m0, m0, m1        ; next 8 UV pairs
%if cpuflag(AVX2)
    vperm2i128 m1, m2, m0, 0x20  ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31  ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro
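
; For clarity, an illustrative scalar C sketch of what MergeUVRow_ computes
; (not part of the build; the function name below is hypothetical and uint8 is
; assumed to be the libyuv byte typedef):
;
;   void MergeUVRow_Reference(const uint8* src_u, const uint8* src_v,
;                             uint8* dst_uv, int width) {
;     int x;
;     for (x = 0; x < width; ++x) {
;       dst_uv[0] = src_u[x];  // even output byte is U
;       dst_uv[1] = src_v[x];  // odd output byte is V
;       dst_uv += 2;
;     }
;   }
;
; The SIMD loop above produces the same interleaved layout, mmsize pixels per
; iteration.
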
INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,
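
; For reference, SplitUVRow above is the inverse of MergeUVRow_: it splits a
; packed UV row back into planar U and V. An illustrative scalar C sketch
; (not part of the build; the function name is hypothetical, uint8 as above):
;
;   void SplitUVRow_Reference(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
;                             int width) {
;     int x;
;     for (x = 0; x < width; ++x) {
;       dst_u[x] = src_uv[0];  // even input byte is U
;       dst_v[x] = src_uv[1];  // odd input byte is V
;       src_uv += 2;
;     }
;   }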