/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate_row.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

// Transpose an 8-row strip of bytes: each loop iteration reads 8 bytes
// from each of 8 consecutive source rows (rows are src_stride apart) and
// writes them out as 8 rows of 8 bytes each at dst (dst_stride apart),
// i.e. an 8x8 byte transpose per pass, advancing 8 columns per pass
// (sub ecx, 8). NOTE(review): the loop only counts down by 8, so width
// is presumably required to be a multiple of 8 — confirm with callers.
//
// __declspec(naked): the compiler emits no prologue/epilogue; arguments
// are addressed as [esp + 12 + N] because of the three pushes below.
// All of xmm0-xmm7 are clobbered.
__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
                                          int src_stride,
                                          uint8* dst,
                                          int dst_stride,
                                          int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap: interleave bytes of row pairs
    // (punpcklbw), using palignr to split each result into two
    // 8-byte halves held in separate registers.
    // NOTE(review): `align 4` is MSVC inline-asm specific; clang-cl
    // support should be verified (upstream later dropped it).
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]          // ebp = src + 8: start of next 8-column pass
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]    // advance two rows
    punpcklbw xmm0, xmm1              // rows 0,1 byte-interleaved
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8           // xmm1 = high half of xmm0
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3              // rows 2,3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5              // rows 4,5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7              // rows 6,7
    mov       eax, ebp                // rewind src to next 8 columns
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap: interleave 16-bit words of the
    // row-pair results, again splitting halves with palignr.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap (32-bit dwords), interleaved with the
    // stores: each punpckldq result yields two transposed 8-byte rows
    // (low qword, then high qword via palignr).
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8                  // 8 columns consumed per pass
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

// Transpose an 8-row strip of two-byte pixel pairs, splitting each
// transposed 16-byte result into two planes: the low qword of each
// result goes to dst_a (movlpd) and the high qword to dst_b (movhpd).
// Each iteration consumes 16 bytes per row (8 pairs) and decrements the
// counter by 8, so `w` counts pixel pairs. NOTE(review): given the name,
// presumably the even (U) and odd (V) bytes of interleaved UV input end
// up in dst_a and dst_b respectively — verify against callers.
//
// __declspec(naked): arguments are at [esp + 16 + N] due to the four
// pushes below. All of xmm0-xmm7 are clobbered. Because only 8 xmm
// registers exist in 32-bit mode, one register is spilled to a
// hand-aligned stack slot during each round (see [esp] backup/restore).
__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
                                           int src_stride,
                                           uint8* dst_a,
                                           int dst_stride_a,
                                           uint8* dst_b,
                                           int dst_stride_b,
                                           int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // Carve a 16-byte-aligned 16-byte spill slot out of the stack.
    // The pre-adjustment esp is kept in ecx (and saved at [esp + 16])
    // so the last argument can still be reached and esp restored later.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w: loaded via the saved original esp

    align      4
    // Read in the data from the source pointer.
    // First round of bit swap: byte-interleave row pairs; punpckhbw
    // results are shuffled through xmm7 as a temp, and xmm5 is spilled
    // to the stack slot to free a register for rows 6,7.
 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]
    movdqu    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]
    movdqu    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]
    movdqu    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqu    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    // Rewind src by the 8 rows just read (edi is negated around the
    // lea) and advance 16 bytes to the next group of 8 pixel pairs.
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap: word-interleave, with xmm5 restored
    // from the stack and xmm6 spilled in its place.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5   // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6

    // Third round of bit swap (dwords), fused with the plane-split
    // stores: low qword of each result to dst_a, high qword to dst_b.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8       // 8 pixel pairs consumed per pass
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]  // restore original stack pointer
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif