/*
 *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
// Transpose a strip of 8 rows of 8-bit pixels, 8 columns at a time:
// reads 8 bytes from each of 8 consecutive source rows (rows are
// |src_stride| bytes apart), interleaves them through three rounds of
// byte/word/dword unpacks, and writes each resulting 8-byte column to a
// destination row (|dst_stride| bytes apart).  After each 8x8 tile the
// source pointer is stepped back up 8 rows and right 8 bytes
// (lea 0x8(%0,%3,8) after neg), so the loop consumes |width| source rows
// 8 at a time (sub $0x8,%2 / jg 1b).  Requires SSSE3 (palignr).
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 2 \n"
    "1: \n"
    "movq (%0),%%xmm0 \n"
    "movq (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "movq (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    // palignr by 8 on a register with itself moves the high qword into
    // the low qword, splitting each interleaved pair into two halves.
    "palignr $0x8,%%xmm1,%%xmm1 \n"
    "movq (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movq (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "movq (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movq (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    // Rewind src: negate the stride, step back 8 rows and advance 8
    // bytes, then restore the stride sign for the next iteration.
    "neg %3 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "lea 0x8(%0,%3,8),%0 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "neg %3 \n"
    // Second round of bit swap.
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "palignr $0x8,%%xmm2,%%xmm2 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "movdqa %%xmm5,%%xmm7 \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq %%xmm4,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "palignr $0x8,%%xmm4,%%xmm4 \n"
    "movq %%xmm4,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm6,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "movq %%xmm2,(%1) \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movq %%xmm6,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm1,%%xmm5 \n"
    "movq %%xmm1,(%1) \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq %%xmm5,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movq %%xmm3,(%1) \n"
    "movdqa %%xmm3,%%xmm7 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    // sub is placed before the final stores to give the flags time to
    // settle before jg consumes them.
    "sub $0x8,%2 \n"
    "movq %%xmm7,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "jg 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    : "r"((intptr_t)(src_stride)),  // %3
      "r"((intptr_t)(dst_stride))   // %4
    : "memory", "cc",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__)
// 32-bit-only hand-written function (top-level asm, not inline asm in a C
// body): transposes 8 rows of interleaved UV pairs, splitting them into a
// transposed U plane (dst_a) and V plane (dst_b), 8 output rows per
// iteration (sub $0x8,%ecx).  Guarded from clang, which rejects this
// top-level-asm + DECLARE_FUNCTION technique.
//
// Stack layout after the 4 pushes (16 bytes) plus the return address:
//   0x14(%esp) src          -> %eax
//   0x18(%esp) src_stride   -> %edi
//   0x1c(%esp) dst_a        -> %edx
//   0x20(%esp) dst_stride_a -> %esi
//   0x24(%esp) dst_b        -> %ebx
//   0x28(%esp) dst_stride_b -> %ebp
//   0x2c(%esp) width        -> %ecx (loaded via saved %ecx after the
//                              frame is realigned)
// The prologue then carves out a 20-byte, 16-byte-aligned scratch frame:
// 16 bytes at (%esp) to spill one xmm register (only 8 xmm regs exist on
// i386) and 4 bytes at 0x10(%esp) to save the original %esp.
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b, int width);
asm (
  DECLARE_FUNCTION(TransposeUVWx8_SSE2)
    "push %ebx \n"
    "push %esi \n"
    "push %edi \n"
    "push %ebp \n"
    "mov 0x14(%esp),%eax \n"
    "mov 0x18(%esp),%edi \n"
    "mov 0x1c(%esp),%edx \n"
    "mov 0x20(%esp),%esi \n"
    "mov 0x24(%esp),%ebx \n"
    "mov 0x28(%esp),%ebp \n"
    // Save %esp, allocate and 16-byte-align a scratch frame, stash the
    // old %esp in it, then fetch width through the saved frame pointer.
    "mov %esp,%ecx \n"
    "sub $0x14,%esp \n"
    "and $0xfffffff0,%esp \n"
    "mov %ecx,0x10(%esp) \n"
    "mov 0x2c(%ecx),%ecx \n"

    // First round of bit swap: interleave bytes of row pairs.
    "1: \n"
    "movdqu (%eax),%xmm0 \n"
    "movdqu (%eax,%edi,1),%xmm1 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm0,%xmm7 \n"
    "punpcklbw %xmm1,%xmm0 \n"
    "punpckhbw %xmm1,%xmm7 \n"
    "movdqa %xmm7,%xmm1 \n"
    "movdqu (%eax),%xmm2 \n"
    "movdqu (%eax,%edi,1),%xmm3 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm2,%xmm7 \n"
    "punpcklbw %xmm3,%xmm2 \n"
    "punpckhbw %xmm3,%xmm7 \n"
    "movdqa %xmm7,%xmm3 \n"
    "movdqu (%eax),%xmm4 \n"
    "movdqu (%eax,%edi,1),%xmm5 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm4,%xmm7 \n"
    "punpcklbw %xmm5,%xmm4 \n"
    "punpckhbw %xmm5,%xmm7 \n"
    "movdqa %xmm7,%xmm5 \n"
    "movdqu (%eax),%xmm6 \n"
    "movdqu (%eax,%edi,1),%xmm7 \n"
    "lea (%eax,%edi,2),%eax \n"
    // Spill xmm5 to the scratch slot: all 8 xmm registers are live here.
    "movdqu %xmm5,(%esp) \n"
    "neg %edi \n"
    "movdqa %xmm6,%xmm5 \n"
    "punpcklbw %xmm7,%xmm6 \n"
    "punpckhbw %xmm7,%xmm5 \n"
    "movdqa %xmm5,%xmm7 \n"
    // Rewind src 8 rows up and 16 bytes right, restore stride sign.
    "lea 0x10(%eax,%edi,8),%eax \n"
    "neg %edi \n"
    // Second round of bit swap: interleave words.
    "movdqa %xmm0,%xmm5 \n"
    "punpcklwd %xmm2,%xmm0 \n"
    "punpckhwd %xmm2,%xmm5 \n"
    "movdqa %xmm5,%xmm2 \n"
    "movdqa %xmm1,%xmm5 \n"
    "punpcklwd %xmm3,%xmm1 \n"
    "punpckhwd %xmm3,%xmm5 \n"
    "movdqa %xmm5,%xmm3 \n"
    "movdqa %xmm4,%xmm5 \n"
    "punpcklwd %xmm6,%xmm4 \n"
    "punpckhwd %xmm6,%xmm5 \n"
    "movdqa %xmm5,%xmm6 \n"
    // Swap the spilled value back in; xmm6 takes its place in scratch.
    "movdqu (%esp),%xmm5 \n"
    "movdqu %xmm6,(%esp) \n"
    "movdqa %xmm5,%xmm6 \n"
    "punpcklwd %xmm7,%xmm5 \n"
    "punpckhwd %xmm7,%xmm6 \n"
    "movdqa %xmm6,%xmm7 \n"
    // Third round of bit swap (dwords) and write-out: movlpd emits the
    // low 8 bytes (U column) and movhpd the high 8 bytes (V column).
    "movdqa %xmm0,%xmm6 \n"
    "punpckldq %xmm4,%xmm0 \n"
    "punpckhdq %xmm4,%xmm6 \n"
    "movdqa %xmm6,%xmm4 \n"
    "movdqu (%esp),%xmm6 \n"
    "movlpd %xmm0,(%edx) \n"
    "movhpd %xmm0,(%ebx) \n"
    "movlpd %xmm4,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm4,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm2,%xmm0 \n"
    "punpckldq %xmm6,%xmm2 \n"
    "movlpd %xmm2,(%edx) \n"
    "movhpd %xmm2,(%ebx) \n"
    "punpckhdq %xmm6,%xmm0 \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm1,%xmm0 \n"
    "punpckldq %xmm5,%xmm1 \n"
    "movlpd %xmm1,(%edx) \n"
    "movhpd %xmm1,(%ebx) \n"
    "punpckhdq %xmm5,%xmm0 \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm3,%xmm0 \n"
    "punpckldq %xmm7,%xmm3 \n"
    "movlpd %xmm3,(%edx) \n"
    "movhpd %xmm3,(%ebx) \n"
    "punpckhdq %xmm7,%xmm0 \n"
    "sub $0x8,%ecx \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "jg 1b \n"
    // Epilogue: restore the unaligned %esp saved in the scratch frame,
    // then the callee-saved registers.
    "mov 0x10(%esp),%esp \n"
    "pop %ebp \n"
    "pop %edi \n"
    "pop %esi \n"
    "pop %ebx \n"
#if defined(__native_client__)
    // Native Client sandboxed return: mask the return address to a
    // 32-byte bundle boundary and jump to it.
    "pop %ecx \n"
    "and $0xffffffe0,%ecx \n"
    "jmp *%ecx \n"
#else
    "ret \n"
#endif
);
#endif
#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
    defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
// Same three-round unpack scheme as TransposeWx8_SSSE3, but full 16-byte
// loads with the high halves tracked in xmm8-xmm15, so each pass through
// the loop transposes a 16x8 tile and consumes 16 of |width|
// (sub $0x10,%2).
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
    "movdqu (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm8,%%xmm9 \n"
    "palignr $0x8,%%xmm1,%%xmm1 \n"
    "palignr $0x8,%%xmm9,%%xmm9 \n"
    "movdqu (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm2,%%xmm10 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm10 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movdqa %%xmm10,%%xmm11 \n"
    "movdqu (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "palignr $0x8,%%xmm11,%%xmm11 \n"
    "movdqu (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm4,%%xmm12 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm12 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movdqa %%xmm12,%%xmm13 \n"
    "movdqu (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "palignr $0x8,%%xmm13,%%xmm13 \n"
    "movdqu (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm6,%%xmm14 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    "punpckhbw %%xmm7,%%xmm14 \n"
    // Rewind src: back 8 rows, forward 16 bytes for the next tile.
    "neg %3 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "movdqa %%xmm14,%%xmm15 \n"
    "lea 0x10(%0,%3,8),%0 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    "neg %3 \n"
    // Second round of bit swap.
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "palignr $0x8,%%xmm2,%%xmm2 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "movdqa %%xmm5,%%xmm7 \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "punpcklwd %%xmm10,%%xmm8 \n"
    "punpcklwd %%xmm11,%%xmm9 \n"
    "movdqa %%xmm8,%%xmm10 \n"
    "movdqa %%xmm9,%%xmm11 \n"
    "palignr $0x8,%%xmm10,%%xmm10 \n"
    "palignr $0x8,%%xmm11,%%xmm11 \n"
    "punpcklwd %%xmm14,%%xmm12 \n"
    "punpcklwd %%xmm15,%%xmm13 \n"
    "movdqa %%xmm12,%%xmm14 \n"
    "movdqa %%xmm13,%%xmm15 \n"
    "palignr $0x8,%%xmm14,%%xmm14 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq %%xmm4,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "palignr $0x8,%%xmm4,%%xmm4 \n"
    "movq %%xmm4,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm6,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "movq %%xmm2,(%1) \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movq %%xmm6,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm1,%%xmm5 \n"
    "movq %%xmm1,(%1) \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq %%xmm5,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movq %%xmm3,(%1) \n"
    "movdqa %%xmm3,%%xmm7 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "movq %%xmm7,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    // Second 8 output rows come from the high-half registers xmm8-xmm15.
    "punpckldq %%xmm12,%%xmm8 \n"
    "movq %%xmm8,(%1) \n"
    "movdqa %%xmm8,%%xmm12 \n"
    "palignr $0x8,%%xmm12,%%xmm12 \n"
    "movq %%xmm12,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm14,%%xmm10 \n"
    "movdqa %%xmm10,%%xmm14 \n"
    "movq %%xmm10,(%1) \n"
    "palignr $0x8,%%xmm14,%%xmm14 \n"
    "punpckldq %%xmm13,%%xmm9 \n"
    "movq %%xmm14,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm9,%%xmm13 \n"
    "movq %%xmm9,(%1) \n"
    "palignr $0x8,%%xmm13,%%xmm13 \n"
    "movq %%xmm13,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm15,%%xmm11 \n"
    "movq %%xmm11,(%1) \n"
    "movdqa %%xmm11,%%xmm15 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    "sub $0x10,%2 \n"
    "movq %%xmm15,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "jg 1b \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    : "r"((intptr_t)(src_stride)),  // %3
      "r"((intptr_t)(dst_stride))   // %4
    : "memory", "cc",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
  );
}

// 64-bit transpose of 8 rows of interleaved UV pairs into separate
// transposed U (dst_a) and V (dst_b) planes: each 16-byte load covers 8
// UV pairs; after the dword unpack, movlpd writes the U half-register
// and movhpd the V half-register.  Uses xmm8/xmm9 as scratch instead of
// the i386 version's stack spill.  Consumes |width| 8 rows per
// iteration (sub $0x8,%3).
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 2 \n"
    "1: \n"
    "movdqu (%0),%%xmm0 \n"
    "movdqu (%0,%4),%%xmm1 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm1 \n"
    "movdqu (%0),%%xmm2 \n"
    "movdqu (%0,%4),%%xmm3 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm2,%%xmm8 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm3 \n"
    "movdqu (%0),%%xmm4 \n"
    "movdqu (%0,%4),%%xmm5 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm4,%%xmm8 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm5 \n"
    "movdqu (%0),%%xmm6 \n"
    "movdqu (%0,%4),%%xmm7 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm6,%%xmm8 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    // Rewind src 8 rows up and 16 bytes right, restore stride sign.
    "neg %4 \n"
    "lea 0x10(%0,%4,8),%0 \n"
    "punpckhbw %%xmm7,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm7 \n"
    "neg %4 \n"
    // Second round of bit swap.
    "movdqa %%xmm0,%%xmm8 \n"
    "movdqa %%xmm1,%%xmm9 \n"
    "punpckhwd %%xmm2,%%xmm8 \n"
    "punpckhwd %%xmm3,%%xmm9 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm8,%%xmm2 \n"
    "movdqa %%xmm9,%%xmm3 \n"
    "movdqa %%xmm4,%%xmm8 \n"
    "movdqa %%xmm5,%%xmm9 \n"
    "punpckhwd %%xmm6,%%xmm8 \n"
    "punpckhwd %%xmm7,%%xmm9 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm8,%%xmm6 \n"
    "movdqa %%xmm9,%%xmm7 \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "movdqa %%xmm0,%%xmm8 \n"
    "punpckldq %%xmm4,%%xmm0 \n"
    "movlpd %%xmm0,(%1) \n"  // Write back U channel
    "movhpd %%xmm0,(%2) \n"  // Write back V channel
    "punpckhdq %%xmm4,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm2,%%xmm8 \n"
    "punpckldq %%xmm6,%%xmm2 \n"
    "movlpd %%xmm2,(%1) \n"
    "movhpd %%xmm2,(%2) \n"
    "punpckhdq %%xmm6,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm1,%%xmm8 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movlpd %%xmm1,(%1) \n"
    "movhpd %%xmm1,(%2) \n"
    "punpckhdq %%xmm5,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm3,%%xmm8 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movlpd %%xmm3,(%1) \n"
    "movhpd %%xmm3,(%2) \n"
    "punpckhdq %%xmm7,%%xmm8 \n"
    "sub $0x8,%3 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "jg 1b \n"
    : "+r"(src),    // %0
      "+r"(dst_a),  // %1
      "+r"(dst_b),  // %2
      "+r"(width)   // %3
    : "r"((intptr_t)(src_stride)),    // %4
      "r"((intptr_t)(dst_stride_a)),  // %5
      "r"((intptr_t)(dst_stride_b))   // %6
    : "memory", "cc",
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
      "xmm8", "xmm9"
  );
}
#endif
#endif

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif