/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "rotate_priv.h"

#include "libyuv/cpu_id.h"

namespace libyuv {

#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#if defined(_MSC_VER)
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif
// Shuffle table for reversing the bytes.
extern "C" TALIGN16(const uint8, kShuffleReverse[16]) =
  { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u };
// Shuffle table for reversing the bytes of UV channels.
extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) =
  { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u };
#endif

typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int);
typedef void (*reverse_func)(const uint8*, uint8*, int);
typedef void (*rotate_uv_wx8_func)(const uint8*, int,
                                   uint8*, int,
                                   uint8*, int, int);
typedef void (*rotate_uv_wxh_func)(const uint8*, int,
                                   uint8*, int,
                                   uint8*, int, int, int);
typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);
typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int);

#if 0  // Need to add rotate_neon.s to the build to enable this
#ifdef __ARM_NEON__
extern "C" {
void RestoreRegisters_NEON(unsigned long long *restore);
void SaveRegisters_NEON(unsigned long long *store);
#define HAS_REVERSE_LINE_NEON
void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
#define HAS_REVERSE_LINE_UV_NEON
void ReverseLineUV_NEON(const uint8* src,
                        uint8* dst_a, uint8* dst_b,
                        int width);
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width);
}  // extern "C"
#endif
#endif
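
// Note on the SIMD transpose helpers below (descriptive comment added for
// clarity; the wording is ours, not the original authors'): each Wx8 helper
// reads 8 source rows and transposes them 8 (or 16) bytes at a time using
// three rounds of interleaves. punpcklbw interleaves the bytes of adjacent
// row pairs, punpcklwd then interleaves 16-bit pairs, and punpckldq finally
// interleaves 32-bit groups, at which point each register holds one
// transposed output row; palignr is used between rounds to move upper halves
// into position. The UV variants follow the same pattern but split the
// interleaved U and V bytes into the two destination planes as they write.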

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked)
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    sub       ecx, 8
    ja        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked)
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqa    xmm2, [eax]
    movdqa    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqa    xmm4, [eax]
    movdqa    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqa    xmm6, [eax]
    movdqa    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqa    xmm5, [esp]  // restore xmm5
    movdqa    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5   // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqa    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    sub       ecx, 8
    ja        convertloop

    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
#elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  asm volatile(
  "1:"
    // Read in the data from the source pointer.
    // First round of bit swap.
    "movq (%0),%%xmm0\n"
    "movq (%0,%3),%%xmm1\n"
    "lea (%0,%3,2),%0\n"
    "punpcklbw %%xmm1,%%xmm0\n"
    "movq (%0),%%xmm2\n"
    "movdqa %%xmm0,%%xmm1\n"
    "palignr $0x8,%%xmm1,%%xmm1\n"
    "movq (%0,%3),%%xmm3\n"
    "lea (%0,%3,2),%0\n"
    "punpcklbw %%xmm3,%%xmm2\n"
    "movdqa %%xmm2,%%xmm3\n"
    "movq (%0),%%xmm4\n"
    "palignr $0x8,%%xmm3,%%xmm3\n"
    "movq (%0,%3),%%xmm5\n"
    "lea (%0,%3,2),%0\n"
    "punpcklbw %%xmm5,%%xmm4\n"
    "movdqa %%xmm4,%%xmm5\n"
    "movq (%0),%%xmm6\n"
    "palignr $0x8,%%xmm5,%%xmm5\n"
    "movq (%0,%3),%%xmm7\n"
    "lea (%0,%3,2),%0\n"
    "punpcklbw %%xmm7,%%xmm6\n"
    "neg %3\n"
    "movdqa %%xmm6,%%xmm7\n"
    "lea 0x8(%0,%3,8),%0\n"
    "palignr $0x8,%%xmm7,%%xmm7\n"
    "neg %3\n"
    // Second round of bit swap.
    "punpcklwd %%xmm2,%%xmm0\n"
    "punpcklwd %%xmm3,%%xmm1\n"
    "movdqa %%xmm0,%%xmm2\n"
    "movdqa %%xmm1,%%xmm3\n"
    "palignr $0x8,%%xmm2,%%xmm2\n"
    "palignr $0x8,%%xmm3,%%xmm3\n"
    "punpcklwd %%xmm6,%%xmm4\n"
    "punpcklwd %%xmm7,%%xmm5\n"
    "movdqa %%xmm4,%%xmm6\n"
    "movdqa %%xmm5,%%xmm7\n"
    "palignr $0x8,%%xmm6,%%xmm6\n"
    "palignr $0x8,%%xmm7,%%xmm7\n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq %%xmm4,%%xmm0\n"
    "movq %%xmm0,(%1)\n"
    "movdqa %%xmm0,%%xmm4\n"
    "palignr $0x8,%%xmm4,%%xmm4\n"
    "movq %%xmm4,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "punpckldq %%xmm6,%%xmm2\n"
    "movdqa %%xmm2,%%xmm6\n"
    "movq %%xmm2,(%1)\n"
    "palignr $0x8,%%xmm6,%%xmm6\n"
    "punpckldq %%xmm5,%%xmm1\n"
    "movq %%xmm6,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "movdqa %%xmm1,%%xmm5\n"
    "movq %%xmm1,(%1)\n"
    "palignr $0x8,%%xmm5,%%xmm5\n"
    "movq %%xmm5,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "punpckldq %%xmm7,%%xmm3\n"
    "movq %%xmm3,(%1)\n"
    "movdqa %%xmm3,%%xmm7\n"
    "palignr $0x8,%%xmm7,%%xmm7\n"
    "movq %%xmm7,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "sub $0x8,%2\n"
    "ja 1b\n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    : "r"(static_cast<intptr_t>(src_stride)),  // %3
      "r"(static_cast<intptr_t>(dst_stride))   // %4
    : "memory"
  );
}

#if defined (__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                    uint8* dst_a, int dst_stride_a,
                                    uint8* dst_b, int dst_stride_b,
                                    int w);
asm(
  ".text\n"
#if defined(OSX)
  ".globl _TransposeUVWx8_SSE2\n"
  "_TransposeUVWx8_SSE2:\n"
#else
  ".global TransposeUVWx8_SSE2\n"
  "TransposeUVWx8_SSE2:\n"
#endif
  "push %ebx\n"
  "push %esi\n"
  "push %edi\n"
  "push %ebp\n"
  "mov 0x14(%esp),%eax\n"
  "mov 0x18(%esp),%edi\n"
  "mov 0x1c(%esp),%edx\n"
  "mov 0x20(%esp),%esi\n"
  "mov 0x24(%esp),%ebx\n"
  "mov 0x28(%esp),%ebp\n"
  "mov %esp,%ecx\n"
  "sub $0x14,%esp\n"
  "and $0xfffffff0,%esp\n"
  "mov %ecx,0x10(%esp)\n"
  "mov 0x2c(%ecx),%ecx\n"

"1:"
  "movdqa (%eax),%xmm0\n"
  "movdqa (%eax,%edi,1),%xmm1\n"
  "lea (%eax,%edi,2),%eax\n"
  "movdqa %xmm0,%xmm7\n"
  "punpcklbw %xmm1,%xmm0\n"
  "punpckhbw %xmm1,%xmm7\n"
  "movdqa %xmm7,%xmm1\n"
  "movdqa (%eax),%xmm2\n"
  "movdqa (%eax,%edi,1),%xmm3\n"
  "lea (%eax,%edi,2),%eax\n"
  "movdqa %xmm2,%xmm7\n"
  "punpcklbw %xmm3,%xmm2\n"
  "punpckhbw %xmm3,%xmm7\n"
  "movdqa %xmm7,%xmm3\n"
  "movdqa (%eax),%xmm4\n"
  "movdqa (%eax,%edi,1),%xmm5\n"
  "lea (%eax,%edi,2),%eax\n"
  "movdqa %xmm4,%xmm7\n"
  "punpcklbw %xmm5,%xmm4\n"
  "punpckhbw %xmm5,%xmm7\n"
  "movdqa %xmm7,%xmm5\n"
  "movdqa (%eax),%xmm6\n"
  "movdqa (%eax,%edi,1),%xmm7\n"
  "lea (%eax,%edi,2),%eax\n"
  "movdqa %xmm5,(%esp)\n"
  "neg %edi\n"
  "movdqa %xmm6,%xmm5\n"
  "punpcklbw %xmm7,%xmm6\n"
  "punpckhbw %xmm7,%xmm5\n"
  "movdqa %xmm5,%xmm7\n"
  "lea 0x10(%eax,%edi,8),%eax\n"
  "neg %edi\n"
  "movdqa %xmm0,%xmm5\n"
  "punpcklwd %xmm2,%xmm0\n"
  "punpckhwd %xmm2,%xmm5\n"
  "movdqa %xmm5,%xmm2\n"
  "movdqa %xmm1,%xmm5\n"
  "punpcklwd %xmm3,%xmm1\n"
  "punpckhwd %xmm3,%xmm5\n"
  "movdqa %xmm5,%xmm3\n"
  "movdqa %xmm4,%xmm5\n"
  "punpcklwd %xmm6,%xmm4\n"
  "punpckhwd %xmm6,%xmm5\n"
  "movdqa %xmm5,%xmm6\n"
  "movdqa (%esp),%xmm5\n"
  "movdqa %xmm6,(%esp)\n"
  "movdqa %xmm5,%xmm6\n"
  "punpcklwd %xmm7,%xmm5\n"
  "punpckhwd %xmm7,%xmm6\n"
  "movdqa %xmm6,%xmm7\n"
  "movdqa %xmm0,%xmm6\n"
  "punpckldq %xmm4,%xmm0\n"
  "punpckhdq %xmm4,%xmm6\n"
  "movdqa %xmm6,%xmm4\n"
  "movdqa (%esp),%xmm6\n"
  "movlpd %xmm0,(%edx)\n"
  "movhpd %xmm0,(%ebx)\n"
  "movlpd %xmm4,(%edx,%esi,1)\n"
  "lea (%edx,%esi,2),%edx\n"
  "movhpd %xmm4,(%ebx,%ebp,1)\n"
  "lea (%ebx,%ebp,2),%ebx\n"
  "movdqa %xmm2,%xmm0\n"
  "punpckldq %xmm6,%xmm2\n"
  "movlpd %xmm2,(%edx)\n"
  "movhpd %xmm2,(%ebx)\n"
  "punpckhdq %xmm6,%xmm0\n"
  "movlpd %xmm0,(%edx,%esi,1)\n"
  "lea (%edx,%esi,2),%edx\n"
  "movhpd %xmm0,(%ebx,%ebp,1)\n"
  "lea (%ebx,%ebp,2),%ebx\n"
  "movdqa %xmm1,%xmm0\n"
  "punpckldq %xmm5,%xmm1\n"
  "movlpd %xmm1,(%edx)\n"
  "movhpd %xmm1,(%ebx)\n"
  "punpckhdq %xmm5,%xmm0\n"
  "movlpd %xmm0,(%edx,%esi,1)\n"
  "lea (%edx,%esi,2),%edx\n"
  "movhpd %xmm0,(%ebx,%ebp,1)\n"
  "lea (%ebx,%ebp,2),%ebx\n"
  "movdqa %xmm3,%xmm0\n"
  "punpckldq %xmm7,%xmm3\n"
  "movlpd %xmm3,(%edx)\n"
  "movhpd %xmm3,(%ebx)\n"
  "punpckhdq %xmm7,%xmm0\n"
  "movlpd %xmm0,(%edx,%esi,1)\n"
  "lea (%edx,%esi,2),%edx\n"
  "movhpd %xmm0,(%ebx,%ebp,1)\n"
  "lea (%ebx,%ebp,2),%ebx\n"
  "sub $0x8,%ecx\n"
  "ja 1b\n"
  "mov 0x10(%esp),%esp\n"
  "pop %ebp\n"
  "pop %edi\n"
  "pop %esi\n"
  "pop %ebx\n"
  "ret\n"
);
#elif defined (__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride, int width) {
  asm volatile(
  "1:"
    // Read in the data from the source pointer.
    // First round of bit swap.
    "movdqa (%0),%%xmm0\n"
    "movdqa (%0,%3),%%xmm1\n"
    "lea (%0,%3,2),%0\n"
    "movdqa %%xmm0,%%xmm8\n"
    "punpcklbw %%xmm1,%%xmm0\n"
    "punpckhbw %%xmm1,%%xmm8\n"
    "movdqa (%0),%%xmm2\n"
    "movdqa %%xmm0,%%xmm1\n"
    "movdqa %%xmm8,%%xmm9\n"
    "palignr $0x8,%%xmm1,%%xmm1\n"
    "palignr $0x8,%%xmm9,%%xmm9\n"
    "movdqa (%0,%3),%%xmm3\n"
    "lea (%0,%3,2),%0\n"
    "movdqa %%xmm2,%%xmm10\n"
    "punpcklbw %%xmm3,%%xmm2\n"
    "punpckhbw %%xmm3,%%xmm10\n"
    "movdqa %%xmm2,%%xmm3\n"
    "movdqa %%xmm10,%%xmm11\n"
    "movdqa (%0),%%xmm4\n"
    "palignr $0x8,%%xmm3,%%xmm3\n"
    "palignr $0x8,%%xmm11,%%xmm11\n"
    "movdqa (%0,%3),%%xmm5\n"
    "lea (%0,%3,2),%0\n"
    "movdqa %%xmm4,%%xmm12\n"
    "punpcklbw %%xmm5,%%xmm4\n"
    "punpckhbw %%xmm5,%%xmm12\n"
    "movdqa %%xmm4,%%xmm5\n"
    "movdqa %%xmm12,%%xmm13\n"
    "movdqa (%0),%%xmm6\n"
    "palignr $0x8,%%xmm5,%%xmm5\n"
    "palignr $0x8,%%xmm13,%%xmm13\n"
    "movdqa (%0,%3),%%xmm7\n"
    "lea (%0,%3,2),%0\n"
    "movdqa %%xmm6,%%xmm14\n"
    "punpcklbw %%xmm7,%%xmm6\n"
    "punpckhbw %%xmm7,%%xmm14\n"
    "neg %3\n"
    "movdqa %%xmm6,%%xmm7\n"
    "movdqa %%xmm14,%%xmm15\n"
    "lea 0x10(%0,%3,8),%0\n"
    "palignr $0x8,%%xmm7,%%xmm7\n"
    "palignr $0x8,%%xmm15,%%xmm15\n"
    "neg %3\n"
    // Second round of bit swap.
    "punpcklwd %%xmm2,%%xmm0\n"
    "punpcklwd %%xmm3,%%xmm1\n"
    "movdqa %%xmm0,%%xmm2\n"
    "movdqa %%xmm1,%%xmm3\n"
    "palignr $0x8,%%xmm2,%%xmm2\n"
    "palignr $0x8,%%xmm3,%%xmm3\n"
    "punpcklwd %%xmm6,%%xmm4\n"
    "punpcklwd %%xmm7,%%xmm5\n"
    "movdqa %%xmm4,%%xmm6\n"
    "movdqa %%xmm5,%%xmm7\n"
    "palignr $0x8,%%xmm6,%%xmm6\n"
    "palignr $0x8,%%xmm7,%%xmm7\n"
    "punpcklwd %%xmm10,%%xmm8\n"
    "punpcklwd %%xmm11,%%xmm9\n"
    "movdqa %%xmm8,%%xmm10\n"
    "movdqa %%xmm9,%%xmm11\n"
    "palignr $0x8,%%xmm10,%%xmm10\n"
    "palignr $0x8,%%xmm11,%%xmm11\n"
    "punpcklwd %%xmm14,%%xmm12\n"
    "punpcklwd %%xmm15,%%xmm13\n"
    "movdqa %%xmm12,%%xmm14\n"
    "movdqa %%xmm13,%%xmm15\n"
    "palignr $0x8,%%xmm14,%%xmm14\n"
    "palignr $0x8,%%xmm15,%%xmm15\n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq %%xmm4,%%xmm0\n"
    "movq %%xmm0,(%1)\n"
    "movdqa %%xmm0,%%xmm4\n"
    "palignr $0x8,%%xmm4,%%xmm4\n"
    "movq %%xmm4,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "punpckldq %%xmm6,%%xmm2\n"
    "movdqa %%xmm2,%%xmm6\n"
    "movq %%xmm2,(%1)\n"
    "palignr $0x8,%%xmm6,%%xmm6\n"
    "punpckldq %%xmm5,%%xmm1\n"
    "movq %%xmm6,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "movdqa %%xmm1,%%xmm5\n"
    "movq %%xmm1,(%1)\n"
    "palignr $0x8,%%xmm5,%%xmm5\n"
    "movq %%xmm5,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "punpckldq %%xmm7,%%xmm3\n"
    "movq %%xmm3,(%1)\n"
    "movdqa %%xmm3,%%xmm7\n"
    "palignr $0x8,%%xmm7,%%xmm7\n"
    "movq %%xmm7,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "punpckldq %%xmm12,%%xmm8\n"
    "movq %%xmm8,(%1)\n"
    "movdqa %%xmm8,%%xmm12\n"
    "palignr $0x8,%%xmm12,%%xmm12\n"
    "movq %%xmm12,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "punpckldq %%xmm14,%%xmm10\n"
    "movdqa %%xmm10,%%xmm14\n"
    "movq %%xmm10,(%1)\n"
    "palignr $0x8,%%xmm14,%%xmm14\n"
    "punpckldq %%xmm13,%%xmm9\n"
    "movq %%xmm14,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "movdqa %%xmm9,%%xmm13\n"
    "movq %%xmm9,(%1)\n"
    "palignr $0x8,%%xmm13,%%xmm13\n"
    "movq %%xmm13,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "punpckldq %%xmm15,%%xmm11\n"
    "movq %%xmm11,(%1)\n"
    "movdqa %%xmm11,%%xmm15\n"
    "palignr $0x8,%%xmm15,%%xmm15\n"
    "movq %%xmm15,(%1,%4)\n"
    "lea (%1,%4,2),%1\n"
    "sub $0x10,%2\n"
    "ja 1b\n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    : "r"(static_cast<intptr_t>(src_stride)),  // %3
      "r"(static_cast<intptr_t>(dst_stride))   // %4
    : "memory"
  );
}

#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  asm volatile(
  "1:"
    // Read in the data from the source pointer.
    // First round of bit swap.
    "movdqa (%0),%%xmm0\n"
    "movdqa (%0,%4),%%xmm1\n"
    "lea (%0,%4,2),%0\n"
    "movdqa %%xmm0,%%xmm8\n"
    "punpcklbw %%xmm1,%%xmm0\n"
    "punpckhbw %%xmm1,%%xmm8\n"
    "movdqa %%xmm8,%%xmm1\n"
    "movdqa (%0),%%xmm2\n"
    "movdqa (%0,%4),%%xmm3\n"
    "lea (%0,%4,2),%0\n"
    "movdqa %%xmm2,%%xmm8\n"
    "punpcklbw %%xmm3,%%xmm2\n"
    "punpckhbw %%xmm3,%%xmm8\n"
    "movdqa %%xmm8,%%xmm3\n"
    "movdqa (%0),%%xmm4\n"
    "movdqa (%0,%4),%%xmm5\n"
    "lea (%0,%4,2),%0\n"
    "movdqa %%xmm4,%%xmm8\n"
    "punpcklbw %%xmm5,%%xmm4\n"
    "punpckhbw %%xmm5,%%xmm8\n"
    "movdqa %%xmm8,%%xmm5\n"
    "movdqa (%0),%%xmm6\n"
    "movdqa (%0,%4),%%xmm7\n"
    "lea (%0,%4,2),%0\n"
    "movdqa %%xmm6,%%xmm8\n"
    "punpcklbw %%xmm7,%%xmm6\n"
    "neg %4\n"
    "lea 0x10(%0,%4,8),%0\n"
    "punpckhbw %%xmm7,%%xmm8\n"
    "movdqa %%xmm8,%%xmm7\n"
    "neg %4\n"
    // Second round of bit swap.
    "movdqa %%xmm0,%%xmm8\n"
    "movdqa %%xmm1,%%xmm9\n"
    "punpckhwd %%xmm2,%%xmm8\n"
    "punpckhwd %%xmm3,%%xmm9\n"
    "punpcklwd %%xmm2,%%xmm0\n"
    "punpcklwd %%xmm3,%%xmm1\n"
    "movdqa %%xmm8,%%xmm2\n"
    "movdqa %%xmm9,%%xmm3\n"
    "movdqa %%xmm4,%%xmm8\n"
    "movdqa %%xmm5,%%xmm9\n"
    "punpckhwd %%xmm6,%%xmm8\n"
    "punpckhwd %%xmm7,%%xmm9\n"
    "punpcklwd %%xmm6,%%xmm4\n"
    "punpcklwd %%xmm7,%%xmm5\n"
    "movdqa %%xmm8,%%xmm6\n"
    "movdqa %%xmm9,%%xmm7\n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "movdqa %%xmm0,%%xmm8\n"
    "punpckldq %%xmm4,%%xmm0\n"
    "movlpd %%xmm0,(%1)\n"  // Write back U channel
    "movhpd %%xmm0,(%2)\n"  // Write back V channel
    "punpckhdq %%xmm4,%%xmm8\n"
    "movlpd %%xmm8,(%1,%5)\n"
    "lea (%1,%5,2),%1\n"
    "movhpd %%xmm8,(%2,%6)\n"
    "lea (%2,%6,2),%2\n"
    "movdqa %%xmm2,%%xmm8\n"
    "punpckldq %%xmm6,%%xmm2\n"
    "movlpd %%xmm2,(%1)\n"
    "movhpd %%xmm2,(%2)\n"
    "punpckhdq %%xmm6,%%xmm8\n"
    "movlpd %%xmm8,(%1,%5)\n"
    "lea (%1,%5,2),%1\n"
    "movhpd %%xmm8,(%2,%6)\n"
    "lea (%2,%6,2),%2\n"
    "movdqa %%xmm1,%%xmm8\n"
    "punpckldq %%xmm5,%%xmm1\n"
    "movlpd %%xmm1,(%1)\n"
    "movhpd %%xmm1,(%2)\n"
    "punpckhdq %%xmm5,%%xmm8\n"
    "movlpd %%xmm8,(%1,%5)\n"
    "lea (%1,%5,2),%1\n"
    "movhpd %%xmm8,(%2,%6)\n"
    "lea (%2,%6,2),%2\n"
    "movdqa %%xmm3,%%xmm8\n"
    "punpckldq %%xmm7,%%xmm3\n"
    "movlpd %%xmm3,(%1)\n"
    "movhpd %%xmm3,(%2)\n"
    "punpckhdq %%xmm7,%%xmm8\n"
    "movlpd %%xmm8,(%1,%5)\n"
    "lea (%1,%5,2),%1\n"
    "movhpd %%xmm8,(%2,%6)\n"
    "lea (%2,%6,2),%2\n"
    "sub $0x8,%3\n"
    "ja 1b\n"
    : "+r"(src),    // %0
      "+r"(dst_a),  // %1
      "+r"(dst_b),  // %2
      "+r"(w)       // %3
    : "r"(static_cast<intptr_t>(src_stride)),    // %4
      "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
      "r"(static_cast<intptr_t>(dst_stride_b))   // %6
    : "memory"
  );
}
#endif
#endif

static void TransposeWx8_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int w) {
  int i;
  for (i = 0; i < w; ++i) {
    dst[0] = src[0 * src_stride];
    dst[1] = src[1 * src_stride];
    dst[2] = src[2 * src_stride];
    dst[3] = src[3 * src_stride];
    dst[4] = src[4 * src_stride];
    dst[5] = src[5 * src_stride];
    dst[6] = src[6 * src_stride];
    dst[7] = src[7 * src_stride];
    ++src;
    dst += dst_stride;
  }
}

static void TransposeWxH_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  int i, j;
  for (i = 0; i < width; ++i)
    for (j = 0; j < height; ++j)
      dst[i * dst_stride + j] = src[j * src_stride + i];
}

void TransposePlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  int i = height;
  rotate_wx8_func TransposeWx8;
  rotate_wxh_func TransposeWxH;

#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_NEON;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_FAST_SSSE3;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_SSSE3;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
  {
    TransposeWx8 = TransposeWx8_C;
    TransposeWxH = TransposeWxH_C;
  }

  // Work across the source in 8x8 tiles.
  while (i >= 8) {
    TransposeWx8(src, src_stride, dst, dst_stride, width);

    src += 8 * src_stride;  // go down 8 rows
    dst += 8;               // move over 8 columns
    i -= 8;
  }

  TransposeWxH(src, src_stride, dst, dst_stride, width, i);
}

void RotatePlane90(const uint8* src, int src_stride,
                   uint8* dst, int dst_stride,
                   int width, int height) {
  // Rotate by 90 is a transpose with the source read
  // from bottom to top. So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
  src += src_stride * (height - 1);
  src_stride = -src_stride;

  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

void RotatePlane270(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  // Rotate by 270 is a transpose with the destination written
  // from bottom to top. So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;

  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}
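
// Illustrative only: a hedged scalar sketch (ours, not part of the build) of
// the 90 degree mapping that RotatePlane90 above implements via
// TransposePlane, namely dst(row, col) = src(height - 1 - col, row). The
// function name and the #if 0 guard are ours; the library's real path is the
// tiled transpose above.
#if 0
static void RotatePlane90_Reference(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride,
                                    int width, int height) {
  // The rotated destination has 'width' rows of 'height' pixels each.
  for (int row = 0; row < width; ++row) {
    for (int col = 0; col < height; ++col) {
      dst[row * dst_stride + col] = src[(height - 1 - col) * src_stride + row];
    }
  }
}
#endif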

static void ReverseLine_C(const uint8* src, uint8* dst, int width) {
  int i;
  src += width - 1;
  for (i = 0; i < width; ++i) {
    dst[i] = src[0];
    --src;
  }
}

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_REVERSE_LINE_SSSE3
__declspec(naked)
static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    movdqa    xmm7, _kShuffleReverse
    lea       eax, [eax + ecx - 16]
 convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm7
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 16
    ja        convertloop
    ret
  }
}

#elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_REVERSE_LINE_SSSE3
static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile(
    "movdqa (%3),%%xmm7\n"
    "lea -0x10(%0,%2,1),%0\n"
  "1:"
    "movdqa (%0),%%xmm0\n"
    "lea -0x10(%0),%0\n"
    "pshufb %%xmm7,%%xmm0\n"
    "movdqa %%xmm0,(%1)\n"
    "lea 0x10(%1),%1\n"
    "sub $0x10,%2\n"
    "ja 1b\n"
    : "+r"(src),         // %0
      "+r"(dst),         // %1
      "+r"(temp_width)   // %2
    : "r"(kShuffleReverse)  // %3
    : "memory"
  );
}
#endif

void RotatePlane180(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  int i;
  reverse_func ReverseLine;

#if defined(HAS_REVERSE_LINE_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
    ReverseLine = ReverseLine_NEON;
  } else
#endif
#if defined(HAS_REVERSE_LINE_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
    ReverseLine = ReverseLine_SSSE3;
  } else
#endif
  {
    ReverseLine = ReverseLine_C;
  }
  // Rotate by 180 is a mirror and vertical flip.
  src += src_stride * (height - 1);

  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst, width);
    src -= src_stride;
    dst += dst_stride;
  }
}

static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int w) {
  int i;
  for (i = 0; i < w; ++i) {
    dst_a[0] = src[0 * src_stride + 0];
    dst_b[0] = src[0 * src_stride + 1];
    dst_a[1] = src[1 * src_stride + 0];
    dst_b[1] = src[1 * src_stride + 1];
    dst_a[2] = src[2 * src_stride + 0];
    dst_b[2] = src[2 * src_stride + 1];
    dst_a[3] = src[3 * src_stride + 0];
    dst_b[3] = src[3 * src_stride + 1];
    dst_a[4] = src[4 * src_stride + 0];
    dst_b[4] = src[4 * src_stride + 1];
    dst_a[5] = src[5 * src_stride + 0];
    dst_b[5] = src[5 * src_stride + 1];
    dst_a[6] = src[6 * src_stride + 0];
    dst_b[6] = src[6 * src_stride + 1];
    dst_a[7] = src[7 * src_stride + 0];
    dst_b[7] = src[7 * src_stride + 1];
    src += 2;
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}

static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int w, int h) {
  int i, j;
  for (i = 0; i < w * 2; i += 2)
    for (j = 0; j < h; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    }
}

void TransposeUV(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i = height;
  rotate_uv_wx8_func TransposeWx8;
  rotate_uv_wxh_func TransposeWxH;

#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  unsigned long long store_reg[8];
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    SaveRegisters_NEON(store_reg);
    TransposeWx8 = TransposeUVWx8_NEON;
    TransposeWxH = TransposeUVWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    TransposeWx8 = TransposeUVWx8_SSE2;
    TransposeWxH = TransposeUVWxH_C;
  } else
#endif
  {
    TransposeWx8 = TransposeUVWx8_C;
    TransposeWxH = TransposeUVWxH_C;
  }

  // Work through the source in 8x8 tiles.
  while (i >= 8) {
    TransposeWx8(src, src_stride,
                 dst_a, dst_stride_a,
                 dst_b, dst_stride_b,
                 width);

    src += 8 * src_stride;  // go down 8 rows
    dst_a += 8;             // move over 8 columns
    dst_b += 8;             // move over 8 columns
    i -= 8;
  }

  TransposeWxH(src, src_stride,
               dst_a, dst_stride_a,
               dst_b, dst_stride_b,
               width, i);

#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    RestoreRegisters_NEON(store_reg);
  }
#endif
}

void RotateUV90(const uint8* src, int src_stride,
                uint8* dst_a, int dst_stride_a,
                uint8* dst_b, int dst_stride_b,
                int width, int height) {
  src += src_stride * (height - 1);
  src_stride = -src_stride;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

void RotateUV270(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  dst_a += dst_stride_a * (width - 1);
  dst_b += dst_stride_b * (width - 1);
  dst_stride_a = -dst_stride_a;
  dst_stride_b = -dst_stride_b;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_REVERSE_LINE_UV_SSSE3
__declspec(naked)
void ReverseLineUV_SSSE3(const uint8* src,
                         uint8* dst_a, uint8* dst_b,
                         int width) {
  __asm {
    push      edi
    mov       eax, [esp + 4 + 4]   // src
    mov       edx, [esp + 4 + 8]   // dst_a
    mov       edi, [esp + 4 + 12]  // dst_b
    mov       ecx, [esp + 4 + 16]  // width
    movdqa    xmm7, _kShuffleReverseUV
    lea       eax, [eax + ecx * 2 - 16]

 convertloop:
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm7
    movlpd    qword ptr [edx], xmm0
    lea       edx, [edx + 8]
    movhpd    qword ptr [edi], xmm0
    lea       edi, [edi + 8]
    sub       ecx, 8
    ja        convertloop
    pop       edi
    ret
  }
}

#elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_REVERSE_LINE_UV_SSSE3
void ReverseLineUV_SSSE3(const uint8* src,
                         uint8* dst_a, uint8* dst_b,
                         int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile(
    "movdqa (%4),%%xmm7\n"
    "lea -0x10(%0,%3,2),%0\n"
  "1:"
    "movdqa (%0),%%xmm0\n"
    "lea -0x10(%0),%0\n"
    "pshufb %%xmm7,%%xmm0\n"
    "movlpd %%xmm0,(%1)\n"
    "lea 0x8(%1),%1\n"
    "movhpd %%xmm0,(%2)\n"
    "lea 0x8(%2),%2\n"
    "sub $0x8,%3\n"
    "ja 1b\n"
    : "+r"(src),         // %0
      "+r"(dst_a),       // %1
      "+r"(dst_b),       // %2
      "+r"(temp_width)   // %3
    : "r"(kShuffleReverseUV)  // %4
    : "memory"
  );
}
#endif

static void ReverseLineUV_C(const uint8* src,
                            uint8* dst_a, uint8* dst_b,
                            int width) {
  int i;
  src += width << 1;
  for (i = 0; i < width; ++i) {
    src -= 2;
    dst_a[i] = src[0];
    dst_b[i] = src[1];
  }
}

void RotateUV180(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i;
  reverse_uv_func ReverseLine;

#if defined(HAS_REVERSE_LINE_UV_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    ReverseLine = ReverseLineUV_NEON;
  } else
#endif
#if defined(HAS_REVERSE_LINE_UV_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    ReverseLine = ReverseLineUV_SSSE3;
  } else
#endif
  {
    ReverseLine = ReverseLineUV_C;
  }

  dst_a += dst_stride_a * (height - 1);
  dst_b += dst_stride_b * (height - 1);

  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst_a, dst_b, width);

    src += src_stride;      // down one line at a time
    dst_a -= dst_stride_a;  // nominally up one line at a time
    dst_b -= dst_stride_b;  // nominally up one line at a time
  }
}
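
// Illustrative only: a hedged sketch (ours, not part of the library) of how a
// caller might use I420Rotate below for a 90 degree rotation. The helper name
// and the assumption of tightly packed planes (stride == plane width) are
// ours. The key point, which follows from RotatePlane90/RotateUV90 above, is
// that the rotated destination has its width and height swapped, so its
// strides come from the source height.
#if 0
static int ExampleRotateI420By90(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, int width, int height,
                                 uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  return I420Rotate(src_y, width,       // source strides: packed planes
                    src_u, halfwidth,
                    src_v, halfwidth,
                    dst_y, height,      // destination is height x width,
                    dst_u, halfheight,  // so strides are based on the height
                    dst_v, halfheight,
                    width, height,
                    kRotate90);
}
#endif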

int I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height,
               RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
                      src_v, src_stride_v,
                      dst_y, dst_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotatePlane90(src_u, src_stride_u,
                    dst_u, dst_stride_u,
                    halfwidth, halfheight);
      RotatePlane90(src_v, src_stride_v,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane270(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane270(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane180(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane180(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,
                     uint8* dst_v, int dst_stride_v,
                     int width, int height,
                     RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return NV12ToI420(src_y, src_uv, src_stride_y,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotateUV90(src_uv, src_stride_uv,
                 dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV270(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV180(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

}  // namespace libyuv