/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate.h"

#include "libyuv/cpu_id.h"
#include "libyuv/convert.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
    ".text \n" \
    ".private_extern _" #name " \n" \
    ".align 4,0x90 \n" \
"_" #name ": \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
    ".text \n" \
    ".align 4,0x90 \n" \
"_" #name ": \n"
#else
#define DECLARE_FUNCTION(name) \
    ".text \n" \
    ".align 4,0x90 \n" \
#name ": \n"
#endif
#endif

#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
#define HAS_MIRRORROW_UV_NEON
void MirrorRowUV_NEON(const uint8* src,
                      uint8* dst_a, uint8* dst_b,
                      int width);
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width);
#endif  // defined(__ARM_NEON__)

#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  __asm {
    push esi
    push ebp
    mov eax, [esp + 12 + 4]   // src
    mov edi, [esp + 12 + 8]   // src_stride
    mov edx, [esp + 12 + 12]  // dst
    mov esi, [esp + 12 + 16]  // dst_stride
    mov ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align 16
  convertloop:
    movq xmm0, qword ptr [eax]
    lea ebp, [eax + 8]
    movq xmm1, qword ptr [eax + edi]
    lea eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq xmm2, qword ptr [eax]
    movdqa xmm1, xmm0
    palignr xmm1, xmm1, 8
    movq xmm3, qword ptr [eax + edi]
    lea eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa xmm3, xmm2
    movq xmm4, qword ptr [eax]
    palignr xmm3, xmm3, 8
    movq xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea eax, [eax + 2 * edi]
    movdqa xmm5, xmm4
    movq xmm6, qword ptr [eax]
    palignr xmm5, xmm5, 8
    movq xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov eax, ebp
    movdqa xmm7, xmm6
    palignr xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa xmm2, xmm0
    movdqa xmm3, xmm1
    palignr xmm2, xmm2, 8
    palignr xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa xmm6, xmm4
    movdqa xmm7, xmm5
    palignr xmm6, xmm6, 8
    palignr xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq qword ptr [edx], xmm0
    movdqa xmm4, xmm0
    palignr xmm4, xmm4, 8
    movq qword ptr [edx + esi], xmm4
    lea edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa xmm6, xmm2
    palignr xmm6, xmm6, 8
    movq qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq qword ptr [edx + esi], xmm6
    lea edx, [edx + 2 * esi]
    movdqa xmm5, xmm1
    movq qword ptr [edx], xmm1
    palignr xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq qword ptr [edx + esi], xmm5
    lea edx, [edx + 2 * esi]
    movq qword ptr [edx], xmm3
    movdqa xmm7, xmm3
    palignr xmm7, xmm7, 8
    sub ecx, 8
    movq qword ptr [edx + esi], xmm7
    lea edx, [edx + 2 * esi]
    jg convertloop

    pop ebp
    pop esi
    pop edi
    ret
  }
}

#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  __asm {
    push ebx
    push esi
    push edi
    push ebp
    mov eax, [esp + 16 + 4]   // src
    mov edi, [esp + 16 + 8]   // src_stride
    mov edx, [esp + 16 + 12]  // dst_a
    mov esi, [esp + 16 + 16]  // dst_stride_a
    mov ebx, [esp + 16 + 20]  // dst_b
    mov ebp, [esp + 16 + 24]  // dst_stride_b
    mov ecx, esp
    sub esp, 4 + 16
    and esp, ~15
    mov [esp + 16], ecx
    mov ecx, [ecx + 16 + 28]  // w

    align 16
  convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa xmm1, xmm7
    movdqa xmm2, [eax]
    movdqa xmm3, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa xmm3, xmm7
    movdqa xmm4, [eax]
    movdqa xmm5, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa xmm5, xmm7
    movdqa xmm6, [eax]
    movdqa xmm7, [eax + edi]
    lea eax, [eax + 2 * edi]
    movdqa [esp], xmm5  // backup xmm5
    neg edi
    movdqa xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa xmm7, xmm5
    lea eax, [eax + 8 * edi + 16]
    neg edi
    // Second round of bit swap.
    movdqa xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa xmm2, xmm5
    movdqa xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa xmm3, xmm5
    movdqa xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa xmm6, xmm5
    movdqa xmm5, [esp]  // restore xmm5
    movdqa [esp], xmm6  // backup xmm6
    movdqa xmm6, xmm5   // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa xmm4, xmm6
    movdqa xmm6, [esp]  // restore xmm6
    movlpd qword ptr [edx], xmm0
    movhpd qword ptr [ebx], xmm0
    movlpd qword ptr [edx + esi], xmm4
    lea edx, [edx + 2 * esi]
    movhpd qword ptr [ebx + ebp], xmm4
    lea ebx, [ebx + 2 * ebp]
    movdqa xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd qword ptr [edx], xmm2
    movhpd qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd qword ptr [edx + esi], xmm0
    lea edx, [edx + 2 * esi]
    movhpd qword ptr [ebx + ebp], xmm0
    lea ebx, [ebx + 2 * ebp]
    movdqa xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd qword ptr [edx], xmm1
    movhpd qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd qword ptr [edx + esi], xmm0
    lea edx, [edx + 2 * esi]
    movhpd qword ptr [ebx + ebp], xmm0
    lea ebx, [ebx + 2 * ebp]
    movdqa xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd qword ptr [edx], xmm3
    movhpd qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub ecx, 8
    movlpd qword ptr [edx + esi], xmm0
    lea edx, [edx + 2 * esi]
    movhpd qword ptr [ebx + ebp], xmm0
    lea ebx, [ebx + 2 * ebp]
    jg convertloop

    mov esp, [esp + 16]
    pop ebp
    pop edi
    pop esi
    pop ebx
    ret
  }
}
#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 4 \n"
    "1: \n"
    "movq (%0),%%xmm0 \n"
    "movq (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "movq (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "palignr $0x8,%%xmm1,%%xmm1 \n"
    "movq (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movq (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "movq (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movq (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    "neg %3 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "lea 0x8(%0,%3,8),%0 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "neg %3 \n"
    // Second round of bit swap.
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "palignr $0x8,%%xmm2,%%xmm2 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "movdqa %%xmm5,%%xmm7 \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    // Third round of bit swap.
    // Write to the destination pointer.
334 "punpckldq %%xmm4,%%xmm0 \n" 335 "movq %%xmm0,(%1) \n" 336 "movdqa %%xmm0,%%xmm4 \n" 337 "palignr $0x8,%%xmm4,%%xmm4 \n" 338 "movq %%xmm4,(%1,%4) \n" 339 "lea (%1,%4,2),%1 \n" 340 "punpckldq %%xmm6,%%xmm2 \n" 341 "movdqa %%xmm2,%%xmm6 \n" 342 "movq %%xmm2,(%1) \n" 343 "palignr $0x8,%%xmm6,%%xmm6 \n" 344 "punpckldq %%xmm5,%%xmm1 \n" 345 "movq %%xmm6,(%1,%4) \n" 346 "lea (%1,%4,2),%1 \n" 347 "movdqa %%xmm1,%%xmm5 \n" 348 "movq %%xmm1,(%1) \n" 349 "palignr $0x8,%%xmm5,%%xmm5 \n" 350 "movq %%xmm5,(%1,%4) \n" 351 "lea (%1,%4,2),%1 \n" 352 "punpckldq %%xmm7,%%xmm3 \n" 353 "movq %%xmm3,(%1) \n" 354 "movdqa %%xmm3,%%xmm7 \n" 355 "palignr $0x8,%%xmm7,%%xmm7 \n" 356 "sub $0x8,%2 \n" 357 "movq %%xmm7,(%1,%4) \n" 358 "lea (%1,%4,2),%1 \n" 359 "jg 1b \n" 360 : "+r"(src), // %0 361 "+r"(dst), // %1 362 "+r"(width) // %2 363 : "r"(static_cast<intptr_t>(src_stride)), // %3 364 "r"(static_cast<intptr_t>(dst_stride)) // %4 365 : "memory", "cc" 366 #if defined(__SSE2__) 367 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 368 #endif 369 ); 370 } 371 372 #if !defined(YUV_DISABLE_ASM) && defined (__i386__) 373 #define HAS_TRANSPOSE_UVWX8_SSE2 374 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride, 375 uint8* dst_a, int dst_stride_a, 376 uint8* dst_b, int dst_stride_b, 377 int w); 378 asm ( 379 DECLARE_FUNCTION(TransposeUVWx8_SSE2) 380 "push %ebx \n" 381 "push %esi \n" 382 "push %edi \n" 383 "push %ebp \n" 384 "mov 0x14(%esp),%eax \n" 385 "mov 0x18(%esp),%edi \n" 386 "mov 0x1c(%esp),%edx \n" 387 "mov 0x20(%esp),%esi \n" 388 "mov 0x24(%esp),%ebx \n" 389 "mov 0x28(%esp),%ebp \n" 390 "mov %esp,%ecx \n" 391 "sub $0x14,%esp \n" 392 "and $0xfffffff0,%esp \n" 393 "mov %ecx,0x10(%esp) \n" 394 "mov 0x2c(%ecx),%ecx \n" 395 396 "1: \n" 397 "movdqa (%eax),%xmm0 \n" 398 "movdqa (%eax,%edi,1),%xmm1 \n" 399 "lea (%eax,%edi,2),%eax \n" 400 "movdqa %xmm0,%xmm7 \n" 401 "punpcklbw %xmm1,%xmm0 \n" 402 "punpckhbw %xmm1,%xmm7 \n" 403 "movdqa %xmm7,%xmm1 \n" 404 "movdqa (%eax),%xmm2 \n" 405 "movdqa (%eax,%edi,1),%xmm3 \n" 406 "lea (%eax,%edi,2),%eax \n" 407 "movdqa %xmm2,%xmm7 \n" 408 "punpcklbw %xmm3,%xmm2 \n" 409 "punpckhbw %xmm3,%xmm7 \n" 410 "movdqa %xmm7,%xmm3 \n" 411 "movdqa (%eax),%xmm4 \n" 412 "movdqa (%eax,%edi,1),%xmm5 \n" 413 "lea (%eax,%edi,2),%eax \n" 414 "movdqa %xmm4,%xmm7 \n" 415 "punpcklbw %xmm5,%xmm4 \n" 416 "punpckhbw %xmm5,%xmm7 \n" 417 "movdqa %xmm7,%xmm5 \n" 418 "movdqa (%eax),%xmm6 \n" 419 "movdqa (%eax,%edi,1),%xmm7 \n" 420 "lea (%eax,%edi,2),%eax \n" 421 "movdqa %xmm5,(%esp) \n" 422 "neg %edi \n" 423 "movdqa %xmm6,%xmm5 \n" 424 "punpcklbw %xmm7,%xmm6 \n" 425 "punpckhbw %xmm7,%xmm5 \n" 426 "movdqa %xmm5,%xmm7 \n" 427 "lea 0x10(%eax,%edi,8),%eax \n" 428 "neg %edi \n" 429 "movdqa %xmm0,%xmm5 \n" 430 "punpcklwd %xmm2,%xmm0 \n" 431 "punpckhwd %xmm2,%xmm5 \n" 432 "movdqa %xmm5,%xmm2 \n" 433 "movdqa %xmm1,%xmm5 \n" 434 "punpcklwd %xmm3,%xmm1 \n" 435 "punpckhwd %xmm3,%xmm5 \n" 436 "movdqa %xmm5,%xmm3 \n" 437 "movdqa %xmm4,%xmm5 \n" 438 "punpcklwd %xmm6,%xmm4 \n" 439 "punpckhwd %xmm6,%xmm5 \n" 440 "movdqa %xmm5,%xmm6 \n" 441 "movdqa (%esp),%xmm5 \n" 442 "movdqa %xmm6,(%esp) \n" 443 "movdqa %xmm5,%xmm6 \n" 444 "punpcklwd %xmm7,%xmm5 \n" 445 "punpckhwd %xmm7,%xmm6 \n" 446 "movdqa %xmm6,%xmm7 \n" 447 "movdqa %xmm0,%xmm6 \n" 448 "punpckldq %xmm4,%xmm0 \n" 449 "punpckhdq %xmm4,%xmm6 \n" 450 "movdqa %xmm6,%xmm4 \n" 451 "movdqa (%esp),%xmm6 \n" 452 "movlpd %xmm0,(%edx) \n" 453 "movhpd %xmm0,(%ebx) \n" 454 "movlpd %xmm4,(%edx,%esi,1) \n" 455 "lea (%edx,%esi,2),%edx \n" 456 "movhpd 
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm2,%xmm0 \n"
    "punpckldq %xmm6,%xmm2 \n"
    "movlpd %xmm2,(%edx) \n"
    "movhpd %xmm2,(%ebx) \n"
    "punpckhdq %xmm6,%xmm0 \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm1,%xmm0 \n"
    "punpckldq %xmm5,%xmm1 \n"
    "movlpd %xmm1,(%edx) \n"
    "movhpd %xmm1,(%ebx) \n"
    "punpckhdq %xmm5,%xmm0 \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm3,%xmm0 \n"
    "punpckldq %xmm7,%xmm3 \n"
    "movlpd %xmm3,(%edx) \n"
    "movhpd %xmm3,(%ebx) \n"
    "punpckhdq %xmm7,%xmm0 \n"
    "sub $0x8,%ecx \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "jg 1b \n"
    "mov 0x10(%esp),%esp \n"
    "pop %ebp \n"
    "pop %edi \n"
    "pop %esi \n"
    "pop %ebx \n"
    "ret \n"
);
#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 4 \n"
    "1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
    "movdqa (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm8,%%xmm9 \n"
    "palignr $0x8,%%xmm1,%%xmm1 \n"
    "palignr $0x8,%%xmm9,%%xmm9 \n"
    "movdqa (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm2,%%xmm10 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm10 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movdqa %%xmm10,%%xmm11 \n"
    "movdqa (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "palignr $0x8,%%xmm11,%%xmm11 \n"
    "movdqa (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm4,%%xmm12 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm12 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movdqa %%xmm12,%%xmm13 \n"
    "movdqa (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "palignr $0x8,%%xmm13,%%xmm13 \n"
    "movdqa (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm6,%%xmm14 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    "punpckhbw %%xmm7,%%xmm14 \n"
    "neg %3 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "movdqa %%xmm14,%%xmm15 \n"
    "lea 0x10(%0,%3,8),%0 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    "neg %3 \n"
    // Second round of bit swap.
548 "punpcklwd %%xmm2,%%xmm0 \n" 549 "punpcklwd %%xmm3,%%xmm1 \n" 550 "movdqa %%xmm0,%%xmm2 \n" 551 "movdqa %%xmm1,%%xmm3 \n" 552 "palignr $0x8,%%xmm2,%%xmm2 \n" 553 "palignr $0x8,%%xmm3,%%xmm3 \n" 554 "punpcklwd %%xmm6,%%xmm4 \n" 555 "punpcklwd %%xmm7,%%xmm5 \n" 556 "movdqa %%xmm4,%%xmm6 \n" 557 "movdqa %%xmm5,%%xmm7 \n" 558 "palignr $0x8,%%xmm6,%%xmm6 \n" 559 "palignr $0x8,%%xmm7,%%xmm7 \n" 560 "punpcklwd %%xmm10,%%xmm8 \n" 561 "punpcklwd %%xmm11,%%xmm9 \n" 562 "movdqa %%xmm8,%%xmm10 \n" 563 "movdqa %%xmm9,%%xmm11 \n" 564 "palignr $0x8,%%xmm10,%%xmm10 \n" 565 "palignr $0x8,%%xmm11,%%xmm11 \n" 566 "punpcklwd %%xmm14,%%xmm12 \n" 567 "punpcklwd %%xmm15,%%xmm13 \n" 568 "movdqa %%xmm12,%%xmm14 \n" 569 "movdqa %%xmm13,%%xmm15 \n" 570 "palignr $0x8,%%xmm14,%%xmm14 \n" 571 "palignr $0x8,%%xmm15,%%xmm15 \n" 572 // Third round of bit swap. 573 // Write to the destination pointer. 574 "punpckldq %%xmm4,%%xmm0 \n" 575 "movq %%xmm0,(%1) \n" 576 "movdqa %%xmm0,%%xmm4 \n" 577 "palignr $0x8,%%xmm4,%%xmm4 \n" 578 "movq %%xmm4,(%1,%4) \n" 579 "lea (%1,%4,2),%1 \n" 580 "punpckldq %%xmm6,%%xmm2 \n" 581 "movdqa %%xmm2,%%xmm6 \n" 582 "movq %%xmm2,(%1) \n" 583 "palignr $0x8,%%xmm6,%%xmm6 \n" 584 "punpckldq %%xmm5,%%xmm1 \n" 585 "movq %%xmm6,(%1,%4) \n" 586 "lea (%1,%4,2),%1 \n" 587 "movdqa %%xmm1,%%xmm5 \n" 588 "movq %%xmm1,(%1) \n" 589 "palignr $0x8,%%xmm5,%%xmm5 \n" 590 "movq %%xmm5,(%1,%4) \n" 591 "lea (%1,%4,2),%1 \n" 592 "punpckldq %%xmm7,%%xmm3 \n" 593 "movq %%xmm3,(%1) \n" 594 "movdqa %%xmm3,%%xmm7 \n" 595 "palignr $0x8,%%xmm7,%%xmm7 \n" 596 "movq %%xmm7,(%1,%4) \n" 597 "lea (%1,%4,2),%1 \n" 598 "punpckldq %%xmm12,%%xmm8 \n" 599 "movq %%xmm8,(%1) \n" 600 "movdqa %%xmm8,%%xmm12 \n" 601 "palignr $0x8,%%xmm12,%%xmm12 \n" 602 "movq %%xmm12,(%1,%4) \n" 603 "lea (%1,%4,2),%1 \n" 604 "punpckldq %%xmm14,%%xmm10 \n" 605 "movdqa %%xmm10,%%xmm14 \n" 606 "movq %%xmm10,(%1) \n" 607 "palignr $0x8,%%xmm14,%%xmm14 \n" 608 "punpckldq %%xmm13,%%xmm9 \n" 609 "movq %%xmm14,(%1,%4) \n" 610 "lea (%1,%4,2),%1 \n" 611 "movdqa %%xmm9,%%xmm13 \n" 612 "movq %%xmm9,(%1) \n" 613 "palignr $0x8,%%xmm13,%%xmm13 \n" 614 "movq %%xmm13,(%1,%4) \n" 615 "lea (%1,%4,2),%1 \n" 616 "punpckldq %%xmm15,%%xmm11 \n" 617 "movq %%xmm11,(%1) \n" 618 "movdqa %%xmm11,%%xmm15 \n" 619 "palignr $0x8,%%xmm15,%%xmm15 \n" 620 "sub $0x10,%2 \n" 621 "movq %%xmm15,(%1,%4) \n" 622 "lea (%1,%4,2),%1 \n" 623 "jg 1b \n" 624 : "+r"(src), // %0 625 "+r"(dst), // %1 626 "+r"(width) // %2 627 : "r"(static_cast<intptr_t>(src_stride)), // %3 628 "r"(static_cast<intptr_t>(dst_stride)) // %4 629 : "memory", "cc", 630 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", 631 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" 632 ); 633 } 634 635 #define HAS_TRANSPOSE_UVWX8_SSE2 636 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, 637 uint8* dst_a, int dst_stride_a, 638 uint8* dst_b, int dst_stride_b, 639 int w) { 640 asm volatile ( 641 // Read in the data from the source pointer. 642 // First round of bit swap. 
643 ".p2align 4 \n" 644 "1: \n" 645 "movdqa (%0),%%xmm0 \n" 646 "movdqa (%0,%4),%%xmm1 \n" 647 "lea (%0,%4,2),%0 \n" 648 "movdqa %%xmm0,%%xmm8 \n" 649 "punpcklbw %%xmm1,%%xmm0 \n" 650 "punpckhbw %%xmm1,%%xmm8 \n" 651 "movdqa %%xmm8,%%xmm1 \n" 652 "movdqa (%0),%%xmm2 \n" 653 "movdqa (%0,%4),%%xmm3 \n" 654 "lea (%0,%4,2),%0 \n" 655 "movdqa %%xmm2,%%xmm8 \n" 656 "punpcklbw %%xmm3,%%xmm2 \n" 657 "punpckhbw %%xmm3,%%xmm8 \n" 658 "movdqa %%xmm8,%%xmm3 \n" 659 "movdqa (%0),%%xmm4 \n" 660 "movdqa (%0,%4),%%xmm5 \n" 661 "lea (%0,%4,2),%0 \n" 662 "movdqa %%xmm4,%%xmm8 \n" 663 "punpcklbw %%xmm5,%%xmm4 \n" 664 "punpckhbw %%xmm5,%%xmm8 \n" 665 "movdqa %%xmm8,%%xmm5 \n" 666 "movdqa (%0),%%xmm6 \n" 667 "movdqa (%0,%4),%%xmm7 \n" 668 "lea (%0,%4,2),%0 \n" 669 "movdqa %%xmm6,%%xmm8 \n" 670 "punpcklbw %%xmm7,%%xmm6 \n" 671 "neg %4 \n" 672 "lea 0x10(%0,%4,8),%0 \n" 673 "punpckhbw %%xmm7,%%xmm8 \n" 674 "movdqa %%xmm8,%%xmm7 \n" 675 "neg %4 \n" 676 // Second round of bit swap. 677 "movdqa %%xmm0,%%xmm8 \n" 678 "movdqa %%xmm1,%%xmm9 \n" 679 "punpckhwd %%xmm2,%%xmm8 \n" 680 "punpckhwd %%xmm3,%%xmm9 \n" 681 "punpcklwd %%xmm2,%%xmm0 \n" 682 "punpcklwd %%xmm3,%%xmm1 \n" 683 "movdqa %%xmm8,%%xmm2 \n" 684 "movdqa %%xmm9,%%xmm3 \n" 685 "movdqa %%xmm4,%%xmm8 \n" 686 "movdqa %%xmm5,%%xmm9 \n" 687 "punpckhwd %%xmm6,%%xmm8 \n" 688 "punpckhwd %%xmm7,%%xmm9 \n" 689 "punpcklwd %%xmm6,%%xmm4 \n" 690 "punpcklwd %%xmm7,%%xmm5 \n" 691 "movdqa %%xmm8,%%xmm6 \n" 692 "movdqa %%xmm9,%%xmm7 \n" 693 // Third round of bit swap. 694 // Write to the destination pointer. 695 "movdqa %%xmm0,%%xmm8 \n" 696 "punpckldq %%xmm4,%%xmm0 \n" 697 "movlpd %%xmm0,(%1) \n" // Write back U channel 698 "movhpd %%xmm0,(%2) \n" // Write back V channel 699 "punpckhdq %%xmm4,%%xmm8 \n" 700 "movlpd %%xmm8,(%1,%5) \n" 701 "lea (%1,%5,2),%1 \n" 702 "movhpd %%xmm8,(%2,%6) \n" 703 "lea (%2,%6,2),%2 \n" 704 "movdqa %%xmm2,%%xmm8 \n" 705 "punpckldq %%xmm6,%%xmm2 \n" 706 "movlpd %%xmm2,(%1) \n" 707 "movhpd %%xmm2,(%2) \n" 708 "punpckhdq %%xmm6,%%xmm8 \n" 709 "movlpd %%xmm8,(%1,%5) \n" 710 "lea (%1,%5,2),%1 \n" 711 "movhpd %%xmm8,(%2,%6) \n" 712 "lea (%2,%6,2),%2 \n" 713 "movdqa %%xmm1,%%xmm8 \n" 714 "punpckldq %%xmm5,%%xmm1 \n" 715 "movlpd %%xmm1,(%1) \n" 716 "movhpd %%xmm1,(%2) \n" 717 "punpckhdq %%xmm5,%%xmm8 \n" 718 "movlpd %%xmm8,(%1,%5) \n" 719 "lea (%1,%5,2),%1 \n" 720 "movhpd %%xmm8,(%2,%6) \n" 721 "lea (%2,%6,2),%2 \n" 722 "movdqa %%xmm3,%%xmm8 \n" 723 "punpckldq %%xmm7,%%xmm3 \n" 724 "movlpd %%xmm3,(%1) \n" 725 "movhpd %%xmm3,(%2) \n" 726 "punpckhdq %%xmm7,%%xmm8 \n" 727 "sub $0x8,%3 \n" 728 "movlpd %%xmm8,(%1,%5) \n" 729 "lea (%1,%5,2),%1 \n" 730 "movhpd %%xmm8,(%2,%6) \n" 731 "lea (%2,%6,2),%2 \n" 732 "jg 1b \n" 733 : "+r"(src), // %0 734 "+r"(dst_a), // %1 735 "+r"(dst_b), // %2 736 "+r"(w) // %3 737 : "r"(static_cast<intptr_t>(src_stride)), // %4 738 "r"(static_cast<intptr_t>(dst_stride_a)), // %5 739 "r"(static_cast<intptr_t>(dst_stride_b)) // %6 740 : "memory", "cc", 741 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", 742 "xmm8", "xmm9" 743 ); 744 } 745 #endif 746 #endif 747 748 static void TransposeWx8_C(const uint8* src, int src_stride, 749 uint8* dst, int dst_stride, 750 int width) { 751 for (int i = 0; i < width; ++i) { 752 dst[0] = src[0 * src_stride]; 753 dst[1] = src[1 * src_stride]; 754 dst[2] = src[2 * src_stride]; 755 dst[3] = src[3 * src_stride]; 756 dst[4] = src[4 * src_stride]; 757 dst[5] = src[5 * src_stride]; 758 dst[6] = src[6 * src_stride]; 759 dst[7] = src[7 * src_stride]; 760 ++src; 761 dst += dst_stride; 762 } 763 
}

static void TransposeWxH_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < height; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}

LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  void (*TransposeWx8)(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    TransposeWx8 = TransposeWx8_NEON;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
    TransposeWx8 = TransposeWx8_SSSE3;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    TransposeWx8 = TransposeWx8_FAST_SSSE3;
  }
#endif

  // Work across the source in 8x8 tiles.
  int i = height;
  while (i >= 8) {
    TransposeWx8(src, src_stride, dst, dst_stride, width);
    src += 8 * src_stride;  // Go down 8 rows.
    dst += 8;               // Move over 8 columns.
    i -= 8;
  }

  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}

LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
                   uint8* dst, int dst_stride,
                   int width, int height) {
  // Rotate by 90 is a transpose with the source read
  // from bottom to top. So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
  src += src_stride * (height - 1);
  src_stride = -src_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  // Rotate by 270 is a transpose with the destination written
  // from bottom to top. So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
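  // In coordinate terms, a source pixel at (x, y) of a width x height plane
  // ends up at destination (y, width - 1 - x): the plain transpose plus the
  // reversed destination stride below.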
  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
#if defined(HAS_MIRRORROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MirrorRow = MirrorRow_NEON;
  }
#endif
#if defined(HAS_MIRRORROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    MirrorRow = MirrorRow_SSE2;
  }
#endif
#if defined(HAS_MIRRORROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    MirrorRow = MirrorRow_SSSE3;
  }
#endif
  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
    CopyRow = CopyRow_NEON;
  }
#endif
#if defined(HAS_COPYROW_X86)
  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
    CopyRow = CopyRow_X86;
  }
#endif
#if defined(HAS_COPYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    CopyRow = CopyRow_SSE2;
  }
#endif
  if (width > kMaxStride) {
    return;
  }
  // Swap first and last row and mirror the content. Uses a temporary row.
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  const uint8* src_bot = src + src_stride * (height - 1);
  uint8* dst_bot = dst + dst_stride * (height - 1);
  int half_height = (height + 1) >> 1;
  // Odd height will harmlessly mirror the middle row twice.
  for (int y = 0; y < half_height; ++y) {
    MirrorRow(src, row, width);      // Mirror first row into a buffer
    src += src_stride;
    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
    dst += dst_stride;
    CopyRow(row, dst_bot, width);    // Copy first mirrored row into last
    src_bot -= src_stride;
    dst_bot -= dst_stride;
  }
}

static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width) {
  for (int i = 0; i < width; ++i) {
    dst_a[0] = src[0 * src_stride + 0];
    dst_b[0] = src[0 * src_stride + 1];
    dst_a[1] = src[1 * src_stride + 0];
    dst_b[1] = src[1 * src_stride + 1];
    dst_a[2] = src[2 * src_stride + 0];
    dst_b[2] = src[2 * src_stride + 1];
    dst_a[3] = src[3 * src_stride + 0];
    dst_b[3] = src[3 * src_stride + 1];
    dst_a[4] = src[4 * src_stride + 0];
    dst_b[4] = src[4 * src_stride + 1];
    dst_a[5] = src[5 * src_stride + 0];
    dst_b[5] = src[5 * src_stride + 1];
    dst_a[6] = src[6 * src_stride + 0];
    dst_b[6] = src[6 * src_stride + 1];
    dst_a[7] = src[7 * src_stride + 0];
    dst_b[7] = src[7 * src_stride + 1];
    src += 2;
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}

static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width, int height) {
  for (int i = 0; i < width * 2; i += 2)
    for (int j = 0; j < height; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    }
}

LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  void (*TransposeUVWx8)(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    TransposeUVWx8 = TransposeUVWx8_NEON;
  }
#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 8) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    TransposeUVWx8 = TransposeUVWx8_SSE2;
  }
#endif

  // Work through the source in 8x8 tiles.
  int i = height;
  while (i >= 8) {
    TransposeUVWx8(src, src_stride,
                   dst_a, dst_stride_a,
                   dst_b, dst_stride_b,
                   width);
    src += 8 * src_stride;  // Go down 8 rows.
    dst_a += 8;             // Move over 8 columns.
    dst_b += 8;             // Move over 8 columns.
    i -= 8;
  }

  TransposeUVWxH_C(src, src_stride,
                   dst_a, dst_stride_a,
                   dst_b, dst_stride_b,
                   width, i);
}

LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
                uint8* dst_a, int dst_stride_a,
                uint8* dst_b, int dst_stride_b,
                int width, int height) {
  src += src_stride * (height - 1);
  src_stride = -src_stride;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  dst_a += dst_stride_a * (width - 1);
  dst_b += dst_stride_b * (width - 1);
  dst_stride_a = -dst_stride_a;
  dst_stride_b = -dst_stride_b;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

// Rotate 180 is a horizontal and vertical flip.
LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
      MirrorRowUV_C;
#if defined(HAS_MIRRORROW_UV_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MirrorRowUV = MirrorRowUV_NEON;
  }
#elif defined(HAS_MIRRORROW_UV_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    MirrorRowUV = MirrorRowUV_SSSE3;
  }
#endif

  dst_a += dst_stride_a * (height - 1);
  dst_b += dst_stride_b * (height - 1);

  for (int i = 0; i < height; ++i) {
    MirrorRowUV(src, dst_a, dst_b, width);
    src += src_stride;
    dst_a -= dst_stride_a;
    dst_b -= dst_stride_b;
  }
}

LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height,
               RotationMode mode) {
  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
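  // For example, passing height = -480 for a 640x480 frame points the source
  // planes at their last rows and negates the strides, so the rotation is
  // applied to a vertically flipped source.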
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
                      src_v, src_stride_v,
                      dst_y, dst_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotatePlane90(src_u, src_stride_u,
                    dst_u, dst_stride_u,
                    halfwidth, halfheight);
      RotatePlane90(src_v, src_stride_v,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane270(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane270(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane180(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane180(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,
                     uint8* dst_v, int dst_stride_v,
                     int width, int height,
                     RotationMode mode) {
  if (!src_y || !src_uv || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return NV12ToI420(src_y, src_stride_y,
                        src_uv, src_stride_uv,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotateUV90(src_uv, src_stride_uv,
                 dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV270(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV180(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif