/*
 * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/planar_functions.h"

#include <string.h>

#include "libyuv/cpu_id.h"
#include "row.h"

namespace libyuv {

#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_NEON
// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
static void SplitUV_NEON(const uint8* src_uv,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm__ volatile
  (
    "1:\n"
    "vld2.u8    {q0,q1}, [%0]!    \n"  // load 16 pairs of UV
    "vst1.u8    {q0}, [%1]!       \n"  // store U
    "vst1.u8    {q1}, [%2]!       \n"  // store V
    "subs       %3, %3, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(src_uv),
      "+r"(dst_u),
      "+r"(dst_v),
      "+r"(pix)             // Output registers
    :                       // Input registers
    : "q0", "q1"            // Clobber List
  );
}

#elif (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#if defined(_MSC_VER)
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif

// Shuffle table for converting ABGR to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2
__declspec(naked)
static void SplitUV_SSE2(const uint8* src_uv,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]    // src_uv
    mov        edx, [esp + 4 + 8]    // dst_u
    mov        edi, [esp + 4 + 12]   // dst_v
    mov        ecx, [esp + 4 + 16]   // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm7            // even bytes
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    psrlw      xmm2, 8               // odd bytes
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqa     [edi], xmm2
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         wloop
    pop        edi
    ret
  }
}

#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_SPLITUV_SSE2
static void SplitUV_SSE2(const uint8* src_uv,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile(
    "pcmpeqb    %%xmm7,%%xmm7\n"
    "psrlw      $0x8,%%xmm7\n"
    "1:"
    "movdqa     (%0),%%xmm0\n"
    "movdqa     0x10(%0),%%xmm1\n"
    "lea        0x20(%0),%0\n"
    "movdqa     %%xmm0,%%xmm2\n"
    "movdqa     %%xmm1,%%xmm3\n"
    "pand       %%xmm7,%%xmm0\n"
    "pand       %%xmm7,%%xmm1\n"
    "packuswb   %%xmm1,%%xmm0\n"
    "movdqa     %%xmm0,(%1)\n"
    "lea        0x10(%1),%1\n"
    "psrlw      $0x8,%%xmm2\n"
    "psrlw      $0x8,%%xmm3\n"
    "packuswb   %%xmm3,%%xmm2\n"
    "movdqa     %%xmm2,(%2)\n"
    "lea        0x10(%2),%2\n"
    "sub        $0x10,%3\n"
    "ja         1b\n"
    : "+r"(src_uv),     // %0
      "+r"(dst_u),      // %1
      "+r"(dst_v),      // %2
      "+r"(pix)         // %3
    :
    : "memory"
  );
}
#endif
#endif

static void SplitUV_C(const uint8* src_uv,
                      uint8* dst_u, uint8* dst_v, int pix) {
  // Copy a row of UV.
  for (int x = 0; x < pix; ++x) {
    dst_u[0] = src_uv[0];
    dst_v[0] = src_uv[1];
    src_uv += 2;
    dst_u += 1;
    dst_v += 1;
  }
}

static void I420CopyPlane(const uint8* src_y, int src_stride_y,
                          uint8* dst_y, int dst_stride_y,
                          int width, int height) {
  // Copy plane
  for (int y = 0; y < height; ++y) {
    memcpy(dst_y, src_y, width);
    src_y += src_stride_y;
    dst_y += dst_stride_y;
  }
}

// Copy I420 with optional flipping.
int I420Copy(const uint8* src_y, int src_stride_y,
             const uint8* src_u, int src_stride_u,
             const uint8* src_v, int src_stride_v,
             uint8* dst_y, int dst_stride_y,
             uint8* dst_u, int dst_stride_u,
             uint8* dst_v, int dst_stride_v,
             int width, int height) {
  if (!src_y || !src_u || !src_v ||
      !dst_y || !dst_u || !dst_v ||
      width <= 0 || height == 0) {
    return -1;
  }

  // Negative height means invert the image.
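  // For example, calling I420Copy with height = -240 copies a 320x240 frame
  // bottom-up: the source pointers are moved to the last row of each plane
  // and the strides are negated, so rows are read in reverse order.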
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  I420CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u,
                halfwidth, halfheight);
  I420CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v,
                halfwidth, halfheight);
  return 0;
}

// SetRow32 writes 'count' bytes using a 32 bit value repeated.

#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SETROW_NEON
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
  __asm__ volatile
  (
    "vdup.u32   q0, %2            \n"  // duplicate 4 ints
    "1:\n"
    "vst1.u32   {q0}, [%0]!       \n"  // store
    "subs       %1, %1, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(dst),    // %0
      "+r"(count)   // %1
    : "r"(v32)      // %2
    : "q0", "memory"
  );
}

#elif defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SETROW_SSE2
__declspec(naked)
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
  __asm {
    mov        eax, [esp + 4]    // dst
    movd       xmm7, [esp + 8]   // v32
    mov        ecx, [esp + 12]   // count
    pshufd     xmm7, xmm7, 0

  wloop:
    movdqa     [eax], xmm7
    lea        eax, [eax + 16]
    sub        ecx, 16
    ja         wloop
    ret
  }
}

#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)

#define HAS_SETROW_SSE2
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
  asm volatile(
    "movd       %2, %%xmm7\n"
    "pshufd     $0x0,%%xmm7,%%xmm7\n"
    "1:"
    "movdqa     %%xmm7,(%0)\n"
    "lea        0x10(%0),%0\n"
    "sub        $0x10,%1\n"
    "ja         1b\n"
    : "+r"(dst),    // %0
      "+r"(count)   // %1
    : "r"(v32)      // %2
    : "memory"
  );
}
#endif

static void SetRow8_C(uint8* dst, uint32 v8, int count) {
  memset(dst, v8, count);
}

static void I420SetPlane(uint8* dst_y, int dst_stride_y,
                         int width, int height,
                         int value) {
  void (*SetRow)(uint8* dst, uint32 value, int pix);
#if defined(HAS_SETROW_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    SetRow = SetRow32_NEON;
  } else
#elif defined(HAS_SETROW_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    SetRow = SetRow32_SSE2;
  } else
#endif
  {
    SetRow = SetRow8_C;
  }

  uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
  // Set plane
  for (int y = 0; y < height; ++y) {
    SetRow(dst_y, v32, width);
    dst_y += dst_stride_y;
  }
}

// Draw a rectangle into I420.
int I420Rect(uint8* dst_y, int dst_stride_y,
             uint8* dst_u, int dst_stride_u,
             uint8* dst_v, int dst_stride_v,
             int x, int y,
             int width, int height,
             int value_y, int value_u, int value_v) {
  if (!dst_y || !dst_u || !dst_v ||
      width <= 0 || height == 0 ||
      x < 0 || y < 0 ||
      value_y < 0 || value_y > 255 ||
      value_u < 0 || value_u > 255 ||
      value_v < 0 || value_v > 255) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }

  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  uint8* start_y = dst_y + y * dst_stride_y + x;
  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);

  I420SetPlane(start_y, dst_stride_y, width, height, value_y);
  I420SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
  I420SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
  return 0;
}

// Helper function to copy YUV data without scaling. Used
// by our JPEG conversion callbacks to incrementally fill a YUV image.
int I422ToI420(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (height - 1) * src_stride_u;
    src_v = src_v + (height - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  // Copy Y plane.
  I420CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);

  // SubSample UV planes.
  int x, y;
  int halfwidth = (width + 1) >> 1;
  for (y = 0; y < height; y += 2) {
    const uint8* u0 = src_u;
    const uint8* u1 = src_u + src_stride_u;
    if ((y + 1) >= height) {
      u1 = u0;  // odd height: average the last row with itself
    }
    for (x = 0; x < halfwidth; ++x) {
      dst_u[x] = (u0[x] + u1[x] + 1) >> 1;  // rounded average of 2 rows
    }
    src_u += src_stride_u * 2;
    dst_u += dst_stride_u;
  }
  for (y = 0; y < height; y += 2) {
    const uint8* v0 = src_v;
    const uint8* v1 = src_v + src_stride_v;
    if ((y + 1) >= height) {
      v1 = v0;
    }
    for (x = 0; x < halfwidth; ++x) {
      dst_v[x] = (v0[x] + v1[x] + 1) >> 1;
    }
    src_v += src_stride_v * 2;
    dst_v += dst_stride_v;
  }
  return 0;
}

static void I420CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
                           uint8* dst, int dst_stride_frame,
                           int width, int height) {
  // Copy plane, two rows at a time.
  for (int y = 0; y < height; y += 2) {
    memcpy(dst, src, width);
    src += src_stride_0;
    dst += dst_stride_frame;
    memcpy(dst, src, width);
    src += src_stride_1;
    dst += dst_stride_frame;
  }
}

// Support converting from FOURCC_M420.
// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
// easy conversion to I420.
// M420 format description:
//   M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
//   Chroma is half width / half height. (420)
//   src_stride_m420 is row planar. Normally this will be the width in pixels.
//   The UV plane is half width, but 2 values, so src_stride_m420 applies to
//   this as well as the two Y planes.
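// As an illustration, the rows of an M420 buffer repeat in groups of three
// (each src_stride_m420 bytes wide):
//   row 0: Y for image row 0
//   row 1: Y for image row 1
//   row 2: interleaved chroma for image rows 0 and 1
// Hence M420ToI420 below advances Y by 1x then 2x the stride (skipping the
// chroma row), and advances the chroma pointer by 3x the stride per group.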
static int X420ToI420(const uint8* src_y,
                      int src_stride_y0, int src_stride_y1,
                      const uint8* src_uv, int src_stride_uv,
                      uint8* dst_y, int dst_stride_y,
                      uint8* dst_u, int dst_stride_u,
                      uint8* dst_v, int dst_stride_v,
                      int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }

  int halfwidth = (width + 1) >> 1;
  void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITUV_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (halfwidth % 16 == 0) &&
      IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
      IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
      IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
    SplitUV = SplitUV_NEON;
  } else
#elif defined(HAS_SPLITUV_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (halfwidth % 16 == 0) &&
      IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
      IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
      IS_ALIGNED(dst_v, 16) && (dst_stride_v % 16 == 0)) {
    SplitUV = SplitUV_SSE2;
  } else
#endif
  {
    SplitUV = SplitUV_C;
  }

  I420CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
                 width, height);

  int halfheight = (height + 1) >> 1;
  for (int y = 0; y < halfheight; ++y) {
    // Copy a row of UV.
    SplitUV(src_uv, dst_u, dst_v, halfwidth);
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
    src_uv += src_stride_uv;
  }
  return 0;
}

// Convert M420 to I420.
int M420ToI420(const uint8* src_m420, int src_stride_m420,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
                    dst_y, dst_stride_y,
                    dst_u, dst_stride_u,
                    dst_v, dst_stride_v,
                    width, height);
}

// Convert NV12 to I420.
int NV12ToI420(const uint8* src_y, int src_stride_y,
               const uint8* src_uv, int src_stride_uv,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  return X420ToI420(src_y, src_stride_y, src_stride_y,
                    src_uv, src_stride_uv,
                    dst_y, dst_stride_y,
                    dst_u, dst_stride_u,
                    dst_v, dst_stride_v,
                    width, height);
}

// Convert NV12 to I420. Deprecated.
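// This overload assumes the Y and UV planes share a single stride
// (src_stride_frame); prefer the overload above with explicit strides.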
int NV12ToI420(const uint8* src_y,
               const uint8* src_uv,
               int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  return X420ToI420(src_y, src_stride_frame, src_stride_frame,
                    src_uv, src_stride_frame,
                    dst_y, dst_stride_y,
                    dst_u, dst_stride_u,
                    dst_v, dst_stride_v,
                    width, height);
}

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITYUY2_SSE2
__declspec(naked)
static void SplitYUY2_SSE2(const uint8* src_yuy2,
                           uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        edx, [esp + 8 + 8]    // dst_y
    mov        esi, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm2, xmm7            // even bytes are Y
    pand       xmm3, xmm7
    packuswb   xmm2, xmm3
    movdqa     [edx], xmm2
    lea        edx, [edx + 16]
    psrlw      xmm0, 8               // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm7            // U
    packuswb   xmm0, xmm0
    movq       qword ptr [esi], xmm0
    lea        esi, [esi + 8]
    psrlw      xmm1, 8               // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edi], xmm1
    lea        edi, [edi + 8]
    sub        ecx, 16
    ja         wloop

    pop        edi
    pop        esi
    ret
  }
}

#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_SPLITYUY2_SSE2
static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
                           uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile(
    "pcmpeqb    %%xmm7,%%xmm7\n"
    "psrlw      $0x8,%%xmm7\n"
    "1:"
    "movdqa     (%0),%%xmm0\n"
    "movdqa     0x10(%0),%%xmm1\n"
    "lea        0x20(%0),%0\n"
    "movdqa     %%xmm0,%%xmm2\n"
    "movdqa     %%xmm1,%%xmm3\n"
    "pand       %%xmm7,%%xmm2\n"
    "pand       %%xmm7,%%xmm3\n"
    "packuswb   %%xmm3,%%xmm2\n"
    "movdqa     %%xmm2,(%1)\n"
    "lea        0x10(%1),%1\n"
    "psrlw      $0x8,%%xmm0\n"
    "psrlw      $0x8,%%xmm1\n"
    "packuswb   %%xmm1,%%xmm0\n"
    "movdqa     %%xmm0,%%xmm1\n"
    "pand       %%xmm7,%%xmm0\n"
    "packuswb   %%xmm0,%%xmm0\n"
    "movq       %%xmm0,(%2)\n"
    "lea        0x8(%2),%2\n"
    "psrlw      $0x8,%%xmm1\n"
    "packuswb   %%xmm1,%%xmm1\n"
    "movq       %%xmm1,(%3)\n"
    "lea        0x8(%3),%3\n"
    "sub        $0x10,%4\n"
    "ja         1b\n"
    : "+r"(src_yuy2),   // %0
      "+r"(dst_y),      // %1
      "+r"(dst_u),      // %2
      "+r"(dst_v),      // %3
      "+r"(pix)         // %4
    :
    : "memory"
  );
}
#endif

static void SplitYUY2_C(const uint8* src_yuy2,
                        uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
  // Copy a row of YUY2.
  for (int x = 0; x < pix; x += 2) {
    dst_y[0] = src_yuy2[0];
    dst_y[1] = src_yuy2[2];
    dst_u[0] = src_yuy2[1];
    dst_v[0] = src_yuy2[3];
    src_yuy2 += 4;
    dst_y += 2;
    dst_u += 1;
    dst_v += 1;
  }
}

// Convert Q420 to I420.
// Format is rows of YY/YUYV.
int Q420ToI420(const uint8* src_y, int src_stride_y,
               const uint8* src_yuy2, int src_stride_yuy2,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    int halfheight = (height + 1) >> 1;
    dst_y = dst_y + (height - 1) * dst_stride_y;
    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
    dst_stride_y = -dst_stride_y;
    dst_stride_u = -dst_stride_u;
    dst_stride_v = -dst_stride_v;
  }
  void (*SplitYUY2)(const uint8* src_yuy2,
                    uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITYUY2_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    SplitYUY2 = SplitYUY2_SSE2;
  } else
#endif
  {
    SplitYUY2 = SplitYUY2_C;
  }
  for (int y = 0; y < height; y += 2) {
    memcpy(dst_y, src_y, width);
    dst_y += dst_stride_y;
    src_y += src_stride_y;

    // Copy a row of YUY2.
    SplitYUY2(src_yuy2, dst_y, dst_u, dst_v, width);
    dst_y += dst_stride_y;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
    src_yuy2 += src_stride_yuy2;
  }
  return 0;
}

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_YUY2TOI420ROW_SSE2
__declspec(naked)
void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
                         uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix
    pcmpeqb    xmm7, xmm7        // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm7        // even bytes are Y
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop
    ret
  }
}

__declspec(naked)
void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
                          uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    psrlw      xmm0, 8               // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm7            // U
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    psrlw      xmm1, 8               // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edi], xmm1
    lea        edi, [edi + 8]
    sub        ecx, 16
    ja         wloop

    pop        edi
    pop        esi
    ret
  }
}

#define HAS_UYVYTOI420ROW_SSE2
__declspec(naked)
void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
                         uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_uyvy
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8           // odd bytes are Y
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop
    ret
  }
}

__declspec(naked)
void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
                          uint8* dst_u, uint8* dst_v, int pix) {
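  // Averages two rows of UYVY with pavgb (vertical 422 -> 420 subsample),
  // masks off the chroma bytes, then deinterleaves them into U and V rows.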
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    pand       xmm0, xmm7            // UYVY -> UVUV
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm7            // U
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    psrlw      xmm1, 8               // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edi], xmm1
    lea        edi, [edi + 8]
    sub        ecx, 16
    ja         wloop

    pop        edi
    pop        esi
    ret
  }
}

#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)

#define HAS_YUY2TOI420ROW_SSE2
static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
                                uint8* dst_y, int pix) {
  asm volatile(
    "pcmpeqb    %%xmm7,%%xmm7\n"
    "psrlw      $0x8,%%xmm7\n"
    "1:"
    "movdqa     (%0),%%xmm0\n"
    "movdqa     0x10(%0),%%xmm1\n"
    "lea        0x20(%0),%0\n"
    "pand       %%xmm7,%%xmm0\n"
    "pand       %%xmm7,%%xmm1\n"
    "packuswb   %%xmm1,%%xmm0\n"
    "movdqa     %%xmm0,(%1)\n"
    "lea        0x10(%1),%1\n"
    "sub        $0x10,%2\n"
    "ja         1b\n"
    : "+r"(src_yuy2),   // %0
      "+r"(dst_y),      // %1
      "+r"(pix)         // %2
    :
    : "memory"
  );
}

static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile(
    "pcmpeqb    %%xmm7,%%xmm7\n"
    "psrlw      $0x8,%%xmm7\n"
    "1:"
    "movdqa     (%0),%%xmm0\n"
    "movdqa     0x10(%0),%%xmm1\n"
    "movdqa     (%0,%4,1),%%xmm2\n"
    "movdqa     0x10(%0,%4,1),%%xmm3\n"
    "lea        0x20(%0),%0\n"
    "pavgb      %%xmm2,%%xmm0\n"
    "pavgb      %%xmm3,%%xmm1\n"
    "psrlw      $0x8,%%xmm0\n"
    "psrlw      $0x8,%%xmm1\n"
    "packuswb   %%xmm1,%%xmm0\n"
    "movdqa     %%xmm0,%%xmm1\n"
    "pand       %%xmm7,%%xmm0\n"
    "packuswb   %%xmm0,%%xmm0\n"
    "movq       %%xmm0,(%1)\n"
    "lea        0x8(%1),%1\n"
    "psrlw      $0x8,%%xmm1\n"
    "packuswb   %%xmm1,%%xmm1\n"
    "movq       %%xmm1,(%2)\n"
    "lea        0x8(%2),%2\n"
    "sub        $0x10,%3\n"
    "ja         1b\n"
    : "+r"(src_yuy2),   // %0
      "+r"(dst_u),      // %1
      "+r"(dst_v),      // %2
      "+r"(pix)         // %3
    : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
    : "memory"
  );
}
#define HAS_UYVYTOI420ROW_SSE2
static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
                                uint8* dst_y, int pix) {
  asm volatile(
    "1:"
    "movdqa     (%0),%%xmm0\n"
    "movdqa     0x10(%0),%%xmm1\n"
    "lea        0x20(%0),%0\n"
    "psrlw      $0x8,%%xmm0\n"
    "psrlw      $0x8,%%xmm1\n"
    "packuswb   %%xmm1,%%xmm0\n"
    "movdqa     %%xmm0,(%1)\n"
    "lea        0x10(%1),%1\n"
    "sub        $0x10,%2\n"
    "ja         1b\n"
    : "+r"(src_uyvy),   // %0
      "+r"(dst_y),      // %1
      "+r"(pix)         // %2
    :
    : "memory"
  );
}

static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                 uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile(
    "pcmpeqb    %%xmm7,%%xmm7\n"
    "psrlw      $0x8,%%xmm7\n"
    "1:"
    "movdqa     (%0),%%xmm0\n"
    "movdqa     0x10(%0),%%xmm1\n"
    "movdqa     (%0,%4,1),%%xmm2\n"
    "movdqa     0x10(%0,%4,1),%%xmm3\n"
    "lea        0x20(%0),%0\n"
    "pavgb      %%xmm2,%%xmm0\n"
    "pavgb      %%xmm3,%%xmm1\n"
    "pand       %%xmm7,%%xmm0\n"
    "pand       %%xmm7,%%xmm1\n"
    "packuswb   %%xmm1,%%xmm0\n"
    "movdqa     %%xmm0,%%xmm1\n"
"pand %%xmm7,%%xmm0\n" 897 "packuswb %%xmm0,%%xmm0\n" 898 "movq %%xmm0,(%1)\n" 899 "lea 0x8(%1),%1\n" 900 "psrlw $0x8,%%xmm1\n" 901 "packuswb %%xmm1,%%xmm1\n" 902 "movq %%xmm1,(%2)\n" 903 "lea 0x8(%2),%2\n" 904 "sub $0x10,%3\n" 905 "ja 1b\n" 906 : "+r"(src_uyvy), // %0 907 "+r"(dst_u), // %1 908 "+r"(dst_y), // %2 909 "+r"(pix) // %3 910 : "r"(static_cast<intptr_t>(stride_uyvy)) // %4 911 : "memory" 912 ); 913 } 914 #endif 915 916 // Filter 2 rows of YUY2 UV's (422) into U and V (420) 917 void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2, 918 uint8* dst_u, uint8* dst_v, int pix) { 919 // Output a row of UV values, filtering 2 rows of YUY2 920 for (int x = 0; x < pix; x += 2) { 921 dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1; 922 dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1; 923 src_yuy2 += 4; 924 dst_u += 1; 925 dst_v += 1; 926 } 927 } 928 929 void YUY2ToI420RowY_C(const uint8* src_yuy2, 930 uint8* dst_y, int pix) { 931 // Copy a row of yuy2 Y values 932 for (int x = 0; x < pix; ++x) { 933 dst_y[0] = src_yuy2[0]; 934 src_yuy2 += 2; 935 dst_y += 1; 936 } 937 } 938 939 void UYVYToI420RowUV_C(const uint8* src_uyvy, int src_stride_uyvy, 940 uint8* dst_u, uint8* dst_v, int pix) { 941 // Copy a row of uyvy UV values 942 for (int x = 0; x < pix; x += 2) { 943 dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1; 944 dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1; 945 src_uyvy += 4; 946 dst_u += 1; 947 dst_v += 1; 948 } 949 } 950 951 void UYVYToI420RowY_C(const uint8* src_uyvy, 952 uint8* dst_y, int pix) { 953 // Copy a row of uyvy Y values 954 for (int x = 0; x < pix; ++x) { 955 dst_y[0] = src_uyvy[1]; 956 src_uyvy += 2; 957 dst_y += 1; 958 } 959 } 960 961 // Convert YUY2 to I420. 962 int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, 963 uint8* dst_y, int dst_stride_y, 964 uint8* dst_u, int dst_stride_u, 965 uint8* dst_v, int dst_stride_v, 966 int width, int height) { 967 // Negative height means invert the image. 968 if (height < 0) { 969 height = -height; 970 src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; 971 src_stride_yuy2 = -src_stride_yuy2; 972 } 973 void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2, 974 uint8* dst_u, uint8* dst_v, int pix); 975 void (*YUY2ToI420RowY)(const uint8* src_yuy2, 976 uint8* dst_y, int pix); 977 #if defined(HAS_YUY2TOI420ROW_SSE2) 978 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && 979 (width % 16 == 0) && 980 IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) && 981 IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) && 982 IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) && 983 IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) { 984 YUY2ToI420RowY = YUY2ToI420RowY_SSE2; 985 YUY2ToI420RowUV = YUY2ToI420RowUV_SSE2; 986 } else 987 #endif 988 { 989 YUY2ToI420RowY = YUY2ToI420RowY_C; 990 YUY2ToI420RowUV = YUY2ToI420RowUV_C; 991 } 992 for (int y = 0; y < height; ++y) { 993 if ((y & 1) == 0) { 994 if (y >= (height - 1) ) { // last chroma on odd height clamp height 995 src_stride_yuy2 = 0; 996 } 997 YUY2ToI420RowUV(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); 998 dst_u += dst_stride_u; 999 dst_v += dst_stride_v; 1000 } 1001 YUY2ToI420RowY(src_yuy2, dst_y, width); 1002 dst_y += dst_stride_y; 1003 src_yuy2 += src_stride_yuy2; 1004 } 1005 return 0; 1006 } 1007 1008 // Convert UYVY to I420. 
int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
    src_stride_uyvy = -src_stride_uyvy;
  }
  void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy,
                          uint8* dst_u, uint8* dst_v, int pix);
  void (*UYVYToI420RowY)(const uint8* src_uyvy,
                         uint8* dst_y, int pix);
#if defined(HAS_UYVYTOI420ROW_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    UYVYToI420RowY = UYVYToI420RowY_SSE2;
    UYVYToI420RowUV = UYVYToI420RowUV_SSE2;
  } else
#endif
  {
    UYVYToI420RowY = UYVYToI420RowY_C;
    UYVYToI420RowUV = UYVYToI420RowUV_C;
  }
  for (int y = 0; y < height; ++y) {
    if ((y & 1) == 0) {
      if (y >= (height - 1)) {  // On odd height, clamp the last chroma pair
        src_stride_uyvy = 0;    // by averaging the final row with itself.
      }
      UYVYToI420RowUV(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
      dst_u += dst_stride_u;
      dst_v += dst_stride_v;
    }
    UYVYToI420RowY(src_uyvy, dst_y, width);
    dst_y += dst_stride_y;
    src_uyvy += src_stride_uyvy;
  }
  return 0;
}

// Convert I420 to ARGB.
// TODO(fbarchard): Add SSE2 version and supply C version for fallback.
int I420ToARGB(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  for (int y = 0; y < height; ++y) {
    FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    if (y & 1) {
      src_u += src_stride_u;  // advance chroma every other row (420)
      src_v += src_stride_v;
    }
  }
  // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
  EMMS();
  return 0;
}

// Convert I420 to BGRA.
int I420ToBGRA(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  for (int y = 0; y < height; ++y) {
    FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    if (y & 1) {
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
  }
  EMMS();
  return 0;
}

// Convert I420 to ABGR.
int I420ToABGR(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  for (int y = 0; y < height; ++y) {
    FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    if (y & 1) {
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
  }
  EMMS();
  return 0;
}

// Convert I422 to ARGB.
int I422ToARGB(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  for (int y = 0; y < height; ++y) {
    FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    src_u += src_stride_u;
    src_v += src_stride_v;
  }
  // MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
  EMMS();
  return 0;
}

// Convert I444 to ARGB.
int I444ToARGB(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  for (int y = 0; y < height; ++y) {
    FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
    src_u += src_stride_u;
    src_v += src_stride_v;
  }
  // MMX used for FastConvertYUV444ToRGB32Row requires an emms instruction.
  EMMS();
  return 0;
}

// Convert I400 to ARGB.
int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                         uint8* dst_argb, int dst_stride_argb,
                         int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  for (int y = 0; y < height; ++y) {
    FastConvertYToRGB32Row(src_y, dst_argb, width);
    dst_argb += dst_stride_argb;
    src_y += src_stride_y;
  }
  // MMX used for FastConvertYToRGB32Row requires an emms instruction.
  EMMS();
  return 0;
}

// TODO(fbarchard): 64 bit version.
#if defined(WIN32) && !defined(COVERAGE_ENABLED)

#define HAS_I400TOARGBROW_SSE2
__declspec(naked)
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_y
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // pix
    pcmpeqb    xmm7, xmm7        // generate mask 0xff000000
    pslld      xmm7, 24

  wloop:
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm7
    por        xmm1, xmm7
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    ja         wloop
    ret
  }
}

#define HAS_ABGRTOARGBROW_SSSE3
__declspec(naked)
static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
                                int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_abgr
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // pix
    movdqa     xmm7, _kShuffleMaskABGRToARGB

  convertloop:
    movdqa     xmm0, [eax]
    lea        eax, [eax + 16]
    pshufb     xmm0, xmm7
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    ja         convertloop
    ret
  }
}

#define HAS_BGRATOARGBROW_SSSE3
__declspec(naked)
static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
                                int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_bgra
    mov        edx, [esp + 8]    // dst_argb
    mov        ecx, [esp + 12]   // pix
    movdqa     xmm7, _kShuffleMaskBGRAToARGB

  convertloop:
    movdqa     xmm0, [eax]
    lea        eax, [eax + 16]
    pshufb     xmm0, xmm7
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    ja         convertloop
    ret
  }
}

#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)

// TODO(yuche): consider moving ARGB related code to a separate file.
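// I400ToARGBRow_SSE2 widens 8 Y bytes to 8 ARGB pixels per iteration:
// punpcklbw doubles each byte to YY, punpcklwd/punpckhwd double the 16-bit
// pairs to YYYY, and por with the 0xff000000 mask makes alpha opaque.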
#define HAS_I400TOARGBROW_SSE2
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile(
    "pcmpeqb    %%xmm7,%%xmm7\n"
    "pslld      $0x18,%%xmm7\n"
    "1:"
    "movq       (%0),%%xmm0\n"
    "lea        0x8(%0),%0\n"
    "punpcklbw  %%xmm0,%%xmm0\n"
    "movdqa     %%xmm0,%%xmm1\n"
    "punpcklwd  %%xmm0,%%xmm0\n"
    "punpckhwd  %%xmm1,%%xmm1\n"
    "por        %%xmm7,%%xmm0\n"
    "por        %%xmm7,%%xmm1\n"
    "movdqa     %%xmm0,(%1)\n"
    "movdqa     %%xmm1,0x10(%1)\n"
    "lea        0x20(%1),%1\n"
    "sub        $0x8,%2\n"
    "ja         1b\n"
    : "+r"(src_y),      // %0
      "+r"(dst_argb),   // %1
      "+r"(pix)         // %2
    :
    : "memory"
  );
}

#define HAS_ABGRTOARGBROW_SSSE3
static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
                                int pix) {
  asm volatile(
    "movdqa     (%3),%%xmm7\n"
    "1:"
    "movdqa     (%0),%%xmm0\n"
    "lea        0x10(%0),%0\n"
    "pshufb     %%xmm7,%%xmm0\n"
    "movdqa     %%xmm0,(%1)\n"
    "lea        0x10(%1),%1\n"
    "sub        $0x4,%2\n"
    "ja         1b\n"
    : "+r"(src_abgr),   // %0
      "+r"(dst_argb),   // %1
      "+r"(pix)         // %2
    : "r"(kShuffleMaskABGRToARGB)  // %3
    : "memory"
  );
}

#define HAS_BGRATOARGBROW_SSSE3
static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
                                int pix) {
  asm volatile(
    "movdqa     (%3),%%xmm7\n"
    "1:"
    "movdqa     (%0),%%xmm0\n"
    "lea        0x10(%0),%0\n"
    "pshufb     %%xmm7,%%xmm0\n"
    "movdqa     %%xmm0,(%1)\n"
    "lea        0x10(%1),%1\n"
    "sub        $0x4,%2\n"
    "ja         1b\n"
    : "+r"(src_bgra),   // %0
      "+r"(dst_argb),   // %1
      "+r"(pix)         // %2
    : "r"(kShuffleMaskBGRAToARGB)  // %3
    : "memory"
  );
}

#endif

static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
  // Replicate Y into B, G and R, and set alpha to opaque.
  for (int x = 0; x < pix; ++x) {
    uint8 y = src_y[0];
    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
    dst_argb[3] = 255u;
    dst_argb += 4;
    ++src_y;
  }
}

// Convert I400 to ARGB.
int I400ToARGB(const uint8* src_y, int src_stride_y,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_y = src_y + (height - 1) * src_stride_y;
    src_stride_y = -src_stride_y;
  }
  void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix);
#if defined(HAS_I400TOARGBROW_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    I400ToARGBRow = I400ToARGBRow_SSE2;
  } else
#endif
  {
    I400ToARGBRow = I400ToARGBRow_C;
  }

  for (int y = 0; y < height; ++y) {
    I400ToARGBRow(src_y, dst_argb, width);
    src_y += src_stride_y;
    dst_argb += dst_stride_argb;
  }
  return 0;
}

static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    // To support in-place conversion.
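    // All four source bytes are read before any destination byte is
    // written, so src_abgr == dst_argb is safe.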
    uint8 r = src_abgr[0];
    uint8 g = src_abgr[1];
    uint8 b = src_abgr[2];
    uint8 a = src_abgr[3];
    dst_argb[0] = b;
    dst_argb[1] = g;
    dst_argb[2] = r;
    dst_argb[3] = a;
    dst_argb += 4;
    src_abgr += 4;
  }
}

// Convert ABGR to ARGB.
int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
    src_stride_abgr = -src_stride_abgr;
  }
  void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
#if defined(HAS_ABGRTOARGBROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 4 == 0) &&
      IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    ABGRToARGBRow = ABGRToARGBRow_SSSE3;
  } else
#endif
  {
    ABGRToARGBRow = ABGRToARGBRow_C;
  }

  for (int y = 0; y < height; ++y) {
    ABGRToARGBRow(src_abgr, dst_argb, width);
    src_abgr += src_stride_abgr;
    dst_argb += dst_stride_argb;
  }
  return 0;
}

static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    // To support in-place conversion.
    uint8 a = src_bgra[0];
    uint8 r = src_bgra[1];
    uint8 g = src_bgra[2];
    uint8 b = src_bgra[3];
    dst_argb[0] = b;
    dst_argb[1] = g;
    dst_argb[2] = r;
    dst_argb[3] = a;
    dst_argb += 4;
    src_bgra += 4;
  }
}

// Convert BGRA to ARGB.
int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
    src_stride_bgra = -src_stride_bgra;
  }
  void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix);
#if defined(HAS_BGRATOARGBROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 4 == 0) &&
      IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    BGRAToARGBRow = BGRAToARGBRow_SSSE3;
  } else
#endif
  {
    BGRAToARGBRow = BGRAToARGBRow_C;
  }

  for (int y = 0; y < height; ++y) {
    BGRAToARGBRow(src_bgra, dst_argb, width);
    src_bgra += src_stride_bgra;
    dst_argb += dst_stride_argb;
  }
  return 0;
}

// Convert ARGB to I400.
int ARGBToI400(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_argb = src_argb + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 4 == 0) &&
      IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = ARGBToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = ARGBToYRow_C;
  }

  for (int y = 0; y < height; ++y) {
    ARGBToYRow(src_argb, dst_y, width);
    src_argb += src_stride_argb;
    dst_y += dst_stride_y;
  }
  return 0;
}

// Convert RAW to ARGB.
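// Note: RAW and BG24 below are both 24 bits per pixel and differ only in
// byte order; the row converters (declared in row.h) perform the swizzle
// to 32-bit ARGB.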
int RAWToARGB(const uint8* src_raw, int src_stride_raw,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height) {
  if (height < 0) {
    height = -height;
    src_raw = src_raw + (height - 1) * src_stride_raw;
    src_stride_raw = -src_stride_raw;
  }
  void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
#if defined(HAS_RAWTOARGBROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    RAWToARGBRow = RAWToARGBRow_SSSE3;
  } else
#endif
  {
    RAWToARGBRow = RAWToARGBRow_C;
  }

  for (int y = 0; y < height; ++y) {
    RAWToARGBRow(src_raw, dst_argb, width);
    src_raw += src_stride_raw;
    dst_argb += dst_stride_argb;
  }
  return 0;
}

// Convert BG24 to ARGB.
int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
    src_stride_bg24 = -src_stride_bg24;
  }
  void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    BG24ToARGBRow = BG24ToARGBRow_SSSE3;
  } else
#endif
  {
    BG24ToARGBRow = BG24ToARGBRow_C;
  }

  for (int y = 0; y < height; ++y) {
    BG24ToARGBRow(src_bg24, dst_argb, width);
    src_bg24 += src_stride_bg24;
    dst_argb += dst_stride_argb;
  }
  return 0;
}

}  // namespace libyuv
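// Example usage (illustrative only; assumes tightly packed planes, so the
// Y/UV strides equal width and the I420 chroma strides are half width):
//   libyuv::NV12ToI420(src_y, width,
//                      src_uv, width,
//                      dst_y, width,
//                      dst_u, (width + 1) / 2,
//                      dst_v, (width + 1) / 2,
//                      width, height);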