/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"

#include <assert.h>
#include <string.h>
#include <stdlib.h>  // For getenv()

#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"  // For CopyARGB
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Bilinear SSE2 is disabled.
#define SSE2_DISABLED 1

// ARGB scaling uses bilinear or point, but not box filter.
/**
 * SSE2 downscalers with bilinear interpolation.
 */

#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

#define HAS_SCALEARGBROWDOWN2_SSE2
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6).
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
                                   ptrdiff_t /* src_stride */,
                                   uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    align      16
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    shufps     xmm0, xmm1, 0x88
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width

    align      16
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}
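// Illustrative only, not part of the library: a scalar sketch of what the
// 8x2 -> 4x1 blend above computes per byte. pavgb is a rounded average,
// (a + b + 1) >> 1, so averaging the two rows first and then the even/odd
// columns approximates the exact box filter (a + b + c + d + 2) >> 2 to
// within one rounding step.
#if 0
static inline uint8 Blend2x2Sketch(uint8 a, uint8 b, uint8 c, uint8 d) {
  int left = (a + c + 1) >> 1;        // pavgb of row0/row1, left column
  int right = (b + d + 1) >> 1;       // pavgb of row0/row1, right column
  return static_cast<uint8>((left + right + 1) >> 1);  // pavgb of columns
}
#endif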
#define HAS_SCALEARGBROWDOWNEVEN_SSE2
// Reads 4 pixels at a time.
// Alignment requirement: dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_ptr
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

    align      16
  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax, [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
                                         ptrdiff_t src_stride,
                                         int src_stepx,
                                         uint8* dst_ptr, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_ptr
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_ptr
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

    align      16
  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax, [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi, [esi + ebx * 4]
    pavgb      xmm0, xmm2             // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0             // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88       // even pixels
    shufps     xmm2, xmm1, 0xdd       // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
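// Illustrative only, not part of the library: the addressing scheme above in
// scalar form. ARGB pixels are 4 bytes, so the byte step is src_stepx * 4;
// keeping 3 * step in a spare register (edi) lets each loop iteration gather
// the pixels at offsets 0, step, 2 * step and 3 * step. dst_width is assumed
// to be a multiple of 4, as in the SSE2 loop.
#if 0
static void ScaleARGBRowDownEvenSketch(const uint8* src_ptr, int src_stepx,
                                       uint8* dst_ptr, int dst_width) {
  intptr_t step = src_stepx * 4;  // bytes between selected source pixels
  for (int x = 0; x < dst_width; x += 4) {
    memcpy(dst_ptr + 0, src_ptr, 4);
    memcpy(dst_ptr + 4, src_ptr + step, 4);
    memcpy(dst_ptr + 8, src_ptr + step * 2, 4);
    memcpy(dst_ptr + 12, src_ptr + step * 3, 4);
    src_ptr += step * 4;
    dst_ptr += 16;
  }
}
#endif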
// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
#ifndef SSE2_DISABLED
#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
__declspec(naked) __declspec(align(16))
void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    cmp        eax, 0
    je         xloop1
    cmp        eax, 128
    je         xloop2

    movd       xmm5, eax            // xmm5 = y fraction
    punpcklbw  xmm5, xmm5
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0
    pxor       xmm4, xmm4

    // f * row1 + (1 - f) * row0
    //   == row0 + f * (row1 - row0)
    align      16
  xloop:
    movdqa     xmm0, [esi]          // row0
    movdqa     xmm2, [esi + edx]    // row1
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    psubw      xmm2, xmm0           // row1 - row0
    psubw      xmm3, xmm1
    pmulhw     xmm2, xmm5           // scale diff
    pmulhw     xmm3, xmm5
    paddw      xmm0, xmm2           // sum rows
    paddw      xmm1, xmm3
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop

    shufps     xmm0, xmm0, 0xff
    movdqa     [esi + edi], xmm0    // duplicate last pixel for filtering
    pop        edi
    pop        esi
    ret

    align      16
  xloop1:
    movdqa     xmm0, [esi]
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop1

    shufps     xmm0, xmm0, 0xff
    movdqa     [esi + edi], xmm0
    pop        edi
    pop        esi
    ret

    align      16
  xloop2:
    movdqa     xmm0, [esi]
    pavgb      xmm0, [esi + edx]
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop2

    shufps     xmm0, xmm0, 0xff
    movdqa     [esi + edi], xmm0
    pop        edi
    pop        esi
    ret
  }
}
#endif  // SSE2_DISABLED
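// Illustrative only, not part of the library: a scalar model of the pmulhw
// blend above. punpcklbw replicates the byte fraction f (0..255) into both
// halves of each 16-bit lane, giving the multiplier f * 257 ~= f << 8, so
// pmulhw((row1 - row0), f * 257) >> 16 approximates (row1 - row0) * f / 256.
// Note pmulhw is a signed multiply: for f >= 128 the replicated multiplier
// wraps negative, so this model only holds for f < 128 (f == 128 takes the
// pavgb special case); that limitation may be why this SSE2 path is disabled.
#if 0
static inline uint8 BlendRowsSSE2Sketch(uint8 row0, uint8 row1, int f) {
  int diff = row1 - row0;                    // psubw
  int scaled = (diff * (f * 257)) >> 16;     // pmulhw, valid for f < 128
  return static_cast<uint8>(row0 + scaled);  // paddw + packuswb
}
#endif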
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
#define HAS_SCALEARGBFILTERROWS_SSSE3
__declspec(naked) __declspec(align(16))
void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    shr        eax, 1
    cmp        eax, 0
    je         xloop1
    cmp        eax, 64
    je         xloop2
    movd       xmm0, eax            // high fraction 0..127
    neg        eax
    add        eax, 128
    movd       xmm5, eax            // low fraction 128..1
    punpcklbw  xmm5, xmm0
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0

    align      16
  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    pmaddubsw  xmm0, xmm5
    pmaddubsw  xmm1, xmm5
    psrlw      xmm0, 7
    psrlw      xmm1, 7
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop

    shufps     xmm0, xmm0, 0xff
    movdqa     [esi + edi], xmm0    // duplicate last pixel for filtering
    pop        edi
    pop        esi
    ret

    align      16
  xloop1:
    movdqa     xmm0, [esi]
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop1

    shufps     xmm0, xmm0, 0xff
    movdqa     [esi + edi], xmm0
    pop        edi
    pop        esi
    ret

    align      16
  xloop2:
    movdqa     xmm0, [esi]
    pavgb      xmm0, [esi + edx]
    sub        ecx, 4
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop2

    shufps     xmm0, xmm0, 0xff
    movdqa     [esi + edi], xmm0
    pop        edi
    pop        esi
    ret
  }
}
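// Illustrative only, not part of the library: a scalar model of the SSSE3
// blend above. The fraction is halved to 0..127 so that both weights fit in
// the signed-byte operand of pmaddubsw; the interleaved row0/row1 bytes are
// multiplied pairwise by (128 - f, f) and the >> 7 restores the byte range.
#if 0
static inline uint8 BlendRowsSSSE3Sketch(uint8 row0, uint8 row1, int f7) {
  // f7 = source_y_fraction >> 1, in 0..127 (0 and 64 take special cases).
  return static_cast<uint8>((row0 * (128 - f7) + row1 * f7) >> 7);
}
#endif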
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
#define HAS_SCALEARGBROWDOWN2_SSE2
static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr,
                                   ptrdiff_t /* src_stride */,
                                   uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align  4                               \n"
    "1:                                        \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "lea       0x20(%0),%0                     \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align  4                               \n"
    "1:                                        \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "movdqa    (%0,%3,1),%%xmm2                \n"
    "movdqa    0x10(%0,%3,1),%%xmm3            \n"
    "lea       0x20(%0),%0                     \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1)                     \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"(static_cast<intptr_t>(src_stride))  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

#define HAS_SCALEARGBROWDOWNEVEN_SSE2
// Reads 4 pixels at a time.
// Alignment requirement: dst_ptr 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_ptr, int dst_width) {
  intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
  intptr_t src_stepx_x12 = 0;
  asm volatile (
    "lea       0x0(,%1,4),%1                   \n"
    "lea       (%1,%1,2),%4                    \n"
    ".p2align  4                               \n"
    "1:                                        \n"
    "movd      (%0),%%xmm0                     \n"
    "movd      (%0,%1,1),%%xmm1                \n"
    "punpckldq %%xmm1,%%xmm0                   \n"
    "movd      (%0,%1,2),%%xmm2                \n"
    "movd      (%0,%4,1),%%xmm3                \n"
    "lea       (%0,%1,4),%0                    \n"
    "punpckldq %%xmm3,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm0                  \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),       // %0
    "+r"(src_stepx_x4),  // %1
    "+r"(dst_ptr),       // %2
    "+r"(dst_width),     // %3
    "+r"(src_stepx_x12)  // %4
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_ptr 16 byte aligned.
static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr,
                                         ptrdiff_t src_stride, int src_stepx,
                                         uint8* dst_ptr, int dst_width) {
  intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
  intptr_t src_stepx_x12 = 0;
  intptr_t row1 = static_cast<intptr_t>(src_stride);
  asm volatile (
    "lea       0x0(,%1,4),%1                   \n"
    "lea       (%1,%1,2),%4                    \n"
    "lea       (%0,%5,1),%5                    \n"
    ".p2align  4                               \n"
    "1:                                        \n"
    "movq      (%0),%%xmm0                     \n"
    "movhps    (%0,%1,1),%%xmm0                \n"
    "movq      (%0,%1,2),%%xmm1                \n"
    "movhps    (%0,%4,1),%%xmm1                \n"
    "lea       (%0,%1,4),%0                    \n"
    "movq      (%5),%%xmm2                     \n"
    "movhps    (%5,%1,1),%%xmm2                \n"
    "movq      (%5,%1,2),%%xmm3                \n"
    "movhps    (%5,%4,1),%%xmm3                \n"
    "lea       (%5,%1,4),%5                    \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%2)                     \n"
    "lea       0x10(%2),%2                     \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),        // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_ptr),        // %2
    "+rm"(dst_width),     // %3
    "+r"(src_stepx_x12),  // %4
    "+r"(row1)            // %5
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
#ifndef SSE2_DISABLED
// Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
void ScaleARGBFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"
    "cmp       $0x0,%3                         \n"
    "je        2f                              \n"
    "cmp       $0x80,%3                        \n"
    "je        3f                              \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm5,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pxor      %%xmm4,%%xmm4                   \n"
    ".p2align  4                               \n"
    "1:                                        \n"
    "movdqa    (%1),%%xmm0                     \n"
    "movdqa    (%1,%4,1),%%xmm2                \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm2                   \n"
    "psubw     %%xmm1,%%xmm3                   \n"
    "pmulhw    %%xmm5,%%xmm2                   \n"
    "pmulhw    %%xmm5,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
    "jmp       4f                              \n"
    ".p2align  4                               \n"
    "2:                                        \n"
    "movdqa    (%1),%%xmm0                     \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        2b                              \n"
    "jmp       4f                              \n"
    ".p2align  4                               \n"
    "3:                                        \n"
    "movdqa    (%1),%%xmm0                     \n"
    "pavgb     (%1,%4,1),%%xmm0                \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        3b                              \n"
    ".p2align  4                               \n"
    "4:                                        \n"
    "shufps    $0xff,%%xmm0,%%xmm0             \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
  : "+r"(dst_ptr),            // %0
    "+r"(src_ptr),            // %1
    "+r"(dst_width),          // %2
    "+r"(source_y_fraction)   // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // SSE2_DISABLED
// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
#define HAS_SCALEARGBFILTERROWS_SSSE3
void ScaleARGBFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"
    "shr       %3                              \n"
    "cmp       $0x0,%3                         \n"
    "je        2f                              \n"
    "cmp       $0x40,%3                        \n"
    "je        3f                              \n"
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    ".p2align  4                               \n"
    "1:                                        \n"
    "movdqa    (%1),%%xmm0                     \n"
    "movdqa    (%1,%4,1),%%xmm2                \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "pmaddubsw %%xmm5,%%xmm0                   \n"
    "pmaddubsw %%xmm5,%%xmm1                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        1b                              \n"
    "jmp       4f                              \n"
    ".p2align  4                               \n"
    "2:                                        \n"
    "movdqa    (%1),%%xmm0                     \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        2b                              \n"
    "jmp       4f                              \n"
    ".p2align  4                               \n"
    "3:                                        \n"
    "movdqa    (%1),%%xmm0                     \n"
    "pavgb     (%1,%4,1),%%xmm0                \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
    "lea       0x10(%1),%1                     \n"
    "jg        3b                              \n"
    ".p2align  4                               \n"
    "4:                                        \n"
    "shufps    $0xff,%%xmm0,%%xmm0             \n"
    "movdqa    %%xmm0,(%1,%0,1)                \n"
  : "+r"(dst_ptr),            // %0
    "+r"(src_ptr),            // %1
    "+r"(dst_width),          // %2
    "+r"(source_y_fraction)   // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
}
#endif  // defined(__x86_64__) || defined(__i386__)

static void ScaleARGBRowDown2_C(const uint8* src_ptr,
                                ptrdiff_t /* src_stride */,
                                uint8* dst_ptr, int dst_width) {
  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);

  for (int x = 0; x < dst_width - 1; x += 2) {
    dst[0] = src[0];
    dst[1] = src[2];
    src += 4;
    dst += 2;
  }
  if (dst_width & 1) {
    dst[0] = src[0];
  }
}

static void ScaleARGBRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
                  src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
    dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
                  src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
    dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
                  src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
    dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
                  src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
    src_ptr += 8;
    dst_ptr += 4;
  }
}

void ScaleARGBRowDownEven_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                            int src_stepx,
                            uint8* dst_ptr, int dst_width) {
  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);

  for (int x = 0; x < dst_width - 1; x += 2) {
    dst[0] = src[0];
    dst[1] = src[src_stepx];
    src += src_stepx * 2;
    dst += 2;
  }
  if (dst_width & 1) {
    dst[0] = src[0];
  }
}
static void ScaleARGBRowDownEvenInt_C(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      int src_stepx,
                                      uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
                  src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
    dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
                  src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
    dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
                  src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
    dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
                  src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
    src_ptr += src_stepx * 4;
    dst_ptr += 4;
  }
}

// (1-f)a + fb can be replaced with a + f(b-a).

#define BLENDER1(a, b, f) (static_cast<int>(a) + \
    ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))

#define BLENDERC(a, b, f, s) static_cast<uint32>( \
    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)

#define BLENDER(a, b, f) \
    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)

static void ScaleARGBFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
                                  int dst_width, int x, int dx) {
  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
  for (int j = 0; j < dst_width - 1; j += 2) {
    int xi = x >> 16;
    uint32 a = src[xi];
    uint32 b = src[xi + 1];
    dst[0] = BLENDER(a, b, x & 0xffff);
    x += dx;
    xi = x >> 16;
    a = src[xi];
    b = src[xi + 1];
    dst[1] = BLENDER(a, b, x & 0xffff);
    x += dx;
    dst += 2;
  }
  if (dst_width & 1) {
    int xi = x >> 16;
    uint32 a = src[xi];
    uint32 b = src[xi + 1];
    dst[0] = BLENDER(a, b, x & 0xffff);
  }
}
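// Illustrative only, not part of the library: a worked example of the BLENDER
// macros above. With f = 0x8000 (one half in 16.16 fixed point) and channel
// values a = 100, b = 200:
//   BLENDER1(100, 200, 0x8000) = 100 + ((0x8000 * 100) >> 16) = 150
// BLENDERC applies this per 8-bit channel and BLENDER reassembles the pixel.
#if 0
static void BlenderExample() {
  uint32 a = 0xff204060;  // ARGB pixel
  uint32 b = 0xff6080a0;
  uint32 mid = BLENDER(a, b, 0x8000);  // each channel midway: 0xff406080
}
#endif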
static const int kMaxInputWidth = 2560;

// C version 2x2 -> 2x1.
void ScaleARGBFilterRows_C(uint8* dst_ptr, const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           int dst_width, int source_y_fraction) {
  assert(dst_width > 0);
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  uint8* end = dst_ptr + (dst_width << 2);
  do {
    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
    src_ptr += 8;
    src_ptr1 += 8;
    dst_ptr += 8;
  } while (dst_ptr < end);
  // Duplicate the last pixel (4 bytes) for filtering.
  dst_ptr[0] = dst_ptr[-4];
  dst_ptr[1] = dst_ptr[-3];
  dst_ptr[2] = dst_ptr[-2];
  dst_ptr[3] = dst_ptr[-1];
}

/**
 * ScaleARGB ARGB, 1/2
 *
 * This is an optimized version for scaling down an ARGB image to
 * 1/2 of its original size.
 */
static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
                           const uint8* src_ptr, uint8* dst_ptr,
                           FilterMode filtering) {
  void (*ScaleARGBRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) =
      filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(dst_width, 4) &&
      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
        ScaleARGBRowDown2_SSE2;
  }
#endif

  // TODO(fbarchard): Loop through source height to allow odd height.
  for (int y = 0; y < dst_height; ++y) {
    ScaleARGBRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
    src_ptr += (src_stride << 1);
    dst_ptr += dst_stride;
  }
}

/**
 * ScaleARGB ARGB Even
 *
 * This is an optimized version for scaling down an ARGB image to an
 * even multiple of its original size.
 */
static void ScaleARGBDownEven(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr,
                              FilterMode filtering) {
  assert(IS_ALIGNED(src_width, 2));
  assert(IS_ALIGNED(src_height, 2));
  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, ptrdiff_t src_stride,
                               int src_step, uint8* dst_ptr, int dst_width) =
      filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(dst_width, 4) &&
      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
        ScaleARGBRowDownEven_SSE2;
  }
#endif
  int src_step = src_width / dst_width;
  // Adjust to point to center of box.
  int row_step = src_height / dst_height;
  int row_stride = row_step * src_stride;
  src_ptr += ((row_step >> 1) - 1) * src_stride + ((src_step >> 1) - 1) * 4;
  for (int y = 0; y < dst_height; ++y) {
    ScaleARGBRowDownEven(src_ptr, src_stride, src_step, dst_ptr, dst_width);
    src_ptr += row_stride;
    dst_ptr += dst_stride;
  }
}
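// Illustrative only, not part of the library: the "center of box" adjustment
// above, worked for a 4x horizontal and vertical downscale. src_step and
// row_step are both 4, so the source pointer advances by one row and one
// pixel (4 bytes); the 2x2 average read by the Int row function then covers
// rows 1..2 and columns 1..2 of each 4x4 box, i.e. its center.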
/**
 * ScaleARGB ARGB to/from any dimensions, with bilinear
 * interpolation.
 */
static void ScaleARGBBilinear(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr) {
  assert(dst_width > 0);
  assert(dst_height > 0);
  assert(src_width <= kMaxInputWidth);
  SIMD_ALIGNED(uint8 row[kMaxInputWidth * 4 + 16]);
  void (*ScaleARGBFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              int dst_width, int source_y_fraction) =
      ScaleARGBFilterRows_C;
#if defined(HAS_SCALEARGBFILTERROWS_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
    ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2;
  }
#endif
#if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
    ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
  }
#endif
  int dx = (src_width << 16) / dst_width;
  int dy = (src_height << 16) / dst_height;
  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
  int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
  for (int j = 0; j < dst_height; ++j) {
    int yi = y >> 16;
    int yf = (y >> 8) & 255;
    const uint8* src = src_ptr + yi * src_stride;
    ScaleARGBFilterRows(row, src, src_stride, src_width, yf);
    ScaleARGBFilterCols_C(dst_ptr, row, dst_width, x, dx);
    dst_ptr += dst_stride;
    y += dy;
    if (y > maxy) {
      y = maxy;
    }
  }
}

// Scales a single row of pixels using point sampling.
// Code is adapted from libyuv bilinear yuv scaling, but with bilinear
// interpolation off, and argb pixels instead of yuv.
static void ScaleARGBCols(uint8* dst_ptr, const uint8* src_ptr,
                          int dst_width, int x, int dx) {
  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
  for (int j = 0; j < dst_width - 1; j += 2) {
    dst[0] = src[x >> 16];
    x += dx;
    dst[1] = src[x >> 16];
    x += dx;
    dst += 2;
  }
  if (dst_width & 1) {
    dst[0] = src[x >> 16];
  }
}

/**
 * ScaleARGB ARGB to/from any dimensions, without interpolation.
 * Fixed point math is used for performance: the upper 16 bits of
 * x and dx are the integer part of the source position and the
 * lower 16 bits are the fixed decimal part.
 */
static void ScaleARGBSimple(int src_width, int src_height,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
                            const uint8* src_ptr, uint8* dst_ptr) {
  int dx = (src_width << 16) / dst_width;
  int dy = (src_height << 16) / dst_height;
  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
  for (int i = 0; i < dst_height; ++i) {
    ScaleARGBCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
    dst_ptr += dst_stride;
    y += dy;
  }
}
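// Illustrative only, not part of the library: the 16.16 stepping above,
// worked for a 640 -> 320 downscale.
//   dx = (640 << 16) / 320 = 0x20000   (advance 2.0 source pixels per output)
// Since dx >= 65536, the start position is
//   x = (dx >> 1) - 32768 = 0x8000     (0.5, centering within each 2x1 box)
// so x >> 16 visits source columns 0, 2, 4, ... as x += dx.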
/**
 * ScaleARGB ARGB to/from any dimensions.
 */
static void ScaleARGBAnySize(int src_width, int src_height,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
                             const uint8* src_ptr, uint8* dst_ptr,
                             FilterMode filtering) {
  if (!filtering || (src_width > kMaxInputWidth)) {
    ScaleARGBSimple(src_width, src_height, dst_width, dst_height,
                    src_stride, dst_stride, src_ptr, dst_ptr);
  } else {
    ScaleARGBBilinear(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src_ptr, dst_ptr);
  }
}

// Scale an ARGB image.
//
// This function in turn calls a scaling function
// suitable for handling the desired resolutions.
static void ScaleARGB(const uint8* src, int src_stride,
                      int src_width, int src_height,
                      uint8* dst, int dst_stride,
                      int dst_width, int dst_height,
                      FilterMode filtering) {
#ifdef CPU_X86
  // Environment variable overrides for testing.
  char* filter_override = getenv("LIBYUV_FILTER");
  if (filter_override) {
    filtering = (FilterMode)atoi(filter_override);  // NOLINT
  }
#endif
  if (dst_width == src_width && dst_height == src_height) {
    // Straight copy.
    ARGBCopy(src, src_stride, dst, dst_stride, dst_width, dst_height);
    return;
  }
  if (2 * dst_width == src_width && 2 * dst_height == src_height) {
    // Optimized 1/2.
    ScaleARGBDown2(src_width, src_height, dst_width, dst_height,
                   src_stride, dst_stride, src, dst, filtering);
    return;
  }
  int scale_down_x = src_width / dst_width;
  int scale_down_y = src_height / dst_height;
  if (dst_width * scale_down_x == src_width &&
      dst_height * scale_down_y == src_height) {
    if (!(scale_down_x & 1) && !(scale_down_y & 1)) {
      // Optimized even scale down, e.g. 4x, 6x, 8x, 10x.
      ScaleARGBDownEven(src_width, src_height, dst_width, dst_height,
                        src_stride, dst_stride, src, dst, filtering);
      return;
    }
    if ((scale_down_x & 1) && (scale_down_y & 1)) {
      filtering = kFilterNone;
    }
  }
  // Arbitrary scale up and/or down.
  ScaleARGBAnySize(src_width, src_height, dst_width, dst_height,
                   src_stride, dst_stride, src, dst, filtering);
}

// Scale an ARGB image.
LIBYUV_API
int ARGBScale(const uint8* src_argb, int src_stride_argb,
              int src_width, int src_height,
              uint8* dst_argb, int dst_stride_argb,
              int dst_width, int dst_height,
              FilterMode filtering) {
  if (!src_argb || src_width <= 0 || src_height == 0 ||
      !dst_argb || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
    src_argb = src_argb + (src_height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
            dst_argb, dst_stride_argb, dst_width, dst_height,
            filtering);
  return 0;
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
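// Illustrative usage, not part of the library; the function name, dimensions
// and buffers are hypothetical. Strides are in bytes, i.e. width * 4 for
// packed ARGB.
#if 0
void HalveArgbExample(const uint8* src, uint8* dst) {
  // Scale a 640x360 ARGB image down to 320x180 with bilinear filtering;
  // this exact 1/2 ratio takes the optimized ScaleARGBDown2 path.
  libyuv::ARGBScale(src, 640 * 4, 640, 360,
                    dst, 320 * 4, 320, 180,
                    libyuv::kFilterBilinear);
}
#endif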