/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"

#include <assert.h>
#include <string.h>
#include <stdlib.h>  // For getenv()

#include "libyuv/cpu_id.h"
#include "libyuv/planar_functions.h"  // For CopyPlane
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Bilinear SSE2 is disabled.
#define SSE2_DISABLED 1

// Note: Some SSE2 reference manuals
// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf

// Set the following flag to true to revert to only
// using the reference implementation ScalePlaneBox(), and
// NOT the optimized versions. Useful for debugging and
// when comparing the quality of the resulting YUV planes
// as produced by the optimized and non-optimized versions.
static bool use_reference_impl_ = false;

LIBYUV_API
void SetUseReferenceImpl(bool use) {
  use_reference_impl_ = use;
}
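
// Example (illustrative): a caller that wants bit-exact output from the
// reference code path, e.g. when validating the SIMD row functions, can
// call this before any of the scale entry points declared in
// "libyuv/scale.h":
//
//   libyuv::SetUseReferenceImpl(true);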

// ScaleRowDown2Int also used by planar functions

/**
 * NEON downscalers with interpolation.
 *
 * Provided by Fritz Koenig
 *
 */

#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON
// Note - not static due to reuse in convert for 444 to 420.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                        uint8* dst, int dst_width);

void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width);

#define HAS_SCALEROWDOWN4_NEON
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                        uint8* dst_ptr, int dst_width);
void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width);

#define HAS_SCALEROWDOWN34_NEON
// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into a different register.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t /* src_stride */,
                         uint8* dst_ptr, int dst_width);
void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);
void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);

#define HAS_SCALEROWDOWN38_NEON
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t /* src_stride */,
                         uint8* dst_ptr, int dst_width);
// 32x3 -> 12x1
void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);
// 32x2 -> 12x1
void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width);
// 16x2 -> 16x1
#define HAS_SCALEFILTERROWS_NEON
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction);

/**
 * SSE2 downscalers with interpolation.
 *
 * Provided by Frank Barchard (fbarchard (at) google.com)
 *
 */

// Constants for SSSE3 code
#elif !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))

// GCC 4.2 on OSX has link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif

// Offsets for source bytes 0 to 9
CONST uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
CONST uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
CONST uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
CONST uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
CONST uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
CONST uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
CONST uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
CONST uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
CONST uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant, added before the final shift right by 2.
CONST vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

CONST uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

CONST uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
CONST uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
CONST uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
CONST uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
CONST uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
CONST uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
CONST uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
CONST uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif

#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
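// How the even-pixel selection below works: pcmpeqb/psrlw materializes the
// 0x00ff00ff mask in a register, pand keeps only the even-indexed byte of
// each 16-bit lane, and packuswb packs the two halves back to bytes, so
// every other source pixel survives -- the same selection the C version,
// ScaleRowDown2_C, makes with dst[0] = src_ptr[0] and dst[1] = src_ptr[2].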
201 __declspec(naked) __declspec(align(16)) 202 static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 203 uint8* dst_ptr, int dst_width) { 204 __asm { 205 mov eax, [esp + 4] // src_ptr 206 // src_stride ignored 207 mov edx, [esp + 12] // dst_ptr 208 mov ecx, [esp + 16] // dst_width 209 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 210 psrlw xmm5, 8 211 212 align 16 213 wloop: 214 movdqa xmm0, [eax] 215 movdqa xmm1, [eax + 16] 216 lea eax, [eax + 32] 217 pand xmm0, xmm5 218 pand xmm1, xmm5 219 packuswb xmm0, xmm1 220 sub ecx, 16 221 movdqa [edx], xmm0 222 lea edx, [edx + 16] 223 jg wloop 224 225 ret 226 } 227 } 228 // Blends 32x2 rectangle to 16x1. 229 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 230 __declspec(naked) __declspec(align(16)) 231 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 232 uint8* dst_ptr, int dst_width) { 233 __asm { 234 push esi 235 mov eax, [esp + 4 + 4] // src_ptr 236 mov esi, [esp + 4 + 8] // src_stride 237 mov edx, [esp + 4 + 12] // dst_ptr 238 mov ecx, [esp + 4 + 16] // dst_width 239 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 240 psrlw xmm5, 8 241 242 align 16 243 wloop: 244 movdqa xmm0, [eax] 245 movdqa xmm1, [eax + 16] 246 movdqa xmm2, [eax + esi] 247 movdqa xmm3, [eax + esi + 16] 248 lea eax, [eax + 32] 249 pavgb xmm0, xmm2 // average rows 250 pavgb xmm1, xmm3 251 252 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 253 psrlw xmm0, 8 254 movdqa xmm3, xmm1 255 psrlw xmm1, 8 256 pand xmm2, xmm5 257 pand xmm3, xmm5 258 pavgw xmm0, xmm2 259 pavgw xmm1, xmm3 260 packuswb xmm0, xmm1 261 262 sub ecx, 16 263 movdqa [edx], xmm0 264 lea edx, [edx + 16] 265 jg wloop 266 267 pop esi 268 ret 269 } 270 } 271 272 // Reads 32 pixels, throws half away and writes 16 pixels. 273 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 274 __declspec(naked) __declspec(align(16)) 275 static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, 276 ptrdiff_t src_stride, 277 uint8* dst_ptr, int dst_width) { 278 __asm { 279 mov eax, [esp + 4] // src_ptr 280 // src_stride ignored 281 mov edx, [esp + 12] // dst_ptr 282 mov ecx, [esp + 16] // dst_width 283 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 284 psrlw xmm5, 8 285 286 align 16 287 wloop: 288 movdqu xmm0, [eax] 289 movdqu xmm1, [eax + 16] 290 lea eax, [eax + 32] 291 pand xmm0, xmm5 292 pand xmm1, xmm5 293 packuswb xmm0, xmm1 294 sub ecx, 16 295 movdqu [edx], xmm0 296 lea edx, [edx + 16] 297 jg wloop 298 299 ret 300 } 301 } 302 // Blends 32x2 rectangle to 16x1. 303 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. 
304 __declspec(naked) __declspec(align(16)) 305 static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr, 306 ptrdiff_t src_stride, 307 uint8* dst_ptr, int dst_width) { 308 __asm { 309 push esi 310 mov eax, [esp + 4 + 4] // src_ptr 311 mov esi, [esp + 4 + 8] // src_stride 312 mov edx, [esp + 4 + 12] // dst_ptr 313 mov ecx, [esp + 4 + 16] // dst_width 314 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 315 psrlw xmm5, 8 316 317 align 16 318 wloop: 319 movdqu xmm0, [eax] 320 movdqu xmm1, [eax + 16] 321 movdqu xmm2, [eax + esi] 322 movdqu xmm3, [eax + esi + 16] 323 lea eax, [eax + 32] 324 pavgb xmm0, xmm2 // average rows 325 pavgb xmm1, xmm3 326 327 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 328 psrlw xmm0, 8 329 movdqa xmm3, xmm1 330 psrlw xmm1, 8 331 pand xmm2, xmm5 332 pand xmm3, xmm5 333 pavgw xmm0, xmm2 334 pavgw xmm1, xmm3 335 packuswb xmm0, xmm1 336 337 sub ecx, 16 338 movdqu [edx], xmm0 339 lea edx, [edx + 16] 340 jg wloop 341 342 pop esi 343 ret 344 } 345 } 346 347 #define HAS_SCALEROWDOWN4_SSE2 348 // Point samples 32 pixels to 8 pixels. 349 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 350 __declspec(naked) __declspec(align(16)) 351 static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 352 uint8* dst_ptr, int dst_width) { 353 __asm { 354 mov eax, [esp + 4] // src_ptr 355 // src_stride ignored 356 mov edx, [esp + 12] // dst_ptr 357 mov ecx, [esp + 16] // dst_width 358 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff 359 psrld xmm5, 24 360 361 align 16 362 wloop: 363 movdqa xmm0, [eax] 364 movdqa xmm1, [eax + 16] 365 lea eax, [eax + 32] 366 pand xmm0, xmm5 367 pand xmm1, xmm5 368 packuswb xmm0, xmm1 369 packuswb xmm0, xmm0 370 sub ecx, 8 371 movq qword ptr [edx], xmm0 372 lea edx, [edx + 8] 373 jg wloop 374 375 ret 376 } 377 } 378 379 // Blends 32x4 rectangle to 8x1. 380 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 381 __declspec(naked) __declspec(align(16)) 382 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 383 uint8* dst_ptr, int dst_width) { 384 __asm { 385 push esi 386 push edi 387 mov eax, [esp + 8 + 4] // src_ptr 388 mov esi, [esp + 8 + 8] // src_stride 389 mov edx, [esp + 8 + 12] // dst_ptr 390 mov ecx, [esp + 8 + 16] // dst_width 391 lea edi, [esi + esi * 2] // src_stride * 3 392 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff 393 psrlw xmm7, 8 394 395 align 16 396 wloop: 397 movdqa xmm0, [eax] 398 movdqa xmm1, [eax + 16] 399 movdqa xmm2, [eax + esi] 400 movdqa xmm3, [eax + esi + 16] 401 pavgb xmm0, xmm2 // average rows 402 pavgb xmm1, xmm3 403 movdqa xmm2, [eax + esi * 2] 404 movdqa xmm3, [eax + esi * 2 + 16] 405 movdqa xmm4, [eax + edi] 406 movdqa xmm5, [eax + edi + 16] 407 lea eax, [eax + 32] 408 pavgb xmm2, xmm4 409 pavgb xmm3, xmm5 410 pavgb xmm0, xmm2 411 pavgb xmm1, xmm3 412 413 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 414 psrlw xmm0, 8 415 movdqa xmm3, xmm1 416 psrlw xmm1, 8 417 pand xmm2, xmm7 418 pand xmm3, xmm7 419 pavgw xmm0, xmm2 420 pavgw xmm1, xmm3 421 packuswb xmm0, xmm1 422 423 movdqa xmm2, xmm0 // average columns (16 to 8 pixels) 424 psrlw xmm0, 8 425 pand xmm2, xmm7 426 pavgw xmm0, xmm2 427 packuswb xmm0, xmm0 428 429 sub ecx, 8 430 movq qword ptr [edx], xmm0 431 lea edx, [edx + 8] 432 jg wloop 433 434 pop edi 435 pop esi 436 ret 437 } 438 } 439 440 #define HAS_SCALEROWDOWN8_SSE2 441 // Point samples 32 pixels to 4 pixels. 442 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 
443 __declspec(naked) __declspec(align(16)) 444 static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 445 uint8* dst_ptr, int dst_width) { 446 __asm { 447 mov eax, [esp + 4] // src_ptr 448 // src_stride ignored 449 mov edx, [esp + 12] // dst_ptr 450 mov ecx, [esp + 16] // dst_width 451 pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes 452 psrlq xmm5, 56 453 454 align 16 455 wloop: 456 movdqa xmm0, [eax] 457 movdqa xmm1, [eax + 16] 458 lea eax, [eax + 32] 459 pand xmm0, xmm5 460 pand xmm1, xmm5 461 packuswb xmm0, xmm1 // 32->16 462 packuswb xmm0, xmm0 // 16->8 463 packuswb xmm0, xmm0 // 8->4 464 sub ecx, 4 465 movd dword ptr [edx], xmm0 466 lea edx, [edx + 4] 467 jg wloop 468 469 ret 470 } 471 } 472 473 // Blends 32x8 rectangle to 4x1. 474 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 475 __declspec(naked) __declspec(align(16)) 476 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 477 uint8* dst_ptr, int dst_width) { 478 __asm { 479 push esi 480 push edi 481 push ebp 482 mov eax, [esp + 12 + 4] // src_ptr 483 mov esi, [esp + 12 + 8] // src_stride 484 mov edx, [esp + 12 + 12] // dst_ptr 485 mov ecx, [esp + 12 + 16] // dst_width 486 lea edi, [esi + esi * 2] // src_stride * 3 487 pxor xmm7, xmm7 488 489 align 16 490 wloop: 491 movdqa xmm0, [eax] // average 8 rows to 1 492 movdqa xmm1, [eax + 16] 493 movdqa xmm2, [eax + esi] 494 movdqa xmm3, [eax + esi + 16] 495 pavgb xmm0, xmm2 496 pavgb xmm1, xmm3 497 movdqa xmm2, [eax + esi * 2] 498 movdqa xmm3, [eax + esi * 2 + 16] 499 movdqa xmm4, [eax + edi] 500 movdqa xmm5, [eax + edi + 16] 501 lea ebp, [eax + esi * 4] 502 lea eax, [eax + 32] 503 pavgb xmm2, xmm4 504 pavgb xmm3, xmm5 505 pavgb xmm0, xmm2 506 pavgb xmm1, xmm3 507 508 movdqa xmm2, [ebp] 509 movdqa xmm3, [ebp + 16] 510 movdqa xmm4, [ebp + esi] 511 movdqa xmm5, [ebp + esi + 16] 512 pavgb xmm2, xmm4 513 pavgb xmm3, xmm5 514 movdqa xmm4, [ebp + esi * 2] 515 movdqa xmm5, [ebp + esi * 2 + 16] 516 movdqa xmm6, [ebp + edi] 517 pavgb xmm4, xmm6 518 movdqa xmm6, [ebp + edi + 16] 519 pavgb xmm5, xmm6 520 pavgb xmm2, xmm4 521 pavgb xmm3, xmm5 522 pavgb xmm0, xmm2 523 pavgb xmm1, xmm3 524 525 psadbw xmm0, xmm7 // average 32 pixels to 4 526 psadbw xmm1, xmm7 527 pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 528 pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx 529 por xmm0, xmm1 // -> 3201 530 psrlw xmm0, 3 531 packuswb xmm0, xmm0 532 packuswb xmm0, xmm0 533 534 sub ecx, 4 535 movd dword ptr [edx], xmm0 536 lea edx, [edx + 4] 537 jg wloop 538 539 pop ebp 540 pop edi 541 pop esi 542 ret 543 } 544 } 545 546 #define HAS_SCALEROWDOWN34_SSSE3 547 // Point samples 32 pixels to 24 pixels. 548 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. 549 // Then shuffled to do the scaling. 550 551 // Note that movdqa+palign may be better than movdqu. 552 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
553 __declspec(naked) __declspec(align(16)) 554 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 555 uint8* dst_ptr, int dst_width) { 556 __asm { 557 mov eax, [esp + 4] // src_ptr 558 // src_stride ignored 559 mov edx, [esp + 12] // dst_ptr 560 mov ecx, [esp + 16] // dst_width 561 movdqa xmm3, kShuf0 562 movdqa xmm4, kShuf1 563 movdqa xmm5, kShuf2 564 565 align 16 566 wloop: 567 movdqa xmm0, [eax] 568 movdqa xmm1, [eax + 16] 569 lea eax, [eax + 32] 570 movdqa xmm2, xmm1 571 palignr xmm1, xmm0, 8 572 pshufb xmm0, xmm3 573 pshufb xmm1, xmm4 574 pshufb xmm2, xmm5 575 movq qword ptr [edx], xmm0 576 movq qword ptr [edx + 8], xmm1 577 movq qword ptr [edx + 16], xmm2 578 lea edx, [edx + 24] 579 sub ecx, 24 580 jg wloop 581 582 ret 583 } 584 } 585 586 // Blends 32x2 rectangle to 24x1 587 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. 588 // Then shuffled to do the scaling. 589 590 // Register usage: 591 // xmm0 src_row 0 592 // xmm1 src_row 1 593 // xmm2 shuf 0 594 // xmm3 shuf 1 595 // xmm4 shuf 2 596 // xmm5 madd 0 597 // xmm6 madd 1 598 // xmm7 kRound34 599 600 // Note that movdqa+palign may be better than movdqu. 601 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 602 __declspec(naked) __declspec(align(16)) 603 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, 604 ptrdiff_t src_stride, 605 uint8* dst_ptr, int dst_width) { 606 __asm { 607 push esi 608 mov eax, [esp + 4 + 4] // src_ptr 609 mov esi, [esp + 4 + 8] // src_stride 610 mov edx, [esp + 4 + 12] // dst_ptr 611 mov ecx, [esp + 4 + 16] // dst_width 612 movdqa xmm2, kShuf01 613 movdqa xmm3, kShuf11 614 movdqa xmm4, kShuf21 615 movdqa xmm5, kMadd01 616 movdqa xmm6, kMadd11 617 movdqa xmm7, kRound34 618 619 align 16 620 wloop: 621 movdqa xmm0, [eax] // pixels 0..7 622 movdqa xmm1, [eax + esi] 623 pavgb xmm0, xmm1 624 pshufb xmm0, xmm2 625 pmaddubsw xmm0, xmm5 626 paddsw xmm0, xmm7 627 psrlw xmm0, 2 628 packuswb xmm0, xmm0 629 movq qword ptr [edx], xmm0 630 movdqu xmm0, [eax + 8] // pixels 8..15 631 movdqu xmm1, [eax + esi + 8] 632 pavgb xmm0, xmm1 633 pshufb xmm0, xmm3 634 pmaddubsw xmm0, xmm6 635 paddsw xmm0, xmm7 636 psrlw xmm0, 2 637 packuswb xmm0, xmm0 638 movq qword ptr [edx + 8], xmm0 639 movdqa xmm0, [eax + 16] // pixels 16..23 640 movdqa xmm1, [eax + esi + 16] 641 lea eax, [eax + 32] 642 pavgb xmm0, xmm1 643 pshufb xmm0, xmm4 644 movdqa xmm1, kMadd21 645 pmaddubsw xmm0, xmm1 646 paddsw xmm0, xmm7 647 psrlw xmm0, 2 648 packuswb xmm0, xmm0 649 sub ecx, 24 650 movq qword ptr [edx + 16], xmm0 651 lea edx, [edx + 24] 652 jg wloop 653 654 pop esi 655 ret 656 } 657 } 658 659 // Note that movdqa+palign may be better than movdqu. 660 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
661 __declspec(naked) __declspec(align(16)) 662 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, 663 ptrdiff_t src_stride, 664 uint8* dst_ptr, int dst_width) { 665 __asm { 666 push esi 667 mov eax, [esp + 4 + 4] // src_ptr 668 mov esi, [esp + 4 + 8] // src_stride 669 mov edx, [esp + 4 + 12] // dst_ptr 670 mov ecx, [esp + 4 + 16] // dst_width 671 movdqa xmm2, kShuf01 672 movdqa xmm3, kShuf11 673 movdqa xmm4, kShuf21 674 movdqa xmm5, kMadd01 675 movdqa xmm6, kMadd11 676 movdqa xmm7, kRound34 677 678 align 16 679 wloop: 680 movdqa xmm0, [eax] // pixels 0..7 681 movdqa xmm1, [eax + esi] 682 pavgb xmm1, xmm0 683 pavgb xmm0, xmm1 684 pshufb xmm0, xmm2 685 pmaddubsw xmm0, xmm5 686 paddsw xmm0, xmm7 687 psrlw xmm0, 2 688 packuswb xmm0, xmm0 689 movq qword ptr [edx], xmm0 690 movdqu xmm0, [eax + 8] // pixels 8..15 691 movdqu xmm1, [eax + esi + 8] 692 pavgb xmm1, xmm0 693 pavgb xmm0, xmm1 694 pshufb xmm0, xmm3 695 pmaddubsw xmm0, xmm6 696 paddsw xmm0, xmm7 697 psrlw xmm0, 2 698 packuswb xmm0, xmm0 699 movq qword ptr [edx + 8], xmm0 700 movdqa xmm0, [eax + 16] // pixels 16..23 701 movdqa xmm1, [eax + esi + 16] 702 lea eax, [eax + 32] 703 pavgb xmm1, xmm0 704 pavgb xmm0, xmm1 705 pshufb xmm0, xmm4 706 movdqa xmm1, kMadd21 707 pmaddubsw xmm0, xmm1 708 paddsw xmm0, xmm7 709 psrlw xmm0, 2 710 packuswb xmm0, xmm0 711 sub ecx, 24 712 movq qword ptr [edx + 16], xmm0 713 lea edx, [edx+24] 714 jg wloop 715 716 pop esi 717 ret 718 } 719 } 720 721 #define HAS_SCALEROWDOWN38_SSSE3 722 // 3/8 point sampler 723 724 // Scale 32 pixels to 12 725 __declspec(naked) __declspec(align(16)) 726 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 727 uint8* dst_ptr, int dst_width) { 728 __asm { 729 mov eax, [esp + 4] // src_ptr 730 // src_stride ignored 731 mov edx, [esp + 12] // dst_ptr 732 mov ecx, [esp + 16] // dst_width 733 movdqa xmm4, kShuf38a 734 movdqa xmm5, kShuf38b 735 736 align 16 737 xloop: 738 movdqa xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 739 movdqa xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 740 lea eax, [eax + 32] 741 pshufb xmm0, xmm4 742 pshufb xmm1, xmm5 743 paddusb xmm0, xmm1 744 745 sub ecx, 12 746 movq qword ptr [edx], xmm0 // write 12 pixels 747 movhlps xmm1, xmm0 748 movd [edx + 8], xmm1 749 lea edx, [edx + 12] 750 jg xloop 751 752 ret 753 } 754 } 755 756 // Scale 16x3 pixels to 6x1 with interpolation 757 __declspec(naked) __declspec(align(16)) 758 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, 759 ptrdiff_t src_stride, 760 uint8* dst_ptr, int dst_width) { 761 __asm { 762 push esi 763 mov eax, [esp + 4 + 4] // src_ptr 764 mov esi, [esp + 4 + 8] // src_stride 765 mov edx, [esp + 4 + 12] // dst_ptr 766 mov ecx, [esp + 4 + 16] // dst_width 767 movdqa xmm2, kShufAc 768 movdqa xmm3, kShufAc3 769 movdqa xmm4, kScaleAc33 770 pxor xmm5, xmm5 771 772 align 16 773 xloop: 774 movdqa xmm0, [eax] // sum up 3 rows into xmm0/1 775 movdqa xmm6, [eax + esi] 776 movhlps xmm1, xmm0 777 movhlps xmm7, xmm6 778 punpcklbw xmm0, xmm5 779 punpcklbw xmm1, xmm5 780 punpcklbw xmm6, xmm5 781 punpcklbw xmm7, xmm5 782 paddusw xmm0, xmm6 783 paddusw xmm1, xmm7 784 movdqa xmm6, [eax + esi * 2] 785 lea eax, [eax + 16] 786 movhlps xmm7, xmm6 787 punpcklbw xmm6, xmm5 788 punpcklbw xmm7, xmm5 789 paddusw xmm0, xmm6 790 paddusw xmm1, xmm7 791 792 movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 793 psrldq xmm0, 2 794 paddusw xmm6, xmm0 795 psrldq xmm0, 2 796 paddusw xmm6, xmm0 797 pshufb xmm6, xmm2 798 799 movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 800 psrldq xmm1, 2 801 paddusw 
xmm7, xmm1 802 psrldq xmm1, 2 803 paddusw xmm7, xmm1 804 pshufb xmm7, xmm3 805 paddusw xmm6, xmm7 806 807 pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 808 packuswb xmm6, xmm6 809 810 sub ecx, 6 811 movd [edx], xmm6 // write 6 pixels 812 psrlq xmm6, 16 813 movd [edx + 2], xmm6 814 lea edx, [edx + 6] 815 jg xloop 816 817 pop esi 818 ret 819 } 820 } 821 822 // Scale 16x2 pixels to 6x1 with interpolation 823 __declspec(naked) __declspec(align(16)) 824 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, 825 ptrdiff_t src_stride, 826 uint8* dst_ptr, int dst_width) { 827 __asm { 828 push esi 829 mov eax, [esp + 4 + 4] // src_ptr 830 mov esi, [esp + 4 + 8] // src_stride 831 mov edx, [esp + 4 + 12] // dst_ptr 832 mov ecx, [esp + 4 + 16] // dst_width 833 movdqa xmm2, kShufAb0 834 movdqa xmm3, kShufAb1 835 movdqa xmm4, kShufAb2 836 movdqa xmm5, kScaleAb2 837 838 align 16 839 xloop: 840 movdqa xmm0, [eax] // average 2 rows into xmm0 841 pavgb xmm0, [eax + esi] 842 lea eax, [eax + 16] 843 844 movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 845 pshufb xmm1, xmm2 846 movdqa xmm6, xmm0 847 pshufb xmm6, xmm3 848 paddusw xmm1, xmm6 849 pshufb xmm0, xmm4 850 paddusw xmm1, xmm0 851 852 pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 853 packuswb xmm1, xmm1 854 855 sub ecx, 6 856 movd [edx], xmm1 // write 6 pixels 857 psrlq xmm1, 16 858 movd [edx + 2], xmm1 859 lea edx, [edx + 6] 860 jg xloop 861 862 pop esi 863 ret 864 } 865 } 866 867 #define HAS_SCALEADDROWS_SSE2 868 869 // Reads 16xN bytes and produces 16 shorts at a time. 870 __declspec(naked) __declspec(align(16)) 871 static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 872 uint16* dst_ptr, int src_width, 873 int src_height) { 874 __asm { 875 push esi 876 push edi 877 push ebx 878 push ebp 879 mov esi, [esp + 16 + 4] // src_ptr 880 mov edx, [esp + 16 + 8] // src_stride 881 mov edi, [esp + 16 + 12] // dst_ptr 882 mov ecx, [esp + 16 + 16] // dst_width 883 mov ebx, [esp + 16 + 20] // height 884 pxor xmm4, xmm4 885 dec ebx 886 887 align 16 888 xloop: 889 // first row 890 movdqa xmm0, [esi] 891 lea eax, [esi + edx] 892 movdqa xmm1, xmm0 893 punpcklbw xmm0, xmm4 894 punpckhbw xmm1, xmm4 895 lea esi, [esi + 16] 896 mov ebp, ebx 897 test ebp, ebp 898 je ydone 899 900 // sum remaining rows 901 align 16 902 yloop: 903 movdqa xmm2, [eax] // read 16 pixels 904 lea eax, [eax + edx] // advance to next row 905 movdqa xmm3, xmm2 906 punpcklbw xmm2, xmm4 907 punpckhbw xmm3, xmm4 908 paddusw xmm0, xmm2 // sum 16 words 909 paddusw xmm1, xmm3 910 sub ebp, 1 911 jg yloop 912 ydone: 913 movdqa [edi], xmm0 914 movdqa [edi + 16], xmm1 915 lea edi, [edi + 32] 916 917 sub ecx, 16 918 jg xloop 919 920 pop ebp 921 pop ebx 922 pop edi 923 pop esi 924 ret 925 } 926 } 927 928 #ifndef SSE2_DISABLED 929 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. 
930 // Normal formula for bilinear interpolation is: 931 // source_y_fraction * row1 + (1 - source_y_fraction) row0 932 // SSE2 version using the a single multiply of difference: 933 // source_y_fraction * (row1 - row0) + row0 934 #define HAS_SCALEFILTERROWS_SSE2_DISABLED 935 __declspec(naked) __declspec(align(16)) 936 static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, 937 ptrdiff_t src_stride, int dst_width, 938 int source_y_fraction) { 939 __asm { 940 push esi 941 push edi 942 mov edi, [esp + 8 + 4] // dst_ptr 943 mov esi, [esp + 8 + 8] // src_ptr 944 mov edx, [esp + 8 + 12] // src_stride 945 mov ecx, [esp + 8 + 16] // dst_width 946 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 947 sub edi, esi 948 cmp eax, 0 949 je xloop1 950 cmp eax, 128 951 je xloop2 952 953 movd xmm5, eax // xmm5 = y fraction 954 punpcklbw xmm5, xmm5 955 punpcklwd xmm5, xmm5 956 pshufd xmm5, xmm5, 0 957 pxor xmm4, xmm4 958 959 align 16 960 xloop: 961 movdqa xmm0, [esi] // row0 962 movdqa xmm2, [esi + edx] // row1 963 movdqa xmm1, xmm0 964 movdqa xmm3, xmm2 965 punpcklbw xmm2, xmm4 966 punpckhbw xmm3, xmm4 967 punpcklbw xmm0, xmm4 968 punpckhbw xmm1, xmm4 969 psubw xmm2, xmm0 // row1 - row0 970 psubw xmm3, xmm1 971 pmulhw xmm2, xmm5 // scale diff 972 pmulhw xmm3, xmm5 973 paddw xmm0, xmm2 // sum rows 974 paddw xmm1, xmm3 975 packuswb xmm0, xmm1 976 sub ecx, 16 977 movdqa [esi + edi], xmm0 978 lea esi, [esi + 16] 979 jg xloop 980 981 punpckhbw xmm0, xmm0 // duplicate last pixel for filtering 982 pshufhw xmm0, xmm0, 0xff 983 punpckhqdq xmm0, xmm0 984 movdqa [esi + edi], xmm0 985 pop edi 986 pop esi 987 ret 988 989 align 16 990 xloop1: 991 movdqa xmm0, [esi] 992 sub ecx, 16 993 movdqa [esi + edi], xmm0 994 lea esi, [esi + 16] 995 jg xloop1 996 997 punpckhbw xmm0, xmm0 // duplicate last pixel for filtering 998 pshufhw xmm0, xmm0, 0xff 999 punpckhqdq xmm0, xmm0 1000 movdqa [esi + edi], xmm0 1001 pop edi 1002 pop esi 1003 ret 1004 1005 align 16 1006 xloop2: 1007 movdqa xmm0, [esi] 1008 pavgb xmm0, [esi + edx] 1009 sub ecx, 16 1010 movdqa [esi + edi], xmm0 1011 lea esi, [esi + 16] 1012 jg xloop2 1013 1014 punpckhbw xmm0, xmm0 // duplicate last pixel for filtering 1015 pshufhw xmm0, xmm0, 0xff 1016 punpckhqdq xmm0, xmm0 1017 movdqa [esi + edi], xmm0 1018 pop edi 1019 pop esi 1020 ret 1021 } 1022 } 1023 #endif // SSE2_DISABLED 1024 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. 
1025 #define HAS_SCALEFILTERROWS_SSSE3 1026 __declspec(naked) __declspec(align(16)) 1027 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 1028 ptrdiff_t src_stride, int dst_width, 1029 int source_y_fraction) { 1030 __asm { 1031 push esi 1032 push edi 1033 mov edi, [esp + 8 + 4] // dst_ptr 1034 mov esi, [esp + 8 + 8] // src_ptr 1035 mov edx, [esp + 8 + 12] // src_stride 1036 mov ecx, [esp + 8 + 16] // dst_width 1037 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 1038 sub edi, esi 1039 shr eax, 1 1040 cmp eax, 0 1041 je xloop1 1042 cmp eax, 64 1043 je xloop2 1044 movd xmm0, eax // high fraction 0..127 1045 neg eax 1046 add eax, 128 1047 movd xmm5, eax // low fraction 128..1 1048 punpcklbw xmm5, xmm0 1049 punpcklwd xmm5, xmm5 1050 pshufd xmm5, xmm5, 0 1051 1052 align 16 1053 xloop: 1054 movdqa xmm0, [esi] 1055 movdqa xmm2, [esi + edx] 1056 movdqa xmm1, xmm0 1057 punpcklbw xmm0, xmm2 1058 punpckhbw xmm1, xmm2 1059 pmaddubsw xmm0, xmm5 1060 pmaddubsw xmm1, xmm5 1061 psrlw xmm0, 7 1062 psrlw xmm1, 7 1063 packuswb xmm0, xmm1 1064 sub ecx, 16 1065 movdqa [esi + edi], xmm0 1066 lea esi, [esi + 16] 1067 jg xloop 1068 1069 punpckhbw xmm0, xmm0 // duplicate last pixel for filtering 1070 pshufhw xmm0, xmm0, 0xff 1071 punpckhqdq xmm0, xmm0 1072 movdqa [esi + edi], xmm0 1073 1074 pop edi 1075 pop esi 1076 ret 1077 1078 align 16 1079 xloop1: 1080 movdqa xmm0, [esi] 1081 sub ecx, 16 1082 movdqa [esi + edi], xmm0 1083 lea esi, [esi + 16] 1084 jg xloop1 1085 1086 punpckhbw xmm0, xmm0 1087 pshufhw xmm0, xmm0, 0xff 1088 punpckhqdq xmm0, xmm0 1089 movdqa [esi + edi], xmm0 1090 pop edi 1091 pop esi 1092 ret 1093 1094 align 16 1095 xloop2: 1096 movdqa xmm0, [esi] 1097 pavgb xmm0, [esi + edx] 1098 sub ecx, 16 1099 movdqa [esi + edi], xmm0 1100 lea esi, [esi + 16] 1101 jg xloop2 1102 1103 punpckhbw xmm0, xmm0 1104 pshufhw xmm0, xmm0, 0xff 1105 punpckhqdq xmm0, xmm0 1106 movdqa [esi + edi], xmm0 1107 pop edi 1108 pop esi 1109 ret 1110 } 1111 } 1112 1113 #elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) 1114 1115 // GCC versions of row functions are verbatim conversions from Visual C. 
1116 // Generated using gcc disassembly on Visual C object file: 1117 // objdump -D yuvscaler.obj >yuvscaler.txt 1118 #define HAS_SCALEROWDOWN2_SSE2 1119 static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1120 uint8* dst_ptr, int dst_width) { 1121 asm volatile ( 1122 "pcmpeqb %%xmm5,%%xmm5 \n" 1123 "psrlw $0x8,%%xmm5 \n" 1124 ".p2align 4 \n" 1125 "1: \n" 1126 "movdqa (%0),%%xmm0 \n" 1127 "movdqa 0x10(%0),%%xmm1 \n" 1128 "lea 0x20(%0),%0 \n" 1129 "pand %%xmm5,%%xmm0 \n" 1130 "pand %%xmm5,%%xmm1 \n" 1131 "packuswb %%xmm1,%%xmm0 \n" 1132 "movdqa %%xmm0,(%1) \n" 1133 "lea 0x10(%1),%1 \n" 1134 "sub $0x10,%2 \n" 1135 "jg 1b \n" 1136 : "+r"(src_ptr), // %0 1137 "+r"(dst_ptr), // %1 1138 "+r"(dst_width) // %2 1139 : 1140 : "memory", "cc" 1141 #if defined(__SSE2__) 1142 , "xmm0", "xmm1", "xmm5" 1143 #endif 1144 ); 1145 } 1146 1147 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1148 uint8* dst_ptr, int dst_width) { 1149 asm volatile ( 1150 "pcmpeqb %%xmm5,%%xmm5 \n" 1151 "psrlw $0x8,%%xmm5 \n" 1152 ".p2align 4 \n" 1153 "1: \n" 1154 "movdqa (%0),%%xmm0 \n" 1155 "movdqa 0x10(%0),%%xmm1 \n" 1156 "movdqa (%0,%3,1),%%xmm2 \n" 1157 "movdqa 0x10(%0,%3,1),%%xmm3 \n" 1158 "lea 0x20(%0),%0 \n" 1159 "pavgb %%xmm2,%%xmm0 \n" 1160 "pavgb %%xmm3,%%xmm1 \n" 1161 "movdqa %%xmm0,%%xmm2 \n" 1162 "psrlw $0x8,%%xmm0 \n" 1163 "movdqa %%xmm1,%%xmm3 \n" 1164 "psrlw $0x8,%%xmm1 \n" 1165 "pand %%xmm5,%%xmm2 \n" 1166 "pand %%xmm5,%%xmm3 \n" 1167 "pavgw %%xmm2,%%xmm0 \n" 1168 "pavgw %%xmm3,%%xmm1 \n" 1169 "packuswb %%xmm1,%%xmm0 \n" 1170 "movdqa %%xmm0,(%1) \n" 1171 "lea 0x10(%1),%1 \n" 1172 "sub $0x10,%2 \n" 1173 "jg 1b \n" 1174 : "+r"(src_ptr), // %0 1175 "+r"(dst_ptr), // %1 1176 "+r"(dst_width) // %2 1177 : "r"(static_cast<intptr_t>(src_stride)) // %3 1178 : "memory", "cc" 1179 #if defined(__SSE2__) 1180 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 1181 #endif 1182 ); 1183 } 1184 static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, 1185 ptrdiff_t src_stride, 1186 uint8* dst_ptr, int dst_width) { 1187 asm volatile ( 1188 "pcmpeqb %%xmm5,%%xmm5 \n" 1189 "psrlw $0x8,%%xmm5 \n" 1190 ".p2align 4 \n" 1191 "1: \n" 1192 "movdqu (%0),%%xmm0 \n" 1193 "movdqu 0x10(%0),%%xmm1 \n" 1194 "lea 0x20(%0),%0 \n" 1195 "pand %%xmm5,%%xmm0 \n" 1196 "pand %%xmm5,%%xmm1 \n" 1197 "packuswb %%xmm1,%%xmm0 \n" 1198 "movdqu %%xmm0,(%1) \n" 1199 "lea 0x10(%1),%1 \n" 1200 "sub $0x10,%2 \n" 1201 "jg 1b \n" 1202 : "+r"(src_ptr), // %0 1203 "+r"(dst_ptr), // %1 1204 "+r"(dst_width) // %2 1205 : 1206 : "memory", "cc" 1207 #if defined(__SSE2__) 1208 , "xmm0", "xmm1", "xmm5" 1209 #endif 1210 ); 1211 } 1212 1213 static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr, 1214 ptrdiff_t src_stride, 1215 uint8* dst_ptr, int dst_width) { 1216 asm volatile ( 1217 "pcmpeqb %%xmm5,%%xmm5 \n" 1218 "psrlw $0x8,%%xmm5 \n" 1219 ".p2align 4 \n" 1220 "1: \n" 1221 "movdqu (%0),%%xmm0 \n" 1222 "movdqu 0x10(%0),%%xmm1 \n" 1223 "movdqu (%0,%3,1),%%xmm2 \n" 1224 "movdqu 0x10(%0,%3,1),%%xmm3 \n" 1225 "lea 0x20(%0),%0 \n" 1226 "pavgb %%xmm2,%%xmm0 \n" 1227 "pavgb %%xmm3,%%xmm1 \n" 1228 "movdqa %%xmm0,%%xmm2 \n" 1229 "psrlw $0x8,%%xmm0 \n" 1230 "movdqa %%xmm1,%%xmm3 \n" 1231 "psrlw $0x8,%%xmm1 \n" 1232 "pand %%xmm5,%%xmm2 \n" 1233 "pand %%xmm5,%%xmm3 \n" 1234 "pavgw %%xmm2,%%xmm0 \n" 1235 "pavgw %%xmm3,%%xmm1 \n" 1236 "packuswb %%xmm1,%%xmm0 \n" 1237 "movdqu %%xmm0,(%1) \n" 1238 "lea 0x10(%1),%1 \n" 1239 "sub $0x10,%2 \n" 1240 "jg 1b \n" 1241 : "+r"(src_ptr), // %0 1242 "+r"(dst_ptr), // %1 1243 "+r"(dst_width) // %2 
1244 : "r"(static_cast<intptr_t>(src_stride)) // %3 1245 : "memory", "cc" 1246 #if defined(__SSE2__) 1247 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 1248 #endif 1249 ); 1250 } 1251 1252 #define HAS_SCALEROWDOWN4_SSE2 1253 static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1254 uint8* dst_ptr, int dst_width) { 1255 asm volatile ( 1256 "pcmpeqb %%xmm5,%%xmm5 \n" 1257 "psrld $0x18,%%xmm5 \n" 1258 ".p2align 4 \n" 1259 "1: \n" 1260 "movdqa (%0),%%xmm0 \n" 1261 "movdqa 0x10(%0),%%xmm1 \n" 1262 "lea 0x20(%0),%0 \n" 1263 "pand %%xmm5,%%xmm0 \n" 1264 "pand %%xmm5,%%xmm1 \n" 1265 "packuswb %%xmm1,%%xmm0 \n" 1266 "packuswb %%xmm0,%%xmm0 \n" 1267 "movq %%xmm0,(%1) \n" 1268 "lea 0x8(%1),%1 \n" 1269 "sub $0x8,%2 \n" 1270 "jg 1b \n" 1271 : "+r"(src_ptr), // %0 1272 "+r"(dst_ptr), // %1 1273 "+r"(dst_width) // %2 1274 : 1275 : "memory", "cc" 1276 #if defined(__SSE2__) 1277 , "xmm0", "xmm1", "xmm5" 1278 #endif 1279 ); 1280 } 1281 1282 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1283 uint8* dst_ptr, int dst_width) { 1284 intptr_t stridex3 = 0; 1285 asm volatile ( 1286 "pcmpeqb %%xmm7,%%xmm7 \n" 1287 "psrlw $0x8,%%xmm7 \n" 1288 "lea (%4,%4,2),%3 \n" 1289 ".p2align 4 \n" 1290 "1: \n" 1291 "movdqa (%0),%%xmm0 \n" 1292 "movdqa 0x10(%0),%%xmm1 \n" 1293 "movdqa (%0,%4,1),%%xmm2 \n" 1294 "movdqa 0x10(%0,%4,1),%%xmm3 \n" 1295 "pavgb %%xmm2,%%xmm0 \n" 1296 "pavgb %%xmm3,%%xmm1 \n" 1297 "movdqa (%0,%4,2),%%xmm2 \n" 1298 "movdqa 0x10(%0,%4,2),%%xmm3 \n" 1299 "movdqa (%0,%3,1),%%xmm4 \n" 1300 "movdqa 0x10(%0,%3,1),%%xmm5 \n" 1301 "lea 0x20(%0),%0 \n" 1302 "pavgb %%xmm4,%%xmm2 \n" 1303 "pavgb %%xmm2,%%xmm0 \n" 1304 "pavgb %%xmm5,%%xmm3 \n" 1305 "pavgb %%xmm3,%%xmm1 \n" 1306 "movdqa %%xmm0,%%xmm2 \n" 1307 "psrlw $0x8,%%xmm0 \n" 1308 "movdqa %%xmm1,%%xmm3 \n" 1309 "psrlw $0x8,%%xmm1 \n" 1310 "pand %%xmm7,%%xmm2 \n" 1311 "pand %%xmm7,%%xmm3 \n" 1312 "pavgw %%xmm2,%%xmm0 \n" 1313 "pavgw %%xmm3,%%xmm1 \n" 1314 "packuswb %%xmm1,%%xmm0 \n" 1315 "movdqa %%xmm0,%%xmm2 \n" 1316 "psrlw $0x8,%%xmm0 \n" 1317 "pand %%xmm7,%%xmm2 \n" 1318 "pavgw %%xmm2,%%xmm0 \n" 1319 "packuswb %%xmm0,%%xmm0 \n" 1320 "movq %%xmm0,(%1) \n" 1321 "lea 0x8(%1),%1 \n" 1322 "sub $0x8,%2 \n" 1323 "jg 1b \n" 1324 : "+r"(src_ptr), // %0 1325 "+r"(dst_ptr), // %1 1326 "+r"(dst_width), // %2 1327 "+r"(stridex3) // %3 1328 : "r"(static_cast<intptr_t>(src_stride)) // %4 1329 : "memory", "cc" 1330 #if defined(__SSE2__) 1331 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" 1332 #endif 1333 ); 1334 } 1335 1336 #define HAS_SCALEROWDOWN8_SSE2 1337 static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1338 uint8* dst_ptr, int dst_width) { 1339 asm volatile ( 1340 "pcmpeqb %%xmm5,%%xmm5 \n" 1341 "psrlq $0x38,%%xmm5 \n" 1342 ".p2align 4 \n" 1343 "1: \n" 1344 "movdqa (%0),%%xmm0 \n" 1345 "movdqa 0x10(%0),%%xmm1 \n" 1346 "lea 0x20(%0),%0 \n" 1347 "pand %%xmm5,%%xmm0 \n" 1348 "pand %%xmm5,%%xmm1 \n" 1349 "packuswb %%xmm1,%%xmm0 \n" 1350 "packuswb %%xmm0,%%xmm0 \n" 1351 "packuswb %%xmm0,%%xmm0 \n" 1352 "movd %%xmm0,(%1) \n" 1353 "lea 0x4(%1),%1 \n" 1354 "sub $0x4,%2 \n" 1355 "jg 1b \n" 1356 : "+r"(src_ptr), // %0 1357 "+r"(dst_ptr), // %1 1358 "+r"(dst_width) // %2 1359 : 1360 : "memory", "cc" 1361 #if defined(__SSE2__) 1362 , "xmm0", "xmm1", "xmm5" 1363 #endif 1364 ); 1365 } 1366 1367 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1368 uint8* dst_ptr, int dst_width) { 1369 intptr_t stridex3 = 0; 1370 intptr_t row4 = 0; 1371 asm volatile ( 1372 "lea (%5,%5,2),%3 
\n" 1373 "pxor %%xmm7,%%xmm7 \n" 1374 ".p2align 4 \n" 1375 "1: \n" 1376 "movdqa (%0),%%xmm0 \n" 1377 "movdqa 0x10(%0),%%xmm1 \n" 1378 "movdqa (%0,%5,1),%%xmm2 \n" 1379 "movdqa 0x10(%0,%5,1),%%xmm3 \n" 1380 "pavgb %%xmm2,%%xmm0 \n" 1381 "pavgb %%xmm3,%%xmm1 \n" 1382 "movdqa (%0,%5,2),%%xmm2 \n" 1383 "movdqa 0x10(%0,%5,2),%%xmm3 \n" 1384 "movdqa (%0,%3,1),%%xmm4 \n" 1385 "movdqa 0x10(%0,%3,1),%%xmm5 \n" 1386 "lea (%0,%5,4),%4 \n" 1387 "lea 0x20(%0),%0 \n" 1388 "pavgb %%xmm4,%%xmm2 \n" 1389 "pavgb %%xmm5,%%xmm3 \n" 1390 "pavgb %%xmm2,%%xmm0 \n" 1391 "pavgb %%xmm3,%%xmm1 \n" 1392 "movdqa 0x0(%4),%%xmm2 \n" 1393 "movdqa 0x10(%4),%%xmm3 \n" 1394 "movdqa 0x0(%4,%5,1),%%xmm4 \n" 1395 "movdqa 0x10(%4,%5,1),%%xmm5 \n" 1396 "pavgb %%xmm4,%%xmm2 \n" 1397 "pavgb %%xmm5,%%xmm3 \n" 1398 "movdqa 0x0(%4,%5,2),%%xmm4 \n" 1399 "movdqa 0x10(%4,%5,2),%%xmm5 \n" 1400 "movdqa 0x0(%4,%3,1),%%xmm6 \n" 1401 "pavgb %%xmm6,%%xmm4 \n" 1402 "movdqa 0x10(%4,%3,1),%%xmm6 \n" 1403 "pavgb %%xmm6,%%xmm5 \n" 1404 "pavgb %%xmm4,%%xmm2 \n" 1405 "pavgb %%xmm5,%%xmm3 \n" 1406 "pavgb %%xmm2,%%xmm0 \n" 1407 "pavgb %%xmm3,%%xmm1 \n" 1408 "psadbw %%xmm7,%%xmm0 \n" 1409 "psadbw %%xmm7,%%xmm1 \n" 1410 "pshufd $0xd8,%%xmm0,%%xmm0 \n" 1411 "pshufd $0x8d,%%xmm1,%%xmm1 \n" 1412 "por %%xmm1,%%xmm0 \n" 1413 "psrlw $0x3,%%xmm0 \n" 1414 "packuswb %%xmm0,%%xmm0 \n" 1415 "packuswb %%xmm0,%%xmm0 \n" 1416 "movd %%xmm0,(%1) \n" 1417 "lea 0x4(%1),%1 \n" 1418 "sub $0x4,%2 \n" 1419 "jg 1b \n" 1420 : "+r"(src_ptr), // %0 1421 "+r"(dst_ptr), // %1 1422 "+rm"(dst_width), // %2 1423 "+r"(stridex3), // %3 1424 "+r"(row4) // %4 1425 : "r"(static_cast<intptr_t>(src_stride)) // %5 1426 : "memory", "cc" 1427 #if defined(__SSE2__) 1428 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1429 #endif 1430 ); 1431 } 1432 1433 #define HAS_SCALEROWDOWN34_SSSE3 1434 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 1435 uint8* dst_ptr, int dst_width) { 1436 asm volatile ( 1437 "movdqa %0,%%xmm3 \n" 1438 "movdqa %1,%%xmm4 \n" 1439 "movdqa %2,%%xmm5 \n" 1440 : 1441 : "m"(kShuf0), // %0 1442 "m"(kShuf1), // %1 1443 "m"(kShuf2) // %2 1444 ); 1445 asm volatile ( 1446 ".p2align 4 \n" 1447 "1: \n" 1448 "movdqa (%0),%%xmm0 \n" 1449 "movdqa 0x10(%0),%%xmm2 \n" 1450 "lea 0x20(%0),%0 \n" 1451 "movdqa %%xmm2,%%xmm1 \n" 1452 "palignr $0x8,%%xmm0,%%xmm1 \n" 1453 "pshufb %%xmm3,%%xmm0 \n" 1454 "pshufb %%xmm4,%%xmm1 \n" 1455 "pshufb %%xmm5,%%xmm2 \n" 1456 "movq %%xmm0,(%1) \n" 1457 "movq %%xmm1,0x8(%1) \n" 1458 "movq %%xmm2,0x10(%1) \n" 1459 "lea 0x18(%1),%1 \n" 1460 "sub $0x18,%2 \n" 1461 "jg 1b \n" 1462 : "+r"(src_ptr), // %0 1463 "+r"(dst_ptr), // %1 1464 "+r"(dst_width) // %2 1465 : 1466 : "memory", "cc" 1467 #if defined(__SSE2__) 1468 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1469 #endif 1470 ); 1471 } 1472 1473 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, 1474 ptrdiff_t src_stride, 1475 uint8* dst_ptr, int dst_width) { 1476 asm volatile ( 1477 "movdqa %0,%%xmm2 \n" // kShuf01 1478 "movdqa %1,%%xmm3 \n" // kShuf11 1479 "movdqa %2,%%xmm4 \n" // kShuf21 1480 : 1481 : "m"(kShuf01), // %0 1482 "m"(kShuf11), // %1 1483 "m"(kShuf21) // %2 1484 ); 1485 asm volatile ( 1486 "movdqa %0,%%xmm5 \n" // kMadd01 1487 "movdqa %1,%%xmm0 \n" // kMadd11 1488 "movdqa %2,%%xmm1 \n" // kRound34 1489 : 1490 : "m"(kMadd01), // %0 1491 "m"(kMadd11), // %1 1492 "m"(kRound34) // %2 1493 ); 1494 asm volatile ( 1495 ".p2align 4 \n" 1496 "1: \n" 1497 "movdqa (%0),%%xmm6 \n" 1498 "movdqa (%0,%3),%%xmm7 \n" 1499 "pavgb %%xmm7,%%xmm6 \n" 1500 
"pshufb %%xmm2,%%xmm6 \n" 1501 "pmaddubsw %%xmm5,%%xmm6 \n" 1502 "paddsw %%xmm1,%%xmm6 \n" 1503 "psrlw $0x2,%%xmm6 \n" 1504 "packuswb %%xmm6,%%xmm6 \n" 1505 "movq %%xmm6,(%1) \n" 1506 "movdqu 0x8(%0),%%xmm6 \n" 1507 "movdqu 0x8(%0,%3),%%xmm7 \n" 1508 "pavgb %%xmm7,%%xmm6 \n" 1509 "pshufb %%xmm3,%%xmm6 \n" 1510 "pmaddubsw %%xmm0,%%xmm6 \n" 1511 "paddsw %%xmm1,%%xmm6 \n" 1512 "psrlw $0x2,%%xmm6 \n" 1513 "packuswb %%xmm6,%%xmm6 \n" 1514 "movq %%xmm6,0x8(%1) \n" 1515 "movdqa 0x10(%0),%%xmm6 \n" 1516 "movdqa 0x10(%0,%3),%%xmm7 \n" 1517 "lea 0x20(%0),%0 \n" 1518 "pavgb %%xmm7,%%xmm6 \n" 1519 "pshufb %%xmm4,%%xmm6 \n" 1520 "pmaddubsw %4,%%xmm6 \n" 1521 "paddsw %%xmm1,%%xmm6 \n" 1522 "psrlw $0x2,%%xmm6 \n" 1523 "packuswb %%xmm6,%%xmm6 \n" 1524 "movq %%xmm6,0x10(%1) \n" 1525 "lea 0x18(%1),%1 \n" 1526 "sub $0x18,%2 \n" 1527 "jg 1b \n" 1528 : "+r"(src_ptr), // %0 1529 "+r"(dst_ptr), // %1 1530 "+r"(dst_width) // %2 1531 : "r"(static_cast<intptr_t>(src_stride)), // %3 1532 "m"(kMadd21) // %4 1533 : "memory", "cc" 1534 #if defined(__SSE2__) 1535 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1536 #endif 1537 ); 1538 } 1539 1540 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, 1541 ptrdiff_t src_stride, 1542 uint8* dst_ptr, int dst_width) { 1543 asm volatile ( 1544 "movdqa %0,%%xmm2 \n" // kShuf01 1545 "movdqa %1,%%xmm3 \n" // kShuf11 1546 "movdqa %2,%%xmm4 \n" // kShuf21 1547 : 1548 : "m"(kShuf01), // %0 1549 "m"(kShuf11), // %1 1550 "m"(kShuf21) // %2 1551 ); 1552 asm volatile ( 1553 "movdqa %0,%%xmm5 \n" // kMadd01 1554 "movdqa %1,%%xmm0 \n" // kMadd11 1555 "movdqa %2,%%xmm1 \n" // kRound34 1556 : 1557 : "m"(kMadd01), // %0 1558 "m"(kMadd11), // %1 1559 "m"(kRound34) // %2 1560 ); 1561 1562 asm volatile ( 1563 ".p2align 4 \n" 1564 "1: \n" 1565 "movdqa (%0),%%xmm6 \n" 1566 "movdqa (%0,%3,1),%%xmm7 \n" 1567 "pavgb %%xmm6,%%xmm7 \n" 1568 "pavgb %%xmm7,%%xmm6 \n" 1569 "pshufb %%xmm2,%%xmm6 \n" 1570 "pmaddubsw %%xmm5,%%xmm6 \n" 1571 "paddsw %%xmm1,%%xmm6 \n" 1572 "psrlw $0x2,%%xmm6 \n" 1573 "packuswb %%xmm6,%%xmm6 \n" 1574 "movq %%xmm6,(%1) \n" 1575 "movdqu 0x8(%0),%%xmm6 \n" 1576 "movdqu 0x8(%0,%3,1),%%xmm7 \n" 1577 "pavgb %%xmm6,%%xmm7 \n" 1578 "pavgb %%xmm7,%%xmm6 \n" 1579 "pshufb %%xmm3,%%xmm6 \n" 1580 "pmaddubsw %%xmm0,%%xmm6 \n" 1581 "paddsw %%xmm1,%%xmm6 \n" 1582 "psrlw $0x2,%%xmm6 \n" 1583 "packuswb %%xmm6,%%xmm6 \n" 1584 "movq %%xmm6,0x8(%1) \n" 1585 "movdqa 0x10(%0),%%xmm6 \n" 1586 "movdqa 0x10(%0,%3,1),%%xmm7 \n" 1587 "lea 0x20(%0),%0 \n" 1588 "pavgb %%xmm6,%%xmm7 \n" 1589 "pavgb %%xmm7,%%xmm6 \n" 1590 "pshufb %%xmm4,%%xmm6 \n" 1591 "pmaddubsw %4,%%xmm6 \n" 1592 "paddsw %%xmm1,%%xmm6 \n" 1593 "psrlw $0x2,%%xmm6 \n" 1594 "packuswb %%xmm6,%%xmm6 \n" 1595 "movq %%xmm6,0x10(%1) \n" 1596 "lea 0x18(%1),%1 \n" 1597 "sub $0x18,%2 \n" 1598 "jg 1b \n" 1599 : "+r"(src_ptr), // %0 1600 "+r"(dst_ptr), // %1 1601 "+r"(dst_width) // %2 1602 : "r"(static_cast<intptr_t>(src_stride)), // %3 1603 "m"(kMadd21) // %4 1604 : "memory", "cc" 1605 #if defined(__SSE2__) 1606 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1607 #endif 1608 ); 1609 } 1610 1611 #define HAS_SCALEROWDOWN38_SSSE3 1612 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 1613 uint8* dst_ptr, int dst_width) { 1614 asm volatile ( 1615 "movdqa %3,%%xmm4 \n" 1616 "movdqa %4,%%xmm5 \n" 1617 ".p2align 4 \n" 1618 "1: \n" 1619 "movdqa (%0),%%xmm0 \n" 1620 "movdqa 0x10(%0),%%xmm1 \n" 1621 "lea 0x20(%0),%0 \n" 1622 "pshufb %%xmm4,%%xmm0 \n" 1623 "pshufb %%xmm5,%%xmm1 \n" 1624 
"paddusb %%xmm1,%%xmm0 \n" 1625 "movq %%xmm0,(%1) \n" 1626 "movhlps %%xmm0,%%xmm1 \n" 1627 "movd %%xmm1,0x8(%1) \n" 1628 "lea 0xc(%1),%1 \n" 1629 "sub $0xc,%2 \n" 1630 "jg 1b \n" 1631 : "+r"(src_ptr), // %0 1632 "+r"(dst_ptr), // %1 1633 "+r"(dst_width) // %2 1634 : "m"(kShuf38a), // %3 1635 "m"(kShuf38b) // %4 1636 : "memory", "cc" 1637 #if defined(__SSE2__) 1638 , "xmm0", "xmm1", "xmm4", "xmm5" 1639 #endif 1640 ); 1641 } 1642 1643 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, 1644 ptrdiff_t src_stride, 1645 uint8* dst_ptr, int dst_width) { 1646 asm volatile ( 1647 "movdqa %0,%%xmm2 \n" 1648 "movdqa %1,%%xmm3 \n" 1649 "movdqa %2,%%xmm4 \n" 1650 "movdqa %3,%%xmm5 \n" 1651 : 1652 : "m"(kShufAb0), // %0 1653 "m"(kShufAb1), // %1 1654 "m"(kShufAb2), // %2 1655 "m"(kScaleAb2) // %3 1656 ); 1657 asm volatile ( 1658 ".p2align 4 \n" 1659 "1: \n" 1660 "movdqa (%0),%%xmm0 \n" 1661 "pavgb (%0,%3,1),%%xmm0 \n" 1662 "lea 0x10(%0),%0 \n" 1663 "movdqa %%xmm0,%%xmm1 \n" 1664 "pshufb %%xmm2,%%xmm1 \n" 1665 "movdqa %%xmm0,%%xmm6 \n" 1666 "pshufb %%xmm3,%%xmm6 \n" 1667 "paddusw %%xmm6,%%xmm1 \n" 1668 "pshufb %%xmm4,%%xmm0 \n" 1669 "paddusw %%xmm0,%%xmm1 \n" 1670 "pmulhuw %%xmm5,%%xmm1 \n" 1671 "packuswb %%xmm1,%%xmm1 \n" 1672 "sub $0x6,%2 \n" 1673 "movd %%xmm1,(%1) \n" 1674 "psrlq $0x10,%%xmm1 \n" 1675 "movd %%xmm1,0x2(%1) \n" 1676 "lea 0x6(%1),%1 \n" 1677 "jg 1b \n" 1678 : "+r"(src_ptr), // %0 1679 "+r"(dst_ptr), // %1 1680 "+r"(dst_width) // %2 1681 : "r"(static_cast<intptr_t>(src_stride)) // %3 1682 : "memory", "cc" 1683 #if defined(__SSE2__) 1684 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 1685 #endif 1686 ); 1687 } 1688 1689 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, 1690 ptrdiff_t src_stride, 1691 uint8* dst_ptr, int dst_width) { 1692 asm volatile ( 1693 "movdqa %0,%%xmm2 \n" 1694 "movdqa %1,%%xmm3 \n" 1695 "movdqa %2,%%xmm4 \n" 1696 "pxor %%xmm5,%%xmm5 \n" 1697 : 1698 : "m"(kShufAc), // %0 1699 "m"(kShufAc3), // %1 1700 "m"(kScaleAc33) // %2 1701 ); 1702 asm volatile ( 1703 ".p2align 4 \n" 1704 "1: \n" 1705 "movdqa (%0),%%xmm0 \n" 1706 "movdqa (%0,%3,1),%%xmm6 \n" 1707 "movhlps %%xmm0,%%xmm1 \n" 1708 "movhlps %%xmm6,%%xmm7 \n" 1709 "punpcklbw %%xmm5,%%xmm0 \n" 1710 "punpcklbw %%xmm5,%%xmm1 \n" 1711 "punpcklbw %%xmm5,%%xmm6 \n" 1712 "punpcklbw %%xmm5,%%xmm7 \n" 1713 "paddusw %%xmm6,%%xmm0 \n" 1714 "paddusw %%xmm7,%%xmm1 \n" 1715 "movdqa (%0,%3,2),%%xmm6 \n" 1716 "lea 0x10(%0),%0 \n" 1717 "movhlps %%xmm6,%%xmm7 \n" 1718 "punpcklbw %%xmm5,%%xmm6 \n" 1719 "punpcklbw %%xmm5,%%xmm7 \n" 1720 "paddusw %%xmm6,%%xmm0 \n" 1721 "paddusw %%xmm7,%%xmm1 \n" 1722 "movdqa %%xmm0,%%xmm6 \n" 1723 "psrldq $0x2,%%xmm0 \n" 1724 "paddusw %%xmm0,%%xmm6 \n" 1725 "psrldq $0x2,%%xmm0 \n" 1726 "paddusw %%xmm0,%%xmm6 \n" 1727 "pshufb %%xmm2,%%xmm6 \n" 1728 "movdqa %%xmm1,%%xmm7 \n" 1729 "psrldq $0x2,%%xmm1 \n" 1730 "paddusw %%xmm1,%%xmm7 \n" 1731 "psrldq $0x2,%%xmm1 \n" 1732 "paddusw %%xmm1,%%xmm7 \n" 1733 "pshufb %%xmm3,%%xmm7 \n" 1734 "paddusw %%xmm7,%%xmm6 \n" 1735 "pmulhuw %%xmm4,%%xmm6 \n" 1736 "packuswb %%xmm6,%%xmm6 \n" 1737 "sub $0x6,%2 \n" 1738 "movd %%xmm6,(%1) \n" 1739 "psrlq $0x10,%%xmm6 \n" 1740 "movd %%xmm6,0x2(%1) \n" 1741 "lea 0x6(%1),%1 \n" 1742 "jg 1b \n" 1743 : "+r"(src_ptr), // %0 1744 "+r"(dst_ptr), // %1 1745 "+r"(dst_width) // %2 1746 : "r"(static_cast<intptr_t>(src_stride)) // %3 1747 : "memory", "cc" 1748 #if defined(__SSE2__) 1749 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1750 #endif 1751 ); 1752 } 1753 1754 #define 
HAS_SCALEADDROWS_SSE2 1755 static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1756 uint16* dst_ptr, int src_width, int src_height) { 1757 int tmp_height = 0; 1758 intptr_t tmp_src = 0; 1759 asm volatile ( 1760 "pxor %%xmm4,%%xmm4 \n" 1761 "sub $0x1,%5 \n" 1762 ".p2align 4 \n" 1763 "1: \n" 1764 "movdqa (%0),%%xmm0 \n" 1765 "mov %0,%3 \n" 1766 "add %6,%0 \n" 1767 "movdqa %%xmm0,%%xmm1 \n" 1768 "punpcklbw %%xmm4,%%xmm0 \n" 1769 "punpckhbw %%xmm4,%%xmm1 \n" 1770 "mov %5,%2 \n" 1771 "test %2,%2 \n" 1772 "je 3f \n" 1773 "2: \n" 1774 "movdqa (%0),%%xmm2 \n" 1775 "add %6,%0 \n" 1776 "movdqa %%xmm2,%%xmm3 \n" 1777 "punpcklbw %%xmm4,%%xmm2 \n" 1778 "punpckhbw %%xmm4,%%xmm3 \n" 1779 "paddusw %%xmm2,%%xmm0 \n" 1780 "paddusw %%xmm3,%%xmm1 \n" 1781 "sub $0x1,%2 \n" 1782 "jg 2b \n" 1783 "3: \n" 1784 "movdqa %%xmm0,(%1) \n" 1785 "movdqa %%xmm1,0x10(%1) \n" 1786 "lea 0x10(%3),%0 \n" 1787 "lea 0x20(%1),%1 \n" 1788 "sub $0x10,%4 \n" 1789 "jg 1b \n" 1790 : "+r"(src_ptr), // %0 1791 "+r"(dst_ptr), // %1 1792 "+r"(tmp_height), // %2 1793 "+r"(tmp_src), // %3 1794 "+r"(src_width), // %4 1795 "+rm"(src_height) // %5 1796 : "rm"(static_cast<intptr_t>(src_stride)) // %6 1797 : "memory", "cc" 1798 #if defined(__SSE2__) 1799 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 1800 #endif 1801 ); 1802 } 1803 1804 #ifndef SSE2_DISABLED 1805 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version 1806 #define HAS_SCALEFILTERROWS_SSE2_DISABLED 1807 static void ScaleFilterRows_SSE2(uint8* dst_ptr, 1808 const uint8* src_ptr, ptrdiff_t src_stride, 1809 int dst_width, int source_y_fraction) { 1810 asm volatile ( 1811 "sub %1,%0 \n" 1812 "cmp $0x0,%3 \n" 1813 "je 2f \n" 1814 "cmp $0x80,%3 \n" 1815 "je 3f \n" 1816 "movd %3,%%xmm5 \n" 1817 "punpcklbw %%xmm5,%%xmm5 \n" 1818 "punpcklwd %%xmm5,%%xmm5 \n" 1819 "pshufd $0x0,%%xmm5,%%xmm5 \n" 1820 "pxor %%xmm4,%%xmm4 \n" 1821 ".p2align 4 \n" 1822 "1: \n" 1823 "movdqa (%1),%%xmm0 \n" 1824 "movdqa (%1,%4,1),%%xmm2 \n" 1825 "movdqa %%xmm0,%%xmm1 \n" 1826 "movdqa %%xmm2,%%xmm3 \n" 1827 "punpcklbw %%xmm4,%%xmm2 \n" 1828 "punpckhbw %%xmm4,%%xmm3 \n" 1829 "punpcklbw %%xmm4,%%xmm0 \n" 1830 "punpckhbw %%xmm4,%%xmm1 \n" 1831 "psubw %%xmm0,%%xmm2 \n" 1832 "psubw %%xmm1,%%xmm3 \n" 1833 "pmulhw %%xmm5,%%xmm2 \n" 1834 "pmulhw %%xmm5,%%xmm3 \n" 1835 "paddw %%xmm2,%%xmm0 \n" 1836 "paddw %%xmm3,%%xmm1 \n" 1837 "packuswb %%xmm1,%%xmm0 \n" 1838 "sub $0x10,%2 \n" 1839 "movdqa %%xmm0,(%1,%0,1) \n" 1840 "lea 0x10(%1),%1 \n" 1841 "jg 1b \n" 1842 "jmp 4f \n" 1843 ".p2align 4 \n" 1844 "2: \n" 1845 "movdqa (%1),%%xmm0 \n" 1846 "sub $0x10,%2 \n" 1847 "movdqa %%xmm0,(%1,%0,1) \n" 1848 "lea 0x10(%1),%1 \n" 1849 "jg 2b \n" 1850 "jmp 4f \n" 1851 ".p2align 4 \n" 1852 "3: \n" 1853 "movdqa (%1),%%xmm0 \n" 1854 "pavgb (%1,%4,1),%%xmm0 \n" 1855 "sub $0x10,%2 \n" 1856 "movdqa %%xmm0,(%1,%0,1) \n" 1857 "lea 0x10(%1),%1 \n" 1858 "jg 3b \n" 1859 ".p2align 4 \n" 1860 "4: \n" 1861 "punpckhbw %%xmm0,%%xmm0 \n" 1862 "pshufhw $0xff,%%xmm0,%%xmm0 \n" 1863 "punpckhqdq %%xmm0,%%xmm0 \n" 1864 "movdqa %%xmm0,(%1,%0,1) \n" 1865 : "+r"(dst_ptr), // %0 1866 "+r"(src_ptr), // %1 1867 "+r"(dst_width), // %2 1868 "+r"(source_y_fraction) // %3 1869 : "r"(static_cast<intptr_t>(src_stride)) // %4 1870 : "memory", "cc" 1871 #if defined(__SSE2__) 1872 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1873 #endif 1874 ); 1875 } 1876 #endif // SSE2_DISABLED 1877 1878 // Bilinear row filtering combines 16x2 -> 16x1. 
// SSSE3 version
#define HAS_SCALEFILTERROWS_SSSE3
static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                  const uint8* src_ptr, ptrdiff_t src_stride,
                                  int dst_width, int source_y_fraction) {
  asm volatile (
    "sub        %1,%0                          \n"
    "shr        %3                             \n"
    "cmp        $0x0,%3                        \n"
    "je         2f                             \n"
    "cmp        $0x40,%3                       \n"
    "je         3f                             \n"
    "movd       %3,%%xmm0                      \n"
    "neg        %3                             \n"
    "add        $0x80,%3                       \n"
    "movd       %3,%%xmm5                      \n"
    "punpcklbw  %%xmm0,%%xmm5                  \n"
    "punpcklwd  %%xmm5,%%xmm5                  \n"
    "pshufd     $0x0,%%xmm5,%%xmm5             \n"
    ".p2align   4                              \n"
  "1:                                          \n"
    "movdqa     (%1),%%xmm0                    \n"
    "movdqa     (%1,%4,1),%%xmm2               \n"
    "movdqa     %%xmm0,%%xmm1                  \n"
    "punpcklbw  %%xmm2,%%xmm0                  \n"
    "punpckhbw  %%xmm2,%%xmm1                  \n"
    "pmaddubsw  %%xmm5,%%xmm0                  \n"
    "pmaddubsw  %%xmm5,%%xmm1                  \n"
    "psrlw      $0x7,%%xmm0                    \n"
    "psrlw      $0x7,%%xmm1                    \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "sub        $0x10,%2                       \n"
    "movdqa     %%xmm0,(%1,%0,1)               \n"
    "lea        0x10(%1),%1                    \n"
    "jg         1b                             \n"
    "jmp        4f                             \n"
    ".p2align   4                              \n"
  "2:                                          \n"
    "movdqa     (%1),%%xmm0                    \n"
    "sub        $0x10,%2                       \n"
    "movdqa     %%xmm0,(%1,%0,1)               \n"
    "lea        0x10(%1),%1                    \n"
    "jg         2b                             \n"
    "jmp        4f                             \n"
    ".p2align   4                              \n"
  "3:                                          \n"
    "movdqa     (%1),%%xmm0                    \n"
    "pavgb      (%1,%4,1),%%xmm0               \n"
    "sub        $0x10,%2                       \n"
    "movdqa     %%xmm0,(%1,%0,1)               \n"
    "lea        0x10(%1),%1                    \n"
    "jg         3b                             \n"
    ".p2align   4                              \n"
  "4:                                          \n"
    "punpckhbw  %%xmm0,%%xmm0                  \n"
    "pshufhw    $0xff,%%xmm0,%%xmm0            \n"
    "punpckhqdq %%xmm0,%%xmm0                  \n"
    "movdqa     %%xmm0,(%1,%0,1)               \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(source_y_fraction)  // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
}
#endif  // defined(__x86_64__) || defined(__i386__)

// CPU agnostic row functions
static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                            uint8* dst, int dst_width) {
  uint8* dend = dst + dst_width - 1;
  do {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[2];
    dst += 2;
    src_ptr += 4;
  } while (dst < dend);
  if (dst_width & 1) {
    dst[0] = src_ptr[0];
  }
}

void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  uint8* dend = dst + dst_width - 1;
  do {
    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
    dst += 2;
    s += 4;
    t += 4;
  } while (dst < dend);
  if (dst_width & 1) {
    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
  }
}
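// Worked example for the two 1/2 scalers above: given the 2x2 block
//   row 0: 10 20
//   row 1: 30 40
// ScaleRowDown2Int_C writes (10 + 20 + 30 + 40 + 2) >> 2 = 25, the rounded
// box average, while ScaleRowDown2_C simply keeps the top-left sample, 10.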

static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                            uint8* dst, int dst_width) {
  uint8* dend = dst + dst_width - 1;
  do {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[4];
    dst += 2;
    src_ptr += 8;
  } while (dst < dend);
  if (dst_width & 1) {
    dst[0] = src_ptr[0];
  }
}

static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  intptr_t stride = src_stride;
  uint8* dend = dst + dst_width - 1;
  do {
    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
              src_ptr[stride + 0] + src_ptr[stride + 1] +
              src_ptr[stride + 2] + src_ptr[stride + 3] +
              src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
              src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
              src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
              src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
              8) >> 4;
    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
              src_ptr[stride + 4] + src_ptr[stride + 5] +
              src_ptr[stride + 6] + src_ptr[stride + 7] +
              src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
              src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
              src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
              src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
              8) >> 4;
    dst += 2;
    src_ptr += 8;
  } while (dst < dend);
  if (dst_width & 1) {
    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
              src_ptr[stride + 0] + src_ptr[stride + 1] +
              src_ptr[stride + 2] + src_ptr[stride + 3] +
              src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
              src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
              src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
              src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
              8) >> 4;
  }
}

// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu.
static const int kMaxOutputWidth = 640;
static const int kMaxRow12 = kMaxOutputWidth * 2;

static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                            uint8* dst, int dst_width) {
  uint8* dend = dst + dst_width - 1;
  do {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[8];
    dst += 2;
    src_ptr += 16;
  } while (dst < dend);
  if (dst_width & 1) {
    dst[0] = src_ptr[0];
  }
}

// Note calling code checks width is less than max and if not
// uses ScaleRowDown8_C instead.
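// Size check for the buffer used below: src_row holds kMaxRow12 * 2 =
// 640 * 2 * 2 = 2560 bytes, which stays under the 4096 byte stackcheck limit
// mentioned above, and 640 output pixels at 1/8 scale cover 640 * 8 = 5120
// input pixels.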
static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  SIMD_ALIGNED(uint8 src_row[kMaxRow12 * 2]);
  assert(dst_width <= kMaxOutputWidth);
  ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
  ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
                     src_row + kMaxOutputWidth,
                     dst_width * 2);
  ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
}

static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                             uint8* dst, int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  uint8* dend = dst + dst_width;
  do {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];
    dst += 3;
    src_ptr += 4;
  } while (dst < dend);
}

// Filter rows 0 and 1 together, 3 : 1
static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* d, int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  uint8* dend = d + dst_width;
  do {
    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
    d[0] = (a0 * 3 + b0 + 2) >> 2;
    d[1] = (a1 * 3 + b1 + 2) >> 2;
    d[2] = (a2 * 3 + b2 + 2) >> 2;
    d += 3;
    s += 4;
    t += 4;
  } while (d < dend);
}

// Filter rows 1 and 2 together, 1 : 1
static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* d, int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  uint8* dend = d + dst_width;
  do {
    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
    d[0] = (a0 + b0 + 1) >> 1;
    d[1] = (a1 + b1 + 1) >> 1;
    d[2] = (a2 + b2 + 1) >> 1;
    d += 3;
    s += 4;
    t += 4;
  } while (d < dend);
}

// (1-f)a + fb can be replaced with a + f(b-a)
#define BLENDER(a, b, f) (static_cast<int>(a) + \
    ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))

static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
                              int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width - 1; j += 2) {
    int xi = x >> 16;
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
    x += dx;
    xi = x >> 16;
    a = src_ptr[xi];
    b = src_ptr[xi + 1];
    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
    x += dx;
    dst_ptr += 2;
  }
  if (dst_width & 1) {
    int xi = x >> 16;
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
  }
}
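// Worked example for BLENDER/ScaleFilterCols_C above: x and dx are 16.16
// fixed point, so xi = x >> 16 picks the source pixel and f = x & 0xffff is
// the sub-pixel fraction. With a = 100, b = 200 and f = 0x8000 (one half),
// BLENDER(a, b, f) = 100 + ((0x8000 * 100) >> 16) = 150, the midpoint.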
dst_ptr + dst_width; 2158 do { 2159 dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; 2160 dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; 2161 dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; 2162 dst_ptr += 3; 2163 s += 4; 2164 } while (dst_ptr < dend); 2165 } 2166 2167 #define HAS_SCALEROWDOWN34_SSE2_DISABLED 2168 // Filter rows 0 and 1 together, 3 : 1 2169 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, 2170 ptrdiff_t src_stride, 2171 uint8* dst_ptr, int dst_width) { 2172 assert((dst_width % 3 == 0) && (dst_width > 0)); 2173 SIMD_ALIGNED(uint8 row[kMaxInputWidth]); 2174 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4); 2175 ScaleFilterCols34_C(dst_ptr, row, dst_width); 2176 } 2177 2178 // Filter rows 1 and 2 together, 1 : 1 2179 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, 2180 ptrdiff_t src_stride, 2181 uint8* dst_ptr, int dst_width) { 2182 assert((dst_width % 3 == 0) && (dst_width > 0)); 2183 SIMD_ALIGNED(uint8 row[kMaxInputWidth]); 2184 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); 2185 ScaleFilterCols34_C(dst_ptr, row, dst_width); 2186 } 2187 #endif 2188 2189 static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, 2190 uint8* dst, int dst_width) { 2191 assert(dst_width % 3 == 0); 2192 for (int x = 0; x < dst_width; x += 3) { 2193 dst[0] = src_ptr[0]; 2194 dst[1] = src_ptr[3]; 2195 dst[2] = src_ptr[6]; 2196 dst += 3; 2197 src_ptr += 8; 2198 } 2199 } 2200 2201 // 8x3 -> 3x1 2202 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, 2203 ptrdiff_t src_stride, 2204 uint8* dst_ptr, int dst_width) { 2205 assert((dst_width % 3 == 0) && (dst_width > 0)); 2206 intptr_t stride = src_stride; 2207 for (int i = 0; i < dst_width; i += 3) { 2208 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + 2209 src_ptr[stride + 0] + src_ptr[stride + 1] + 2210 src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + 2211 src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * 2212 (65536 / 9) >> 16; 2213 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + 2214 src_ptr[stride + 3] + src_ptr[stride + 4] + 2215 src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + 2216 src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * 2217 (65536 / 9) >> 16; 2218 dst_ptr[2] = (src_ptr[6] + src_ptr[7] + 2219 src_ptr[stride + 6] + src_ptr[stride + 7] + 2220 src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * 2221 (65536 / 6) >> 16; 2222 src_ptr += 8; 2223 dst_ptr += 3; 2224 } 2225 } 2226 2227 // 8x2 -> 3x1 2228 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride, 2229 uint8* dst_ptr, int dst_width) { 2230 assert((dst_width % 3 == 0) && (dst_width > 0)); 2231 intptr_t stride = src_stride; 2232 for (int i = 0; i < dst_width; i += 3) { 2233 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + 2234 src_ptr[stride + 0] + src_ptr[stride + 1] + 2235 src_ptr[stride + 2]) * (65536 / 6) >> 16; 2236 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + 2237 src_ptr[stride + 3] + src_ptr[stride + 4] + 2238 src_ptr[stride + 5]) * (65536 / 6) >> 16; 2239 dst_ptr[2] = (src_ptr[6] + src_ptr[7] + 2240 src_ptr[stride + 6] + src_ptr[stride + 7]) * 2241 (65536 / 4) >> 16; 2242 src_ptr += 8; 2243 dst_ptr += 3; 2244 } 2245 } 2246 2247 // C version 8x2 -> 8x1 2248 static void ScaleFilterRows_C(uint8* dst_ptr, 2249 const uint8* src_ptr, ptrdiff_t src_stride, 2250 int dst_width, int source_y_fraction) { 2251 assert(dst_width > 0); 2252 int y1_fraction = source_y_fraction; 2253 int y0_fraction = 256 - y1_fraction; 2254 const uint8* src_ptr1 = 
src_ptr + src_stride; 2255 uint8* end = dst_ptr + dst_width; 2256 do { 2257 dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; 2258 dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; 2259 dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; 2260 dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; 2261 dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; 2262 dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; 2263 dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; 2264 dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; 2265 src_ptr += 8; 2266 src_ptr1 += 8; 2267 dst_ptr += 8; 2268 } while (dst_ptr < end); 2269 dst_ptr[0] = dst_ptr[-1]; 2270 } 2271 2272 void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, 2273 uint16* dst_ptr, int src_width, int src_height) { 2274 assert(src_width > 0); 2275 assert(src_height > 0); 2276 for (int x = 0; x < src_width; ++x) { 2277 const uint8* s = src_ptr + x; 2278 int sum = 0; 2279 for (int y = 0; y < src_height; ++y) { 2280 sum += s[0]; 2281 s += src_stride; 2282 } 2283 dst_ptr[x] = sum; 2284 } 2285 } 2286 2287 /** 2288 * Scale plane, 1/2 2289 * 2290 * This is an optimized version for scaling down a plane to 1/2 of 2291 * its original size. 2292 * 2293 */ 2294 static void ScalePlaneDown2(int /* src_width */, int /* src_height */, 2295 int dst_width, int dst_height, 2296 int src_stride, int dst_stride, 2297 const uint8* src_ptr, uint8* dst_ptr, 2298 FilterMode filtering) { 2299 void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, 2300 uint8* dst_ptr, int dst_width) = 2301 filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; 2302 #if defined(HAS_SCALEROWDOWN2_NEON) 2303 if (TestCpuFlag(kCpuHasNEON) && 2304 IS_ALIGNED(dst_width, 16)) { 2305 ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; 2306 } 2307 #elif defined(HAS_SCALEROWDOWN2_SSE2) 2308 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { 2309 ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 : 2310 ScaleRowDown2_Unaligned_SSE2; 2311 if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && 2312 IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { 2313 ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; 2314 } 2315 } 2316 #endif 2317 2318 // TODO(fbarchard): Loop through source height to allow odd height. 2319 for (int y = 0; y < dst_height; ++y) { 2320 ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); 2321 src_ptr += (src_stride << 1); 2322 dst_ptr += dst_stride; 2323 } 2324 } 2325 2326 /** 2327 * Scale plane, 1/4 2328 * 2329 * This is an optimized version for scaling down a plane to 1/4 of 2330 * its original size. 2331 */ 2332 static void ScalePlaneDown4(int /* src_width */, int /* src_height */, 2333 int dst_width, int dst_height, 2334 int src_stride, int dst_stride, 2335 const uint8* src_ptr, uint8* dst_ptr, 2336 FilterMode filtering) { 2337 void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, 2338 uint8* dst_ptr, int dst_width) = 2339 filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; 2340 #if defined(HAS_SCALEROWDOWN4_NEON) 2341 if (TestCpuFlag(kCpuHasNEON) && 2342 IS_ALIGNED(dst_width, 4)) { 2343 ScaleRowDown4 = filtering ? 
ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; 2344 } 2345 #elif defined(HAS_SCALEROWDOWN4_SSE2) 2346 if (TestCpuFlag(kCpuHasSSE2) && 2347 IS_ALIGNED(dst_width, 8) && 2348 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { 2349 ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; 2350 } 2351 #endif 2352 2353 for (int y = 0; y < dst_height; ++y) { 2354 ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); 2355 src_ptr += (src_stride << 2); 2356 dst_ptr += dst_stride; 2357 } 2358 } 2359 2360 /** 2361 * Scale plane, 1/8 2362 * 2363 * This is an optimized version for scaling down a plane to 1/8 2364 * of its original size. 2365 * 2366 */ 2367 static void ScalePlaneDown8(int /* src_width */, int /* src_height */, 2368 int dst_width, int dst_height, 2369 int src_stride, int dst_stride, 2370 const uint8* src_ptr, uint8* dst_ptr, 2371 FilterMode filtering) { 2372 void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride, 2373 uint8* dst_ptr, int dst_width) = 2374 filtering && (dst_width <= kMaxOutputWidth) ? 2375 ScaleRowDown8Int_C : ScaleRowDown8_C; 2376 #if defined(HAS_SCALEROWDOWN8_SSE2) 2377 if (TestCpuFlag(kCpuHasSSE2) && 2378 IS_ALIGNED(dst_width, 4) && 2379 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { 2380 ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; 2381 } 2382 #endif 2383 2384 for (int y = 0; y < dst_height; ++y) { 2385 ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); 2386 src_ptr += (src_stride << 3); 2387 dst_ptr += dst_stride; 2388 } 2389 } 2390 2391 /** 2392 * Scale plane down, 3/4 2393 * 2394 * Provided by Frank Barchard (fbarchard (at) google.com) 2395 * 2396 */ 2397 static void ScalePlaneDown34(int /* src_width */, int /* src_height */, 2398 int dst_width, int dst_height, 2399 int src_stride, int dst_stride, 2400 const uint8* src_ptr, uint8* dst_ptr, 2401 FilterMode filtering) { 2402 assert(dst_width % 3 == 0); 2403 void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, 2404 uint8* dst_ptr, int dst_width); 2405 void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, 2406 uint8* dst_ptr, int dst_width); 2407 if (!filtering) { 2408 ScaleRowDown34_0 = ScaleRowDown34_C; 2409 ScaleRowDown34_1 = ScaleRowDown34_C; 2410 } else { 2411 ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; 2412 ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; 2413 } 2414 #if defined(HAS_SCALEROWDOWN34_NEON) 2415 if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { 2416 if (!filtering) { 2417 ScaleRowDown34_0 = ScaleRowDown34_NEON; 2418 ScaleRowDown34_1 = ScaleRowDown34_NEON; 2419 } else { 2420 ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; 2421 ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; 2422 } 2423 } 2424 #endif 2425 #if defined(HAS_SCALEROWDOWN34_SSE2) 2426 if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) && 2427 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) { 2428 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; 2429 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; 2430 } 2431 #endif 2432 #if defined(HAS_SCALEROWDOWN34_SSSE3) 2433 if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && 2434 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { 2435 if (!filtering) { 2436 ScaleRowDown34_0 = ScaleRowDown34_SSSE3; 2437 ScaleRowDown34_1 = ScaleRowDown34_SSSE3; 2438 } else { 2439 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; 2440 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; 2441 } 2442 } 2443 #endif 2444 2445 for (int y = 0; y < dst_height - 2; y += 3) { 2446 ScaleRowDown34_0(src_ptr, 
src_stride, dst_ptr, dst_width); 2447 src_ptr += src_stride; 2448 dst_ptr += dst_stride; 2449 ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); 2450 src_ptr += src_stride; 2451 dst_ptr += dst_stride; 2452 ScaleRowDown34_0(src_ptr + src_stride, -src_stride, 2453 dst_ptr, dst_width); 2454 src_ptr += src_stride * 2; 2455 dst_ptr += dst_stride; 2456 } 2457 2458 // Remainder 1 or 2 rows with last row vertically unfiltered 2459 if ((dst_height % 3) == 2) { 2460 ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); 2461 src_ptr += src_stride; 2462 dst_ptr += dst_stride; 2463 ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); 2464 } else if ((dst_height % 3) == 1) { 2465 ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); 2466 } 2467 } 2468 2469 /** 2470 * Scale plane, 3/8 2471 * 2472 * This is an optimized version for scaling down a plane to 3/8 2473 * of its original size. 2474 * 2475 * Uses box filter arranges like this 2476 * aaabbbcc -> abc 2477 * aaabbbcc def 2478 * aaabbbcc ghi 2479 * dddeeeff 2480 * dddeeeff 2481 * dddeeeff 2482 * ggghhhii 2483 * ggghhhii 2484 * Boxes are 3x3, 2x3, 3x2 and 2x2 2485 */ 2486 static void ScalePlaneDown38(int /* src_width */, int /* src_height */, 2487 int dst_width, int dst_height, 2488 int src_stride, int dst_stride, 2489 const uint8* src_ptr, uint8* dst_ptr, 2490 FilterMode filtering) { 2491 assert(dst_width % 3 == 0); 2492 void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, 2493 uint8* dst_ptr, int dst_width); 2494 void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, 2495 uint8* dst_ptr, int dst_width); 2496 if (!filtering) { 2497 ScaleRowDown38_3 = ScaleRowDown38_C; 2498 ScaleRowDown38_2 = ScaleRowDown38_C; 2499 } else { 2500 ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; 2501 ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; 2502 } 2503 #if defined(HAS_SCALEROWDOWN38_NEON) 2504 if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { 2505 if (!filtering) { 2506 ScaleRowDown38_3 = ScaleRowDown38_NEON; 2507 ScaleRowDown38_2 = ScaleRowDown38_NEON; 2508 } else { 2509 ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON; 2510 ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; 2511 } 2512 } 2513 #elif defined(HAS_SCALEROWDOWN38_SSSE3) 2514 if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && 2515 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { 2516 if (!filtering) { 2517 ScaleRowDown38_3 = ScaleRowDown38_SSSE3; 2518 ScaleRowDown38_2 = ScaleRowDown38_SSSE3; 2519 } else { 2520 ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; 2521 ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; 2522 } 2523 } 2524 #endif 2525 2526 for (int y = 0; y < dst_height - 2; y += 3) { 2527 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); 2528 src_ptr += src_stride * 3; 2529 dst_ptr += dst_stride; 2530 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); 2531 src_ptr += src_stride * 3; 2532 dst_ptr += dst_stride; 2533 ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); 2534 src_ptr += src_stride * 2; 2535 dst_ptr += dst_stride; 2536 } 2537 2538 // Remainder 1 or 2 rows with last row vertically unfiltered 2539 if ((dst_height % 3) == 2) { 2540 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); 2541 src_ptr += src_stride * 3; 2542 dst_ptr += dst_stride; 2543 ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); 2544 } else if ((dst_height % 3) == 1) { 2545 ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); 2546 } 2547 } 2548 2549 static __inline uint32 SumBox(int iboxwidth, int iboxheight, 2550 ptrdiff_t src_stride, const uint8* 
src_ptr) { 2551 assert(iboxwidth > 0); 2552 assert(iboxheight > 0); 2553 uint32 sum = 0u; 2554 for (int y = 0; y < iboxheight; ++y) { 2555 for (int x = 0; x < iboxwidth; ++x) { 2556 sum += src_ptr[x]; 2557 } 2558 src_ptr += src_stride; 2559 } 2560 return sum; 2561 } 2562 2563 static void ScalePlaneBoxRow_C(int dst_width, int boxheight, 2564 int x, int dx, ptrdiff_t src_stride, 2565 const uint8* src_ptr, uint8* dst_ptr) { 2566 for (int i = 0; i < dst_width; ++i) { 2567 int ix = x >> 16; 2568 x += dx; 2569 int boxwidth = (x >> 16) - ix; 2570 *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / 2571 (boxwidth * boxheight); 2572 } 2573 } 2574 2575 static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { 2576 assert(iboxwidth > 0); 2577 uint32 sum = 0u; 2578 for (int x = 0; x < iboxwidth; ++x) { 2579 sum += src_ptr[x]; 2580 } 2581 return sum; 2582 } 2583 2584 static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, 2585 const uint16* src_ptr, uint8* dst_ptr) { 2586 int scaletbl[2]; 2587 int minboxwidth = (dx >> 16); 2588 scaletbl[0] = 65536 / (minboxwidth * boxheight); 2589 scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); 2590 int *scaleptr = scaletbl - minboxwidth; 2591 for (int i = 0; i < dst_width; ++i) { 2592 int ix = x >> 16; 2593 x += dx; 2594 int boxwidth = (x >> 16) - ix; 2595 *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; 2596 } 2597 } 2598 2599 static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, 2600 const uint16* src_ptr, uint8* dst_ptr) { 2601 int boxwidth = (dx >> 16); 2602 int scaleval = 65536 / (boxwidth * boxheight); 2603 for (int i = 0; i < dst_width; ++i) { 2604 *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; 2605 x += boxwidth; 2606 } 2607 } 2608 2609 /** 2610 * Scale plane down to any dimensions, with interpolation 2611 * (box filter). 2612 * 2613 * Same stepping method as ScalePlaneSimple: fixed point (16.16) 2614 * is used to step through the source, and each destination pixel 2615 * is the simple average of the box of source pixels it covers. 2616 * 2617 */ 2618 static void ScalePlaneBox(int src_width, int src_height, 2619 int dst_width, int dst_height, 2620 int src_stride, int dst_stride, 2621 const uint8* src_ptr, uint8* dst_ptr) { 2622 assert(dst_width > 0); 2623 assert(dst_height > 0); 2624 int dx = (src_width << 16) / dst_width; 2625 int dy = (src_height << 16) / dst_height; 2626 int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); 2627 int y = (dy >= 65536) ?
((dy >> 1) - 32768) : (dy >> 1); 2628 int maxy = (src_height << 16); 2629 if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) || 2630 dst_height * 2 > src_height) { 2631 uint8* dst = dst_ptr; 2632 for (int j = 0; j < dst_height; ++j) { 2633 int iy = y >> 16; 2634 const uint8* src = src_ptr + iy * src_stride; 2635 y += dy; 2636 if (y > maxy) { 2637 y = maxy; 2638 } 2639 int boxheight = (y >> 16) - iy; 2640 ScalePlaneBoxRow_C(dst_width, boxheight, 2641 x, dx, src_stride, 2642 src, dst); 2643 dst += dst_stride; 2644 } 2645 } else { 2646 SIMD_ALIGNED(uint16 row[kMaxInputWidth]); 2647 void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride, 2648 uint16* dst_ptr, int src_width, int src_height)= 2649 ScaleAddRows_C; 2650 void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, 2651 const uint16* src_ptr, uint8* dst_ptr); 2652 if (dx & 0xffff) { 2653 ScaleAddCols = ScaleAddCols2_C; 2654 } else { 2655 ScaleAddCols = ScaleAddCols1_C; 2656 } 2657 #if defined(HAS_SCALEADDROWS_SSE2) 2658 if (TestCpuFlag(kCpuHasSSE2) && 2659 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) { 2660 ScaleAddRows = ScaleAddRows_SSE2; 2661 } 2662 #endif 2663 2664 for (int j = 0; j < dst_height; ++j) { 2665 int iy = y >> 16; 2666 const uint8* src = src_ptr + iy * src_stride; 2667 y += dy; 2668 if (y > (src_height << 16)) { 2669 y = (src_height << 16); 2670 } 2671 int boxheight = (y >> 16) - iy; 2672 ScaleAddRows(src, src_stride, row, src_width, boxheight); 2673 ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr); 2674 dst_ptr += dst_stride; 2675 } 2676 } 2677 } 2678 2679 /** 2680 * Scale plane to/from any dimensions, with interpolation. 2681 */ 2682 static void ScalePlaneBilinearSimple(int src_width, int src_height, 2683 int dst_width, int dst_height, 2684 int src_stride, int dst_stride, 2685 const uint8* src_ptr, uint8* dst_ptr) { 2686 int dx = (src_width << 16) / dst_width; 2687 int dy = (src_height << 16) / dst_height; 2688 int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1); 2689 int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0; 2690 int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; 2691 for (int i = 0; i < dst_height; ++i) { 2692 int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); 2693 int yi = y >> 16; 2694 int yf = y & 0xffff; 2695 const uint8* src0 = src_ptr + yi * src_stride; 2696 const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0; 2697 uint8* dst = dst_ptr; 2698 for (int j = 0; j < dst_width; ++j) { 2699 int xi = x >> 16; 2700 int xf = x & 0xffff; 2701 int x1 = (xi < src_width - 1) ? xi + 1 : xi; 2702 int a = src0[xi]; 2703 int b = src0[x1]; 2704 int r0 = BLENDER(a, b, xf); 2705 a = src1[xi]; 2706 b = src1[x1]; 2707 int r1 = BLENDER(a, b, xf); 2708 *dst++ = BLENDER(r0, r1, yf); 2709 x += dx; 2710 if (x > maxx) 2711 x = maxx; 2712 } 2713 dst_ptr += dst_stride; 2714 y += dy; 2715 if (y > maxy) 2716 y = maxy; 2717 } 2718 } 2719 2720 /** 2721 * Scale plane to/from any dimensions, with bilinear 2722 * interpolation. 
2723 */ 2724 void ScalePlaneBilinear(int src_width, int src_height, 2725 int dst_width, int dst_height, 2726 int src_stride, int dst_stride, 2727 const uint8* src_ptr, uint8* dst_ptr) { 2728 assert(dst_width > 0); 2729 assert(dst_height > 0); 2730 if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) { 2731 ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, 2732 src_stride, dst_stride, src_ptr, dst_ptr); 2733 2734 } else { 2735 SIMD_ALIGNED(uint8 row[kMaxInputWidth + 16]); 2736 void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, 2737 ptrdiff_t src_stride, 2738 int dst_width, int source_y_fraction) = 2739 ScaleFilterRows_C; 2740 #if defined(HAS_SCALEFILTERROWS_NEON) 2741 if (TestCpuFlag(kCpuHasNEON)) { 2742 ScaleFilterRows = ScaleFilterRows_NEON; 2743 } 2744 #endif 2745 #if defined(HAS_SCALEFILTERROWS_SSE2) 2746 if (TestCpuFlag(kCpuHasSSE2) && 2747 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) { 2748 ScaleFilterRows = ScaleFilterRows_SSE2; 2749 } 2750 #endif 2751 #if defined(HAS_SCALEFILTERROWS_SSSE3) 2752 if (TestCpuFlag(kCpuHasSSSE3) && 2753 IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) { 2754 ScaleFilterRows = ScaleFilterRows_SSSE3; 2755 } 2756 #endif 2757 2758 int dx = (src_width << 16) / dst_width; 2759 int dy = (src_height << 16) / dst_height; 2760 int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); 2761 int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1); 2762 int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; 2763 for (int j = 0; j < dst_height; ++j) { 2764 int yi = y >> 16; 2765 int yf = (y >> 8) & 255; 2766 const uint8* src = src_ptr + yi * src_stride; 2767 ScaleFilterRows(row, src, src_stride, src_width, yf); 2768 ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx); 2769 dst_ptr += dst_stride; 2770 y += dy; 2771 if (y > maxy) { 2772 y = maxy; 2773 } 2774 } 2775 } 2776 } 2777 2778 /** 2779 * Scale plane to/from any dimensions, without interpolation. 2780 * Fixed point math is used for performance: The upper 16 bits 2781 * of x and dx is the integer part of the source position and 2782 * the lower 16 bits are the fixed decimal part. 2783 */ 2784 static void ScalePlaneSimple(int src_width, int src_height, 2785 int dst_width, int dst_height, 2786 int src_stride, int dst_stride, 2787 const uint8* src_ptr, uint8* dst_ptr) { 2788 int dx = (src_width << 16) / dst_width; 2789 int dy = (src_height << 16) / dst_height; 2790 int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1); 2791 for (int j = 0; j < dst_height; ++j) { 2792 int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); 2793 int yi = y >> 16; 2794 const uint8* src = src_ptr + yi * src_stride; 2795 uint8* dst = dst_ptr; 2796 for (int i = 0; i < dst_width; ++i) { 2797 *dst++ = src[x >> 16]; 2798 x += dx; 2799 } 2800 dst_ptr += dst_stride; 2801 y += dy; 2802 } 2803 } 2804 2805 /** 2806 * Scale plane to/from any dimensions. 
2807 */ 2808 static void ScalePlaneAnySize(int src_width, int src_height, 2809 int dst_width, int dst_height, 2810 int src_stride, int dst_stride, 2811 const uint8* src_ptr, uint8* dst_ptr, 2812 FilterMode filtering) { 2813 if (!filtering) { 2814 ScalePlaneSimple(src_width, src_height, dst_width, dst_height, 2815 src_stride, dst_stride, src_ptr, dst_ptr); 2816 } else { 2817 // Fall back to the non-optimized version. 2818 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, 2819 src_stride, dst_stride, src_ptr, dst_ptr); 2820 } 2821 } 2822 2823 /** 2824 * Scale plane down, any size 2825 * 2826 * This is an optimized version for scaling down a plane to any size. 2827 * The current implementation is ~10 times faster compared to the 2828 * reference implementation for e.g. XGA->LowResPAL 2829 * 2830 */ 2831 static void ScalePlaneDown(int src_width, int src_height, 2832 int dst_width, int dst_height, 2833 int src_stride, int dst_stride, 2834 const uint8* src_ptr, uint8* dst_ptr, 2835 FilterMode filtering) { 2836 if (!filtering) { 2837 ScalePlaneSimple(src_width, src_height, dst_width, dst_height, 2838 src_stride, dst_stride, src_ptr, dst_ptr); 2839 } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) { 2840 // between 1/2x and 1x use bilinear 2841 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, 2842 src_stride, dst_stride, src_ptr, dst_ptr); 2843 } else { 2844 ScalePlaneBox(src_width, src_height, dst_width, dst_height, 2845 src_stride, dst_stride, src_ptr, dst_ptr); 2846 } 2847 } 2848 2849 // Scale a plane. 2850 // This function in turn calls a scaling function suitable for handling 2851 // the desired resolutions. 2852 2853 LIBYUV_API 2854 void ScalePlane(const uint8* src, int src_stride, 2855 int src_width, int src_height, 2856 uint8* dst, int dst_stride, 2857 int dst_width, int dst_height, 2858 FilterMode filtering) { 2859 #ifdef CPU_X86 2860 // Environment variable override for testing. 2861 char *filter_override = getenv("LIBYUV_FILTER"); 2862 if (filter_override) { 2863 filtering = (FilterMode)atoi(filter_override); // NOLINT 2864 } 2865 #endif 2866 // Use specialized scales to improve performance for common resolutions. 2867 // For example, all the 1/2 scalings will use ScalePlaneDown2() 2868 if (dst_width == src_width && dst_height == src_height) { 2869 // Straight copy. 2870 CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); 2871 } else if (dst_width <= src_width && dst_height <= src_height) { 2872 // Scale down. 2873 if (use_reference_impl_) { 2874 // For testing, allow the optimized versions to be disabled. 2875 ScalePlaneDown(src_width, src_height, dst_width, dst_height, 2876 src_stride, dst_stride, src, dst, filtering); 2877 } else if (4 * dst_width == 3 * src_width && 2878 4 * dst_height == 3 * src_height) { 2879 // optimized, 3/4 2880 ScalePlaneDown34(src_width, src_height, dst_width, dst_height, 2881 src_stride, dst_stride, src, dst, filtering); 2882 } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { 2883 // optimized, 1/2 2884 ScalePlaneDown2(src_width, src_height, dst_width, dst_height, 2885 src_stride, dst_stride, src, dst, filtering); 2886 // 3/8 rounded up for odd sized chroma height.
2887 } else if (8 * dst_width == 3 * src_width && 2888 dst_height == ((src_height * 3 + 7) / 8)) { 2889 // optimized, 3/8 2890 ScalePlaneDown38(src_width, src_height, dst_width, dst_height, 2891 src_stride, dst_stride, src, dst, filtering); 2892 } else if (4 * dst_width == src_width && 4 * dst_height == src_height && 2893 filtering != kFilterBilinear) { 2894 // optimized, 1/4 2895 ScalePlaneDown4(src_width, src_height, dst_width, dst_height, 2896 src_stride, dst_stride, src, dst, filtering); 2897 } else if (8 * dst_width == src_width && 8 * dst_height == src_height && 2898 filtering != kFilterBilinear) { 2899 // optimized, 1/8 2900 ScalePlaneDown8(src_width, src_height, dst_width, dst_height, 2901 src_stride, dst_stride, src, dst, filtering); 2902 } else { 2903 // Arbitrary downsample 2904 ScalePlaneDown(src_width, src_height, dst_width, dst_height, 2905 src_stride, dst_stride, src, dst, filtering); 2906 } 2907 } else { 2908 // Arbitrary scale up and/or down. 2909 ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, 2910 src_stride, dst_stride, src, dst, filtering); 2911 } 2912 } 2913 2914 // Scale an I420 image. 2915 // This function in turn calls a scaling function for each plane. 2916 2917 #define UNDER_ALLOCATED_HACK 1 2918 2919 LIBYUV_API 2920 int I420Scale(const uint8* src_y, int src_stride_y, 2921 const uint8* src_u, int src_stride_u, 2922 const uint8* src_v, int src_stride_v, 2923 int src_width, int src_height, 2924 uint8* dst_y, int dst_stride_y, 2925 uint8* dst_u, int dst_stride_u, 2926 uint8* dst_v, int dst_stride_v, 2927 int dst_width, int dst_height, 2928 FilterMode filtering) { 2929 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || 2930 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { 2931 return -1; 2932 } 2933 // Negative height means invert the image. 2934 if (src_height < 0) { 2935 src_height = -src_height; 2936 int halfheight = (src_height + 1) >> 1; 2937 src_y = src_y + (src_height - 1) * src_stride_y; 2938 src_u = src_u + (halfheight - 1) * src_stride_u; 2939 src_v = src_v + (halfheight - 1) * src_stride_v; 2940 src_stride_y = -src_stride_y; 2941 src_stride_u = -src_stride_u; 2942 src_stride_v = -src_stride_v; 2943 } 2944 int src_halfwidth = (src_width + 1) >> 1; 2945 int src_halfheight = (src_height + 1) >> 1; 2946 int dst_halfwidth = (dst_width + 1) >> 1; 2947 int dst_halfheight = (dst_height + 1) >> 1; 2948 2949 #ifdef UNDER_ALLOCATED_HACK 2950 // If caller passed width / 2 for stride, adjust halfwidth to match. 2951 if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) { 2952 src_halfwidth = src_width >> 1; 2953 } 2954 if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) { 2955 dst_halfwidth = dst_width >> 1; 2956 } 2957 // If caller used height / 2 when computing src_v, it will point into what 2958 // should be the src_u plane. Detect this and reduce halfheight to match. 
2959 int uv_src_plane_size = src_halfwidth * src_halfheight; 2960 if ((src_height & 1) && 2961 (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) { 2962 src_halfheight = src_height >> 1; 2963 } 2964 int uv_dst_plane_size = dst_halfwidth * dst_halfheight; 2965 if ((dst_height & 1) && 2966 (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) { 2967 dst_halfheight = dst_height >> 1; 2968 } 2969 #endif 2970 2971 ScalePlane(src_y, src_stride_y, src_width, src_height, 2972 dst_y, dst_stride_y, dst_width, dst_height, 2973 filtering); 2974 ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, 2975 dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, 2976 filtering); 2977 ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, 2978 dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, 2979 filtering); 2980 return 0; 2981 } 2982 2983 // Deprecated api 2984 LIBYUV_API 2985 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, 2986 int src_stride_y, int src_stride_u, int src_stride_v, 2987 int src_width, int src_height, 2988 uint8* dst_y, uint8* dst_u, uint8* dst_v, 2989 int dst_stride_y, int dst_stride_u, int dst_stride_v, 2990 int dst_width, int dst_height, 2991 bool interpolate) { 2992 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || 2993 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { 2994 return -1; 2995 } 2996 // Negative height means invert the image. 2997 if (src_height < 0) { 2998 src_height = -src_height; 2999 int halfheight = (src_height + 1) >> 1; 3000 src_y = src_y + (src_height - 1) * src_stride_y; 3001 src_u = src_u + (halfheight - 1) * src_stride_u; 3002 src_v = src_v + (halfheight - 1) * src_stride_v; 3003 src_stride_y = -src_stride_y; 3004 src_stride_u = -src_stride_u; 3005 src_stride_v = -src_stride_v; 3006 } 3007 int src_halfwidth = (src_width + 1) >> 1; 3008 int src_halfheight = (src_height + 1) >> 1; 3009 int dst_halfwidth = (dst_width + 1) >> 1; 3010 int dst_halfheight = (dst_height + 1) >> 1; 3011 FilterMode filtering = interpolate ? kFilterBox : kFilterNone; 3012 3013 #ifdef UNDER_ALLOCATED_HACK 3014 // If caller passed width / 2 for stride, adjust halfwidth to match. 3015 if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) { 3016 src_halfwidth = src_width >> 1; 3017 } 3018 if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) { 3019 dst_halfwidth = dst_width >> 1; 3020 } 3021 // If caller used height / 2 when computing src_v, it will point into what 3022 // should be the src_u plane. Detect this and reduce halfheight to match. 
3023 int uv_src_plane_size = src_halfwidth * src_halfheight; 3024 if ((src_height & 1) && 3025 (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) { 3026 src_halfheight = src_height >> 1; 3027 } 3028 int uv_dst_plane_size = dst_halfwidth * dst_halfheight; 3029 if ((dst_height & 1) && 3030 (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) { 3031 dst_halfheight = dst_height >> 1; 3032 } 3033 #endif 3034 3035 ScalePlane(src_y, src_stride_y, src_width, src_height, 3036 dst_y, dst_stride_y, dst_width, dst_height, 3037 filtering); 3038 ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, 3039 dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, 3040 filtering); 3041 ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, 3042 dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, 3043 filtering); 3044 return 0; 3045 } 3046 3047 // Deprecated api 3048 LIBYUV_API 3049 int ScaleOffset(const uint8* src, int src_width, int src_height, 3050 uint8* dst, int dst_width, int dst_height, int dst_yoffset, 3051 bool interpolate) { 3052 if (!src || src_width <= 0 || src_height <= 0 || 3053 !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 || 3054 dst_yoffset >= dst_height) { 3055 return -1; 3056 } 3057 dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2. 3058 int src_halfwidth = (src_width + 1) >> 1; 3059 int src_halfheight = (src_height + 1) >> 1; 3060 int dst_halfwidth = (dst_width + 1) >> 1; 3061 int dst_halfheight = (dst_height + 1) >> 1; 3062 int aheight = dst_height - dst_yoffset * 2; // actual output height 3063 const uint8* src_y = src; 3064 const uint8* src_u = src + src_width * src_height; 3065 const uint8* src_v = src + src_width * src_height + 3066 src_halfwidth * src_halfheight; 3067 uint8* dst_y = dst + dst_yoffset * dst_width; 3068 uint8* dst_u = dst + dst_width * dst_height + 3069 (dst_yoffset >> 1) * dst_halfwidth; 3070 uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + 3071 (dst_yoffset >> 1) * dst_halfwidth; 3072 return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth, 3073 src_width, src_height, dst_y, dst_u, dst_v, dst_width, 3074 dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate); 3075 } 3076 3077 #ifdef __cplusplus 3078 } // extern "C" 3079 } // namespace libyuv 3080 #endif 3081
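// Example of driving I420Scale() on a single contiguous I420 buffer
// (illustrative only; ScaleI420Buffer and the buffers are hypothetical and
// not part of the library). The plane layout and (width + 1) / 2 chroma
// strides mirror the arithmetic ScaleOffset() uses above.
//
//   int ScaleI420Buffer(const uint8* src, int src_width, int src_height,
//                       uint8* dst, int dst_width, int dst_height) {
//     int src_hw = (src_width + 1) >> 1, src_hh = (src_height + 1) >> 1;
//     int dst_hw = (dst_width + 1) >> 1, dst_hh = (dst_height + 1) >> 1;
//     const uint8* src_y = src;                             // Y plane
//     const uint8* src_u = src_y + src_width * src_height;  // U plane
//     const uint8* src_v = src_u + src_hw * src_hh;         // V plane
//     uint8* dst_y = dst;
//     uint8* dst_u = dst_y + dst_width * dst_height;
//     uint8* dst_v = dst_u + dst_hw * dst_hh;
//     return I420Scale(src_y, src_width, src_u, src_hw, src_v, src_hw,
//                      src_width, src_height,
//                      dst_y, dst_width, dst_u, dst_hw, dst_v, dst_hw,
//                      dst_width, dst_height, kFilterBilinear);
//   }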