/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"

#include <assert.h>
#include <string.h>

#include "libyuv/cpu_id.h"

#if defined(_MSC_VER)
#define ALIGN16(var) __declspec(align(16)) var
#else
#define ALIGN16(var) var __attribute__((aligned(16)))
#endif

// Note: A Neon reference manual
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
// Note: Some SSE2 reference manuals
// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf

namespace libyuv {

// Set the following flag to true to use only the reference implementation
// ScalePlaneBox(), and NOT the optimized versions. Useful for debugging and
// for comparing the quality of the YUV planes produced by the optimized and
// non-optimized versions.

static bool use_reference_impl_ = false;

void SetUseReferenceImpl(bool use) {
  use_reference_impl_ = use;
}
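// For orientation, the optimized row functions below all follow the same
// contract: read a run of source pixels (from one or more rows) and write one
// downscaled destination row. The sketch below is a hedged, illustrative
// scalar equivalent of the "/2 with box filter" row functions
// (ScaleRowDown2Int_*); the function name is made up for illustration and is
// not referenced anywhere in this file.
static inline void ScaleRowDown2IntReference_C(const uint8* src_ptr,
                                               int src_stride,
                                               uint8* dst, int dst_width) {
  const uint8* s = src_ptr;               // two adjacent pixels of row 0
  const uint8* t = src_ptr + src_stride;  // matching pixels of row 1
  for (int x = 0; x < dst_width; ++x) {
    // Rounded average of a 2x2 block of source pixels.
    dst[x] = static_cast<uint8>((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}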
/**
 * NEON downscalers with interpolation.
 *
 * Provided by Fritz Koenig
 *
 */

#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
                        uint8* dst, int dst_width) {
  __asm__ volatile
  (
    "1:\n"
    "vld2.u8    {q0,q1}, [%0]!    \n"  // load even pixels into q0, odd into q1
    "vst1.u8    {q0}, [%1]!       \n"  // store even pixels
    "subs       %2, %2, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(src_ptr),     // %0
      "+r"(dst),         // %1
      "+r"(dst_width)    // %2
    :
    : "q0", "q1"         // Clobber List
  );
}

void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
                           uint8* dst, int dst_width) {
  __asm__ volatile
  (
    "mov        r4, #2            \n"  // rounding constant
    "add        %1, %0            \n"  // change the stride to row 2 pointer
    "vdup.16    q4, r4            \n"
    "1:\n"
    "vld1.u8    {q0,q1}, [%0]!    \n"  // load row 1 and post increment
    "vld1.u8    {q2,q3}, [%1]!    \n"  // load row 2 and post increment
    "vpaddl.u8  q0, q0            \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1            \n"
    "vpadal.u8  q0, q2            \n"  // row 2 add adjacent, add row 1 to row 2
    "vpadal.u8  q1, q3            \n"
    "vadd.u16   q0, q4            \n"  // rounding
    "vadd.u16   q1, q4            \n"
    "vshrn.u16  d0, q0, #2        \n"  // downshift and pack
    "vshrn.u16  d1, q1, #2        \n"
    "vst1.u8    {q0}, [%2]!       \n"
    "subs       %3, %3, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(src_ptr),      // %0
      "+r"(src_stride),   // %1
      "+r"(dst),          // %2
      "+r"(dst_width)     // %3
    :
    : "r4", "q0", "q1", "q2", "q3", "q4"  // Clobber List
  );
}

#define HAS_SCALEROWDOWN4_NEON
// Expecting widths on ARM devices to be smaller, so 8x4 blocks were chosen to
// get the most coverage. Revisit later to evaluate 16x4 blocks with handling
// of leftovers.
static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
                               uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "mov        r4, #4            \n"
    "1:                           \n"
    "vld1.u8    {d0[0]}, [%0],r4  \n"  // load up only 2 pixels of data to
    "vld1.u8    {d0[1]}, [%0],r4  \n"  // represent the entire 8x4 block

    "vst1.u16   {d0[0]}, [%1]!    \n"

    "subs       %2, #2            \n"  // dst_width -= 2
    "bhi        1b                \n"
    : "+r"(src_ptr),      // %0
      "+r"(dst_ptr),      // %1
      "+r"(dst_width)     // %2
    :
    : "r4", "q0", "q1", "memory", "cc"
  );
}

static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "1:                           \n"
    "mov        r4, %0            \n"
    "vld1.u8    {d0}, [r4],%3     \n"  // load up 8x4 block of input data
    "vld1.u8    {d1}, [r4],%3     \n"
    "vld1.u8    {d2}, [r4],%3     \n"
    "vld1.u8    {d3}, [r4]        \n"

    // data is loaded up into q0 and q1
    // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
    // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a30 a31 a32 a33 b30 b31 b32 b33
    // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
    "vpaddl.u8  q0, q0            \n"

    // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
    // d1 = a10+a11+a30+a31 a12+a13+a32+a33 b10+b11+b30+b31 b12+b13+b32+b33
    "vpadal.u8  q0, q1            \n"

    // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
    // d1 = a10+a11+a30+a31+a12+a13+a32+a33 b10+b11+b30+b31+b12+b13+b32+b33
    "vpaddl.u16 q0, q0            \n"

    // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a30+a31+a12+a13+a32+a33
    //      b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b30+b31+b12+b13+b32+b33
    "vadd.u32   d0, d1            \n"

    "vrshr.u32  d0, d0, #4        \n"  // divide by 16 w/rounding

    "vst1.u8    {d0[0]}, [%1]!    \n"
    "vst1.u8    {d0[4]}, [%1]!    \n"

    "add        %0, #8            \n"  // move src pointer to next 8 pixels
    "subs       %2, #2            \n"  // dst_width -= 2
    "bhi        1b                \n"
    : "+r"(src_ptr),      // %0
      "+r"(dst_ptr),      // %1
      "+r"(dst_width)     // %2
    : "r"(src_stride)     // %3
    : "r4", "q0", "q1", "memory", "cc"
  );
}

/**
 * SSE2 downscalers with interpolation.
 *
 * Provided by Frank Barchard (fbarchard (at) google.com)
 *
 */

// Constants for SSE2 code
#elif (defined(WIN32) || defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !TARGET_IPHONE_SIMULATOR
#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(OSX)
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
#endif

// Offsets for source bytes 0 to 9
extern "C" TALIGN16(const uint8, shuf0[16]) =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
extern "C" TALIGN16(const uint8, shuf1[16]) =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
extern "C" TALIGN16(const uint8, shuf2[16]) =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
extern "C" TALIGN16(const uint8, shuf01[16]) =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
extern "C" TALIGN16(const uint8, shuf11[16]) =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
extern "C" TALIGN16(const uint8, shuf21[16]) =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
extern "C" TALIGN16(const uint8, madd01[16]) =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
extern "C" TALIGN16(const uint8, madd11[16]) =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
extern "C" TALIGN16(const uint8, madd21[16]) =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >> 2 in the 3/4 scalers.
extern "C" TALIGN16(const int16, round34[8]) =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

extern "C" TALIGN16(const uint8, shuf38a[16]) =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

extern "C" TALIGN16(const uint8, shuf38b[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
extern "C" TALIGN16(const uint8, shufac0[16]) =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
extern "C" TALIGN16(const uint8, shufac3[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
extern "C" TALIGN16(const uint16, scaleac3[8]) =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab0[16]) =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab1[16]) =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab2[16]) =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
extern "C" TALIGN16(const uint16, scaleab2[8]) =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif

#if defined(WIN32) && !defined(COVERAGE_ENABLED)

#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width
    pcmpeqb    xmm7, xmm7             // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
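// In scalar terms each output pixel below is an average of a 2x2 block of
// source pixels: the pavgb pair averages the two rows and the pavgw step
// averages adjacent columns, which approximates (a + b + c + d + 2) >> 2 up
// to a small difference in rounding.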
297 __declspec(naked) 298 static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, 299 uint8* dst_ptr, int dst_width) { 300 __asm { 301 push esi 302 mov eax, [esp + 4 + 4] // src_ptr 303 mov esi, [esp + 4 + 8] // src_stride 304 mov edx, [esp + 4 + 12] // dst_ptr 305 mov ecx, [esp + 4 + 16] // dst_width 306 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff 307 psrlw xmm7, 8 308 309 wloop: 310 movdqa xmm0, [eax] 311 movdqa xmm1, [eax + 16] 312 movdqa xmm2, [eax + esi] 313 movdqa xmm3, [eax + esi + 16] 314 lea eax, [eax + 32] 315 pavgb xmm0, xmm2 // average rows 316 pavgb xmm1, xmm3 317 318 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 319 psrlw xmm0, 8 320 movdqa xmm3, xmm1 321 psrlw xmm1, 8 322 pand xmm2, xmm7 323 pand xmm3, xmm7 324 pavgw xmm0, xmm2 325 pavgw xmm1, xmm3 326 packuswb xmm0, xmm1 327 328 movdqa [edx], xmm0 329 lea edx, [edx + 16] 330 sub ecx, 16 331 ja wloop 332 333 pop esi 334 ret 335 } 336 } 337 338 #define HAS_SCALEROWDOWN4_SSE2 339 // Point samples 32 pixels to 8 pixels. 340 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 341 __declspec(naked) 342 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, 343 uint8* dst_ptr, int dst_width) { 344 __asm { 345 pushad 346 mov esi, [esp + 32 + 4] // src_ptr 347 // src_stride ignored 348 mov edi, [esp + 32 + 12] // dst_ptr 349 mov ecx, [esp + 32 + 16] // dst_width 350 pcmpeqb xmm7, xmm7 // generate mask 0x000000ff 351 psrld xmm7, 24 352 353 wloop: 354 movdqa xmm0, [esi] 355 movdqa xmm1, [esi + 16] 356 lea esi, [esi + 32] 357 pand xmm0, xmm7 358 pand xmm1, xmm7 359 packuswb xmm0, xmm1 360 packuswb xmm0, xmm0 361 movq qword ptr [edi], xmm0 362 lea edi, [edi + 8] 363 sub ecx, 8 364 ja wloop 365 366 popad 367 ret 368 } 369 } 370 371 // Blends 32x4 rectangle to 8x1. 372 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 373 __declspec(naked) 374 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, 375 uint8* dst_ptr, int dst_width) { 376 __asm { 377 pushad 378 mov esi, [esp + 32 + 4] // src_ptr 379 mov ebx, [esp + 32 + 8] // src_stride 380 mov edi, [esp + 32 + 12] // dst_ptr 381 mov ecx, [esp + 32 + 16] // dst_width 382 pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff 383 psrlw xmm7, 8 384 lea edx, [ebx + ebx * 2] // src_stride * 3 385 386 wloop: 387 movdqa xmm0, [esi] 388 movdqa xmm1, [esi + 16] 389 movdqa xmm2, [esi + ebx] 390 movdqa xmm3, [esi + ebx + 16] 391 pavgb xmm0, xmm2 // average rows 392 pavgb xmm1, xmm3 393 movdqa xmm2, [esi + ebx * 2] 394 movdqa xmm3, [esi + ebx * 2 + 16] 395 movdqa xmm4, [esi + edx] 396 movdqa xmm5, [esi + edx + 16] 397 lea esi, [esi + 32] 398 pavgb xmm2, xmm4 399 pavgb xmm3, xmm5 400 pavgb xmm0, xmm2 401 pavgb xmm1, xmm3 402 403 movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 404 psrlw xmm0, 8 405 movdqa xmm3, xmm1 406 psrlw xmm1, 8 407 pand xmm2, xmm7 408 pand xmm3, xmm7 409 pavgw xmm0, xmm2 410 pavgw xmm1, xmm3 411 packuswb xmm0, xmm1 412 413 movdqa xmm2, xmm0 // average columns (16 to 8 pixels) 414 psrlw xmm0, 8 415 pand xmm2, xmm7 416 pavgw xmm0, xmm2 417 packuswb xmm0, xmm0 418 419 movq qword ptr [edi], xmm0 420 lea edi, [edi + 8] 421 sub ecx, 8 422 ja wloop 423 424 popad 425 ret 426 } 427 } 428 429 #define HAS_SCALEROWDOWN8_SSE2 430 // Point samples 32 pixels to 4 pixels. 431 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 
432 __declspec(naked) 433 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, 434 uint8* dst_ptr, int dst_width) { 435 __asm { 436 pushad 437 mov esi, [esp + 32 + 4] // src_ptr 438 // src_stride ignored 439 mov edi, [esp + 32 + 12] // dst_ptr 440 mov ecx, [esp + 32 + 16] // dst_width 441 pcmpeqb xmm7, xmm7 // generate mask isolating 1 src 8 bytes 442 psrlq xmm7, 56 443 444 wloop: 445 movdqa xmm0, [esi] 446 movdqa xmm1, [esi + 16] 447 lea esi, [esi + 32] 448 pand xmm0, xmm7 449 pand xmm1, xmm7 450 packuswb xmm0, xmm1 // 32->16 451 packuswb xmm0, xmm0 // 16->8 452 packuswb xmm0, xmm0 // 8->4 453 movd dword ptr [edi], xmm0 454 lea edi, [edi + 4] 455 sub ecx, 4 456 ja wloop 457 458 popad 459 ret 460 } 461 } 462 463 // Blends 32x8 rectangle to 4x1. 464 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. 465 __declspec(naked) 466 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, 467 uint8* dst_ptr, int dst_width) { 468 __asm { 469 pushad 470 mov esi, [esp + 32 + 4] // src_ptr 471 mov ebx, [esp + 32 + 8] // src_stride 472 mov edi, [esp + 32 + 12] // dst_ptr 473 mov ecx, [esp + 32 + 16] // dst_width 474 lea edx, [ebx + ebx * 2] // src_stride * 3 475 pxor xmm7, xmm7 476 477 wloop: 478 movdqa xmm0, [esi] // average 8 rows to 1 479 movdqa xmm1, [esi + 16] 480 movdqa xmm2, [esi + ebx] 481 movdqa xmm3, [esi + ebx + 16] 482 pavgb xmm0, xmm2 483 pavgb xmm1, xmm3 484 movdqa xmm2, [esi + ebx * 2] 485 movdqa xmm3, [esi + ebx * 2 + 16] 486 movdqa xmm4, [esi + edx] 487 movdqa xmm5, [esi + edx + 16] 488 lea ebp, [esi + ebx * 4] 489 lea esi, [esi + 32] 490 pavgb xmm2, xmm4 491 pavgb xmm3, xmm5 492 pavgb xmm0, xmm2 493 pavgb xmm1, xmm3 494 495 movdqa xmm2, [ebp] 496 movdqa xmm3, [ebp + 16] 497 movdqa xmm4, [ebp + ebx] 498 movdqa xmm5, [ebp + ebx + 16] 499 pavgb xmm2, xmm4 500 pavgb xmm3, xmm5 501 movdqa xmm4, [ebp + ebx * 2] 502 movdqa xmm5, [ebp + ebx * 2 + 16] 503 movdqa xmm6, [ebp + edx] 504 pavgb xmm4, xmm6 505 movdqa xmm6, [ebp + edx + 16] 506 pavgb xmm5, xmm6 507 pavgb xmm2, xmm4 508 pavgb xmm3, xmm5 509 pavgb xmm0, xmm2 510 pavgb xmm1, xmm3 511 512 psadbw xmm0, xmm7 // average 32 pixels to 4 513 psadbw xmm1, xmm7 514 pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 515 pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx 516 por xmm0, xmm1 // -> 3201 517 psrlw xmm0, 3 518 packuswb xmm0, xmm0 519 packuswb xmm0, xmm0 520 movd dword ptr [edi], xmm0 521 522 lea edi, [edi + 4] 523 sub ecx, 4 524 ja wloop 525 526 popad 527 ret 528 } 529 } 530 531 #define HAS_SCALEROWDOWN34_SSSE3 532 // Point samples 32 pixels to 24 pixels. 533 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. 534 // Then shuffled to do the scaling. 535 536 // Note that movdqa+palign may be better than movdqu. 537 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
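// In scalar terms this is 3/4 point sampling: the shuf0/shuf1/shuf2 tables
// defined above keep three out of every four source bytes (one pixel in each
// group of four is dropped), so each loop turns 32 source pixels into 24
// destination pixels without any filtering.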
538 __declspec(naked) 539 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, 540 uint8* dst_ptr, int dst_width) { 541 __asm { 542 pushad 543 mov esi, [esp + 32 + 4] // src_ptr 544 // src_stride ignored 545 mov edi, [esp + 32 + 12] // dst_ptr 546 mov ecx, [esp + 32 + 16] // dst_width 547 movdqa xmm3, _shuf0 548 movdqa xmm4, _shuf1 549 movdqa xmm5, _shuf2 550 551 wloop: 552 movdqa xmm0, [esi] 553 movdqa xmm2, [esi + 16] 554 lea esi, [esi + 32] 555 movdqa xmm1, xmm2 556 palignr xmm1, xmm0, 8 557 pshufb xmm0, xmm3 558 pshufb xmm1, xmm4 559 pshufb xmm2, xmm5 560 movq qword ptr [edi], xmm0 561 movq qword ptr [edi + 8], xmm1 562 movq qword ptr [edi + 16], xmm2 563 lea edi, [edi + 24] 564 sub ecx, 24 565 ja wloop 566 567 popad 568 ret 569 } 570 } 571 572 // Blends 32x2 rectangle to 24x1 573 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. 574 // Then shuffled to do the scaling. 575 576 // Register usage: 577 // xmm0 src_row 0 578 // xmm1 src_row 1 579 // xmm2 shuf 0 580 // xmm3 shuf 1 581 // xmm4 shuf 2 582 // xmm5 madd 0 583 // xmm6 madd 1 584 // xmm7 round34 585 586 // Note that movdqa+palign may be better than movdqu. 587 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 588 __declspec(naked) 589 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, 590 uint8* dst_ptr, int dst_width) { 591 __asm { 592 pushad 593 mov esi, [esp + 32 + 4] // src_ptr 594 mov ebx, [esp + 32 + 8] // src_stride 595 mov edi, [esp + 32 + 12] // dst_ptr 596 mov ecx, [esp + 32 + 16] // dst_width 597 movdqa xmm2, _shuf01 598 movdqa xmm3, _shuf11 599 movdqa xmm4, _shuf21 600 movdqa xmm5, _madd01 601 movdqa xmm6, _madd11 602 movdqa xmm7, _round34 603 604 wloop: 605 movdqa xmm0, [esi] // pixels 0..7 606 movdqa xmm1, [esi+ebx] 607 pavgb xmm0, xmm1 608 pshufb xmm0, xmm2 609 pmaddubsw xmm0, xmm5 610 paddsw xmm0, xmm7 611 psrlw xmm0, 2 612 packuswb xmm0, xmm0 613 movq qword ptr [edi], xmm0 614 movdqu xmm0, [esi+8] // pixels 8..15 615 movdqu xmm1, [esi+ebx+8] 616 pavgb xmm0, xmm1 617 pshufb xmm0, xmm3 618 pmaddubsw xmm0, xmm6 619 paddsw xmm0, xmm7 620 psrlw xmm0, 2 621 packuswb xmm0, xmm0 622 movq qword ptr [edi+8], xmm0 623 movdqa xmm0, [esi+16] // pixels 16..23 624 movdqa xmm1, [esi+ebx+16] 625 lea esi, [esi+32] 626 pavgb xmm0, xmm1 627 pshufb xmm0, xmm4 628 movdqa xmm1, _madd21 629 pmaddubsw xmm0, xmm1 630 paddsw xmm0, xmm7 631 psrlw xmm0, 2 632 packuswb xmm0, xmm0 633 movq qword ptr [edi+16], xmm0 634 lea edi, [edi+24] 635 sub ecx, 24 636 ja wloop 637 638 popad 639 ret 640 } 641 } 642 643 // Note that movdqa+palign may be better than movdqu. 644 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
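// This variant differs from ScaleRowDown34_1_Int_SSSE3 above only in how the
// two source rows are blended: the pavgb pair below weights them roughly 3:1
// (three quarters of the first row, one quarter of the second) instead of the
// 1:1 average used above.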
645 __declspec(naked) 646 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, 647 uint8* dst_ptr, int dst_width) { 648 __asm { 649 pushad 650 mov esi, [esp + 32 + 4] // src_ptr 651 mov ebx, [esp + 32 + 8] // src_stride 652 mov edi, [esp + 32 + 12] // dst_ptr 653 mov ecx, [esp + 32 + 16] // dst_width 654 movdqa xmm2, _shuf01 655 movdqa xmm3, _shuf11 656 movdqa xmm4, _shuf21 657 movdqa xmm5, _madd01 658 movdqa xmm6, _madd11 659 movdqa xmm7, _round34 660 661 wloop: 662 movdqa xmm0, [esi] // pixels 0..7 663 movdqa xmm1, [esi+ebx] 664 pavgb xmm1, xmm0 665 pavgb xmm0, xmm1 666 pshufb xmm0, xmm2 667 pmaddubsw xmm0, xmm5 668 paddsw xmm0, xmm7 669 psrlw xmm0, 2 670 packuswb xmm0, xmm0 671 movq qword ptr [edi], xmm0 672 movdqu xmm0, [esi+8] // pixels 8..15 673 movdqu xmm1, [esi+ebx+8] 674 pavgb xmm1, xmm0 675 pavgb xmm0, xmm1 676 pshufb xmm0, xmm3 677 pmaddubsw xmm0, xmm6 678 paddsw xmm0, xmm7 679 psrlw xmm0, 2 680 packuswb xmm0, xmm0 681 movq qword ptr [edi+8], xmm0 682 movdqa xmm0, [esi+16] // pixels 16..23 683 movdqa xmm1, [esi+ebx+16] 684 lea esi, [esi+32] 685 pavgb xmm1, xmm0 686 pavgb xmm0, xmm1 687 pshufb xmm0, xmm4 688 movdqa xmm1, _madd21 689 pmaddubsw xmm0, xmm1 690 paddsw xmm0, xmm7 691 psrlw xmm0, 2 692 packuswb xmm0, xmm0 693 movq qword ptr [edi+16], xmm0 694 lea edi, [edi+24] 695 sub ecx, 24 696 ja wloop 697 698 popad 699 ret 700 } 701 } 702 703 #define HAS_SCALEROWDOWN38_SSSE3 704 // 3/8 point sampler 705 706 // Scale 32 pixels to 12 707 __declspec(naked) 708 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, 709 uint8* dst_ptr, int dst_width) { 710 __asm { 711 pushad 712 mov esi, [esp + 32 + 4] // src_ptr 713 mov edx, [esp + 32 + 8] // src_stride 714 mov edi, [esp + 32 + 12] // dst_ptr 715 mov ecx, [esp + 32 + 16] // dst_width 716 movdqa xmm5, _shuf38a 717 movdqa xmm6, _shuf38b 718 pxor xmm7, xmm7 719 720 xloop: 721 movdqa xmm0, [esi] // 16 pixels -> 0,1,2,3,4,5 722 movdqa xmm1, [esi + 16] // 16 pixels -> 6,7,8,9,10,11 723 lea esi, [esi + 32] 724 pshufb xmm0, xmm5 725 pshufb xmm1, xmm6 726 paddusb xmm0, xmm1 727 728 movq qword ptr [edi], xmm0 // write 12 pixels 729 movhlps xmm1, xmm0 730 movd [edi + 8], xmm1 731 lea edi, [edi + 12] 732 sub ecx, 12 733 ja xloop 734 735 popad 736 ret 737 } 738 } 739 740 // Scale 16x3 pixels to 6x1 with interpolation 741 __declspec(naked) 742 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, 743 uint8* dst_ptr, int dst_width) { 744 __asm { 745 pushad 746 mov esi, [esp + 32 + 4] // src_ptr 747 mov edx, [esp + 32 + 8] // src_stride 748 mov edi, [esp + 32 + 12] // dst_ptr 749 mov ecx, [esp + 32 + 16] // dst_width 750 movdqa xmm4, _shufac0 751 movdqa xmm5, _shufac3 752 movdqa xmm6, _scaleac3 753 pxor xmm7, xmm7 754 755 xloop: 756 movdqa xmm0, [esi] // sum up 3 rows into xmm0/1 757 movdqa xmm2, [esi + edx] 758 movhlps xmm1, xmm0 759 movhlps xmm3, xmm2 760 punpcklbw xmm0, xmm7 761 punpcklbw xmm1, xmm7 762 punpcklbw xmm2, xmm7 763 punpcklbw xmm3, xmm7 764 paddusw xmm0, xmm2 765 paddusw xmm1, xmm3 766 movdqa xmm2, [esi + edx * 2] 767 lea esi, [esi + 16] 768 movhlps xmm3, xmm2 769 punpcklbw xmm2, xmm7 770 punpcklbw xmm3, xmm7 771 paddusw xmm0, xmm2 772 paddusw xmm1, xmm3 773 774 movdqa xmm2, xmm0 // 8 pixels -> 0,1,2 of xmm2 775 psrldq xmm0, 2 776 paddusw xmm2, xmm0 777 psrldq xmm0, 2 778 paddusw xmm2, xmm0 779 pshufb xmm2, xmm4 780 781 movdqa xmm3, xmm1 // 8 pixels -> 3,4,5 of xmm2 782 psrldq xmm1, 2 783 paddusw xmm3, xmm1 784 psrldq xmm1, 2 785 paddusw xmm3, xmm1 786 pshufb xmm3, xmm5 787 
paddusw xmm2, xmm3 788 789 pmulhuw xmm2, xmm6 // divide by 9,9,6, 9,9,6 790 packuswb xmm2, xmm2 791 792 movd [edi], xmm2 // write 6 pixels 793 pextrw eax, xmm2, 2 794 mov [edi + 4], ax 795 lea edi, [edi + 6] 796 sub ecx, 6 797 ja xloop 798 799 popad 800 ret 801 } 802 } 803 804 // Scale 16x2 pixels to 6x1 with interpolation 805 __declspec(naked) 806 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, 807 uint8* dst_ptr, int dst_width) { 808 __asm { 809 pushad 810 mov esi, [esp + 32 + 4] // src_ptr 811 mov edx, [esp + 32 + 8] // src_stride 812 mov edi, [esp + 32 + 12] // dst_ptr 813 mov ecx, [esp + 32 + 16] // dst_width 814 movdqa xmm4, _shufab0 815 movdqa xmm5, _shufab1 816 movdqa xmm6, _shufab2 817 movdqa xmm7, _scaleab2 818 819 xloop: 820 movdqa xmm2, [esi] // average 2 rows into xmm2 821 pavgb xmm2, [esi + edx] 822 lea esi, [esi + 16] 823 824 movdqa xmm0, xmm2 // 16 pixels -> 0,1,2,3,4,5 of xmm0 825 pshufb xmm0, xmm4 826 movdqa xmm1, xmm2 827 pshufb xmm1, xmm5 828 paddusw xmm0, xmm1 829 pshufb xmm2, xmm6 830 paddusw xmm0, xmm2 831 832 pmulhuw xmm0, xmm7 // divide by 3,3,2, 3,3,2 833 packuswb xmm0, xmm0 834 835 movd [edi], xmm0 // write 6 pixels 836 pextrw eax, xmm0, 2 837 mov [edi + 4], ax 838 lea edi, [edi + 6] 839 sub ecx, 6 840 ja xloop 841 842 popad 843 ret 844 } 845 } 846 847 #define HAS_SCALEADDROWS_SSE2 848 849 // Reads 8xN bytes and produces 16 shorts at a time. 850 __declspec(naked) 851 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, 852 uint16* dst_ptr, int src_width, 853 int src_height) { 854 __asm { 855 pushad 856 mov esi, [esp + 32 + 4] // src_ptr 857 mov edx, [esp + 32 + 8] // src_stride 858 mov edi, [esp + 32 + 12] // dst_ptr 859 mov ecx, [esp + 32 + 16] // dst_width 860 mov ebx, [esp + 32 + 20] // height 861 pxor xmm7, xmm7 862 dec ebx 863 864 xloop: 865 // first row 866 movdqa xmm2, [esi] 867 lea eax, [esi + edx] 868 movhlps xmm3, xmm2 869 mov ebp, ebx 870 punpcklbw xmm2, xmm7 871 punpcklbw xmm3, xmm7 872 873 // sum remaining rows 874 yloop: 875 movdqa xmm0, [eax] // read 16 pixels 876 lea eax, [eax + edx] // advance to next row 877 movhlps xmm1, xmm0 878 punpcklbw xmm0, xmm7 879 punpcklbw xmm1, xmm7 880 paddusw xmm2, xmm0 // sum 16 words 881 paddusw xmm3, xmm1 882 sub ebp, 1 883 ja yloop 884 885 movdqa [edi], xmm2 886 movdqa [edi + 16], xmm3 887 lea edi, [edi + 32] 888 lea esi, [esi + 16] 889 890 sub ecx, 16 891 ja xloop 892 893 popad 894 ret 895 } 896 } 897 898 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version. 
899 #define HAS_SCALEFILTERROWS_SSE2 900 __declspec(naked) 901 static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, 902 int src_stride, int dst_width, 903 int source_y_fraction) { 904 __asm { 905 push esi 906 push edi 907 mov edi, [esp + 8 + 4] // dst_ptr 908 mov esi, [esp + 8 + 8] // src_ptr 909 mov edx, [esp + 8 + 12] // src_stride 910 mov ecx, [esp + 8 + 16] // dst_width 911 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 912 cmp eax, 0 913 je xloop1 914 cmp eax, 128 915 je xloop2 916 917 movd xmm6, eax // xmm6 = y fraction 918 punpcklwd xmm6, xmm6 919 pshufd xmm6, xmm6, 0 920 neg eax // xmm5 = 256 - y fraction 921 add eax, 256 922 movd xmm5, eax 923 punpcklwd xmm5, xmm5 924 pshufd xmm5, xmm5, 0 925 pxor xmm7, xmm7 926 927 xloop: 928 movdqa xmm0, [esi] 929 movdqa xmm2, [esi + edx] 930 lea esi, [esi + 16] 931 movdqa xmm1, xmm0 932 movdqa xmm3, xmm2 933 punpcklbw xmm0, xmm7 934 punpcklbw xmm2, xmm7 935 punpckhbw xmm1, xmm7 936 punpckhbw xmm3, xmm7 937 pmullw xmm0, xmm5 // scale row 0 938 pmullw xmm1, xmm5 939 pmullw xmm2, xmm6 // scale row 1 940 pmullw xmm3, xmm6 941 paddusw xmm0, xmm2 // sum rows 942 paddusw xmm1, xmm3 943 psrlw xmm0, 8 944 psrlw xmm1, 8 945 packuswb xmm0, xmm1 946 movdqa [edi], xmm0 947 lea edi, [edi + 16] 948 sub ecx, 16 949 ja xloop 950 951 mov al, [edi - 1] 952 mov [edi], al 953 pop edi 954 pop esi 955 ret 956 957 xloop1: 958 movdqa xmm0, [esi] 959 lea esi, [esi + 16] 960 movdqa [edi], xmm0 961 lea edi, [edi + 16] 962 sub ecx, 16 963 ja xloop1 964 965 mov al, [edi - 1] 966 mov [edi], al 967 pop edi 968 pop esi 969 ret 970 971 xloop2: 972 movdqa xmm0, [esi] 973 movdqa xmm2, [esi + edx] 974 lea esi, [esi + 16] 975 pavgb xmm0, xmm2 976 movdqa [edi], xmm0 977 lea edi, [edi + 16] 978 sub ecx, 16 979 ja xloop2 980 981 mov al, [edi - 1] 982 mov [edi], al 983 pop edi 984 pop esi 985 ret 986 } 987 } 988 989 // Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version. 
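// The SSSE3 version folds both row weights into one register: the fraction is
// halved, the byte pair (128 - f/2, f/2) is replicated across the register,
// the two rows are interleaved with punpcklbw/punpckhbw, and a single
// pmaddubsw then computes row0 * (128 - f/2) + row1 * (f/2) per pixel before
// the result is shifted right by 7.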
990 #define HAS_SCALEFILTERROWS_SSSE3 991 __declspec(naked) 992 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 993 int src_stride, int dst_width, 994 int source_y_fraction) { 995 __asm { 996 push esi 997 push edi 998 mov edi, [esp + 8 + 4] // dst_ptr 999 mov esi, [esp + 8 + 8] // src_ptr 1000 mov edx, [esp + 8 + 12] // src_stride 1001 mov ecx, [esp + 8 + 16] // dst_width 1002 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 1003 cmp eax, 0 1004 je xloop1 1005 cmp eax, 128 1006 je xloop2 1007 1008 shr eax, 1 1009 mov ah,al 1010 neg al 1011 add al, 128 1012 movd xmm7, eax 1013 punpcklwd xmm7, xmm7 1014 pshufd xmm7, xmm7, 0 1015 1016 xloop: 1017 movdqa xmm0, [esi] 1018 movdqa xmm2, [esi + edx] 1019 lea esi, [esi + 16] 1020 movdqa xmm1, xmm0 1021 punpcklbw xmm0, xmm2 1022 punpckhbw xmm1, xmm2 1023 pmaddubsw xmm0, xmm7 1024 pmaddubsw xmm1, xmm7 1025 psrlw xmm0, 7 1026 psrlw xmm1, 7 1027 packuswb xmm0, xmm1 1028 movdqa [edi], xmm0 1029 lea edi, [edi + 16] 1030 sub ecx, 16 1031 ja xloop 1032 1033 mov al, [edi - 1] 1034 mov [edi], al 1035 pop edi 1036 pop esi 1037 ret 1038 1039 xloop1: 1040 movdqa xmm0, [esi] 1041 lea esi, [esi + 16] 1042 movdqa [edi], xmm0 1043 lea edi, [edi + 16] 1044 sub ecx, 16 1045 ja xloop1 1046 1047 mov al, [edi - 1] 1048 mov [edi], al 1049 pop edi 1050 pop esi 1051 ret 1052 1053 xloop2: 1054 movdqa xmm0, [esi] 1055 movdqa xmm2, [esi + edx] 1056 lea esi, [esi + 16] 1057 pavgb xmm0, xmm2 1058 movdqa [edi], xmm0 1059 lea edi, [edi + 16] 1060 sub ecx, 16 1061 ja xloop2 1062 1063 mov al, [edi - 1] 1064 mov [edi], al 1065 pop edi 1066 pop esi 1067 ret 1068 1069 } 1070 } 1071 1072 // Note that movdqa+palign may be better than movdqu. 1073 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 1074 __declspec(naked) 1075 static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 1076 int dst_width) { 1077 __asm { 1078 mov edx, [esp + 4] // dst_ptr 1079 mov eax, [esp + 8] // src_ptr 1080 mov ecx, [esp + 12] // dst_width 1081 movdqa xmm1, _round34 1082 movdqa xmm2, _shuf01 1083 movdqa xmm3, _shuf11 1084 movdqa xmm4, _shuf21 1085 movdqa xmm5, _madd01 1086 movdqa xmm6, _madd11 1087 movdqa xmm7, _madd21 1088 1089 wloop: 1090 movdqa xmm0, [eax] // pixels 0..7 1091 pshufb xmm0, xmm2 1092 pmaddubsw xmm0, xmm5 1093 paddsw xmm0, xmm1 1094 psrlw xmm0, 2 1095 packuswb xmm0, xmm0 1096 movq qword ptr [edx], xmm0 1097 movdqu xmm0, [eax+8] // pixels 8..15 1098 pshufb xmm0, xmm3 1099 pmaddubsw xmm0, xmm6 1100 paddsw xmm0, xmm1 1101 psrlw xmm0, 2 1102 packuswb xmm0, xmm0 1103 movq qword ptr [edx+8], xmm0 1104 movdqa xmm0, [eax+16] // pixels 16..23 1105 lea eax, [eax+32] 1106 pshufb xmm0, xmm4 1107 pmaddubsw xmm0, xmm7 1108 paddsw xmm0, xmm1 1109 psrlw xmm0, 2 1110 packuswb xmm0, xmm0 1111 movq qword ptr [edx+16], xmm0 1112 lea edx, [edx+24] 1113 sub ecx, 24 1114 ja wloop 1115 ret 1116 } 1117 } 1118 1119 #elif (defined(__x86_64__) || defined(__i386__)) && \ 1120 !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR) 1121 1122 // GCC versions of row functions are verbatim conversions from Visual C. 
1123 // Generated using gcc disassembly on Visual C object file: 1124 // objdump -D yuvscaler.obj >yuvscaler.txt 1125 #define HAS_SCALEROWDOWN2_SSE2 1126 static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride, 1127 uint8* dst_ptr, int dst_width) { 1128 asm volatile( 1129 "pcmpeqb %%xmm7,%%xmm7\n" 1130 "psrlw $0x8,%%xmm7\n" 1131 "1:" 1132 "movdqa (%0),%%xmm0\n" 1133 "movdqa 0x10(%0),%%xmm1\n" 1134 "lea 0x20(%0),%0\n" 1135 "pand %%xmm7,%%xmm0\n" 1136 "pand %%xmm7,%%xmm1\n" 1137 "packuswb %%xmm1,%%xmm0\n" 1138 "movdqa %%xmm0,(%1)\n" 1139 "lea 0x10(%1),%1\n" 1140 "sub $0x10,%2\n" 1141 "ja 1b\n" 1142 : "+r"(src_ptr), // %0 1143 "+r"(dst_ptr), // %1 1144 "+r"(dst_width) // %2 1145 : 1146 : "memory" 1147 ); 1148 } 1149 1150 static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride, 1151 uint8* dst_ptr, int dst_width) { 1152 asm volatile( 1153 "pcmpeqb %%xmm7,%%xmm7\n" 1154 "psrlw $0x8,%%xmm7\n" 1155 "1:" 1156 "movdqa (%0),%%xmm0\n" 1157 "movdqa 0x10(%0),%%xmm1\n" 1158 "movdqa (%0,%3,1),%%xmm2\n" 1159 "movdqa 0x10(%0,%3,1),%%xmm3\n" 1160 "lea 0x20(%0),%0\n" 1161 "pavgb %%xmm2,%%xmm0\n" 1162 "pavgb %%xmm3,%%xmm1\n" 1163 "movdqa %%xmm0,%%xmm2\n" 1164 "psrlw $0x8,%%xmm0\n" 1165 "movdqa %%xmm1,%%xmm3\n" 1166 "psrlw $0x8,%%xmm1\n" 1167 "pand %%xmm7,%%xmm2\n" 1168 "pand %%xmm7,%%xmm3\n" 1169 "pavgw %%xmm2,%%xmm0\n" 1170 "pavgw %%xmm3,%%xmm1\n" 1171 "packuswb %%xmm1,%%xmm0\n" 1172 "movdqa %%xmm0,(%1)\n" 1173 "lea 0x10(%1),%1\n" 1174 "sub $0x10,%2\n" 1175 "ja 1b\n" 1176 : "+r"(src_ptr), // %0 1177 "+r"(dst_ptr), // %1 1178 "+r"(dst_width) // %2 1179 : "r"(static_cast<intptr_t>(src_stride)) // %3 1180 : "memory" 1181 ); 1182 } 1183 1184 #define HAS_SCALEROWDOWN4_SSE2 1185 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride, 1186 uint8* dst_ptr, int dst_width) { 1187 asm volatile( 1188 "pcmpeqb %%xmm7,%%xmm7\n" 1189 "psrld $0x18,%%xmm7\n" 1190 "1:" 1191 "movdqa (%0),%%xmm0\n" 1192 "movdqa 0x10(%0),%%xmm1\n" 1193 "lea 0x20(%0),%0\n" 1194 "pand %%xmm7,%%xmm0\n" 1195 "pand %%xmm7,%%xmm1\n" 1196 "packuswb %%xmm1,%%xmm0\n" 1197 "packuswb %%xmm0,%%xmm0\n" 1198 "movq %%xmm0,(%1)\n" 1199 "lea 0x8(%1),%1\n" 1200 "sub $0x8,%2\n" 1201 "ja 1b\n" 1202 : "+r"(src_ptr), // %0 1203 "+r"(dst_ptr), // %1 1204 "+r"(dst_width) // %2 1205 : 1206 : "memory" 1207 ); 1208 } 1209 1210 static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride, 1211 uint8* dst_ptr, int dst_width) { 1212 intptr_t temp = 0; 1213 asm volatile( 1214 "pcmpeqb %%xmm7,%%xmm7\n" 1215 "psrlw $0x8,%%xmm7\n" 1216 "lea (%4,%4,2),%3\n" 1217 "1:" 1218 "movdqa (%0),%%xmm0\n" 1219 "movdqa 0x10(%0),%%xmm1\n" 1220 "movdqa (%0,%4,1),%%xmm2\n" 1221 "movdqa 0x10(%0,%4,1),%%xmm3\n" 1222 "pavgb %%xmm2,%%xmm0\n" 1223 "pavgb %%xmm3,%%xmm1\n" 1224 "movdqa (%0,%4,2),%%xmm2\n" 1225 "movdqa 0x10(%0,%4,2),%%xmm3\n" 1226 "movdqa (%0,%3,1),%%xmm4\n" 1227 "movdqa 0x10(%0,%3,1),%%xmm5\n" 1228 "lea 0x20(%0),%0\n" 1229 "pavgb %%xmm4,%%xmm2\n" 1230 "pavgb %%xmm2,%%xmm0\n" 1231 "pavgb %%xmm5,%%xmm3\n" 1232 "pavgb %%xmm3,%%xmm1\n" 1233 "movdqa %%xmm0,%%xmm2\n" 1234 "psrlw $0x8,%%xmm0\n" 1235 "movdqa %%xmm1,%%xmm3\n" 1236 "psrlw $0x8,%%xmm1\n" 1237 "pand %%xmm7,%%xmm2\n" 1238 "pand %%xmm7,%%xmm3\n" 1239 "pavgw %%xmm2,%%xmm0\n" 1240 "pavgw %%xmm3,%%xmm1\n" 1241 "packuswb %%xmm1,%%xmm0\n" 1242 "movdqa %%xmm0,%%xmm2\n" 1243 "psrlw $0x8,%%xmm0\n" 1244 "pand %%xmm7,%%xmm2\n" 1245 "pavgw %%xmm2,%%xmm0\n" 1246 "packuswb %%xmm0,%%xmm0\n" 1247 "movq %%xmm0,(%1)\n" 1248 "lea 0x8(%1),%1\n" 1249 "sub $0x8,%2\n" 1250 "ja 1b\n" 1251 : "+r"(src_ptr), 
// %0 1252 "+r"(dst_ptr), // %1 1253 "+r"(dst_width), // %2 1254 "+r"(temp) // %3 1255 : "r"(static_cast<intptr_t>(src_stride)) // %4 1256 : "memory" 1257 ); 1258 } 1259 1260 #define HAS_SCALEROWDOWN8_SSE2 1261 static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride, 1262 uint8* dst_ptr, int dst_width) { 1263 asm volatile( 1264 "pcmpeqb %%xmm7,%%xmm7\n" 1265 "psrlq $0x38,%%xmm7\n" 1266 "1:" 1267 "movdqa (%0),%%xmm0\n" 1268 "movdqa 0x10(%0),%%xmm1\n" 1269 "lea 0x20(%0),%0\n" 1270 "pand %%xmm7,%%xmm0\n" 1271 "pand %%xmm7,%%xmm1\n" 1272 "packuswb %%xmm1,%%xmm0\n" 1273 "packuswb %%xmm0,%%xmm0\n" 1274 "packuswb %%xmm0,%%xmm0\n" 1275 "movd %%xmm0,(%1)\n" 1276 "lea 0x4(%1),%1\n" 1277 "sub $0x4,%2\n" 1278 "ja 1b\n" 1279 : "+r"(src_ptr), // %0 1280 "+r"(dst_ptr), // %1 1281 "+r"(dst_width) // %2 1282 : 1283 : "memory" 1284 ); 1285 } 1286 1287 #if defined(__i386__) 1288 extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, 1289 uint8* dst_ptr, int dst_width); 1290 asm( 1291 ".text\n" 1292 #if defined(OSX) 1293 ".globl _ScaleRowDown8Int_SSE2\n" 1294 "_ScaleRowDown8Int_SSE2:\n" 1295 #else 1296 ".global ScaleRowDown8Int_SSE2\n" 1297 "ScaleRowDown8Int_SSE2:\n" 1298 #endif 1299 "pusha\n" 1300 "mov 0x24(%esp),%esi\n" 1301 "mov 0x28(%esp),%ebx\n" 1302 "mov 0x2c(%esp),%edi\n" 1303 "mov 0x30(%esp),%ecx\n" 1304 "lea (%ebx,%ebx,2),%edx\n" 1305 "pxor %xmm7,%xmm7\n" 1306 1307 "1:" 1308 "movdqa (%esi),%xmm0\n" 1309 "movdqa 0x10(%esi),%xmm1\n" 1310 "movdqa (%esi,%ebx,1),%xmm2\n" 1311 "movdqa 0x10(%esi,%ebx,1),%xmm3\n" 1312 "pavgb %xmm2,%xmm0\n" 1313 "pavgb %xmm3,%xmm1\n" 1314 "movdqa (%esi,%ebx,2),%xmm2\n" 1315 "movdqa 0x10(%esi,%ebx,2),%xmm3\n" 1316 "movdqa (%esi,%edx,1),%xmm4\n" 1317 "movdqa 0x10(%esi,%edx,1),%xmm5\n" 1318 "lea (%esi,%ebx,4),%ebp\n" 1319 "lea 0x20(%esi),%esi\n" 1320 "pavgb %xmm4,%xmm2\n" 1321 "pavgb %xmm5,%xmm3\n" 1322 "pavgb %xmm2,%xmm0\n" 1323 "pavgb %xmm3,%xmm1\n" 1324 "movdqa 0x0(%ebp),%xmm2\n" 1325 "movdqa 0x10(%ebp),%xmm3\n" 1326 "movdqa 0x0(%ebp,%ebx,1),%xmm4\n" 1327 "movdqa 0x10(%ebp,%ebx,1),%xmm5\n" 1328 "pavgb %xmm4,%xmm2\n" 1329 "pavgb %xmm5,%xmm3\n" 1330 "movdqa 0x0(%ebp,%ebx,2),%xmm4\n" 1331 "movdqa 0x10(%ebp,%ebx,2),%xmm5\n" 1332 "movdqa 0x0(%ebp,%edx,1),%xmm6\n" 1333 "pavgb %xmm6,%xmm4\n" 1334 "movdqa 0x10(%ebp,%edx,1),%xmm6\n" 1335 "pavgb %xmm6,%xmm5\n" 1336 "pavgb %xmm4,%xmm2\n" 1337 "pavgb %xmm5,%xmm3\n" 1338 "pavgb %xmm2,%xmm0\n" 1339 "pavgb %xmm3,%xmm1\n" 1340 "psadbw %xmm7,%xmm0\n" 1341 "psadbw %xmm7,%xmm1\n" 1342 "pshufd $0xd8,%xmm0,%xmm0\n" 1343 "pshufd $0x8d,%xmm1,%xmm1\n" 1344 "por %xmm1,%xmm0\n" 1345 "psrlw $0x3,%xmm0\n" 1346 "packuswb %xmm0,%xmm0\n" 1347 "packuswb %xmm0,%xmm0\n" 1348 "movd %xmm0,(%edi)\n" 1349 "lea 0x4(%edi),%edi\n" 1350 "sub $0x4,%ecx\n" 1351 "ja 1b\n" 1352 "popa\n" 1353 "ret\n" 1354 ); 1355 1356 // fpic is used for magiccam plugin 1357 #if !defined(__PIC__) 1358 #define HAS_SCALEROWDOWN34_SSSE3 1359 extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, 1360 uint8* dst_ptr, int dst_width); 1361 asm( 1362 ".text\n" 1363 #if defined(OSX) 1364 ".globl _ScaleRowDown34_SSSE3\n" 1365 "_ScaleRowDown34_SSSE3:\n" 1366 #else 1367 ".global ScaleRowDown34_SSSE3\n" 1368 "ScaleRowDown34_SSSE3:\n" 1369 #endif 1370 "pusha\n" 1371 "mov 0x24(%esp),%esi\n" 1372 "mov 0x2c(%esp),%edi\n" 1373 "mov 0x30(%esp),%ecx\n" 1374 "movdqa _shuf0,%xmm3\n" 1375 "movdqa _shuf1,%xmm4\n" 1376 "movdqa _shuf2,%xmm5\n" 1377 1378 "1:" 1379 "movdqa (%esi),%xmm0\n" 1380 "movdqa 0x10(%esi),%xmm2\n" 1381 "lea 0x20(%esi),%esi\n" 1382 "movdqa 
%xmm2,%xmm1\n" 1383 "palignr $0x8,%xmm0,%xmm1\n" 1384 "pshufb %xmm3,%xmm0\n" 1385 "pshufb %xmm4,%xmm1\n" 1386 "pshufb %xmm5,%xmm2\n" 1387 "movq %xmm0,(%edi)\n" 1388 "movq %xmm1,0x8(%edi)\n" 1389 "movq %xmm2,0x10(%edi)\n" 1390 "lea 0x18(%edi),%edi\n" 1391 "sub $0x18,%ecx\n" 1392 "ja 1b\n" 1393 "popa\n" 1394 "ret\n" 1395 ); 1396 1397 extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, 1398 uint8* dst_ptr, int dst_width); 1399 asm( 1400 ".text\n" 1401 #if defined(OSX) 1402 ".globl _ScaleRowDown34_1_Int_SSSE3\n" 1403 "_ScaleRowDown34_1_Int_SSSE3:\n" 1404 #else 1405 ".global ScaleRowDown34_1_Int_SSSE3\n" 1406 "ScaleRowDown34_1_Int_SSSE3:\n" 1407 #endif 1408 "pusha\n" 1409 "mov 0x24(%esp),%esi\n" 1410 "mov 0x28(%esp),%ebp\n" 1411 "mov 0x2c(%esp),%edi\n" 1412 "mov 0x30(%esp),%ecx\n" 1413 "movdqa _shuf01,%xmm2\n" 1414 "movdqa _shuf11,%xmm3\n" 1415 "movdqa _shuf21,%xmm4\n" 1416 "movdqa _madd01,%xmm5\n" 1417 "movdqa _madd11,%xmm6\n" 1418 "movdqa _round34,%xmm7\n" 1419 1420 "1:" 1421 "movdqa (%esi),%xmm0\n" 1422 "movdqa (%esi,%ebp),%xmm1\n" 1423 "pavgb %xmm1,%xmm0\n" 1424 "pshufb %xmm2,%xmm0\n" 1425 "pmaddubsw %xmm5,%xmm0\n" 1426 "paddsw %xmm7,%xmm0\n" 1427 "psrlw $0x2,%xmm0\n" 1428 "packuswb %xmm0,%xmm0\n" 1429 "movq %xmm0,(%edi)\n" 1430 "movdqu 0x8(%esi),%xmm0\n" 1431 "movdqu 0x8(%esi,%ebp),%xmm1\n" 1432 "pavgb %xmm1,%xmm0\n" 1433 "pshufb %xmm3,%xmm0\n" 1434 "pmaddubsw %xmm6,%xmm0\n" 1435 "paddsw %xmm7,%xmm0\n" 1436 "psrlw $0x2,%xmm0\n" 1437 "packuswb %xmm0,%xmm0\n" 1438 "movq %xmm0,0x8(%edi)\n" 1439 "movdqa 0x10(%esi),%xmm0\n" 1440 "movdqa 0x10(%esi,%ebp),%xmm1\n" 1441 "lea 0x20(%esi),%esi\n" 1442 "pavgb %xmm1,%xmm0\n" 1443 "pshufb %xmm4,%xmm0\n" 1444 "movdqa _madd21,%xmm1\n" 1445 "pmaddubsw %xmm1,%xmm0\n" 1446 "paddsw %xmm7,%xmm0\n" 1447 "psrlw $0x2,%xmm0\n" 1448 "packuswb %xmm0,%xmm0\n" 1449 "movq %xmm0,0x10(%edi)\n" 1450 "lea 0x18(%edi),%edi\n" 1451 "sub $0x18,%ecx\n" 1452 "ja 1b\n" 1453 1454 "popa\n" 1455 "ret\n" 1456 ); 1457 1458 extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, 1459 uint8* dst_ptr, int dst_width); 1460 asm( 1461 ".text\n" 1462 #if defined(OSX) 1463 ".globl _ScaleRowDown34_0_Int_SSSE3\n" 1464 "_ScaleRowDown34_0_Int_SSSE3:\n" 1465 #else 1466 ".global ScaleRowDown34_0_Int_SSSE3\n" 1467 "ScaleRowDown34_0_Int_SSSE3:\n" 1468 #endif 1469 "pusha\n" 1470 "mov 0x24(%esp),%esi\n" 1471 "mov 0x28(%esp),%ebp\n" 1472 "mov 0x2c(%esp),%edi\n" 1473 "mov 0x30(%esp),%ecx\n" 1474 "movdqa _shuf01,%xmm2\n" 1475 "movdqa _shuf11,%xmm3\n" 1476 "movdqa _shuf21,%xmm4\n" 1477 "movdqa _madd01,%xmm5\n" 1478 "movdqa _madd11,%xmm6\n" 1479 "movdqa _round34,%xmm7\n" 1480 1481 "1:" 1482 "movdqa (%esi),%xmm0\n" 1483 "movdqa (%esi,%ebp,1),%xmm1\n" 1484 "pavgb %xmm0,%xmm1\n" 1485 "pavgb %xmm1,%xmm0\n" 1486 "pshufb %xmm2,%xmm0\n" 1487 "pmaddubsw %xmm5,%xmm0\n" 1488 "paddsw %xmm7,%xmm0\n" 1489 "psrlw $0x2,%xmm0\n" 1490 "packuswb %xmm0,%xmm0\n" 1491 "movq %xmm0,(%edi)\n" 1492 "movdqu 0x8(%esi),%xmm0\n" 1493 "movdqu 0x8(%esi,%ebp,1),%xmm1\n" 1494 "pavgb %xmm0,%xmm1\n" 1495 "pavgb %xmm1,%xmm0\n" 1496 "pshufb %xmm3,%xmm0\n" 1497 "pmaddubsw %xmm6,%xmm0\n" 1498 "paddsw %xmm7,%xmm0\n" 1499 "psrlw $0x2,%xmm0\n" 1500 "packuswb %xmm0,%xmm0\n" 1501 "movq %xmm0,0x8(%edi)\n" 1502 "movdqa 0x10(%esi),%xmm0\n" 1503 "movdqa 0x10(%esi,%ebp,1),%xmm1\n" 1504 "lea 0x20(%esi),%esi\n" 1505 "pavgb %xmm0,%xmm1\n" 1506 "pavgb %xmm1,%xmm0\n" 1507 "pshufb %xmm4,%xmm0\n" 1508 "movdqa _madd21,%xmm1\n" 1509 "pmaddubsw %xmm1,%xmm0\n" 1510 "paddsw %xmm7,%xmm0\n" 1511 "psrlw $0x2,%xmm0\n" 
1512 "packuswb %xmm0,%xmm0\n" 1513 "movq %xmm0,0x10(%edi)\n" 1514 "lea 0x18(%edi),%edi\n" 1515 "sub $0x18,%ecx\n" 1516 "ja 1b\n" 1517 "popa\n" 1518 "ret\n" 1519 ); 1520 1521 #define HAS_SCALEROWDOWN38_SSSE3 1522 extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, 1523 uint8* dst_ptr, int dst_width); 1524 asm( 1525 ".text\n" 1526 #if defined(OSX) 1527 ".globl _ScaleRowDown38_SSSE3\n" 1528 "_ScaleRowDown38_SSSE3:\n" 1529 #else 1530 ".global ScaleRowDown38_SSSE3\n" 1531 "ScaleRowDown38_SSSE3:\n" 1532 #endif 1533 "pusha\n" 1534 "mov 0x24(%esp),%esi\n" 1535 "mov 0x28(%esp),%edx\n" 1536 "mov 0x2c(%esp),%edi\n" 1537 "mov 0x30(%esp),%ecx\n" 1538 "movdqa _shuf38a ,%xmm5\n" 1539 "movdqa _shuf38b ,%xmm6\n" 1540 "pxor %xmm7,%xmm7\n" 1541 1542 "1:" 1543 "movdqa (%esi),%xmm0\n" 1544 "movdqa 0x10(%esi),%xmm1\n" 1545 "lea 0x20(%esi),%esi\n" 1546 "pshufb %xmm5,%xmm0\n" 1547 "pshufb %xmm6,%xmm1\n" 1548 "paddusb %xmm1,%xmm0\n" 1549 "movq %xmm0,(%edi)\n" 1550 "movhlps %xmm0,%xmm1\n" 1551 "movd %xmm1,0x8(%edi)\n" 1552 "lea 0xc(%edi),%edi\n" 1553 "sub $0xc,%ecx\n" 1554 "ja 1b\n" 1555 "popa\n" 1556 "ret\n" 1557 ); 1558 1559 extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, 1560 uint8* dst_ptr, int dst_width); 1561 asm( 1562 ".text\n" 1563 #if defined(OSX) 1564 ".globl _ScaleRowDown38_3_Int_SSSE3\n" 1565 "_ScaleRowDown38_3_Int_SSSE3:\n" 1566 #else 1567 ".global ScaleRowDown38_3_Int_SSSE3\n" 1568 "ScaleRowDown38_3_Int_SSSE3:\n" 1569 #endif 1570 "pusha\n" 1571 "mov 0x24(%esp),%esi\n" 1572 "mov 0x28(%esp),%edx\n" 1573 "mov 0x2c(%esp),%edi\n" 1574 "mov 0x30(%esp),%ecx\n" 1575 "movdqa _shufac0,%xmm4\n" 1576 "movdqa _shufac3,%xmm5\n" 1577 "movdqa _scaleac3,%xmm6\n" 1578 "pxor %xmm7,%xmm7\n" 1579 1580 "1:" 1581 "movdqa (%esi),%xmm0\n" 1582 "movdqa (%esi,%edx,1),%xmm2\n" 1583 "movhlps %xmm0,%xmm1\n" 1584 "movhlps %xmm2,%xmm3\n" 1585 "punpcklbw %xmm7,%xmm0\n" 1586 "punpcklbw %xmm7,%xmm1\n" 1587 "punpcklbw %xmm7,%xmm2\n" 1588 "punpcklbw %xmm7,%xmm3\n" 1589 "paddusw %xmm2,%xmm0\n" 1590 "paddusw %xmm3,%xmm1\n" 1591 "movdqa (%esi,%edx,2),%xmm2\n" 1592 "lea 0x10(%esi),%esi\n" 1593 "movhlps %xmm2,%xmm3\n" 1594 "punpcklbw %xmm7,%xmm2\n" 1595 "punpcklbw %xmm7,%xmm3\n" 1596 "paddusw %xmm2,%xmm0\n" 1597 "paddusw %xmm3,%xmm1\n" 1598 "movdqa %xmm0,%xmm2\n" 1599 "psrldq $0x2,%xmm0\n" 1600 "paddusw %xmm0,%xmm2\n" 1601 "psrldq $0x2,%xmm0\n" 1602 "paddusw %xmm0,%xmm2\n" 1603 "pshufb %xmm4,%xmm2\n" 1604 "movdqa %xmm1,%xmm3\n" 1605 "psrldq $0x2,%xmm1\n" 1606 "paddusw %xmm1,%xmm3\n" 1607 "psrldq $0x2,%xmm1\n" 1608 "paddusw %xmm1,%xmm3\n" 1609 "pshufb %xmm5,%xmm3\n" 1610 "paddusw %xmm3,%xmm2\n" 1611 "pmulhuw %xmm6,%xmm2\n" 1612 "packuswb %xmm2,%xmm2\n" 1613 "movd %xmm2,(%edi)\n" 1614 "pextrw $0x2,%xmm2,%eax\n" 1615 "mov %ax,0x4(%edi)\n" 1616 "lea 0x6(%edi),%edi\n" 1617 "sub $0x6,%ecx\n" 1618 "ja 1b\n" 1619 "popa\n" 1620 "ret\n" 1621 ); 1622 1623 extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, 1624 uint8* dst_ptr, int dst_width); 1625 asm( 1626 ".text\n" 1627 #if defined(OSX) 1628 ".globl _ScaleRowDown38_2_Int_SSSE3\n" 1629 "_ScaleRowDown38_2_Int_SSSE3:\n" 1630 #else 1631 ".global ScaleRowDown38_2_Int_SSSE3\n" 1632 "ScaleRowDown38_2_Int_SSSE3:\n" 1633 #endif 1634 "pusha\n" 1635 "mov 0x24(%esp),%esi\n" 1636 "mov 0x28(%esp),%edx\n" 1637 "mov 0x2c(%esp),%edi\n" 1638 "mov 0x30(%esp),%ecx\n" 1639 "movdqa _shufab0,%xmm4\n" 1640 "movdqa _shufab1,%xmm5\n" 1641 "movdqa _shufab2,%xmm6\n" 1642 "movdqa _scaleab2,%xmm7\n" 1643 1644 "1:" 1645 "movdqa (%esi),%xmm2\n" 1646 
"pavgb (%esi,%edx,1),%xmm2\n" 1647 "lea 0x10(%esi),%esi\n" 1648 "movdqa %xmm2,%xmm0\n" 1649 "pshufb %xmm4,%xmm0\n" 1650 "movdqa %xmm2,%xmm1\n" 1651 "pshufb %xmm5,%xmm1\n" 1652 "paddusw %xmm1,%xmm0\n" 1653 "pshufb %xmm6,%xmm2\n" 1654 "paddusw %xmm2,%xmm0\n" 1655 "pmulhuw %xmm7,%xmm0\n" 1656 "packuswb %xmm0,%xmm0\n" 1657 "movd %xmm0,(%edi)\n" 1658 "pextrw $0x2,%xmm0,%eax\n" 1659 "mov %ax,0x4(%edi)\n" 1660 "lea 0x6(%edi),%edi\n" 1661 "sub $0x6,%ecx\n" 1662 "ja 1b\n" 1663 "popa\n" 1664 "ret\n" 1665 ); 1666 #endif // __PIC__ 1667 1668 #define HAS_SCALEADDROWS_SSE2 1669 extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, 1670 uint16* dst_ptr, int src_width, 1671 int src_height); 1672 asm( 1673 ".text\n" 1674 #if defined(OSX) 1675 ".globl _ScaleAddRows_SSE2\n" 1676 "_ScaleAddRows_SSE2:\n" 1677 #else 1678 ".global ScaleAddRows_SSE2\n" 1679 "ScaleAddRows_SSE2:\n" 1680 #endif 1681 "pusha\n" 1682 "mov 0x24(%esp),%esi\n" 1683 "mov 0x28(%esp),%edx\n" 1684 "mov 0x2c(%esp),%edi\n" 1685 "mov 0x30(%esp),%ecx\n" 1686 "mov 0x34(%esp),%ebx\n" 1687 "pxor %xmm7,%xmm7\n" 1688 1689 "1:" 1690 "movdqa (%esi),%xmm2\n" 1691 "lea (%esi,%edx,1),%eax\n" 1692 "movhlps %xmm2,%xmm3\n" 1693 "lea -0x1(%ebx),%ebp\n" 1694 "punpcklbw %xmm7,%xmm2\n" 1695 "punpcklbw %xmm7,%xmm3\n" 1696 1697 "2:" 1698 "movdqa (%eax),%xmm0\n" 1699 "lea (%eax,%edx,1),%eax\n" 1700 "movhlps %xmm0,%xmm1\n" 1701 "punpcklbw %xmm7,%xmm0\n" 1702 "punpcklbw %xmm7,%xmm1\n" 1703 "paddusw %xmm0,%xmm2\n" 1704 "paddusw %xmm1,%xmm3\n" 1705 "sub $0x1,%ebp\n" 1706 "ja 2b\n" 1707 1708 "movdqa %xmm2,(%edi)\n" 1709 "movdqa %xmm3,0x10(%edi)\n" 1710 "lea 0x20(%edi),%edi\n" 1711 "lea 0x10(%esi),%esi\n" 1712 "sub $0x10,%ecx\n" 1713 "ja 1b\n" 1714 "popa\n" 1715 "ret\n" 1716 ); 1717 1718 // Bilinear row filtering combines 16x2 -> 16x1. 
SSE2 version 1719 #define HAS_SCALEFILTERROWS_SSE2 1720 extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr, 1721 const uint8* src_ptr, int src_stride, 1722 int dst_width, int source_y_fraction); 1723 asm( 1724 ".text\n" 1725 #if defined(OSX) 1726 ".globl _ScaleFilterRows_SSE2\n" 1727 "_ScaleFilterRows_SSE2:\n" 1728 #else 1729 ".global ScaleFilterRows_SSE2\n" 1730 "ScaleFilterRows_SSE2:\n" 1731 #endif 1732 "push %esi\n" 1733 "push %edi\n" 1734 "mov 0xc(%esp),%edi\n" 1735 "mov 0x10(%esp),%esi\n" 1736 "mov 0x14(%esp),%edx\n" 1737 "mov 0x18(%esp),%ecx\n" 1738 "mov 0x1c(%esp),%eax\n" 1739 "cmp $0x0,%eax\n" 1740 "je 2f\n" 1741 "cmp $0x80,%eax\n" 1742 "je 3f\n" 1743 "movd %eax,%xmm6\n" 1744 "punpcklwd %xmm6,%xmm6\n" 1745 "pshufd $0x0,%xmm6,%xmm6\n" 1746 "neg %eax\n" 1747 "add $0x100,%eax\n" 1748 "movd %eax,%xmm5\n" 1749 "punpcklwd %xmm5,%xmm5\n" 1750 "pshufd $0x0,%xmm5,%xmm5\n" 1751 "pxor %xmm7,%xmm7\n" 1752 1753 "1:" 1754 "movdqa (%esi),%xmm0\n" 1755 "movdqa (%esi,%edx,1),%xmm2\n" 1756 "lea 0x10(%esi),%esi\n" 1757 "movdqa %xmm0,%xmm1\n" 1758 "movdqa %xmm2,%xmm3\n" 1759 "punpcklbw %xmm7,%xmm0\n" 1760 "punpcklbw %xmm7,%xmm2\n" 1761 "punpckhbw %xmm7,%xmm1\n" 1762 "punpckhbw %xmm7,%xmm3\n" 1763 "pmullw %xmm5,%xmm0\n" 1764 "pmullw %xmm5,%xmm1\n" 1765 "pmullw %xmm6,%xmm2\n" 1766 "pmullw %xmm6,%xmm3\n" 1767 "paddusw %xmm2,%xmm0\n" 1768 "paddusw %xmm3,%xmm1\n" 1769 "psrlw $0x8,%xmm0\n" 1770 "psrlw $0x8,%xmm1\n" 1771 "packuswb %xmm1,%xmm0\n" 1772 "movdqa %xmm0,(%edi)\n" 1773 "lea 0x10(%edi),%edi\n" 1774 "sub $0x10,%ecx\n" 1775 "ja 1b\n" 1776 "mov -0x1(%edi),%al\n" 1777 "mov %al,(%edi)\n" 1778 "pop %edi\n" 1779 "pop %esi\n" 1780 "ret\n" 1781 1782 "2:" 1783 "movdqa (%esi),%xmm0\n" 1784 "lea 0x10(%esi),%esi\n" 1785 "movdqa %xmm0,(%edi)\n" 1786 "lea 0x10(%edi),%edi\n" 1787 "sub $0x10,%ecx\n" 1788 "ja 2b\n" 1789 1790 "mov -0x1(%edi),%al\n" 1791 "mov %al,(%edi)\n" 1792 "pop %edi\n" 1793 "pop %esi\n" 1794 "ret\n" 1795 1796 "3:" 1797 "movdqa (%esi),%xmm0\n" 1798 "movdqa (%esi,%edx,1),%xmm2\n" 1799 "lea 0x10(%esi),%esi\n" 1800 "pavgb %xmm2,%xmm0\n" 1801 "movdqa %xmm0,(%edi)\n" 1802 "lea 0x10(%edi),%edi\n" 1803 "sub $0x10,%ecx\n" 1804 "ja 3b\n" 1805 1806 "mov -0x1(%edi),%al\n" 1807 "mov %al,(%edi)\n" 1808 "pop %edi\n" 1809 "pop %esi\n" 1810 "ret\n" 1811 ); 1812 1813 // Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version 1814 #define HAS_SCALEFILTERROWS_SSSE3 1815 extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr, 1816 const uint8* src_ptr, int src_stride, 1817 int dst_width, int source_y_fraction); 1818 asm( 1819 ".text\n" 1820 #if defined(OSX) 1821 ".globl _ScaleFilterRows_SSSE3\n" 1822 "_ScaleFilterRows_SSSE3:\n" 1823 #else 1824 ".global ScaleFilterRows_SSSE3\n" 1825 "ScaleFilterRows_SSSE3:\n" 1826 #endif 1827 "push %esi\n" 1828 "push %edi\n" 1829 "mov 0xc(%esp),%edi\n" 1830 "mov 0x10(%esp),%esi\n" 1831 "mov 0x14(%esp),%edx\n" 1832 "mov 0x18(%esp),%ecx\n" 1833 "mov 0x1c(%esp),%eax\n" 1834 "cmp $0x0,%eax\n" 1835 "je 2f\n" 1836 "cmp $0x80,%eax\n" 1837 "je 3f\n" 1838 "shr %eax\n" 1839 "mov %al,%ah\n" 1840 "neg %al\n" 1841 "add $0x80,%al\n" 1842 "movd %eax,%xmm7\n" 1843 "punpcklwd %xmm7,%xmm7\n" 1844 "pshufd $0x0,%xmm7,%xmm7\n" 1845 1846 "1:" 1847 "movdqa (%esi),%xmm0\n" 1848 "movdqa (%esi,%edx,1),%xmm2\n" 1849 "lea 0x10(%esi),%esi\n" 1850 "movdqa %xmm0,%xmm1\n" 1851 "punpcklbw %xmm2,%xmm0\n" 1852 "punpckhbw %xmm2,%xmm1\n" 1853 "pmaddubsw %xmm7,%xmm0\n" 1854 "pmaddubsw %xmm7,%xmm1\n" 1855 "psrlw $0x7,%xmm0\n" 1856 "psrlw $0x7,%xmm1\n" 1857 "packuswb %xmm1,%xmm0\n" 1858 "movdqa %xmm0,(%edi)\n" 1859 "lea 0x10(%edi),%edi\n" 1860 "sub $0x10,%ecx\n" 1861 "ja 1b\n" 1862 "mov -0x1(%edi),%al\n" 1863 "mov %al,(%edi)\n" 1864 "pop %edi\n" 1865 "pop %esi\n" 1866 "ret\n" 1867 1868 "2:" 1869 "movdqa (%esi),%xmm0\n" 1870 "lea 0x10(%esi),%esi\n" 1871 "movdqa %xmm0,(%edi)\n" 1872 "lea 0x10(%edi),%edi\n" 1873 "sub $0x10,%ecx\n" 1874 "ja 2b\n" 1875 "mov -0x1(%edi),%al\n" 1876 "mov %al,(%edi)\n" 1877 "pop %edi\n" 1878 "pop %esi\n" 1879 "ret\n" 1880 1881 "3:" 1882 "movdqa (%esi),%xmm0\n" 1883 "movdqa (%esi,%edx,1),%xmm2\n" 1884 "lea 0x10(%esi),%esi\n" 1885 "pavgb %xmm2,%xmm0\n" 1886 "movdqa %xmm0,(%edi)\n" 1887 "lea 0x10(%edi),%edi\n" 1888 "sub $0x10,%ecx\n" 1889 "ja 3b\n" 1890 "mov -0x1(%edi),%al\n" 1891 "mov %al,(%edi)\n" 1892 "pop %edi\n" 1893 "pop %esi\n" 1894 "ret\n" 1895 ); 1896 1897 #elif defined(__x86_64__) 1898 static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride, 1899 uint8* dst_ptr, int dst_width) { 1900 asm volatile( 1901 "lea (%3,%3,2),%%r10\n" 1902 "pxor %%xmm7,%%xmm7\n" 1903 "1:" 1904 "movdqa (%0),%%xmm0\n" 1905 "movdqa 0x10(%0),%%xmm1\n" 1906 "movdqa (%0,%3,1),%%xmm2\n" 1907 "movdqa 0x10(%0,%3,1),%%xmm3\n" 1908 "pavgb %%xmm2,%%xmm0\n" 1909 "pavgb %%xmm3,%%xmm1\n" 1910 "movdqa (%0,%3,2),%%xmm2\n" 1911 "movdqa 0x10(%0,%3,2),%%xmm3\n" 1912 "movdqa (%0,%%r10,1),%%xmm4\n" 1913 "movdqa 0x10(%0,%%r10,1),%%xmm5\n" 1914 "lea (%0,%3,4),%%r11\n" 1915 "lea 0x20(%0),%0\n" 1916 "pavgb %%xmm4,%%xmm2\n" 1917 "pavgb %%xmm5,%%xmm3\n" 1918 "pavgb %%xmm2,%%xmm0\n" 1919 "pavgb %%xmm3,%%xmm1\n" 1920 "movdqa 0x0(%%r11),%%xmm2\n" 1921 "movdqa 0x10(%%r11),%%xmm3\n" 1922 "movdqa 0x0(%%r11,%3,1),%%xmm4\n" 1923 "movdqa 0x10(%%r11,%3,1),%%xmm5\n" 1924 "pavgb %%xmm4,%%xmm2\n" 1925 "pavgb %%xmm5,%%xmm3\n" 1926 "movdqa 0x0(%%r11,%3,2),%%xmm4\n" 1927 "movdqa 0x10(%%r11,%3,2),%%xmm5\n" 1928 "movdqa 0x0(%%r11,%%r10,1),%%xmm6\n" 1929 "pavgb %%xmm6,%%xmm4\n" 1930 "movdqa 0x10(%%r11,%%r10,1),%%xmm6\n" 1931 "pavgb %%xmm6,%%xmm5\n" 1932 "pavgb %%xmm4,%%xmm2\n" 1933 "pavgb %%xmm5,%%xmm3\n" 1934 "pavgb %%xmm2,%%xmm0\n" 1935 "pavgb %%xmm3,%%xmm1\n" 1936 "psadbw %%xmm7,%%xmm0\n" 1937 "psadbw %%xmm7,%%xmm1\n" 1938 "pshufd $0xd8,%%xmm0,%%xmm0\n" 1939 "pshufd $0x8d,%%xmm1,%%xmm1\n" 1940 "por %%xmm1,%%xmm0\n" 1941 "psrlw $0x3,%%xmm0\n" 1942 "packuswb %%xmm0,%%xmm0\n" 1943 "packuswb %%xmm0,%%xmm0\n" 1944 "movd %%xmm0,(%1)\n" 
1945 "lea 0x4(%1),%1\n" 1946 "sub $0x4,%2\n" 1947 "ja 1b\n" 1948 : "+r"(src_ptr), // %0 1949 "+r"(dst_ptr), // %1 1950 "+r"(dst_width) // %2 1951 : "r"(static_cast<intptr_t>(src_stride)) // %3 1952 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", 1953 "xmm4", "xmm5", "xmm6", "xmm7" 1954 ); 1955 } 1956 1957 #define HAS_SCALEROWDOWN34_SSSE3 1958 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride, 1959 uint8* dst_ptr, int dst_width) { 1960 asm volatile( 1961 "movdqa (%3),%%xmm3\n" 1962 "movdqa (%4),%%xmm4\n" 1963 "movdqa (%5),%%xmm5\n" 1964 "1:" 1965 "movdqa (%0),%%xmm0\n" 1966 "movdqa 0x10(%0),%%xmm2\n" 1967 "lea 0x20(%0),%0\n" 1968 "movdqa %%xmm2,%%xmm1\n" 1969 "palignr $0x8,%%xmm0,%%xmm1\n" 1970 "pshufb %%xmm3,%%xmm0\n" 1971 "pshufb %%xmm4,%%xmm1\n" 1972 "pshufb %%xmm5,%%xmm2\n" 1973 "movq %%xmm0,(%1)\n" 1974 "movq %%xmm1,0x8(%1)\n" 1975 "movq %%xmm2,0x10(%1)\n" 1976 "lea 0x18(%1),%1\n" 1977 "sub $0x18,%2\n" 1978 "ja 1b\n" 1979 : "+r"(src_ptr), // %0 1980 "+r"(dst_ptr), // %1 1981 "+r"(dst_width) // %2 1982 : "r"(_shuf0), // %3 1983 "r"(_shuf1), // %4 1984 "r"(_shuf2) // %5 1985 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1986 ); 1987 } 1988 1989 static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride, 1990 uint8* dst_ptr, int dst_width) { 1991 asm volatile( 1992 "movdqa (%4),%%xmm2\n" // _shuf01 1993 "movdqa (%5),%%xmm3\n" // _shuf11 1994 "movdqa (%6),%%xmm4\n" // _shuf21 1995 "movdqa (%7),%%xmm5\n" // _madd01 1996 "movdqa (%8),%%xmm6\n" // _madd11 1997 "movdqa (%9),%%xmm7\n" // _round34 1998 "movdqa (%10),%%xmm8\n" // _madd21 1999 "1:" 2000 "movdqa (%0),%%xmm0\n" 2001 "movdqa (%0,%3),%%xmm1\n" 2002 "pavgb %%xmm1,%%xmm0\n" 2003 "pshufb %%xmm2,%%xmm0\n" 2004 "pmaddubsw %%xmm5,%%xmm0\n" 2005 "paddsw %%xmm7,%%xmm0\n" 2006 "psrlw $0x2,%%xmm0\n" 2007 "packuswb %%xmm0,%%xmm0\n" 2008 "movq %%xmm0,(%1)\n" 2009 "movdqu 0x8(%0),%%xmm0\n" 2010 "movdqu 0x8(%0,%3),%%xmm1\n" 2011 "pavgb %%xmm1,%%xmm0\n" 2012 "pshufb %%xmm3,%%xmm0\n" 2013 "pmaddubsw %%xmm6,%%xmm0\n" 2014 "paddsw %%xmm7,%%xmm0\n" 2015 "psrlw $0x2,%%xmm0\n" 2016 "packuswb %%xmm0,%%xmm0\n" 2017 "movq %%xmm0,0x8(%1)\n" 2018 "movdqa 0x10(%0),%%xmm0\n" 2019 "movdqa 0x10(%0,%3),%%xmm1\n" 2020 "lea 0x20(%0),%0\n" 2021 "pavgb %%xmm1,%%xmm0\n" 2022 "pshufb %%xmm4,%%xmm0\n" 2023 "pmaddubsw %%xmm8,%%xmm0\n" 2024 "paddsw %%xmm7,%%xmm0\n" 2025 "psrlw $0x2,%%xmm0\n" 2026 "packuswb %%xmm0,%%xmm0\n" 2027 "movq %%xmm0,0x10(%1)\n" 2028 "lea 0x18(%1),%1\n" 2029 "sub $0x18,%2\n" 2030 "ja 1b\n" 2031 : "+r"(src_ptr), // %0 2032 "+r"(dst_ptr), // %1 2033 "+r"(dst_width) // %2 2034 : "r"(static_cast<intptr_t>(src_stride)), // %3 2035 "r"(_shuf01), // %4 2036 "r"(_shuf11), // %5 2037 "r"(_shuf21), // %6 2038 "r"(_madd01), // %7 2039 "r"(_madd11), // %8 2040 "r"(_round34), // %9 2041 "r"(_madd21) // %10 2042 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", 2043 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8" 2044 ); 2045 } 2046 2047 static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride, 2048 uint8* dst_ptr, int dst_width) { 2049 asm volatile( 2050 "movdqa (%4),%%xmm2\n" // _shuf01 2051 "movdqa (%5),%%xmm3\n" // _shuf11 2052 "movdqa (%6),%%xmm4\n" // _shuf21 2053 "movdqa (%7),%%xmm5\n" // _madd01 2054 "movdqa (%8),%%xmm6\n" // _madd11 2055 "movdqa (%9),%%xmm7\n" // _round34 2056 "movdqa (%10),%%xmm8\n" // _madd21 2057 "1:" 2058 "movdqa (%0),%%xmm0\n" 2059 "movdqa (%0,%3,1),%%xmm1\n" 2060 "pavgb %%xmm0,%%xmm1\n" 2061 "pavgb %%xmm1,%%xmm0\n" 2062 "pshufb %%xmm2,%%xmm0\n" 2063 "pmaddubsw 
%%xmm5,%%xmm0\n" 2064 "paddsw %%xmm7,%%xmm0\n" 2065 "psrlw $0x2,%%xmm0\n" 2066 "packuswb %%xmm0,%%xmm0\n" 2067 "movq %%xmm0,(%1)\n" 2068 "movdqu 0x8(%0),%%xmm0\n" 2069 "movdqu 0x8(%0,%3,1),%%xmm1\n" 2070 "pavgb %%xmm0,%%xmm1\n" 2071 "pavgb %%xmm1,%%xmm0\n" 2072 "pshufb %%xmm3,%%xmm0\n" 2073 "pmaddubsw %%xmm6,%%xmm0\n" 2074 "paddsw %%xmm7,%%xmm0\n" 2075 "psrlw $0x2,%%xmm0\n" 2076 "packuswb %%xmm0,%%xmm0\n" 2077 "movq %%xmm0,0x8(%1)\n" 2078 "movdqa 0x10(%0),%%xmm0\n" 2079 "movdqa 0x10(%0,%3,1),%%xmm1\n" 2080 "lea 0x20(%0),%0\n" 2081 "pavgb %%xmm0,%%xmm1\n" 2082 "pavgb %%xmm1,%%xmm0\n" 2083 "pshufb %%xmm4,%%xmm0\n" 2084 "pmaddubsw %%xmm8,%%xmm0\n" 2085 "paddsw %%xmm7,%%xmm0\n" 2086 "psrlw $0x2,%%xmm0\n" 2087 "packuswb %%xmm0,%%xmm0\n" 2088 "movq %%xmm0,0x10(%1)\n" 2089 "lea 0x18(%1),%1\n" 2090 "sub $0x18,%2\n" 2091 "ja 1b\n" 2092 : "+r"(src_ptr), // %0 2093 "+r"(dst_ptr), // %1 2094 "+r"(dst_width) // %2 2095 : "r"(static_cast<intptr_t>(src_stride)), // %3 2096 "r"(_shuf01), // %4 2097 "r"(_shuf11), // %5 2098 "r"(_shuf21), // %6 2099 "r"(_madd01), // %7 2100 "r"(_madd11), // %8 2101 "r"(_round34), // %9 2102 "r"(_madd21) // %10 2103 : "memory", "xmm0", "xmm1", "xmm2", "xmm3", 2104 "xmm4", "xmm5", "xmm6", "xmm7", "xmm8" 2105 ); 2106 } 2107 2108 #define HAS_SCALEROWDOWN38_SSSE3 2109 static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride, 2110 uint8* dst_ptr, int dst_width) { 2111 asm volatile( 2112 "movdqa (%3),%%xmm5\n" 2113 "movdqa (%4),%%xmm6\n" 2114 "pxor %%xmm7,%%xmm7\n" 2115 "1:" 2116 "movdqa (%0),%%xmm0\n" 2117 "movdqa 0x10(%0),%%xmm1\n" 2118 "lea 0x20(%0),%0\n" 2119 "pshufb %%xmm5,%%xmm0\n" 2120 "pshufb %%xmm6,%%xmm1\n" 2121 "paddusb %%xmm1,%%xmm0\n" 2122 "movq %%xmm0,(%1)\n" 2123 "movhlps %%xmm0,%%xmm1\n" 2124 "movd %%xmm1,0x8(%1)\n" 2125 "lea 0xc(%1),%1\n" 2126 "sub $0xc,%2\n" 2127 "ja 1b\n" 2128 : "+r"(src_ptr), // %0 2129 "+r"(dst_ptr), // %1 2130 "+r"(dst_width) // %2 2131 : "r"(_shuf38a), // %3 2132 "r"(_shuf38b) // %4 2133 : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7" 2134 ); 2135 } 2136 2137 static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride, 2138 uint8* dst_ptr, int dst_width) { 2139 asm volatile( 2140 "movdqa (%4),%%xmm4\n" 2141 "movdqa (%5),%%xmm5\n" 2142 "movdqa (%6),%%xmm6\n" 2143 "pxor %%xmm7,%%xmm7\n" 2144 "1:" 2145 "movdqa (%0),%%xmm0\n" 2146 "movdqa (%0,%3,1),%%xmm2\n" 2147 "movhlps %%xmm0,%%xmm1\n" 2148 "movhlps %%xmm2,%%xmm3\n" 2149 "punpcklbw %%xmm7,%%xmm0\n" 2150 "punpcklbw %%xmm7,%%xmm1\n" 2151 "punpcklbw %%xmm7,%%xmm2\n" 2152 "punpcklbw %%xmm7,%%xmm3\n" 2153 "paddusw %%xmm2,%%xmm0\n" 2154 "paddusw %%xmm3,%%xmm1\n" 2155 "movdqa (%0,%3,2),%%xmm2\n" 2156 "lea 0x10(%0),%0\n" 2157 "movhlps %%xmm2,%%xmm3\n" 2158 "punpcklbw %%xmm7,%%xmm2\n" 2159 "punpcklbw %%xmm7,%%xmm3\n" 2160 "paddusw %%xmm2,%%xmm0\n" 2161 "paddusw %%xmm3,%%xmm1\n" 2162 "movdqa %%xmm0,%%xmm2\n" 2163 "psrldq $0x2,%%xmm0\n" 2164 "paddusw %%xmm0,%%xmm2\n" 2165 "psrldq $0x2,%%xmm0\n" 2166 "paddusw %%xmm0,%%xmm2\n" 2167 "pshufb %%xmm4,%%xmm2\n" 2168 "movdqa %%xmm1,%%xmm3\n" 2169 "psrldq $0x2,%%xmm1\n" 2170 "paddusw %%xmm1,%%xmm3\n" 2171 "psrldq $0x2,%%xmm1\n" 2172 "paddusw %%xmm1,%%xmm3\n" 2173 "pshufb %%xmm5,%%xmm3\n" 2174 "paddusw %%xmm3,%%xmm2\n" 2175 "pmulhuw %%xmm6,%%xmm2\n" 2176 "packuswb %%xmm2,%%xmm2\n" 2177 "movd %%xmm2,(%1)\n" 2178 "pextrw $0x2,%%xmm2,%%eax\n" 2179 "mov %%ax,0x4(%1)\n" 2180 "lea 0x6(%1),%1\n" 2181 "sub $0x6,%2\n" 2182 "ja 1b\n" 2183 : "+r"(src_ptr), // %0 2184 "+r"(dst_ptr), // %1 2185 "+r"(dst_width) // %2 2186 : 
"r"(static_cast<intptr_t>(src_stride)), // %3 2187 "r"(_shufac0), // %4 2188 "r"(_shufac3), // %5 2189 "r"(_scaleac3) // %6 2190 : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3", 2191 "xmm4", "xmm5", "xmm6", "xmm7" 2192 ); 2193 } 2194 2195 static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride, 2196 uint8* dst_ptr, int dst_width) { 2197 asm volatile( 2198 "movdqa (%4),%%xmm4\n" 2199 "movdqa (%5),%%xmm5\n" 2200 "movdqa (%6),%%xmm6\n" 2201 "movdqa (%7),%%xmm7\n" 2202 "1:" 2203 "movdqa (%0),%%xmm2\n" 2204 "pavgb (%0,%3,1),%%xmm2\n" 2205 "lea 0x10(%0),%0\n" 2206 "movdqa %%xmm2,%%xmm0\n" 2207 "pshufb %%xmm4,%%xmm0\n" 2208 "movdqa %%xmm2,%%xmm1\n" 2209 "pshufb %%xmm5,%%xmm1\n" 2210 "paddusw %%xmm1,%%xmm0\n" 2211 "pshufb %%xmm6,%%xmm2\n" 2212 "paddusw %%xmm2,%%xmm0\n" 2213 "pmulhuw %%xmm7,%%xmm0\n" 2214 "packuswb %%xmm0,%%xmm0\n" 2215 "movd %%xmm0,(%1)\n" 2216 "pextrw $0x2,%%xmm0,%%eax\n" 2217 "mov %%ax,0x4(%1)\n" 2218 "lea 0x6(%1),%1\n" 2219 "sub $0x6,%2\n" 2220 "ja 1b\n" 2221 : "+r"(src_ptr), // %0 2222 "+r"(dst_ptr), // %1 2223 "+r"(dst_width) // %2 2224 : "r"(static_cast<intptr_t>(src_stride)), // %3 2225 "r"(_shufab0), // %4 2226 "r"(_shufab1), // %5 2227 "r"(_shufab2), // %6 2228 "r"(_scaleab2) // %7 2229 : "memory", "rax", "xmm0", "xmm1", "xmm2", 2230 "xmm4", "xmm5", "xmm6", "xmm7" 2231 ); 2232 } 2233 2234 #define HAS_SCALEADDROWS_SSE2 2235 static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride, 2236 uint16* dst_ptr, int src_width, 2237 int src_height) { 2238 asm volatile( 2239 "pxor %%xmm7,%%xmm7\n" 2240 "1:" 2241 "movdqa (%0),%%xmm2\n" 2242 "lea (%0,%4,1),%%r10\n" 2243 "movhlps %%xmm2,%%xmm3\n" 2244 "lea -0x1(%3),%%r11\n" 2245 "punpcklbw %%xmm7,%%xmm2\n" 2246 "punpcklbw %%xmm7,%%xmm3\n" 2247 2248 "2:" 2249 "movdqa (%%r10),%%xmm0\n" 2250 "lea (%%r10,%4,1),%%r10\n" 2251 "movhlps %%xmm0,%%xmm1\n" 2252 "punpcklbw %%xmm7,%%xmm0\n" 2253 "punpcklbw %%xmm7,%%xmm1\n" 2254 "paddusw %%xmm0,%%xmm2\n" 2255 "paddusw %%xmm1,%%xmm3\n" 2256 "sub $0x1,%%r11\n" 2257 "ja 2b\n" 2258 2259 "movdqa %%xmm2,(%1)\n" 2260 "movdqa %%xmm3,0x10(%1)\n" 2261 "lea 0x20(%1),%1\n" 2262 "lea 0x10(%0),%0\n" 2263 "sub $0x10,%2\n" 2264 "ja 1b\n" 2265 : "+r"(src_ptr), // %0 2266 "+r"(dst_ptr), // %1 2267 "+r"(src_width), // %2 2268 "+r"(src_height) // %3 2269 : "r"(static_cast<intptr_t>(src_stride)) // %4 2270 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7" 2271 ); 2272 } 2273 2274 // Bilinear row filtering combines 16x2 -> 16x1. 
SSE2 version 2275 #define HAS_SCALEFILTERROWS_SSE2 2276 static void ScaleFilterRows_SSE2(uint8* dst_ptr, 2277 const uint8* src_ptr, int src_stride, 2278 int dst_width, int source_y_fraction) { 2279 if (source_y_fraction == 0) { 2280 asm volatile( 2281 "1:" 2282 "movdqa (%1),%%xmm0\n" 2283 "lea 0x10(%1),%1\n" 2284 "movdqa %%xmm0,(%0)\n" 2285 "lea 0x10(%0),%0\n" 2286 "sub $0x10,%2\n" 2287 "ja 1b\n" 2288 "mov -0x1(%0),%%al\n" 2289 "mov %%al,(%0)\n" 2290 : "+r"(dst_ptr), // %0 2291 "+r"(src_ptr), // %1 2292 "+r"(dst_width) // %2 2293 : 2294 : "memory", "rax", "xmm0" 2295 ); 2296 return; 2297 } else if (source_y_fraction == 128) { 2298 asm volatile( 2299 "1:" 2300 "movdqa (%1),%%xmm0\n" 2301 "movdqa (%1,%3,1),%%xmm2\n" 2302 "lea 0x10(%1),%1\n" 2303 "pavgb %%xmm2,%%xmm0\n" 2304 "movdqa %%xmm0,(%0)\n" 2305 "lea 0x10(%0),%0\n" 2306 "sub $0x10,%2\n" 2307 "ja 1b\n" 2308 "mov -0x1(%0),%%al\n" 2309 "mov %%al,(%0)\n" 2310 : "+r"(dst_ptr), // %0 2311 "+r"(src_ptr), // %1 2312 "+r"(dst_width) // %2 2313 : "r"(static_cast<intptr_t>(src_stride)) // %3 2314 : "memory", "rax", "xmm0", "xmm2" 2315 ); 2316 return; 2317 } else { 2318 asm volatile( 2319 "mov %3,%%eax\n" 2320 "movd %%eax,%%xmm6\n" 2321 "punpcklwd %%xmm6,%%xmm6\n" 2322 "pshufd $0x0,%%xmm6,%%xmm6\n" 2323 "neg %%eax\n" 2324 "add $0x100,%%eax\n" 2325 "movd %%eax,%%xmm5\n" 2326 "punpcklwd %%xmm5,%%xmm5\n" 2327 "pshufd $0x0,%%xmm5,%%xmm5\n" 2328 "pxor %%xmm7,%%xmm7\n" 2329 "1:" 2330 "movdqa (%1),%%xmm0\n" 2331 "movdqa (%1,%4,1),%%xmm2\n" 2332 "lea 0x10(%1),%1\n" 2333 "movdqa %%xmm0,%%xmm1\n" 2334 "movdqa %%xmm2,%%xmm3\n" 2335 "punpcklbw %%xmm7,%%xmm0\n" 2336 "punpcklbw %%xmm7,%%xmm2\n" 2337 "punpckhbw %%xmm7,%%xmm1\n" 2338 "punpckhbw %%xmm7,%%xmm3\n" 2339 "pmullw %%xmm5,%%xmm0\n" 2340 "pmullw %%xmm5,%%xmm1\n" 2341 "pmullw %%xmm6,%%xmm2\n" 2342 "pmullw %%xmm6,%%xmm3\n" 2343 "paddusw %%xmm2,%%xmm0\n" 2344 "paddusw %%xmm3,%%xmm1\n" 2345 "psrlw $0x8,%%xmm0\n" 2346 "psrlw $0x8,%%xmm1\n" 2347 "packuswb %%xmm1,%%xmm0\n" 2348 "movdqa %%xmm0,(%0)\n" 2349 "lea 0x10(%0),%0\n" 2350 "sub $0x10,%2\n" 2351 "ja 1b\n" 2352 "mov -0x1(%0),%%al\n" 2353 "mov %%al,(%0)\n" 2354 : "+r"(dst_ptr), // %0 2355 "+r"(src_ptr), // %1 2356 "+r"(dst_width), // %2 2357 "+r"(source_y_fraction) // %3 2358 : "r"(static_cast<intptr_t>(src_stride)) // %4 2359 : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3", 2360 "xmm5", "xmm6", "xmm7" 2361 ); 2362 } 2363 return; 2364 } 2365 2366 // Bilinear row filtering combines 16x2 -> 16x1. 
SSSE3 version 2367 #define HAS_SCALEFILTERROWS_SSSE3 2368 static void ScaleFilterRows_SSSE3(uint8* dst_ptr, 2369 const uint8* src_ptr, int src_stride, 2370 int dst_width, int source_y_fraction) { 2371 if (source_y_fraction == 0) { 2372 asm volatile( 2373 "1:" 2374 "movdqa (%1),%%xmm0\n" 2375 "lea 0x10(%1),%1\n" 2376 "movdqa %%xmm0,(%0)\n" 2377 "lea 0x10(%0),%0\n" 2378 "sub $0x10,%2\n" 2379 "ja 1b\n" 2380 "mov -0x1(%0),%%al\n" 2381 "mov %%al,(%0)\n" 2382 : "+r"(dst_ptr), // %0 2383 "+r"(src_ptr), // %1 2384 "+r"(dst_width) // %2 2385 : 2386 : "memory", "rax", "xmm0" 2387 ); 2388 return; 2389 } else if (source_y_fraction == 128) { 2390 asm volatile( 2391 "1:" 2392 "movdqa (%1),%%xmm0\n" 2393 "movdqa (%1,%3,1),%%xmm2\n" 2394 "lea 0x10(%1),%1\n" 2395 "pavgb %%xmm2,%%xmm0\n" 2396 "movdqa %%xmm0,(%0)\n" 2397 "lea 0x10(%0),%0\n" 2398 "sub $0x10,%2\n" 2399 "ja 1b\n" 2400 "mov -0x1(%0),%%al\n" 2401 "mov %%al,(%0)\n" 2402 : "+r"(dst_ptr), // %0 2403 "+r"(src_ptr), // %1 2404 "+r"(dst_width) // %2 2405 : "r"(static_cast<intptr_t>(src_stride)) // %3 2406 : "memory", "rax", "xmm0", "xmm2" 2407 ); 2408 return; 2409 } else { 2410 asm volatile( 2411 "mov %3,%%eax\n" 2412 "shr %%eax\n" 2413 "mov %%al,%%ah\n" 2414 "neg %%al\n" 2415 "add $0x80,%%al\n" 2416 "movd %%eax,%%xmm7\n" 2417 "punpcklwd %%xmm7,%%xmm7\n" 2418 "pshufd $0x0,%%xmm7,%%xmm7\n" 2419 "1:" 2420 "movdqa (%1),%%xmm0\n" 2421 "movdqa (%1,%4,1),%%xmm2\n" 2422 "lea 0x10(%1),%1\n" 2423 "movdqa %%xmm0,%%xmm1\n" 2424 "punpcklbw %%xmm2,%%xmm0\n" 2425 "punpckhbw %%xmm2,%%xmm1\n" 2426 "pmaddubsw %%xmm7,%%xmm0\n" 2427 "pmaddubsw %%xmm7,%%xmm1\n" 2428 "psrlw $0x7,%%xmm0\n" 2429 "psrlw $0x7,%%xmm1\n" 2430 "packuswb %%xmm1,%%xmm0\n" 2431 "movdqa %%xmm0,(%0)\n" 2432 "lea 0x10(%0),%0\n" 2433 "sub $0x10,%2\n" 2434 "ja 1b\n" 2435 "mov -0x1(%0),%%al\n" 2436 "mov %%al,(%0)\n" 2437 : "+r"(dst_ptr), // %0 2438 "+r"(src_ptr), // %1 2439 "+r"(dst_width), // %2 2440 "+r"(source_y_fraction) // %3 2441 : "r"(static_cast<intptr_t>(src_stride)) // %4 2442 : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7" 2443 ); 2444 } 2445 return; 2446 } 2447 #endif 2448 #endif 2449 2450 // CPU agnostic row functions 2451 static void ScaleRowDown2_C(const uint8* src_ptr, int, 2452 uint8* dst, int dst_width) { 2453 for (int x = 0; x < dst_width; ++x) { 2454 *dst++ = *src_ptr; 2455 src_ptr += 2; 2456 } 2457 } 2458 2459 static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride, 2460 uint8* dst, int dst_width) { 2461 for (int x = 0; x < dst_width; ++x) { 2462 *dst++ = (src_ptr[0] + src_ptr[1] + 2463 src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2; 2464 src_ptr += 2; 2465 } 2466 } 2467 2468 static void ScaleRowDown4_C(const uint8* src_ptr, int, 2469 uint8* dst, int dst_width) { 2470 for (int x = 0; x < dst_width; ++x) { 2471 *dst++ = *src_ptr; 2472 src_ptr += 4; 2473 } 2474 } 2475 2476 static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride, 2477 uint8* dst, int dst_width) { 2478 for (int x = 0; x < dst_width; ++x) { 2479 *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + 2480 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + 2481 src_ptr[src_stride + 2] + src_ptr[src_stride + 3] + 2482 src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] + 2483 src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] + 2484 src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] + 2485 src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] + 2486 8) >> 4; 2487 src_ptr += 4; 2488 } 2489 } 2490 2491 // 640 output pixels is enough to allow 5120 input pixels 
with 1/8 scale down. 2492 // Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu. 2493 static const int kMaxOutputWidth = 640; 2494 static const int kMaxRow12 = kMaxOutputWidth * 2; 2495 2496 static void ScaleRowDown8_C(const uint8* src_ptr, int, 2497 uint8* dst, int dst_width) { 2498 for (int x = 0; x < dst_width; ++x) { 2499 *dst++ = *src_ptr; 2500 src_ptr += 8; 2501 } 2502 } 2503 2504 // Note calling code checks width is less than max and if not 2505 // uses ScaleRowDown8_C instead. 2506 static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride, 2507 uint8* dst, int dst_width) { 2508 ALIGN16(uint8 src_row[kMaxRow12 * 2]); 2509 assert(dst_width <= kMaxOutputWidth); 2510 ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); 2511 ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, 2512 src_row + kMaxOutputWidth, 2513 dst_width * 2); 2514 ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width); 2515 } 2516 2517 static void ScaleRowDown34_C(const uint8* src_ptr, int, 2518 uint8* dst, int dst_width) { 2519 assert((dst_width % 3 == 0) && (dst_width > 0)); 2520 uint8* dend = dst + dst_width; 2521 do { 2522 dst[0] = src_ptr[0]; 2523 dst[1] = src_ptr[1]; 2524 dst[2] = src_ptr[3]; 2525 dst += 3; 2526 src_ptr += 4; 2527 } while (dst < dend); 2528 } 2529 2530 // Filter rows 0 and 1 together, 3 : 1 2531 static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride, 2532 uint8* d, int dst_width) { 2533 assert((dst_width % 3 == 0) && (dst_width > 0)); 2534 uint8* dend = d + dst_width; 2535 const uint8* s = src_ptr; 2536 const uint8* t = src_ptr + src_stride; 2537 do { 2538 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; 2539 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; 2540 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; 2541 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; 2542 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; 2543 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; 2544 d[0] = (a0 * 3 + b0 + 2) >> 2; 2545 d[1] = (a1 * 3 + b1 + 2) >> 2; 2546 d[2] = (a2 * 3 + b2 + 2) >> 2; 2547 d += 3; 2548 s += 4; 2549 t += 4; 2550 } while (d < dend); 2551 } 2552 2553 // Filter rows 1 and 2 together, 1 : 1 2554 static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride, 2555 uint8* d, int dst_width) { 2556 assert((dst_width % 3 == 0) && (dst_width > 0)); 2557 uint8* dend = d + dst_width; 2558 const uint8* s = src_ptr; 2559 const uint8* t = src_ptr + src_stride; 2560 do { 2561 uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; 2562 uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; 2563 uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; 2564 uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; 2565 uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; 2566 uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; 2567 d[0] = (a0 + b0 + 1) >> 1; 2568 d[1] = (a1 + b1 + 1) >> 1; 2569 d[2] = (a2 + b2 + 1) >> 1; 2570 d += 3; 2571 s += 4; 2572 t += 4; 2573 } while (d < dend); 2574 } 2575 2576 #if defined(HAS_SCALEFILTERROWS_SSE2) 2577 // Filter row to 3/4 2578 static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, 2579 int dst_width) { 2580 assert((dst_width % 3 == 0) && (dst_width > 0)); 2581 uint8* dend = dst_ptr + dst_width; 2582 const uint8* s = src_ptr; 2583 do { 2584 dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2; 2585 dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1; 2586 dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2; 2587 dst_ptr += 3; 2588 s += 4; 2589 } while (dst_ptr < dend); 2590 } 2591 #endif 2592 2593 static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, 2594 int dst_width, int dx) { 2595 
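// dx is a 16.16 fixed-point step through the source row: the top 16 bits of x
// select the left source pixel and the low 16 bits weight the pair of pixels
// being blended. For example, dx = 0x18000 (1.5) maps 4 output pixels onto
// 6 source pixels.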
int x = 0; 2596 for (int j = 0; j < dst_width; ++j) { 2597 int xi = x >> 16; 2598 int xf1 = x & 0xffff; 2599 int xf0 = 65536 - xf1; 2600 2601 *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16; 2602 x += dx; 2603 } 2604 } 2605 2606 static const int kMaxInputWidth = 2560; 2607 #if defined(HAS_SCALEFILTERROWS_SSE2) 2608 #define HAS_SCALEROWDOWN34_SSE2 2609 // Filter rows 0 and 1 together, 3 : 1 2610 static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride, 2611 uint8* dst_ptr, int dst_width) { 2612 assert((dst_width % 3 == 0) && (dst_width > 0)); 2613 ALIGN16(uint8 row[kMaxInputWidth]); 2614 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 2615 256 / 4); 2616 ScaleFilterCols34_C(dst_ptr, row, dst_width); 2617 } 2618 2619 // Filter rows 1 and 2 together, 1 : 1 2620 static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride, 2621 uint8* dst_ptr, int dst_width) { 2622 assert((dst_width % 3 == 0) && (dst_width > 0)); 2623 ALIGN16(uint8 row[kMaxInputWidth]); 2624 ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2); 2625 ScaleFilterCols34_C(dst_ptr, row, dst_width); 2626 } 2627 #endif 2628 2629 static void ScaleRowDown38_C(const uint8* src_ptr, int, 2630 uint8* dst, int dst_width) { 2631 assert(dst_width % 3 == 0); 2632 for (int x = 0; x < dst_width; x += 3) { 2633 dst[0] = src_ptr[0]; 2634 dst[1] = src_ptr[3]; 2635 dst[2] = src_ptr[6]; 2636 dst += 3; 2637 src_ptr += 8; 2638 } 2639 } 2640 2641 // 8x3 -> 3x1 2642 static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride, 2643 uint8* dst_ptr, int dst_width) { 2644 assert((dst_width % 3 == 0) && (dst_width > 0)); 2645 for (int i = 0; i < dst_width; i+=3) { 2646 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + 2647 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + 2648 src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] + 2649 src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) * 2650 (65536 / 9) >> 16; 2651 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + 2652 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + 2653 src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] + 2654 src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) * 2655 (65536 / 9) >> 16; 2656 dst_ptr[2] = (src_ptr[6] + src_ptr[7] + 2657 src_ptr[src_stride + 6] + src_ptr[src_stride + 7] + 2658 src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) * 2659 (65536 / 6) >> 16; 2660 src_ptr += 8; 2661 dst_ptr += 3; 2662 } 2663 } 2664 2665 // 8x2 -> 3x1 2666 static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride, 2667 uint8* dst_ptr, int dst_width) { 2668 assert((dst_width % 3 == 0) && (dst_width > 0)); 2669 for (int i = 0; i < dst_width; i+=3) { 2670 dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + 2671 src_ptr[src_stride + 0] + src_ptr[src_stride + 1] + 2672 src_ptr[src_stride + 2]) * (65536 / 6) >> 16; 2673 dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + 2674 src_ptr[src_stride + 3] + src_ptr[src_stride + 4] + 2675 src_ptr[src_stride + 5]) * (65536 / 6) >> 16; 2676 dst_ptr[2] = (src_ptr[6] + src_ptr[7] + 2677 src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) * 2678 (65536 / 4) >> 16; 2679 src_ptr += 8; 2680 dst_ptr += 3; 2681 } 2682 } 2683 2684 // C version 8x2 -> 8x1 2685 static void ScaleFilterRows_C(uint8* dst_ptr, 2686 const uint8* src_ptr, int src_stride, 2687 int dst_width, int source_y_fraction) { 2688 assert(dst_width > 0); 2689 int y1_fraction = source_y_fraction; 2690 int y0_fraction = 256 - y1_fraction; 2691 const uint8* 
src_ptr1 = src_ptr + src_stride; 2692 uint8* end = dst_ptr + dst_width; 2693 do { 2694 dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; 2695 dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; 2696 dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; 2697 dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; 2698 dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8; 2699 dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8; 2700 dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8; 2701 dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8; 2702 src_ptr += 8; 2703 src_ptr1 += 8; 2704 dst_ptr += 8; 2705 } while (dst_ptr < end); 2706 dst_ptr[0] = dst_ptr[-1]; 2707 } 2708 2709 void ScaleAddRows_C(const uint8* src_ptr, int src_stride, 2710 uint16* dst_ptr, int src_width, int src_height) { 2711 assert(src_width > 0); 2712 assert(src_height > 0); 2713 for (int x = 0; x < src_width; ++x) { 2714 const uint8* s = src_ptr + x; 2715 int sum = 0; 2716 for (int y = 0; y < src_height; ++y) { 2717 sum += s[0]; 2718 s += src_stride; 2719 } 2720 dst_ptr[x] = sum; 2721 } 2722 } 2723 2724 /** 2725 * Scale plane, 1/2 2726 * 2727 * This is an optimized version for scaling down a plane to 1/2 of 2728 * its original size. 2729 * 2730 */ 2731 static void ScalePlaneDown2(int src_width, int src_height, 2732 int dst_width, int dst_height, 2733 int src_stride, int dst_stride, 2734 const uint8* src_ptr, uint8* dst_ptr, 2735 FilterMode filtering) { 2736 assert(src_width % 2 == 0); 2737 assert(src_height % 2 == 0); 2738 void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride, 2739 uint8* dst_ptr, int dst_width); 2740 2741 #if defined(HAS_SCALEROWDOWN2_NEON) 2742 if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && 2743 (dst_width % 16 == 0) && (src_stride % 16 == 0) && 2744 (dst_stride % 16 == 0) && 2745 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) { 2746 ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; 2747 } else 2748 #endif 2749 #if defined(HAS_SCALEROWDOWN2_SSE2) 2750 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && 2751 (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) && 2752 IS_ALIGNED(dst_ptr, 16)) { 2753 ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; 2754 } else 2755 #endif 2756 { 2757 ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; 2758 } 2759 2760 for (int y = 0; y < dst_height; ++y) { 2761 ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); 2762 src_ptr += (src_stride << 1); 2763 dst_ptr += dst_stride; 2764 } 2765 } 2766 2767 /** 2768 * Scale plane, 1/4 2769 * 2770 * This is an optimized version for scaling down a plane to 1/4 of 2771 * its original size. 2772 */ 2773 static void ScalePlaneDown4(int src_width, int src_height, 2774 int dst_width, int dst_height, 2775 int src_stride, int dst_stride, 2776 const uint8* src_ptr, uint8* dst_ptr, 2777 FilterMode filtering) { 2778 assert(src_width % 4 == 0); 2779 assert(src_height % 4 == 0); 2780 void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride, 2781 uint8* dst_ptr, int dst_width); 2782 2783 #if defined(HAS_SCALEROWDOWN4_NEON) 2784 if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) && 2785 (dst_width % 2 == 0) && (src_stride % 8 == 0) && 2786 IS_ALIGNED(src_ptr, 8)) { 2787 ScaleRowDown4 = filtering ? 
ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; 2788 } else 2789 #endif 2790 #if defined(HAS_SCALEROWDOWN4_SSE2) 2791 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && 2792 (dst_width % 8 == 0) && (src_stride % 16 == 0) && 2793 (dst_stride % 8 == 0) && 2794 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { 2795 ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; 2796 } else 2797 #endif 2798 { 2799 ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; 2800 } 2801 2802 for (int y = 0; y < dst_height; ++y) { 2803 ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); 2804 src_ptr += (src_stride << 2); 2805 dst_ptr += dst_stride; 2806 } 2807 } 2808 2809 /** 2810 * Scale plane, 1/8 2811 * 2812 * This is an optimized version for scaling down a plane to 1/8 2813 * of its original size. 2814 * 2815 */ 2816 static void ScalePlaneDown8(int src_width, int src_height, 2817 int dst_width, int dst_height, 2818 int src_stride, int dst_stride, 2819 const uint8* src_ptr, uint8* dst_ptr, 2820 FilterMode filtering) { 2821 assert(src_width % 8 == 0); 2822 assert(src_height % 8 == 0); 2823 void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride, 2824 uint8* dst_ptr, int dst_width); 2825 #if defined(HAS_SCALEROWDOWN8_SSE2) 2826 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && 2827 (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth && 2828 (src_stride % 16 == 0) && (dst_stride % 16 == 0) && 2829 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) { 2830 ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; 2831 } else 2832 #endif 2833 { 2834 ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ? 2835 ScaleRowDown8Int_C : ScaleRowDown8_C; 2836 } 2837 for (int y = 0; y < dst_height; ++y) { 2838 ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); 2839 src_ptr += (src_stride << 3); 2840 dst_ptr += dst_stride; 2841 } 2842 } 2843 2844 /** 2845 * Scale plane down, 3/4 2846 * 2847 * Provided by Frank Barchard (fbarchard (at) google.com) 2848 * 2849 */ 2850 static void ScalePlaneDown34(int src_width, int src_height, 2851 int dst_width, int dst_height, 2852 int src_stride, int dst_stride, 2853 const uint8* src_ptr, uint8* dst_ptr, 2854 FilterMode filtering) { 2855 assert(dst_width % 3 == 0); 2856 void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride, 2857 uint8* dst_ptr, int dst_width); 2858 void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride, 2859 uint8* dst_ptr, int dst_width); 2860 #if defined(HAS_SCALEROWDOWN34_SSSE3) 2861 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && 2862 (dst_width % 24 == 0) && (src_stride % 16 == 0) && 2863 (dst_stride % 8 == 0) && 2864 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { 2865 if (!filtering) { 2866 ScaleRowDown34_0 = ScaleRowDown34_SSSE3; 2867 ScaleRowDown34_1 = ScaleRowDown34_SSSE3; 2868 } else { 2869 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; 2870 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; 2871 } 2872 } else 2873 #endif 2874 #if defined(HAS_SCALEROWDOWN34_SSE2) 2875 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && 2876 (dst_width % 24 == 0) && (src_stride % 16 == 0) && 2877 (dst_stride % 8 == 0) && 2878 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) && 2879 filtering) { 2880 ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; 2881 ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; 2882 } else 2883 #endif 2884 { 2885 if (!filtering) { 2886 ScaleRowDown34_0 = ScaleRowDown34_C; 2887 ScaleRowDown34_1 = ScaleRowDown34_C; 2888 } else { 2889 ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; 2890 
ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; 2891 } 2892 } 2893 int src_row = 0; 2894 for (int y = 0; y < dst_height; ++y) { 2895 switch (src_row) { 2896 case 0: 2897 ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); 2898 break; 2899 2900 case 1: 2901 ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width); 2902 break; 2903 2904 case 2: 2905 ScaleRowDown34_0(src_ptr + src_stride, -src_stride, 2906 dst_ptr, dst_width); 2907 break; 2908 } 2909 ++src_row; 2910 src_ptr += src_stride; 2911 dst_ptr += dst_stride; 2912 if (src_row >= 3) { 2913 src_ptr += src_stride; 2914 src_row = 0; 2915 } 2916 } 2917 } 2918 2919 /** 2920 * Scale plane, 3/8 2921 * 2922 * This is an optimized version for scaling down a plane to 3/8 2923 * of its original size. 2924 * 2925 * Reduces 16x3 to 6x1 2926 */ 2927 static void ScalePlaneDown38(int src_width, int src_height, 2928 int dst_width, int dst_height, 2929 int src_stride, int dst_stride, 2930 const uint8* src_ptr, uint8* dst_ptr, 2931 FilterMode filtering) { 2932 assert(dst_width % 3 == 0); 2933 void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride, 2934 uint8* dst_ptr, int dst_width); 2935 void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride, 2936 uint8* dst_ptr, int dst_width); 2937 #if defined(HAS_SCALEROWDOWN38_SSSE3) 2938 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && 2939 (dst_width % 24 == 0) && (src_stride % 16 == 0) && 2940 (dst_stride % 8 == 0) && 2941 IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) { 2942 if (!filtering) { 2943 ScaleRowDown38_3 = ScaleRowDown38_SSSE3; 2944 ScaleRowDown38_2 = ScaleRowDown38_SSSE3; 2945 } else { 2946 ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; 2947 ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; 2948 } 2949 } else 2950 #endif 2951 { 2952 if (!filtering) { 2953 ScaleRowDown38_3 = ScaleRowDown38_C; 2954 ScaleRowDown38_2 = ScaleRowDown38_C; 2955 } else { 2956 ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; 2957 ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; 2958 } 2959 } 2960 int src_row = 0; 2961 for (int y = 0; y < dst_height; ++y) { 2962 switch (src_row) { 2963 case 0: 2964 case 1: 2965 ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width); 2966 src_ptr += src_stride * 3; 2967 ++src_row; 2968 break; 2969 2970 case 2: 2971 ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width); 2972 src_ptr += src_stride * 2; 2973 src_row = 0; 2974 break; 2975 } 2976 dst_ptr += dst_stride; 2977 } 2978 } 2979 2980 inline static uint32 SumBox(int iboxwidth, int iboxheight, 2981 int src_stride, const uint8* src_ptr) { 2982 assert(iboxwidth > 0); 2983 assert(iboxheight > 0); 2984 uint32 sum = 0u; 2985 for (int y = 0; y < iboxheight; ++y) { 2986 for (int x = 0; x < iboxwidth; ++x) { 2987 sum += src_ptr[x]; 2988 } 2989 src_ptr += src_stride; 2990 } 2991 return sum; 2992 } 2993 2994 static void ScalePlaneBoxRow(int dst_width, int boxheight, 2995 int dx, int src_stride, 2996 const uint8* src_ptr, uint8* dst_ptr) { 2997 int x = 0; 2998 for (int i = 0; i < dst_width; ++i) { 2999 int ix = x >> 16; 3000 x += dx; 3001 int boxwidth = (x >> 16) - ix; 3002 *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / 3003 (boxwidth * boxheight); 3004 } 3005 } 3006 3007 inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { 3008 assert(iboxwidth > 0); 3009 uint32 sum = 0u; 3010 for (int x = 0; x < iboxwidth; ++x) { 3011 sum += src_ptr[x]; 3012 } 3013 return sum; 3014 } 3015 3016 static void ScaleAddCols2_C(int dst_width, int boxheight, int dx, 3017 const uint16* src_ptr, uint8* dst_ptr) { 3018 int 
scaletbl[2]; 3019 int minboxwidth = (dx >> 16); 3020 scaletbl[0] = 65536 / (minboxwidth * boxheight); 3021 scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); 3022 int *scaleptr = scaletbl - minboxwidth; 3023 int x = 0; 3024 for (int i = 0; i < dst_width; ++i) { 3025 int ix = x >> 16; 3026 x += dx; 3027 int boxwidth = (x >> 16) - ix; 3028 *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; 3029 } 3030 } 3031 3032 static void ScaleAddCols1_C(int dst_width, int boxheight, int dx, 3033 const uint16* src_ptr, uint8* dst_ptr) { 3034 int boxwidth = (dx >> 16); 3035 int scaleval = 65536 / (boxwidth * boxheight); 3036 int x = 0; 3037 for (int i = 0; i < dst_width; ++i) { 3038 *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; 3039 x += boxwidth; 3040 } 3041 } 3042 3043 /** 3044 * Scale plane down to any dimensions, with interpolation. 3045 * (boxfilter). 3046 * 3047 * Same method as SimpleScale, which is fixed point, outputting 3048 * one pixel of destination using fixed point (16.16) to step 3049 * through source, sampling a box of pixel with simple 3050 * averaging. 3051 */ 3052 static void ScalePlaneBox(int src_width, int src_height, 3053 int dst_width, int dst_height, 3054 int src_stride, int dst_stride, 3055 const uint8* src_ptr, uint8* dst_ptr) { 3056 assert(dst_width > 0); 3057 assert(dst_height > 0); 3058 int dy = (src_height << 16) / dst_height; 3059 int dx = (src_width << 16) / dst_width; 3060 if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) || 3061 dst_height * 2 > src_height) { 3062 uint8* dst = dst_ptr; 3063 int dy = (src_height << 16) / dst_height; 3064 int dx = (src_width << 16) / dst_width; 3065 int y = 0; 3066 for (int j = 0; j < dst_height; ++j) { 3067 int iy = y >> 16; 3068 const uint8* const src = src_ptr + iy * src_stride; 3069 y += dy; 3070 if (y > (src_height << 16)) { 3071 y = (src_height << 16); 3072 } 3073 int boxheight = (y >> 16) - iy; 3074 ScalePlaneBoxRow(dst_width, boxheight, 3075 dx, src_stride, 3076 src, dst); 3077 3078 dst += dst_stride; 3079 } 3080 } else { 3081 ALIGN16(uint16 row[kMaxInputWidth]); 3082 void (*ScaleAddRows)(const uint8* src_ptr, int src_stride, 3083 uint16* dst_ptr, int src_width, int src_height); 3084 void (*ScaleAddCols)(int dst_width, int boxheight, int dx, 3085 const uint16* src_ptr, uint8* dst_ptr); 3086 #if defined(HAS_SCALEADDROWS_SSE2) 3087 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && 3088 (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && 3089 (src_width % 16) == 0) { 3090 ScaleAddRows = ScaleAddRows_SSE2; 3091 } else 3092 #endif 3093 { 3094 ScaleAddRows = ScaleAddRows_C; 3095 } 3096 if (dx & 0xffff) { 3097 ScaleAddCols = ScaleAddCols2_C; 3098 } else { 3099 ScaleAddCols = ScaleAddCols1_C; 3100 } 3101 3102 int y = 0; 3103 for (int j = 0; j < dst_height; ++j) { 3104 int iy = y >> 16; 3105 const uint8* const src = src_ptr + iy * src_stride; 3106 y += dy; 3107 if (y > (src_height << 16)) { 3108 y = (src_height << 16); 3109 } 3110 int boxheight = (y >> 16) - iy; 3111 ScaleAddRows(src, src_stride, row, src_width, boxheight); 3112 ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr); 3113 dst_ptr += dst_stride; 3114 } 3115 } 3116 } 3117 3118 /** 3119 * Scale plane to/from any dimensions, with interpolation. 
3120 */ 3121 static void ScalePlaneBilinearSimple(int src_width, int src_height, 3122 int dst_width, int dst_height, 3123 int src_stride, int dst_stride, 3124 const uint8* src_ptr, uint8* dst_ptr) { 3125 uint8* dst = dst_ptr; 3126 int dx = (src_width << 16) / dst_width; 3127 int dy = (src_height << 16) / dst_height; 3128 int maxx = ((src_width - 1) << 16) - 1; 3129 int maxy = ((src_height - 1) << 16) - 1; 3130 int y = (dst_height < src_height) ? 32768 : 3131 (src_height << 16) / dst_height - 32768; 3132 for (int i = 0; i < dst_height; ++i) { 3133 int cy = (y < 0) ? 0 : y; 3134 int yi = cy >> 16; 3135 int yf = cy & 0xffff; 3136 const uint8* const src = src_ptr + yi * src_stride; 3137 int x = (dst_width < src_width) ? 32768 : 3138 (src_width << 16) / dst_width - 32768; 3139 for (int j = 0; j < dst_width; ++j) { 3140 int cx = (x < 0) ? 0 : x; 3141 int xi = cx >> 16; 3142 int xf = cx & 0xffff; 3143 int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16; 3144 int r1 = (src[xi + src_stride] * (65536 - xf) + 3145 src[xi + src_stride + 1] * xf) >> 16; 3146 *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16; 3147 x += dx; 3148 if (x > maxx) 3149 x = maxx; 3150 } 3151 dst += dst_stride - dst_width; 3152 y += dy; 3153 if (y > maxy) 3154 y = maxy; 3155 } 3156 } 3157 3158 /** 3159 * Scale plane to/from any dimensions, with bilinear 3160 * interpolation. 3161 */ 3162 static void ScalePlaneBilinear(int src_width, int src_height, 3163 int dst_width, int dst_height, 3164 int src_stride, int dst_stride, 3165 const uint8* src_ptr, uint8* dst_ptr) { 3166 assert(dst_width > 0); 3167 assert(dst_height > 0); 3168 int dy = (src_height << 16) / dst_height; 3169 int dx = (src_width << 16) / dst_width; 3170 if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) { 3171 ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height, 3172 src_stride, dst_stride, src_ptr, dst_ptr); 3173 3174 } else { 3175 ALIGN16(uint8 row[kMaxInputWidth + 1]); 3176 void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr, 3177 int src_stride, 3178 int dst_width, int source_y_fraction); 3179 void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, 3180 int dst_width, int dx); 3181 #if defined(HAS_SCALEFILTERROWS_SSSE3) 3182 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) && 3183 (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && 3184 (src_width % 16) == 0) { 3185 ScaleFilterRows = ScaleFilterRows_SSSE3; 3186 } else 3187 #endif 3188 #if defined(HAS_SCALEFILTERROWS_SSE2) 3189 if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) && 3190 (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) && 3191 (src_width % 16) == 0) { 3192 ScaleFilterRows = ScaleFilterRows_SSE2; 3193 } else 3194 #endif 3195 { 3196 ScaleFilterRows = ScaleFilterRows_C; 3197 } 3198 ScaleFilterCols = ScaleFilterCols_C; 3199 3200 int y = 0; 3201 int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows. 3202 for (int j = 0; j < dst_height; ++j) { 3203 int iy = y >> 16; 3204 int fy = (y >> 8) & 255; 3205 const uint8* const src = src_ptr + iy * src_stride; 3206 ScaleFilterRows(row, src, src_stride, src_width, fy); 3207 ScaleFilterCols(dst_ptr, row, dst_width, dx); 3208 dst_ptr += dst_stride; 3209 y += dy; 3210 if (y > maxy) { 3211 y = maxy; 3212 } 3213 } 3214 } 3215 } 3216 3217 /** 3218 * Scale plane to/from any dimensions, without interpolation. 3219 * Fixed point math is used for performance: The upper 16 bits 3220 * of x and dx is the integer part of the source position and 3221 * the lower 16 bits are the fixed decimal part. 
3222 */ 3223 static void ScalePlaneSimple(int src_width, int src_height, 3224 int dst_width, int dst_height, 3225 int src_stride, int dst_stride, 3226 const uint8* src_ptr, uint8* dst_ptr) { 3227 uint8* dst = dst_ptr; 3228 int dx = (src_width << 16) / dst_width; 3229 for (int y = 0; y < dst_height; ++y) { 3230 const uint8* const src = src_ptr + (y * src_height / dst_height) * 3231 src_stride; 3232 // TODO(fbarchard): Round X coordinate by setting x=0x8000. 3233 int x = 0; 3234 for (int i = 0; i < dst_width; ++i) { 3235 *dst++ = src[x >> 16]; 3236 x += dx; 3237 } 3238 dst += dst_stride - dst_width; 3239 } 3240 } 3241 3242 /** 3243 * Scale plane to/from any dimensions. 3244 */ 3245 static void ScalePlaneAnySize(int src_width, int src_height, 3246 int dst_width, int dst_height, 3247 int src_stride, int dst_stride, 3248 const uint8* src_ptr, uint8* dst_ptr, 3249 FilterMode filtering) { 3250 if (!filtering) { 3251 ScalePlaneSimple(src_width, src_height, dst_width, dst_height, 3252 src_stride, dst_stride, src_ptr, dst_ptr); 3253 } else { 3254 // fall back to non-optimized version 3255 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, 3256 src_stride, dst_stride, src_ptr, dst_ptr); 3257 } 3258 } 3259 3260 /** 3261 * Scale plane down, any size 3262 * 3263 * This is an optimized version for scaling down a plane to any size. 3264 * The current implementation is ~10 times faster compared to the 3265 * reference implementation for e.g. XGA->LowResPAL 3266 * 3267 */ 3268 static void ScalePlaneDown(int src_width, int src_height, 3269 int dst_width, int dst_height, 3270 int src_stride, int dst_stride, 3271 const uint8* src_ptr, uint8* dst_ptr, 3272 FilterMode filtering) { 3273 if (!filtering) { 3274 ScalePlaneSimple(src_width, src_height, dst_width, dst_height, 3275 src_stride, dst_stride, src_ptr, dst_ptr); 3276 } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) { 3277 // between 1/2x and 1x use bilinear 3278 ScalePlaneBilinear(src_width, src_height, dst_width, dst_height, 3279 src_stride, dst_stride, src_ptr, dst_ptr); 3280 } else { 3281 ScalePlaneBox(src_width, src_height, dst_width, dst_height, 3282 src_stride, dst_stride, src_ptr, dst_ptr); 3283 } 3284 } 3285 3286 /** 3287 * Copy plane, no scaling 3288 * 3289 * This simply copies the given plane without scaling. 3290 * The current implementation is ~115 times faster 3291 * compared to the reference implementation. 3292 * 3293 */ 3294 static void CopyPlane(int src_width, int src_height, 3295 int dst_width, int dst_height, 3296 int src_stride, int dst_stride, 3297 const uint8* src_ptr, uint8* dst_ptr) { 3298 if (src_stride == src_width && dst_stride == dst_width) { 3299 // All contiguous, so can use REALLY fast path. 3300 memcpy(dst_ptr, src_ptr, src_width * src_height); 3301 } else { 3302 // Not all contiguous; must copy scanlines individually 3303 const uint8* src = src_ptr; 3304 uint8* dst = dst_ptr; 3305 for (int i = 0; i < src_height; ++i) { 3306 memcpy(dst, src, src_width); 3307 dst += dst_stride; 3308 src += src_stride; 3309 } 3310 } 3311 } 3312 3313 static void ScalePlane(const uint8* src, int src_stride, 3314 int src_width, int src_height, 3315 uint8* dst, int dst_stride, 3316 int dst_width, int dst_height, 3317 FilterMode filtering, bool use_ref) { 3318 // Use specialized scales to improve performance for common resolutions. 3319 // For example, all the 1/2 scalings will use ScalePlaneDown2() 3320 if (dst_width == src_width && dst_height == src_height) { 3321 // Straight copy. 
3322 CopyPlane(src_width, src_height, dst_width, dst_height, src_stride, 3323 dst_stride, src, dst); 3324 } else if (dst_width <= src_width && dst_height <= src_height) { 3325 // Scale down. 3326 if (use_ref) { 3327 // For testing, allow the optimized versions to be disabled. 3328 ScalePlaneDown(src_width, src_height, dst_width, dst_height, 3329 src_stride, dst_stride, src, dst, filtering); 3330 } else if (4 * dst_width == 3 * src_width && 3331 4 * dst_height == 3 * src_height) { 3332 // optimized, 3/4 3333 ScalePlaneDown34(src_width, src_height, dst_width, dst_height, 3334 src_stride, dst_stride, src, dst, filtering); 3335 } else if (2 * dst_width == src_width && 2 * dst_height == src_height) { 3336 // optimized, 1/2 3337 ScalePlaneDown2(src_width, src_height, dst_width, dst_height, 3338 src_stride, dst_stride, src, dst, filtering); 3339 // 3/8 rounded up for odd sized chroma height. 3340 } else if (8 * dst_width == 3 * src_width && 3341 dst_height == ((src_height * 3 + 7) / 8)) { 3342 // optimized, 3/8 3343 ScalePlaneDown38(src_width, src_height, dst_width, dst_height, 3344 src_stride, dst_stride, src, dst, filtering); 3345 } else if (4 * dst_width == src_width && 4 * dst_height == src_height) { 3346 // optimized, 1/4 3347 ScalePlaneDown4(src_width, src_height, dst_width, dst_height, 3348 src_stride, dst_stride, src, dst, filtering); 3349 } else if (8 * dst_width == src_width && 8 * dst_height == src_height) { 3350 // optimized, 1/8 3351 ScalePlaneDown8(src_width, src_height, dst_width, dst_height, 3352 src_stride, dst_stride, src, dst, filtering); 3353 } else { 3354 // Arbitrary downsample 3355 ScalePlaneDown(src_width, src_height, dst_width, dst_height, 3356 src_stride, dst_stride, src, dst, filtering); 3357 } 3358 } else { 3359 // Arbitrary scale up and/or down. 3360 ScalePlaneAnySize(src_width, src_height, dst_width, dst_height, 3361 src_stride, dst_stride, src, dst, filtering); 3362 } 3363 } 3364 3365 /** 3366 * Scale a plane. 3367 * 3368 * This function in turn calls a scaling function 3369 * suitable for handling the desired resolutions. 3370 * 3371 */ 3372 3373 int I420Scale(const uint8* src_y, int src_stride_y, 3374 const uint8* src_u, int src_stride_u, 3375 const uint8* src_v, int src_stride_v, 3376 int src_width, int src_height, 3377 uint8* dst_y, int dst_stride_y, 3378 uint8* dst_u, int dst_stride_u, 3379 uint8* dst_v, int dst_stride_v, 3380 int dst_width, int dst_height, 3381 FilterMode filtering) { 3382 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || 3383 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { 3384 return -1; 3385 } 3386 // Negative height means invert the image. 
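// For example, src_height = -480 with src_stride_y = 640 flips the Y plane:
// src_y is repointed at the start of the bottom row (479 * 640 bytes in) and
// the strides are negated, so rows are read bottom-up; the chroma pointers get
// the same adjustment using the rounded-up half height (240).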
3387 if (src_height < 0) { 3388 src_height = -src_height; 3389 int halfheight = (src_height + 1) >> 1; 3390 src_y = src_y + (src_height - 1) * src_stride_y; 3391 src_u = src_u + (halfheight - 1) * src_stride_u; 3392 src_v = src_v + (halfheight - 1) * src_stride_v; 3393 src_stride_y = -src_stride_y; 3394 src_stride_u = -src_stride_u; 3395 src_stride_v = -src_stride_v; 3396 } 3397 int halfsrc_width = (src_width + 1) >> 1; 3398 int halfsrc_height = (src_height + 1) >> 1; 3399 int halfdst_width = (dst_width + 1) >> 1; 3400 int halfoheight = (dst_height + 1) >> 1; 3401 3402 ScalePlane(src_y, src_stride_y, src_width, src_height, 3403 dst_y, dst_stride_y, dst_width, dst_height, 3404 filtering, use_reference_impl_); 3405 ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, 3406 dst_u, dst_stride_u, halfdst_width, halfoheight, 3407 filtering, use_reference_impl_); 3408 ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, 3409 dst_v, dst_stride_v, halfdst_width, halfoheight, 3410 filtering, use_reference_impl_); 3411 return 0; 3412 } 3413 3414 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, 3415 int src_stride_y, int src_stride_u, int src_stride_v, 3416 int src_width, int src_height, 3417 uint8* dst_y, uint8* dst_u, uint8* dst_v, 3418 int dst_stride_y, int dst_stride_u, int dst_stride_v, 3419 int dst_width, int dst_height, 3420 bool interpolate) { 3421 if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || 3422 !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { 3423 return -1; 3424 } 3425 // Negative height means invert the image. 3426 if (src_height < 0) { 3427 src_height = -src_height; 3428 int halfheight = (src_height + 1) >> 1; 3429 src_y = src_y + (src_height - 1) * src_stride_y; 3430 src_u = src_u + (halfheight - 1) * src_stride_u; 3431 src_v = src_v + (halfheight - 1) * src_stride_v; 3432 src_stride_y = -src_stride_y; 3433 src_stride_u = -src_stride_u; 3434 src_stride_v = -src_stride_v; 3435 } 3436 int halfsrc_width = (src_width + 1) >> 1; 3437 int halfsrc_height = (src_height + 1) >> 1; 3438 int halfdst_width = (dst_width + 1) >> 1; 3439 int halfoheight = (dst_height + 1) >> 1; 3440 FilterMode filtering = interpolate ? kFilterBox : kFilterNone; 3441 3442 ScalePlane(src_y, src_stride_y, src_width, src_height, 3443 dst_y, dst_stride_y, dst_width, dst_height, 3444 filtering, use_reference_impl_); 3445 ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height, 3446 dst_u, dst_stride_u, halfdst_width, halfoheight, 3447 filtering, use_reference_impl_); 3448 ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height, 3449 dst_v, dst_stride_v, halfdst_width, halfoheight, 3450 filtering, use_reference_impl_); 3451 return 0; 3452 } 3453 3454 int Scale(const uint8* src, int src_width, int src_height, 3455 uint8* dst, int dst_width, int dst_height, int ooffset, 3456 bool interpolate) { 3457 if (!src || src_width <= 0 || src_height <= 0 || 3458 !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 || 3459 ooffset >= dst_height) { 3460 return -1; 3461 } 3462 ooffset = ooffset & ~1; // chroma requires offset to multiple of 2. 
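// The pointer math below assumes a fully packed I420 buffer: a Y plane of
// src_width * src_height bytes followed immediately by U and V planes of
// halfsrc_width * halfsrc_height bytes each, with every plane's stride equal
// to its width. ooffset positions the scaled image ooffset luma rows
// (ooffset / 2 chroma rows) down inside dst, leaving aheight rows of actual
// output.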
3463 int halfsrc_width = (src_width + 1) >> 1; 3464 int halfsrc_height = (src_height + 1) >> 1; 3465 int halfdst_width = (dst_width + 1) >> 1; 3466 int halfoheight = (dst_height + 1) >> 1; 3467 int aheight = dst_height - ooffset * 2; // actual output height 3468 const uint8* const iyptr = src; 3469 uint8* oyptr = dst + ooffset * dst_width; 3470 const uint8* const iuptr = src + src_width * src_height; 3471 uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width; 3472 const uint8* const ivptr = src + src_width * src_height + 3473 halfsrc_width * halfsrc_height; 3474 uint8* ovptr = dst + dst_width * dst_height + halfdst_width * halfoheight + 3475 (ooffset >> 1) * halfdst_width; 3476 return Scale(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width, 3477 src_width, src_height, oyptr, ouptr, ovptr, dst_width, 3478 halfdst_width, halfdst_width, dst_width, aheight, interpolate); 3479 } 3480 3481 } // namespace libyuv 3482
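// A minimal usage sketch for the single-buffer Scale() above: downscaling a
// packed 640x480 I420 frame to 320x240. The buffer names and literal sizes
// here are illustrative assumptions, not part of the library.
//
//   uint8 src_vga[640 * 480 * 3 / 2];    // Y plane, then U, then V
//   uint8 dst_qvga[320 * 240 * 3 / 2];
//   // ... fill src_vga ...
//   int ret = libyuv::Scale(src_vga, 640, 480,
//                           dst_qvga, 320, 240,
//                           0,      // ooffset: no vertical letterbox offset
//                           true);  // interpolate: selects kFilterBox
//   // ret is 0 on success, -1 on invalid arguments.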