/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
    defined(_MSC_VER) && !defined(__clang__)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filters, added before the >> 2.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
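// The kScaleAc33/kScaleAb2 words divide box sums via pmulhuw: for a word
// sum s and box size d, (s * (65536 / d)) >> 16 approximates s / d
// (truncating). A minimal scalar sketch of the idea; DivByBoxSize is a
// hypothetical helper, not part of the library:
//   static uint16 DivByBoxSize(uint16 sum, int boxsize) {
//     return (uint16)((sum * (65536 / boxsize)) >> 16);  // truncating divide
//   }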
// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked)
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]    // src_ptr
                                 // src_stride ignored
    mov        edx, [esp + 12]   // dst_ptr
    mov        ecx, [esp + 16]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8           // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]    // src_ptr
                                 // src_stride
    mov        edx, [esp + 12]   // dst_ptr
    mov        ecx, [esp + 16]   // dst_width
    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]

    movdqa     xmm2, xmm0        // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]    // src_ptr
                                 // src_stride ignored
    mov        edx, [esp + 12]   // dst_ptr
    mov        ecx, [esp + 16]   // dst_width

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpsrlw     ymm0, ymm0, 8     // isolate odd pixels.
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    vzeroupper
    ret
  }
}
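// The Box filters average with pavgb/pavgw, which round at every step.
// A scalar model of one ScaleRowDown2Box output pixel, assuming this
// hypothetical avg2 helper (illustrative only, not how the library is
// implemented):
//   static uint8 avg2(uint8 a, uint8 b) { return (uint8)((a + b + 1) >> 1); }
//   dst[x] = avg2(avg2(row0[2 * x], row1[2 * x]),
//                 avg2(row0[2 * x + 1], row1[2 * x + 1]));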
// Blends 64x1 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]    // src_ptr
                                 // src_stride
    mov        edx, [esp + 12]   // dst_ptr
    mov        ecx, [esp + 16]   // dst_width

    vpcmpeqb   ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw     ymm4, ymm4, 15
    vpackuswb  ymm4, ymm4, ymm4
    vpxor      ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]

    vpmaddubsw ymm0, ymm0, ymm4  // average horizontally
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw     ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw     ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb

    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    vzeroupper
    ret
  }
}

// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width

    vpcmpeqb   ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw     ymm4, ymm4, 15
    vpackuswb  ymm4, ymm4, ymm4
    vpxor      ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu    ymm0, [eax]           // average rows
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    lea        eax,  [eax + 64]

    vpmaddubsw ymm0, ymm0, ymm4      // average horizontally
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw     ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw     ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked)
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]    // src_ptr
                                 // src_stride ignored
    mov        edx, [esp + 12]   // dst_ptr
    mov        ecx, [esp + 16]   // dst_width
    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}
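// Note on the AVX2 linear paths above: vpmaddubsw against a vector of 1s
// sums each horizontal byte pair into a word (a + b), and vpavgw with zero
// then computes (a + b + 1) >> 1, i.e. a correctly rounded 2-tap average.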
// Blends 32x4 rectangle to 8x1.
__declspec(naked)
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqu     xmm0, [eax]           // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    movdqu     xmm4, [eax + edi]
    movdqu     xmm5, [eax + edi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]    // src_ptr
                                 // src_stride ignored
    mov        edx, [esp + 12]   // dst_ptr
    mov        ecx, [esp + 16]   // dst_width
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld     ymm5, ymm5, 24
    vpslld     ymm5, ymm5, 16

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw     ymm0, ymm0, 8
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu    [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    vzeroupper
    ret
  }
}
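// The 4x box above is three levels of pairwise rounded averages, not one
// exact 16-sample mean, so results can differ from the true mean by a
// small rounding bias. Scalar model of each level: (a + b + 1) >> 1.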
// Blends 64x4 rectangle to 16x1.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb   ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
    vpsrlw     ymm7, ymm7, 8

  wloop:
    vmovdqu    ymm0, [eax]           // average rows
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vmovdqu    ymm2, [eax + esi * 2]
    vmovdqu    ymm3, [eax + esi * 2 + 32]
    vpavgb     ymm2, ymm2, [eax + edi]
    vpavgb     ymm3, ymm3, [eax + edi + 32]
    lea        eax,  [eax + 64]
    vpavgb     ymm0, ymm0, ymm2
    vpavgb     ymm1, ymm1, ymm3

    vpand      ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
    vpand      ymm3, ymm1, ymm7
    vpsrlw     ymm0, ymm0, 8
    vpsrlw     ymm1, ymm1, 8
    vpavgw     ymm0, ymm0, ymm2
    vpavgw     ymm1, ymm1, ymm3
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vpand      ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
    vpsrlw     ymm0, ymm0, 8
    vpavgw     ymm0, ymm0, ymm2
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu    [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]    // src_ptr
                                 // src_stride ignored
    mov        edx, [esp + 12]   // dst_ptr
    mov        ecx, [esp + 16]   // dst_width
    movdqa     xmm3, kShuf0
    movdqa     xmm4, kShuf1
    movdqa     xmm5, kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34
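// Each 3/4-scale output pixel below blends two neighboring source bytes
// with weights 3:1, 2:2 or 1:3 (the kMadd* tables), then rounds and
// shifts. Scalar model of the 3:1 case: dst = (3 * a + b + 2) >> 2.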
// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}
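// ScaleRowDown34_0_Box weights the two source rows 3:1 by chaining the
// rounded averages above: pavgb(row0, pavgb(row1, row0)) approximates
// (3 * row0 + row1) / 4, whereas the _1_Box variant weights them 1:1.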
// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]    // src_ptr
                                 // src_stride ignored
    mov        edx, [esp + 12]   // dst_ptr
    mov        ecx, [esp + 16]   // dst_width
    movdqa     xmm4, kShuf38a
    movdqa     xmm5, kShuf38b

  xloop:
    movdqu     xmm0, [eax]       // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea        eax,  [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAc
    movdqa     xmm3, kShufAc3
    movdqa     xmm4, kScaleAc33
    pxor       xmm5, xmm5

  xloop:
    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax,  [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6           // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAb0
    movdqa     xmm3, kShufAb1
    movdqa     xmm4, kShufAb2
    movdqa     xmm5, kScaleAb2

  xloop:
    movdqu     xmm0, [eax]           // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax,  [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1           // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked)
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov        eax, [esp + 4]   // src_ptr
    mov        edx, [esp + 8]   // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    pxor       xmm5, xmm5

    // sum rows
  xloop:
    movdqu     xmm3, [eax]       // read 16 bytes
    lea        eax,  [eax + 16]
    movdqu     xmm0, [edx]       // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2        // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0       // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}
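// The ScaleAddRow accumulators use saturating adds (paddusw/vpaddusw), so
// very tall boxes clamp at 65535 rather than wrapping. Scalar model:
//   uint32 s = dst_ptr[i] + src_ptr[i];
//   dst_ptr[i] = (uint16)(s > 65535 ? 65535 : s);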
#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked)
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov        eax, [esp + 4]   // src_ptr
    mov        edx, [esp + 8]   // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    vpxor      ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu    ymm3, [eax]       // read 32 bytes
    lea        eax,  [eax + 32]
    vpermq     ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw   ymm0, ymm2, [edx]       // sum 16 words
    vpaddusw   ymm1, ymm3, [edx + 32]
    vmovdqu    [edx], ymm0             // write 32 words to destination
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 32
    jg         xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Bilinear column filtering. SSSE3 version.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]    // dst_ptr
    mov        esi, [esp + 12 + 8]    // src_ptr
    mov        ecx, [esp + 12 + 12]   // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000        // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6             // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1           // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2             // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0             // x0 x1
    punpckldq  xmm3, xmm3             // dx dx
    paddd      xmm3, xmm3             // dx * 2, dx * 2
    pextrw     edx, xmm2, 3           // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2             // x0, x1 fractions.
    paddd      xmm2, xmm3             // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9                // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5             // 0011
    punpcklwd  xmm0, xmm4
    pxor       xmm1, xmm6             // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1             // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    psrlw      xmm0, 7                // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0             // 8 bits, 2 pixels.
    movd       ebx, xmm0
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2                 // 2 pixels
    jge        xloop2

  xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9                // 7 bit fractions.
    pshufb     xmm2, xmm5             // 0011
    pxor       xmm2, xmm6             // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2             // 16 bit
    psrlw      xmm0, 7                // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0             // 8 bits
    movd       ebx, xmm0
    mov        [edi], bl

  xloop99:
    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
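// ScaleFilterCols_SSSE3 steps x in 16.16 fixed point and keeps the top 7
// fraction bits (psrlw 9) as a blend weight f. Roughly, per output pixel:
//   dst[i] = (src[xi] * (f ^ 0x7f) + src[xi + 1] * f) >> 7;
// where xi = x >> 16 and f = (x >> 9) & 0x7f.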
// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
                                 // src_stride ignored
    mov        edx, [esp + 12]   // dst_argb
    mov        ecx, [esp + 16]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
                                 // src_stride ignored
    mov        edx, [esp + 12]   // dst_argb
    mov        ecx, [esp + 16]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}
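// The ARGB down2 paths treat each ARGB pixel as one 32-bit lane: shufps
// with 0x88 gathers even pixels and 0xdd gathers odd pixels, so a single
// pavgb then averages whole pixels channel by channel.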
// Blends four 2x2 to 4x1.
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2             // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0             // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88       // even pixels
    shufps     xmm2, xmm1, 0xdd       // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
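// ScaleARGBRowDownEvenBox gathers 2x2 boxes at src_stepx pixel strides,
// then reuses the down2 box pattern: pavgb the two rows, shufps to split
// even/odd pixels, pavgb the columns.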
// Column scaling unfiltered. SSE2 version.
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx 0 dx 0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0, dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0, dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
  xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4                 // 4 pixels
    jge        xloop4

  xloop49:
    test       ecx, 2
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

  xloop29:
    test       ecx, 1
    je         xloop99

    // 1 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0

  xloop99:
    pop        esi
    pop        edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, kShuffleColARGB
    movdqa     xmm5, kShuffleFractions
    pcmpeqb    xmm6, xmm6            // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1          // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2            // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0            // x0 x1
    punpckldq  xmm3, xmm3            // dx dx
    paddd      xmm3, xmm3            // dx * 2, dx * 2
    pextrw     edx, xmm2, 3          // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2            // x0, x1 fractions.
    paddd      xmm2, xmm3            // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9               // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5            // 0000000011111111
    pshufb     xmm0, xmm4            // arrange pixels into pairs
    pxor       xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1            // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1          // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3          // get x1 integer. next iteration.
    psrlw      xmm0, 7               // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0            // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2                // 2 pixels
    jge        xloop2

  xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9               // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5            // 00000000
    pshufb     xmm0, xmm4            // arrange pixels into pairs
    pxor       xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2            // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0            // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
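// ScaleARGBFilterCols interleaves the bytes of the two source pixels
// channel by channel (kShuffleColARGB) so one pmaddubsw blends all four
// channels at once. Roughly, per channel c with 7-bit fraction f:
//   out_c = (a_c * (f ^ 0x7f) + b_c * f) >> 7;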
// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_argb
    mov        eax, [esp + 8]    // src_argb
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    mov        ecx, [esp + 8]    // denom
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif