/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant for the 3/4 box filters
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};

static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                         6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                         6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0, 0};
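
// Note on the shuffle tables above: pshufb zeroes any destination byte whose
// control byte has the high bit set, so the 128 entries mean "write 0":
//
//   dst[i] = (shuf[i] & 0x80) ? 0 : src[shuf[i] & 15];
//
// That is how these tables compact selected source bytes and clear the rest.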

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8* dst_ptr,
                                           int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8  // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                                                 ptrdiff_t src_stride,
                                                 uint8* dst_ptr,
                                                 int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8* dst_ptr,
                                              int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width

    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    packuswb xmm4, xmm4
    pxor xmm5, xmm5  // constant 0

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add
    paddw xmm1, xmm3
    psrlw xmm0, 1
    psrlw xmm1, 1
    pavgw xmm0, xmm5  // (x + 1) / 2
    pavgw xmm1, xmm5
    packuswb xmm0, xmm1
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    pop esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr,
                                          int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    vzeroupper
    ret
  }
}
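
// Reference sketch of the horizontal add used by the Linear and Box scalers
// here: pmaddubsw against the 0x0101 constant multiplies each unsigned byte
// by 1 and sums adjacent pairs into words, and pavgw against zero rounds the
// halving. Per output pixel, roughly:
//
//   uint16 sum = src[2 * i] + src[2 * i + 1];  // pmaddubsw by 0x0101
//   dst[i] = (uint8)((sum + 1) >> 1);          // pavgw with 0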

// Blends 64x1 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                                                ptrdiff_t src_stride,
                                                uint8* dst_ptr,
                                                int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width

    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8* dst_ptr,
                                             int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width

    vpcmpeqb ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw ymm4, ymm4, 15
    vpackuswb ymm4, ymm4, ymm4
    vpxor ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add
    vpaddw ymm1, ymm1, ymm3
    vpsrlw ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw ymm1, ymm1, 1
    vpavgw ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8* dst_ptr,
                                           int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop

    ret
  }
}
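
// The 4x Box versions below sum a 4x4 block into a word, then add 8 and
// shift right by 4, a rounded divide by 16. For example, a block holding
// fifteen 0s and one 255 averages to (255 + 8) >> 4 = 16, matching
// round(255 / 16).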

// Blends 32x4 rectangle to 8x1.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8* dst_ptr,
                                              int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm4, xmm4  // constant 0x0101
    psrlw xmm4, 15
    movdqa xmm5, xmm4
    packuswb xmm4, xmm4
    psllw xmm5, 3  // constant 0x0008

  wloop:
    movdqu xmm0, [eax]  // average rows
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    pmaddubsw xmm0, xmm4  // horizontal add
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // vertical add rows 0, 1
    paddw xmm1, xmm3
    movdqu xmm2, [eax + esi * 2]
    movdqu xmm3, [eax + esi * 2 + 16]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 2
    paddw xmm1, xmm3
    movdqu xmm2, [eax + edi]
    movdqu xmm3, [eax + edi + 16]
    lea eax, [eax + 32]
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    paddw xmm0, xmm2  // add row 3
    paddw xmm1, xmm3
    phaddw xmm0, xmm1
    paddw xmm0, xmm5  // + 8 for round
    psrlw xmm0, 4  // /16 for average of 4 * 4
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg wloop

    pop edi
    pop esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr,
                                          int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld ymm5, ymm5, 24
    vpslld ymm5, ymm5, 16

  wloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpand ymm0, ymm0, ymm5
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw ymm0, ymm0, 8
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    vzeroupper
    ret
  }
}
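
// Why the vpermq: 256-bit vpackuswb (and vphaddw) operate on each 128-bit
// lane independently, leaving the result interleaved as qwords 0,2,1,3 of
// the desired order. vpermq with immediate 0xd8 reads qwords 0,2,1,3, which
// restores linear order before the store.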

// Blends 64x4 rectangle to 16x1.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8* dst_ptr,
                                             int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]  // src_ptr
    mov esi, [esp + 8 + 8]  // src_stride
    mov edx, [esp + 8 + 12]  // dst_ptr
    mov ecx, [esp + 8 + 16]  // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw ymm4, ymm4, 15
    vpsllw ymm5, ymm4, 3  // constant 0x0008
    vpackuswb ymm4, ymm4, ymm4

  wloop:
    vmovdqu ymm0, [eax]  // average rows
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + esi]
    vmovdqu ymm3, [eax + esi + 32]
    vpmaddubsw ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + esi * 2]
    vmovdqu ymm3, [eax + esi * 2 + 32]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 2
    vpaddw ymm1, ymm1, ymm3
    vmovdqu ymm2, [eax + edi]
    vmovdqu ymm3, [eax + edi + 32]
    lea eax, [eax + 64]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw ymm0, ymm0, ymm2  // add row 3
    vpaddw ymm1, ymm1, ymm3
    vphaddw ymm0, ymm0, ymm1  // mutates
    vpermq ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg wloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8* dst_ptr,
                                            int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm3, xmmword ptr kShuf0
    movdqa xmm4, xmmword ptr kShuf1
    movdqa xmm5, xmmword ptr kShuf2

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    ret
  }
}
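
// Filter arithmetic in the 3/4 Box versions below: the kShufNN tables pair
// each output with its two source neighbors and the kMaddNN tables supply
// the taps, so after pmaddubsw, paddsw with kRound34 (+2) and psrlw by 2,
// each output is one of
//
//   out = (3 * a + 1 * b + 2) >> 2  // outer taps
//   out = (2 * a + 2 * b + 2) >> 2  // center tap, i.e. (a + b + 1) >> 1
//
// a rounded 3:1 or 1:1 blend of adjacent source pixels.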

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palignr may be better than movdqu.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    pop esi
    ret
  }
}

// Note that movdqa+palignr may be better than movdqu.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShuf01
    movdqa xmm3, xmmword ptr kShuf11
    movdqa xmm4, xmmword ptr kShuf21
    movdqa xmm5, xmmword ptr kMadd01
    movdqa xmm6, xmmword ptr kMadd11
    movdqa xmm7, xmmword ptr kRound34

  wloop:
    movdqu xmm0, [eax]  // pixels 0..7
    movdqu xmm1, [eax + esi]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]  // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqu xmm0, [eax + 16]  // pixels 16..23
    movdqu xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, xmmword ptr kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    pop esi
    ret
  }
}
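
// Row weighting in the two Box variants above: _1_Box blends the two source
// rows 1:1 with a single pavgb, while _0_Box averages twice,
//
//   t = (row1 + row0 + 1) >> 1;  // pavgb xmm1, xmm0
//   out = (row0 + t + 1) >> 1;   // pavgb xmm0, xmm1
//
// which weights the rows approximately 3:1 in favor of row0.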

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8* dst_ptr,
                                            int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov edx, [esp + 12]  // dst_ptr
    mov ecx, [esp + 16]  // dst_width
    movdqa xmm4, xmmword ptr kShuf38a
    movdqa xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    sub ecx, 12
    jg xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAc
    movdqa xmm3, xmmword ptr kShufAc3
    movdqa xmm4, xmmword ptr kScaleAc33
    pxor xmm5, xmm5

  xloop:
    movdqu xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqu xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7

    movdqa xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2

    movdqa xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7

    pmulhuw xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6

    movd [edx], xmm6  // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop

    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_ptr
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_ptr
    mov ecx, [esp + 4 + 16]  // dst_width
    movdqa xmm2, xmmword ptr kShufAb0
    movdqa xmm3, xmmword ptr kShufAb1
    movdqa xmm4, xmmword ptr kShufAb2
    movdqa xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu xmm0, [eax]  // average 2 rows into xmm0
    movdqu xmm1, [eax + esi]
    lea eax, [eax + 16]
    pavgb xmm0, xmm1

    movdqa xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0

    pmulhuw xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1

    movd [edx], xmm1  // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    sub ecx, 6
    jg xloop

    pop esi
    ret
  }
}
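
// The divides in the 38 samplers above are done with pmulhuw, which keeps
// the high 16 bits of an unsigned 16x16 multiply, so multiplying a box sum
// by 65536 / n approximates sum / n. For example, with kScaleAb2 a two-pixel
// sum of 255 + 253 = 508 maps to (508 * 32768) >> 16 = 254. Because the
// reciprocals are truncated, results can come out one below the exact
// quotient.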

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
                                        uint16* dst_ptr,
                                        int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    pxor xmm5, xmm5

    // sum rows
  xloop:
    movdqu xmm3, [eax]  // read 16 bytes
    lea eax, [eax + 16]
    movdqu xmm0, [edx]  // read 16 words from destination
    movdqu xmm1, [edx + 16]
    movdqa xmm2, xmm3
    punpcklbw xmm2, xmm5
    punpckhbw xmm3, xmm5
    paddusw xmm0, xmm2  // sum 16 words
    paddusw xmm1, xmm3
    movdqu [edx], xmm0  // write 16 words to destination
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 16
    jg xloop
    ret
  }
}

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
                                        uint16* dst_ptr,
                                        int src_width) {
  __asm {
    mov eax, [esp + 4]  // src_ptr
    mov edx, [esp + 8]  // dst_ptr
    mov ecx, [esp + 12]  // src_width
    vpxor ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu ymm3, [eax]  // read 32 bytes
    lea eax, [eax + 32]
    vpermq ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw ymm1, ymm3, [edx + 32]
    vmovdqu [edx], ymm0  // write 32 words to destination
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 32
    jg xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};
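
// How the constants above feed the bilinear filter below: pmaddubsw treats
// its second operand as signed, so pixels are biased by -128 (kFsub80) to
// avoid saturation, and the bias is removed together with the rounding term
// by one paddw. With a 7-bit fraction f and source pixels a, b:
//
//   t = (128 - f) * (a - 128) + f * (b - 128)  // pmaddubsw
//     = ((128 - f) * a + f * b) - 128 * 128
//   out = (t + 0x4000 + 0x40) >> 7             // paddw kFadd40, psrlw 7
//
// where 0x4000 undoes the 128 * 128 bias and 0x40 rounds the final shift.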

// Bilinear column filtering. SSSE3 version.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
                                             const uint8* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]  // dst_ptr
    mov esi, [esp + 12 + 8]  // src_ptr
    mov ecx, [esp + 12 + 12]  // dst_width
    movd xmm2, [esp + 12 + 16]  // x
    movd xmm3, [esp + 12 + 20]  // dx
    mov eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pcmpeqb xmm7, xmm7  // generate 0x0001
    psrlw xmm7, 15
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9  // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5  // 0011
    punpcklwd xmm0, xmm4
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm1, xmm6  // 0..7f and 7f..0
    paddusb xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    paddw xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm1, xmm1  // 8 bits, 2 pixels.
    movd ebx, xmm1
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:
    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9  // 7 bit fractions.
    pshufb xmm2, xmm5  // 0011
    psubb xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor xmm2, xmm6  // 0..7f and 7f..0
    paddusb xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw xmm2, xmm0  // 16 bit
    paddw xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb xmm2, xmm2  // 8 bits
    movd ebx, xmm2
    mov [edi], bl

  xloop99:

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr,
                                         const uint8* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_ptr
    mov eax, [esp + 8]  // src_ptr
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8* dst_argb,
                                              int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8* dst_argb,
                                                    int dst_width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov edx, [esp + 12]  // dst_argb
    mov ecx, [esp + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    ret
  }
}
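
// Pixel selection in the ARGB down samplers: with 4-byte ARGB pixels kept as
// dwords, shufps with immediate 0x88 keeps dwords 0 and 2 of each operand
// (even pixels) and 0xdd keeps dwords 1 and 3 (odd pixels), roughly
//
//   even = {a0, a2, b0, b2};  // shufps xmm0, xmm1, 0x88
//   odd  = {a1, a3, b1, b3};  // shufps xmm2, xmm1, 0xdd
//
// and pavgb of the two halves then completes the 2x1 box.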

// Blends 8x2 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8* dst_argb,
                                                 int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]  // src_argb
    mov esi, [esp + 4 + 8]  // src_stride
    mov edx, [esp + 4 + 12]  // dst_argb
    mov ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8* dst_argb,
                                                 int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov ebx, [esp + 8 + 12]  // src_stepx
    mov edx, [esp + 8 + 16]  // dst_argb
    mov ecx, [esp + 8 + 20]  // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8* dst_argb,
                                                    int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]  // src_argb
    mov esi, [esp + 12 + 8]  // src_stride
    mov ebx, [esp + 12 + 12]  // src_stepx
    mov edx, [esp + 12 + 16]  // dst_argb
    mov ecx, [esp + 12 + 20]  // dst_width
    lea esi, [eax + esi]  // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2  // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88  // even pixels
    shufps xmm2, xmm1, 0xdd  // odd pixels
    pavgb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}
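
// The column scalers below step x in 16.16 fixed point: x advances by dx per
// output pixel and the integer part indexes the source, roughly
//
//   dst[i] = src[(x + i * dx) >> 16];
//
// The pextrw instructions extract word 1 of a 32-bit x lane, which is
// exactly that >> 16 integer part.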

// Column scaling unfiltered. SSE2 version.
__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
                                          const uint8* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx

    pshufd xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11  // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0  // x3 x2 x1 x0
    paddd xmm3, xmm3  // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw eax, xmm2, 1  // get x0 integer.
    pextrw edx, xmm2, 3  // get x1 integer.

    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    pextrw edx, xmm2, 7  // get x3 integer.
    paddd xmm2, xmm3  // x += dx
    punpckldq xmm0, xmm1  // x0 x1

    movd xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4  // 4 pixels
    jge xloop4

  xloop49:
    test ecx, 2
    je xloop29

    // 2 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw eax, xmm2, 5  // get x2 integer.
    punpckldq xmm0, xmm1  // x0 x1

    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 Pixels.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd dword ptr [edi], xmm0
  xloop99:

    pop esi
    pop edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
                                                 const uint8* src_argb,
                                                 int dst_width,
                                                 int x,
                                                 int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]  // dst_argb
    mov esi, [esp + 8 + 8]  // src_argb
    mov ecx, [esp + 8 + 12]  // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, xmmword ptr kShuffleColARGB
    movdqa xmm5, xmmword ptr kShuffleFractions
    pcmpeqb xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1  // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2  // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0  // x0 x1
    punpckldq xmm3, xmm3  // dx dx
    paddd xmm3, xmm3  // dx * 2, dx * 2
    pextrw edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa xmm1, xmm2  // x0, x1 fractions.
    paddd xmm2, xmm3  // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9  // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5  // 0000000011111111
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2  // 2 pixels
    jge xloop2

  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9  // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5  // 00000000
    pshufb xmm0, xmm4  // arrange pixels into pairs
    pxor xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd [edi], xmm0

  xloop99:

    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
                                             const uint8* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov edx, [esp + 4]  // dst_argb
    mov eax, [esp + 8]  // src_argb
    mov ecx, [esp + 12]  // dst_width

  wloop:
    movdqu xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg wloop

    ret
  }
}
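
// Equivalent C for the fixed point helpers below, for reference (the
// function names here are illustrative only):
//
//   int fixed_div(int num, int div) {  // FixedDiv_X86
//     return (int)((((int64)num) << 16) / div);
//   }
//   int fixed_div1(int num, int div) {  // FixedDiv1_X86
//     return (int)(((((int64)num) << 16) - 0x00010001) / (div - 1));
//   }
//
// The sub/sbb pair below performs the 64-bit subtraction of 0x00010001.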

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]  // num
    mov ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld edx, eax, 16  // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif