/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

// NOTE: in the pshufb masks below a value of 128 has bit 7 set, which
// causes pshufb to write a zero byte to that destination position.

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >> 2 in the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

// Gather every ~8/3rd byte of 16 source bytes into output bytes 0..5.
static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Same gather, but placed into output bytes 6..11.
static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3 (16.16 reciprocals used by pmulhuw)
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2 (16.16 reciprocals used by pmulhuw)
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// Reads 32 pixels, throws half away and writes 16 pixels.
// Point-samples 32 source pixels to 16: keeps the odd pixel of each pair.
// src_stride is unused (single-row kernel); dst_width must be a multiple
// of 16 per loop iteration semantics (loop handles 16 outputs at a time).
__declspec(naked)
void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8                // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1             // repack 16 words to 16 bytes
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1: each output is the rounded average of a
// horizontal pair. src_stride is unused.
__declspec(naked)
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width

    pcmpeqb    xmm4, xmm4             // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5             // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm0, xmm4             // horizontal add pairs (x1 weights)
    pmaddubsw  xmm1, xmm4
    pavgw      xmm0, xmm5             // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Box filter: averages each 2x2 block of two source rows into one output
// pixel. esi holds src_stride so row1 = src_ptr + src_stride.
__declspec(naked)
void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width

    pcmpeqb    xmm4, xmm4             // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5             // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]      // second row
    movdqu     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm0, xmm4             // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2             // vertical add
    paddw      xmm1, xmm3
    psrlw      xmm0, 1                // (sum + 2) / 4 done as avg(sum >> 1, 0)
    psrlw      xmm1, 1
    pavgw      xmm0, xmm5             // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpsrlw     ymm0, ymm0, 8          // isolate odd pixels.
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1       // packs within 128-bit lanes...
    vpermq     ymm0, ymm0, 0xd8       // ...so unmutate vpackuswb
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
// AVX2 horizontal-pair average: 64 source pixels -> 32 output pixels.
// src_stride is unused (single-row kernel).
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width

    vpcmpeqb   ymm4, ymm4, ymm4       // '1' constant, 8b
    vpsrlw     ymm4, ymm4, 15
    vpackuswb  ymm4, ymm4, ymm4
    vpxor      ymm5, ymm5, ymm5       // constant 0

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4       // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpavgw     ymm0, ymm0, ymm5       // (x + 1) / 2
    vpavgw     ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4
// becomes average((sum >> 1), 0)
// Blends 64x2 rectangle to 32x1.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width

    vpcmpeqb   ymm4, ymm4, ymm4       // '1' constant, 8b
    vpsrlw     ymm4, ymm4, 15
    vpackuswb  ymm4, ymm4, ymm4
    vpxor      ymm5, ymm5, ymm5       // constant 0

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + esi]      // second row
    vmovdqu    ymm3, [eax + esi + 32]
    lea        eax, [eax + 64]
    vpmaddubsw ymm0, ymm0, ymm4       // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw     ymm0, ymm0, ymm2       // vertical add
    vpaddw     ymm1, ymm1, ymm3
    vpsrlw     ymm0, ymm0, 1          // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw     ymm1, ymm1, 1
    vpavgw     ymm0, ymm0, ymm5       // (x + 1) / 2
    vpavgw     ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels: keeps one pixel of every 4
// (byte 2 of each dword, selected by the 0x00ff0000 mask).
__declspec(naked)
void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width
    pcmpeqb    xmm5, xmm5             // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1: each output is the rounded average of a
// 4x4 box spanning four source rows (esi = stride, edi = stride * 3).
__declspec(naked)
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]     // src_ptr
    mov        esi, [esp + 8 + 8]     // src_stride
    mov        edx, [esp + 8 + 12]    // dst_ptr
    mov        ecx, [esp + 8 + 16]    // dst_width
    lea        edi, [esi + esi * 2]   // src_stride * 3
    pcmpeqb    xmm4, xmm4             // constant 0x0101
    psrlw      xmm4, 15
    movdqa     xmm5, xmm4
    packuswb   xmm4, xmm4
    psllw      xmm5, 3                // constant 0x0008

  wloop:
    movdqu     xmm0, [eax]            // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pmaddubsw  xmm0, xmm4             // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2             // vertical add rows 0, 1
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2             // add row 2
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + edi]
    movdqu     xmm3, [eax + edi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2             // add row 3
    paddw      xmm1, xmm3
    phaddw     xmm0, xmm1             // final horizontal pair add
    paddw      xmm0, xmm5             // + 8 for round
    psrlw      xmm0, 4                // /16 for average of 4 * 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width
    vpcmpeqb   ymm5, ymm5, ymm5       // generate mask 0x00ff0000
    vpsrld     ymm5, ymm5, 24
    vpslld     ymm5, ymm5, 16

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpand      ymm0, ymm0, ymm5
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vpsrlw     ymm0, ymm0, 8
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vmovdqu    [edx], xmm0            // only low 16 bytes are valid
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
// AVX2 4x4 box filter: 64 source columns over four rows -> 16 outputs.
// esi = src_stride, edi = src_stride * 3.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]     // src_ptr
    mov        esi, [esp + 8 + 8]     // src_stride
    mov        edx, [esp + 8 + 12]    // dst_ptr
    mov        ecx, [esp + 8 + 16]    // dst_width
    lea        edi, [esi + esi * 2]   // src_stride * 3
    vpcmpeqb   ymm4, ymm4, ymm4       // constant 0x0101
    vpsrlw     ymm4, ymm4, 15
    vpsllw     ymm5, ymm4, 3          // constant 0x0008
    vpackuswb  ymm4, ymm4, ymm4

  wloop:
    vmovdqu    ymm0, [eax]            // average rows
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + esi]
    vmovdqu    ymm3, [eax + esi + 32]
    vpmaddubsw ymm0, ymm0, ymm4       // horizontal add
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw     ymm0, ymm0, ymm2       // vertical add rows 0, 1
    vpaddw     ymm1, ymm1, ymm3
    vmovdqu    ymm2, [eax + esi * 2]
    vmovdqu    ymm3, [eax + esi * 2 + 32]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw     ymm0, ymm0, ymm2       // add row 2
    vpaddw     ymm1, ymm1, ymm3
    vmovdqu    ymm2, [eax + edi]
    vmovdqu    ymm3, [eax + edi + 32]
    lea        eax, [eax + 64]
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    vpaddw     ymm0, ymm0, ymm2       // add row 3
    vpaddw     ymm1, ymm1, ymm3
    vphaddw    ymm0, ymm0, ymm1       // mutates
    vpermq     ymm0, ymm0, 0xd8       // unmutate vphaddw
    vpaddw     ymm0, ymm0, ymm5       // + 8 for round
    vpsrlw     ymm0, ymm0, 4          // /16 for average of 4 * 4
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8       // unmutate vpackuswb
    vmovdqu    [edx], xmm0            // only low 16 bytes are valid
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// 3/4 point sampler: drops every 4th pixel via three pshufb tables
// (kShuf0/1/2), producing 24 output bytes per 32 input bytes.
// src_stride is unused.
__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width
    movdqa     xmm3, xmmword ptr kShuf0
    movdqa     xmm4, xmmword ptr kShuf1
    movdqa     xmm5, xmmword ptr kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8          // xmm1 = source bytes 8..23
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// 3/4 box filter, rows weighted 1:1 (pavgb of the two rows), then each
// output = weighted average of 1-2 source columns via pmaddubsw with the
// kMadd tables, rounded (+2) and divided by 4.
__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]            // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1             // average the two rows
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7             // + 2 for round
    psrlw      xmm0, 2                // / 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]        // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]       // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21  // no free register; load each pass
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
// 3/4 box filter, rows weighted 3:1. The double pavgb sequence
// (avg(row1, row0) then avg(row0, that)) approximates (3*row0 + row1) / 4.
__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]            // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0             // row weighting ~3:1
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7             // + 2 for round
    psrlw      xmm0, 2                // / 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]        // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]       // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21  // no free register; load each pass
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx+24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_ptr
    mov        ecx, [esp + 16]        // dst_width
    movdqa     xmm4, xmmword ptr kShuf38a
    movdqa     xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu     xmm0, [eax]            // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]       // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1             // merge: masks make the halves disjoint

    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width
    movdqa     xmm2, xmmword ptr kShufAc
    movdqa     xmm3, xmmword ptr kShufAc3
    movdqa     xmm4, xmmword ptr kScaleAc33
    pxor       xmm5, xmm5             // constant 0 for unpacking

  xloop:
    movdqu     xmm0, [eax]            // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5             // widen bytes to words
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0             // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1             // 8 pixels -> 3,4,5 of xmm6 (via xmm7)
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4             // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6            // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_ptr
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_ptr
    mov        ecx, [esp + 4 + 16]    // dst_width
    movdqa     xmm2, xmmword ptr kShufAb0
    movdqa     xmm3, xmmword ptr kShufAb1
    movdqa     xmm4, xmmword ptr kShufAb2
    movdqa     xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu     xmm0, [eax]            // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0             // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5             // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1            // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked)
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
    mov        edx, [esp + 8]         // dst_ptr
    mov        ecx, [esp + 12]        // src_width
    pxor       xmm5, xmm5             // constant 0 for unpacking

    // sum rows
  xloop:
    movdqu     xmm3, [eax]            // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]            // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5             // widen bytes to words
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2             // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0            // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
// AVX2 row accumulator: adds 32 source bytes into 32 uint16 accumulators.
__declspec(naked)
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov        eax, [esp + 4]         // src_ptr
    mov        edx, [esp + 8]         // dst_ptr
    mov        ecx, [esp + 12]        // src_width
    vpxor      ymm5, ymm5, ymm5       // constant 0 for unpacking

    // sum rows
  xloop:
    vmovdqu    ymm3, [eax]            // read 32 bytes
    lea        eax, [eax + 32]
    vpermq     ymm3, ymm3, 0xd8       // unmutate for vpunpck
    vpunpcklbw ymm2, ymm3, ymm5
    vpunpckhbw ymm3, ymm3, ymm5
    vpaddusw   ymm0, ymm2, [edx]      // sum 16 words
    vpaddusw   ymm1, ymm3, [edx + 32]
    vmovdqu    [edx], ymm0            // write 32 words to destination
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    sub        ecx, 32
    jg         xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 =
  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 =
  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };

// Bilinear column filtering. SSSE3 version.
// x/dx are 16.16 fixed point; xmm2 tracks two x positions at once and
// pextrw pulls out the integer pixel indices while the low 7 fraction
// bits drive the pmaddubsw blend weights.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]    // dst_ptr
    mov        esi, [esp + 12 + 8]    // src_ptr
    mov        ecx, [esp + 12 + 12]   // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000        // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6             // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pcmpeqb    xmm7, xmm7             // generate 0x0001
    psrlw      xmm7, 15
    pextrw     eax, xmm2, 1           // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2             // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0             // x0 x1
    punpckldq  xmm3, xmm3             // dx dx
    paddd      xmm3, xmm3             // dx * 2, dx * 2
    pextrw     edx, xmm2, 3           // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2             // x0, x1 fractions.
    paddd      xmm2, xmm3             // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9                // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5             // 0011
    punpcklwd  xmm0, xmm4
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm1, xmm6             // 0..7f and 7f..0
    paddusb    xmm1, xmm7             // +1 so 0..7f and 80..1
    pmaddubsw  xmm1, xmm0             // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    paddw      xmm1, xmmword ptr kFadd40   // make pixels unsigned and round.
    psrlw      xmm1, 7                // 8.7 fixed point to low 8 bits.
    packuswb   xmm1, xmm1             // 8 bits, 2 pixels.
    movd       ebx, xmm1
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2                 // 2 pixels
    jge        xloop2

  xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9                // 7 bit fractions.
    pshufb     xmm2, xmm5             // 0011
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm2, xmm6             // 0..7f and 7f..0
    paddusb    xmm2, xmm7             // +1 so 0..7f and 80..1
    pmaddubsw  xmm2, xmm0             // 16 bit
    paddw      xmm2, xmmword ptr kFadd40   // make pixels unsigned and round.
    psrlw      xmm2, 7                // 8.7 fixed point to low 8 bits.
    packuswb   xmm2, xmm2             // 8 bits
    movd       ebx, xmm2
    mov        [edi], bl

  xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Reads 16 pixels, duplicates them and writes 32 pixels.
// 2x horizontal upscale: duplicates each source byte (x/dx ignored —
// fixed 2:1 ratio).
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]         // dst_ptr
    mov        eax, [esp + 8]         // src_ptr
    mov        ecx, [esp + 12]        // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0             // double low 8 bytes
    punpckhbw  xmm1, xmm1             // double high 8 bytes
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
// NOTE(review): shufps 0xdd actually selects the ODD dwords of each pair;
// the "even pixels (0, 2, 4, 6)" wording describes which source pairs
// survive — confirm against callers before renaming.
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_argb
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_argb
    mov        ecx, [esp + 16]        // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    shufps     xmm0, xmm1, 0xdd       // select one ARGB dword of each pair
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1: averages adjacent ARGB pixel pairs.
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]         // src_argb
                                      // src_stride ignored
    mov        edx, [esp + 12]        // dst_argb
    mov        ecx, [esp + 16]        // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88       // even pixels
    shufps     xmm2, xmm1, 0xdd       // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// ARGB 2x2 box filter: averages rows first (pavgb), then adjacent column
// pairs, producing 4 output pixels per 8 input pixels.
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]     // src_argb
    mov        esi, [esp + 4 + 8]     // src_stride
    mov        edx, [esp + 4 + 12]    // dst_argb
    mov        ecx, [esp + 4 + 16]    // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2             // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0             // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88       // even pixels
    shufps     xmm2, xmm1, 0xdd       // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time, stepping src_stepx ARGB pixels between reads
// (ebx = byte step, edi = byte step * 3).
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]     // src_argb
                                      // src_stride ignored
    mov        ebx, [esp + 8 + 12]    // src_stepx
    mov        edx, [esp + 8 + 16]    // dst_argb
    mov        ecx, [esp + 8 + 20]    // dst_width
    lea        ebx, [ebx * 4]         // step in bytes (4 bytes per ARGB pixel)
    lea        edi, [ebx + ebx * 2]   // step * 3

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax, [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2             // gather 4 pixels into one register
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Strided ARGB 2x2 box filter: reads 4 pixel pairs from two rows
// (esi = row1 pointer), averages rows then columns into 4 output pixels.
// ebx = src_stepx in bytes, edi = that step * 3.
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]         // step in bytes
    lea        edi, [ebx + ebx * 2]   // step * 3

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax, [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi, [esi + ebx * 4]
    pavgb      xmm0, xmm2             // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0             // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88       // even pixels
    shufps     xmm2, xmm1, 0xdd       // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
// Point-sample ARGB columns: dst[i] = src[(x + i * dx) >> 16] with x and dx
// in 16.16 fixed point.  xmm2 tracks four current x values; xmm3 is dx * 4
// broadcast.  Handles a 4-pixel main loop plus 2- and 1-pixel remainders.
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    // Build xmm2 = { x, x + dx, x + dx*2, x + dx*3 } and broadcast dx*4.
    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0, dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0, dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    // Odd words of xmm2 hold the integer part of each 16.16 coordinate.
    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
  xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4                 // 4 pixels
    jge        xloop4

  xloop49:
    test       ecx, 2
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

  xloop29:
    test       ecx, 1
    je         xloop99

    // 1 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
  xloop99:

    pop        esi
    pop        edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,        // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u   // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear column filtering: each output pixel blends the two source pixels
// at floor(x) and floor(x) + 1 using 7-bit fractions via pmaddubsw.
__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, xmmword ptr kShuffleColARGB
    movdqa     xmm5, xmmword ptr kShuffleFractions
    pcmpeqb    xmm6, xmm6            // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1          // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2            // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0            // x0 x1
    punpckldq  xmm3, xmm3            // dx dx
    paddd      xmm3, xmm3            // dx * 2, dx * 2
    pextrw     edx, xmm2, 3          // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2            // x0, x1 fractions.
    paddd      xmm2, xmm3            // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9               // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5            // 0000000011111111
    pshufb     xmm0, xmm4            // arrange pixels into pairs
    pxor       xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1            // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1          // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3          // get x1 integer. next iteration.
    psrlw      xmm0, 7               // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0            // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2                // 2 pixels
    jge        xloop2

  xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9               // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5            // 00000000
    pshufb     xmm0, xmm4            // arrange pixels into pairs
    pxor       xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2            // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0            // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

  xloop99:

    pop        edi
    pop        esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Note: x and dx are ignored — only dst_argb, src_argb and dst_width are
// read (this path covers the exact 2x horizontal upscale).
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_argb
    mov        eax, [esp + 8]    // src_argb
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]       // read 4 ARGB pixels
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0        // duplicate low 2 pixels
    punpckhdq  xmm1, xmm1        // duplicate high 2 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked)
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]       // num
    cdq                             // extend num to 64 bits
    shld       edx, eax, 16         // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]  // (num << 16) / div; quotient in eax
    ret
  }
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
// Computes ((num << 16) - 0x00010001) / (div - 1); differs from
// FixedDiv_X86 by the -1 adjustment of both operands.
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]     // num
    mov        ecx, [esp + 8]     // denom
    cdq                           // extend num to 64 bits
    shld       edx, eax, 16       // 32.16
    shl        eax, 16
    sub        eax, 0x00010001    // 64-bit subtract: edx:eax -= 0x00010001
    sbb        edx, 0
    sub        ecx, 1             // denominator - 1
    idiv       ecx
    ret
  }
}
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif