1 /* 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include "libyuv/row.h" 12 13 #ifdef __cplusplus 14 namespace libyuv { 15 extern "C" { 16 #endif 17 18 // This module is for Visual C x86. 19 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) 20 21 // TODO(fbarchard): I420ToRGB24, I420ToRAW 22 #ifdef HAS_ARGBTOYROW_SSSE3 23 24 // Constants for ARGB. 25 static const vec8 kARGBToY = { 26 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 27 }; 28 29 static const vec8 kARGBToU = { 30 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 31 }; 32 33 static const vec8 kARGBToV = { 34 -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, 35 }; 36 37 // Constants for BGRA. 38 static const vec8 kBGRAToY = { 39 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 40 }; 41 42 static const vec8 kBGRAToU = { 43 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 44 }; 45 46 static const vec8 kBGRAToV = { 47 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 48 }; 49 50 // Constants for ABGR. 51 static const vec8 kABGRToY = { 52 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 53 }; 54 55 static const vec8 kABGRToU = { 56 -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 57 }; 58 59 static const vec8 kABGRToV = { 60 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 61 }; 62 63 // Constants for RGBA. 64 static const vec8 kRGBAToY = { 65 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 66 }; 67 68 static const vec8 kRGBAToU = { 69 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 70 }; 71 72 static const vec8 kRGBAToV = { 73 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 74 }; 75 76 static const uvec8 kAddY16 = { 77 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u 78 }; 79 80 static const uvec8 kAddUV128 = { 81 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 82 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u 83 }; 84 85 // Shuffle table for converting RGB24 to ARGB. 86 static const uvec8 kShuffleMaskRGB24ToARGB = { 87 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u 88 }; 89 90 // Shuffle table for converting RAW to ARGB. 91 static const uvec8 kShuffleMaskRAWToARGB = { 92 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u 93 }; 94 95 // Shuffle table for converting BGRA to ARGB. 96 static const uvec8 kShuffleMaskBGRAToARGB = { 97 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u 98 }; 99 100 // Shuffle table for converting ABGR to ARGB. 101 static const uvec8 kShuffleMaskABGRToARGB = { 102 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u 103 }; 104 105 // Shuffle table for converting RGBA to ARGB. 106 static const uvec8 kShuffleMaskRGBAToARGB = { 107 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u 108 }; 109 110 // Shuffle table for converting ARGB to RGBA. 
111 static const uvec8 kShuffleMaskARGBToRGBA = { 112 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u 113 }; 114 115 // Shuffle table for converting ARGB to RGB24. 116 static const uvec8 kShuffleMaskARGBToRGB24 = { 117 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u 118 }; 119 120 // Shuffle table for converting ARGB to RAW. 121 static const uvec8 kShuffleMaskARGBToRAW = { 122 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u 123 }; 124 125 __declspec(naked) __declspec(align(16)) 126 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { 127 __asm { 128 mov eax, [esp + 4] // src_y 129 mov edx, [esp + 8] // dst_argb 130 mov ecx, [esp + 12] // pix 131 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 132 pslld xmm5, 24 133 134 align 16 135 convertloop: 136 movq xmm0, qword ptr [eax] 137 lea eax, [eax + 8] 138 punpcklbw xmm0, xmm0 139 movdqa xmm1, xmm0 140 punpcklwd xmm0, xmm0 141 punpckhwd xmm1, xmm1 142 por xmm0, xmm5 143 por xmm1, xmm5 144 movdqa [edx], xmm0 145 movdqa [edx + 16], xmm1 146 lea edx, [edx + 32] 147 sub ecx, 8 148 jg convertloop 149 ret 150 } 151 } 152 153 __declspec(naked) __declspec(align(16)) 154 void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { 155 __asm { 156 mov eax, [esp + 4] // src_bgra 157 mov edx, [esp + 8] // dst_argb 158 mov ecx, [esp + 12] // pix 159 movdqa xmm5, kShuffleMaskBGRAToARGB 160 sub edx, eax 161 162 align 16 163 convertloop: 164 movdqa xmm0, [eax] 165 pshufb xmm0, xmm5 166 sub ecx, 4 167 movdqa [eax + edx], xmm0 168 lea eax, [eax + 16] 169 jg convertloop 170 ret 171 } 172 } 173 174 __declspec(naked) __declspec(align(16)) 175 void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { 176 __asm { 177 mov eax, [esp + 4] // src_abgr 178 mov edx, [esp + 8] // dst_argb 179 mov ecx, [esp + 12] // pix 180 movdqa xmm5, kShuffleMaskABGRToARGB 181 sub edx, eax 182 183 align 16 184 convertloop: 185 movdqa xmm0, [eax] 186 pshufb xmm0, xmm5 187 sub ecx, 4 188 movdqa [eax + edx], xmm0 189 lea eax, [eax + 16] 190 jg convertloop 191 ret 192 } 193 } 194 195 __declspec(naked) __declspec(align(16)) 196 void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) { 197 __asm { 198 mov eax, [esp + 4] // src_rgba 199 mov edx, [esp + 8] // dst_argb 200 mov ecx, [esp + 12] // pix 201 movdqa xmm5, kShuffleMaskRGBAToARGB 202 sub edx, eax 203 204 align 16 205 convertloop: 206 movdqa xmm0, [eax] 207 pshufb xmm0, xmm5 208 sub ecx, 4 209 movdqa [eax + edx], xmm0 210 lea eax, [eax + 16] 211 jg convertloop 212 ret 213 } 214 } 215 216 __declspec(naked) __declspec(align(16)) 217 void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) { 218 __asm { 219 mov eax, [esp + 4] // src_argb 220 mov edx, [esp + 8] // dst_rgba 221 mov ecx, [esp + 12] // pix 222 movdqa xmm5, kShuffleMaskARGBToRGBA 223 sub edx, eax 224 225 align 16 226 convertloop: 227 movdqa xmm0, [eax] 228 pshufb xmm0, xmm5 229 sub ecx, 4 230 movdqa [eax + edx], xmm0 231 lea eax, [eax + 16] 232 jg convertloop 233 ret 234 } 235 } 236 237 __declspec(naked) __declspec(align(16)) 238 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { 239 __asm { 240 mov eax, [esp + 4] // src_rgb24 241 mov edx, [esp + 8] // dst_argb 242 mov ecx, [esp + 12] // pix 243 pcmpeqb xmm5, xmm5 // generate mask 0xff000000 244 pslld xmm5, 24 245 movdqa xmm4, kShuffleMaskRGB24ToARGB 246 247 align 16 248 convertloop: 249 movdqu xmm0, [eax] 250 movdqu xmm1, [eax + 16] 251 movdqu 
xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3]  xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7]  xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqa    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm5
    sub       ecx, 16
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, kShuffleMaskRAWToARGB

    align 16
 convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3]  xmm1[8:15]}
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7]  xmm0[12:15]}
    pshufb    xmm0, xmm4
    movdqa    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqa    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb    xmm3, xmm4
    movdqa    [edx + 16], xmm1
    por       xmm3, xmm5
    sub       ecx, 16
    movdqa    [edx + 48], xmm3
    lea       edx, [edx + 64]
    jg        convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
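// Illustrative worked example of the pmul replication (a sketch, not part of
// the conversion itself): with the 5-bit value already shifted to the top of
// a 16-bit lane, pmulhuw by 0x0108 (256 + 8) returns the high 16 bits of the
// product, which equals (v << 3) | (v >> 2), i.e. the 5-bit value replicated
// to 8 bits.  For example v = 0x1F gives lane 0xF800; 0xF800 * 0x0108 =
// 0x00FFC000, whose high 16 bits are 0x00FF, so 0x1F expands to 0xFF.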
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_rgb565
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align 16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3    // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    pand      xmm0, xmm4    // G in middle 6 bits
    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
    por       xmm0, xmm7    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa    [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 24 instructions
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]   // src_argb1555
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    sub       edx, eax
    sub       edx, eax

    align 16
 convertloop:
    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1       // R in upper 5 bits
    psllw     xmm2, 11      // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5    // * (256 + 8)
    pmulhuw   xmm1, xmm5    // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2    // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4    // G in middle 5 bits
    psraw     xmm2, 8       // A
    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2    // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa    [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
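// Note (illustrative sketch): the 4444 conversion below avoids pmul.  A 4-bit
// nibble n is widened to 8 bits by (n << 4) | n, i.e. n * 17, which maps
// 0x0..0xF exactly onto 0x00..0xFF (for example 0xF * 17 = 0xFF).  The
// psllw/psrlw pair plus por implements this for the low and high nibbles of
// each byte.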
428 __declspec(naked) __declspec(align(16)) 429 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, 430 int pix) { 431 __asm { 432 mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f 433 movd xmm4, eax 434 pshufd xmm4, xmm4, 0 435 movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles 436 pslld xmm5, 4 437 mov eax, [esp + 4] // src_argb4444 438 mov edx, [esp + 8] // dst_argb 439 mov ecx, [esp + 12] // pix 440 sub edx, eax 441 sub edx, eax 442 443 align 16 444 convertloop: 445 movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 446 movdqa xmm2, xmm0 447 pand xmm0, xmm4 // mask low nibbles 448 pand xmm2, xmm5 // mask high nibbles 449 movdqa xmm1, xmm0 450 movdqa xmm3, xmm2 451 psllw xmm1, 4 452 psrlw xmm3, 4 453 por xmm0, xmm1 454 por xmm2, xmm3 455 movdqa xmm1, xmm0 456 punpcklbw xmm0, xmm2 457 punpckhbw xmm1, xmm2 458 movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB 459 movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB 460 lea eax, [eax + 16] 461 sub ecx, 8 462 jg convertloop 463 ret 464 } 465 } 466 467 __declspec(naked) __declspec(align(16)) 468 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { 469 __asm { 470 mov eax, [esp + 4] // src_argb 471 mov edx, [esp + 8] // dst_rgb 472 mov ecx, [esp + 12] // pix 473 movdqa xmm6, kShuffleMaskARGBToRGB24 474 475 align 16 476 convertloop: 477 movdqa xmm0, [eax] // fetch 16 pixels of argb 478 movdqa xmm1, [eax + 16] 479 movdqa xmm2, [eax + 32] 480 movdqa xmm3, [eax + 48] 481 lea eax, [eax + 64] 482 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB 483 pshufb xmm1, xmm6 484 pshufb xmm2, xmm6 485 pshufb xmm3, xmm6 486 movdqa xmm4, xmm1 // 4 bytes from 1 for 0 487 psrldq xmm1, 4 // 8 bytes from 1 488 pslldq xmm4, 12 // 4 bytes from 1 for 0 489 movdqa xmm5, xmm2 // 8 bytes from 2 for 1 490 por xmm0, xmm4 // 4 bytes from 1 for 0 491 pslldq xmm5, 8 // 8 bytes from 2 for 1 492 movdqa [edx], xmm0 // store 0 493 por xmm1, xmm5 // 8 bytes from 2 for 1 494 psrldq xmm2, 8 // 4 bytes from 2 495 pslldq xmm3, 4 // 12 bytes from 3 for 2 496 por xmm2, xmm3 // 12 bytes from 3 for 2 497 movdqa [edx + 16], xmm1 // store 1 498 movdqa [edx + 32], xmm2 // store 2 499 lea edx, [edx + 48] 500 sub ecx, 16 501 jg convertloop 502 ret 503 } 504 } 505 506 __declspec(naked) __declspec(align(16)) 507 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { 508 __asm { 509 mov eax, [esp + 4] // src_argb 510 mov edx, [esp + 8] // dst_rgb 511 mov ecx, [esp + 12] // pix 512 movdqa xmm6, kShuffleMaskARGBToRAW 513 514 align 16 515 convertloop: 516 movdqa xmm0, [eax] // fetch 16 pixels of argb 517 movdqa xmm1, [eax + 16] 518 movdqa xmm2, [eax + 32] 519 movdqa xmm3, [eax + 48] 520 lea eax, [eax + 64] 521 pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB 522 pshufb xmm1, xmm6 523 pshufb xmm2, xmm6 524 pshufb xmm3, xmm6 525 movdqa xmm4, xmm1 // 4 bytes from 1 for 0 526 psrldq xmm1, 4 // 8 bytes from 1 527 pslldq xmm4, 12 // 4 bytes from 1 for 0 528 movdqa xmm5, xmm2 // 8 bytes from 2 for 1 529 por xmm0, xmm4 // 4 bytes from 1 for 0 530 pslldq xmm5, 8 // 8 bytes from 2 for 1 531 movdqa [edx], xmm0 // store 0 532 por xmm1, xmm5 // 8 bytes from 2 for 1 533 psrldq xmm2, 8 // 4 bytes from 2 534 pslldq xmm3, 4 // 12 bytes from 3 for 2 535 por xmm2, xmm3 // 12 bytes from 3 for 2 536 movdqa [edx + 16], xmm1 // store 1 537 movdqa [edx + 32], xmm2 // store 2 538 lea edx, [edx + 48] 539 sub ecx, 16 540 jg convertloop 541 ret 542 } 543 } 544 545 __declspec(naked) __declspec(align(16)) 546 void 
ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
    pslld     xmm5, 11

    align 16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    pslld     xmm0, 8       // R
    psrld     xmm1, 3       // B
    psrld     xmm2, 5       // G
    psrad     xmm0, 16      // R
    pand      xmm1, xmm3    // B
    pand      xmm2, xmm4    // G
    pand      xmm0, xmm5    // R
    por       xmm1, xmm2    // BG
    por       xmm0, xmm1    // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4       // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4       // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
    pslld     xmm7, 15

    align 16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0    // B
    movdqa    xmm2, xmm0    // G
    movdqa    xmm3, xmm0    // R
    psrad     xmm0, 16      // A
    psrld     xmm1, 3       // B
    psrld     xmm2, 6       // G
    psrld     xmm3, 9       // R
    pand      xmm0, xmm7    // A
    pand      xmm1, xmm4    // B
    pand      xmm2, xmm5    // G
    pand      xmm3, xmm6    // R
    por       xmm0, xmm1    // BA
    por       xmm2, xmm3    // GR
    por       xmm0, xmm2    // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov       eax, [esp + 4]   // src_argb
    mov       edx, [esp + 8]   // dst_rgb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
    psrlw     xmm3, 8

    align 16
 convertloop:
    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3    // low nibble
    pand      xmm1, xmm4    // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
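// Rough scalar sketch of what the Y rows below compute, using the kARGBToY
// and kAddY16 constants above (ARGB is stored as bytes B, G, R, A in memory):
//   Y = ((13 * B + 65 * G + 33 * R) >> 7) + 16
// pmaddubsw/phaddw form the weighted sum per pixel, psrlw 7 drops the 7-bit
// fixed-point scale, and paddb applies the +16 offset.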
657 __declspec(naked) __declspec(align(16)) 658 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 659 __asm { 660 mov eax, [esp + 4] /* src_argb */ 661 mov edx, [esp + 8] /* dst_y */ 662 mov ecx, [esp + 12] /* pix */ 663 movdqa xmm5, kAddY16 664 movdqa xmm4, kARGBToY 665 666 align 16 667 convertloop: 668 movdqa xmm0, [eax] 669 movdqa xmm1, [eax + 16] 670 movdqa xmm2, [eax + 32] 671 movdqa xmm3, [eax + 48] 672 pmaddubsw xmm0, xmm4 673 pmaddubsw xmm1, xmm4 674 pmaddubsw xmm2, xmm4 675 pmaddubsw xmm3, xmm4 676 lea eax, [eax + 64] 677 phaddw xmm0, xmm1 678 phaddw xmm2, xmm3 679 psrlw xmm0, 7 680 psrlw xmm2, 7 681 packuswb xmm0, xmm2 682 paddb xmm0, xmm5 683 sub ecx, 16 684 movdqa [edx], xmm0 685 lea edx, [edx + 16] 686 jg convertloop 687 ret 688 } 689 } 690 691 __declspec(naked) __declspec(align(16)) 692 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 693 __asm { 694 mov eax, [esp + 4] /* src_argb */ 695 mov edx, [esp + 8] /* dst_y */ 696 mov ecx, [esp + 12] /* pix */ 697 movdqa xmm5, kAddY16 698 movdqa xmm4, kARGBToY 699 700 align 16 701 convertloop: 702 movdqu xmm0, [eax] 703 movdqu xmm1, [eax + 16] 704 movdqu xmm2, [eax + 32] 705 movdqu xmm3, [eax + 48] 706 pmaddubsw xmm0, xmm4 707 pmaddubsw xmm1, xmm4 708 pmaddubsw xmm2, xmm4 709 pmaddubsw xmm3, xmm4 710 lea eax, [eax + 64] 711 phaddw xmm0, xmm1 712 phaddw xmm2, xmm3 713 psrlw xmm0, 7 714 psrlw xmm2, 7 715 packuswb xmm0, xmm2 716 paddb xmm0, xmm5 717 sub ecx, 16 718 movdqu [edx], xmm0 719 lea edx, [edx + 16] 720 jg convertloop 721 ret 722 } 723 } 724 725 __declspec(naked) __declspec(align(16)) 726 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 727 __asm { 728 mov eax, [esp + 4] /* src_argb */ 729 mov edx, [esp + 8] /* dst_y */ 730 mov ecx, [esp + 12] /* pix */ 731 movdqa xmm5, kAddY16 732 movdqa xmm4, kBGRAToY 733 734 align 16 735 convertloop: 736 movdqa xmm0, [eax] 737 movdqa xmm1, [eax + 16] 738 movdqa xmm2, [eax + 32] 739 movdqa xmm3, [eax + 48] 740 pmaddubsw xmm0, xmm4 741 pmaddubsw xmm1, xmm4 742 pmaddubsw xmm2, xmm4 743 pmaddubsw xmm3, xmm4 744 lea eax, [eax + 64] 745 phaddw xmm0, xmm1 746 phaddw xmm2, xmm3 747 psrlw xmm0, 7 748 psrlw xmm2, 7 749 packuswb xmm0, xmm2 750 paddb xmm0, xmm5 751 sub ecx, 16 752 movdqa [edx], xmm0 753 lea edx, [edx + 16] 754 jg convertloop 755 ret 756 } 757 } 758 759 __declspec(naked) __declspec(align(16)) 760 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 761 __asm { 762 mov eax, [esp + 4] /* src_argb */ 763 mov edx, [esp + 8] /* dst_y */ 764 mov ecx, [esp + 12] /* pix */ 765 movdqa xmm5, kAddY16 766 movdqa xmm4, kBGRAToY 767 768 align 16 769 convertloop: 770 movdqu xmm0, [eax] 771 movdqu xmm1, [eax + 16] 772 movdqu xmm2, [eax + 32] 773 movdqu xmm3, [eax + 48] 774 pmaddubsw xmm0, xmm4 775 pmaddubsw xmm1, xmm4 776 pmaddubsw xmm2, xmm4 777 pmaddubsw xmm3, xmm4 778 lea eax, [eax + 64] 779 phaddw xmm0, xmm1 780 phaddw xmm2, xmm3 781 psrlw xmm0, 7 782 psrlw xmm2, 7 783 packuswb xmm0, xmm2 784 paddb xmm0, xmm5 785 sub ecx, 16 786 movdqu [edx], xmm0 787 lea edx, [edx + 16] 788 jg convertloop 789 ret 790 } 791 } 792 793 __declspec(naked) __declspec(align(16)) 794 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 795 __asm { 796 mov eax, [esp + 4] /* src_argb */ 797 mov edx, [esp + 8] /* dst_y */ 798 mov ecx, [esp + 12] /* pix */ 799 movdqa xmm5, kAddY16 800 movdqa xmm4, kABGRToY 801 802 align 16 803 convertloop: 804 movdqa xmm0, [eax] 805 movdqa xmm1, [eax + 16] 806 movdqa xmm2, [eax + 
32] 807 movdqa xmm3, [eax + 48] 808 pmaddubsw xmm0, xmm4 809 pmaddubsw xmm1, xmm4 810 pmaddubsw xmm2, xmm4 811 pmaddubsw xmm3, xmm4 812 lea eax, [eax + 64] 813 phaddw xmm0, xmm1 814 phaddw xmm2, xmm3 815 psrlw xmm0, 7 816 psrlw xmm2, 7 817 packuswb xmm0, xmm2 818 paddb xmm0, xmm5 819 sub ecx, 16 820 movdqa [edx], xmm0 821 lea edx, [edx + 16] 822 jg convertloop 823 ret 824 } 825 } 826 827 __declspec(naked) __declspec(align(16)) 828 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 829 __asm { 830 mov eax, [esp + 4] /* src_argb */ 831 mov edx, [esp + 8] /* dst_y */ 832 mov ecx, [esp + 12] /* pix */ 833 movdqa xmm5, kAddY16 834 movdqa xmm4, kABGRToY 835 836 align 16 837 convertloop: 838 movdqu xmm0, [eax] 839 movdqu xmm1, [eax + 16] 840 movdqu xmm2, [eax + 32] 841 movdqu xmm3, [eax + 48] 842 pmaddubsw xmm0, xmm4 843 pmaddubsw xmm1, xmm4 844 pmaddubsw xmm2, xmm4 845 pmaddubsw xmm3, xmm4 846 lea eax, [eax + 64] 847 phaddw xmm0, xmm1 848 phaddw xmm2, xmm3 849 psrlw xmm0, 7 850 psrlw xmm2, 7 851 packuswb xmm0, xmm2 852 paddb xmm0, xmm5 853 sub ecx, 16 854 movdqu [edx], xmm0 855 lea edx, [edx + 16] 856 jg convertloop 857 ret 858 } 859 } 860 861 __declspec(naked) __declspec(align(16)) 862 void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 863 __asm { 864 mov eax, [esp + 4] /* src_argb */ 865 mov edx, [esp + 8] /* dst_y */ 866 mov ecx, [esp + 12] /* pix */ 867 movdqa xmm5, kAddY16 868 movdqa xmm4, kRGBAToY 869 870 align 16 871 convertloop: 872 movdqa xmm0, [eax] 873 movdqa xmm1, [eax + 16] 874 movdqa xmm2, [eax + 32] 875 movdqa xmm3, [eax + 48] 876 pmaddubsw xmm0, xmm4 877 pmaddubsw xmm1, xmm4 878 pmaddubsw xmm2, xmm4 879 pmaddubsw xmm3, xmm4 880 lea eax, [eax + 64] 881 phaddw xmm0, xmm1 882 phaddw xmm2, xmm3 883 psrlw xmm0, 7 884 psrlw xmm2, 7 885 packuswb xmm0, xmm2 886 paddb xmm0, xmm5 887 sub ecx, 16 888 movdqa [edx], xmm0 889 lea edx, [edx + 16] 890 jg convertloop 891 ret 892 } 893 } 894 895 __declspec(naked) __declspec(align(16)) 896 void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 897 __asm { 898 mov eax, [esp + 4] /* src_argb */ 899 mov edx, [esp + 8] /* dst_y */ 900 mov ecx, [esp + 12] /* pix */ 901 movdqa xmm5, kAddY16 902 movdqa xmm4, kRGBAToY 903 904 align 16 905 convertloop: 906 movdqu xmm0, [eax] 907 movdqu xmm1, [eax + 16] 908 movdqu xmm2, [eax + 32] 909 movdqu xmm3, [eax + 48] 910 pmaddubsw xmm0, xmm4 911 pmaddubsw xmm1, xmm4 912 pmaddubsw xmm2, xmm4 913 pmaddubsw xmm3, xmm4 914 lea eax, [eax + 64] 915 phaddw xmm0, xmm1 916 phaddw xmm2, xmm3 917 psrlw xmm0, 7 918 psrlw xmm2, 7 919 packuswb xmm0, xmm2 920 paddb xmm0, xmm5 921 sub ecx, 16 922 movdqu [edx], xmm0 923 lea edx, [edx + 16] 924 jg convertloop 925 ret 926 } 927 } 928 929 __declspec(naked) __declspec(align(16)) 930 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 931 uint8* dst_u, uint8* dst_v, int width) { 932 __asm { 933 push esi 934 push edi 935 mov eax, [esp + 8 + 4] // src_argb 936 mov esi, [esp + 8 + 8] // src_stride_argb 937 mov edx, [esp + 8 + 12] // dst_u 938 mov edi, [esp + 8 + 16] // dst_v 939 mov ecx, [esp + 8 + 20] // pix 940 movdqa xmm7, kARGBToU 941 movdqa xmm6, kARGBToV 942 movdqa xmm5, kAddUV128 943 sub edi, edx // stride from u to v 944 945 align 16 946 convertloop: 947 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 948 movdqa xmm0, [eax] 949 movdqa xmm1, [eax + 16] 950 movdqa xmm2, [eax + 32] 951 movdqa xmm3, [eax + 48] 952 pavgb xmm0, [eax + esi] 953 pavgb xmm1, [eax + esi + 16] 954 pavgb xmm2, [eax + esi + 
32] 955 pavgb xmm3, [eax + esi + 48] 956 lea eax, [eax + 64] 957 movdqa xmm4, xmm0 958 shufps xmm0, xmm1, 0x88 959 shufps xmm4, xmm1, 0xdd 960 pavgb xmm0, xmm4 961 movdqa xmm4, xmm2 962 shufps xmm2, xmm3, 0x88 963 shufps xmm4, xmm3, 0xdd 964 pavgb xmm2, xmm4 965 966 // step 2 - convert to U and V 967 // from here down is very similar to Y code except 968 // instead of 16 different pixels, its 8 pixels of U and 8 of V 969 movdqa xmm1, xmm0 970 movdqa xmm3, xmm2 971 pmaddubsw xmm0, xmm7 // U 972 pmaddubsw xmm2, xmm7 973 pmaddubsw xmm1, xmm6 // V 974 pmaddubsw xmm3, xmm6 975 phaddw xmm0, xmm2 976 phaddw xmm1, xmm3 977 psraw xmm0, 8 978 psraw xmm1, 8 979 packsswb xmm0, xmm1 980 paddb xmm0, xmm5 // -> unsigned 981 982 // step 3 - store 8 U and 8 V values 983 sub ecx, 16 984 movlps qword ptr [edx], xmm0 // U 985 movhps qword ptr [edx + edi], xmm0 // V 986 lea edx, [edx + 8] 987 jg convertloop 988 989 pop edi 990 pop esi 991 ret 992 } 993 } 994 995 __declspec(naked) __declspec(align(16)) 996 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, 997 uint8* dst_u, uint8* dst_v, int width) { 998 __asm { 999 push esi 1000 push edi 1001 mov eax, [esp + 8 + 4] // src_argb 1002 mov esi, [esp + 8 + 8] // src_stride_argb 1003 mov edx, [esp + 8 + 12] // dst_u 1004 mov edi, [esp + 8 + 16] // dst_v 1005 mov ecx, [esp + 8 + 20] // pix 1006 movdqa xmm7, kARGBToU 1007 movdqa xmm6, kARGBToV 1008 movdqa xmm5, kAddUV128 1009 sub edi, edx // stride from u to v 1010 1011 align 16 1012 convertloop: 1013 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1014 movdqu xmm0, [eax] 1015 movdqu xmm1, [eax + 16] 1016 movdqu xmm2, [eax + 32] 1017 movdqu xmm3, [eax + 48] 1018 movdqu xmm4, [eax + esi] 1019 pavgb xmm0, xmm4 1020 movdqu xmm4, [eax + esi + 16] 1021 pavgb xmm1, xmm4 1022 movdqu xmm4, [eax + esi + 32] 1023 pavgb xmm2, xmm4 1024 movdqu xmm4, [eax + esi + 48] 1025 pavgb xmm3, xmm4 1026 lea eax, [eax + 64] 1027 movdqa xmm4, xmm0 1028 shufps xmm0, xmm1, 0x88 1029 shufps xmm4, xmm1, 0xdd 1030 pavgb xmm0, xmm4 1031 movdqa xmm4, xmm2 1032 shufps xmm2, xmm3, 0x88 1033 shufps xmm4, xmm3, 0xdd 1034 pavgb xmm2, xmm4 1035 1036 // step 2 - convert to U and V 1037 // from here down is very similar to Y code except 1038 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1039 movdqa xmm1, xmm0 1040 movdqa xmm3, xmm2 1041 pmaddubsw xmm0, xmm7 // U 1042 pmaddubsw xmm2, xmm7 1043 pmaddubsw xmm1, xmm6 // V 1044 pmaddubsw xmm3, xmm6 1045 phaddw xmm0, xmm2 1046 phaddw xmm1, xmm3 1047 psraw xmm0, 8 1048 psraw xmm1, 8 1049 packsswb xmm0, xmm1 1050 paddb xmm0, xmm5 // -> unsigned 1051 1052 // step 3 - store 8 U and 8 V values 1053 sub ecx, 16 1054 movlps qword ptr [edx], xmm0 // U 1055 movhps qword ptr [edx + edi], xmm0 // V 1056 lea edx, [edx + 8] 1057 jg convertloop 1058 1059 pop edi 1060 pop esi 1061 ret 1062 } 1063 } 1064 1065 __declspec(naked) __declspec(align(16)) 1066 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1067 uint8* dst_u, uint8* dst_v, int width) { 1068 __asm { 1069 push esi 1070 push edi 1071 mov eax, [esp + 8 + 4] // src_argb 1072 mov esi, [esp + 8 + 8] // src_stride_argb 1073 mov edx, [esp + 8 + 12] // dst_u 1074 mov edi, [esp + 8 + 16] // dst_v 1075 mov ecx, [esp + 8 + 20] // pix 1076 movdqa xmm7, kBGRAToU 1077 movdqa xmm6, kBGRAToV 1078 movdqa xmm5, kAddUV128 1079 sub edi, edx // stride from u to v 1080 1081 align 16 1082 convertloop: 1083 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1084 movdqa xmm0, [eax] 1085 movdqa xmm1, [eax + 16] 1086 movdqa xmm2, [eax 
+ 32] 1087 movdqa xmm3, [eax + 48] 1088 pavgb xmm0, [eax + esi] 1089 pavgb xmm1, [eax + esi + 16] 1090 pavgb xmm2, [eax + esi + 32] 1091 pavgb xmm3, [eax + esi + 48] 1092 lea eax, [eax + 64] 1093 movdqa xmm4, xmm0 1094 shufps xmm0, xmm1, 0x88 1095 shufps xmm4, xmm1, 0xdd 1096 pavgb xmm0, xmm4 1097 movdqa xmm4, xmm2 1098 shufps xmm2, xmm3, 0x88 1099 shufps xmm4, xmm3, 0xdd 1100 pavgb xmm2, xmm4 1101 1102 // step 2 - convert to U and V 1103 // from here down is very similar to Y code except 1104 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1105 movdqa xmm1, xmm0 1106 movdqa xmm3, xmm2 1107 pmaddubsw xmm0, xmm7 // U 1108 pmaddubsw xmm2, xmm7 1109 pmaddubsw xmm1, xmm6 // V 1110 pmaddubsw xmm3, xmm6 1111 phaddw xmm0, xmm2 1112 phaddw xmm1, xmm3 1113 psraw xmm0, 8 1114 psraw xmm1, 8 1115 packsswb xmm0, xmm1 1116 paddb xmm0, xmm5 // -> unsigned 1117 1118 // step 3 - store 8 U and 8 V values 1119 sub ecx, 16 1120 movlps qword ptr [edx], xmm0 // U 1121 movhps qword ptr [edx + edi], xmm0 // V 1122 lea edx, [edx + 8] 1123 jg convertloop 1124 1125 pop edi 1126 pop esi 1127 ret 1128 } 1129 } 1130 1131 __declspec(naked) __declspec(align(16)) 1132 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, 1133 uint8* dst_u, uint8* dst_v, int width) { 1134 __asm { 1135 push esi 1136 push edi 1137 mov eax, [esp + 8 + 4] // src_argb 1138 mov esi, [esp + 8 + 8] // src_stride_argb 1139 mov edx, [esp + 8 + 12] // dst_u 1140 mov edi, [esp + 8 + 16] // dst_v 1141 mov ecx, [esp + 8 + 20] // pix 1142 movdqa xmm7, kBGRAToU 1143 movdqa xmm6, kBGRAToV 1144 movdqa xmm5, kAddUV128 1145 sub edi, edx // stride from u to v 1146 1147 align 16 1148 convertloop: 1149 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1150 movdqu xmm0, [eax] 1151 movdqu xmm1, [eax + 16] 1152 movdqu xmm2, [eax + 32] 1153 movdqu xmm3, [eax + 48] 1154 movdqu xmm4, [eax + esi] 1155 pavgb xmm0, xmm4 1156 movdqu xmm4, [eax + esi + 16] 1157 pavgb xmm1, xmm4 1158 movdqu xmm4, [eax + esi + 32] 1159 pavgb xmm2, xmm4 1160 movdqu xmm4, [eax + esi + 48] 1161 pavgb xmm3, xmm4 1162 lea eax, [eax + 64] 1163 movdqa xmm4, xmm0 1164 shufps xmm0, xmm1, 0x88 1165 shufps xmm4, xmm1, 0xdd 1166 pavgb xmm0, xmm4 1167 movdqa xmm4, xmm2 1168 shufps xmm2, xmm3, 0x88 1169 shufps xmm4, xmm3, 0xdd 1170 pavgb xmm2, xmm4 1171 1172 // step 2 - convert to U and V 1173 // from here down is very similar to Y code except 1174 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1175 movdqa xmm1, xmm0 1176 movdqa xmm3, xmm2 1177 pmaddubsw xmm0, xmm7 // U 1178 pmaddubsw xmm2, xmm7 1179 pmaddubsw xmm1, xmm6 // V 1180 pmaddubsw xmm3, xmm6 1181 phaddw xmm0, xmm2 1182 phaddw xmm1, xmm3 1183 psraw xmm0, 8 1184 psraw xmm1, 8 1185 packsswb xmm0, xmm1 1186 paddb xmm0, xmm5 // -> unsigned 1187 1188 // step 3 - store 8 U and 8 V values 1189 sub ecx, 16 1190 movlps qword ptr [edx], xmm0 // U 1191 movhps qword ptr [edx + edi], xmm0 // V 1192 lea edx, [edx + 8] 1193 jg convertloop 1194 1195 pop edi 1196 pop esi 1197 ret 1198 } 1199 } 1200 1201 __declspec(naked) __declspec(align(16)) 1202 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1203 uint8* dst_u, uint8* dst_v, int width) { 1204 __asm { 1205 push esi 1206 push edi 1207 mov eax, [esp + 8 + 4] // src_argb 1208 mov esi, [esp + 8 + 8] // src_stride_argb 1209 mov edx, [esp + 8 + 12] // dst_u 1210 mov edi, [esp + 8 + 16] // dst_v 1211 mov ecx, [esp + 8 + 20] // pix 1212 movdqa xmm7, kABGRToU 1213 movdqa xmm6, kABGRToV 1214 movdqa xmm5, kAddUV128 1215 sub edi, edx // stride from 
u to v 1216 1217 align 16 1218 convertloop: 1219 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1220 movdqa xmm0, [eax] 1221 movdqa xmm1, [eax + 16] 1222 movdqa xmm2, [eax + 32] 1223 movdqa xmm3, [eax + 48] 1224 pavgb xmm0, [eax + esi] 1225 pavgb xmm1, [eax + esi + 16] 1226 pavgb xmm2, [eax + esi + 32] 1227 pavgb xmm3, [eax + esi + 48] 1228 lea eax, [eax + 64] 1229 movdqa xmm4, xmm0 1230 shufps xmm0, xmm1, 0x88 1231 shufps xmm4, xmm1, 0xdd 1232 pavgb xmm0, xmm4 1233 movdqa xmm4, xmm2 1234 shufps xmm2, xmm3, 0x88 1235 shufps xmm4, xmm3, 0xdd 1236 pavgb xmm2, xmm4 1237 1238 // step 2 - convert to U and V 1239 // from here down is very similar to Y code except 1240 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1241 movdqa xmm1, xmm0 1242 movdqa xmm3, xmm2 1243 pmaddubsw xmm0, xmm7 // U 1244 pmaddubsw xmm2, xmm7 1245 pmaddubsw xmm1, xmm6 // V 1246 pmaddubsw xmm3, xmm6 1247 phaddw xmm0, xmm2 1248 phaddw xmm1, xmm3 1249 psraw xmm0, 8 1250 psraw xmm1, 8 1251 packsswb xmm0, xmm1 1252 paddb xmm0, xmm5 // -> unsigned 1253 1254 // step 3 - store 8 U and 8 V values 1255 sub ecx, 16 1256 movlps qword ptr [edx], xmm0 // U 1257 movhps qword ptr [edx + edi], xmm0 // V 1258 lea edx, [edx + 8] 1259 jg convertloop 1260 1261 pop edi 1262 pop esi 1263 ret 1264 } 1265 } 1266 1267 __declspec(naked) __declspec(align(16)) 1268 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, 1269 uint8* dst_u, uint8* dst_v, int width) { 1270 __asm { 1271 push esi 1272 push edi 1273 mov eax, [esp + 8 + 4] // src_argb 1274 mov esi, [esp + 8 + 8] // src_stride_argb 1275 mov edx, [esp + 8 + 12] // dst_u 1276 mov edi, [esp + 8 + 16] // dst_v 1277 mov ecx, [esp + 8 + 20] // pix 1278 movdqa xmm7, kABGRToU 1279 movdqa xmm6, kABGRToV 1280 movdqa xmm5, kAddUV128 1281 sub edi, edx // stride from u to v 1282 1283 align 16 1284 convertloop: 1285 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1286 movdqu xmm0, [eax] 1287 movdqu xmm1, [eax + 16] 1288 movdqu xmm2, [eax + 32] 1289 movdqu xmm3, [eax + 48] 1290 movdqu xmm4, [eax + esi] 1291 pavgb xmm0, xmm4 1292 movdqu xmm4, [eax + esi + 16] 1293 pavgb xmm1, xmm4 1294 movdqu xmm4, [eax + esi + 32] 1295 pavgb xmm2, xmm4 1296 movdqu xmm4, [eax + esi + 48] 1297 pavgb xmm3, xmm4 1298 lea eax, [eax + 64] 1299 movdqa xmm4, xmm0 1300 shufps xmm0, xmm1, 0x88 1301 shufps xmm4, xmm1, 0xdd 1302 pavgb xmm0, xmm4 1303 movdqa xmm4, xmm2 1304 shufps xmm2, xmm3, 0x88 1305 shufps xmm4, xmm3, 0xdd 1306 pavgb xmm2, xmm4 1307 1308 // step 2 - convert to U and V 1309 // from here down is very similar to Y code except 1310 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1311 movdqa xmm1, xmm0 1312 movdqa xmm3, xmm2 1313 pmaddubsw xmm0, xmm7 // U 1314 pmaddubsw xmm2, xmm7 1315 pmaddubsw xmm1, xmm6 // V 1316 pmaddubsw xmm3, xmm6 1317 phaddw xmm0, xmm2 1318 phaddw xmm1, xmm3 1319 psraw xmm0, 8 1320 psraw xmm1, 8 1321 packsswb xmm0, xmm1 1322 paddb xmm0, xmm5 // -> unsigned 1323 1324 // step 3 - store 8 U and 8 V values 1325 sub ecx, 16 1326 movlps qword ptr [edx], xmm0 // U 1327 movhps qword ptr [edx + edi], xmm0 // V 1328 lea edx, [edx + 8] 1329 jg convertloop 1330 1331 pop edi 1332 pop esi 1333 ret 1334 } 1335 } 1336 1337 __declspec(naked) __declspec(align(16)) 1338 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1339 uint8* dst_u, uint8* dst_v, int width) { 1340 __asm { 1341 push esi 1342 push edi 1343 mov eax, [esp + 8 + 4] // src_argb 1344 mov esi, [esp + 8 + 8] // src_stride_argb 1345 mov edx, [esp + 8 + 12] // dst_u 1346 mov edi, 
[esp + 8 + 16] // dst_v 1347 mov ecx, [esp + 8 + 20] // pix 1348 movdqa xmm7, kRGBAToU 1349 movdqa xmm6, kRGBAToV 1350 movdqa xmm5, kAddUV128 1351 sub edi, edx // stride from u to v 1352 1353 align 16 1354 convertloop: 1355 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1356 movdqa xmm0, [eax] 1357 movdqa xmm1, [eax + 16] 1358 movdqa xmm2, [eax + 32] 1359 movdqa xmm3, [eax + 48] 1360 pavgb xmm0, [eax + esi] 1361 pavgb xmm1, [eax + esi + 16] 1362 pavgb xmm2, [eax + esi + 32] 1363 pavgb xmm3, [eax + esi + 48] 1364 lea eax, [eax + 64] 1365 movdqa xmm4, xmm0 1366 shufps xmm0, xmm1, 0x88 1367 shufps xmm4, xmm1, 0xdd 1368 pavgb xmm0, xmm4 1369 movdqa xmm4, xmm2 1370 shufps xmm2, xmm3, 0x88 1371 shufps xmm4, xmm3, 0xdd 1372 pavgb xmm2, xmm4 1373 1374 // step 2 - convert to U and V 1375 // from here down is very similar to Y code except 1376 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1377 movdqa xmm1, xmm0 1378 movdqa xmm3, xmm2 1379 pmaddubsw xmm0, xmm7 // U 1380 pmaddubsw xmm2, xmm7 1381 pmaddubsw xmm1, xmm6 // V 1382 pmaddubsw xmm3, xmm6 1383 phaddw xmm0, xmm2 1384 phaddw xmm1, xmm3 1385 psraw xmm0, 8 1386 psraw xmm1, 8 1387 packsswb xmm0, xmm1 1388 paddb xmm0, xmm5 // -> unsigned 1389 1390 // step 3 - store 8 U and 8 V values 1391 sub ecx, 16 1392 movlps qword ptr [edx], xmm0 // U 1393 movhps qword ptr [edx + edi], xmm0 // V 1394 lea edx, [edx + 8] 1395 jg convertloop 1396 1397 pop edi 1398 pop esi 1399 ret 1400 } 1401 } 1402 1403 __declspec(naked) __declspec(align(16)) 1404 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, 1405 uint8* dst_u, uint8* dst_v, int width) { 1406 __asm { 1407 push esi 1408 push edi 1409 mov eax, [esp + 8 + 4] // src_argb 1410 mov esi, [esp + 8 + 8] // src_stride_argb 1411 mov edx, [esp + 8 + 12] // dst_u 1412 mov edi, [esp + 8 + 16] // dst_v 1413 mov ecx, [esp + 8 + 20] // pix 1414 movdqa xmm7, kRGBAToU 1415 movdqa xmm6, kRGBAToV 1416 movdqa xmm5, kAddUV128 1417 sub edi, edx // stride from u to v 1418 1419 align 16 1420 convertloop: 1421 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1422 movdqu xmm0, [eax] 1423 movdqu xmm1, [eax + 16] 1424 movdqu xmm2, [eax + 32] 1425 movdqu xmm3, [eax + 48] 1426 movdqu xmm4, [eax + esi] 1427 pavgb xmm0, xmm4 1428 movdqu xmm4, [eax + esi + 16] 1429 pavgb xmm1, xmm4 1430 movdqu xmm4, [eax + esi + 32] 1431 pavgb xmm2, xmm4 1432 movdqu xmm4, [eax + esi + 48] 1433 pavgb xmm3, xmm4 1434 lea eax, [eax + 64] 1435 movdqa xmm4, xmm0 1436 shufps xmm0, xmm1, 0x88 1437 shufps xmm4, xmm1, 0xdd 1438 pavgb xmm0, xmm4 1439 movdqa xmm4, xmm2 1440 shufps xmm2, xmm3, 0x88 1441 shufps xmm4, xmm3, 0xdd 1442 pavgb xmm2, xmm4 1443 1444 // step 2 - convert to U and V 1445 // from here down is very similar to Y code except 1446 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1447 movdqa xmm1, xmm0 1448 movdqa xmm3, xmm2 1449 pmaddubsw xmm0, xmm7 // U 1450 pmaddubsw xmm2, xmm7 1451 pmaddubsw xmm1, xmm6 // V 1452 pmaddubsw xmm3, xmm6 1453 phaddw xmm0, xmm2 1454 phaddw xmm1, xmm3 1455 psraw xmm0, 8 1456 psraw xmm1, 8 1457 packsswb xmm0, xmm1 1458 paddb xmm0, xmm5 // -> unsigned 1459 1460 // step 3 - store 8 U and 8 V values 1461 sub ecx, 16 1462 movlps qword ptr [edx], xmm0 // U 1463 movhps qword ptr [edx + edi], xmm0 // V 1464 lea edx, [edx + 8] 1465 jg convertloop 1466 1467 pop edi 1468 pop esi 1469 ret 1470 } 1471 } 1472 #endif // HAS_ARGBTOYROW_SSSE3 1473 1474 #ifdef HAS_I422TOARGBROW_SSSE3 1475 1476 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */ 1477 1478 #define UB 127 /* 
min(127, static_cast<int8>(2.018 * 64)) */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

static const vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};

static const vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};

static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};

static const vec8 kVUToB = {
  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
};

static const vec8 kVUToR = {
  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
};

static const vec8 kVUToG = {
  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
};

static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };

// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.

// Read 8 UV from 444.
#define READYUV444 __asm {                                                    \
    __asm movq       xmm0, qword ptr [esi]        /* U */  /* NOLINT */       \
    __asm movq       xmm1, qword ptr [esi + edi]  /* V */  /* NOLINT */       \
    __asm lea        esi,  [esi + 8]                                          \
    __asm punpcklbw  xmm0, xmm1                   /* UV */                    \
  }

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 __asm {                                                    \
    __asm movd       xmm0, [esi]                  /* U */                     \
    __asm movd       xmm1, [esi + edi]            /* V */                     \
    __asm lea        esi,  [esi + 4]                                          \
    __asm punpcklbw  xmm0, xmm1                   /* UV */                    \
    __asm punpcklwd  xmm0, xmm0                   /* UVUV (upsample) */       \
  }

// Read 2 UV from 411, upsample to 8 UV.
#define READYUV411 __asm {                                                    \
    __asm movd       xmm0, [esi]                  /* U */                     \
    __asm movd       xmm1, [esi + edi]            /* V */                     \
    __asm lea        esi,  [esi + 2]                                          \
    __asm punpcklbw  xmm0, xmm1                   /* UV */                    \
    __asm punpcklwd  xmm0, xmm0                   /* UVUV (upsample) */       \
    __asm punpckldq  xmm0, xmm0                   /* UVUV (upsample) */       \
  }

// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 __asm {                                                      \
    __asm movq       xmm0, qword ptr [esi]        /* UV */ /* NOLINT */       \
    __asm lea        esi,  [esi + 8]                                          \
    __asm punpcklwd  xmm0, xmm0                   /* UVUV (upsample) */       \
  }

// Convert 8 pixels: 8 UV and 8 Y.
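// Rough scalar sketch of the fixed-point math below (derived from the YG, UB,
// UG, UR, VB, VG, VR constants above), with y, u, v taken as unsigned bytes:
//   B = clamp(((y - 16) * 74 + 127 * (u - 128)) >> 6)
//   G = clamp(((y - 16) * 74 -  25 * (u - 128) - 52 * (v - 128)) >> 6)
//   R = clamp(((y - 16) * 74 + 102 * (v - 128)) >> 6)
// The kUVBias* constants fold the "- 128" terms into the pmaddubsw result,
// and packuswb provides the final clamp to 0..255.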
1559 #define YUVTORGB __asm { \ 1560 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ 1561 __asm movdqa xmm1, xmm0 \ 1562 __asm movdqa xmm2, xmm0 \ 1563 __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ 1564 __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ 1565 __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ 1566 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ 1567 __asm psubw xmm1, kUVBiasG \ 1568 __asm psubw xmm2, kUVBiasR \ 1569 /* Step 2: Find Y contribution to 8 R,G,B values */ \ 1570 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ 1571 __asm lea eax, [eax + 8] \ 1572 __asm punpcklbw xmm3, xmm4 \ 1573 __asm psubsw xmm3, kYSub16 \ 1574 __asm pmullw xmm3, kYToRgb \ 1575 __asm paddsw xmm0, xmm3 /* B += Y */ \ 1576 __asm paddsw xmm1, xmm3 /* G += Y */ \ 1577 __asm paddsw xmm2, xmm3 /* R += Y */ \ 1578 __asm psraw xmm0, 6 \ 1579 __asm psraw xmm1, 6 \ 1580 __asm psraw xmm2, 6 \ 1581 __asm packuswb xmm0, xmm0 /* B */ \ 1582 __asm packuswb xmm1, xmm1 /* G */ \ 1583 __asm packuswb xmm2, xmm2 /* R */ \ 1584 } 1585 1586 // Convert 8 pixels: 8 VU and 8 Y. 1587 #define YVUTORGB __asm { \ 1588 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ 1589 __asm movdqa xmm1, xmm0 \ 1590 __asm movdqa xmm2, xmm0 \ 1591 __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \ 1592 __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \ 1593 __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \ 1594 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ 1595 __asm psubw xmm1, kUVBiasG \ 1596 __asm psubw xmm2, kUVBiasR \ 1597 /* Step 2: Find Y contribution to 8 R,G,B values */ \ 1598 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ 1599 __asm lea eax, [eax + 8] \ 1600 __asm punpcklbw xmm3, xmm4 \ 1601 __asm psubsw xmm3, kYSub16 \ 1602 __asm pmullw xmm3, kYToRgb \ 1603 __asm paddsw xmm0, xmm3 /* B += Y */ \ 1604 __asm paddsw xmm1, xmm3 /* G += Y */ \ 1605 __asm paddsw xmm2, xmm3 /* R += Y */ \ 1606 __asm psraw xmm0, 6 \ 1607 __asm psraw xmm1, 6 \ 1608 __asm psraw xmm2, 6 \ 1609 __asm packuswb xmm0, xmm0 /* B */ \ 1610 __asm packuswb xmm1, xmm1 /* G */ \ 1611 __asm packuswb xmm2, xmm2 /* R */ \ 1612 } 1613 1614 // 8 pixels, dest aligned 16. 1615 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 1616 __declspec(naked) __declspec(align(16)) 1617 void I444ToARGBRow_SSSE3(const uint8* y_buf, 1618 const uint8* u_buf, 1619 const uint8* v_buf, 1620 uint8* argb_buf, 1621 int width) { 1622 __asm { 1623 push esi 1624 push edi 1625 mov eax, [esp + 8 + 4] // Y 1626 mov esi, [esp + 8 + 8] // U 1627 mov edi, [esp + 8 + 12] // V 1628 mov edx, [esp + 8 + 16] // argb 1629 mov ecx, [esp + 8 + 20] // width 1630 sub edi, esi 1631 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 1632 pxor xmm4, xmm4 1633 1634 align 16 1635 convertloop: 1636 READYUV444 1637 YUVTORGB 1638 1639 // Step 3: Weave into ARGB 1640 punpcklbw xmm0, xmm1 // BG 1641 punpcklbw xmm2, xmm5 // RA 1642 movdqa xmm1, xmm0 1643 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 1644 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 1645 movdqa [edx], xmm0 1646 movdqa [edx + 16], xmm1 1647 lea edx, [edx + 32] 1648 sub ecx, 8 1649 jg convertloop 1650 1651 pop edi 1652 pop esi 1653 ret 1654 } 1655 } 1656 1657 // 8 pixels, dest aligned 16. 1658 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
1659 __declspec(naked) __declspec(align(16)) 1660 void I422ToARGBRow_SSSE3(const uint8* y_buf, 1661 const uint8* u_buf, 1662 const uint8* v_buf, 1663 uint8* argb_buf, 1664 int width) { 1665 __asm { 1666 push esi 1667 push edi 1668 mov eax, [esp + 8 + 4] // Y 1669 mov esi, [esp + 8 + 8] // U 1670 mov edi, [esp + 8 + 12] // V 1671 mov edx, [esp + 8 + 16] // argb 1672 mov ecx, [esp + 8 + 20] // width 1673 sub edi, esi 1674 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 1675 pxor xmm4, xmm4 1676 1677 align 16 1678 convertloop: 1679 READYUV422 1680 YUVTORGB 1681 1682 // Step 3: Weave into ARGB 1683 punpcklbw xmm0, xmm1 // BG 1684 punpcklbw xmm2, xmm5 // RA 1685 movdqa xmm1, xmm0 1686 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 1687 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 1688 movdqa [edx], xmm0 1689 movdqa [edx + 16], xmm1 1690 lea edx, [edx + 32] 1691 sub ecx, 8 1692 jg convertloop 1693 1694 pop edi 1695 pop esi 1696 ret 1697 } 1698 } 1699 1700 // 8 pixels, dest aligned 16. 1701 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 1702 // Similar to I420 but duplicate UV once more. 1703 __declspec(naked) __declspec(align(16)) 1704 void I411ToARGBRow_SSSE3(const uint8* y_buf, 1705 const uint8* u_buf, 1706 const uint8* v_buf, 1707 uint8* argb_buf, 1708 int width) { 1709 __asm { 1710 push esi 1711 push edi 1712 mov eax, [esp + 8 + 4] // Y 1713 mov esi, [esp + 8 + 8] // U 1714 mov edi, [esp + 8 + 12] // V 1715 mov edx, [esp + 8 + 16] // argb 1716 mov ecx, [esp + 8 + 20] // width 1717 sub edi, esi 1718 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 1719 pxor xmm4, xmm4 1720 1721 align 16 1722 convertloop: 1723 READYUV411 1724 YUVTORGB 1725 1726 // Step 3: Weave into ARGB 1727 punpcklbw xmm0, xmm1 // BG 1728 punpcklbw xmm2, xmm5 // RA 1729 movdqa xmm1, xmm0 1730 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 1731 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 1732 movdqa [edx], xmm0 1733 movdqa [edx + 16], xmm1 1734 lea edx, [edx + 32] 1735 sub ecx, 8 1736 jg convertloop 1737 1738 pop edi 1739 pop esi 1740 ret 1741 } 1742 } 1743 1744 // 8 pixels, dest aligned 16. 1745 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 1746 __declspec(naked) __declspec(align(16)) 1747 void NV12ToARGBRow_SSSE3(const uint8* y_buf, 1748 const uint8* uv_buf, 1749 uint8* argb_buf, 1750 int width) { 1751 __asm { 1752 push esi 1753 mov eax, [esp + 4 + 4] // Y 1754 mov esi, [esp + 4 + 8] // UV 1755 mov edx, [esp + 4 + 12] // argb 1756 mov ecx, [esp + 4 + 16] // width 1757 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 1758 pxor xmm4, xmm4 1759 1760 align 16 1761 convertloop: 1762 READNV12 1763 YUVTORGB 1764 1765 // Step 3: Weave into ARGB 1766 punpcklbw xmm0, xmm1 // BG 1767 punpcklbw xmm2, xmm5 // RA 1768 movdqa xmm1, xmm0 1769 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 1770 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 1771 movdqa [edx], xmm0 1772 movdqa [edx + 16], xmm1 1773 lea edx, [edx + 32] 1774 sub ecx, 8 1775 jg convertloop 1776 1777 pop esi 1778 ret 1779 } 1780 } 1781 1782 // 8 pixels, dest aligned 16. 1783 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
1784 __declspec(naked) __declspec(align(16)) 1785 void NV21ToARGBRow_SSSE3(const uint8* y_buf, 1786 const uint8* uv_buf, 1787 uint8* argb_buf, 1788 int width) { 1789 __asm { 1790 push esi 1791 mov eax, [esp + 4 + 4] // Y 1792 mov esi, [esp + 4 + 8] // VU 1793 mov edx, [esp + 4 + 12] // argb 1794 mov ecx, [esp + 4 + 16] // width 1795 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 1796 pxor xmm4, xmm4 1797 1798 align 16 1799 convertloop: 1800 READNV12 1801 YVUTORGB 1802 1803 // Step 3: Weave into ARGB 1804 punpcklbw xmm0, xmm1 // BG 1805 punpcklbw xmm2, xmm5 // RA 1806 movdqa xmm1, xmm0 1807 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 1808 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 1809 movdqa [edx], xmm0 1810 movdqa [edx + 16], xmm1 1811 lea edx, [edx + 32] 1812 sub ecx, 8 1813 jg convertloop 1814 1815 pop esi 1816 ret 1817 } 1818 } 1819 1820 // 8 pixels, unaligned. 1821 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 1822 __declspec(naked) __declspec(align(16)) 1823 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1824 const uint8* u_buf, 1825 const uint8* v_buf, 1826 uint8* argb_buf, 1827 int width) { 1828 __asm { 1829 push esi 1830 push edi 1831 mov eax, [esp + 8 + 4] // Y 1832 mov esi, [esp + 8 + 8] // U 1833 mov edi, [esp + 8 + 12] // V 1834 mov edx, [esp + 8 + 16] // argb 1835 mov ecx, [esp + 8 + 20] // width 1836 sub edi, esi 1837 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 1838 pxor xmm4, xmm4 1839 1840 align 16 1841 convertloop: 1842 READYUV444 1843 YUVTORGB 1844 1845 // Step 3: Weave into ARGB 1846 punpcklbw xmm0, xmm1 // BG 1847 punpcklbw xmm2, xmm5 // RA 1848 movdqa xmm1, xmm0 1849 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 1850 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 1851 movdqu [edx], xmm0 1852 movdqu [edx + 16], xmm1 1853 lea edx, [edx + 32] 1854 sub ecx, 8 1855 jg convertloop 1856 1857 pop edi 1858 pop esi 1859 ret 1860 } 1861 } 1862 1863 // 8 pixels, unaligned. 1864 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 1865 __declspec(naked) __declspec(align(16)) 1866 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1867 const uint8* u_buf, 1868 const uint8* v_buf, 1869 uint8* argb_buf, 1870 int width) { 1871 __asm { 1872 push esi 1873 push edi 1874 mov eax, [esp + 8 + 4] // Y 1875 mov esi, [esp + 8 + 8] // U 1876 mov edi, [esp + 8 + 12] // V 1877 mov edx, [esp + 8 + 16] // argb 1878 mov ecx, [esp + 8 + 20] // width 1879 sub edi, esi 1880 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 1881 pxor xmm4, xmm4 1882 1883 align 16 1884 convertloop: 1885 READYUV422 1886 YUVTORGB 1887 1888 // Step 3: Weave into ARGB 1889 punpcklbw xmm0, xmm1 // BG 1890 punpcklbw xmm2, xmm5 // RA 1891 movdqa xmm1, xmm0 1892 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 1893 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 1894 movdqu [edx], xmm0 1895 movdqu [edx + 16], xmm1 1896 lea edx, [edx + 32] 1897 sub ecx, 8 1898 jg convertloop 1899 1900 pop edi 1901 pop esi 1902 ret 1903 } 1904 } 1905 1906 // 8 pixels, unaligned. 1907 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 1908 // Similar to I420 but duplicate UV once more. 
1909 __declspec(naked) __declspec(align(16)) 1910 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1911 const uint8* u_buf, 1912 const uint8* v_buf, 1913 uint8* argb_buf, 1914 int width) { 1915 __asm { 1916 push esi 1917 push edi 1918 mov eax, [esp + 8 + 4] // Y 1919 mov esi, [esp + 8 + 8] // U 1920 mov edi, [esp + 8 + 12] // V 1921 mov edx, [esp + 8 + 16] // argb 1922 mov ecx, [esp + 8 + 20] // width 1923 sub edi, esi 1924 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 1925 pxor xmm4, xmm4 1926 1927 align 16 1928 convertloop: 1929 READYUV411 1930 YUVTORGB 1931 1932 // Step 3: Weave into ARGB 1933 punpcklbw xmm0, xmm1 // BG 1934 punpcklbw xmm2, xmm5 // RA 1935 movdqa xmm1, xmm0 1936 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 1937 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 1938 movdqu [edx], xmm0 1939 movdqu [edx + 16], xmm1 1940 lea edx, [edx + 32] 1941 sub ecx, 8 1942 jg convertloop 1943 1944 pop edi 1945 pop esi 1946 ret 1947 } 1948 } 1949 1950 1951 // 8 pixels, dest aligned 16. 1952 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 1953 __declspec(naked) __declspec(align(16)) 1954 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1955 const uint8* uv_buf, 1956 uint8* argb_buf, 1957 int width) { 1958 __asm { 1959 push esi 1960 mov eax, [esp + 4 + 4] // Y 1961 mov esi, [esp + 4 + 8] // UV 1962 mov edx, [esp + 4 + 12] // argb 1963 mov ecx, [esp + 4 + 16] // width 1964 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 1965 pxor xmm4, xmm4 1966 1967 align 16 1968 convertloop: 1969 READNV12 1970 YUVTORGB 1971 1972 // Step 3: Weave into ARGB 1973 punpcklbw xmm0, xmm1 // BG 1974 punpcklbw xmm2, xmm5 // RA 1975 movdqa xmm1, xmm0 1976 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 1977 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 1978 movdqu [edx], xmm0 1979 movdqu [edx + 16], xmm1 1980 lea edx, [edx + 32] 1981 sub ecx, 8 1982 jg convertloop 1983 1984 pop esi 1985 ret 1986 } 1987 } 1988 1989 // 8 pixels, dest aligned 16. 1990 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
1991 __declspec(naked) __declspec(align(16)) 1992 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1993 const uint8* uv_buf, 1994 uint8* argb_buf, 1995 int width) { 1996 __asm { 1997 push esi 1998 mov eax, [esp + 4 + 4] // Y 1999 mov esi, [esp + 4 + 8] // VU 2000 mov edx, [esp + 4 + 12] // argb 2001 mov ecx, [esp + 4 + 16] // width 2002 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2003 pxor xmm4, xmm4 2004 2005 align 16 2006 convertloop: 2007 READNV12 2008 YVUTORGB 2009 2010 // Step 3: Weave into ARGB 2011 punpcklbw xmm0, xmm1 // BG 2012 punpcklbw xmm2, xmm5 // RA 2013 movdqa xmm1, xmm0 2014 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 2015 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 2016 movdqu [edx], xmm0 2017 movdqu [edx + 16], xmm1 2018 lea edx, [edx + 32] 2019 sub ecx, 8 2020 jg convertloop 2021 2022 pop esi 2023 ret 2024 } 2025 } 2026 2027 __declspec(naked) __declspec(align(16)) 2028 void I422ToBGRARow_SSSE3(const uint8* y_buf, 2029 const uint8* u_buf, 2030 const uint8* v_buf, 2031 uint8* bgra_buf, 2032 int width) { 2033 __asm { 2034 push esi 2035 push edi 2036 mov eax, [esp + 8 + 4] // Y 2037 mov esi, [esp + 8 + 8] // U 2038 mov edi, [esp + 8 + 12] // V 2039 mov edx, [esp + 8 + 16] // bgra 2040 mov ecx, [esp + 8 + 20] // width 2041 sub edi, esi 2042 pxor xmm4, xmm4 2043 2044 align 16 2045 convertloop: 2046 READYUV422 2047 YUVTORGB 2048 2049 // Step 3: Weave into BGRA 2050 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2051 punpcklbw xmm1, xmm0 // GB 2052 punpcklbw xmm5, xmm2 // AR 2053 movdqa xmm0, xmm5 2054 punpcklwd xmm5, xmm1 // BGRA first 4 pixels 2055 punpckhwd xmm0, xmm1 // BGRA next 4 pixels 2056 movdqa [edx], xmm5 2057 movdqa [edx + 16], xmm0 2058 lea edx, [edx + 32] 2059 sub ecx, 8 2060 jg convertloop 2061 2062 pop edi 2063 pop esi 2064 ret 2065 } 2066 } 2067 2068 __declspec(naked) __declspec(align(16)) 2069 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, 2070 const uint8* u_buf, 2071 const uint8* v_buf, 2072 uint8* bgra_buf, 2073 int width) { 2074 __asm { 2075 push esi 2076 push edi 2077 mov eax, [esp + 8 + 4] // Y 2078 mov esi, [esp + 8 + 8] // U 2079 mov edi, [esp + 8 + 12] // V 2080 mov edx, [esp + 8 + 16] // bgra 2081 mov ecx, [esp + 8 + 20] // width 2082 sub edi, esi 2083 pxor xmm4, xmm4 2084 2085 align 16 2086 convertloop: 2087 READYUV422 2088 YUVTORGB 2089 2090 // Step 3: Weave into BGRA 2091 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2092 punpcklbw xmm1, xmm0 // GB 2093 punpcklbw xmm5, xmm2 // AR 2094 movdqa xmm0, xmm5 2095 punpcklwd xmm5, xmm1 // BGRA first 4 pixels 2096 punpckhwd xmm0, xmm1 // BGRA next 4 pixels 2097 movdqu [edx], xmm5 2098 movdqu [edx + 16], xmm0 2099 lea edx, [edx + 32] 2100 sub ecx, 8 2101 jg convertloop 2102 2103 pop edi 2104 pop esi 2105 ret 2106 } 2107 } 2108 2109 __declspec(naked) __declspec(align(16)) 2110 void I422ToABGRRow_SSSE3(const uint8* y_buf, 2111 const uint8* u_buf, 2112 const uint8* v_buf, 2113 uint8* abgr_buf, 2114 int width) { 2115 __asm { 2116 push esi 2117 push edi 2118 mov eax, [esp + 8 + 4] // Y 2119 mov esi, [esp + 8 + 8] // U 2120 mov edi, [esp + 8 + 12] // V 2121 mov edx, [esp + 8 + 16] // abgr 2122 mov ecx, [esp + 8 + 20] // width 2123 sub edi, esi 2124 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2125 pxor xmm4, xmm4 2126 2127 align 16 2128 convertloop: 2129 READYUV422 2130 YUVTORGB 2131 2132 // Step 3: Weave into ARGB 2133 punpcklbw xmm2, xmm1 // RG 2134 punpcklbw xmm0, xmm5 // BA 2135 movdqa xmm1, xmm2 2136 punpcklwd xmm2, xmm0 // RGBA first 4 pixels 2137 punpckhwd xmm1, xmm0 
// RGBA next 4 pixels 2138 movdqa [edx], xmm2 2139 movdqa [edx + 16], xmm1 2140 lea edx, [edx + 32] 2141 sub ecx, 8 2142 jg convertloop 2143 2144 pop edi 2145 pop esi 2146 ret 2147 } 2148 } 2149 2150 __declspec(naked) __declspec(align(16)) 2151 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, 2152 const uint8* u_buf, 2153 const uint8* v_buf, 2154 uint8* abgr_buf, 2155 int width) { 2156 __asm { 2157 push esi 2158 push edi 2159 mov eax, [esp + 8 + 4] // Y 2160 mov esi, [esp + 8 + 8] // U 2161 mov edi, [esp + 8 + 12] // V 2162 mov edx, [esp + 8 + 16] // abgr 2163 mov ecx, [esp + 8 + 20] // width 2164 sub edi, esi 2165 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2166 pxor xmm4, xmm4 2167 2168 align 16 2169 convertloop: 2170 READYUV422 2171 YUVTORGB 2172 2173 // Step 3: Weave into ARGB 2174 punpcklbw xmm2, xmm1 // RG 2175 punpcklbw xmm0, xmm5 // BA 2176 movdqa xmm1, xmm2 2177 punpcklwd xmm2, xmm0 // RGBA first 4 pixels 2178 punpckhwd xmm1, xmm0 // RGBA next 4 pixels 2179 movdqu [edx], xmm2 2180 movdqu [edx + 16], xmm1 2181 lea edx, [edx + 32] 2182 sub ecx, 8 2183 jg convertloop 2184 2185 pop edi 2186 pop esi 2187 ret 2188 } 2189 } 2190 2191 __declspec(naked) __declspec(align(16)) 2192 void I422ToRGBARow_SSSE3(const uint8* y_buf, 2193 const uint8* u_buf, 2194 const uint8* v_buf, 2195 uint8* rgba_buf, 2196 int width) { 2197 __asm { 2198 push esi 2199 push edi 2200 mov eax, [esp + 8 + 4] // Y 2201 mov esi, [esp + 8 + 8] // U 2202 mov edi, [esp + 8 + 12] // V 2203 mov edx, [esp + 8 + 16] // rgba 2204 mov ecx, [esp + 8 + 20] // width 2205 sub edi, esi 2206 pxor xmm4, xmm4 2207 2208 align 16 2209 convertloop: 2210 READYUV422 2211 YUVTORGB 2212 2213 // Step 3: Weave into RGBA 2214 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2215 punpcklbw xmm1, xmm2 // GR 2216 punpcklbw xmm5, xmm0 // AB 2217 movdqa xmm0, xmm5 2218 punpcklwd xmm5, xmm1 // RGBA first 4 pixels 2219 punpckhwd xmm0, xmm1 // RGBA next 4 pixels 2220 movdqa [edx], xmm5 2221 movdqa [edx + 16], xmm0 2222 lea edx, [edx + 32] 2223 sub ecx, 8 2224 jg convertloop 2225 2226 pop edi 2227 pop esi 2228 ret 2229 } 2230 } 2231 2232 __declspec(naked) __declspec(align(16)) 2233 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, 2234 const uint8* u_buf, 2235 const uint8* v_buf, 2236 uint8* rgba_buf, 2237 int width) { 2238 __asm { 2239 push esi 2240 push edi 2241 mov eax, [esp + 8 + 4] // Y 2242 mov esi, [esp + 8 + 8] // U 2243 mov edi, [esp + 8 + 12] // V 2244 mov edx, [esp + 8 + 16] // rgba 2245 mov ecx, [esp + 8 + 20] // width 2246 sub edi, esi 2247 pxor xmm4, xmm4 2248 2249 align 16 2250 convertloop: 2251 READYUV422 2252 YUVTORGB 2253 2254 // Step 3: Weave into RGBA 2255 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2256 punpcklbw xmm1, xmm2 // GR 2257 punpcklbw xmm5, xmm0 // AB 2258 movdqa xmm0, xmm5 2259 punpcklwd xmm5, xmm1 // RGBA first 4 pixels 2260 punpckhwd xmm0, xmm1 // RGBA next 4 pixels 2261 movdqu [edx], xmm5 2262 movdqu [edx + 16], xmm0 2263 lea edx, [edx + 32] 2264 sub ecx, 8 2265 jg convertloop 2266 2267 pop edi 2268 pop esi 2269 ret 2270 } 2271 } 2272 2273 #endif // HAS_I422TOARGBROW_SSSE3 2274 2275 #ifdef HAS_YTOARGBROW_SSE2 2276 __declspec(naked) __declspec(align(16)) 2277 void YToARGBRow_SSE2(const uint8* y_buf, 2278 uint8* rgb_buf, 2279 int width) { 2280 __asm { 2281 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 2282 pslld xmm4, 24 2283 mov eax,0x10001000 2284 movd xmm3,eax 2285 pshufd xmm3,xmm3,0 2286 mov eax,0x012a012a 2287 movd xmm2,eax 2288 pshufd xmm2,xmm2,0 2289 mov eax, [esp + 4] // Y 2290 mov 
edx, [esp + 8] // rgb 2291 mov ecx, [esp + 12] // width 2292 2293 align 16 2294 convertloop: 2295 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 2296 movq xmm0, qword ptr [eax] 2297 lea eax, [eax + 8] 2298 punpcklbw xmm0, xmm0 // Y.Y 2299 psubusw xmm0, xmm3 2300 pmulhuw xmm0, xmm2 2301 packuswb xmm0, xmm0 // G 2302 2303 // Step 2: Weave into ARGB 2304 punpcklbw xmm0, xmm0 // GG 2305 movdqa xmm1, xmm0 2306 punpcklwd xmm0, xmm0 // BGRA first 4 pixels 2307 punpckhwd xmm1, xmm1 // BGRA next 4 pixels 2308 por xmm0, xmm4 2309 por xmm1, xmm4 2310 movdqa [edx], xmm0 2311 movdqa [edx + 16], xmm1 2312 lea edx, [edx + 32] 2313 sub ecx, 8 2314 jg convertloop 2315 2316 ret 2317 } 2318 } 2319 #endif // HAS_YTOARGBROW_SSE2 2320 2321 #ifdef HAS_MIRRORROW_SSSE3 2322 2323 // Shuffle table for reversing the bytes. 2324 static const uvec8 kShuffleMirror = { 2325 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 2326 }; 2327 2328 __declspec(naked) __declspec(align(16)) 2329 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 2330 __asm { 2331 mov eax, [esp + 4] // src 2332 mov edx, [esp + 8] // dst 2333 mov ecx, [esp + 12] // width 2334 movdqa xmm5, kShuffleMirror 2335 lea eax, [eax - 16] 2336 2337 align 16 2338 convertloop: 2339 movdqa xmm0, [eax + ecx] 2340 pshufb xmm0, xmm5 2341 sub ecx, 16 2342 movdqa [edx], xmm0 2343 lea edx, [edx + 16] 2344 jg convertloop 2345 ret 2346 } 2347 } 2348 #endif // HAS_MIRRORROW_SSSE3 2349 2350 #ifdef HAS_MIRRORROW_SSE2 2351 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 2352 // version can not. 2353 __declspec(naked) __declspec(align(16)) 2354 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { 2355 __asm { 2356 mov eax, [esp + 4] // src 2357 mov edx, [esp + 8] // dst 2358 mov ecx, [esp + 12] // width 2359 lea eax, [eax - 16] 2360 2361 align 16 2362 convertloop: 2363 movdqu xmm0, [eax + ecx] 2364 movdqa xmm1, xmm0 // swap bytes 2365 psllw xmm0, 8 2366 psrlw xmm1, 8 2367 por xmm0, xmm1 2368 pshuflw xmm0, xmm0, 0x1b // swap words 2369 pshufhw xmm0, xmm0, 0x1b 2370 pshufd xmm0, xmm0, 0x4e // swap qwords 2371 sub ecx, 16 2372 movdqu [edx], xmm0 2373 lea edx, [edx + 16] 2374 jg convertloop 2375 ret 2376 } 2377 } 2378 #endif // HAS_MIRRORROW_SSE2 2379 2380 #ifdef HAS_MIRRORROW_UV_SSSE3 2381 // Shuffle table for reversing the bytes of UV channels. 2382 static const uvec8 kShuffleMirrorUV = { 2383 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 2384 }; 2385 2386 __declspec(naked) __declspec(align(16)) 2387 void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, 2388 int width) { 2389 __asm { 2390 push edi 2391 mov eax, [esp + 4 + 4] // src 2392 mov edx, [esp + 4 + 8] // dst_u 2393 mov edi, [esp + 4 + 12] // dst_v 2394 mov ecx, [esp + 4 + 16] // width 2395 movdqa xmm1, kShuffleMirrorUV 2396 lea eax, [eax + ecx * 2 - 16] 2397 sub edi, edx 2398 2399 align 16 2400 convertloop: 2401 movdqa xmm0, [eax] 2402 lea eax, [eax - 16] 2403 pshufb xmm0, xmm1 2404 sub ecx, 8 2405 movlpd qword ptr [edx], xmm0 2406 movhpd qword ptr [edx + edi], xmm0 2407 lea edx, [edx + 8] 2408 jg convertloop 2409 2410 pop edi 2411 ret 2412 } 2413 } 2414 #endif // HAS_MIRRORROW_UV_SSSE3 2415 2416 #ifdef HAS_ARGBMIRRORROW_SSSE3 2417 2418 // Shuffle table for reversing the bytes. 
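// The table that follows reverses the four 4-byte pixels within each 16 byte
// block. As a hedged scalar reference (an illustrative addition, not part of
// the original source; helper name hypothetical), whole-row ARGB mirroring is
// simply a reversed pixel copy:
static void ARGBMirrorRow_C_Sketch(const uint8* src, uint8* dst, int width) {
  const uint32* src_pix = (const uint32*)src;  // 1 uint32 per ARGB pixel
  uint32* dst_pix = (uint32*)dst;
  for (int x = 0; x < width; ++x) {
    dst_pix[x] = src_pix[width - 1 - x];  // copy pixels in reverse order
  }
}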
2419 static const uvec8 kARGBShuffleMirror = { 2420 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u 2421 }; 2422 2423 __declspec(naked) __declspec(align(16)) 2424 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 2425 __asm { 2426 mov eax, [esp + 4] // src 2427 mov edx, [esp + 8] // dst 2428 mov ecx, [esp + 12] // width 2429 movdqa xmm5, kARGBShuffleMirror 2430 lea eax, [eax - 16] 2431 2432 align 16 2433 convertloop: 2434 movdqa xmm0, [eax + ecx * 4] 2435 pshufb xmm0, xmm5 2436 sub ecx, 4 2437 movdqa [edx], xmm0 2438 lea edx, [edx + 16] 2439 jg convertloop 2440 ret 2441 } 2442 } 2443 #endif // HAS_ARGBMIRRORROW_SSSE3 2444 2445 #ifdef HAS_SPLITUV_SSE2 2446 __declspec(naked) __declspec(align(16)) 2447 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 2448 __asm { 2449 push edi 2450 mov eax, [esp + 4 + 4] // src_uv 2451 mov edx, [esp + 4 + 8] // dst_u 2452 mov edi, [esp + 4 + 12] // dst_v 2453 mov ecx, [esp + 4 + 16] // pix 2454 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2455 psrlw xmm5, 8 2456 sub edi, edx 2457 2458 align 16 2459 convertloop: 2460 movdqa xmm0, [eax] 2461 movdqa xmm1, [eax + 16] 2462 lea eax, [eax + 32] 2463 movdqa xmm2, xmm0 2464 movdqa xmm3, xmm1 2465 pand xmm0, xmm5 // even bytes 2466 pand xmm1, xmm5 2467 packuswb xmm0, xmm1 2468 psrlw xmm2, 8 // odd bytes 2469 psrlw xmm3, 8 2470 packuswb xmm2, xmm3 2471 movdqa [edx], xmm0 2472 movdqa [edx + edi], xmm2 2473 lea edx, [edx + 16] 2474 sub ecx, 16 2475 jg convertloop 2476 2477 pop edi 2478 ret 2479 } 2480 } 2481 #endif // HAS_SPLITUV_SSE2 2482 2483 #ifdef HAS_COPYROW_SSE2 2484 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. 2485 __declspec(naked) __declspec(align(16)) 2486 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { 2487 __asm { 2488 mov eax, [esp + 4] // src 2489 mov edx, [esp + 8] // dst 2490 mov ecx, [esp + 12] // count 2491 sub edx, eax 2492 2493 align 16 2494 convertloop: 2495 movdqa xmm0, [eax] 2496 movdqa xmm1, [eax + 16] 2497 movdqa [eax + edx], xmm0 2498 movdqa [eax + edx + 16], xmm1 2499 lea eax, [eax + 32] 2500 sub ecx, 32 2501 jg convertloop 2502 ret 2503 } 2504 } 2505 #endif // HAS_COPYROW_SSE2 2506 2507 #ifdef HAS_COPYROW_X86 2508 __declspec(naked) __declspec(align(16)) 2509 void CopyRow_X86(const uint8* src, uint8* dst, int count) { 2510 __asm { 2511 mov eax, esi 2512 mov edx, edi 2513 mov esi, [esp + 4] // src 2514 mov edi, [esp + 8] // dst 2515 mov ecx, [esp + 12] // count 2516 shr ecx, 2 2517 rep movsd 2518 mov edi, edx 2519 mov esi, eax 2520 ret 2521 } 2522 } 2523 #endif // HAS_COPYROW_X86 2524 2525 #ifdef HAS_SETROW_X86 2526 // SetRow8 writes 'count' bytes using a 32 bit value repeated. 2527 __declspec(naked) __declspec(align(16)) 2528 void SetRow8_X86(uint8* dst, uint32 v32, int count) { 2529 __asm { 2530 mov edx, edi 2531 mov edi, [esp + 4] // dst 2532 mov eax, [esp + 8] // v32 2533 mov ecx, [esp + 12] // count 2534 shr ecx, 2 2535 rep stosd 2536 mov edi, edx 2537 ret 2538 } 2539 } 2540 2541 // SetRow32 writes 'count' words using a 32 bit value repeated. 
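// Hedged scalar reference (an illustrative addition, not part of the original
// source; helper name hypothetical) for the rep stosd fill performed by
// SetRows32_X86 below: every row receives 'width' copies of the 32 bit value,
// and successive rows start 'dst_stride' bytes apart.
static void SetRows32_C_Sketch(uint8* dst, uint32 v32, int width,
                               int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    uint32* d = (uint32*)(dst + y * dst_stride);
    for (int x = 0; x < width; ++x) {
      d[x] = v32;
    }
  }
}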
2542 __declspec(naked) __declspec(align(16)) 2543 void SetRows32_X86(uint8* dst, uint32 v32, int width, 2544 int dst_stride, int height) { 2545 __asm { 2546 push esi 2547 push edi 2548 push ebp 2549 mov edi, [esp + 12 + 4] // dst 2550 mov eax, [esp + 12 + 8] // v32 2551 mov ebp, [esp + 12 + 12] // width 2552 mov edx, [esp + 12 + 16] // dst_stride 2553 mov esi, [esp + 12 + 20] // height 2554 lea ecx, [ebp * 4] 2555 sub edx, ecx // stride - width * 4 2556 2557 align 16 2558 convertloop: 2559 mov ecx, ebp 2560 rep stosd 2561 add edi, edx 2562 sub esi, 1 2563 jg convertloop 2564 2565 pop ebp 2566 pop edi 2567 pop esi 2568 ret 2569 } 2570 } 2571 #endif // HAS_SETROW_X86 2572 2573 #ifdef HAS_YUY2TOYROW_SSE2 2574 __declspec(naked) __declspec(align(16)) 2575 void YUY2ToYRow_SSE2(const uint8* src_yuy2, 2576 uint8* dst_y, int pix) { 2577 __asm { 2578 mov eax, [esp + 4] // src_yuy2 2579 mov edx, [esp + 8] // dst_y 2580 mov ecx, [esp + 12] // pix 2581 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2582 psrlw xmm5, 8 2583 2584 align 16 2585 convertloop: 2586 movdqa xmm0, [eax] 2587 movdqa xmm1, [eax + 16] 2588 lea eax, [eax + 32] 2589 pand xmm0, xmm5 // even bytes are Y 2590 pand xmm1, xmm5 2591 packuswb xmm0, xmm1 2592 sub ecx, 16 2593 movdqa [edx], xmm0 2594 lea edx, [edx + 16] 2595 jg convertloop 2596 ret 2597 } 2598 } 2599 2600 __declspec(naked) __declspec(align(16)) 2601 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 2602 uint8* dst_u, uint8* dst_v, int pix) { 2603 __asm { 2604 push esi 2605 push edi 2606 mov eax, [esp + 8 + 4] // src_yuy2 2607 mov esi, [esp + 8 + 8] // stride_yuy2 2608 mov edx, [esp + 8 + 12] // dst_u 2609 mov edi, [esp + 8 + 16] // dst_v 2610 mov ecx, [esp + 8 + 20] // pix 2611 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2612 psrlw xmm5, 8 2613 sub edi, edx 2614 2615 align 16 2616 convertloop: 2617 movdqa xmm0, [eax] 2618 movdqa xmm1, [eax + 16] 2619 movdqa xmm2, [eax + esi] 2620 movdqa xmm3, [eax + esi + 16] 2621 lea eax, [eax + 32] 2622 pavgb xmm0, xmm2 2623 pavgb xmm1, xmm3 2624 psrlw xmm0, 8 // YUYV -> UVUV 2625 psrlw xmm1, 8 2626 packuswb xmm0, xmm1 2627 movdqa xmm1, xmm0 2628 pand xmm0, xmm5 // U 2629 packuswb xmm0, xmm0 2630 psrlw xmm1, 8 // V 2631 packuswb xmm1, xmm1 2632 movq qword ptr [edx], xmm0 2633 movq qword ptr [edx + edi], xmm1 2634 lea edx, [edx + 8] 2635 sub ecx, 16 2636 jg convertloop 2637 2638 pop edi 2639 pop esi 2640 ret 2641 } 2642 } 2643 2644 __declspec(naked) __declspec(align(16)) 2645 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 2646 uint8* dst_u, uint8* dst_v, int pix) { 2647 __asm { 2648 push edi 2649 mov eax, [esp + 4 + 4] // src_yuy2 2650 mov edx, [esp + 4 + 8] // dst_u 2651 mov edi, [esp + 4 + 12] // dst_v 2652 mov ecx, [esp + 4 + 16] // pix 2653 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2654 psrlw xmm5, 8 2655 sub edi, edx 2656 2657 align 16 2658 convertloop: 2659 movdqa xmm0, [eax] 2660 movdqa xmm1, [eax + 16] 2661 lea eax, [eax + 32] 2662 psrlw xmm0, 8 // YUYV -> UVUV 2663 psrlw xmm1, 8 2664 packuswb xmm0, xmm1 2665 movdqa xmm1, xmm0 2666 pand xmm0, xmm5 // U 2667 packuswb xmm0, xmm0 2668 psrlw xmm1, 8 // V 2669 packuswb xmm1, xmm1 2670 movq qword ptr [edx], xmm0 2671 movq qword ptr [edx + edi], xmm1 2672 lea edx, [edx + 8] 2673 sub ecx, 16 2674 jg convertloop 2675 2676 pop edi 2677 ret 2678 } 2679 } 2680 2681 __declspec(naked) __declspec(align(16)) 2682 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, 2683 uint8* dst_y, int pix) { 2684 __asm { 2685 mov eax, [esp + 4] // src_yuy2 2686 mov edx, [esp + 8] // dst_y 
2687 mov ecx, [esp + 12] // pix 2688 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2689 psrlw xmm5, 8 2690 2691 align 16 2692 convertloop: 2693 movdqu xmm0, [eax] 2694 movdqu xmm1, [eax + 16] 2695 lea eax, [eax + 32] 2696 pand xmm0, xmm5 // even bytes are Y 2697 pand xmm1, xmm5 2698 packuswb xmm0, xmm1 2699 sub ecx, 16 2700 movdqu [edx], xmm0 2701 lea edx, [edx + 16] 2702 jg convertloop 2703 ret 2704 } 2705 } 2706 2707 __declspec(naked) __declspec(align(16)) 2708 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, 2709 uint8* dst_u, uint8* dst_v, int pix) { 2710 __asm { 2711 push esi 2712 push edi 2713 mov eax, [esp + 8 + 4] // src_yuy2 2714 mov esi, [esp + 8 + 8] // stride_yuy2 2715 mov edx, [esp + 8 + 12] // dst_u 2716 mov edi, [esp + 8 + 16] // dst_v 2717 mov ecx, [esp + 8 + 20] // pix 2718 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2719 psrlw xmm5, 8 2720 sub edi, edx 2721 2722 align 16 2723 convertloop: 2724 movdqu xmm0, [eax] 2725 movdqu xmm1, [eax + 16] 2726 movdqu xmm2, [eax + esi] 2727 movdqu xmm3, [eax + esi + 16] 2728 lea eax, [eax + 32] 2729 pavgb xmm0, xmm2 2730 pavgb xmm1, xmm3 2731 psrlw xmm0, 8 // YUYV -> UVUV 2732 psrlw xmm1, 8 2733 packuswb xmm0, xmm1 2734 movdqa xmm1, xmm0 2735 pand xmm0, xmm5 // U 2736 packuswb xmm0, xmm0 2737 psrlw xmm1, 8 // V 2738 packuswb xmm1, xmm1 2739 movq qword ptr [edx], xmm0 2740 movq qword ptr [edx + edi], xmm1 2741 lea edx, [edx + 8] 2742 sub ecx, 16 2743 jg convertloop 2744 2745 pop edi 2746 pop esi 2747 ret 2748 } 2749 } 2750 2751 __declspec(naked) __declspec(align(16)) 2752 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, 2753 uint8* dst_u, uint8* dst_v, int pix) { 2754 __asm { 2755 push edi 2756 mov eax, [esp + 4 + 4] // src_yuy2 2757 mov edx, [esp + 4 + 8] // dst_u 2758 mov edi, [esp + 4 + 12] // dst_v 2759 mov ecx, [esp + 4 + 16] // pix 2760 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2761 psrlw xmm5, 8 2762 sub edi, edx 2763 2764 align 16 2765 convertloop: 2766 movdqu xmm0, [eax] 2767 movdqu xmm1, [eax + 16] 2768 lea eax, [eax + 32] 2769 psrlw xmm0, 8 // YUYV -> UVUV 2770 psrlw xmm1, 8 2771 packuswb xmm0, xmm1 2772 movdqa xmm1, xmm0 2773 pand xmm0, xmm5 // U 2774 packuswb xmm0, xmm0 2775 psrlw xmm1, 8 // V 2776 packuswb xmm1, xmm1 2777 movq qword ptr [edx], xmm0 2778 movq qword ptr [edx + edi], xmm1 2779 lea edx, [edx + 8] 2780 sub ecx, 16 2781 jg convertloop 2782 2783 pop edi 2784 ret 2785 } 2786 } 2787 2788 __declspec(naked) __declspec(align(16)) 2789 void UYVYToYRow_SSE2(const uint8* src_uyvy, 2790 uint8* dst_y, int pix) { 2791 __asm { 2792 mov eax, [esp + 4] // src_uyvy 2793 mov edx, [esp + 8] // dst_y 2794 mov ecx, [esp + 12] // pix 2795 2796 align 16 2797 convertloop: 2798 movdqa xmm0, [eax] 2799 movdqa xmm1, [eax + 16] 2800 lea eax, [eax + 32] 2801 psrlw xmm0, 8 // odd bytes are Y 2802 psrlw xmm1, 8 2803 packuswb xmm0, xmm1 2804 sub ecx, 16 2805 movdqa [edx], xmm0 2806 lea edx, [edx + 16] 2807 jg convertloop 2808 ret 2809 } 2810 } 2811 2812 __declspec(naked) __declspec(align(16)) 2813 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 2814 uint8* dst_u, uint8* dst_v, int pix) { 2815 __asm { 2816 push esi 2817 push edi 2818 mov eax, [esp + 8 + 4] // src_yuy2 2819 mov esi, [esp + 8 + 8] // stride_yuy2 2820 mov edx, [esp + 8 + 12] // dst_u 2821 mov edi, [esp + 8 + 16] // dst_v 2822 mov ecx, [esp + 8 + 20] // pix 2823 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2824 psrlw xmm5, 8 2825 sub edi, edx 2826 2827 align 16 2828 convertloop: 2829 movdqa xmm0, [eax] 2830 movdqa xmm1, [eax + 
16] 2831 movdqa xmm2, [eax + esi] 2832 movdqa xmm3, [eax + esi + 16] 2833 lea eax, [eax + 32] 2834 pavgb xmm0, xmm2 2835 pavgb xmm1, xmm3 2836 pand xmm0, xmm5 // UYVY -> UVUV 2837 pand xmm1, xmm5 2838 packuswb xmm0, xmm1 2839 movdqa xmm1, xmm0 2840 pand xmm0, xmm5 // U 2841 packuswb xmm0, xmm0 2842 psrlw xmm1, 8 // V 2843 packuswb xmm1, xmm1 2844 movq qword ptr [edx], xmm0 2845 movq qword ptr [edx + edi], xmm1 2846 lea edx, [edx + 8] 2847 sub ecx, 16 2848 jg convertloop 2849 2850 pop edi 2851 pop esi 2852 ret 2853 } 2854 } 2855 2856 __declspec(naked) __declspec(align(16)) 2857 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, 2858 uint8* dst_u, uint8* dst_v, int pix) { 2859 __asm { 2860 push edi 2861 mov eax, [esp + 4 + 4] // src_yuy2 2862 mov edx, [esp + 4 + 8] // dst_u 2863 mov edi, [esp + 4 + 12] // dst_v 2864 mov ecx, [esp + 4 + 16] // pix 2865 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2866 psrlw xmm5, 8 2867 sub edi, edx 2868 2869 align 16 2870 convertloop: 2871 movdqa xmm0, [eax] 2872 movdqa xmm1, [eax + 16] 2873 lea eax, [eax + 32] 2874 pand xmm0, xmm5 // UYVY -> UVUV 2875 pand xmm1, xmm5 2876 packuswb xmm0, xmm1 2877 movdqa xmm1, xmm0 2878 pand xmm0, xmm5 // U 2879 packuswb xmm0, xmm0 2880 psrlw xmm1, 8 // V 2881 packuswb xmm1, xmm1 2882 movq qword ptr [edx], xmm0 2883 movq qword ptr [edx + edi], xmm1 2884 lea edx, [edx + 8] 2885 sub ecx, 16 2886 jg convertloop 2887 2888 pop edi 2889 ret 2890 } 2891 } 2892 2893 __declspec(naked) __declspec(align(16)) 2894 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, 2895 uint8* dst_y, int pix) { 2896 __asm { 2897 mov eax, [esp + 4] // src_uyvy 2898 mov edx, [esp + 8] // dst_y 2899 mov ecx, [esp + 12] // pix 2900 2901 align 16 2902 convertloop: 2903 movdqu xmm0, [eax] 2904 movdqu xmm1, [eax + 16] 2905 lea eax, [eax + 32] 2906 psrlw xmm0, 8 // odd bytes are Y 2907 psrlw xmm1, 8 2908 packuswb xmm0, xmm1 2909 sub ecx, 16 2910 movdqu [edx], xmm0 2911 lea edx, [edx + 16] 2912 jg convertloop 2913 ret 2914 } 2915 } 2916 2917 __declspec(naked) __declspec(align(16)) 2918 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, 2919 uint8* dst_u, uint8* dst_v, int pix) { 2920 __asm { 2921 push esi 2922 push edi 2923 mov eax, [esp + 8 + 4] // src_yuy2 2924 mov esi, [esp + 8 + 8] // stride_yuy2 2925 mov edx, [esp + 8 + 12] // dst_u 2926 mov edi, [esp + 8 + 16] // dst_v 2927 mov ecx, [esp + 8 + 20] // pix 2928 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2929 psrlw xmm5, 8 2930 sub edi, edx 2931 2932 align 16 2933 convertloop: 2934 movdqu xmm0, [eax] 2935 movdqu xmm1, [eax + 16] 2936 movdqu xmm2, [eax + esi] 2937 movdqu xmm3, [eax + esi + 16] 2938 lea eax, [eax + 32] 2939 pavgb xmm0, xmm2 2940 pavgb xmm1, xmm3 2941 pand xmm0, xmm5 // UYVY -> UVUV 2942 pand xmm1, xmm5 2943 packuswb xmm0, xmm1 2944 movdqa xmm1, xmm0 2945 pand xmm0, xmm5 // U 2946 packuswb xmm0, xmm0 2947 psrlw xmm1, 8 // V 2948 packuswb xmm1, xmm1 2949 movq qword ptr [edx], xmm0 2950 movq qword ptr [edx + edi], xmm1 2951 lea edx, [edx + 8] 2952 sub ecx, 16 2953 jg convertloop 2954 2955 pop edi 2956 pop esi 2957 ret 2958 } 2959 } 2960 2961 __declspec(naked) __declspec(align(16)) 2962 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, 2963 uint8* dst_u, uint8* dst_v, int pix) { 2964 __asm { 2965 push edi 2966 mov eax, [esp + 4 + 4] // src_yuy2 2967 mov edx, [esp + 4 + 8] // dst_u 2968 mov edi, [esp + 4 + 12] // dst_v 2969 mov ecx, [esp + 4 + 16] // pix 2970 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 2971 psrlw xmm5, 8 2972 sub edi, edx 2973 2974 align 16 2975 
convertloop: 2976 movdqu xmm0, [eax] 2977 movdqu xmm1, [eax + 16] 2978 lea eax, [eax + 32] 2979 pand xmm0, xmm5 // UYVY -> UVUV 2980 pand xmm1, xmm5 2981 packuswb xmm0, xmm1 2982 movdqa xmm1, xmm0 2983 pand xmm0, xmm5 // U 2984 packuswb xmm0, xmm0 2985 psrlw xmm1, 8 // V 2986 packuswb xmm1, xmm1 2987 movq qword ptr [edx], xmm0 2988 movq qword ptr [edx + edi], xmm1 2989 lea edx, [edx + 8] 2990 sub ecx, 16 2991 jg convertloop 2992 2993 pop edi 2994 ret 2995 } 2996 } 2997 #endif // HAS_YUY2TOYROW_SSE2 2998 2999 #ifdef HAS_ARGBBLENDROW_SSE2 3000 // Blend 8 pixels at a time. 3001 __declspec(naked) __declspec(align(16)) 3002 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 3003 uint8* dst_argb, int width) { 3004 __asm { 3005 push esi 3006 mov eax, [esp + 4 + 4] // src_argb0 3007 mov esi, [esp + 4 + 8] // src_argb1 3008 mov edx, [esp + 4 + 12] // dst_argb 3009 mov ecx, [esp + 4 + 16] // width 3010 pcmpeqb xmm7, xmm7 // generate constant 1 3011 psrlw xmm7, 15 3012 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 3013 psrlw xmm6, 8 3014 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 3015 psllw xmm5, 8 3016 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 3017 pslld xmm4, 24 3018 3019 sub ecx, 1 3020 je convertloop1 // only 1 pixel? 3021 jl convertloop1b 3022 3023 // 1 pixel loop until destination pointer is aligned. 3024 alignloop1: 3025 test edx, 15 // aligned? 3026 je alignloop1b 3027 movd xmm3, [eax] 3028 lea eax, [eax + 4] 3029 movdqa xmm0, xmm3 // src argb 3030 pxor xmm3, xmm4 // ~alpha 3031 movd xmm2, [esi] // _r_b 3032 psrlw xmm3, 8 // alpha 3033 pshufhw xmm3, xmm3,0F5h // 8 alpha words 3034 pshuflw xmm3, xmm3,0F5h 3035 pand xmm2, xmm6 // _r_b 3036 paddw xmm3, xmm7 // 256 - alpha 3037 pmullw xmm2, xmm3 // _r_b * alpha 3038 movd xmm1, [esi] // _a_g 3039 lea esi, [esi + 4] 3040 psrlw xmm1, 8 // _a_g 3041 por xmm0, xmm4 // set alpha to 255 3042 pmullw xmm1, xmm3 // _a_g * alpha 3043 psrlw xmm2, 8 // _r_b convert to 8 bits again 3044 paddusb xmm0, xmm2 // + src argb 3045 pand xmm1, xmm5 // a_g_ convert to 8 bits again 3046 paddusb xmm0, xmm1 // + src argb 3047 sub ecx, 1 3048 movd [edx], xmm0 3049 lea edx, [edx + 4] 3050 jge alignloop1 3051 3052 alignloop1b: 3053 add ecx, 1 - 4 3054 jl convertloop4b 3055 3056 // 4 pixel loop. 3057 convertloop4: 3058 movdqu xmm3, [eax] // src argb 3059 lea eax, [eax + 16] 3060 movdqa xmm0, xmm3 // src argb 3061 pxor xmm3, xmm4 // ~alpha 3062 movdqu xmm2, [esi] // _r_b 3063 psrlw xmm3, 8 // alpha 3064 pshufhw xmm3, xmm3,0F5h // 8 alpha words 3065 pshuflw xmm3, xmm3,0F5h 3066 pand xmm2, xmm6 // _r_b 3067 paddw xmm3, xmm7 // 256 - alpha 3068 pmullw xmm2, xmm3 // _r_b * alpha 3069 movdqu xmm1, [esi] // _a_g 3070 lea esi, [esi + 16] 3071 psrlw xmm1, 8 // _a_g 3072 por xmm0, xmm4 // set alpha to 255 3073 pmullw xmm1, xmm3 // _a_g * alpha 3074 psrlw xmm2, 8 // _r_b convert to 8 bits again 3075 paddusb xmm0, xmm2 // + src argb 3076 pand xmm1, xmm5 // a_g_ convert to 8 bits again 3077 paddusb xmm0, xmm1 // + src argb 3078 sub ecx, 4 3079 movdqa [edx], xmm0 3080 lea edx, [edx + 16] 3081 jge convertloop4 3082 3083 convertloop4b: 3084 add ecx, 4 - 1 3085 jl convertloop1b 3086 3087 // 1 pixel loop. 
3088 convertloop1: 3089 movd xmm3, [eax] // src argb 3090 lea eax, [eax + 4] 3091 movdqa xmm0, xmm3 // src argb 3092 pxor xmm3, xmm4 // ~alpha 3093 movd xmm2, [esi] // _r_b 3094 psrlw xmm3, 8 // alpha 3095 pshufhw xmm3, xmm3,0F5h // 8 alpha words 3096 pshuflw xmm3, xmm3,0F5h 3097 pand xmm2, xmm6 // _r_b 3098 paddw xmm3, xmm7 // 256 - alpha 3099 pmullw xmm2, xmm3 // _r_b * alpha 3100 movd xmm1, [esi] // _a_g 3101 lea esi, [esi + 4] 3102 psrlw xmm1, 8 // _a_g 3103 por xmm0, xmm4 // set alpha to 255 3104 pmullw xmm1, xmm3 // _a_g * alpha 3105 psrlw xmm2, 8 // _r_b convert to 8 bits again 3106 paddusb xmm0, xmm2 // + src argb 3107 pand xmm1, xmm5 // a_g_ convert to 8 bits again 3108 paddusb xmm0, xmm1 // + src argb 3109 sub ecx, 1 3110 movd [edx], xmm0 3111 lea edx, [edx + 4] 3112 jge convertloop1 3113 3114 convertloop1b: 3115 pop esi 3116 ret 3117 } 3118 } 3119 #endif // HAS_ARGBBLENDROW_SSE2 3120 3121 #ifdef HAS_ARGBBLENDROW_SSSE3 3122 // Shuffle table for isolating alpha. 3123 static const uvec8 kShuffleAlpha = { 3124 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 3125 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 3126 }; 3127 // Same as SSE2, but replaces: 3128 // psrlw xmm3, 8 // alpha 3129 // pshufhw xmm3, xmm3,0F5h // 8 alpha words 3130 // pshuflw xmm3, xmm3,0F5h 3131 // with.. 3132 // pshufb xmm3, kShuffleAlpha // alpha 3133 // Blend 8 pixels at a time. 3134 3135 __declspec(naked) __declspec(align(16)) 3136 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 3137 uint8* dst_argb, int width) { 3138 __asm { 3139 push esi 3140 mov eax, [esp + 4 + 4] // src_argb0 3141 mov esi, [esp + 4 + 8] // src_argb1 3142 mov edx, [esp + 4 + 12] // dst_argb 3143 mov ecx, [esp + 4 + 16] // width 3144 pcmpeqb xmm7, xmm7 // generate constant 1 3145 psrlw xmm7, 15 3146 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 3147 psrlw xmm6, 8 3148 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 3149 psllw xmm5, 8 3150 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 3151 pslld xmm4, 24 3152 3153 sub ecx, 1 3154 je convertloop1 // only 1 pixel? 3155 jl convertloop1b 3156 3157 // 1 pixel loop until destination pointer is aligned. 3158 alignloop1: 3159 test edx, 15 // aligned? 3160 je alignloop1b 3161 movd xmm3, [eax] 3162 lea eax, [eax + 4] 3163 movdqa xmm0, xmm3 // src argb 3164 pxor xmm3, xmm4 // ~alpha 3165 movd xmm2, [esi] // _r_b 3166 pshufb xmm3, kShuffleAlpha // alpha 3167 pand xmm2, xmm6 // _r_b 3168 paddw xmm3, xmm7 // 256 - alpha 3169 pmullw xmm2, xmm3 // _r_b * alpha 3170 movd xmm1, [esi] // _a_g 3171 lea esi, [esi + 4] 3172 psrlw xmm1, 8 // _a_g 3173 por xmm0, xmm4 // set alpha to 255 3174 pmullw xmm1, xmm3 // _a_g * alpha 3175 psrlw xmm2, 8 // _r_b convert to 8 bits again 3176 paddusb xmm0, xmm2 // + src argb 3177 pand xmm1, xmm5 // a_g_ convert to 8 bits again 3178 paddusb xmm0, xmm1 // + src argb 3179 sub ecx, 1 3180 movd [edx], xmm0 3181 lea edx, [edx + 4] 3182 jge alignloop1 3183 3184 alignloop1b: 3185 add ecx, 1 - 4 3186 jl convertloop4b 3187 3188 test eax, 15 // unaligned? 3189 jne convertuloop4 3190 test esi, 15 // unaligned? 3191 jne convertuloop4 3192 3193 // 4 pixel loop. 
3194 convertloop4: 3195 movdqa xmm3, [eax] // src argb 3196 lea eax, [eax + 16] 3197 movdqa xmm0, xmm3 // src argb 3198 pxor xmm3, xmm4 // ~alpha 3199 movdqa xmm2, [esi] // _r_b 3200 pshufb xmm3, kShuffleAlpha // alpha 3201 pand xmm2, xmm6 // _r_b 3202 paddw xmm3, xmm7 // 256 - alpha 3203 pmullw xmm2, xmm3 // _r_b * alpha 3204 movdqa xmm1, [esi] // _a_g 3205 lea esi, [esi + 16] 3206 psrlw xmm1, 8 // _a_g 3207 por xmm0, xmm4 // set alpha to 255 3208 pmullw xmm1, xmm3 // _a_g * alpha 3209 psrlw xmm2, 8 // _r_b convert to 8 bits again 3210 paddusb xmm0, xmm2 // + src argb 3211 pand xmm1, xmm5 // a_g_ convert to 8 bits again 3212 paddusb xmm0, xmm1 // + src argb 3213 sub ecx, 4 3214 movdqa [edx], xmm0 3215 lea edx, [edx + 16] 3216 jge convertloop4 3217 jmp convertloop4b 3218 3219 // 4 pixel unaligned loop. 3220 convertuloop4: 3221 movdqu xmm3, [eax] // src argb 3222 lea eax, [eax + 16] 3223 movdqa xmm0, xmm3 // src argb 3224 pxor xmm3, xmm4 // ~alpha 3225 movdqu xmm2, [esi] // _r_b 3226 pshufb xmm3, kShuffleAlpha // alpha 3227 pand xmm2, xmm6 // _r_b 3228 paddw xmm3, xmm7 // 256 - alpha 3229 pmullw xmm2, xmm3 // _r_b * alpha 3230 movdqu xmm1, [esi] // _a_g 3231 lea esi, [esi + 16] 3232 psrlw xmm1, 8 // _a_g 3233 por xmm0, xmm4 // set alpha to 255 3234 pmullw xmm1, xmm3 // _a_g * alpha 3235 psrlw xmm2, 8 // _r_b convert to 8 bits again 3236 paddusb xmm0, xmm2 // + src argb 3237 pand xmm1, xmm5 // a_g_ convert to 8 bits again 3238 paddusb xmm0, xmm1 // + src argb 3239 sub ecx, 4 3240 movdqa [edx], xmm0 3241 lea edx, [edx + 16] 3242 jge convertuloop4 3243 3244 convertloop4b: 3245 add ecx, 4 - 1 3246 jl convertloop1b 3247 3248 // 1 pixel loop. 3249 convertloop1: 3250 movd xmm3, [eax] // src argb 3251 lea eax, [eax + 4] 3252 movdqa xmm0, xmm3 // src argb 3253 pxor xmm3, xmm4 // ~alpha 3254 movd xmm2, [esi] // _r_b 3255 pshufb xmm3, kShuffleAlpha // alpha 3256 pand xmm2, xmm6 // _r_b 3257 paddw xmm3, xmm7 // 256 - alpha 3258 pmullw xmm2, xmm3 // _r_b * alpha 3259 movd xmm1, [esi] // _a_g 3260 lea esi, [esi + 4] 3261 psrlw xmm1, 8 // _a_g 3262 por xmm0, xmm4 // set alpha to 255 3263 pmullw xmm1, xmm3 // _a_g * alpha 3264 psrlw xmm2, 8 // _r_b convert to 8 bits again 3265 paddusb xmm0, xmm2 // + src argb 3266 pand xmm1, xmm5 // a_g_ convert to 8 bits again 3267 paddusb xmm0, xmm1 // + src argb 3268 sub ecx, 1 3269 movd [edx], xmm0 3270 lea edx, [edx + 4] 3271 jge convertloop1 3272 3273 convertloop1b: 3274 pop esi 3275 ret 3276 } 3277 } 3278 #endif // HAS_ARGBBLENDROW_SSSE3 3279 3280 #ifdef HAS_ARGBATTENUATE_SSE2 3281 // Attenuate 4 pixels at a time. 3282 // Aligned to 16 bytes. 
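// Hedged scalar reference (an illustrative addition, not part of the original
// source; helper name hypothetical) for the attenuation approximated by the
// SSE2 code below: each color channel is scaled by its own alpha while the
// alpha byte is preserved. The >> 8 is a close approximation of the result
// the byte-duplicating pmulhuw sequence produces.
static void ARGBAttenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  for (int x = 0; x < width; ++x) {
    uint32 b = src_argb[x * 4 + 0];
    uint32 g = src_argb[x * 4 + 1];
    uint32 r = src_argb[x * 4 + 2];
    uint32 a = src_argb[x * 4 + 3];
    dst_argb[x * 4 + 0] = (uint8)((b * a) >> 8);  // approx b * a / 255
    dst_argb[x * 4 + 1] = (uint8)((g * a) >> 8);
    dst_argb[x * 4 + 2] = (uint8)((r * a) >> 8);
    dst_argb[x * 4 + 3] = (uint8)a;
  }
}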
3283 __declspec(naked) __declspec(align(16)) 3284 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { 3285 __asm { 3286 mov eax, [esp + 4] // src_argb0 3287 mov edx, [esp + 8] // dst_argb 3288 mov ecx, [esp + 12] // width 3289 sub edx, eax 3290 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 3291 pslld xmm4, 24 3292 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff 3293 psrld xmm5, 8 3294 3295 align 16 3296 convertloop: 3297 movdqa xmm0, [eax] // read 4 pixels 3298 punpcklbw xmm0, xmm0 // first 2 3299 pshufhw xmm2, xmm0,0FFh // 8 alpha words 3300 pshuflw xmm2, xmm2,0FFh 3301 pmulhuw xmm0, xmm2 // rgb * a 3302 movdqa xmm1, [eax] // read 4 pixels 3303 punpckhbw xmm1, xmm1 // next 2 pixels 3304 pshufhw xmm2, xmm1,0FFh // 8 alpha words 3305 pshuflw xmm2, xmm2,0FFh 3306 pmulhuw xmm1, xmm2 // rgb * a 3307 movdqa xmm2, [eax] // alphas 3308 psrlw xmm0, 8 3309 pand xmm2, xmm4 3310 psrlw xmm1, 8 3311 packuswb xmm0, xmm1 3312 pand xmm0, xmm5 // keep original alphas 3313 por xmm0, xmm2 3314 sub ecx, 4 3315 movdqa [eax + edx], xmm0 3316 lea eax, [eax + 16] 3317 jg convertloop 3318 3319 ret 3320 } 3321 } 3322 #endif // HAS_ARGBATTENUATE_SSE2 3323 3324 #ifdef HAS_ARGBATTENUATEROW_SSSE3 3325 // Shuffle table duplicating alpha. 3326 static const uvec8 kShuffleAlpha0 = { 3327 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 3328 }; 3329 static const uvec8 kShuffleAlpha1 = { 3330 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 3331 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 3332 }; 3333 __declspec(naked) __declspec(align(16)) 3334 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 3335 __asm { 3336 mov eax, [esp + 4] // src_argb0 3337 mov edx, [esp + 8] // dst_argb 3338 mov ecx, [esp + 12] // width 3339 sub edx, eax 3340 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 3341 pslld xmm3, 24 3342 movdqa xmm4, kShuffleAlpha0 3343 movdqa xmm5, kShuffleAlpha1 3344 3345 align 16 3346 convertloop: 3347 movdqa xmm0, [eax] // read 4 pixels 3348 pshufb xmm0, xmm4 // isolate first 2 alphas 3349 movdqa xmm1, [eax] // read 4 pixels 3350 punpcklbw xmm1, xmm1 // first 2 pixel rgbs 3351 pmulhuw xmm0, xmm1 // rgb * a 3352 movdqa xmm1, [eax] // read 4 pixels 3353 pshufb xmm1, xmm5 // isolate next 2 alphas 3354 movdqa xmm2, [eax] // read 4 pixels 3355 punpckhbw xmm2, xmm2 // next 2 pixel rgbs 3356 pmulhuw xmm1, xmm2 // rgb * a 3357 movdqa xmm2, [eax] // mask original alpha 3358 pand xmm2, xmm3 3359 psrlw xmm0, 8 3360 psrlw xmm1, 8 3361 packuswb xmm0, xmm1 3362 por xmm0, xmm2 // copy original alpha 3363 sub ecx, 4 3364 movdqa [eax + edx], xmm0 3365 lea eax, [eax + 16] 3366 jg convertloop 3367 3368 ret 3369 } 3370 } 3371 #endif // HAS_ARGBATTENUATEROW_SSSE3 3372 3373 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 3374 // Unattenuate 4 pixels at a time. 3375 // Aligned to 16 bytes. 
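// Hedged scalar reference (an illustrative addition, not part of the original
// source; helper name hypothetical) for the unattenuation below. The SSE2
// code uses the fixed_invtbl8 reciprocal table; this sketch divides directly:
// channels are rescaled by 255 / alpha and saturated, alpha is kept.
static void ARGBUnattenuateRow_C_Sketch(const uint8* src_argb,
                                        uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    uint32 b = src_argb[x * 4 + 0];
    uint32 g = src_argb[x * 4 + 1];
    uint32 r = src_argb[x * 4 + 2];
    uint32 a = src_argb[x * 4 + 3];
    if (a) {
      b = (b * 255u) / a;
      g = (g * 255u) / a;
      r = (r * 255u) / a;
    }
    dst_argb[x * 4 + 0] = (uint8)(b > 255u ? 255u : b);
    dst_argb[x * 4 + 1] = (uint8)(g > 255u ? 255u : g);
    dst_argb[x * 4 + 2] = (uint8)(r > 255u ? 255u : r);
    dst_argb[x * 4 + 3] = (uint8)a;
  }
}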
3376 __declspec(naked) __declspec(align(16)) 3377 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, 3378 int width) { 3379 __asm { 3380 push esi 3381 push edi 3382 mov eax, [esp + 8 + 4] // src_argb0 3383 mov edx, [esp + 8 + 8] // dst_argb 3384 mov ecx, [esp + 8 + 12] // width 3385 sub edx, eax 3386 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 3387 pslld xmm4, 24 3388 3389 align 16 3390 convertloop: 3391 movdqa xmm0, [eax] // read 4 pixels 3392 movzx esi, byte ptr [eax + 3] // first alpha 3393 movzx edi, byte ptr [eax + 7] // second alpha 3394 punpcklbw xmm0, xmm0 // first 2 3395 movd xmm2, dword ptr fixed_invtbl8[esi * 4] 3396 movd xmm3, dword ptr fixed_invtbl8[edi * 4] 3397 pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words 3398 pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words 3399 movlhps xmm2, xmm3 3400 pmulhuw xmm0, xmm2 // rgb * a 3401 3402 movdqa xmm1, [eax] // read 4 pixels 3403 movzx esi, byte ptr [eax + 11] // third alpha 3404 movzx edi, byte ptr [eax + 15] // forth alpha 3405 punpckhbw xmm1, xmm1 // next 2 3406 movd xmm2, dword ptr fixed_invtbl8[esi * 4] 3407 movd xmm3, dword ptr fixed_invtbl8[edi * 4] 3408 pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words 3409 pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words 3410 movlhps xmm2, xmm3 3411 pmulhuw xmm1, xmm2 // rgb * a 3412 3413 movdqa xmm2, [eax] // alphas 3414 pand xmm2, xmm4 3415 packuswb xmm0, xmm1 3416 por xmm0, xmm2 3417 sub ecx, 4 3418 movdqa [eax + edx], xmm0 3419 lea eax, [eax + 16] 3420 jg convertloop 3421 pop edi 3422 pop esi 3423 ret 3424 } 3425 } 3426 #endif // HAS_ARGBUNATTENUATEROW_SSE2 3427 3428 #ifdef HAS_ARGBGRAYROW_SSSE3 3429 // Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R 3430 static const vec8 kARGBToGray = { 3431 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 3432 }; 3433 3434 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. 3435 __declspec(naked) __declspec(align(16)) 3436 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 3437 __asm { 3438 mov eax, [esp + 4] /* src_argb */ 3439 mov edx, [esp + 8] /* dst_argb */ 3440 mov ecx, [esp + 12] /* width */ 3441 movdqa xmm4, kARGBToGray 3442 sub edx, eax 3443 3444 align 16 3445 convertloop: 3446 movdqa xmm0, [eax] // G 3447 movdqa xmm1, [eax + 16] 3448 pmaddubsw xmm0, xmm4 3449 pmaddubsw xmm1, xmm4 3450 phaddw xmm0, xmm1 3451 psrlw xmm0, 7 3452 packuswb xmm0, xmm0 // 8 G bytes 3453 movdqa xmm2, [eax] // A 3454 movdqa xmm3, [eax + 16] 3455 psrld xmm2, 24 3456 psrld xmm3, 24 3457 packuswb xmm2, xmm3 3458 packuswb xmm2, xmm2 // 8 A bytes 3459 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA 3460 punpcklbw xmm0, xmm0 // 8 GG words 3461 punpcklbw xmm3, xmm2 // 8 GA words 3462 movdqa xmm1, xmm0 3463 punpcklwd xmm0, xmm3 // GGGA first 4 3464 punpckhwd xmm1, xmm3 // GGGA next 4 3465 sub ecx, 8 3466 movdqa [eax + edx], xmm0 3467 movdqa [eax + edx + 16], xmm1 3468 lea eax, [eax + 32] 3469 jg convertloop 3470 ret 3471 } 3472 } 3473 #endif // HAS_ARGBGRAYROW_SSSE3 3474 3475 #ifdef HAS_ARGBSEPIAROW_SSSE3 3476 // b = (r * 35 + g * 68 + b * 17) >> 7 3477 // g = (r * 45 + g * 88 + b * 22) >> 7 3478 // r = (r * 50 + g * 98 + b * 24) >> 7 3479 // Constant for ARGB color to sepia tone. 
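// Hedged scalar reference (an illustrative addition, not part of the original
// source; helper name hypothetical) applying the three formulas above in
// place, the way ARGBSepiaRow_SSSE3 below does. The green and red sums can
// exceed 255, which the asm handles via packuswb saturation.
static void ARGBSepiaRow_C_Sketch(uint8* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[x * 4 + 0];
    int g = dst_argb[x * 4 + 1];
    int r = dst_argb[x * 4 + 2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[x * 4 + 0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[x * 4 + 1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[x * 4 + 2] = (uint8)(sr > 255 ? 255 : sr);
    // Alpha at dst_argb[x * 4 + 3] is left untouched.
  }
}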
3480 static const vec8 kARGBToSepiaB = { 3481 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 3482 }; 3483 3484 static const vec8 kARGBToSepiaG = { 3485 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 3486 }; 3487 3488 static const vec8 kARGBToSepiaR = { 3489 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 3490 }; 3491 3492 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 3493 __declspec(naked) __declspec(align(16)) 3494 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { 3495 __asm { 3496 mov eax, [esp + 4] /* dst_argb */ 3497 mov ecx, [esp + 8] /* width */ 3498 movdqa xmm2, kARGBToSepiaB 3499 movdqa xmm3, kARGBToSepiaG 3500 movdqa xmm4, kARGBToSepiaR 3501 3502 align 16 3503 convertloop: 3504 movdqa xmm0, [eax] // B 3505 movdqa xmm6, [eax + 16] 3506 pmaddubsw xmm0, xmm2 3507 pmaddubsw xmm6, xmm2 3508 phaddw xmm0, xmm6 3509 psrlw xmm0, 7 3510 packuswb xmm0, xmm0 // 8 B values 3511 movdqa xmm5, [eax] // G 3512 movdqa xmm1, [eax + 16] 3513 pmaddubsw xmm5, xmm3 3514 pmaddubsw xmm1, xmm3 3515 phaddw xmm5, xmm1 3516 psrlw xmm5, 7 3517 packuswb xmm5, xmm5 // 8 G values 3518 punpcklbw xmm0, xmm5 // 8 BG values 3519 movdqa xmm5, [eax] // R 3520 movdqa xmm1, [eax + 16] 3521 pmaddubsw xmm5, xmm4 3522 pmaddubsw xmm1, xmm4 3523 phaddw xmm5, xmm1 3524 psrlw xmm5, 7 3525 packuswb xmm5, xmm5 // 8 R values 3526 movdqa xmm6, [eax] // A 3527 movdqa xmm1, [eax + 16] 3528 psrld xmm6, 24 3529 psrld xmm1, 24 3530 packuswb xmm6, xmm1 3531 packuswb xmm6, xmm6 // 8 A values 3532 punpcklbw xmm5, xmm6 // 8 RA values 3533 movdqa xmm1, xmm0 // Weave BG, RA together 3534 punpcklwd xmm0, xmm5 // BGRA first 4 3535 punpckhwd xmm1, xmm5 // BGRA next 4 3536 sub ecx, 8 3537 movdqa [eax], xmm0 3538 movdqa [eax + 16], xmm1 3539 lea eax, [eax + 32] 3540 jg convertloop 3541 ret 3542 } 3543 } 3544 #endif // HAS_ARGBSEPIAROW_SSSE3 3545 3546 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 3547 // Tranform 8 ARGB pixels (32 bytes) with color matrix. 3548 // Same as Sepia except matrix is provided. 3549 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R 3550 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
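// Hedged scalar reference (an illustrative addition, not part of the original
// source; helper name hypothetical) for the per-channel dot product computed
// by ARGBColorMatrixRow_SSSE3 below: each output channel is
// (B*m0 + G*m1 + R*m2 + A*m3) >> 7 with signed byte coefficients from
// matrix_argb, and alpha is copied through unchanged. The asm additionally
// saturates intermediates via pmaddubsw/phaddsw.
static void ARGBColorMatrixRow_C_Sketch(uint8* dst_argb,
                                        const int8* matrix_argb, int width) {
  for (int x = 0; x < width; ++x) {
    const uint8* p = dst_argb + x * 4;  // bytes are B, G, R, A
    const int8* m = matrix_argb;
    int b = (p[0] * m[0] + p[1] * m[1] + p[2] * m[2] + p[3] * m[3]) >> 7;
    int g = (p[0] * m[4] + p[1] * m[5] + p[2] * m[6] + p[3] * m[7]) >> 7;
    int r = (p[0] * m[8] + p[1] * m[9] + p[2] * m[10] + p[3] * m[11]) >> 7;
    dst_argb[x * 4 + 0] = (uint8)(b < 0 ? 0 : (b > 255 ? 255 : b));
    dst_argb[x * 4 + 1] = (uint8)(g < 0 ? 0 : (g > 255 ? 255 : g));
    dst_argb[x * 4 + 2] = (uint8)(r < 0 ? 0 : (r > 255 ? 255 : r));
  }
}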
3551 __declspec(naked) __declspec(align(16)) 3552 void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, 3553 int width) { 3554 __asm { 3555 mov eax, [esp + 4] /* dst_argb */ 3556 mov edx, [esp + 8] /* matrix_argb */ 3557 mov ecx, [esp + 12] /* width */ 3558 movd xmm2, [edx] 3559 movd xmm3, [edx + 4] 3560 movd xmm4, [edx + 8] 3561 pshufd xmm2, xmm2, 0 3562 pshufd xmm3, xmm3, 0 3563 pshufd xmm4, xmm4, 0 3564 3565 align 16 3566 convertloop: 3567 movdqa xmm0, [eax] // B 3568 movdqa xmm6, [eax + 16] 3569 pmaddubsw xmm0, xmm2 3570 pmaddubsw xmm6, xmm2 3571 movdqa xmm5, [eax] // G 3572 movdqa xmm1, [eax + 16] 3573 pmaddubsw xmm5, xmm3 3574 pmaddubsw xmm1, xmm3 3575 phaddsw xmm0, xmm6 // B 3576 phaddsw xmm5, xmm1 // G 3577 psraw xmm0, 7 // B 3578 psraw xmm5, 7 // G 3579 packuswb xmm0, xmm0 // 8 B values 3580 packuswb xmm5, xmm5 // 8 G values 3581 punpcklbw xmm0, xmm5 // 8 BG values 3582 movdqa xmm5, [eax] // R 3583 movdqa xmm1, [eax + 16] 3584 pmaddubsw xmm5, xmm4 3585 pmaddubsw xmm1, xmm4 3586 phaddsw xmm5, xmm1 3587 psraw xmm5, 7 3588 packuswb xmm5, xmm5 // 8 R values 3589 movdqa xmm6, [eax] // A 3590 movdqa xmm1, [eax + 16] 3591 psrld xmm6, 24 3592 psrld xmm1, 24 3593 packuswb xmm6, xmm1 3594 packuswb xmm6, xmm6 // 8 A values 3595 movdqa xmm1, xmm0 // Weave BG, RA together 3596 punpcklbw xmm5, xmm6 // 8 RA values 3597 punpcklwd xmm0, xmm5 // BGRA first 4 3598 punpckhwd xmm1, xmm5 // BGRA next 4 3599 sub ecx, 8 3600 movdqa [eax], xmm0 3601 movdqa [eax + 16], xmm1 3602 lea eax, [eax + 32] 3603 jg convertloop 3604 ret 3605 } 3606 } 3607 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 3608 3609 #ifdef HAS_ARGBCOLORTABLEROW_X86 3610 // Tranform ARGB pixels with color table. 3611 __declspec(naked) __declspec(align(16)) 3612 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, 3613 int width) { 3614 __asm { 3615 push ebx 3616 push esi 3617 push edi 3618 push ebp 3619 mov eax, [esp + 16 + 4] /* dst_argb */ 3620 mov edi, [esp + 16 + 8] /* table_argb */ 3621 mov ecx, [esp + 16 + 12] /* width */ 3622 xor ebx, ebx 3623 xor edx, edx 3624 3625 align 16 3626 convertloop: 3627 mov ebp, dword ptr [eax] // BGRA 3628 mov esi, ebp 3629 and ebp, 255 3630 shr esi, 8 3631 and esi, 255 3632 mov bl, [edi + ebp * 4 + 0] // B 3633 mov dl, [edi + esi * 4 + 1] // G 3634 mov ebp, dword ptr [eax] // BGRA 3635 mov esi, ebp 3636 shr ebp, 16 3637 shr esi, 24 3638 and ebp, 255 3639 mov [eax], bl 3640 mov [eax + 1], dl 3641 mov bl, [edi + ebp * 4 + 2] // R 3642 mov dl, [edi + esi * 4 + 3] // A 3643 mov [eax + 2], bl 3644 mov [eax + 3], dl 3645 lea eax, [eax + 4] 3646 sub ecx, 1 3647 jg convertloop 3648 pop ebp 3649 pop edi 3650 pop esi 3651 pop ebx 3652 ret 3653 } 3654 } 3655 #endif // HAS_ARGBCOLORTABLEROW_X86 3656 3657 #ifdef HAS_ARGBQUANTIZEROW_SSE2 3658 // Quantize 4 ARGB pixels (16 bytes). 3659 // Aligned to 16 bytes. 
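// Hedged scalar reference (an illustrative addition, not part of the original
// source; helper name hypothetical) for the quantization below, operating in
// place on B, G and R while the original alpha bytes are masked and merged
// back by the asm:
//   channel = (channel * scale >> 16) * interval_size + interval_offset
static void ARGBQuantizeRow_C_Sketch(uint8* dst_argb, int scale,
                                     int interval_size, int interval_offset,
                                     int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[x * 4 + 0];
    int g = dst_argb[x * 4 + 1];
    int r = dst_argb[x * 4 + 2];
    dst_argb[x * 4 + 0] =
        (uint8)((b * scale >> 16) * interval_size + interval_offset);
    dst_argb[x * 4 + 1] =
        (uint8)((g * scale >> 16) * interval_size + interval_offset);
    dst_argb[x * 4 + 2] =
        (uint8)((r * scale >> 16) * interval_size + interval_offset);
    // Alpha at dst_argb[x * 4 + 3] is preserved.
  }
}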
3660 __declspec(naked) __declspec(align(16)) 3661 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, 3662 int interval_offset, int width) { 3663 __asm { 3664 mov eax, [esp + 4] /* dst_argb */ 3665 movd xmm2, [esp + 8] /* scale */ 3666 movd xmm3, [esp + 12] /* interval_size */ 3667 movd xmm4, [esp + 16] /* interval_offset */ 3668 mov ecx, [esp + 20] /* width */ 3669 pshuflw xmm2, xmm2, 040h 3670 pshufd xmm2, xmm2, 044h 3671 pshuflw xmm3, xmm3, 040h 3672 pshufd xmm3, xmm3, 044h 3673 pshuflw xmm4, xmm4, 040h 3674 pshufd xmm4, xmm4, 044h 3675 pxor xmm5, xmm5 // constant 0 3676 pcmpeqb xmm6, xmm6 // generate mask 0xff000000 3677 pslld xmm6, 24 3678 3679 align 16 3680 convertloop: 3681 movdqa xmm0, [eax] // read 4 pixels 3682 punpcklbw xmm0, xmm5 // first 2 pixels 3683 pmulhuw xmm0, xmm2 // pixel * scale >> 16 3684 movdqa xmm1, [eax] // read 4 pixels 3685 punpckhbw xmm1, xmm5 // next 2 pixels 3686 pmulhuw xmm1, xmm2 3687 pmullw xmm0, xmm3 // * interval_size 3688 movdqa xmm7, [eax] // read 4 pixels 3689 pmullw xmm1, xmm3 3690 pand xmm7, xmm6 // mask alpha 3691 paddw xmm0, xmm4 // + interval_size / 2 3692 paddw xmm1, xmm4 3693 packuswb xmm0, xmm1 3694 por xmm0, xmm7 3695 sub ecx, 4 3696 movdqa [eax], xmm0 3697 lea eax, [eax + 16] 3698 jg convertloop 3699 ret 3700 } 3701 } 3702 #endif // HAS_ARGBQUANTIZEROW_SSE2 3703 3704 #ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2 3705 // Consider float CumulativeSum. 3706 // Consider calling CumulativeSum one row at time as needed. 3707 // Consider circular CumulativeSum buffer of radius * 2 + 1 height. 3708 // Convert cumulative sum for an area to an average for 1 pixel. 3709 // topleft is pointer to top left of CumulativeSum buffer for area. 3710 // botleft is pointer to bottom left of CumulativeSum buffer. 3711 // width is offset from left to right of area in CumulativeSum buffer measured 3712 // in number of ints. 3713 // area is the number of pixels in the area being averaged. 3714 // dst points to pixel to store result to. 3715 // count is number of averaged pixels to produce. 3716 // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte 3717 // aligned. 
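// Hedged scalar reference (an illustrative addition, not part of the original
// source; helper name hypothetical) for the box average described above.
// Each pixel is 4 consecutive int32 sums (B, G, R, A), and 'width' is already
// expressed in int32 units, so it serves directly as the left-to-right offset.
static void CumulativeSumToAverage_C_Sketch(const int32* topleft,
                                            const int32* botleft, int width,
                                            int area, uint8* dst, int count) {
  float ooa = 1.0f / (float)area;  // reciprocal of area, like rcpss below
  for (int i = 0; i < count * 4; ++i) {
    int32 sum = topleft[i] - topleft[i + width] -
                botleft[i] + botleft[i + width];
    int v = (int)((float)sum * ooa);
    dst[i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}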
3718 void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft, 3719 int width, int area, uint8* dst, int count) { 3720 __asm { 3721 mov eax, topleft // eax topleft 3722 mov esi, botleft // esi botleft 3723 mov edx, width 3724 movd xmm4, area 3725 mov edi, dst 3726 mov ecx, count 3727 cvtdq2ps xmm4, xmm4 3728 rcpss xmm4, xmm4 // 1.0f / area 3729 pshufd xmm4, xmm4, 0 3730 sub ecx, 4 3731 jl l4b 3732 3733 // 4 pixel loop 3734 align 4 3735 l4: 3736 // top left 3737 movdqa xmm0, [eax] 3738 movdqa xmm1, [eax + 16] 3739 movdqa xmm2, [eax + 32] 3740 movdqa xmm3, [eax + 48] 3741 3742 // - top right 3743 psubd xmm0, [eax + edx * 4] 3744 psubd xmm1, [eax + edx * 4 + 16] 3745 psubd xmm2, [eax + edx * 4 + 32] 3746 psubd xmm3, [eax + edx * 4 + 48] 3747 lea eax, [eax + 64] 3748 3749 // - bottom left 3750 psubd xmm0, [esi] 3751 psubd xmm1, [esi + 16] 3752 psubd xmm2, [esi + 32] 3753 psubd xmm3, [esi + 48] 3754 3755 // + bottom right 3756 paddd xmm0, [esi + edx * 4] 3757 paddd xmm1, [esi + edx * 4 + 16] 3758 paddd xmm2, [esi + edx * 4 + 32] 3759 paddd xmm3, [esi + edx * 4 + 48] 3760 lea esi, [esi + 64] 3761 3762 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area 3763 cvtdq2ps xmm1, xmm1 3764 mulps xmm0, xmm4 3765 mulps xmm1, xmm4 3766 cvtdq2ps xmm2, xmm2 3767 cvtdq2ps xmm3, xmm3 3768 mulps xmm2, xmm4 3769 mulps xmm3, xmm4 3770 cvtps2dq xmm0, xmm0 3771 cvtps2dq xmm1, xmm1 3772 cvtps2dq xmm2, xmm2 3773 cvtps2dq xmm3, xmm3 3774 packssdw xmm0, xmm1 3775 packssdw xmm2, xmm3 3776 packuswb xmm0, xmm2 3777 movdqu [edi], xmm0 3778 lea edi, [edi + 16] 3779 sub ecx, 4 3780 jge l4 3781 3782 l4b: 3783 add ecx, 4 - 1 3784 jl l1b 3785 3786 // 1 pixel loop 3787 align 4 3788 l1: 3789 movdqa xmm0, [eax] 3790 psubd xmm0, [eax + edx * 4] 3791 lea eax, [eax + 16] 3792 psubd xmm0, [esi] 3793 paddd xmm0, [esi + edx * 4] 3794 lea esi, [esi + 16] 3795 cvtdq2ps xmm0, xmm0 3796 mulps xmm0, xmm4 3797 cvtps2dq xmm0, xmm0 3798 packssdw xmm0, xmm0 3799 packuswb xmm0, xmm0 3800 movd dword ptr [edi], xmm0 3801 lea edi, [edi + 4] 3802 sub ecx, 1 3803 jge l1 3804 l1b: 3805 } 3806 } 3807 #endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2 3808 3809 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 3810 // Creates a table of cumulative sums where each value is a sum of all values 3811 // above and to the left of the value. 3812 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 3813 const int32* previous_cumsum, int width) { 3814 __asm { 3815 mov eax, row 3816 mov edx, cumsum 3817 mov esi, previous_cumsum 3818 mov ecx, width 3819 sub esi, edx 3820 pxor xmm0, xmm0 3821 pxor xmm1, xmm1 3822 3823 sub ecx, 4 3824 jl l4b 3825 test edx, 15 3826 jne l4b 3827 3828 // 4 pixel loop 3829 align 4 3830 l4: 3831 movdqu xmm2, [eax] // 4 argb pixels 16 bytes. 3832 lea eax, [eax + 16] 3833 movdqa xmm4, xmm2 3834 3835 punpcklbw xmm2, xmm1 3836 movdqa xmm3, xmm2 3837 punpcklwd xmm2, xmm1 3838 punpckhwd xmm3, xmm1 3839 3840 punpckhbw xmm4, xmm1 3841 movdqa xmm5, xmm4 3842 punpcklwd xmm4, xmm1 3843 punpckhwd xmm5, xmm1 3844 3845 paddd xmm0, xmm2 3846 movdqa xmm2, [edx + esi] // previous row above. 
3847 paddd xmm2, xmm0 3848 3849 paddd xmm0, xmm3 3850 movdqa xmm3, [edx + esi + 16] 3851 paddd xmm3, xmm0 3852 3853 paddd xmm0, xmm4 3854 movdqa xmm4, [edx + esi + 32] 3855 paddd xmm4, xmm0 3856 3857 paddd xmm0, xmm5 3858 movdqa xmm5, [edx + esi + 48] 3859 paddd xmm5, xmm0 3860 3861 movdqa [edx], xmm2 3862 movdqa [edx + 16], xmm3 3863 movdqa [edx + 32], xmm4 3864 movdqa [edx + 48], xmm5 3865 3866 lea edx, [edx + 64] 3867 sub ecx, 4 3868 jge l4 3869 3870 l4b: 3871 add ecx, 4 - 1 3872 jl l1b 3873 3874 // 1 pixel loop 3875 align 4 3876 l1: 3877 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. 3878 lea eax, [eax + 4] 3879 punpcklbw xmm2, xmm1 3880 punpcklwd xmm2, xmm1 3881 paddd xmm0, xmm2 3882 movdqu xmm2, [edx + esi] 3883 paddd xmm2, xmm0 3884 movdqu [edx], xmm2 3885 lea edx, [edx + 16] 3886 sub ecx, 1 3887 jge l1 3888 3889 l1b: 3890 } 3891 } 3892 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 3893 3894 #ifdef HAS_ARGBSHADE_SSE2 3895 // Shade 4 pixels at a time by specified value. 3896 // Aligned to 16 bytes. 3897 __declspec(naked) __declspec(align(16)) 3898 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, 3899 uint32 value) { 3900 __asm { 3901 mov eax, [esp + 4] // src_argb 3902 mov edx, [esp + 8] // dst_argb 3903 mov ecx, [esp + 12] // width 3904 movd xmm2, [esp + 16] // value 3905 sub edx, eax 3906 punpcklbw xmm2, xmm2 3907 punpcklqdq xmm2, xmm2 3908 3909 align 16 3910 convertloop: 3911 movdqa xmm0, [eax] // read 4 pixels 3912 movdqa xmm1, xmm0 3913 punpcklbw xmm0, xmm0 // first 2 3914 punpckhbw xmm1, xmm1 // next 2 3915 pmulhuw xmm0, xmm2 // argb * value 3916 pmulhuw xmm1, xmm2 // argb * value 3917 psrlw xmm0, 8 3918 psrlw xmm1, 8 3919 packuswb xmm0, xmm1 3920 sub ecx, 4 3921 movdqa [eax + edx], xmm0 3922 lea eax, [eax + 16] 3923 jg convertloop 3924 3925 ret 3926 } 3927 } 3928 #endif // HAS_ARGBSHADE_SSE2 3929 3930 #ifdef HAS_ARGBAFFINEROW_SSE2 3931 // Copy ARGB pixels from source image with slope to a row of destination. 3932 __declspec(naked) __declspec(align(16)) 3933 LIBYUV_API 3934 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 3935 uint8* dst_argb, const float* uv_dudv, int width) { 3936 __asm { 3937 push esi 3938 push edi 3939 mov eax, [esp + 12] // src_argb 3940 mov esi, [esp + 16] // stride 3941 mov edx, [esp + 20] // dst_argb 3942 mov ecx, [esp + 24] // pointer to uv_dudv 3943 movq xmm2, qword ptr [ecx] // uv 3944 movq xmm7, qword ptr [ecx + 8] // dudv 3945 mov ecx, [esp + 28] // width 3946 shl esi, 16 // 4, stride 3947 add esi, 4 3948 movd xmm5, esi 3949 sub ecx, 4 3950 jl l4b 3951 3952 // setup for 4 pixel loop 3953 pshufd xmm7, xmm7, 0x44 // dup dudv 3954 pshufd xmm5, xmm5, 0 // dup 4, stride 3955 movdqa xmm0, xmm2 // x0, y0, x1, y1 3956 addps xmm0, xmm7 3957 movlhps xmm2, xmm0 3958 movdqa xmm4, xmm7 3959 addps xmm4, xmm4 // dudv *= 2 3960 movdqa xmm3, xmm2 // x2, y2, x3, y3 3961 addps xmm3, xmm4 3962 addps xmm4, xmm4 // dudv *= 4 3963 3964 // 4 pixel loop 3965 align 4 3966 l4: 3967 cvttps2dq xmm0, xmm2 // x, y float to int first 2 3968 cvttps2dq xmm1, xmm3 // x, y float to int next 2 3969 packssdw xmm0, xmm1 // x, y as 8 shorts 3970 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
3971 movd esi, xmm0 3972 pshufd xmm0, xmm0, 0x39 // shift right 3973 movd edi, xmm0 3974 pshufd xmm0, xmm0, 0x39 // shift right 3975 movd xmm1, [eax + esi] // read pixel 0 3976 movd xmm6, [eax + edi] // read pixel 1 3977 punpckldq xmm1, xmm6 // combine pixel 0 and 1 3978 addps xmm2, xmm4 // x, y += dx, dy first 2 3979 movq qword ptr [edx], xmm1 3980 movd esi, xmm0 3981 pshufd xmm0, xmm0, 0x39 // shift right 3982 movd edi, xmm0 3983 movd xmm6, [eax + esi] // read pixel 2 3984 movd xmm0, [eax + edi] // read pixel 3 3985 punpckldq xmm6, xmm0 // combine pixel 2 and 3 3986 addps xmm3, xmm4 // x, y += dx, dy next 2 3987 sub ecx, 4 3988 movq qword ptr 8[edx], xmm6 3989 lea edx, [edx + 16] 3990 jge l4 3991 3992 l4b: 3993 add ecx, 4 - 1 3994 jl l1b 3995 3996 // 1 pixel loop 3997 align 4 3998 l1: 3999 cvttps2dq xmm0, xmm2 // x, y float to int 4000 packssdw xmm0, xmm0 // x, y as shorts 4001 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride 4002 addps xmm2, xmm7 // x, y += dx, dy 4003 movd esi, xmm0 4004 movd xmm0, [eax + esi] // copy a pixel 4005 sub ecx, 1 4006 movd [edx], xmm0 4007 lea edx, [edx + 4] 4008 jge l1 4009 l1b: 4010 pop edi 4011 pop esi 4012 ret 4013 } 4014 } 4015 #endif // HAS_ARGBAFFINEROW_SSE2 4016 4017 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version. 4018 __declspec(naked) __declspec(align(16)) 4019 void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 4020 ptrdiff_t src_stride, int dst_width, 4021 int source_y_fraction) { 4022 __asm { 4023 push esi 4024 push edi 4025 mov edi, [esp + 8 + 4] // dst_ptr 4026 mov esi, [esp + 8 + 8] // src_ptr 4027 mov edx, [esp + 8 + 12] // src_stride 4028 mov ecx, [esp + 8 + 16] // dst_width 4029 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 4030 sub edi, esi 4031 shr eax, 1 4032 cmp eax, 0 4033 je xloop1 4034 cmp eax, 64 4035 je xloop2 4036 movd xmm0, eax // high fraction 0..127 4037 neg eax 4038 add eax, 128 4039 movd xmm5, eax // low fraction 128..1 4040 punpcklbw xmm5, xmm0 4041 punpcklwd xmm5, xmm5 4042 pshufd xmm5, xmm5, 0 4043 4044 align 16 4045 xloop: 4046 movdqa xmm0, [esi] 4047 movdqa xmm2, [esi + edx] 4048 movdqa xmm1, xmm0 4049 punpcklbw xmm0, xmm2 4050 punpckhbw xmm1, xmm2 4051 pmaddubsw xmm0, xmm5 4052 pmaddubsw xmm1, xmm5 4053 psrlw xmm0, 7 4054 psrlw xmm1, 7 4055 packuswb xmm0, xmm1 4056 sub ecx, 4 4057 movdqa [esi + edi], xmm0 4058 lea esi, [esi + 16] 4059 jg xloop 4060 4061 pop edi 4062 pop esi 4063 ret 4064 4065 align 16 4066 xloop1: 4067 movdqa xmm0, [esi] 4068 sub ecx, 4 4069 movdqa [esi + edi], xmm0 4070 lea esi, [esi + 16] 4071 jg xloop1 4072 4073 pop edi 4074 pop esi 4075 ret 4076 4077 align 16 4078 xloop2: 4079 movdqa xmm0, [esi] 4080 pavgb xmm0, [esi + edx] 4081 sub ecx, 4 4082 movdqa [esi + edi], xmm0 4083 lea esi, [esi + 16] 4084 jg xloop2 4085 4086 pop edi 4087 pop esi 4088 ret 4089 } 4090 } 4091 4092 #endif // _M_IX86 4093 4094 #ifdef __cplusplus 4095 } // extern "C" 4096 } // namespace libyuv 4097 #endif 4098
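// Reference note (an illustrative addition, not part of the original source;
// helper name hypothetical): a hedged scalar sketch of the two-row blend done
// by ARGBInterpolateRow_SSSE3 above. source_y_fraction is halved to a 7 bit
// fraction f, and every output byte is (above * (128 - f) + below * f) >> 7,
// matching the pmaddubsw/psrlw 7 path; f == 0 and f == 64 short-circuit to a
// plain copy and a pavgb average respectively in the asm.
static void ARGBInterpolateRow_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                        ptrdiff_t src_stride, int dst_width,
                                        int source_y_fraction) {
  int f = source_y_fraction >> 1;            // 0..127
  const uint8* src_below = src_ptr + src_stride;
  for (int i = 0; i < dst_width * 4; ++i) {  // 4 bytes per ARGB pixel
    dst_ptr[i] =
        (uint8)((src_ptr[i] * (128 - f) + src_below[i] * f) >> 7);
  }
}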