/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPEG full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// vpermd shuffle to undo the vphaddw + vpackuswb lane interleave.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
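// The tables above are the BT.601 RGB-to-Y coefficients scaled by 128 so
// that the per-channel products fit pmaddubsw. A scalar sketch of the luma
// math the SIMD rows below implement (hypothetical helpers, not part of the
// library; bytes are in B, G, R, A memory order):
//
//   static uint8 RGBToY(uint8 r, uint8 g, uint8 b) {
//     return (uint8)(((33 * r + 65 * g + 13 * b) >> 7) + 16);  // kARGBToY
//   }
//   static uint8 RGBToYJ(uint8 r, uint8 g, uint8 b) {
//     return (uint8)((38 * r + 75 * g + 15 * b + 64) >> 7);    // kARGBToYJ
//   }
//
// The +64 (kAddYJ64) is 0.5 in 7 bit fixed point, so the JPEG variant rounds
// rather than truncates.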
// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGBToRAW for I422ToRAW. First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};

// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_y
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24

    align      4
 convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_y
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24

    align      4
 convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_rgb24
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24
    movdqa     xmm4, kShuffleMaskRGB24ToARGB

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15] }
    pshufb     xmm2, xmm4
    por        xmm2, xmm5
    palignr    xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15] }
    pshufb     xmm0, xmm4
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm5
    pshufb     xmm1, xmm4
    movdqa     [edx], xmm0
    por        xmm1, xmm5
    palignr    xmm3, xmm3, 4    // xmm3 = { xmm3[4:15] }
    pshufb     xmm3, xmm4
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm5
    sub        ecx, 16
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    jg         convertloop
    ret
  }
}
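// What the shuffle above accomplishes, per pixel, in scalar form (a sketch
// with a hypothetical helper, not part of the library): kShuffleMaskRGB24ToARGB
// expands packed RGB triples to 4-byte pixels and the 0xff000000 mask in
// xmm5 supplies opaque alpha.
//
//   static void RGB24ToARGBPixel(const uint8* src_rgb24, uint8* dst_argb) {
//     dst_argb[0] = src_rgb24[0];  // B
//     dst_argb[1] = src_rgb24[1];  // G
//     dst_argb[2] = src_rgb24[2];  // R
//     dst_argb[3] = 255;           // A
//   }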
__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_raw
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24
    movdqa     xmm4, kShuffleMaskRAWToARGB

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15] }
    pshufb     xmm2, xmm4
    por        xmm2, xmm5
    palignr    xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15] }
    pshufb     xmm0, xmm4
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm5
    pshufb     xmm1, xmm4
    movdqa     [edx], xmm0
    por        xmm1, xmm5
    palignr    xmm3, xmm3, 4    // xmm3 = { xmm3[4:15] }
    pshufb     xmm3, xmm4
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm5
    sub        ecx, 16
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    jg         convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
//   (v << 8) | (v << 3)
//   v * 256 + v * 8
//   v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3.
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    pcmpeqb    xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw      xmm4, 10
    psrlw      xmm4, 5
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_rgb565
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax         // edx = dst - 2 * src; dst = [eax * 2 + edx]

    align      4
 convertloop:
    movdqu     xmm0, [eax]      // fetch 8 pixels of bgr565
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    pand       xmm1, xmm3       // R in upper 5 bits
    psllw      xmm2, 11         // B in upper 5 bits
    pmulhuw    xmm1, xmm5       // * (256 + 8)
    pmulhuw    xmm2, xmm5       // * (256 + 8)
    psllw      xmm1, 8
    por        xmm1, xmm2       // RB
    pand       xmm0, xmm4       // G in middle 6 bits
    pmulhuw    xmm0, xmm6       // << 5 * (256 + 4)
    por        xmm0, xmm7       // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
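// Scalar form of the multiply trick above (a sketch, not library code):
// pmulhuw by (256 + 8) keeps the high 16 bits of the product, which for a
// 5-bit field already shifted to the top of the word is the classic
// (v << 3) | (v >> 2) replication. For one RGB565 pixel:
//
//   b8 = ((pix & 0x001f) * 264) >> 5;         // (b << 3) | (b >> 2)
//   g8 = (((pix >> 5) & 0x3f) * 260) >> 6;    // (g << 2) | (g >> 4)
//   r8 = (((pix >> 11) & 0x1f) * 264) >> 5;   // (r << 3) | (r >> 2)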
// 24 instructions.
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    movdqa     xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw      xmm4, 6
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_argb1555
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax         // edx = dst - 2 * src; dst = [eax * 2 + edx]

    align      4
 convertloop:
    movdqu     xmm0, [eax]      // fetch 8 pixels of 1555
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    psllw      xmm1, 1          // R in upper 5 bits
    psllw      xmm2, 11         // B in upper 5 bits
    pand       xmm1, xmm3
    pmulhuw    xmm2, xmm5       // * (256 + 8)
    pmulhuw    xmm1, xmm5       // * (256 + 8)
    psllw      xmm1, 8
    por        xmm1, xmm2       // RB
    movdqa     xmm2, xmm0
    pand       xmm0, xmm4       // G in middle 5 bits
    psraw      xmm2, 8          // A
    pmulhuw    xmm0, xmm6       // << 6 * (256 + 8)
    pand       xmm2, xmm7
    por        xmm0, xmm2       // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd       xmm4, eax
    pshufd     xmm4, xmm4, 0
    movdqa     xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld      xmm5, 4
    mov        eax, [esp + 4]   // src_argb4444
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax         // edx = dst - 2 * src; dst = [eax * 2 + edx]

    align      4
 convertloop:
    movdqu     xmm0, [eax]      // fetch 8 pixels of bgra4444
    movdqa     xmm2, xmm0
    pand       xmm0, xmm4       // mask low nibbles
    pand       xmm2, xmm5       // mask high nibbles
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    psllw      xmm1, 4
    psrlw      xmm3, 4
    por        xmm0, xmm1
    por        xmm2, xmm3
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movdqa     [eax * 2 + edx], xmm0       // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm6, kShuffleMaskARGBToRGB24

    align      4
 convertloop:
    movdqu     xmm0, [eax]      // fetch 16 pixels of argb
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    pshufb     xmm0, xmm6       // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb     xmm1, xmm6
    pshufb     xmm2, xmm6
    pshufb     xmm3, xmm6
    movdqa     xmm4, xmm1       // 4 bytes from 1 for 0
    psrldq     xmm1, 4          // 8 bytes from 1
    pslldq     xmm4, 12         // 4 bytes from 1 for 0
    movdqa     xmm5, xmm2       // 8 bytes from 2 for 1
    por        xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq     xmm5, 8          // 8 bytes from 2 for 1
    movdqu     [edx], xmm0      // store 0
    por        xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq     xmm2, 8          // 4 bytes from 2
    pslldq     xmm3, 4          // 12 bytes from 3 for 2
    por        xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu     [edx + 16], xmm1 // store 1
    movdqu     [edx + 32], xmm2 // store 2
    lea        edx, [edx + 48]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm6, kShuffleMaskARGBToRAW

    align      4
 convertloop:
    movdqu     xmm0, [eax]      // fetch 16 pixels of argb
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    pshufb     xmm0, xmm6       // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb     xmm1, xmm6
    pshufb     xmm2, xmm6
    pshufb     xmm3, xmm6
    movdqa     xmm4, xmm1       // 4 bytes from 1 for 0
    psrldq     xmm1, 4          // 8 bytes from 1
    pslldq     xmm4, 12         // 4 bytes from 1 for 0
    movdqa     xmm5, xmm2       // 8 bytes from 2 for 1
    por        xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq     xmm5, 8          // 8 bytes from 2 for 1
    movdqu     [edx], xmm0      // store 0
    por        xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq     xmm2, 8          // 4 bytes from 2
    pslldq     xmm3, 4          // 12 bytes from 3 for 2
    por        xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu     [edx + 16], xmm1 // store 1
    movdqu     [edx + 32], xmm2 // store 2
    lea        edx, [edx + 48]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm3, xmm3       // generate mask 0x0000001f
    psrld      xmm3, 27
    pcmpeqb    xmm4, xmm4       // generate mask 0x000007e0
    psrld      xmm4, 26
    pslld      xmm4, 5
    pcmpeqb    xmm5, xmm5       // generate mask 0xfffff800
    pslld      xmm5, 11

    align      4
 convertloop:
    movdqa     xmm0, [eax]      // fetch 4 pixels of argb
    movdqa     xmm1, xmm0       // B
    movdqa     xmm2, xmm0       // G
    pslld      xmm0, 8          // R
    psrld      xmm1, 3          // B
    psrld      xmm2, 5          // G
    psrad      xmm0, 16         // R
    pand       xmm1, xmm3       // B
    pand       xmm2, xmm4       // G
    pand       xmm0, xmm5       // R
    por        xmm1, xmm2       // BG
    por        xmm0, xmm1       // BGR
    packssdw   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm4, xmm4       // generate mask 0x0000001f
    psrld      xmm4, 27
    movdqa     xmm5, xmm4       // generate mask 0x000003e0
    pslld      xmm5, 5
    movdqa     xmm6, xmm4       // generate mask 0x00007c00
    pslld      xmm6, 10
    pcmpeqb    xmm7, xmm7       // generate mask 0xffff8000
    pslld      xmm7, 15

    align      4
 convertloop:
    movdqa     xmm0, [eax]      // fetch 4 pixels of argb
    movdqa     xmm1, xmm0       // B
    movdqa     xmm2, xmm0       // G
    movdqa     xmm3, xmm0       // R
    psrad      xmm0, 16         // A
    psrld      xmm1, 3          // B
    psrld      xmm2, 6          // G
    psrld      xmm3, 9          // R
    pand       xmm0, xmm7       // A
    pand       xmm1, xmm4       // B
    pand       xmm2, xmm5       // G
    pand       xmm3, xmm6       // R
    por        xmm0, xmm1       // BA
    por        xmm2, xmm3       // GR
    por        xmm0, xmm2       // BGRA
    packssdw   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
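// The forward packing done by ARGBToRGB565Row/ARGBToARGB1555Row above, in
// scalar form (a sketch, not library code; b, g, r, a are the four bytes of
// one ARGB pixel):
//
//   rgb565   = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11);
//   argb1555 = (b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) | ((a >> 7) << 15);
//
// The SIMD versions build each field with a shift and mask per channel, then
// merge the dwords and pack them down to 16 bits with packssdw.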
__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm4, xmm4       // generate mask 0xf000f000
    psllw      xmm4, 12
    movdqa     xmm3, xmm4       // generate mask 0x00f000f0
    psrlw      xmm3, 8

    align      4
 convertloop:
    movdqa     xmm0, [eax]      // fetch 4 pixels of argb
    movdqa     xmm1, xmm0
    pand       xmm0, xmm3       // low nibble
    pand       xmm1, xmm4       // high nibble
    psrld      xmm0, 4
    psrld      xmm1, 8
    por        xmm0, xmm1
    packuswb   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5       // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqa    ymm6, kPermdARGBToY_AVX

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb     ymm0, ymm0, ymm5
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2
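// Note on the "mutates" comments: AVX2 vphaddw and vpackuswb operate within
// each 128-bit lane, so the packed 4-pixel dword groups come out in the
// order { 0, 2, 4, 6, 1, 3, 5, 7 }. vpermd with kPermdARGBToY_AVX restores
// linear order; a sketch of the mapping (illustration only):
//
//   // out[i] = in[kPermdARGBToY_AVX[i]], i = 0..7 (32-bit elements)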
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqa    ymm6, kPermdARGBToY_AVX

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw     ymm2, ymm2, ymm5
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYJROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
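// Scalar form of the chroma math above (a sketch, not library code). The row
// first averages each 2x2 block of pixels, then applies the kARGBToU/kARGBToV
// coefficients (BT.601 scaled by 256) with psraw 8 and a +128 re-bias:
//
//   u = ((112 * b - 74 * g - 38 * r) >> 8) + 128;
//   v = ((-18 * b - 94 * g + 112 * r) >> 8) + 128;
//
// where b, g, r are the 2x2 block averages.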
__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kARGBToUJ
    movdqa     xmm6, kARGBToVJ
    movdqa     xmm5, kAddUVJ128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vpavgb     ymm2, ymm2, [eax + esi + 64]
    vpavgb     ymm3, ymm3, [eax + esi + 96]
    lea        eax, [eax + 128]
    vshufps    ymm4, ymm0, ymm1, 0x88
    vshufps    ymm0, ymm0, ymm1, 0xdd
    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps    ymm4, ymm2, ymm3, 0x88
    vshufps    ymm2, ymm2, ymm3, 0xdd
    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw    ymm1, ymm1, ymm3  // mutates
    vphaddw    ymm0, ymm0, ymm2
    vpsraw     ymm1, ymm1, 8
    vpsraw     ymm0, ymm0, 8
    vpacksswb  ymm0, ymm1, ymm0  // mutates
    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
    vpaddb     ymm0, ymm0, ymm5  // -> unsigned

    // step 3 - store 16 U and 16 V values
    sub        ecx, 32
    vextractf128 [edx], ymm0, 0        // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kARGBToUJ
    movdqa     xmm6, kARGBToVJ
    movdqa     xmm5, kAddUVJ128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
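// The J (JPEG full range) variants above differ from ARGBToUVRow in two ways
// (scalar sketch, not library code): full-range coefficients, and rounding
// via kAddUVJ128. Adding 0x8080 before the arithmetic shift folds in both
// the +128 bias and a 0.5 LSB round:
//
//   u = (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
//   v = (-20 * b - 107 * g + 127 * r + 0x8080) >> 8;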
__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // width
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* convert to U and V */
    movdqa     xmm0, [eax]          // U
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0

    movdqa     xmm0, [eax]          // V
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    lea        eax, [eax + 64]
    movdqa     [edx + edi], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // width
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* convert to U and V */
    movdqu     xmm0, [eax]          // U
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0

    movdqu     xmm0, [eax]          // V
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    lea        eax, [eax + 64]
    movdqu     [edx + edi], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // width
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // width
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // width
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3

#define YG 74  /* (int8)(1.164 * 64 + 0.5) */

#define UB 127 /* min(127, (int8)(2.018 * 64)) */
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */

// Bias
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#ifdef HAS_I422TOARGBROW_AVX2

static const lvec8 kUVToB_AVX = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};
static const lvec8 kUVToR_AVX = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};
static const lvec8 kUVToG_AVX = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
static const lvec16 kYToRgb_AVX = {
  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
};
static const lvec16 kYSub16_AVX = {
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
};
static const lvec16 kUVBiasB_AVX = {
  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
};
static const lvec16 kUVBiasG_AVX = {
  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
};
static const lvec16 kUVBiasR_AVX = {
  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
};

// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
    vpxor      ymm4, ymm4, ymm4

    align      4
 convertloop:
    vmovq      xmm0, qword ptr [esi]        //  U
    vmovq      xmm1, qword ptr [esi + edi]  //  V
    lea        esi,  [esi + 8]
    vpunpcklbw ymm0, ymm0, ymm1             // UV
    vpermq     ymm0, ymm0, 0xd8
    vpunpcklwd ymm0, ymm0, ymm0             // UVUV
    vpmaddubsw ymm2, ymm0, kUVToB_AVX       // scale B UV
    vpmaddubsw ymm1, ymm0, kUVToG_AVX       // scale G UV
    vpmaddubsw ymm0, ymm0, kUVToR_AVX       // scale R UV
    vpsubw     ymm2, ymm2, kUVBiasB_AVX     // unbias back to signed
    vpsubw     ymm1, ymm1, kUVBiasG_AVX
    vpsubw     ymm0, ymm0, kUVBiasR_AVX

    // Step 2: Find Y contribution to 16 R,G,B values
    vmovdqu    xmm3, [eax]                  // NOLINT
    lea        eax, [eax + 16]
    vpermq     ymm3, ymm3, 0xd8
    vpunpcklbw ymm3, ymm3, ymm4
    vpsubsw    ymm3, ymm3, kYSub16_AVX
    vpmullw    ymm3, ymm3, kYToRgb_AVX
    vpaddsw    ymm2, ymm2, ymm3             // B += Y
    vpaddsw    ymm1, ymm1, ymm3             // G += Y
    vpaddsw    ymm0, ymm0, ymm3             // R += Y
    vpsraw     ymm2, ymm2, 6
    vpsraw     ymm1, ymm1, 6
    vpsraw     ymm0, ymm0, 6
    vpackuswb  ymm2, ymm2, ymm2             // B
    vpackuswb  ymm1, ymm1, ymm1             // G
    vpackuswb  ymm0, ymm0, ymm0             // R

    // Step 3: Weave into ARGB
    vpunpcklbw ymm2, ymm2, ymm1             // BG
    vpermq     ymm2, ymm2, 0xd8
    vpunpcklbw ymm0, ymm0, ymm5             // RA
    vpermq     ymm0, ymm0, 0xd8
    vpunpcklwd ymm1, ymm2, ymm0             // BGRA first 8 pixels
    vpunpckhwd ymm2, ymm2, ymm0             // BGRA next 8 pixels
    vmovdqu    [edx], ymm1
    vmovdqu    [edx + 32], ymm2
    lea        edx,  [edx + 64]
    sub        ecx, 16
    jg         convertloop
    vzeroupper

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_I422TOARGBROW_AVX2

#ifdef HAS_I422TOARGBROW_SSSE3

static const vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};

static const vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};

static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};

static const vec8 kVUToB = {
  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
};

static const vec8 kVUToR = {
  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
};

static const vec8 kVUToG = {
  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
};

static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; 2179 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; 2180 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; 2181 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; 2182 2183 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. 2184 2185 // Read 8 UV from 444. 2186 #define READYUV444 __asm { \ 2187 __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ 2188 __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ 2189 __asm lea esi, [esi + 8] \ 2190 __asm punpcklbw xmm0, xmm1 /* UV */ \ 2191 } 2192 2193 // Read 4 UV from 422, upsample to 8 UV. 2194 #define READYUV422 __asm { \ 2195 __asm movd xmm0, [esi] /* U */ \ 2196 __asm movd xmm1, [esi + edi] /* V */ \ 2197 __asm lea esi, [esi + 4] \ 2198 __asm punpcklbw xmm0, xmm1 /* UV */ \ 2199 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ 2200 } 2201 2202 // Read 2 UV from 411, upsample to 8 UV. 2203 #define READYUV411 __asm { \ 2204 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ 2205 __asm movd xmm0, ebx \ 2206 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ 2207 __asm movd xmm1, ebx \ 2208 __asm lea esi, [esi + 2] \ 2209 __asm punpcklbw xmm0, xmm1 /* UV */ \ 2210 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ 2211 __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ 2212 } 2213 2214 // Read 4 UV from NV12, upsample to 8 UV. 2215 #define READNV12 __asm { \ 2216 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ 2217 __asm lea esi, [esi + 8] \ 2218 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ 2219 } 2220 2221 // Convert 8 pixels: 8 UV and 8 Y. 2222 #define YUVTORGB __asm { \ 2223 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ 2224 __asm movdqa xmm1, xmm0 \ 2225 __asm movdqa xmm2, xmm0 \ 2226 __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ 2227 __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ 2228 __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ 2229 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ 2230 __asm psubw xmm1, kUVBiasG \ 2231 __asm psubw xmm2, kUVBiasR \ 2232 /* Step 2: Find Y contribution to 8 R,G,B values */ \ 2233 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ 2234 __asm lea eax, [eax + 8] \ 2235 __asm punpcklbw xmm3, xmm4 \ 2236 __asm psubsw xmm3, kYSub16 \ 2237 __asm pmullw xmm3, kYToRgb \ 2238 __asm paddsw xmm0, xmm3 /* B += Y */ \ 2239 __asm paddsw xmm1, xmm3 /* G += Y */ \ 2240 __asm paddsw xmm2, xmm3 /* R += Y */ \ 2241 __asm psraw xmm0, 6 \ 2242 __asm psraw xmm1, 6 \ 2243 __asm psraw xmm2, 6 \ 2244 __asm packuswb xmm0, xmm0 /* B */ \ 2245 __asm packuswb xmm1, xmm1 /* G */ \ 2246 __asm packuswb xmm2, xmm2 /* R */ \ 2247 } 2248 2249 // Convert 8 pixels: 8 VU and 8 Y. 
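// Identical to YUVTORGB above except the kVUTo* constants are sampled, so
// the same math absorbs chroma supplied in swapped VU byte order (e.g. NV21).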
2250 #define YVUTORGB __asm { \
2251 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
2252 __asm movdqa xmm1, xmm0 \
2253 __asm movdqa xmm2, xmm0 \
2254 __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
2255 __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
2256 __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
2257 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
2258 __asm psubw xmm1, kUVBiasG \
2259 __asm psubw xmm2, kUVBiasR \
2260 /* Step 2: Find Y contribution to 8 R,G,B values */ \
2261 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
2262 __asm lea eax, [eax + 8] \
2263 __asm punpcklbw xmm3, xmm4 \
2264 __asm psubsw xmm3, kYSub16 \
2265 __asm pmullw xmm3, kYToRgb \
2266 __asm paddsw xmm0, xmm3 /* B += Y */ \
2267 __asm paddsw xmm1, xmm3 /* G += Y */ \
2268 __asm paddsw xmm2, xmm3 /* R += Y */ \
2269 __asm psraw xmm0, 6 \
2270 __asm psraw xmm1, 6 \
2271 __asm psraw xmm2, 6 \
2272 __asm packuswb xmm0, xmm0 /* B */ \
2273 __asm packuswb xmm1, xmm1 /* G */ \
2274 __asm packuswb xmm2, xmm2 /* R */ \
2275 }
2276
2277 // 8 pixels, dest aligned 16.
2278 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
2279 __declspec(naked) __declspec(align(16))
2280 void I444ToARGBRow_SSSE3(const uint8* y_buf,
2281 const uint8* u_buf,
2282 const uint8* v_buf,
2283 uint8* dst_argb,
2284 int width) {
2285 __asm {
2286 push esi
2287 push edi
2288 mov eax, [esp + 8 + 4] // Y
2289 mov esi, [esp + 8 + 8] // U
2290 mov edi, [esp + 8 + 12] // V
2291 mov edx, [esp + 8 + 16] // argb
2292 mov ecx, [esp + 8 + 20] // width
2293 sub edi, esi
2294 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2295 pxor xmm4, xmm4
2296
2297 align 4
2298 convertloop:
2299 READYUV444
2300 YUVTORGB
2301
2302 // Step 3: Weave into ARGB
2303 punpcklbw xmm0, xmm1 // BG
2304 punpcklbw xmm2, xmm5 // RA
2305 movdqa xmm1, xmm0
2306 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2307 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2308 movdqa [edx], xmm0
2309 movdqa [edx + 16], xmm1
2310 lea edx, [edx + 32]
2311 sub ecx, 8
2312 jg convertloop
2313
2314 pop edi
2315 pop esi
2316 ret
2317 }
2318 }
2319
2320 // 8 pixels, dest aligned 16.
2321 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
2322 __declspec(naked) __declspec(align(16))
2323 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
2324 const uint8* u_buf,
2325 const uint8* v_buf,
2326 uint8* dst_rgb24,
2327 int width) {
2328 __asm {
2329 push esi
2330 push edi
2331 mov eax, [esp + 8 + 4] // Y
2332 mov esi, [esp + 8 + 8] // U
2333 mov edi, [esp + 8 + 12] // V
2334 mov edx, [esp + 8 + 16] // rgb24
2335 mov ecx, [esp + 8 + 20] // width
2336 sub edi, esi
2337 pxor xmm4, xmm4
2338 movdqa xmm5, kShuffleMaskARGBToRGB24_0
2339 movdqa xmm6, kShuffleMaskARGBToRGB24
2340
2341 align 4
2342 convertloop:
2343 READYUV422
2344 YUVTORGB
2345
2346 // Step 3: Weave into RRGB
2347 punpcklbw xmm0, xmm1 // BG
2348 punpcklbw xmm2, xmm2 // RR
2349 movdqa xmm1, xmm0
2350 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
2351 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
2352 pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
2353 pshufb xmm1, xmm6 // Pack into first 12 bytes.
2354 palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
2355 movq qword ptr [edx], xmm0 // First 8 bytes
2356 movdqu [edx + 8], xmm1 // Last 16 bytes; 24 bytes total = 8 RGB24 pixels.
2357 lea edx, [edx + 24]
2358 sub ecx, 8
2359 jg convertloop
2360
2361 pop edi
2362 pop esi
2363 ret
2364 }
2365 }
2366
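// For clarity, a scalar equivalent of the shuffle/palignr packing used by
// I422ToRGB24Row_SSSE3 above: each 4-byte BGRA pixel simply drops its alpha
// byte. (Illustration only; ARGBToRGB24Ref is a hypothetical name, not part
// of the library. The RAW variant below emits the same 3 bytes in R,G,B
// order instead.)
static void ARGBToRGB24Ref(const uint8* src_argb, uint8* dst_rgb24, int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_rgb24[0] = src_argb[0];  // B
    dst_rgb24[1] = src_argb[1];  // G
    dst_rgb24[2] = src_argb[2];  // R
    src_argb += 4;  // skip A
    dst_rgb24 += 3;
  }
}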
2367 // 8 pixels, dest aligned 16.
2368 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
2369 __declspec(naked) __declspec(align(16))
2370 void I422ToRAWRow_SSSE3(const uint8* y_buf,
2371 const uint8* u_buf,
2372 const uint8* v_buf,
2373 uint8* dst_raw,
2374 int width) {
2375 __asm {
2376 push esi
2377 push edi
2378 mov eax, [esp + 8 + 4] // Y
2379 mov esi, [esp + 8 + 8] // U
2380 mov edi, [esp + 8 + 12] // V
2381 mov edx, [esp + 8 + 16] // raw
2382 mov ecx, [esp + 8 + 20] // width
2383 sub edi, esi
2384 pxor xmm4, xmm4
2385 movdqa xmm5, kShuffleMaskARGBToRAW_0
2386 movdqa xmm6, kShuffleMaskARGBToRAW
2387
2388 align 4
2389 convertloop:
2390 READYUV422
2391 YUVTORGB
2392
2393 // Step 3: Weave into RRGB
2394 punpcklbw xmm0, xmm1 // BG
2395 punpcklbw xmm2, xmm2 // RR
2396 movdqa xmm1, xmm0
2397 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
2398 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
2399 pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
2400 pshufb xmm1, xmm6 // Pack into first 12 bytes.
2401 palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
2402 movq qword ptr [edx], xmm0 // First 8 bytes
2403 movdqu [edx + 8], xmm1 // Last 16 bytes; 24 bytes total = 8 RAW pixels.
2404 lea edx, [edx + 24]
2405 sub ecx, 8
2406 jg convertloop
2407
2408 pop edi
2409 pop esi
2410 ret
2411 }
2412 }
2413
2414 // 8 pixels, dest unaligned.
2415 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
2416 __declspec(naked) __declspec(align(16))
2417 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
2418 const uint8* u_buf,
2419 const uint8* v_buf,
2420 uint8* rgb565_buf,
2421 int width) {
2422 __asm {
2423 push esi
2424 push edi
2425 mov eax, [esp + 8 + 4] // Y
2426 mov esi, [esp + 8 + 8] // U
2427 mov edi, [esp + 8 + 12] // V
2428 mov edx, [esp + 8 + 16] // rgb565
2429 mov ecx, [esp + 8 + 20] // width
2430 sub edi, esi
2431 pxor xmm4, xmm4
2432 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
2433 psrld xmm5, 27
2434 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
2435 psrld xmm6, 26
2436 pslld xmm6, 5
2437 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
2438 pslld xmm7, 11
2439
2440 align 4
2441 convertloop:
2442 READYUV422
2443 YUVTORGB
2444
2445 // Step 3: Weave into RRGB
2446 punpcklbw xmm0, xmm1 // BG
2447 punpcklbw xmm2, xmm2 // RR
2448 movdqa xmm1, xmm0
2449 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
2450 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
2451
2452 // Step 3b: RRGB -> RGB565: (R >> 3) << 11 | (G >> 2) << 5 | (B >> 3)
2453 movdqa xmm3, xmm0 // B first 4 pixels of argb
2454 movdqa xmm2, xmm0 // G
2455 pslld xmm0, 8 // R
2456 psrld xmm3, 3 // B
2457 psrld xmm2, 5 // G
2458 psrad xmm0, 16 // R
2459 pand xmm3, xmm5 // B
2460 pand xmm2, xmm6 // G
2461 pand xmm0, xmm7 // R
2462 por xmm3, xmm2 // BG
2463 por xmm0, xmm3 // BGR
2464 movdqa xmm3, xmm1 // B next 4 pixels of argb
2465 movdqa xmm2, xmm1 // G
2466 pslld xmm1, 8 // R
2467 psrld xmm3, 3 // B
2468 psrld xmm2, 5 // G
2469 psrad xmm1, 16 // R
2470 pand xmm3, xmm5 // B
2471 pand xmm2, xmm6 // G
2472 pand xmm1, xmm7 // R
2473 por xmm3, xmm2 // BG
2474 por xmm1, xmm3 // BGR
2475 packssdw xmm0, xmm1
2476 sub ecx, 8
2477 movdqu [edx], xmm0 // store 8 pixels of RGB565
2478 lea edx, [edx + 16]
2479 jg convertloop
2480
2481 pop edi
2482 pop esi
2483 ret
2484 }
2485 }
2486
2487 // 8 pixels, dest aligned 16.
2488 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2489 __declspec(naked) __declspec(align(16)) 2490 void I422ToARGBRow_SSSE3(const uint8* y_buf, 2491 const uint8* u_buf, 2492 const uint8* v_buf, 2493 uint8* dst_argb, 2494 int width) { 2495 __asm { 2496 push esi 2497 push edi 2498 mov eax, [esp + 8 + 4] // Y 2499 mov esi, [esp + 8 + 8] // U 2500 mov edi, [esp + 8 + 12] // V 2501 mov edx, [esp + 8 + 16] // argb 2502 mov ecx, [esp + 8 + 20] // width 2503 sub edi, esi 2504 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2505 pxor xmm4, xmm4 2506 2507 align 4 2508 convertloop: 2509 READYUV422 2510 YUVTORGB 2511 2512 // Step 3: Weave into ARGB 2513 punpcklbw xmm0, xmm1 // BG 2514 punpcklbw xmm2, xmm5 // RA 2515 movdqa xmm1, xmm0 2516 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 2517 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 2518 movdqa [edx], xmm0 2519 movdqa [edx + 16], xmm1 2520 lea edx, [edx + 32] 2521 sub ecx, 8 2522 jg convertloop 2523 2524 pop edi 2525 pop esi 2526 ret 2527 } 2528 } 2529 2530 // 8 pixels, dest aligned 16. 2531 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2532 // Similar to I420 but duplicate UV once more. 2533 __declspec(naked) __declspec(align(16)) 2534 void I411ToARGBRow_SSSE3(const uint8* y_buf, 2535 const uint8* u_buf, 2536 const uint8* v_buf, 2537 uint8* dst_argb, 2538 int width) { 2539 __asm { 2540 push ebx 2541 push esi 2542 push edi 2543 mov eax, [esp + 12 + 4] // Y 2544 mov esi, [esp + 12 + 8] // U 2545 mov edi, [esp + 12 + 12] // V 2546 mov edx, [esp + 12 + 16] // argb 2547 mov ecx, [esp + 12 + 20] // width 2548 sub edi, esi 2549 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2550 pxor xmm4, xmm4 2551 2552 align 4 2553 convertloop: 2554 READYUV411 // modifies EBX 2555 YUVTORGB 2556 2557 // Step 3: Weave into ARGB 2558 punpcklbw xmm0, xmm1 // BG 2559 punpcklbw xmm2, xmm5 // RA 2560 movdqa xmm1, xmm0 2561 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 2562 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 2563 movdqa [edx], xmm0 2564 movdqa [edx + 16], xmm1 2565 lea edx, [edx + 32] 2566 sub ecx, 8 2567 jg convertloop 2568 2569 pop edi 2570 pop esi 2571 pop ebx 2572 ret 2573 } 2574 } 2575 2576 // 8 pixels, dest aligned 16. 2577 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2578 __declspec(naked) __declspec(align(16)) 2579 void NV12ToARGBRow_SSSE3(const uint8* y_buf, 2580 const uint8* uv_buf, 2581 uint8* dst_argb, 2582 int width) { 2583 __asm { 2584 push esi 2585 mov eax, [esp + 4 + 4] // Y 2586 mov esi, [esp + 4 + 8] // UV 2587 mov edx, [esp + 4 + 12] // argb 2588 mov ecx, [esp + 4 + 16] // width 2589 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2590 pxor xmm4, xmm4 2591 2592 align 4 2593 convertloop: 2594 READNV12 2595 YUVTORGB 2596 2597 // Step 3: Weave into ARGB 2598 punpcklbw xmm0, xmm1 // BG 2599 punpcklbw xmm2, xmm5 // RA 2600 movdqa xmm1, xmm0 2601 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 2602 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 2603 movdqa [edx], xmm0 2604 movdqa [edx + 16], xmm1 2605 lea edx, [edx + 32] 2606 sub ecx, 8 2607 jg convertloop 2608 2609 pop esi 2610 ret 2611 } 2612 } 2613 2614 // 8 pixels, dest aligned 16. 2615 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
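// NV21 stores chroma as VU, so the same READNV12 loader is reused below and
// the swapped byte order is absorbed by YVUTORGB's kVUTo* constants.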
2616 __declspec(naked) __declspec(align(16)) 2617 void NV21ToARGBRow_SSSE3(const uint8* y_buf, 2618 const uint8* uv_buf, 2619 uint8* dst_argb, 2620 int width) { 2621 __asm { 2622 push esi 2623 mov eax, [esp + 4 + 4] // Y 2624 mov esi, [esp + 4 + 8] // VU 2625 mov edx, [esp + 4 + 12] // argb 2626 mov ecx, [esp + 4 + 16] // width 2627 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2628 pxor xmm4, xmm4 2629 2630 align 4 2631 convertloop: 2632 READNV12 2633 YVUTORGB 2634 2635 // Step 3: Weave into ARGB 2636 punpcklbw xmm0, xmm1 // BG 2637 punpcklbw xmm2, xmm5 // RA 2638 movdqa xmm1, xmm0 2639 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 2640 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 2641 movdqa [edx], xmm0 2642 movdqa [edx + 16], xmm1 2643 lea edx, [edx + 32] 2644 sub ecx, 8 2645 jg convertloop 2646 2647 pop esi 2648 ret 2649 } 2650 } 2651 2652 // 8 pixels, unaligned. 2653 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 2654 __declspec(naked) __declspec(align(16)) 2655 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 2656 const uint8* u_buf, 2657 const uint8* v_buf, 2658 uint8* dst_argb, 2659 int width) { 2660 __asm { 2661 push esi 2662 push edi 2663 mov eax, [esp + 8 + 4] // Y 2664 mov esi, [esp + 8 + 8] // U 2665 mov edi, [esp + 8 + 12] // V 2666 mov edx, [esp + 8 + 16] // argb 2667 mov ecx, [esp + 8 + 20] // width 2668 sub edi, esi 2669 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2670 pxor xmm4, xmm4 2671 2672 align 4 2673 convertloop: 2674 READYUV444 2675 YUVTORGB 2676 2677 // Step 3: Weave into ARGB 2678 punpcklbw xmm0, xmm1 // BG 2679 punpcklbw xmm2, xmm5 // RA 2680 movdqa xmm1, xmm0 2681 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 2682 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 2683 movdqu [edx], xmm0 2684 movdqu [edx + 16], xmm1 2685 lea edx, [edx + 32] 2686 sub ecx, 8 2687 jg convertloop 2688 2689 pop edi 2690 pop esi 2691 ret 2692 } 2693 } 2694 2695 // 8 pixels, unaligned. 2696 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2697 __declspec(naked) __declspec(align(16)) 2698 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 2699 const uint8* u_buf, 2700 const uint8* v_buf, 2701 uint8* dst_argb, 2702 int width) { 2703 __asm { 2704 push esi 2705 push edi 2706 mov eax, [esp + 8 + 4] // Y 2707 mov esi, [esp + 8 + 8] // U 2708 mov edi, [esp + 8 + 12] // V 2709 mov edx, [esp + 8 + 16] // argb 2710 mov ecx, [esp + 8 + 20] // width 2711 sub edi, esi 2712 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2713 pxor xmm4, xmm4 2714 2715 align 4 2716 convertloop: 2717 READYUV422 2718 YUVTORGB 2719 2720 // Step 3: Weave into ARGB 2721 punpcklbw xmm0, xmm1 // BG 2722 punpcklbw xmm2, xmm5 // RA 2723 movdqa xmm1, xmm0 2724 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 2725 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 2726 movdqu [edx], xmm0 2727 movdqu [edx + 16], xmm1 2728 lea edx, [edx + 32] 2729 sub ecx, 8 2730 jg convertloop 2731 2732 pop edi 2733 pop esi 2734 ret 2735 } 2736 } 2737 2738 // 8 pixels, unaligned. 2739 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2740 // Similar to I420 but duplicate UV once more. 
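// READYUV411 uses EBX as a scratch register for its 16-bit UV loads, so EBX
// is saved and restored around the loop.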
2741 __declspec(naked) __declspec(align(16))
2742 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2743 const uint8* u_buf,
2744 const uint8* v_buf,
2745 uint8* dst_argb,
2746 int width) {
2747 __asm {
2748 push ebx
2749 push esi
2750 push edi
2751 mov eax, [esp + 12 + 4] // Y
2752 mov esi, [esp + 12 + 8] // U
2753 mov edi, [esp + 12 + 12] // V
2754 mov edx, [esp + 12 + 16] // argb
2755 mov ecx, [esp + 12 + 20] // width
2756 sub edi, esi
2757 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2758 pxor xmm4, xmm4
2759
2760 align 4
2761 convertloop:
2762 READYUV411 // modifies EBX
2763 YUVTORGB
2764
2765 // Step 3: Weave into ARGB
2766 punpcklbw xmm0, xmm1 // BG
2767 punpcklbw xmm2, xmm5 // RA
2768 movdqa xmm1, xmm0
2769 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2770 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2771 movdqu [edx], xmm0
2772 movdqu [edx + 16], xmm1
2773 lea edx, [edx + 32]
2774 sub ecx, 8
2775 jg convertloop
2776
2777 pop edi
2778 pop esi
2779 pop ebx
2780 ret
2781 }
2782 }
2783
2784 // 8 pixels, dest unaligned.
2785 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2786 __declspec(naked) __declspec(align(16))
2787 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
2788 const uint8* uv_buf,
2789 uint8* dst_argb,
2790 int width) {
2791 __asm {
2792 push esi
2793 mov eax, [esp + 4 + 4] // Y
2794 mov esi, [esp + 4 + 8] // UV
2795 mov edx, [esp + 4 + 12] // argb
2796 mov ecx, [esp + 4 + 16] // width
2797 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2798 pxor xmm4, xmm4
2799
2800 align 4
2801 convertloop:
2802 READNV12
2803 YUVTORGB
2804
2805 // Step 3: Weave into ARGB
2806 punpcklbw xmm0, xmm1 // BG
2807 punpcklbw xmm2, xmm5 // RA
2808 movdqa xmm1, xmm0
2809 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
2810 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
2811 movdqu [edx], xmm0
2812 movdqu [edx + 16], xmm1
2813 lea edx, [edx + 32]
2814 sub ecx, 8
2815 jg convertloop
2816
2817 pop esi
2818 ret
2819 }
2820 }
2821
2822 // 8 pixels, dest unaligned.
2823 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
2824 __declspec(naked) __declspec(align(16)) 2825 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 2826 const uint8* uv_buf, 2827 uint8* dst_argb, 2828 int width) { 2829 __asm { 2830 push esi 2831 mov eax, [esp + 4 + 4] // Y 2832 mov esi, [esp + 4 + 8] // VU 2833 mov edx, [esp + 4 + 12] // argb 2834 mov ecx, [esp + 4 + 16] // width 2835 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2836 pxor xmm4, xmm4 2837 2838 align 4 2839 convertloop: 2840 READNV12 2841 YVUTORGB 2842 2843 // Step 3: Weave into ARGB 2844 punpcklbw xmm0, xmm1 // BG 2845 punpcklbw xmm2, xmm5 // RA 2846 movdqa xmm1, xmm0 2847 punpcklwd xmm0, xmm2 // BGRA first 4 pixels 2848 punpckhwd xmm1, xmm2 // BGRA next 4 pixels 2849 movdqu [edx], xmm0 2850 movdqu [edx + 16], xmm1 2851 lea edx, [edx + 32] 2852 sub ecx, 8 2853 jg convertloop 2854 2855 pop esi 2856 ret 2857 } 2858 } 2859 2860 __declspec(naked) __declspec(align(16)) 2861 void I422ToBGRARow_SSSE3(const uint8* y_buf, 2862 const uint8* u_buf, 2863 const uint8* v_buf, 2864 uint8* dst_bgra, 2865 int width) { 2866 __asm { 2867 push esi 2868 push edi 2869 mov eax, [esp + 8 + 4] // Y 2870 mov esi, [esp + 8 + 8] // U 2871 mov edi, [esp + 8 + 12] // V 2872 mov edx, [esp + 8 + 16] // bgra 2873 mov ecx, [esp + 8 + 20] // width 2874 sub edi, esi 2875 pxor xmm4, xmm4 2876 2877 align 4 2878 convertloop: 2879 READYUV422 2880 YUVTORGB 2881 2882 // Step 3: Weave into BGRA 2883 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2884 punpcklbw xmm1, xmm0 // GB 2885 punpcklbw xmm5, xmm2 // AR 2886 movdqa xmm0, xmm5 2887 punpcklwd xmm5, xmm1 // BGRA first 4 pixels 2888 punpckhwd xmm0, xmm1 // BGRA next 4 pixels 2889 movdqa [edx], xmm5 2890 movdqa [edx + 16], xmm0 2891 lea edx, [edx + 32] 2892 sub ecx, 8 2893 jg convertloop 2894 2895 pop edi 2896 pop esi 2897 ret 2898 } 2899 } 2900 2901 __declspec(naked) __declspec(align(16)) 2902 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, 2903 const uint8* u_buf, 2904 const uint8* v_buf, 2905 uint8* dst_bgra, 2906 int width) { 2907 __asm { 2908 push esi 2909 push edi 2910 mov eax, [esp + 8 + 4] // Y 2911 mov esi, [esp + 8 + 8] // U 2912 mov edi, [esp + 8 + 12] // V 2913 mov edx, [esp + 8 + 16] // bgra 2914 mov ecx, [esp + 8 + 20] // width 2915 sub edi, esi 2916 pxor xmm4, xmm4 2917 2918 align 4 2919 convertloop: 2920 READYUV422 2921 YUVTORGB 2922 2923 // Step 3: Weave into BGRA 2924 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2925 punpcklbw xmm1, xmm0 // GB 2926 punpcklbw xmm5, xmm2 // AR 2927 movdqa xmm0, xmm5 2928 punpcklwd xmm5, xmm1 // BGRA first 4 pixels 2929 punpckhwd xmm0, xmm1 // BGRA next 4 pixels 2930 movdqu [edx], xmm5 2931 movdqu [edx + 16], xmm0 2932 lea edx, [edx + 32] 2933 sub ecx, 8 2934 jg convertloop 2935 2936 pop edi 2937 pop esi 2938 ret 2939 } 2940 } 2941 2942 __declspec(naked) __declspec(align(16)) 2943 void I422ToABGRRow_SSSE3(const uint8* y_buf, 2944 const uint8* u_buf, 2945 const uint8* v_buf, 2946 uint8* dst_abgr, 2947 int width) { 2948 __asm { 2949 push esi 2950 push edi 2951 mov eax, [esp + 8 + 4] // Y 2952 mov esi, [esp + 8 + 8] // U 2953 mov edi, [esp + 8 + 12] // V 2954 mov edx, [esp + 8 + 16] // abgr 2955 mov ecx, [esp + 8 + 20] // width 2956 sub edi, esi 2957 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2958 pxor xmm4, xmm4 2959 2960 align 4 2961 convertloop: 2962 READYUV422 2963 YUVTORGB 2964 2965 // Step 3: Weave into ARGB 2966 punpcklbw xmm2, xmm1 // RG 2967 punpcklbw xmm0, xmm5 // BA 2968 movdqa xmm1, xmm2 2969 punpcklwd xmm2, xmm0 // RGBA first 4 pixels 2970 punpckhwd xmm1, xmm0 // 
RGBA next 4 pixels 2971 movdqa [edx], xmm2 2972 movdqa [edx + 16], xmm1 2973 lea edx, [edx + 32] 2974 sub ecx, 8 2975 jg convertloop 2976 2977 pop edi 2978 pop esi 2979 ret 2980 } 2981 } 2982 2983 __declspec(naked) __declspec(align(16)) 2984 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, 2985 const uint8* u_buf, 2986 const uint8* v_buf, 2987 uint8* dst_abgr, 2988 int width) { 2989 __asm { 2990 push esi 2991 push edi 2992 mov eax, [esp + 8 + 4] // Y 2993 mov esi, [esp + 8 + 8] // U 2994 mov edi, [esp + 8 + 12] // V 2995 mov edx, [esp + 8 + 16] // abgr 2996 mov ecx, [esp + 8 + 20] // width 2997 sub edi, esi 2998 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2999 pxor xmm4, xmm4 3000 3001 align 4 3002 convertloop: 3003 READYUV422 3004 YUVTORGB 3005 3006 // Step 3: Weave into ARGB 3007 punpcklbw xmm2, xmm1 // RG 3008 punpcklbw xmm0, xmm5 // BA 3009 movdqa xmm1, xmm2 3010 punpcklwd xmm2, xmm0 // RGBA first 4 pixels 3011 punpckhwd xmm1, xmm0 // RGBA next 4 pixels 3012 movdqu [edx], xmm2 3013 movdqu [edx + 16], xmm1 3014 lea edx, [edx + 32] 3015 sub ecx, 8 3016 jg convertloop 3017 3018 pop edi 3019 pop esi 3020 ret 3021 } 3022 } 3023 3024 __declspec(naked) __declspec(align(16)) 3025 void I422ToRGBARow_SSSE3(const uint8* y_buf, 3026 const uint8* u_buf, 3027 const uint8* v_buf, 3028 uint8* dst_rgba, 3029 int width) { 3030 __asm { 3031 push esi 3032 push edi 3033 mov eax, [esp + 8 + 4] // Y 3034 mov esi, [esp + 8 + 8] // U 3035 mov edi, [esp + 8 + 12] // V 3036 mov edx, [esp + 8 + 16] // rgba 3037 mov ecx, [esp + 8 + 20] // width 3038 sub edi, esi 3039 pxor xmm4, xmm4 3040 3041 align 4 3042 convertloop: 3043 READYUV422 3044 YUVTORGB 3045 3046 // Step 3: Weave into RGBA 3047 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 3048 punpcklbw xmm1, xmm2 // GR 3049 punpcklbw xmm5, xmm0 // AB 3050 movdqa xmm0, xmm5 3051 punpcklwd xmm5, xmm1 // RGBA first 4 pixels 3052 punpckhwd xmm0, xmm1 // RGBA next 4 pixels 3053 movdqa [edx], xmm5 3054 movdqa [edx + 16], xmm0 3055 lea edx, [edx + 32] 3056 sub ecx, 8 3057 jg convertloop 3058 3059 pop edi 3060 pop esi 3061 ret 3062 } 3063 } 3064 3065 __declspec(naked) __declspec(align(16)) 3066 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, 3067 const uint8* u_buf, 3068 const uint8* v_buf, 3069 uint8* dst_rgba, 3070 int width) { 3071 __asm { 3072 push esi 3073 push edi 3074 mov eax, [esp + 8 + 4] // Y 3075 mov esi, [esp + 8 + 8] // U 3076 mov edi, [esp + 8 + 12] // V 3077 mov edx, [esp + 8 + 16] // rgba 3078 mov ecx, [esp + 8 + 20] // width 3079 sub edi, esi 3080 pxor xmm4, xmm4 3081 3082 align 4 3083 convertloop: 3084 READYUV422 3085 YUVTORGB 3086 3087 // Step 3: Weave into RGBA 3088 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 3089 punpcklbw xmm1, xmm2 // GR 3090 punpcklbw xmm5, xmm0 // AB 3091 movdqa xmm0, xmm5 3092 punpcklwd xmm5, xmm1 // RGBA first 4 pixels 3093 punpckhwd xmm0, xmm1 // RGBA next 4 pixels 3094 movdqu [edx], xmm5 3095 movdqu [edx + 16], xmm0 3096 lea edx, [edx + 32] 3097 sub ecx, 8 3098 jg convertloop 3099 3100 pop edi 3101 pop esi 3102 ret 3103 } 3104 } 3105 3106 #endif // HAS_I422TOARGBROW_SSSE3 3107 3108 #ifdef HAS_YTOARGBROW_SSE2 3109 __declspec(naked) __declspec(align(16)) 3110 void YToARGBRow_SSE2(const uint8* y_buf, 3111 uint8* rgb_buf, 3112 int width) { 3113 __asm { 3114 pxor xmm5, xmm5 3115 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 3116 pslld xmm4, 24 3117 mov eax, 0x00100010 3118 movd xmm3, eax 3119 pshufd xmm3, xmm3, 0 3120 mov eax, 0x004a004a // 74 3121 movd xmm2, eax 3122 pshufd xmm2, xmm2,0 3123 mov 
eax, [esp + 4] // Y
3124 mov edx, [esp + 8] // rgb
3125 mov ecx, [esp + 12] // width
3126
3127 align 4
3128 convertloop:
3129 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
3130 movq xmm0, qword ptr [eax]
3131 lea eax, [eax + 8]
3132 punpcklbw xmm0, xmm5 // 0.Y
3133 psubusw xmm0, xmm3
3134 pmullw xmm0, xmm2
3135 psrlw xmm0, 6
3136 packuswb xmm0, xmm0 // G
3137
3138 // Step 2: Weave into ARGB
3139 punpcklbw xmm0, xmm0 // GG
3140 movdqa xmm1, xmm0
3141 punpcklwd xmm0, xmm0 // BGRA first 4 pixels
3142 punpckhwd xmm1, xmm1 // BGRA next 4 pixels
3143 por xmm0, xmm4
3144 por xmm1, xmm4
3145 movdqa [edx], xmm0
3146 movdqa [edx + 16], xmm1
3147 lea edx, [edx + 32]
3148 sub ecx, 8
3149 jg convertloop
3150
3151 ret
3152 }
3153 }
3154 #endif // HAS_YTOARGBROW_SSE2
3155
3156 #ifdef HAS_MIRRORROW_SSSE3
3157 // Shuffle table for reversing the bytes.
3158 static const uvec8 kShuffleMirror = {
3159 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3160 };
3161
3162 __declspec(naked) __declspec(align(16))
3163 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
3164 __asm {
3165 mov eax, [esp + 4] // src
3166 mov edx, [esp + 8] // dst
3167 mov ecx, [esp + 12] // width
3168 movdqa xmm5, kShuffleMirror
3169 lea eax, [eax - 16]
3170
3171 align 4
3172 convertloop:
3173 movdqa xmm0, [eax + ecx]
3174 pshufb xmm0, xmm5
3175 sub ecx, 16
3176 movdqa [edx], xmm0
3177 lea edx, [edx + 16]
3178 jg convertloop
3179 ret
3180 }
3181 }
3182 #endif // HAS_MIRRORROW_SSSE3
3183
3184 #ifdef HAS_MIRRORROW_AVX2
3185 // Shuffle table for reversing the bytes.
3186 static const ulvec8 kShuffleMirror_AVX2 = {
3187 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
3188 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
3189 };
3190
3191 __declspec(naked) __declspec(align(16))
3192 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
3193 __asm {
3194 mov eax, [esp + 4] // src
3195 mov edx, [esp + 8] // dst
3196 mov ecx, [esp + 12] // width
3197 vmovdqa ymm5, kShuffleMirror_AVX2
3198 lea eax, [eax - 32]
3199
3200 align 4
3201 convertloop:
3202 vmovdqu ymm0, [eax + ecx]
3203 vpshufb ymm0, ymm0, ymm5
3204 vpermq ymm0, ymm0, 0x4e // swap high and low halves
3205 sub ecx, 32
3206 vmovdqu [edx], ymm0
3207 lea edx, [edx + 32]
3208 jg convertloop
3209 vzeroupper
3210 ret
3211 }
3212 }
3213 #endif // HAS_MIRRORROW_AVX2
3214
3215 #ifdef HAS_MIRRORROW_SSE2
3216 // The SSE2 version uses movdqu, so it can be used on unaligned buffers when
3217 // the SSSE3 version cannot.
3218 __declspec(naked) __declspec(align(16))
3219 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
3220 __asm {
3221 mov eax, [esp + 4] // src
3222 mov edx, [esp + 8] // dst
3223 mov ecx, [esp + 12] // width
3224 lea eax, [eax - 16]
3225
3226 align 4
3227 convertloop:
3228 movdqu xmm0, [eax + ecx]
3229 movdqa xmm1, xmm0 // swap bytes
3230 psllw xmm0, 8
3231 psrlw xmm1, 8
3232 por xmm0, xmm1
3233 pshuflw xmm0, xmm0, 0x1b // swap words
3234 pshufhw xmm0, xmm0, 0x1b
3235 pshufd xmm0, xmm0, 0x4e // swap qwords
3236 sub ecx, 16
3237 movdqu [edx], xmm0
3238 lea edx, [edx + 16]
3239 jg convertloop
3240 ret
3241 }
3242 }
3243 #endif // HAS_MIRRORROW_SSE2
3244
3245 #ifdef HAS_MIRRORROW_UV_SSSE3
3246 // Shuffle table for reversing the bytes of UV channels.
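// The low 8 result bytes collect the even (U) samples in reverse order and
// the high 8 bytes collect the odd (V) samples, so one pshufb both mirrors
// and de-interleaves.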
3247 static const uvec8 kShuffleMirrorUV = { 3248 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 3249 }; 3250 3251 __declspec(naked) __declspec(align(16)) 3252 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, 3253 int width) { 3254 __asm { 3255 push edi 3256 mov eax, [esp + 4 + 4] // src 3257 mov edx, [esp + 4 + 8] // dst_u 3258 mov edi, [esp + 4 + 12] // dst_v 3259 mov ecx, [esp + 4 + 16] // width 3260 movdqa xmm1, kShuffleMirrorUV 3261 lea eax, [eax + ecx * 2 - 16] 3262 sub edi, edx 3263 3264 align 4 3265 convertloop: 3266 movdqa xmm0, [eax] 3267 lea eax, [eax - 16] 3268 pshufb xmm0, xmm1 3269 sub ecx, 8 3270 movlpd qword ptr [edx], xmm0 3271 movhpd qword ptr [edx + edi], xmm0 3272 lea edx, [edx + 8] 3273 jg convertloop 3274 3275 pop edi 3276 ret 3277 } 3278 } 3279 #endif // HAS_MIRRORROW_UV_SSSE3 3280 3281 #ifdef HAS_ARGBMIRRORROW_SSSE3 3282 // Shuffle table for reversing the bytes. 3283 static const uvec8 kARGBShuffleMirror = { 3284 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u 3285 }; 3286 3287 __declspec(naked) __declspec(align(16)) 3288 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 3289 __asm { 3290 mov eax, [esp + 4] // src 3291 mov edx, [esp + 8] // dst 3292 mov ecx, [esp + 12] // width 3293 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. 3294 movdqa xmm5, kARGBShuffleMirror 3295 3296 align 4 3297 convertloop: 3298 movdqa xmm0, [eax] 3299 lea eax, [eax - 16] 3300 pshufb xmm0, xmm5 3301 sub ecx, 4 3302 movdqa [edx], xmm0 3303 lea edx, [edx + 16] 3304 jg convertloop 3305 ret 3306 } 3307 } 3308 #endif // HAS_ARGBMIRRORROW_SSSE3 3309 3310 #ifdef HAS_ARGBMIRRORROW_AVX2 3311 // Shuffle table for reversing the bytes. 3312 static const ulvec32 kARGBShuffleMirror_AVX2 = { 3313 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 3314 }; 3315 3316 __declspec(naked) __declspec(align(16)) 3317 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 3318 __asm { 3319 mov eax, [esp + 4] // src 3320 mov edx, [esp + 8] // dst 3321 mov ecx, [esp + 12] // width 3322 lea eax, [eax - 32] 3323 vmovdqa ymm5, kARGBShuffleMirror_AVX2 3324 3325 align 4 3326 convertloop: 3327 vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order 3328 sub ecx, 8 3329 vmovdqu [edx], ymm0 3330 lea edx, [edx + 32] 3331 jg convertloop 3332 vzeroupper 3333 ret 3334 } 3335 } 3336 #endif // HAS_ARGBMIRRORROW_AVX2 3337 3338 #ifdef HAS_SPLITUVROW_SSE2 3339 __declspec(naked) __declspec(align(16)) 3340 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 3341 __asm { 3342 push edi 3343 mov eax, [esp + 4 + 4] // src_uv 3344 mov edx, [esp + 4 + 8] // dst_u 3345 mov edi, [esp + 4 + 12] // dst_v 3346 mov ecx, [esp + 4 + 16] // pix 3347 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3348 psrlw xmm5, 8 3349 sub edi, edx 3350 3351 align 4 3352 convertloop: 3353 movdqa xmm0, [eax] 3354 movdqa xmm1, [eax + 16] 3355 lea eax, [eax + 32] 3356 movdqa xmm2, xmm0 3357 movdqa xmm3, xmm1 3358 pand xmm0, xmm5 // even bytes 3359 pand xmm1, xmm5 3360 packuswb xmm0, xmm1 3361 psrlw xmm2, 8 // odd bytes 3362 psrlw xmm3, 8 3363 packuswb xmm2, xmm3 3364 movdqa [edx], xmm0 3365 movdqa [edx + edi], xmm2 3366 lea edx, [edx + 16] 3367 sub ecx, 16 3368 jg convertloop 3369 3370 pop edi 3371 ret 3372 } 3373 } 3374 3375 __declspec(naked) __declspec(align(16)) 3376 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, 3377 int pix) { 3378 __asm { 3379 push edi 3380 mov eax, [esp + 4 + 4] // src_uv 3381 mov edx, [esp + 4 + 8] // dst_u 
3382 mov edi, [esp + 4 + 12] // dst_v 3383 mov ecx, [esp + 4 + 16] // pix 3384 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3385 psrlw xmm5, 8 3386 sub edi, edx 3387 3388 align 4 3389 convertloop: 3390 movdqu xmm0, [eax] 3391 movdqu xmm1, [eax + 16] 3392 lea eax, [eax + 32] 3393 movdqa xmm2, xmm0 3394 movdqa xmm3, xmm1 3395 pand xmm0, xmm5 // even bytes 3396 pand xmm1, xmm5 3397 packuswb xmm0, xmm1 3398 psrlw xmm2, 8 // odd bytes 3399 psrlw xmm3, 8 3400 packuswb xmm2, xmm3 3401 movdqu [edx], xmm0 3402 movdqu [edx + edi], xmm2 3403 lea edx, [edx + 16] 3404 sub ecx, 16 3405 jg convertloop 3406 3407 pop edi 3408 ret 3409 } 3410 } 3411 #endif // HAS_SPLITUVROW_SSE2 3412 3413 #ifdef HAS_SPLITUVROW_AVX2 3414 __declspec(naked) __declspec(align(16)) 3415 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 3416 __asm { 3417 push edi 3418 mov eax, [esp + 4 + 4] // src_uv 3419 mov edx, [esp + 4 + 8] // dst_u 3420 mov edi, [esp + 4 + 12] // dst_v 3421 mov ecx, [esp + 4 + 16] // pix 3422 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3423 vpsrlw ymm5, ymm5, 8 3424 sub edi, edx 3425 3426 align 4 3427 convertloop: 3428 vmovdqu ymm0, [eax] 3429 vmovdqu ymm1, [eax + 32] 3430 lea eax, [eax + 64] 3431 vpsrlw ymm2, ymm0, 8 // odd bytes 3432 vpsrlw ymm3, ymm1, 8 3433 vpand ymm0, ymm0, ymm5 // even bytes 3434 vpand ymm1, ymm1, ymm5 3435 vpackuswb ymm0, ymm0, ymm1 3436 vpackuswb ymm2, ymm2, ymm3 3437 vpermq ymm0, ymm0, 0xd8 3438 vpermq ymm2, ymm2, 0xd8 3439 vmovdqu [edx], ymm0 3440 vmovdqu [edx + edi], ymm2 3441 lea edx, [edx + 32] 3442 sub ecx, 32 3443 jg convertloop 3444 3445 pop edi 3446 vzeroupper 3447 ret 3448 } 3449 } 3450 #endif // HAS_SPLITUVROW_AVX2 3451 3452 #ifdef HAS_MERGEUVROW_SSE2 3453 __declspec(naked) __declspec(align(16)) 3454 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 3455 int width) { 3456 __asm { 3457 push edi 3458 mov eax, [esp + 4 + 4] // src_u 3459 mov edx, [esp + 4 + 8] // src_v 3460 mov edi, [esp + 4 + 12] // dst_uv 3461 mov ecx, [esp + 4 + 16] // width 3462 sub edx, eax 3463 3464 align 4 3465 convertloop: 3466 movdqa xmm0, [eax] // read 16 U's 3467 movdqa xmm1, [eax + edx] // and 16 V's 3468 lea eax, [eax + 16] 3469 movdqa xmm2, xmm0 3470 punpcklbw xmm0, xmm1 // first 8 UV pairs 3471 punpckhbw xmm2, xmm1 // next 8 UV pairs 3472 movdqa [edi], xmm0 3473 movdqa [edi + 16], xmm2 3474 lea edi, [edi + 32] 3475 sub ecx, 16 3476 jg convertloop 3477 3478 pop edi 3479 ret 3480 } 3481 } 3482 3483 __declspec(naked) __declspec(align(16)) 3484 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, 3485 uint8* dst_uv, int width) { 3486 __asm { 3487 push edi 3488 mov eax, [esp + 4 + 4] // src_u 3489 mov edx, [esp + 4 + 8] // src_v 3490 mov edi, [esp + 4 + 12] // dst_uv 3491 mov ecx, [esp + 4 + 16] // width 3492 sub edx, eax 3493 3494 align 4 3495 convertloop: 3496 movdqu xmm0, [eax] // read 16 U's 3497 movdqu xmm1, [eax + edx] // and 16 V's 3498 lea eax, [eax + 16] 3499 movdqa xmm2, xmm0 3500 punpcklbw xmm0, xmm1 // first 8 UV pairs 3501 punpckhbw xmm2, xmm1 // next 8 UV pairs 3502 movdqu [edi], xmm0 3503 movdqu [edi + 16], xmm2 3504 lea edi, [edi + 32] 3505 sub ecx, 16 3506 jg convertloop 3507 3508 pop edi 3509 ret 3510 } 3511 } 3512 #endif // HAS_MERGEUVROW_SSE2 3513 3514 #ifdef HAS_MERGEUVROW_AVX2 3515 __declspec(naked) __declspec(align(16)) 3516 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 3517 int width) { 3518 __asm { 3519 push edi 3520 mov eax, [esp + 4 + 4] // src_u 3521 mov 
edx, [esp + 4 + 8] // src_v
3522 mov edi, [esp + 4 + 12] // dst_uv
3523 mov ecx, [esp + 4 + 16] // width
3524 sub edx, eax
3525
3526 align 4
3527 convertloop:
3528 vmovdqu ymm0, [eax] // read 32 U's
3529 vmovdqu ymm1, [eax + edx] // and 32 V's
3530 lea eax, [eax + 32]
3531 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
3532 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
3533 vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
3534 vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
3535 vmovdqu [edi], ymm1
3536 vmovdqu [edi + 32], ymm2
3537 lea edi, [edi + 64]
3538 sub ecx, 32
3539 jg convertloop
3540
3541 pop edi
3542 vzeroupper
3543 ret
3544 }
3545 }
3546 #endif // HAS_MERGEUVROW_AVX2
3547
3548 #ifdef HAS_COPYROW_SSE2
3549 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
3550 __declspec(naked) __declspec(align(16))
3551 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
3552 __asm {
3553 mov eax, [esp + 4] // src
3554 mov edx, [esp + 8] // dst
3555 mov ecx, [esp + 12] // count
3556
3557 align 4
3558 convertloop:
3559 movdqa xmm0, [eax]
3560 movdqa xmm1, [eax + 16]
3561 lea eax, [eax + 32]
3562 movdqa [edx], xmm0
3563 movdqa [edx + 16], xmm1
3564 lea edx, [edx + 32]
3565 sub ecx, 32
3566 jg convertloop
3567 ret
3568 }
3569 }
3570 #endif // HAS_COPYROW_SSE2
3571
3572 // Unaligned; handles any 'count' (multiple of 1 byte).
3573 __declspec(naked) __declspec(align(16))
3574 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
3575 __asm {
3576 mov eax, esi
3577 mov edx, edi
3578 mov esi, [esp + 4] // src
3579 mov edi, [esp + 8] // dst
3580 mov ecx, [esp + 12] // count
3581 rep movsb
3582 mov edi, edx
3583 mov esi, eax
3584 ret
3585 }
3586 }
3587
3588 #ifdef HAS_COPYROW_X86
3589 __declspec(naked) __declspec(align(16))
3590 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
3591 __asm {
3592 mov eax, esi
3593 mov edx, edi
3594 mov esi, [esp + 4] // src
3595 mov edi, [esp + 8] // dst
3596 mov ecx, [esp + 12] // count
3597 shr ecx, 2
3598 rep movsd
3599 mov edi, edx
3600 mov esi, eax
3601 ret
3602 }
3603 }
3604 #endif // HAS_COPYROW_X86
3605
3606 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
3607 // width in pixels
3608 __declspec(naked) __declspec(align(16))
3609 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
3610 __asm {
3611 mov eax, [esp + 4] // src
3612 mov edx, [esp + 8] // dst
3613 mov ecx, [esp + 12] // count
3614 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
3615 pslld xmm0, 24
3616 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
3617 psrld xmm1, 8
3618
3619 align 4
3620 convertloop:
3621 movdqa xmm2, [eax]
3622 movdqa xmm3, [eax + 16]
3623 lea eax, [eax + 32]
3624 movdqa xmm4, [edx]
3625 movdqa xmm5, [edx + 16]
3626 pand xmm2, xmm0
3627 pand xmm3, xmm0
3628 pand xmm4, xmm1
3629 pand xmm5, xmm1
3630 por xmm2, xmm4
3631 por xmm3, xmm5
3632 movdqa [edx], xmm2
3633 movdqa [edx + 16], xmm3
3634 lea edx, [edx + 32]
3635 sub ecx, 8
3636 jg convertloop
3637
3638 ret
3639 }
3640 }
3641 #endif // HAS_ARGBCOPYALPHAROW_SSE2
3642
3643 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
3644 // width in pixels
3645 __declspec(naked) __declspec(align(16))
3646 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3647 __asm {
3648 mov eax, [esp + 4] // src
3649 mov edx, [esp + 8] // dst
3650 mov ecx, [esp + 12] // count
3651 vpcmpeqb ymm0, ymm0, ymm0
3652 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
3653
3654 align 4
3655 convertloop:
3656 vmovdqu ymm1, [eax]
3657 vmovdqu ymm2,
[eax + 32] 3658 lea eax, [eax + 64] 3659 vpblendvb ymm1, ymm1, [edx], ymm0 3660 vpblendvb ymm2, ymm2, [edx + 32], ymm0 3661 vmovdqu [edx], ymm1 3662 vmovdqu [edx + 32], ymm2 3663 lea edx, [edx + 64] 3664 sub ecx, 16 3665 jg convertloop 3666 3667 vzeroupper 3668 ret 3669 } 3670 } 3671 #endif // HAS_ARGBCOPYALPHAROW_AVX2 3672 3673 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 3674 // width in pixels 3675 __declspec(naked) __declspec(align(16)) 3676 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { 3677 __asm { 3678 mov eax, [esp + 4] // src 3679 mov edx, [esp + 8] // dst 3680 mov ecx, [esp + 12] // count 3681 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 3682 pslld xmm0, 24 3683 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff 3684 psrld xmm1, 8 3685 3686 align 4 3687 convertloop: 3688 movq xmm2, qword ptr [eax] // 8 Y's 3689 lea eax, [eax + 8] 3690 punpcklbw xmm2, xmm2 3691 punpckhwd xmm3, xmm2 3692 punpcklwd xmm2, xmm2 3693 movdqa xmm4, [edx] 3694 movdqa xmm5, [edx + 16] 3695 pand xmm2, xmm0 3696 pand xmm3, xmm0 3697 pand xmm4, xmm1 3698 pand xmm5, xmm1 3699 por xmm2, xmm4 3700 por xmm3, xmm5 3701 movdqa [edx], xmm2 3702 movdqa [edx + 16], xmm3 3703 lea edx, [edx + 32] 3704 sub ecx, 8 3705 jg convertloop 3706 3707 ret 3708 } 3709 } 3710 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 3711 3712 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 3713 // width in pixels 3714 __declspec(naked) __declspec(align(16)) 3715 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { 3716 __asm { 3717 mov eax, [esp + 4] // src 3718 mov edx, [esp + 8] // dst 3719 mov ecx, [esp + 12] // count 3720 vpcmpeqb ymm0, ymm0, ymm0 3721 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff 3722 3723 align 4 3724 convertloop: 3725 vpmovzxbd ymm1, qword ptr [eax] 3726 vpmovzxbd ymm2, qword ptr [eax + 8] 3727 lea eax, [eax + 16] 3728 vpslld ymm1, ymm1, 24 3729 vpslld ymm2, ymm2, 24 3730 vpblendvb ymm1, ymm1, [edx], ymm0 3731 vpblendvb ymm2, ymm2, [edx + 32], ymm0 3732 vmovdqu [edx], ymm1 3733 vmovdqu [edx + 32], ymm2 3734 lea edx, [edx + 64] 3735 sub ecx, 16 3736 jg convertloop 3737 3738 vzeroupper 3739 ret 3740 } 3741 } 3742 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 3743 3744 #ifdef HAS_SETROW_X86 3745 // SetRow8 writes 'count' bytes using a 32 bit value repeated. 3746 __declspec(naked) __declspec(align(16)) 3747 void SetRow_X86(uint8* dst, uint32 v32, int count) { 3748 __asm { 3749 mov edx, edi 3750 mov edi, [esp + 4] // dst 3751 mov eax, [esp + 8] // v32 3752 mov ecx, [esp + 12] // count 3753 shr ecx, 2 3754 rep stosd 3755 mov edi, edx 3756 ret 3757 } 3758 } 3759 3760 // SetRow32 writes 'count' words using a 32 bit value repeated. 
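// Each iteration stores 'width' dwords with rep stosd, then adds
// dst_stride - width * 4 to edi to reach the start of the next row.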
3761 __declspec(naked) __declspec(align(16)) 3762 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, 3763 int dst_stride, int height) { 3764 __asm { 3765 push esi 3766 push edi 3767 push ebp 3768 mov edi, [esp + 12 + 4] // dst 3769 mov eax, [esp + 12 + 8] // v32 3770 mov ebp, [esp + 12 + 12] // width 3771 mov edx, [esp + 12 + 16] // dst_stride 3772 mov esi, [esp + 12 + 20] // height 3773 lea ecx, [ebp * 4] 3774 sub edx, ecx // stride - width * 4 3775 3776 align 4 3777 convertloop: 3778 mov ecx, ebp 3779 rep stosd 3780 add edi, edx 3781 sub esi, 1 3782 jg convertloop 3783 3784 pop ebp 3785 pop edi 3786 pop esi 3787 ret 3788 } 3789 } 3790 #endif // HAS_SETROW_X86 3791 3792 #ifdef HAS_YUY2TOYROW_AVX2 3793 __declspec(naked) __declspec(align(16)) 3794 void YUY2ToYRow_AVX2(const uint8* src_yuy2, 3795 uint8* dst_y, int pix) { 3796 __asm { 3797 mov eax, [esp + 4] // src_yuy2 3798 mov edx, [esp + 8] // dst_y 3799 mov ecx, [esp + 12] // pix 3800 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3801 vpsrlw ymm5, ymm5, 8 3802 3803 align 4 3804 convertloop: 3805 vmovdqu ymm0, [eax] 3806 vmovdqu ymm1, [eax + 32] 3807 lea eax, [eax + 64] 3808 vpand ymm0, ymm0, ymm5 // even bytes are Y 3809 vpand ymm1, ymm1, ymm5 3810 vpackuswb ymm0, ymm0, ymm1 // mutates. 3811 vpermq ymm0, ymm0, 0xd8 3812 sub ecx, 32 3813 vmovdqu [edx], ymm0 3814 lea edx, [edx + 32] 3815 jg convertloop 3816 vzeroupper 3817 ret 3818 } 3819 } 3820 3821 __declspec(naked) __declspec(align(16)) 3822 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, 3823 uint8* dst_u, uint8* dst_v, int pix) { 3824 __asm { 3825 push esi 3826 push edi 3827 mov eax, [esp + 8 + 4] // src_yuy2 3828 mov esi, [esp + 8 + 8] // stride_yuy2 3829 mov edx, [esp + 8 + 12] // dst_u 3830 mov edi, [esp + 8 + 16] // dst_v 3831 mov ecx, [esp + 8 + 20] // pix 3832 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3833 vpsrlw ymm5, ymm5, 8 3834 sub edi, edx 3835 3836 align 4 3837 convertloop: 3838 vmovdqu ymm0, [eax] 3839 vmovdqu ymm1, [eax + 32] 3840 vpavgb ymm0, ymm0, [eax + esi] 3841 vpavgb ymm1, ymm1, [eax + esi + 32] 3842 lea eax, [eax + 64] 3843 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV 3844 vpsrlw ymm1, ymm1, 8 3845 vpackuswb ymm0, ymm0, ymm1 // mutates. 3846 vpermq ymm0, ymm0, 0xd8 3847 vpand ymm1, ymm0, ymm5 // U 3848 vpsrlw ymm0, ymm0, 8 // V 3849 vpackuswb ymm1, ymm1, ymm1 // mutates. 3850 vpackuswb ymm0, ymm0, ymm0 // mutates. 3851 vpermq ymm1, ymm1, 0xd8 3852 vpermq ymm0, ymm0, 0xd8 3853 vextractf128 [edx], ymm1, 0 // U 3854 vextractf128 [edx + edi], ymm0, 0 // V 3855 lea edx, [edx + 16] 3856 sub ecx, 32 3857 jg convertloop 3858 3859 pop edi 3860 pop esi 3861 vzeroupper 3862 ret 3863 } 3864 } 3865 3866 __declspec(naked) __declspec(align(16)) 3867 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, 3868 uint8* dst_u, uint8* dst_v, int pix) { 3869 __asm { 3870 push edi 3871 mov eax, [esp + 4 + 4] // src_yuy2 3872 mov edx, [esp + 4 + 8] // dst_u 3873 mov edi, [esp + 4 + 12] // dst_v 3874 mov ecx, [esp + 4 + 16] // pix 3875 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3876 vpsrlw ymm5, ymm5, 8 3877 sub edi, edx 3878 3879 align 4 3880 convertloop: 3881 vmovdqu ymm0, [eax] 3882 vmovdqu ymm1, [eax + 32] 3883 lea eax, [eax + 64] 3884 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV 3885 vpsrlw ymm1, ymm1, 8 3886 vpackuswb ymm0, ymm0, ymm1 // mutates. 3887 vpermq ymm0, ymm0, 0xd8 3888 vpand ymm1, ymm0, ymm5 // U 3889 vpsrlw ymm0, ymm0, 8 // V 3890 vpackuswb ymm1, ymm1, ymm1 // mutates. 3891 vpackuswb ymm0, ymm0, ymm0 // mutates. 
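// vpackuswb packs within each 128-bit lane, leaving the 16 packed bytes
// split across lanes; vpermq 0xd8 below gathers qwords 0 and 2 into the low
// half so a single vextractf128 can store them.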
3892 vpermq ymm1, ymm1, 0xd8
3893 vpermq ymm0, ymm0, 0xd8
3894 vextractf128 [edx], ymm1, 0 // U
3895 vextractf128 [edx + edi], ymm0, 0 // V
3896 lea edx, [edx + 16]
3897 sub ecx, 32
3898 jg convertloop
3899
3900 pop edi
3901 vzeroupper
3902 ret
3903 }
3904 }
3905
3906 __declspec(naked) __declspec(align(16))
3907 void UYVYToYRow_AVX2(const uint8* src_uyvy,
3908 uint8* dst_y, int pix) {
3909 __asm {
3910 mov eax, [esp + 4] // src_uyvy
3911 mov edx, [esp + 8] // dst_y
3912 mov ecx, [esp + 12] // pix
3913
3914 align 4
3915 convertloop:
3916 vmovdqu ymm0, [eax]
3917 vmovdqu ymm1, [eax + 32]
3918 lea eax, [eax + 64]
3919 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
3920 vpsrlw ymm1, ymm1, 8
3921 vpackuswb ymm0, ymm0, ymm1 // mutates.
3922 vpermq ymm0, ymm0, 0xd8
3923 sub ecx, 32
3924 vmovdqu [edx], ymm0
3925 lea edx, [edx + 32]
3926 jg convertloop
3927 vzeroupper
3928 ret
3929 }
3930 }
3931
3932 __declspec(naked) __declspec(align(16))
3933 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
3934 uint8* dst_u, uint8* dst_v, int pix) {
3935 __asm {
3936 push esi
3937 push edi
3938 mov eax, [esp + 8 + 4] // src_uyvy
3939 mov esi, [esp + 8 + 8] // stride_uyvy
3940 mov edx, [esp + 8 + 12] // dst_u
3941 mov edi, [esp + 8 + 16] // dst_v
3942 mov ecx, [esp + 8 + 20] // pix
3943 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3944 vpsrlw ymm5, ymm5, 8
3945 sub edi, edx
3946
3947 align 4
3948 convertloop:
3949 vmovdqu ymm0, [eax]
3950 vmovdqu ymm1, [eax + 32]
3951 vpavgb ymm0, ymm0, [eax + esi]
3952 vpavgb ymm1, ymm1, [eax + esi + 32]
3953 lea eax, [eax + 64]
3954 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3955 vpand ymm1, ymm1, ymm5
3956 vpackuswb ymm0, ymm0, ymm1 // mutates.
3957 vpermq ymm0, ymm0, 0xd8
3958 vpand ymm1, ymm0, ymm5 // U
3959 vpsrlw ymm0, ymm0, 8 // V
3960 vpackuswb ymm1, ymm1, ymm1 // mutates.
3961 vpackuswb ymm0, ymm0, ymm0 // mutates.
3962 vpermq ymm1, ymm1, 0xd8
3963 vpermq ymm0, ymm0, 0xd8
3964 vextractf128 [edx], ymm1, 0 // U
3965 vextractf128 [edx + edi], ymm0, 0 // V
3966 lea edx, [edx + 16]
3967 sub ecx, 32
3968 jg convertloop
3969
3970 pop edi
3971 pop esi
3972 vzeroupper
3973 ret
3974 }
3975 }
3976
3977 __declspec(naked) __declspec(align(16))
3978 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3979 uint8* dst_u, uint8* dst_v, int pix) {
3980 __asm {
3981 push edi
3982 mov eax, [esp + 4 + 4] // src_uyvy
3983 mov edx, [esp + 4 + 8] // dst_u
3984 mov edi, [esp + 4 + 12] // dst_v
3985 mov ecx, [esp + 4 + 16] // pix
3986 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
3987 vpsrlw ymm5, ymm5, 8
3988 sub edi, edx
3989
3990 align 4
3991 convertloop:
3992 vmovdqu ymm0, [eax]
3993 vmovdqu ymm1, [eax + 32]
3994 lea eax, [eax + 64]
3995 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
3996 vpand ymm1, ymm1, ymm5
3997 vpackuswb ymm0, ymm0, ymm1 // mutates.
3998 vpermq ymm0, ymm0, 0xd8
3999 vpand ymm1, ymm0, ymm5 // U
4000 vpsrlw ymm0, ymm0, 8 // V
4001 vpackuswb ymm1, ymm1, ymm1 // mutates.
4002 vpackuswb ymm0, ymm0, ymm0 // mutates.
4003 vpermq ymm1, ymm1, 0xd8 4004 vpermq ymm0, ymm0, 0xd8 4005 vextractf128 [edx], ymm1, 0 // U 4006 vextractf128 [edx + edi], ymm0, 0 // V 4007 lea edx, [edx + 16] 4008 sub ecx, 32 4009 jg convertloop 4010 4011 pop edi 4012 vzeroupper 4013 ret 4014 } 4015 } 4016 #endif // HAS_YUY2TOYROW_AVX2 4017 4018 #ifdef HAS_YUY2TOYROW_SSE2 4019 __declspec(naked) __declspec(align(16)) 4020 void YUY2ToYRow_SSE2(const uint8* src_yuy2, 4021 uint8* dst_y, int pix) { 4022 __asm { 4023 mov eax, [esp + 4] // src_yuy2 4024 mov edx, [esp + 8] // dst_y 4025 mov ecx, [esp + 12] // pix 4026 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4027 psrlw xmm5, 8 4028 4029 align 4 4030 convertloop: 4031 movdqa xmm0, [eax] 4032 movdqa xmm1, [eax + 16] 4033 lea eax, [eax + 32] 4034 pand xmm0, xmm5 // even bytes are Y 4035 pand xmm1, xmm5 4036 packuswb xmm0, xmm1 4037 sub ecx, 16 4038 movdqa [edx], xmm0 4039 lea edx, [edx + 16] 4040 jg convertloop 4041 ret 4042 } 4043 } 4044 4045 __declspec(naked) __declspec(align(16)) 4046 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 4047 uint8* dst_u, uint8* dst_v, int pix) { 4048 __asm { 4049 push esi 4050 push edi 4051 mov eax, [esp + 8 + 4] // src_yuy2 4052 mov esi, [esp + 8 + 8] // stride_yuy2 4053 mov edx, [esp + 8 + 12] // dst_u 4054 mov edi, [esp + 8 + 16] // dst_v 4055 mov ecx, [esp + 8 + 20] // pix 4056 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4057 psrlw xmm5, 8 4058 sub edi, edx 4059 4060 align 4 4061 convertloop: 4062 movdqa xmm0, [eax] 4063 movdqa xmm1, [eax + 16] 4064 movdqa xmm2, [eax + esi] 4065 movdqa xmm3, [eax + esi + 16] 4066 lea eax, [eax + 32] 4067 pavgb xmm0, xmm2 4068 pavgb xmm1, xmm3 4069 psrlw xmm0, 8 // YUYV -> UVUV 4070 psrlw xmm1, 8 4071 packuswb xmm0, xmm1 4072 movdqa xmm1, xmm0 4073 pand xmm0, xmm5 // U 4074 packuswb xmm0, xmm0 4075 psrlw xmm1, 8 // V 4076 packuswb xmm1, xmm1 4077 movq qword ptr [edx], xmm0 4078 movq qword ptr [edx + edi], xmm1 4079 lea edx, [edx + 8] 4080 sub ecx, 16 4081 jg convertloop 4082 4083 pop edi 4084 pop esi 4085 ret 4086 } 4087 } 4088 4089 __declspec(naked) __declspec(align(16)) 4090 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 4091 uint8* dst_u, uint8* dst_v, int pix) { 4092 __asm { 4093 push edi 4094 mov eax, [esp + 4 + 4] // src_yuy2 4095 mov edx, [esp + 4 + 8] // dst_u 4096 mov edi, [esp + 4 + 12] // dst_v 4097 mov ecx, [esp + 4 + 16] // pix 4098 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4099 psrlw xmm5, 8 4100 sub edi, edx 4101 4102 align 4 4103 convertloop: 4104 movdqa xmm0, [eax] 4105 movdqa xmm1, [eax + 16] 4106 lea eax, [eax + 32] 4107 psrlw xmm0, 8 // YUYV -> UVUV 4108 psrlw xmm1, 8 4109 packuswb xmm0, xmm1 4110 movdqa xmm1, xmm0 4111 pand xmm0, xmm5 // U 4112 packuswb xmm0, xmm0 4113 psrlw xmm1, 8 // V 4114 packuswb xmm1, xmm1 4115 movq qword ptr [edx], xmm0 4116 movq qword ptr [edx + edi], xmm1 4117 lea edx, [edx + 8] 4118 sub ecx, 16 4119 jg convertloop 4120 4121 pop edi 4122 ret 4123 } 4124 } 4125 4126 __declspec(naked) __declspec(align(16)) 4127 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, 4128 uint8* dst_y, int pix) { 4129 __asm { 4130 mov eax, [esp + 4] // src_yuy2 4131 mov edx, [esp + 8] // dst_y 4132 mov ecx, [esp + 12] // pix 4133 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4134 psrlw xmm5, 8 4135 4136 align 4 4137 convertloop: 4138 movdqu xmm0, [eax] 4139 movdqu xmm1, [eax + 16] 4140 lea eax, [eax + 32] 4141 pand xmm0, xmm5 // even bytes are Y 4142 pand xmm1, xmm5 4143 packuswb xmm0, xmm1 4144 sub ecx, 16 4145 movdqu [edx], xmm0 4146 lea edx, [edx + 16] 4147 jg 
convertloop 4148 ret 4149 } 4150 } 4151 4152 __declspec(naked) __declspec(align(16)) 4153 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, 4154 uint8* dst_u, uint8* dst_v, int pix) { 4155 __asm { 4156 push esi 4157 push edi 4158 mov eax, [esp + 8 + 4] // src_yuy2 4159 mov esi, [esp + 8 + 8] // stride_yuy2 4160 mov edx, [esp + 8 + 12] // dst_u 4161 mov edi, [esp + 8 + 16] // dst_v 4162 mov ecx, [esp + 8 + 20] // pix 4163 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4164 psrlw xmm5, 8 4165 sub edi, edx 4166 4167 align 4 4168 convertloop: 4169 movdqu xmm0, [eax] 4170 movdqu xmm1, [eax + 16] 4171 movdqu xmm2, [eax + esi] 4172 movdqu xmm3, [eax + esi + 16] 4173 lea eax, [eax + 32] 4174 pavgb xmm0, xmm2 4175 pavgb xmm1, xmm3 4176 psrlw xmm0, 8 // YUYV -> UVUV 4177 psrlw xmm1, 8 4178 packuswb xmm0, xmm1 4179 movdqa xmm1, xmm0 4180 pand xmm0, xmm5 // U 4181 packuswb xmm0, xmm0 4182 psrlw xmm1, 8 // V 4183 packuswb xmm1, xmm1 4184 movq qword ptr [edx], xmm0 4185 movq qword ptr [edx + edi], xmm1 4186 lea edx, [edx + 8] 4187 sub ecx, 16 4188 jg convertloop 4189 4190 pop edi 4191 pop esi 4192 ret 4193 } 4194 } 4195 4196 __declspec(naked) __declspec(align(16)) 4197 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, 4198 uint8* dst_u, uint8* dst_v, int pix) { 4199 __asm { 4200 push edi 4201 mov eax, [esp + 4 + 4] // src_yuy2 4202 mov edx, [esp + 4 + 8] // dst_u 4203 mov edi, [esp + 4 + 12] // dst_v 4204 mov ecx, [esp + 4 + 16] // pix 4205 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4206 psrlw xmm5, 8 4207 sub edi, edx 4208 4209 align 4 4210 convertloop: 4211 movdqu xmm0, [eax] 4212 movdqu xmm1, [eax + 16] 4213 lea eax, [eax + 32] 4214 psrlw xmm0, 8 // YUYV -> UVUV 4215 psrlw xmm1, 8 4216 packuswb xmm0, xmm1 4217 movdqa xmm1, xmm0 4218 pand xmm0, xmm5 // U 4219 packuswb xmm0, xmm0 4220 psrlw xmm1, 8 // V 4221 packuswb xmm1, xmm1 4222 movq qword ptr [edx], xmm0 4223 movq qword ptr [edx + edi], xmm1 4224 lea edx, [edx + 8] 4225 sub ecx, 16 4226 jg convertloop 4227 4228 pop edi 4229 ret 4230 } 4231 } 4232 4233 __declspec(naked) __declspec(align(16)) 4234 void UYVYToYRow_SSE2(const uint8* src_uyvy, 4235 uint8* dst_y, int pix) { 4236 __asm { 4237 mov eax, [esp + 4] // src_uyvy 4238 mov edx, [esp + 8] // dst_y 4239 mov ecx, [esp + 12] // pix 4240 4241 align 4 4242 convertloop: 4243 movdqa xmm0, [eax] 4244 movdqa xmm1, [eax + 16] 4245 lea eax, [eax + 32] 4246 psrlw xmm0, 8 // odd bytes are Y 4247 psrlw xmm1, 8 4248 packuswb xmm0, xmm1 4249 sub ecx, 16 4250 movdqa [edx], xmm0 4251 lea edx, [edx + 16] 4252 jg convertloop 4253 ret 4254 } 4255 } 4256 4257 __declspec(naked) __declspec(align(16)) 4258 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 4259 uint8* dst_u, uint8* dst_v, int pix) { 4260 __asm { 4261 push esi 4262 push edi 4263 mov eax, [esp + 8 + 4] // src_yuy2 4264 mov esi, [esp + 8 + 8] // stride_yuy2 4265 mov edx, [esp + 8 + 12] // dst_u 4266 mov edi, [esp + 8 + 16] // dst_v 4267 mov ecx, [esp + 8 + 20] // pix 4268 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4269 psrlw xmm5, 8 4270 sub edi, edx 4271 4272 align 4 4273 convertloop: 4274 movdqa xmm0, [eax] 4275 movdqa xmm1, [eax + 16] 4276 movdqa xmm2, [eax + esi] 4277 movdqa xmm3, [eax + esi + 16] 4278 lea eax, [eax + 32] 4279 pavgb xmm0, xmm2 4280 pavgb xmm1, xmm3 4281 pand xmm0, xmm5 // UYVY -> UVUV 4282 pand xmm1, xmm5 4283 packuswb xmm0, xmm1 4284 movdqa xmm1, xmm0 4285 pand xmm0, xmm5 // U 4286 packuswb xmm0, xmm0 4287 psrlw xmm1, 8 // V 4288 packuswb xmm1, xmm1 4289 movq qword ptr [edx], xmm0 4290 
movq qword ptr [edx + edi], xmm1
4291 lea edx, [edx + 8]
4292 sub ecx, 16
4293 jg convertloop
4294
4295 pop edi
4296 pop esi
4297 ret
4298 }
4299 }
4300
4301 __declspec(naked) __declspec(align(16))
4302 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
4303 uint8* dst_u, uint8* dst_v, int pix) {
4304 __asm {
4305 push edi
4306 mov eax, [esp + 4 + 4] // src_uyvy
4307 mov edx, [esp + 4 + 8] // dst_u
4308 mov edi, [esp + 4 + 12] // dst_v
4309 mov ecx, [esp + 4 + 16] // pix
4310 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4311 psrlw xmm5, 8
4312 sub edi, edx
4313
4314 align 4
4315 convertloop:
4316 movdqa xmm0, [eax]
4317 movdqa xmm1, [eax + 16]
4318 lea eax, [eax + 32]
4319 pand xmm0, xmm5 // UYVY -> UVUV
4320 pand xmm1, xmm5
4321 packuswb xmm0, xmm1
4322 movdqa xmm1, xmm0
4323 pand xmm0, xmm5 // U
4324 packuswb xmm0, xmm0
4325 psrlw xmm1, 8 // V
4326 packuswb xmm1, xmm1
4327 movq qword ptr [edx], xmm0
4328 movq qword ptr [edx + edi], xmm1
4329 lea edx, [edx + 8]
4330 sub ecx, 16
4331 jg convertloop
4332
4333 pop edi
4334 ret
4335 }
4336 }
4337
4338 __declspec(naked) __declspec(align(16))
4339 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
4340 uint8* dst_y, int pix) {
4341 __asm {
4342 mov eax, [esp + 4] // src_uyvy
4343 mov edx, [esp + 8] // dst_y
4344 mov ecx, [esp + 12] // pix
4345
4346 align 4
4347 convertloop:
4348 movdqu xmm0, [eax]
4349 movdqu xmm1, [eax + 16]
4350 lea eax, [eax + 32]
4351 psrlw xmm0, 8 // odd bytes are Y
4352 psrlw xmm1, 8
4353 packuswb xmm0, xmm1
4354 sub ecx, 16
4355 movdqu [edx], xmm0
4356 lea edx, [edx + 16]
4357 jg convertloop
4358 ret
4359 }
4360 }
4361
4362 __declspec(naked) __declspec(align(16))
4363 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
4364 uint8* dst_u, uint8* dst_v, int pix) {
4365 __asm {
4366 push esi
4367 push edi
4368 mov eax, [esp + 8 + 4] // src_uyvy
4369 mov esi, [esp + 8 + 8] // stride_uyvy
4370 mov edx, [esp + 8 + 12] // dst_u
4371 mov edi, [esp + 8 + 16] // dst_v
4372 mov ecx, [esp + 8 + 20] // pix
4373 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4374 psrlw xmm5, 8
4375 sub edi, edx
4376
4377 align 4
4378 convertloop:
4379 movdqu xmm0, [eax]
4380 movdqu xmm1, [eax + 16]
4381 movdqu xmm2, [eax + esi]
4382 movdqu xmm3, [eax + esi + 16]
4383 lea eax, [eax + 32]
4384 pavgb xmm0, xmm2
4385 pavgb xmm1, xmm3
4386 pand xmm0, xmm5 // UYVY -> UVUV
4387 pand xmm1, xmm5
4388 packuswb xmm0, xmm1
4389 movdqa xmm1, xmm0
4390 pand xmm0, xmm5 // U
4391 packuswb xmm0, xmm0
4392 psrlw xmm1, 8 // V
4393 packuswb xmm1, xmm1
4394 movq qword ptr [edx], xmm0
4395 movq qword ptr [edx + edi], xmm1
4396 lea edx, [edx + 8]
4397 sub ecx, 16
4398 jg convertloop
4399
4400 pop edi
4401 pop esi
4402 ret
4403 }
4404 }
4405
4406 __declspec(naked) __declspec(align(16))
4407 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
4408 uint8* dst_u, uint8* dst_v, int pix) {
4409 __asm {
4410 push edi
4411 mov eax, [esp + 4 + 4] // src_uyvy
4412 mov edx, [esp + 4 + 8] // dst_u
4413 mov edi, [esp + 4 + 12] // dst_v
4414 mov ecx, [esp + 4 + 16] // pix
4415 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
4416 psrlw xmm5, 8
4417 sub edi, edx
4418
4419 align 4
4420 convertloop:
4421 movdqu xmm0, [eax]
4422 movdqu xmm1, [eax + 16]
4423 lea eax, [eax + 32]
4424 pand xmm0, xmm5 // UYVY -> UVUV
4425 pand xmm1, xmm5
4426 packuswb xmm0, xmm1
4427 movdqa xmm1, xmm0
4428 pand xmm0, xmm5 // U
4429 packuswb xmm0, xmm0
4430 psrlw xmm1, 8 // V
4431 packuswb xmm1, xmm1
4432 movq qword ptr [edx], xmm0
4433 movq qword ptr [edx + edi], xmm1
4434 lea edx, [edx + 8]
4435 sub ecx, 16
4436 jg convertloop
4437
4438 pop edi
4439 ret
4440 }
4441 }
4442 #endif // HAS_YUY2TOYROW_SSE2
4443
4444 #ifdef HAS_ARGBBLENDROW_SSE2
4445 // Blend 4 pixels at a time.
4446 __declspec(naked) __declspec(align(16))
4447 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4448 uint8* dst_argb, int width) {
4449 __asm {
4450 push esi
4451 mov eax, [esp + 4 + 4] // src_argb0
4452 mov esi, [esp + 4 + 8] // src_argb1
4453 mov edx, [esp + 4 + 12] // dst_argb
4454 mov ecx, [esp + 4 + 16] // width
4455 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4456 psrlw xmm7, 15
4457 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4458 psrlw xmm6, 8
4459 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4460 psllw xmm5, 8
4461 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4462 pslld xmm4, 24
4463
4464 sub ecx, 1
4465 je convertloop1 // only 1 pixel?
4466 jl convertloop1b
4467
4468 // 1 pixel loop until destination pointer is aligned.
4469 alignloop1:
4470 test edx, 15 // aligned?
4471 je alignloop1b
4472 movd xmm3, [eax]
4473 lea eax, [eax + 4]
4474 movdqa xmm0, xmm3 // src argb
4475 pxor xmm3, xmm4 // ~alpha
4476 movd xmm2, [esi] // _r_b
4477 psrlw xmm3, 8 // alpha
4478 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4479 pshuflw xmm3, xmm3, 0F5h
4480 pand xmm2, xmm6 // _r_b
4481 paddw xmm3, xmm7 // 256 - alpha
4482 pmullw xmm2, xmm3 // _r_b * alpha
4483 movd xmm1, [esi] // _a_g
4484 lea esi, [esi + 4]
4485 psrlw xmm1, 8 // _a_g
4486 por xmm0, xmm4 // set alpha to 255
4487 pmullw xmm1, xmm3 // _a_g * alpha
4488 psrlw xmm2, 8 // _r_b convert to 8 bits again
4489 paddusb xmm0, xmm2 // + src argb
4490 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4491 paddusb xmm0, xmm1 // + src argb
4492 sub ecx, 1
4493 movd [edx], xmm0
4494 lea edx, [edx + 4]
4495 jge alignloop1
4496
4497 alignloop1b:
4498 add ecx, 1 - 4
4499 jl convertloop4b
4500
4501 // 4 pixel loop.
4502 convertloop4:
4503 movdqu xmm3, [eax] // src argb
4504 lea eax, [eax + 16]
4505 movdqa xmm0, xmm3 // src argb
4506 pxor xmm3, xmm4 // ~alpha
4507 movdqu xmm2, [esi] // _r_b
4508 psrlw xmm3, 8 // alpha
4509 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4510 pshuflw xmm3, xmm3, 0F5h
4511 pand xmm2, xmm6 // _r_b
4512 paddw xmm3, xmm7 // 256 - alpha
4513 pmullw xmm2, xmm3 // _r_b * alpha
4514 movdqu xmm1, [esi] // _a_g
4515 lea esi, [esi + 16]
4516 psrlw xmm1, 8 // _a_g
4517 por xmm0, xmm4 // set alpha to 255
4518 pmullw xmm1, xmm3 // _a_g * alpha
4519 psrlw xmm2, 8 // _r_b convert to 8 bits again
4520 paddusb xmm0, xmm2 // + src argb
4521 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4522 paddusb xmm0, xmm1 // + src argb
4523 sub ecx, 4
4524 movdqa [edx], xmm0
4525 lea edx, [edx + 16]
4526 jge convertloop4
4527
4528 convertloop4b:
4529 add ecx, 4 - 1
4530 jl convertloop1b
4531
4532 // 1 pixel loop.
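// Same math as the loops above, one pixel per iteration: each color
// channel becomes src + ((dst * (256 - src_alpha)) >> 8), saturated,
// and the result alpha is forced to 255.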
4533 convertloop1:
4534 movd xmm3, [eax] // src argb
4535 lea eax, [eax + 4]
4536 movdqa xmm0, xmm3 // src argb
4537 pxor xmm3, xmm4 // ~alpha
4538 movd xmm2, [esi] // _r_b
4539 psrlw xmm3, 8 // alpha
4540 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4541 pshuflw xmm3, xmm3, 0F5h
4542 pand xmm2, xmm6 // _r_b
4543 paddw xmm3, xmm7 // 256 - alpha
4544 pmullw xmm2, xmm3 // _r_b * alpha
4545 movd xmm1, [esi] // _a_g
4546 lea esi, [esi + 4]
4547 psrlw xmm1, 8 // _a_g
4548 por xmm0, xmm4 // set alpha to 255
4549 pmullw xmm1, xmm3 // _a_g * alpha
4550 psrlw xmm2, 8 // _r_b convert to 8 bits again
4551 paddusb xmm0, xmm2 // + src argb
4552 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4553 paddusb xmm0, xmm1 // + src argb
4554 sub ecx, 1
4555 movd [edx], xmm0
4556 lea edx, [edx + 4]
4557 jge convertloop1
4558
4559 convertloop1b:
4560 pop esi
4561 ret
4562 }
4563 }
4564 #endif // HAS_ARGBBLENDROW_SSE2
4565
4566 #ifdef HAS_ARGBBLENDROW_SSSE3
4567 // Shuffle table for isolating alpha.
4568 static const uvec8 kShuffleAlpha = {
4569 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
4570 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
4571 };
4572 // Same as SSE2, but replaces:
4573 // psrlw xmm3, 8 // alpha
4574 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words
4575 // pshuflw xmm3, xmm3, 0F5h
4576 // with:
4577 // pshufb xmm3, kShuffleAlpha // alpha
4578 // Blend 4 pixels at a time.
4579
4580 __declspec(naked) __declspec(align(16))
4581 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
4582 uint8* dst_argb, int width) {
4583 __asm {
4584 push esi
4585 mov eax, [esp + 4 + 4] // src_argb0
4586 mov esi, [esp + 4 + 8] // src_argb1
4587 mov edx, [esp + 4 + 12] // dst_argb
4588 mov ecx, [esp + 4 + 16] // width
4589 pcmpeqb xmm7, xmm7 // generate constant 0x0001
4590 psrlw xmm7, 15
4591 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
4592 psrlw xmm6, 8
4593 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
4594 psllw xmm5, 8
4595 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
4596 pslld xmm4, 24
4597
4598 sub ecx, 1
4599 je convertloop1 // only 1 pixel?
4600 jl convertloop1b
4601
4602 // 1 pixel loop until destination pointer is aligned.
4603 alignloop1:
4604 test edx, 15 // aligned?
4605 je alignloop1b
4606 movd xmm3, [eax]
4607 lea eax, [eax + 4]
4608 movdqa xmm0, xmm3 // src argb
4609 pxor xmm3, xmm4 // ~alpha
4610 movd xmm2, [esi] // _r_b
4611 pshufb xmm3, kShuffleAlpha // alpha
4612 pand xmm2, xmm6 // _r_b
4613 paddw xmm3, xmm7 // 256 - alpha
4614 pmullw xmm2, xmm3 // _r_b * alpha
4615 movd xmm1, [esi] // _a_g
4616 lea esi, [esi + 4]
4617 psrlw xmm1, 8 // _a_g
4618 por xmm0, xmm4 // set alpha to 255
4619 pmullw xmm1, xmm3 // _a_g * alpha
4620 psrlw xmm2, 8 // _r_b convert to 8 bits again
4621 paddusb xmm0, xmm2 // + src argb
4622 pand xmm1, xmm5 // a_g_ convert to 8 bits again
4623 paddusb xmm0, xmm1 // + src argb
4624 sub ecx, 1
4625 movd [edx], xmm0
4626 lea edx, [edx + 4]
4627 jge alignloop1
4628
4629 alignloop1b:
4630 add ecx, 1 - 4
4631 jl convertloop4b
4632
4633 test eax, 15 // unaligned?
4634 jne convertuloop4
4635 test esi, 15 // unaligned?
4636 jne convertuloop4
4637
4638 // 4 pixel loop.
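// Aligned 4 pixel loop; the unaligned loop below is identical except
// that it loads the two sources with movdqu.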
4639 convertloop4: 4640 movdqa xmm3, [eax] // src argb 4641 lea eax, [eax + 16] 4642 movdqa xmm0, xmm3 // src argb 4643 pxor xmm3, xmm4 // ~alpha 4644 movdqa xmm2, [esi] // _r_b 4645 pshufb xmm3, kShuffleAlpha // alpha 4646 pand xmm2, xmm6 // _r_b 4647 paddw xmm3, xmm7 // 256 - alpha 4648 pmullw xmm2, xmm3 // _r_b * alpha 4649 movdqa xmm1, [esi] // _a_g 4650 lea esi, [esi + 16] 4651 psrlw xmm1, 8 // _a_g 4652 por xmm0, xmm4 // set alpha to 255 4653 pmullw xmm1, xmm3 // _a_g * alpha 4654 psrlw xmm2, 8 // _r_b convert to 8 bits again 4655 paddusb xmm0, xmm2 // + src argb 4656 pand xmm1, xmm5 // a_g_ convert to 8 bits again 4657 paddusb xmm0, xmm1 // + src argb 4658 sub ecx, 4 4659 movdqa [edx], xmm0 4660 lea edx, [edx + 16] 4661 jge convertloop4 4662 jmp convertloop4b 4663 4664 // 4 pixel unaligned loop. 4665 convertuloop4: 4666 movdqu xmm3, [eax] // src argb 4667 lea eax, [eax + 16] 4668 movdqa xmm0, xmm3 // src argb 4669 pxor xmm3, xmm4 // ~alpha 4670 movdqu xmm2, [esi] // _r_b 4671 pshufb xmm3, kShuffleAlpha // alpha 4672 pand xmm2, xmm6 // _r_b 4673 paddw xmm3, xmm7 // 256 - alpha 4674 pmullw xmm2, xmm3 // _r_b * alpha 4675 movdqu xmm1, [esi] // _a_g 4676 lea esi, [esi + 16] 4677 psrlw xmm1, 8 // _a_g 4678 por xmm0, xmm4 // set alpha to 255 4679 pmullw xmm1, xmm3 // _a_g * alpha 4680 psrlw xmm2, 8 // _r_b convert to 8 bits again 4681 paddusb xmm0, xmm2 // + src argb 4682 pand xmm1, xmm5 // a_g_ convert to 8 bits again 4683 paddusb xmm0, xmm1 // + src argb 4684 sub ecx, 4 4685 movdqa [edx], xmm0 4686 lea edx, [edx + 16] 4687 jge convertuloop4 4688 4689 convertloop4b: 4690 add ecx, 4 - 1 4691 jl convertloop1b 4692 4693 // 1 pixel loop. 4694 convertloop1: 4695 movd xmm3, [eax] // src argb 4696 lea eax, [eax + 4] 4697 movdqa xmm0, xmm3 // src argb 4698 pxor xmm3, xmm4 // ~alpha 4699 movd xmm2, [esi] // _r_b 4700 pshufb xmm3, kShuffleAlpha // alpha 4701 pand xmm2, xmm6 // _r_b 4702 paddw xmm3, xmm7 // 256 - alpha 4703 pmullw xmm2, xmm3 // _r_b * alpha 4704 movd xmm1, [esi] // _a_g 4705 lea esi, [esi + 4] 4706 psrlw xmm1, 8 // _a_g 4707 por xmm0, xmm4 // set alpha to 255 4708 pmullw xmm1, xmm3 // _a_g * alpha 4709 psrlw xmm2, 8 // _r_b convert to 8 bits again 4710 paddusb xmm0, xmm2 // + src argb 4711 pand xmm1, xmm5 // a_g_ convert to 8 bits again 4712 paddusb xmm0, xmm1 // + src argb 4713 sub ecx, 1 4714 movd [edx], xmm0 4715 lea edx, [edx + 4] 4716 jge convertloop1 4717 4718 convertloop1b: 4719 pop esi 4720 ret 4721 } 4722 } 4723 #endif // HAS_ARGBBLENDROW_SSSE3 4724 4725 #ifdef HAS_ARGBATTENUATEROW_SSE2 4726 // Attenuate 4 pixels at a time. 4727 // Aligned to 16 bytes. 
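// For reference, the attenuate math in scalar C. This is only an
// illustrative sketch (the function name is hypothetical, and libyuv's
// real C fallback may differ slightly); the SIMD below approximates
// the divide by 255 with a 16 bit fixed point multiply.
static void ARGBAttenuateRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb[3];
    dst_argb[0] = (uint8)((src_argb[0] * a) / 255);  // B
    dst_argb[1] = (uint8)((src_argb[1] * a) / 255);  // G
    dst_argb[2] = (uint8)((src_argb[2] * a) / 255);  // R
    dst_argb[3] = (uint8)a;                          // A is kept.
    src_argb += 4;
    dst_argb += 4;
  }
}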
4728 __declspec(naked) __declspec(align(16)) 4729 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { 4730 __asm { 4731 mov eax, [esp + 4] // src_argb0 4732 mov edx, [esp + 8] // dst_argb 4733 mov ecx, [esp + 12] // width 4734 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4735 pslld xmm4, 24 4736 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff 4737 psrld xmm5, 8 4738 4739 align 4 4740 convertloop: 4741 movdqa xmm0, [eax] // read 4 pixels 4742 punpcklbw xmm0, xmm0 // first 2 4743 pshufhw xmm2, xmm0, 0FFh // 8 alpha words 4744 pshuflw xmm2, xmm2, 0FFh 4745 pmulhuw xmm0, xmm2 // rgb * a 4746 movdqa xmm1, [eax] // read 4 pixels 4747 punpckhbw xmm1, xmm1 // next 2 pixels 4748 pshufhw xmm2, xmm1, 0FFh // 8 alpha words 4749 pshuflw xmm2, xmm2, 0FFh 4750 pmulhuw xmm1, xmm2 // rgb * a 4751 movdqa xmm2, [eax] // alphas 4752 lea eax, [eax + 16] 4753 psrlw xmm0, 8 4754 pand xmm2, xmm4 4755 psrlw xmm1, 8 4756 packuswb xmm0, xmm1 4757 pand xmm0, xmm5 // keep original alphas 4758 por xmm0, xmm2 4759 sub ecx, 4 4760 movdqa [edx], xmm0 4761 lea edx, [edx + 16] 4762 jg convertloop 4763 4764 ret 4765 } 4766 } 4767 #endif // HAS_ARGBATTENUATEROW_SSE2 4768 4769 #ifdef HAS_ARGBATTENUATEROW_SSSE3 4770 // Shuffle table duplicating alpha. 4771 static const uvec8 kShuffleAlpha0 = { 4772 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 4773 }; 4774 static const uvec8 kShuffleAlpha1 = { 4775 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 4776 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 4777 }; 4778 __declspec(naked) __declspec(align(16)) 4779 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 4780 __asm { 4781 mov eax, [esp + 4] // src_argb0 4782 mov edx, [esp + 8] // dst_argb 4783 mov ecx, [esp + 12] // width 4784 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 4785 pslld xmm3, 24 4786 movdqa xmm4, kShuffleAlpha0 4787 movdqa xmm5, kShuffleAlpha1 4788 4789 align 4 4790 convertloop: 4791 movdqu xmm0, [eax] // read 4 pixels 4792 pshufb xmm0, xmm4 // isolate first 2 alphas 4793 movdqu xmm1, [eax] // read 4 pixels 4794 punpcklbw xmm1, xmm1 // first 2 pixel rgbs 4795 pmulhuw xmm0, xmm1 // rgb * a 4796 movdqu xmm1, [eax] // read 4 pixels 4797 pshufb xmm1, xmm5 // isolate next 2 alphas 4798 movdqu xmm2, [eax] // read 4 pixels 4799 punpckhbw xmm2, xmm2 // next 2 pixel rgbs 4800 pmulhuw xmm1, xmm2 // rgb * a 4801 movdqu xmm2, [eax] // mask original alpha 4802 lea eax, [eax + 16] 4803 pand xmm2, xmm3 4804 psrlw xmm0, 8 4805 psrlw xmm1, 8 4806 packuswb xmm0, xmm1 4807 por xmm0, xmm2 // copy original alpha 4808 sub ecx, 4 4809 movdqu [edx], xmm0 4810 lea edx, [edx + 16] 4811 jg convertloop 4812 4813 ret 4814 } 4815 } 4816 #endif // HAS_ARGBATTENUATEROW_SSSE3 4817 4818 #ifdef HAS_ARGBATTENUATEROW_AVX2 4819 // Shuffle table duplicating alpha. 4820 static const ulvec8 kShuffleAlpha_AVX2 = { 4821 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 4822 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, 4823 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 4824 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, 4825 }; 4826 __declspec(naked) __declspec(align(16)) 4827 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { 4828 __asm { 4829 mov eax, [esp + 4] // src_argb0 4830 mov edx, [esp + 8] // dst_argb 4831 mov ecx, [esp + 12] // width 4832 sub edx, eax 4833 vmovdqa ymm4, kShuffleAlpha_AVX2 4834 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 4835 vpslld ymm5, ymm5, 24 4836 4837 align 4 4838 convertloop: 4839 vmovdqu ymm6, [eax] // read 8 pixels. 
4840 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4841 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4842 vpshufb ymm2, ymm0, ymm4 // low 4 alphas
4843 vpshufb ymm3, ymm1, ymm4 // high 4 alphas
4844 vpmulhuw ymm0, ymm0, ymm2 // rgb * a
4845 vpmulhuw ymm1, ymm1, ymm3 // rgb * a
4846 vpand ymm6, ymm6, ymm5 // isolate alpha
4847 vpsrlw ymm0, ymm0, 8
4848 vpsrlw ymm1, ymm1, 8
4849 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4850 vpor ymm0, ymm0, ymm6 // copy original alpha
4851 sub ecx, 8
4852 vmovdqu [eax + edx], ymm0
4853 lea eax, [eax + 32]
4854 jg convertloop
4855
4856 vzeroupper
4857 ret
4858 }
4859 }
4860 #endif // HAS_ARGBATTENUATEROW_AVX2
4861
4862 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
4863 // Unattenuate 4 pixels at a time.
4864 // Aligned to 16 bytes.
4865 __declspec(naked) __declspec(align(16))
4866 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
4867 int width) {
4868 __asm {
4869 push esi
4870 push edi
4871 mov eax, [esp + 8 + 4] // src_argb0
4872 mov edx, [esp + 8 + 8] // dst_argb
4873 mov ecx, [esp + 8 + 12] // width
4874
4875 align 4
4876 convertloop:
4877 movdqu xmm0, [eax] // read 4 pixels
4878 movzx esi, byte ptr [eax + 3] // first alpha
4879 movzx edi, byte ptr [eax + 7] // second alpha
4880 punpcklbw xmm0, xmm0 // first 2
4881 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
4882 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
4883 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
4884 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4885 movlhps xmm2, xmm3
4886 pmulhuw xmm0, xmm2 // rgb * a
4887
4888 movdqu xmm1, [eax] // read 4 pixels
4889 movzx esi, byte ptr [eax + 11] // third alpha
4890 movzx edi, byte ptr [eax + 15] // fourth alpha
4891 punpckhbw xmm1, xmm1 // next 2
4892 movd xmm2, dword ptr fixed_invtbl8[esi * 4]
4893 movd xmm3, dword ptr fixed_invtbl8[edi * 4]
4894 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
4895 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
4896 movlhps xmm2, xmm3
4897 pmulhuw xmm1, xmm2 // rgb * a
4898 lea eax, [eax + 16]
4899
4900 packuswb xmm0, xmm1
4901 sub ecx, 4
4902 movdqu [edx], xmm0
4903 lea edx, [edx + 16]
4904 jg convertloop
4905 pop edi
4906 pop esi
4907 ret
4908 }
4909 }
4910 #endif // HAS_ARGBUNATTENUATEROW_SSE2
4911
4912 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
4913 // Shuffle table duplicating alpha.
4914 static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
4915 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
4916 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
4917 };
4918 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
4919 // USE_GATHER is not on by default, due to being a slow instruction.
4920 #ifdef USE_GATHER
4921 __declspec(naked) __declspec(align(16))
4922 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4923 int width) {
4924 __asm {
4925 mov eax, [esp + 4] // src_argb0
4926 mov edx, [esp + 8] // dst_argb
4927 mov ecx, [esp + 12] // width
4928 sub edx, eax
4929 vmovdqa ymm4, kUnattenShuffleAlpha_AVX2
4930
4931 align 4
4932 convertloop:
4933 vmovdqu ymm6, [eax] // read 8 pixels.
4934 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
4935 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
4936 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
4937 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
4938 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
4939 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
4940 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
4941 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
4942 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
4943 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
4944 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
4945 vpackuswb ymm0, ymm0, ymm1 // unmutated.
4946 sub ecx, 8
4947 vmovdqu [eax + edx], ymm0
4948 lea eax, [eax + 32]
4949 jg convertloop
4950
4951 vzeroupper
4952 ret
4953 }
4954 }
4955 #else // USE_GATHER
4956 __declspec(naked) __declspec(align(16))
4957 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
4958 int width) {
4959 __asm {
4960
4961 mov eax, [esp + 4] // src_argb0
4962 mov edx, [esp + 8] // dst_argb
4963 mov ecx, [esp + 12] // width
4964 sub edx, eax
4965 vmovdqa ymm5, kUnattenShuffleAlpha_AVX2
4966
4967 push esi
4968 push edi
4969
4970 align 4
4971 convertloop:
4972 // replace VPGATHER
4973 movzx esi, byte ptr [eax + 3] // alpha0
4974 movzx edi, byte ptr [eax + 7] // alpha1
4975 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
4976 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
4977 movzx esi, byte ptr [eax + 11] // alpha2
4978 movzx edi, byte ptr [eax + 15] // alpha3
4979 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
4980 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
4981 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
4982 movzx esi, byte ptr [eax + 19] // alpha4
4983 movzx edi, byte ptr [eax + 23] // alpha5
4984 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
4985 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
4986 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
4987 movzx esi, byte ptr [eax + 27] // alpha6
4988 movzx edi, byte ptr [eax + 31] // alpha7
4989 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
4990 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
4991 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
4992 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
4993 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
4994 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
4995 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
4996 // end of VPGATHER
4997
4998 vmovdqu ymm6, [eax] // read 8 pixels.
4999 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
5000 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
5001 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
5002 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
5003 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
5004 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
5005 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
5006 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
5007 vpackuswb ymm0, ymm0, ymm1 // unmutated.
5008 sub ecx, 8
5009 vmovdqu [eax + edx], ymm0
5010 lea eax, [eax + 32]
5011 jg convertloop
5012
5013 pop edi
5014 pop esi
5015 vzeroupper
5016 ret
5017 }
5018 }
5019 #endif // USE_GATHER
5020 #endif // HAS_ARGBUNATTENUATEROW_AVX2
5021
5022 #ifdef HAS_ARGBGRAYROW_SSSE3
5023 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
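// The gray value uses the JPeg luma coefficients: (B * 15 + G * 75 +
// R * 38 + 64) >> 7, replicated into B, G and R with alpha preserved.
// A scalar C sketch of that math (illustrative only; the name is
// hypothetical and this is not the library's dispatched C path):
static void ARGBGrayRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                 int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint8 y = (uint8)((src_argb[0] * 15 + src_argb[1] * 75 +
                       src_argb[2] * 38 + 64) >> 7);
    dst_argb[0] = y;            // B
    dst_argb[1] = y;            // G
    dst_argb[2] = y;            // R
    dst_argb[3] = src_argb[3];  // A is kept.
    src_argb += 4;
    dst_argb += 4;
  }
}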
5024 __declspec(naked) __declspec(align(16))
5025 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
5026 __asm {
5027 mov eax, [esp + 4] /* src_argb */
5028 mov edx, [esp + 8] /* dst_argb */
5029 mov ecx, [esp + 12] /* width */
5030 movdqa xmm4, kARGBToYJ
5031 movdqa xmm5, kAddYJ64
5032
5033 align 4
5034 convertloop:
5035 movdqa xmm0, [eax] // G
5036 movdqa xmm1, [eax + 16]
5037 pmaddubsw xmm0, xmm4
5038 pmaddubsw xmm1, xmm4
5039 phaddw xmm0, xmm1
5040 paddw xmm0, xmm5 // Add .5 for rounding.
5041 psrlw xmm0, 7
5042 packuswb xmm0, xmm0 // 8 G bytes
5043 movdqa xmm2, [eax] // A
5044 movdqa xmm3, [eax + 16]
5045 lea eax, [eax + 32]
5046 psrld xmm2, 24
5047 psrld xmm3, 24
5048 packuswb xmm2, xmm3
5049 packuswb xmm2, xmm2 // 8 A bytes
5050 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
5051 punpcklbw xmm0, xmm0 // 8 GG words
5052 punpcklbw xmm3, xmm2 // 8 GA words
5053 movdqa xmm1, xmm0
5054 punpcklwd xmm0, xmm3 // GGGA first 4
5055 punpckhwd xmm1, xmm3 // GGGA next 4
5056 sub ecx, 8
5057 movdqa [edx], xmm0
5058 movdqa [edx + 16], xmm1
5059 lea edx, [edx + 32]
5060 jg convertloop
5061 ret
5062 }
5063 }
5064 #endif // HAS_ARGBGRAYROW_SSSE3
5065
5066 #ifdef HAS_ARGBSEPIAROW_SSSE3
5067 // b = (r * 35 + g * 68 + b * 17) >> 7
5068 // g = (r * 45 + g * 88 + b * 22) >> 7
5069 // r = (r * 50 + g * 98 + b * 24) >> 7
5070 // Constants for ARGB color to sepia tone.
5071 static const vec8 kARGBToSepiaB = {
5072 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
5073 };
5074
5075 static const vec8 kARGBToSepiaG = {
5076 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
5077 };
5078
5079 static const vec8 kARGBToSepiaR = {
5080 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
5081 };
5082
5083 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
5084 __declspec(naked) __declspec(align(16))
5085 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
5086 __asm {
5087 mov eax, [esp + 4] /* dst_argb */
5088 mov ecx, [esp + 8] /* width */
5089 movdqa xmm2, kARGBToSepiaB
5090 movdqa xmm3, kARGBToSepiaG
5091 movdqa xmm4, kARGBToSepiaR
5092
5093 align 4
5094 convertloop:
5095 movdqa xmm0, [eax] // B
5096 movdqa xmm6, [eax + 16]
5097 pmaddubsw xmm0, xmm2
5098 pmaddubsw xmm6, xmm2
5099 phaddw xmm0, xmm6
5100 psrlw xmm0, 7
5101 packuswb xmm0, xmm0 // 8 B values
5102 movdqa xmm5, [eax] // G
5103 movdqa xmm1, [eax + 16]
5104 pmaddubsw xmm5, xmm3
5105 pmaddubsw xmm1, xmm3
5106 phaddw xmm5, xmm1
5107 psrlw xmm5, 7
5108 packuswb xmm5, xmm5 // 8 G values
5109 punpcklbw xmm0, xmm5 // 8 BG values
5110 movdqa xmm5, [eax] // R
5111 movdqa xmm1, [eax + 16]
5112 pmaddubsw xmm5, xmm4
5113 pmaddubsw xmm1, xmm4
5114 phaddw xmm5, xmm1
5115 psrlw xmm5, 7
5116 packuswb xmm5, xmm5 // 8 R values
5117 movdqa xmm6, [eax] // A
5118 movdqa xmm1, [eax + 16]
5119 psrld xmm6, 24
5120 psrld xmm1, 24
5121 packuswb xmm6, xmm1
5122 packuswb xmm6, xmm6 // 8 A values
5123 punpcklbw xmm5, xmm6 // 8 RA values
5124 movdqa xmm1, xmm0 // Weave BG, RA together
5125 punpcklwd xmm0, xmm5 // BGRA first 4
5126 punpckhwd xmm1, xmm5 // BGRA next 4
5127 sub ecx, 8
5128 movdqa [eax], xmm0
5129 movdqa [eax + 16], xmm1
5130 lea eax, [eax + 32]
5131 jg convertloop
5132 ret
5133 }
5134 }
5135 #endif // HAS_ARGBSEPIAROW_SSSE3
5136
5137 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
5138 // Transform 8 ARGB pixels (32 bytes) with color matrix.
5139 // Same as Sepia except matrix is provided.
5140 // TODO(fbarchard): packuswb only uses half of the register. To make RGBA, combine R
5141 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
5142 __declspec(naked) __declspec(align(16))
5143 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
5144 const int8* matrix_argb, int width) {
5145 __asm {
5146 mov eax, [esp + 4] /* src_argb */
5147 mov edx, [esp + 8] /* dst_argb */
5148 mov ecx, [esp + 12] /* matrix_argb */
5149 movdqu xmm5, [ecx]
5150 pshufd xmm2, xmm5, 0x00
5151 pshufd xmm3, xmm5, 0x55
5152 pshufd xmm4, xmm5, 0xaa
5153 pshufd xmm5, xmm5, 0xff
5154 mov ecx, [esp + 16] /* width */
5155
5156 align 4
5157 convertloop:
5158 movdqa xmm0, [eax] // B
5159 movdqa xmm7, [eax + 16]
5160 pmaddubsw xmm0, xmm2
5161 pmaddubsw xmm7, xmm2
5162 movdqa xmm6, [eax] // G
5163 movdqa xmm1, [eax + 16]
5164 pmaddubsw xmm6, xmm3
5165 pmaddubsw xmm1, xmm3
5166 phaddsw xmm0, xmm7 // B
5167 phaddsw xmm6, xmm1 // G
5168 psraw xmm0, 6 // B
5169 psraw xmm6, 6 // G
5170 packuswb xmm0, xmm0 // 8 B values
5171 packuswb xmm6, xmm6 // 8 G values
5172 punpcklbw xmm0, xmm6 // 8 BG values
5173 movdqa xmm1, [eax] // R
5174 movdqa xmm7, [eax + 16]
5175 pmaddubsw xmm1, xmm4
5176 pmaddubsw xmm7, xmm4
5177 phaddsw xmm1, xmm7 // R
5178 movdqa xmm6, [eax] // A
5179 movdqa xmm7, [eax + 16]
5180 pmaddubsw xmm6, xmm5
5181 pmaddubsw xmm7, xmm5
5182 phaddsw xmm6, xmm7 // A
5183 psraw xmm1, 6 // R
5184 psraw xmm6, 6 // A
5185 packuswb xmm1, xmm1 // 8 R values
5186 packuswb xmm6, xmm6 // 8 A values
5187 punpcklbw xmm1, xmm6 // 8 RA values
5188 movdqa xmm6, xmm0 // Weave BG, RA together
5189 punpcklwd xmm0, xmm1 // BGRA first 4
5190 punpckhwd xmm6, xmm1 // BGRA next 4
5191 sub ecx, 8
5192 movdqa [edx], xmm0
5193 movdqa [edx + 16], xmm6
5194 lea eax, [eax + 32]
5195 lea edx, [edx + 32]
5196 jg convertloop
5197 ret
5198 }
5199 }
5200 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
5201
5202 #ifdef HAS_ARGBQUANTIZEROW_SSE2
5203 // Quantize 4 ARGB pixels (16 bytes).
5204 // Aligned to 16 bytes.
5205 __declspec(naked) __declspec(align(16))
5206 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
5207 int interval_offset, int width) {
5208 __asm {
5209 mov eax, [esp + 4] /* dst_argb */
5210 movd xmm2, [esp + 8] /* scale */
5211 movd xmm3, [esp + 12] /* interval_size */
5212 movd xmm4, [esp + 16] /* interval_offset */
5213 mov ecx, [esp + 20] /* width */
5214 pshuflw xmm2, xmm2, 040h
5215 pshufd xmm2, xmm2, 044h
5216 pshuflw xmm3, xmm3, 040h
5217 pshufd xmm3, xmm3, 044h
5218 pshuflw xmm4, xmm4, 040h
5219 pshufd xmm4, xmm4, 044h
5220 pxor xmm5, xmm5 // constant 0
5221 pcmpeqb xmm6, xmm6 // generate mask 0xff000000
5222 pslld xmm6, 24
5223
5224 align 4
5225 convertloop:
5226 movdqa xmm0, [eax] // read 4 pixels
5227 punpcklbw xmm0, xmm5 // first 2 pixels
5228 pmulhuw xmm0, xmm2 // pixel * scale >> 16
5229 movdqa xmm1, [eax] // read 4 pixels
5230 punpckhbw xmm1, xmm5 // next 2 pixels
5231 pmulhuw xmm1, xmm2
5232 pmullw xmm0, xmm3 // * interval_size
5233 movdqa xmm7, [eax] // read 4 pixels
5234 pmullw xmm1, xmm3
5235 pand xmm7, xmm6 // mask alpha
5236 paddw xmm0, xmm4 // + interval_offset (typically interval_size / 2)
5237 paddw xmm1, xmm4
5238 packuswb xmm0, xmm1
5239 por xmm0, xmm7
5240 sub ecx, 4
5241 movdqa [eax], xmm0
5242 lea eax, [eax + 16]
5243 jg convertloop
5244 ret
5245 }
5246 }
5247 #endif // HAS_ARGBQUANTIZEROW_SSE2
5248
5249 #ifdef HAS_ARGBSHADEROW_SSE2
5250 // Shade 4 pixels at a time by specified value.
5251 // Aligned to 16 bytes.
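// Each channel is scaled by the matching channel of 'value', roughly
// dst = (src * value_channel) / 255. The asm duplicates each byte into
// a word (v * 0x0101), multiplies with pmulhuw and shifts. A scalar C
// sketch of exactly that fixed point form (illustrative only; the name
// is hypothetical):
static void ARGBShadeRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  int width, uint32 value) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      uint32 p = src_argb[j] * 0x0101u;                 // duplicate byte
      uint32 v = ((value >> (j * 8)) & 0xffu) * 0x0101u;
      dst_argb[j] = (uint8)(((p * v) >> 16) >> 8);      // pmulhuw, psrlw 8
    }
    src_argb += 4;
    dst_argb += 4;
  }
}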
5252 __declspec(naked) __declspec(align(16))
5253 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
5254 uint32 value) {
5255 __asm {
5256 mov eax, [esp + 4] // src_argb
5257 mov edx, [esp + 8] // dst_argb
5258 mov ecx, [esp + 12] // width
5259 movd xmm2, [esp + 16] // value
5260 punpcklbw xmm2, xmm2
5261 punpcklqdq xmm2, xmm2
5262
5263 align 4
5264 convertloop:
5265 movdqa xmm0, [eax] // read 4 pixels
5266 lea eax, [eax + 16]
5267 movdqa xmm1, xmm0
5268 punpcklbw xmm0, xmm0 // first 2
5269 punpckhbw xmm1, xmm1 // next 2
5270 pmulhuw xmm0, xmm2 // argb * value
5271 pmulhuw xmm1, xmm2 // argb * value
5272 psrlw xmm0, 8
5273 psrlw xmm1, 8
5274 packuswb xmm0, xmm1
5275 sub ecx, 4
5276 movdqa [edx], xmm0
5277 lea edx, [edx + 16]
5278 jg convertloop
5279
5280 ret
5281 }
5282 }
5283 #endif // HAS_ARGBSHADEROW_SSE2
5284
5285 #ifdef HAS_ARGBMULTIPLYROW_SSE2
5286 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
5287 __declspec(naked) __declspec(align(16))
5288 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5289 uint8* dst_argb, int width) {
5290 __asm {
5291 push esi
5292 mov eax, [esp + 4 + 4] // src_argb0
5293 mov esi, [esp + 4 + 8] // src_argb1
5294 mov edx, [esp + 4 + 12] // dst_argb
5295 mov ecx, [esp + 4 + 16] // width
5296 pxor xmm5, xmm5 // constant 0
5297
5298 align 4
5299 convertloop:
5300 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5301 movdqu xmm2, [esi] // read 4 pixels from src_argb1
5302 movdqu xmm1, xmm0
5303 movdqu xmm3, xmm2
5304 punpcklbw xmm0, xmm0 // first 2
5305 punpckhbw xmm1, xmm1 // next 2
5306 punpcklbw xmm2, xmm5 // first 2
5307 punpckhbw xmm3, xmm5 // next 2
5308 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
5309 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
5310 lea eax, [eax + 16]
5311 lea esi, [esi + 16]
5312 packuswb xmm0, xmm1
5313 sub ecx, 4
5314 movdqu [edx], xmm0
5315 lea edx, [edx + 16]
5316 jg convertloop
5317
5318 pop esi
5319 ret
5320 }
5321 }
5322 #endif // HAS_ARGBMULTIPLYROW_SSE2
5323
5324 #ifdef HAS_ARGBADDROW_SSE2
5325 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
5326 // TODO(fbarchard): Port this to posix, neon and other math functions.
5327 __declspec(naked) __declspec(align(16))
5328 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5329 uint8* dst_argb, int width) {
5330 __asm {
5331 push esi
5332 mov eax, [esp + 4 + 4] // src_argb0
5333 mov esi, [esp + 4 + 8] // src_argb1
5334 mov edx, [esp + 4 + 12] // dst_argb
5335 mov ecx, [esp + 4 + 16] // width
5336
5337 sub ecx, 4
5338 jl convertloop49
5339
5340 align 4
5341 convertloop4:
5342 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5343 lea eax, [eax + 16]
5344 movdqu xmm1, [esi] // read 4 pixels from src_argb1
5345 lea esi, [esi + 16]
5346 paddusb xmm0, xmm1 // src_argb0 + src_argb1
5347 sub ecx, 4
5348 movdqu [edx], xmm0
5349 lea edx, [edx + 16]
5350 jge convertloop4
5351
5352 convertloop49:
5353 add ecx, 4 - 1
5354 jl convertloop19
5355
5356 convertloop1:
5357 movd xmm0, [eax] // read 1 pixel from src_argb0
5358 lea eax, [eax + 4]
5359 movd xmm1, [esi] // read 1 pixel from src_argb1
5360 lea esi, [esi + 4]
5361 paddusb xmm0, xmm1 // src_argb0 + src_argb1
5362 sub ecx, 1
5363 movd [edx], xmm0
5364 lea edx, [edx + 4]
5365 jge convertloop1
5366
5367 convertloop19:
5368 pop esi
5369 ret
5370 }
5371 }
5372 #endif // HAS_ARGBADDROW_SSE2
5373
5374 #ifdef HAS_ARGBSUBTRACTROW_SSE2
5375 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
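// psubusb subtracts with unsigned saturation, so channels clamp at 0.
// A scalar C sketch (illustrative only; the name is hypothetical):
static void ARGBSubtractRow_C_Sketch(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int d = src_argb0[i] - src_argb1[i];
    dst_argb[i] = (uint8)(d < 0 ? 0 : d);
  }
}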
5376 __declspec(naked) __declspec(align(16))
5377 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
5378 uint8* dst_argb, int width) {
5379 __asm {
5380 push esi
5381 mov eax, [esp + 4 + 4] // src_argb0
5382 mov esi, [esp + 4 + 8] // src_argb1
5383 mov edx, [esp + 4 + 12] // dst_argb
5384 mov ecx, [esp + 4 + 16] // width
5385
5386 align 4
5387 convertloop:
5388 movdqu xmm0, [eax] // read 4 pixels from src_argb0
5389 lea eax, [eax + 16]
5390 movdqu xmm1, [esi] // read 4 pixels from src_argb1
5391 lea esi, [esi + 16]
5392 psubusb xmm0, xmm1 // src_argb0 - src_argb1
5393 sub ecx, 4
5394 movdqu [edx], xmm0
5395 lea edx, [edx + 16]
5396 jg convertloop
5397
5398 pop esi
5399 ret
5400 }
5401 }
5402 #endif // HAS_ARGBSUBTRACTROW_SSE2
5403
5404 #ifdef HAS_ARGBMULTIPLYROW_AVX2
5405 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
5406 __declspec(naked) __declspec(align(16))
5407 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5408 uint8* dst_argb, int width) {
5409 __asm {
5410 push esi
5411 mov eax, [esp + 4 + 4] // src_argb0
5412 mov esi, [esp + 4 + 8] // src_argb1
5413 mov edx, [esp + 4 + 12] // dst_argb
5414 mov ecx, [esp + 4 + 16] // width
5415 vpxor ymm5, ymm5, ymm5 // constant 0
5416
5417 align 4
5418 convertloop:
5419 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
5420 lea eax, [eax + 32]
5421 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
5422 lea esi, [esi + 32]
5423 vpunpcklbw ymm0, ymm1, ymm1 // low 4
5424 vpunpckhbw ymm1, ymm1, ymm1 // high 4
5425 vpunpcklbw ymm2, ymm3, ymm5 // low 4
5426 vpunpckhbw ymm3, ymm3, ymm5 // high 4
5427 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
5428 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
5429 vpackuswb ymm0, ymm0, ymm1
5430 vmovdqu [edx], ymm0
5431 lea edx, [edx + 32]
5432 sub ecx, 8
5433 jg convertloop
5434
5435 pop esi
5436 vzeroupper
5437 ret
5438 }
5439 }
5440 #endif // HAS_ARGBMULTIPLYROW_AVX2
5441
5442 #ifdef HAS_ARGBADDROW_AVX2
5443 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
5444 __declspec(naked) __declspec(align(16))
5445 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
5446 uint8* dst_argb, int width) {
5447 __asm {
5448 push esi
5449 mov eax, [esp + 4 + 4] // src_argb0
5450 mov esi, [esp + 4 + 8] // src_argb1
5451 mov edx, [esp + 4 + 12] // dst_argb
5452 mov ecx, [esp + 4 + 16] // width
5453
5454 align 4
5455 convertloop:
5456 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
5457 lea eax, [eax + 32]
5458 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
5459 lea esi, [esi + 32]
5460 vmovdqu [edx], ymm0
5461 lea edx, [edx + 32]
5462 sub ecx, 8
5463 jg convertloop
5464
5465 pop esi
5466 vzeroupper
5467 ret
5468 }
5469 }
5470 #endif // HAS_ARGBADDROW_AVX2
5471
5472 #ifdef HAS_ARGBSUBTRACTROW_AVX2
5473 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
5474 __declspec(naked) __declspec(align(16)) 5475 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 5476 uint8* dst_argb, int width) { 5477 __asm { 5478 push esi 5479 mov eax, [esp + 4 + 4] // src_argb0 5480 mov esi, [esp + 4 + 8] // src_argb1 5481 mov edx, [esp + 4 + 12] // dst_argb 5482 mov ecx, [esp + 4 + 16] // width 5483 5484 align 4 5485 convertloop: 5486 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 5487 lea eax, [eax + 32] 5488 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 5489 lea esi, [esi + 32] 5490 vmovdqu [edx], ymm0 5491 lea edx, [edx + 32] 5492 sub ecx, 8 5493 jg convertloop 5494 5495 pop esi 5496 vzeroupper 5497 ret 5498 } 5499 } 5500 #endif // HAS_ARGBSUBTRACTROW_AVX2 5501 5502 #ifdef HAS_SOBELXROW_SSE2 5503 // SobelX as a matrix is 5504 // -1 0 1 5505 // -2 0 2 5506 // -1 0 1 5507 __declspec(naked) __declspec(align(16)) 5508 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, 5509 const uint8* src_y2, uint8* dst_sobelx, int width) { 5510 __asm { 5511 push esi 5512 push edi 5513 mov eax, [esp + 8 + 4] // src_y0 5514 mov esi, [esp + 8 + 8] // src_y1 5515 mov edi, [esp + 8 + 12] // src_y2 5516 mov edx, [esp + 8 + 16] // dst_sobelx 5517 mov ecx, [esp + 8 + 20] // width 5518 sub esi, eax 5519 sub edi, eax 5520 sub edx, eax 5521 pxor xmm5, xmm5 // constant 0 5522 5523 align 4 5524 convertloop: 5525 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] 5526 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] 5527 punpcklbw xmm0, xmm5 5528 punpcklbw xmm1, xmm5 5529 psubw xmm0, xmm1 5530 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] 5531 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] 5532 punpcklbw xmm1, xmm5 5533 punpcklbw xmm2, xmm5 5534 psubw xmm1, xmm2 5535 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] 5536 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] 5537 punpcklbw xmm2, xmm5 5538 punpcklbw xmm3, xmm5 5539 psubw xmm2, xmm3 5540 paddw xmm0, xmm2 5541 paddw xmm0, xmm1 5542 paddw xmm0, xmm1 5543 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw 5544 psubw xmm1, xmm0 5545 pmaxsw xmm0, xmm1 5546 packuswb xmm0, xmm0 5547 sub ecx, 8 5548 movq qword ptr [eax + edx], xmm0 5549 lea eax, [eax + 8] 5550 jg convertloop 5551 5552 pop edi 5553 pop esi 5554 ret 5555 } 5556 } 5557 #endif // HAS_SOBELXROW_SSE2 5558 5559 #ifdef HAS_SOBELYROW_SSE2 5560 // SobelY as a matrix is 5561 // -1 -2 -1 5562 // 0 0 0 5563 // 1 2 1 5564 __declspec(naked) __declspec(align(16)) 5565 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, 5566 uint8* dst_sobely, int width) { 5567 __asm { 5568 push esi 5569 mov eax, [esp + 4 + 4] // src_y0 5570 mov esi, [esp + 4 + 8] // src_y1 5571 mov edx, [esp + 4 + 12] // dst_sobely 5572 mov ecx, [esp + 4 + 16] // width 5573 sub esi, eax 5574 sub edx, eax 5575 pxor xmm5, xmm5 // constant 0 5576 5577 align 4 5578 convertloop: 5579 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] 5580 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] 5581 punpcklbw xmm0, xmm5 5582 punpcklbw xmm1, xmm5 5583 psubw xmm0, xmm1 5584 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] 5585 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] 5586 punpcklbw xmm1, xmm5 5587 punpcklbw xmm2, xmm5 5588 psubw xmm1, xmm2 5589 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] 5590 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] 5591 punpcklbw xmm2, xmm5 5592 punpcklbw xmm3, xmm5 5593 psubw xmm2, xmm3 5594 paddw xmm0, xmm2 5595 paddw xmm0, xmm1 5596 paddw xmm0, xmm1 5597 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw 5598 psubw xmm1, xmm0 5599 pmaxsw xmm0, xmm1 5600 packuswb xmm0, xmm0 5601 sub ecx, 8 5602 movq qword ptr [eax + edx], xmm0 5603 lea eax, [eax + 8] 5604 jg convertloop 5605 5606 pop esi 5607 ret 5608 } 5609 } 5610 #endif // HAS_SOBELYROW_SSE2 5611 5612 #ifdef HAS_SOBELROW_SSE2 5613 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 5614 // A = 255 5615 // R = Sobel 5616 // G = Sobel 5617 // B = Sobel 5618 __declspec(naked) __declspec(align(16)) 5619 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 5620 uint8* dst_argb, int width) { 5621 __asm { 5622 push esi 5623 mov eax, [esp + 4 + 4] // src_sobelx 5624 mov esi, [esp + 4 + 8] // src_sobely 5625 mov edx, [esp + 4 + 12] // dst_argb 5626 mov ecx, [esp + 4 + 16] // width 5627 sub esi, eax 5628 pcmpeqb xmm5, xmm5 // alpha 255 5629 pslld xmm5, 24 // 0xff000000 5630 5631 align 4 5632 convertloop: 5633 movdqa xmm0, [eax] // read 16 pixels src_sobelx 5634 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely 5635 lea eax, [eax + 16] 5636 paddusb xmm0, xmm1 // sobel = sobelx + sobely 5637 movdqa xmm2, xmm0 // GG 5638 punpcklbw xmm2, xmm0 // First 8 5639 punpckhbw xmm0, xmm0 // Next 8 5640 movdqa xmm1, xmm2 // GGGG 5641 punpcklwd xmm1, xmm2 // First 4 5642 punpckhwd xmm2, xmm2 // Next 4 5643 por xmm1, xmm5 // GGGA 5644 por xmm2, xmm5 5645 movdqa xmm3, xmm0 // GGGG 5646 punpcklwd xmm3, xmm0 // Next 4 5647 punpckhwd xmm0, xmm0 // Last 4 5648 por xmm3, xmm5 // GGGA 5649 por xmm0, xmm5 5650 sub ecx, 16 5651 movdqa [edx], xmm1 5652 movdqa [edx + 16], xmm2 5653 movdqa [edx + 32], xmm3 5654 movdqa [edx + 48], xmm0 5655 lea edx, [edx + 64] 5656 jg convertloop 5657 5658 pop esi 5659 ret 5660 } 5661 } 5662 #endif // HAS_SOBELROW_SSE2 5663 5664 #ifdef HAS_SOBELTOPLANEROW_SSE2 5665 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 
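// paddusb saturates at 255, so per byte this is
// dst = min(255, sobelx + sobely). A scalar C sketch (illustrative
// only; the name is hypothetical):
static void SobelToPlaneRow_C_Sketch(const uint8* src_sobelx,
                                     const uint8* src_sobely,
                                     uint8* dst_y, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_y[i] = (uint8)(s > 255 ? 255 : s);
  }
}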
5666 __declspec(naked) __declspec(align(16))
5667 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5668 uint8* dst_y, int width) {
5669 __asm {
5670 push esi
5671 mov eax, [esp + 4 + 4] // src_sobelx
5672 mov esi, [esp + 4 + 8] // src_sobely
5673 mov edx, [esp + 4 + 12] // dst_y
5674 mov ecx, [esp + 4 + 16] // width
5675 sub esi, eax
5676
5677 align 4
5678 convertloop:
5679 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5680 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5681 lea eax, [eax + 16]
5682 paddusb xmm0, xmm1 // sobel = sobelx + sobely
5683 sub ecx, 16
5684 movdqa [edx], xmm0
5685 lea edx, [edx + 16]
5686 jg convertloop
5687
5688 pop esi
5689 ret
5690 }
5691 }
5692 #endif // HAS_SOBELTOPLANEROW_SSE2
5693
5694 #ifdef HAS_SOBELXYROW_SSE2
5695 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
5696 // A = 255
5697 // R = Sobel X
5698 // G = Sobel
5699 // B = Sobel Y
5700 __declspec(naked) __declspec(align(16))
5701 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
5702 uint8* dst_argb, int width) {
5703 __asm {
5704 push esi
5705 mov eax, [esp + 4 + 4] // src_sobelx
5706 mov esi, [esp + 4 + 8] // src_sobely
5707 mov edx, [esp + 4 + 12] // dst_argb
5708 mov ecx, [esp + 4 + 16] // width
5709 sub esi, eax
5710 pcmpeqb xmm5, xmm5 // alpha 255
5711
5712 align 4
5713 convertloop:
5714 movdqa xmm0, [eax] // read 16 pixels src_sobelx
5715 movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
5716 lea eax, [eax + 16]
5717 movdqa xmm2, xmm0
5718 paddusb xmm2, xmm1 // sobel = sobelx + sobely
5719 movdqa xmm3, xmm0 // XA
5720 punpcklbw xmm3, xmm5
5721 punpckhbw xmm0, xmm5
5722 movdqa xmm4, xmm1 // YS
5723 punpcklbw xmm4, xmm2
5724 punpckhbw xmm1, xmm2
5725 movdqa xmm6, xmm4 // YSXA
5726 punpcklwd xmm6, xmm3 // First 4
5727 punpckhwd xmm4, xmm3 // Next 4
5728 movdqa xmm7, xmm1 // YSXA
5729 punpcklwd xmm7, xmm0 // Next 4
5730 punpckhwd xmm1, xmm0 // Last 4
5731 sub ecx, 16
5732 movdqa [edx], xmm6
5733 movdqa [edx + 16], xmm4
5734 movdqa [edx + 32], xmm7
5735 movdqa [edx + 48], xmm1
5736 lea edx, [edx + 64]
5737 jg convertloop
5738
5739 pop esi
5740 ret
5741 }
5742 }
5743 #endif // HAS_SOBELXYROW_SSE2
5744
5745 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5746 // Consider float CumulativeSum.
5747 // Consider calling CumulativeSum one row at a time as needed.
5748 // Consider circular CumulativeSum buffer of radius * 2 + 1 height.
5749 // Convert cumulative sum for an area to an average for 1 pixel.
5750 // topleft is pointer to top left of CumulativeSum buffer for area.
5751 // botleft is pointer to bottom left of CumulativeSum buffer.
5752 // width is offset from left to right of area in CumulativeSum buffer measured
5753 // in number of ints.
5754 // area is the number of pixels in the area being averaged.
5755 // dst points to pixel to store result to.
5756 // count is number of averaged pixels to produce.
5757 // Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
5758 // aligned.
5759 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5760 int width, int area, uint8* dst,
5761 int count) {
5762 __asm {
5763 mov eax, topleft // eax topleft
5764 mov esi, botleft // esi botleft
5765 mov edx, width
5766 movd xmm5, area
5767 mov edi, dst
5768 mov ecx, count
5769 cvtdq2ps xmm5, xmm5
5770 rcpss xmm4, xmm5 // 1.0f / area
5771 pshufd xmm4, xmm4, 0
5772 sub ecx, 4
5773 jl l4b
5774
5775 cmp area, 128 // 128 pixels will not overflow 15 bits.
5776 ja l4 5777 5778 pshufd xmm5, xmm5, 0 // area 5779 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 5780 psrld xmm6, 16 5781 cvtdq2ps xmm6, xmm6 5782 addps xmm5, xmm6 // (65536.0 + area - 1) 5783 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area 5784 cvtps2dq xmm5, xmm5 // 0.16 fixed point 5785 packssdw xmm5, xmm5 // 16 bit shorts 5786 5787 // 4 pixel loop small blocks. 5788 align 4 5789 s4: 5790 // top left 5791 movdqa xmm0, [eax] 5792 movdqa xmm1, [eax + 16] 5793 movdqa xmm2, [eax + 32] 5794 movdqa xmm3, [eax + 48] 5795 5796 // - top right 5797 psubd xmm0, [eax + edx * 4] 5798 psubd xmm1, [eax + edx * 4 + 16] 5799 psubd xmm2, [eax + edx * 4 + 32] 5800 psubd xmm3, [eax + edx * 4 + 48] 5801 lea eax, [eax + 64] 5802 5803 // - bottom left 5804 psubd xmm0, [esi] 5805 psubd xmm1, [esi + 16] 5806 psubd xmm2, [esi + 32] 5807 psubd xmm3, [esi + 48] 5808 5809 // + bottom right 5810 paddd xmm0, [esi + edx * 4] 5811 paddd xmm1, [esi + edx * 4 + 16] 5812 paddd xmm2, [esi + edx * 4 + 32] 5813 paddd xmm3, [esi + edx * 4 + 48] 5814 lea esi, [esi + 64] 5815 5816 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers 5817 packssdw xmm2, xmm3 5818 5819 pmulhuw xmm0, xmm5 5820 pmulhuw xmm2, xmm5 5821 5822 packuswb xmm0, xmm2 5823 movdqu [edi], xmm0 5824 lea edi, [edi + 16] 5825 sub ecx, 4 5826 jge s4 5827 5828 jmp l4b 5829 5830 // 4 pixel loop 5831 align 4 5832 l4: 5833 // top left 5834 movdqa xmm0, [eax] 5835 movdqa xmm1, [eax + 16] 5836 movdqa xmm2, [eax + 32] 5837 movdqa xmm3, [eax + 48] 5838 5839 // - top right 5840 psubd xmm0, [eax + edx * 4] 5841 psubd xmm1, [eax + edx * 4 + 16] 5842 psubd xmm2, [eax + edx * 4 + 32] 5843 psubd xmm3, [eax + edx * 4 + 48] 5844 lea eax, [eax + 64] 5845 5846 // - bottom left 5847 psubd xmm0, [esi] 5848 psubd xmm1, [esi + 16] 5849 psubd xmm2, [esi + 32] 5850 psubd xmm3, [esi + 48] 5851 5852 // + bottom right 5853 paddd xmm0, [esi + edx * 4] 5854 paddd xmm1, [esi + edx * 4 + 16] 5855 paddd xmm2, [esi + edx * 4 + 32] 5856 paddd xmm3, [esi + edx * 4 + 48] 5857 lea esi, [esi + 64] 5858 5859 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area 5860 cvtdq2ps xmm1, xmm1 5861 mulps xmm0, xmm4 5862 mulps xmm1, xmm4 5863 cvtdq2ps xmm2, xmm2 5864 cvtdq2ps xmm3, xmm3 5865 mulps xmm2, xmm4 5866 mulps xmm3, xmm4 5867 cvtps2dq xmm0, xmm0 5868 cvtps2dq xmm1, xmm1 5869 cvtps2dq xmm2, xmm2 5870 cvtps2dq xmm3, xmm3 5871 packssdw xmm0, xmm1 5872 packssdw xmm2, xmm3 5873 packuswb xmm0, xmm2 5874 movdqu [edi], xmm0 5875 lea edi, [edi + 16] 5876 sub ecx, 4 5877 jge l4 5878 5879 l4b: 5880 add ecx, 4 - 1 5881 jl l1b 5882 5883 // 1 pixel loop 5884 align 4 5885 l1: 5886 movdqa xmm0, [eax] 5887 psubd xmm0, [eax + edx * 4] 5888 lea eax, [eax + 16] 5889 psubd xmm0, [esi] 5890 paddd xmm0, [esi + edx * 4] 5891 lea esi, [esi + 16] 5892 cvtdq2ps xmm0, xmm0 5893 mulps xmm0, xmm4 5894 cvtps2dq xmm0, xmm0 5895 packssdw xmm0, xmm0 5896 packuswb xmm0, xmm0 5897 movd dword ptr [edi], xmm0 5898 lea edi, [edi + 4] 5899 sub ecx, 1 5900 jge l1 5901 l1b: 5902 } 5903 } 5904 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 5905 5906 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 5907 // Creates a table of cumulative sums where each value is a sum of all values 5908 // above and to the left of the value. 
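// Per channel: cumsum[x] = previous_cumsum[x] + sum of row[0..x].
// A scalar C sketch of the recurrence (illustrative only; the name is
// hypothetical):
static void ComputeCumulativeSumRow_C_Sketch(const uint8* row, int32* cumsum,
                                             const int32* previous_cumsum,
                                             int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}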
5909 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 5910 const int32* previous_cumsum, int width) { 5911 __asm { 5912 mov eax, row 5913 mov edx, cumsum 5914 mov esi, previous_cumsum 5915 mov ecx, width 5916 pxor xmm0, xmm0 5917 pxor xmm1, xmm1 5918 5919 sub ecx, 4 5920 jl l4b 5921 test edx, 15 5922 jne l4b 5923 5924 // 4 pixel loop 5925 align 4 5926 l4: 5927 movdqu xmm2, [eax] // 4 argb pixels 16 bytes. 5928 lea eax, [eax + 16] 5929 movdqa xmm4, xmm2 5930 5931 punpcklbw xmm2, xmm1 5932 movdqa xmm3, xmm2 5933 punpcklwd xmm2, xmm1 5934 punpckhwd xmm3, xmm1 5935 5936 punpckhbw xmm4, xmm1 5937 movdqa xmm5, xmm4 5938 punpcklwd xmm4, xmm1 5939 punpckhwd xmm5, xmm1 5940 5941 paddd xmm0, xmm2 5942 movdqa xmm2, [esi] // previous row above. 5943 paddd xmm2, xmm0 5944 5945 paddd xmm0, xmm3 5946 movdqa xmm3, [esi + 16] 5947 paddd xmm3, xmm0 5948 5949 paddd xmm0, xmm4 5950 movdqa xmm4, [esi + 32] 5951 paddd xmm4, xmm0 5952 5953 paddd xmm0, xmm5 5954 movdqa xmm5, [esi + 48] 5955 lea esi, [esi + 64] 5956 paddd xmm5, xmm0 5957 5958 movdqa [edx], xmm2 5959 movdqa [edx + 16], xmm3 5960 movdqa [edx + 32], xmm4 5961 movdqa [edx + 48], xmm5 5962 5963 lea edx, [edx + 64] 5964 sub ecx, 4 5965 jge l4 5966 5967 l4b: 5968 add ecx, 4 - 1 5969 jl l1b 5970 5971 // 1 pixel loop 5972 align 4 5973 l1: 5974 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. 5975 lea eax, [eax + 4] 5976 punpcklbw xmm2, xmm1 5977 punpcklwd xmm2, xmm1 5978 paddd xmm0, xmm2 5979 movdqu xmm2, [esi] 5980 lea esi, [esi + 16] 5981 paddd xmm2, xmm0 5982 movdqu [edx], xmm2 5983 lea edx, [edx + 16] 5984 sub ecx, 1 5985 jge l1 5986 5987 l1b: 5988 } 5989 } 5990 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 5991 5992 #ifdef HAS_ARGBAFFINEROW_SSE2 5993 // Copy ARGB pixels from source image with slope to a row of destination. 5994 __declspec(naked) __declspec(align(16)) 5995 LIBYUV_API 5996 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 5997 uint8* dst_argb, const float* uv_dudv, int width) { 5998 __asm { 5999 push esi 6000 push edi 6001 mov eax, [esp + 12] // src_argb 6002 mov esi, [esp + 16] // stride 6003 mov edx, [esp + 20] // dst_argb 6004 mov ecx, [esp + 24] // pointer to uv_dudv 6005 movq xmm2, qword ptr [ecx] // uv 6006 movq xmm7, qword ptr [ecx + 8] // dudv 6007 mov ecx, [esp + 28] // width 6008 shl esi, 16 // 4, stride 6009 add esi, 4 6010 movd xmm5, esi 6011 sub ecx, 4 6012 jl l4b 6013 6014 // setup for 4 pixel loop 6015 pshufd xmm7, xmm7, 0x44 // dup dudv 6016 pshufd xmm5, xmm5, 0 // dup 4, stride 6017 movdqa xmm0, xmm2 // x0, y0, x1, y1 6018 addps xmm0, xmm7 6019 movlhps xmm2, xmm0 6020 movdqa xmm4, xmm7 6021 addps xmm4, xmm4 // dudv *= 2 6022 movdqa xmm3, xmm2 // x2, y2, x3, y3 6023 addps xmm3, xmm4 6024 addps xmm4, xmm4 // dudv *= 4 6025 6026 // 4 pixel loop 6027 align 4 6028 l4: 6029 cvttps2dq xmm0, xmm2 // x, y float to int first 2 6030 cvttps2dq xmm1, xmm3 // x, y float to int next 2 6031 packssdw xmm0, xmm1 // x, y as 8 shorts 6032 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
6033 movd esi, xmm0 6034 pshufd xmm0, xmm0, 0x39 // shift right 6035 movd edi, xmm0 6036 pshufd xmm0, xmm0, 0x39 // shift right 6037 movd xmm1, [eax + esi] // read pixel 0 6038 movd xmm6, [eax + edi] // read pixel 1 6039 punpckldq xmm1, xmm6 // combine pixel 0 and 1 6040 addps xmm2, xmm4 // x, y += dx, dy first 2 6041 movq qword ptr [edx], xmm1 6042 movd esi, xmm0 6043 pshufd xmm0, xmm0, 0x39 // shift right 6044 movd edi, xmm0 6045 movd xmm6, [eax + esi] // read pixel 2 6046 movd xmm0, [eax + edi] // read pixel 3 6047 punpckldq xmm6, xmm0 // combine pixel 2 and 3 6048 addps xmm3, xmm4 // x, y += dx, dy next 2 6049 sub ecx, 4 6050 movq qword ptr 8[edx], xmm6 6051 lea edx, [edx + 16] 6052 jge l4 6053 6054 l4b: 6055 add ecx, 4 - 1 6056 jl l1b 6057 6058 // 1 pixel loop 6059 align 4 6060 l1: 6061 cvttps2dq xmm0, xmm2 // x, y float to int 6062 packssdw xmm0, xmm0 // x, y as shorts 6063 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride 6064 addps xmm2, xmm7 // x, y += dx, dy 6065 movd esi, xmm0 6066 movd xmm0, [eax + esi] // copy a pixel 6067 sub ecx, 1 6068 movd [edx], xmm0 6069 lea edx, [edx + 4] 6070 jge l1 6071 l1b: 6072 pop edi 6073 pop esi 6074 ret 6075 } 6076 } 6077 #endif // HAS_ARGBAFFINEROW_SSE2 6078 6079 #ifdef HAS_INTERPOLATEROW_AVX2 6080 // Bilinear filter 16x2 -> 16x1 6081 __declspec(naked) __declspec(align(16)) 6082 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, 6083 ptrdiff_t src_stride, int dst_width, 6084 int source_y_fraction) { 6085 __asm { 6086 push esi 6087 push edi 6088 mov edi, [esp + 8 + 4] // dst_ptr 6089 mov esi, [esp + 8 + 8] // src_ptr 6090 mov edx, [esp + 8 + 12] // src_stride 6091 mov ecx, [esp + 8 + 16] // dst_width 6092 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 6093 shr eax, 1 6094 // Dispatch to specialized filters if applicable. 6095 cmp eax, 0 6096 je xloop100 // 0 / 128. Blend 100 / 0. 6097 sub edi, esi 6098 cmp eax, 32 6099 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. 6100 cmp eax, 64 6101 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 6102 cmp eax, 96 6103 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 6104 6105 vmovd xmm0, eax // high fraction 0..127 6106 neg eax 6107 add eax, 128 6108 vmovd xmm5, eax // low fraction 128..1 6109 vpunpcklbw xmm5, xmm5, xmm0 6110 vpunpcklwd xmm5, xmm5, xmm5 6111 vpxor ymm0, ymm0, ymm0 6112 vpermd ymm5, ymm0, ymm5 6113 6114 align 4 6115 xloop: 6116 vmovdqu ymm0, [esi] 6117 vmovdqu ymm2, [esi + edx] 6118 vpunpckhbw ymm1, ymm0, ymm2 // mutates 6119 vpunpcklbw ymm0, ymm0, ymm2 // mutates 6120 vpmaddubsw ymm0, ymm0, ymm5 6121 vpmaddubsw ymm1, ymm1, ymm5 6122 vpsrlw ymm0, ymm0, 7 6123 vpsrlw ymm1, ymm1, 7 6124 vpackuswb ymm0, ymm0, ymm1 // unmutates 6125 sub ecx, 32 6126 vmovdqu [esi + edi], ymm0 6127 lea esi, [esi + 32] 6128 jg xloop 6129 jmp xloop99 6130 6131 // Blend 25 / 75. 6132 align 4 6133 xloop25: 6134 vmovdqu ymm0, [esi] 6135 vpavgb ymm0, ymm0, [esi + edx] 6136 vpavgb ymm0, ymm0, [esi + edx] 6137 sub ecx, 32 6138 vmovdqu [esi + edi], ymm0 6139 lea esi, [esi + 32] 6140 jg xloop25 6141 jmp xloop99 6142 6143 // Blend 50 / 50. 6144 align 4 6145 xloop50: 6146 vmovdqu ymm0, [esi] 6147 vpavgb ymm0, ymm0, [esi + edx] 6148 sub ecx, 32 6149 vmovdqu [esi + edi], ymm0 6150 lea esi, [esi + 32] 6151 jg xloop50 6152 jmp xloop99 6153 6154 // Blend 75 / 25. 
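// Averaging toward row0 twice gives roughly (3 * row0 + row1) / 4,
// with vpavgb rounding up at each step.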
6155 align 4 6156 xloop75: 6157 vmovdqu ymm0, [esi + edx] 6158 vpavgb ymm0, ymm0, [esi] 6159 vpavgb ymm0, ymm0, [esi] 6160 sub ecx, 32 6161 vmovdqu [esi + edi], ymm0 6162 lea esi, [esi + 32] 6163 jg xloop75 6164 jmp xloop99 6165 6166 // Blend 100 / 0 - Copy row unchanged. 6167 align 4 6168 xloop100: 6169 rep movsb 6170 6171 xloop99: 6172 pop edi 6173 pop esi 6174 vzeroupper 6175 ret 6176 } 6177 } 6178 #endif // HAS_INTERPOLATEROW_AVX2 6179 6180 #ifdef HAS_INTERPOLATEROW_SSSE3 6181 // Bilinear filter 16x2 -> 16x1 6182 __declspec(naked) __declspec(align(16)) 6183 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 6184 ptrdiff_t src_stride, int dst_width, 6185 int source_y_fraction) { 6186 __asm { 6187 push esi 6188 push edi 6189 mov edi, [esp + 8 + 4] // dst_ptr 6190 mov esi, [esp + 8 + 8] // src_ptr 6191 mov edx, [esp + 8 + 12] // src_stride 6192 mov ecx, [esp + 8 + 16] // dst_width 6193 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 6194 sub edi, esi 6195 shr eax, 1 6196 // Dispatch to specialized filters if applicable. 6197 cmp eax, 0 6198 je xloop100 // 0 / 128. Blend 100 / 0. 6199 cmp eax, 32 6200 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. 6201 cmp eax, 64 6202 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 6203 cmp eax, 96 6204 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 6205 6206 movd xmm0, eax // high fraction 0..127 6207 neg eax 6208 add eax, 128 6209 movd xmm5, eax // low fraction 128..1 6210 punpcklbw xmm5, xmm0 6211 punpcklwd xmm5, xmm5 6212 pshufd xmm5, xmm5, 0 6213 6214 align 4 6215 xloop: 6216 movdqa xmm0, [esi] 6217 movdqa xmm2, [esi + edx] 6218 movdqa xmm1, xmm0 6219 punpcklbw xmm0, xmm2 6220 punpckhbw xmm1, xmm2 6221 pmaddubsw xmm0, xmm5 6222 pmaddubsw xmm1, xmm5 6223 psrlw xmm0, 7 6224 psrlw xmm1, 7 6225 packuswb xmm0, xmm1 6226 sub ecx, 16 6227 movdqa [esi + edi], xmm0 6228 lea esi, [esi + 16] 6229 jg xloop 6230 jmp xloop99 6231 6232 // Blend 25 / 75. 6233 align 4 6234 xloop25: 6235 movdqa xmm0, [esi] 6236 movdqa xmm1, [esi + edx] 6237 pavgb xmm0, xmm1 6238 pavgb xmm0, xmm1 6239 sub ecx, 16 6240 movdqa [esi + edi], xmm0 6241 lea esi, [esi + 16] 6242 jg xloop25 6243 jmp xloop99 6244 6245 // Blend 50 / 50. 6246 align 4 6247 xloop50: 6248 movdqa xmm0, [esi] 6249 movdqa xmm1, [esi + edx] 6250 pavgb xmm0, xmm1 6251 sub ecx, 16 6252 movdqa [esi + edi], xmm0 6253 lea esi, [esi + 16] 6254 jg xloop50 6255 jmp xloop99 6256 6257 // Blend 75 / 25. 6258 align 4 6259 xloop75: 6260 movdqa xmm1, [esi] 6261 movdqa xmm0, [esi + edx] 6262 pavgb xmm0, xmm1 6263 pavgb xmm0, xmm1 6264 sub ecx, 16 6265 movdqa [esi + edi], xmm0 6266 lea esi, [esi + 16] 6267 jg xloop75 6268 jmp xloop99 6269 6270 // Blend 100 / 0 - Copy row unchanged. 6271 align 4 6272 xloop100: 6273 movdqa xmm0, [esi] 6274 sub ecx, 16 6275 movdqa [esi + edi], xmm0 6276 lea esi, [esi + 16] 6277 jg xloop100 6278 6279 xloop99: 6280 pop edi 6281 pop esi 6282 ret 6283 } 6284 } 6285 #endif // HAS_INTERPOLATEROW_SSSE3 6286 6287 #ifdef HAS_INTERPOLATEROW_SSE2 6288 // Bilinear filter 16x2 -> 16x1 6289 __declspec(naked) __declspec(align(16)) 6290 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, 6291 ptrdiff_t src_stride, int dst_width, 6292 int source_y_fraction) { 6293 __asm { 6294 push esi 6295 push edi 6296 mov edi, [esp + 8 + 4] // dst_ptr 6297 mov esi, [esp + 8 + 8] // src_ptr 6298 mov edx, [esp + 8 + 12] // src_stride 6299 mov ecx, [esp + 8 + 16] // dst_width 6300 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 6301 sub edi, esi 6302 // Dispatch to specialized filters if applicable. 
6303 cmp eax, 0 6304 je xloop100 // 0 / 256. Blend 100 / 0. 6305 cmp eax, 64 6306 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. 6307 cmp eax, 128 6308 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 6309 cmp eax, 192 6310 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 6311 6312 movd xmm5, eax // xmm5 = y fraction 6313 punpcklbw xmm5, xmm5 6314 psrlw xmm5, 1 6315 punpcklwd xmm5, xmm5 6316 punpckldq xmm5, xmm5 6317 punpcklqdq xmm5, xmm5 6318 pxor xmm4, xmm4 6319 6320 align 4 6321 xloop: 6322 movdqa xmm0, [esi] // row0 6323 movdqa xmm2, [esi + edx] // row1 6324 movdqa xmm1, xmm0 6325 movdqa xmm3, xmm2 6326 punpcklbw xmm2, xmm4 6327 punpckhbw xmm3, xmm4 6328 punpcklbw xmm0, xmm4 6329 punpckhbw xmm1, xmm4 6330 psubw xmm2, xmm0 // row1 - row0 6331 psubw xmm3, xmm1 6332 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 6333 paddw xmm3, xmm3 6334 pmulhw xmm2, xmm5 // scale diff 6335 pmulhw xmm3, xmm5 6336 paddw xmm0, xmm2 // sum rows 6337 paddw xmm1, xmm3 6338 packuswb xmm0, xmm1 6339 sub ecx, 16 6340 movdqa [esi + edi], xmm0 6341 lea esi, [esi + 16] 6342 jg xloop 6343 jmp xloop99 6344 6345 // Blend 25 / 75. 6346 align 4 6347 xloop25: 6348 movdqa xmm0, [esi] 6349 movdqa xmm1, [esi + edx] 6350 pavgb xmm0, xmm1 6351 pavgb xmm0, xmm1 6352 sub ecx, 16 6353 movdqa [esi + edi], xmm0 6354 lea esi, [esi + 16] 6355 jg xloop25 6356 jmp xloop99 6357 6358 // Blend 50 / 50. 6359 align 4 6360 xloop50: 6361 movdqa xmm0, [esi] 6362 movdqa xmm1, [esi + edx] 6363 pavgb xmm0, xmm1 6364 sub ecx, 16 6365 movdqa [esi + edi], xmm0 6366 lea esi, [esi + 16] 6367 jg xloop50 6368 jmp xloop99 6369 6370 // Blend 75 / 25. 6371 align 4 6372 xloop75: 6373 movdqa xmm1, [esi] 6374 movdqa xmm0, [esi + edx] 6375 pavgb xmm0, xmm1 6376 pavgb xmm0, xmm1 6377 sub ecx, 16 6378 movdqa [esi + edi], xmm0 6379 lea esi, [esi + 16] 6380 jg xloop75 6381 jmp xloop99 6382 6383 // Blend 100 / 0 - Copy row unchanged. 6384 align 4 6385 xloop100: 6386 movdqa xmm0, [esi] 6387 sub ecx, 16 6388 movdqa [esi + edi], xmm0 6389 lea esi, [esi + 16] 6390 jg xloop100 6391 6392 xloop99: 6393 pop edi 6394 pop esi 6395 ret 6396 } 6397 } 6398 #endif // HAS_INTERPOLATEROW_SSE2 6399 6400 // Bilinear filter 16x2 -> 16x1 6401 __declspec(naked) __declspec(align(16)) 6402 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 6403 ptrdiff_t src_stride, int dst_width, 6404 int source_y_fraction) { 6405 __asm { 6406 push esi 6407 push edi 6408 mov edi, [esp + 8 + 4] // dst_ptr 6409 mov esi, [esp + 8 + 8] // src_ptr 6410 mov edx, [esp + 8 + 12] // src_stride 6411 mov ecx, [esp + 8 + 16] // dst_width 6412 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 6413 sub edi, esi 6414 shr eax, 1 6415 // Dispatch to specialized filters if applicable. 6416 cmp eax, 0 6417 je xloop100 // 0 / 128. Blend 100 / 0. 6418 cmp eax, 32 6419 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. 6420 cmp eax, 64 6421 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 6422 cmp eax, 96 6423 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 
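// General case: interleave (128 - f) and f as bytes so that pmaddubsw
// computes row0 * (128 - f) + row1 * f per byte pair; psrlw 7 then
// scales the 7 bit fraction back to 8 bit pixels.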
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    shr        eax, 1
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 128.  Blend 100 / 0.
    cmp        eax, 32
    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
    cmp        eax, 64
    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
    cmp        eax, 96
    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.

    movd       xmm0, eax  // high fraction 0..127
    neg        eax
    add        eax, 128
    movd       xmm5, eax  // low fraction 128..1
    punpcklbw  xmm5, xmm0
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0

  align 4
  xloop:
    movdqu     xmm0, [esi]
    movdqu     xmm2, [esi + edx]
    movdqu     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    pmaddubsw  xmm0, xmm5
    pmaddubsw  xmm1, xmm5
    psrlw      xmm0, 7
    psrlw      xmm1, 7
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop
    jmp        xloop99

    // Blend 25 / 75.
  align 4
  xloop25:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop25
    jmp        xloop99

    // Blend 50 / 50.
  align 4
  xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop50
    jmp        xloop99

    // Blend 75 / 25.
  align 4
  xloop75:
    movdqu     xmm1, [esi]
    movdqu     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop75
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
  align 4
  xloop100:
    movdqu     xmm0, [esi]
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
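// How a caller might pick between the aligned and unaligned variants is a
// judgment call outside this file; a plausible sketch, assuming the
// IS_ALIGNED pointer/stride check used elsewhere in libyuv:
//
//   void (*InterpolateRow)(uint8*, const uint8*, ptrdiff_t, int, int) =
//       (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
//        IS_ALIGNED(dst_ptr, 16))
//           ? InterpolateRow_SSSE3
//           : InterpolateRow_Unaligned_SSSE3;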
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    cmp        eax, 64
    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
    cmp        eax, 128
    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
    cmp        eax, 192
    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.

    movd       xmm5, eax            // xmm5 = y fraction
    punpcklbw  xmm5, xmm5
    psrlw      xmm5, 1
    punpcklwd  xmm5, xmm5
    punpckldq  xmm5, xmm5
    punpcklqdq xmm5, xmm5
    pxor       xmm4, xmm4

  align 4
  xloop:
    movdqu     xmm0, [esi]          // row0
    movdqu     xmm2, [esi + edx]    // row1
    movdqu     xmm1, xmm0
    movdqu     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    psubw      xmm2, xmm0           // row1 - row0
    psubw      xmm3, xmm1
    paddw      xmm2, xmm2           // 9 bits * 15 bits = 8.16
    paddw      xmm3, xmm3
    pmulhw     xmm2, xmm5           // scale diff
    pmulhw     xmm3, xmm5
    paddw      xmm0, xmm2           // sum rows
    paddw      xmm1, xmm3
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop
    jmp        xloop99

    // Blend 25 / 75.
  align 4
  xloop25:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop25
    jmp        xloop99

    // Blend 50 / 50.
  align 4
  xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop50
    jmp        xloop99

    // Blend 75 / 25.
  align 4
  xloop75:
    movdqu     xmm1, [esi]
    movdqu     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop75
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
  align 4
  xloop100:
    movdqu     xmm0, [esi]
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_INTERPOLATEROW_SSE2

__declspec(naked) __declspec(align(16))
void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_uv
    mov        edx, [esp + 4 + 8]   // src_uv_stride
    mov        edi, [esp + 4 + 12]  // dst_uv
    mov        ecx, [esp + 4 + 16]  // pix
    sub        edi, eax

  align 4
  convertloop:
    movdqa     xmm0, [eax]
    pavgb      xmm0, [eax + edx]
    sub        ecx, 16
    movdqa     [eax + edi], xmm0
    lea        eax, [eax + 16]
    jg         convertloop
    pop        edi
    ret
  }
}

#ifdef HAS_HALFROW_AVX2
__declspec(naked) __declspec(align(16))
void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_uv
    mov        edx, [esp + 4 + 8]   // src_uv_stride
    mov        edi, [esp + 4 + 12]  // dst_uv
    mov        ecx, [esp + 4 + 16]  // pix
    sub        edi, eax

  align 4
  convertloop:
    vmovdqu    ymm0, [eax]
    vpavgb     ymm0, ymm0, [eax + edx]
    sub        ecx, 32
    vmovdqu    [eax + edi], ymm0
    lea        eax, [eax + 32]
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
#endif  // HAS_HALFROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_bayer
    movd       xmm5, [esp + 12]  // selector
    mov        ecx, [esp + 16]   // pix
    pshufd     xmm5, xmm5, 0

  align 4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    punpckldq  xmm0, xmm1
    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop
    ret
  }
}
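// A minimal scalar sketch of the selection the pshufb loop above performs,
// derived from the assembly (name illustrative): each of the 4 selector
// bytes is a byte offset into a group of 4 ARGB pixels (16 bytes), so 4
// Bayer bytes are produced per 4 source pixels.
static void ARGBToBayerRow_C_Sketch(const uint8* src_argb, uint8* dst_bayer,
                                    uint32 selector, int pix) {
  for (int x = 0; x < pix; x += 4) {
    dst_bayer[0] = src_argb[(selector >> 0) & 0xff];
    dst_bayer[1] = src_argb[(selector >> 8) & 0xff];
    dst_bayer[2] = src_argb[(selector >> 16) & 0xff];
    dst_bayer[3] = src_argb[(selector >> 24) & 0xff];
    src_argb += 16;
    dst_bayer += 4;
  }
}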
// Specialized ARGB to Bayer that just isolates G channel.
__declspec(naked) __declspec(align(16))
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_bayer
                                // selector
    mov        ecx, [esp + 16]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0x000000ff
    psrld      xmm5, 24

  align 4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm0, 8  // Move green to bottom.
    psrld      xmm1, 8
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packssdw   xmm0, xmm1
    packuswb   xmm0, xmm1
    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop
    ret
  }
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    movdqa     xmm5, [ecx]
    mov        ecx, [esp + 16]  // pix

  align 4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    sub        ecx, 8
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    movdqa     xmm5, [ecx]
    mov        ecx, [esp + 16]  // pix

  align 4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    sub        ecx, 8
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop
    ret
  }
}

#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
    mov        ecx, [esp + 16]  // pix

  align 4
  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpshufb    ymm0, ymm0, ymm5
    vpshufb    ymm1, ymm1, ymm5
    sub        ecx, 16
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    jg         wloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2
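// A minimal scalar sketch of the channel shuffle the SSSE3/AVX2 paths above
// implement (name illustrative), assuming the 16-byte shuffler repeats the
// same 4-byte pattern per pixel, as it does for the conversions named above:
// each selector byte gives the source channel index for that output channel.
static void ARGBShuffleRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}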
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
    push       ebx
    push       esi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        edx, [esp + 8 + 8]   // dst_argb
    mov        esi, [esp + 8 + 12]  // shuffler
    mov        ecx, [esp + 8 + 16]  // pix
    pxor       xmm5, xmm5

    mov        ebx, [esi]           // shuffler
    cmp        ebx, 0x03000102
    je         shuf_3012
    cmp        ebx, 0x00010203
    je         shuf_0123
    cmp        ebx, 0x00030201
    je         shuf_0321
    cmp        ebx, 0x02010003
    je         shuf_2103

    // TODO(fbarchard): Use one source pointer and 3 offsets.
  shuf_any1:
    movzx      ebx, byte ptr [esi]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx], bl
    movzx      ebx, byte ptr [esi + 1]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 1], bl
    movzx      ebx, byte ptr [esi + 2]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 2], bl
    movzx      ebx, byte ptr [esi + 3]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 3], bl
    lea        eax, [eax + 4]
    lea        edx, [edx + 4]
    sub        ecx, 1
    jg         shuf_any1
    jmp        shuf99

  align 4
  shuf_0123:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 01Bh  // 1B = 00011011 = 0x0123 = BGRAToARGB
    pshuflw    xmm0, xmm0, 01Bh
    pshufhw    xmm1, xmm1, 01Bh
    pshuflw    xmm1, xmm1, 01Bh
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_0123
    jmp        shuf99

  align 4
  shuf_0321:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 039h  // 39 = 00111001 = 0x0321 = RGBAToARGB
    pshuflw    xmm0, xmm0, 039h
    pshufhw    xmm1, xmm1, 039h
    pshuflw    xmm1, xmm1, 039h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_0321
    jmp        shuf99

  align 4
  shuf_2103:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 093h  // 93 = 10010011 = 0x2103 = ARGBToRGBA
    pshuflw    xmm0, xmm0, 093h
    pshufhw    xmm1, xmm1, 093h
    pshuflw    xmm1, xmm1, 093h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_2103
    jmp        shuf99

  align 4
  shuf_3012:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 0C6h  // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw    xmm0, xmm0, 0C6h
    pshufhw    xmm1, xmm1, 0C6h
    pshuflw    xmm1, xmm1, 0C6h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_3012

  shuf99:
    pop        esi
    pop        ebx
    ret
  }
}
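// The SSE2 fallback above recognizes four common selectors and maps each to a
// pshuflw/pshufhw immediate.  A sketch of how such an immediate packs four
// 2-bit source indices, least significant field first:
//
//   imm8 = (idx3 << 6) | (idx2 << 4) | (idx1 << 2) | idx0
//   0x1B = 00 01 10 11b -> out0 = in3, out1 = in2, out2 = in1, out3 = in0
//
// i.e. 0x1B reverses the four 16-bit lanes, which after the bytes are
// unpacked to words reverses the four channels of a pixel (BGRA -> ARGB).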
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0 Y2U1Y3V1 Y4U2Y5V2 ...

// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1 ...

__declspec(naked) __declspec(align(16))
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y
    mov        esi, [esp + 8 + 8]   // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi

  align 4
  convertloop:
    movq       xmm2, qword ptr [esi]        // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3                   // UV
    movdqu     xmm0, [eax]                  // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2                   // YUYV
    punpckhbw  xmm1, xmm2
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y
    mov        esi, [esp + 8 + 8]   // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi

  align 4
  convertloop:
    movq       xmm2, qword ptr [esi]        // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3                   // UV
    movdqu     xmm0, [eax]                  // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0                   // UYVY
    punpckhbw  xmm2, xmm0
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
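// A minimal scalar sketch of the YUY2 packing above (name illustrative):
// two Y samples share one U and one V, giving the Y0 U0 Y1 V0 byte order.
// Swapping to U0 Y0 V0 Y1 gives the UYVY variant.
static void I422ToYUY2Row_C_Sketch(const uint8* src_y, const uint8* src_u,
                                   const uint8* src_v, uint8* dst_frame,
                                   int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}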
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked) __declspec(align(16))
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* src_argb */
    mov        edx, [esp + 4 + 8]   /* dst_argb */
    mov        esi, [esp + 4 + 12]  /* poly */
    mov        ecx, [esp + 4 + 16]  /* width */
    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
  align 4
  convertloop:
//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq       xmm0, qword ptr [eax]  // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3  // pixel 0
    punpckhwd  xmm4, xmm3  // pixel 1
    cvtdq2ps   xmm0, xmm0  // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0  // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]  // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]  // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1  // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2  // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]  // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]  // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2  // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1  // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4
    packuswb   xmm0, xmm0
    sub        ecx, 2
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* poly */
    vbroadcastf128 ymm4, [ecx]       // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov        ecx, [esp + 16]  /* width */

    // 2 pixel loop.
  align 4
  convertloop:
    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea         eax, [eax + 8]
    vcvtdq2ps   ymm0, ymm0        // X 8 floats
    vmulps      ymm2, ymm0, ymm0  // X * X
    vmulps      ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq  ymm0, ymm0
    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    sub         ecx, 2
    vmovq       qword ptr [edx], xmm0
    lea         edx, [edx + 8]
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
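// A minimal scalar sketch of the cubic remap both paths above compute per
// channel (name illustrative).  poly holds C0..C3 as four channel-wise
// floats each; the clamp mirrors the unsigned saturation of the pack
// instructions and the cast truncates like cvttps2dq.
static void ARGBPolynomialRow_C_Sketch(const uint8* src_argb, uint8* dst_argb,
                                       const float* poly, int width) {
  for (int i = 0; i < width * 4; ++i) {
    float x = (float)src_argb[i];
    int c = i & 3;  // channel within the pixel: B, G, R or A.
    float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
              poly[c + 12] * x * x * x;
    dst_argb[i] = (uint8)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
  }
}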
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked) __declspec(align(16))
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
  align 4
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked) __declspec(align(16))
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
  align 4
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_RGBCOLORTABLEROW_X86
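// A minimal scalar sketch of the in-place table remap above (name
// illustrative): each channel indexes its own interleaved 256-entry table.
// The RGB variant is identical except that it leaves alpha untouched.
static void ARGBColorTableRow_C_Sketch(uint8* dst_argb,
                                       const uint8* table_argb, int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}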
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform ARGB pixels with luma table.
__declspec(naked) __declspec(align(16))
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   /* src_argb */
    mov        edi, [esp + 8 + 8]   /* dst_argb */
    mov        ecx, [esp + 8 + 12]  /* width */
    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
    pshufd     xmm2, xmm2, 0
    pshufd     xmm3, xmm3, 0
    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5

    // 4 pixel loop.
  align 4
  convertloop:
    movdqu     xmm0, qword ptr [eax]  // generate luma ptr
    pmaddubsw  xmm0, xmm3
    phaddw     xmm0, xmm0
    pand       xmm0, xmm4  // mask out low bits
    punpcklwd  xmm0, xmm5
    paddd      xmm0, xmm2  // add table base
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi], dl
    movzx      edx, byte ptr [eax + 1]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 1], dl
    movzx      edx, byte ptr [eax + 2]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 2], dl
    movzx      edx, byte ptr [eax + 3]  // copy alpha.
    mov        byte ptr [edi + 3], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 4]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 4], dl
    movzx      edx, byte ptr [eax + 5]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 5], dl
    movzx      edx, byte ptr [eax + 6]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 6], dl
    movzx      edx, byte ptr [eax + 7]  // copy alpha.
    mov        byte ptr [edi + 7], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 8]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 8], dl
    movzx      edx, byte ptr [eax + 9]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 9], dl
    movzx      edx, byte ptr [eax + 10]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 10], dl
    movzx      edx, byte ptr [eax + 11]  // copy alpha.
    mov        byte ptr [edi + 11], dl

    movd       esi, xmm0

    movzx      edx, byte ptr [eax + 12]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 12], dl
    movzx      edx, byte ptr [eax + 13]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 13], dl
    movzx      edx, byte ptr [eax + 14]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 14], dl
    movzx      edx, byte ptr [eax + 15]  // copy alpha.
    mov        byte ptr [edi + 15], dl

    sub        ecx, 4
    lea        eax, [eax + 16]
    lea        edi, [edi + 16]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif