/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#if defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C.
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)

#define YG 74  /* (int8)(1.164 * 64 + 0.5) */

#define UB 127  /* min(127, (int8)(2.018 * 64)) */
#define UG -25  /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52  /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102  /* (int8)(1.596 * 64 + 0.5) */

// Bias
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

static const vec8 kUVToB = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};

static const vec8 kUVToR = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};

static const vec8 kUVToG = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};

static const vec8 kVUToB = {
  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
};

static const vec8 kVUToR = {
  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
};

static const vec8 kVUToG = {
  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
};

static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
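// A minimal scalar sketch (not part of libyuv; the function names are ours)
// of the fixed-point math the constants above encode and the SIMD kernels
// below implement: 6-bit coefficients, with the 128 bias of U and V folded
// into the BB/BG/BR terms, a final arithmetic >> 6, and unsigned saturation.
static uint8 Clamp255_Sketch(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvPixel_Sketch(uint8 y, uint8 u, uint8 v,
                            uint8* b, uint8* g, uint8* r) {
  int y1 = ((int)y - 16) * YG;  // Y contribution, shared by all 3 channels.
  *b = Clamp255_Sketch((UB * u + VB * v - BB + y1) >> 6);
  *g = Clamp255_Sketch((UG * u + VG * v - BG + y1) >> 6);
  *r = Clamp255_Sketch((UR * u + VR * v - BR + y1) >> 6);
}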
// 64 bit
#if defined(_M_X64)

// Aligned destination version.
__declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm3;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const __m128i xmm4 = _mm_setzero_si128();
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;

  while (width > 0) {
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
    xmm1 = _mm_load_si128(&xmm0);
    xmm2 = _mm_load_si128(&xmm0);
    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
    xmm0 = _mm_adds_epi16(xmm0, xmm3);
    xmm1 = _mm_adds_epi16(xmm1, xmm3);
    xmm2 = _mm_adds_epi16(xmm2, xmm3);
    xmm0 = _mm_srai_epi16(xmm0, 6);
    xmm1 = _mm_srai_epi16(xmm1, 6);
    xmm2 = _mm_srai_epi16(xmm2, 6);
    xmm0 = _mm_packus_epi16(xmm0, xmm0);
    xmm1 = _mm_packus_epi16(xmm1, xmm1);
    xmm2 = _mm_packus_epi16(xmm2, xmm2);
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
    xmm1 = _mm_load_si128(&xmm0);
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);

    _mm_store_si128((__m128i*)dst_argb, xmm0);
    _mm_store_si128((__m128i*)(dst_argb + 16), xmm1);

    y_buf += 8;
    u_buf += 4;
    dst_argb += 32;
    width -= 8;
  }
}
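// A hypothetical caller sketch (the function name and stride handling are
// ours, not a libyuv API) showing how the aligned row kernel above might be
// driven across an I420 frame. Assumes width is a multiple of 8, height is
// even, and every destination row is 16-byte aligned as _mm_store_si128
// requires.
static void I420ToARGBFrame_Sketch(const uint8* src_y, int y_stride,
                                   const uint8* src_u, int u_stride,
                                   const uint8* src_v, int v_stride,
                                   uint8* dst_argb, int argb_stride,
                                   int width, int height) {
  int row;
  for (row = 0; row < height; ++row) {
    I422ToARGBRow_SSSE3(src_y, src_u, src_v, dst_argb, width);
    src_y += y_stride;
    dst_argb += argb_stride;
    if (row & 1) {  // 4:2:0: chroma rows advance every second luma row.
      src_u += u_stride;
      src_v += v_stride;
    }
  }
}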
// Unaligned destination version.
void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
  __m128i xmm0, xmm1, xmm2, xmm3;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const __m128i xmm4 = _mm_setzero_si128();
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;

  while (width > 0) {
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
    xmm1 = _mm_load_si128(&xmm0);
    xmm2 = _mm_load_si128(&xmm0);
    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
    xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
    xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
    xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
    xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
    xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
    xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
    xmm0 = _mm_adds_epi16(xmm0, xmm3);
    xmm1 = _mm_adds_epi16(xmm1, xmm3);
    xmm2 = _mm_adds_epi16(xmm2, xmm3);
    xmm0 = _mm_srai_epi16(xmm0, 6);
    xmm1 = _mm_srai_epi16(xmm1, 6);
    xmm2 = _mm_srai_epi16(xmm2, 6);
    xmm0 = _mm_packus_epi16(xmm0, xmm0);
    xmm1 = _mm_packus_epi16(xmm1, xmm1);
    xmm2 = _mm_packus_epi16(xmm2, xmm2);
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
    xmm1 = _mm_load_si128(&xmm0);
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);

    _mm_storeu_si128((__m128i*)dst_argb, xmm0);
    _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1);

    y_buf += 8;
    u_buf += 4;
    dst_argb += 32;
    width -= 8;
  }
}

// 32 bit
#else  // defined(_M_X64)

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPEG full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};
// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW for I422ToRAW. First 8 + next 4
static const uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
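// Scalar references (a sketch; names are ours, derived from the kARGBTo*
// constants above) for the row kernels that follow. The real kernels average
// 2x2 pixel blocks before the U/V step; these show the per-pixel arithmetic
// only, and assume arithmetic right shift of negative ints, as psraw does.
static uint8 RGBToY_Sketch(uint8 r, uint8 g, uint8 b) {
  return (uint8)(((33 * r + 65 * g + 13 * b) >> 7) + 16);  // kARGBToY path
}
static uint8 RGBToYJ_Sketch(uint8 r, uint8 g, uint8 b) {
  return (uint8)((38 * r + 75 * g + 15 * b + 64) >> 7);  // kARGBToYJ + kAddYJ64
}
static uint8 RGBToU_Sketch(uint8 r, uint8 g, uint8 b) {
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);  // kARGBToU path
}
static uint8 RGBToV_Sketch(uint8 r, uint8 g, uint8 b) {
  return (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);  // kARGBToV path
}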
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_y
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24

    align      4
  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_y
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24

    align      4
  convertloop:
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0
    punpckhwd  xmm1, xmm1
    por        xmm0, xmm5
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_rgb24
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24
    movdqa     xmm4, kShuffleMaskRGB24ToARGB

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8   // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb     xmm2, xmm4
    por        xmm2, xmm5
    palignr    xmm1, xmm0, 12  // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb     xmm0, xmm4
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm5
    pshufb     xmm1, xmm4
    movdqa     [edx], xmm0
    por        xmm1, xmm5
    palignr    xmm3, xmm3, 4   // xmm3 = { xmm3[4:15] }
    pshufb     xmm3, xmm4
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm5
    sub        ecx, 16
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_raw
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0xff000000
    pslld      xmm5, 24
    movdqa     xmm4, kShuffleMaskRAWToARGB

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm3, [eax + 32]
    lea        eax, [eax + 48]
    movdqa     xmm2, xmm3
    palignr    xmm2, xmm1, 8   // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb     xmm2, xmm4
    por        xmm2, xmm5
    palignr    xmm1, xmm0, 12  // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb     xmm0, xmm4
    movdqa     [edx + 32], xmm2
    por        xmm0, xmm5
    pshufb     xmm1, xmm4
    movdqa     [edx], xmm0
    por        xmm1, xmm5
    palignr    xmm3, xmm3, 4   // xmm3 = { xmm3[4:15] }
    pshufb     xmm3, xmm4
    movdqa     [edx + 16], xmm1
    por        xmm3, xmm5
    sub        ecx, 16
    movdqa     [edx + 48], xmm3
    lea        edx, [edx + 64]
    jg         convertloop
    ret
  }
}
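// Scalar sketches (illustrative only, not libyuv API; names are ours) of
// what I400ToARGBRow and RGB24ToARGBRow above produce per pixel: gray
// replicated into B/G/R, and alpha forced opaque. RGB24 here is stored
// B, G, R in memory, matching ARGB with the alpha byte dropped.
static uint32 GrayToARGB_Sketch(uint8 y) {
  return 0xff000000u | ((uint32)y << 16) | ((uint32)y << 8) | y;
}
static uint32 RGB24ToARGB_Sketch(const uint8* rgb24) {  // bytes B, G, R
  return 0xff000000u | ((uint32)rgb24[2] << 16) | ((uint32)rgb24[1] << 8) |
         rgb24[0];
}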
// pmul method to replicate bits.
// Math to replicate bits:
//  (v << 8) | (v << 3)
//  v * 256 + v * 8
//  v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    pcmpeqb    xmm4, xmm4       // generate mask 0x07e007e0 for Green
    psllw      xmm4, 10
    psrlw      xmm4, 5
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_rgb565
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    pand       xmm1, xmm3    // R in upper 5 bits
    psllw      xmm2, 11      // B in upper 5 bits
    pmulhuw    xmm1, xmm5    // * (256 + 8)
    pmulhuw    xmm2, xmm5    // * (256 + 8)
    psllw      xmm1, 8
    por        xmm1, xmm2    // RB
    pand       xmm0, xmm4    // G in middle 6 bits
    pmulhuw    xmm0, xmm6    // << 5 * (256 + 4)
    por        xmm0, xmm7    // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

// 24 instructions
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd       xmm5, eax
    pshufd     xmm5, xmm5, 0
    mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd       xmm6, eax
    pshufd     xmm6, xmm6, 0
    pcmpeqb    xmm3, xmm3       // generate mask 0xf800f800 for Red
    psllw      xmm3, 11
    movdqa     xmm4, xmm3       // generate mask 0x03e003e0 for Green
    psrlw      xmm4, 6
    pcmpeqb    xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
    psllw      xmm7, 8

    mov        eax, [esp + 4]   // src_argb1555
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa     xmm1, xmm0
    movdqa     xmm2, xmm0
    psllw      xmm1, 1       // R in upper 5 bits
    psllw      xmm2, 11      // B in upper 5 bits
    pand       xmm1, xmm3
    pmulhuw    xmm2, xmm5    // * (256 + 8)
    pmulhuw    xmm1, xmm5    // * (256 + 8)
    psllw      xmm1, 8
    por        xmm1, xmm2    // RB
    movdqa     xmm2, xmm0
    pand       xmm0, xmm4    // G in middle 5 bits
    psraw      xmm2, 8       // A
    pmulhuw    xmm0, xmm6    // << 6 * (256 + 8)
    pand       xmm2, xmm7
    por        xmm0, xmm2    // AG
    movdqa     xmm2, xmm1
    punpcklbw  xmm1, xmm0
    punpckhbw  xmm2, xmm0
    movdqa     [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}
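// The multiply trick above, in scalar form (a sketch; names are ours).
// Expanding a 5-bit channel to 8 bits replicates the top bits into the low
// ones, (v << 3) | (v >> 2), which equals ((v << 11) * 0x0108) >> 16 -- the
// pmulhuw form used by RGB565ToARGBRow and ARGB1555ToARGBRow. The 6-bit
// green uses (v << 2) | (v >> 4), i.e. ((v << 5) * 0x2080) >> 16.
static uint8 Expand5To8_Sketch(uint8 v5) {
  return (uint8)((v5 << 3) | (v5 >> 2));
}
static uint8 Expand6To8_Sketch(uint8 v6) {
  return (uint8)((v6 << 2) | (v6 >> 4));
}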
// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov        eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd       xmm4, eax
    pshufd     xmm4, xmm4, 0
    movdqa     xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
    pslld      xmm5, 4
    mov        eax, [esp + 4]   // src_argb4444
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // pix
    sub        edx, eax
    sub        edx, eax

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa     xmm2, xmm0
    pand       xmm0, xmm4    // mask low nibbles
    pand       xmm2, xmm5    // mask high nibbles
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    psllw      xmm1, 4
    psrlw      xmm3, 4
    por        xmm0, xmm1
    por        xmm2, xmm3
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movdqa     [eax * 2 + edx], xmm0       // store 4 pixels of ARGB
    movdqa     [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea        eax, [eax + 16]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm6, kShuffleMaskARGBToRGB24

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 16 pixels of argb
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    pshufb     xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb     xmm1, xmm6
    pshufb     xmm2, xmm6
    pshufb     xmm3, xmm6
    movdqa     xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq     xmm1, 4       // 8 bytes from 1
    pslldq     xmm4, 12      // 4 bytes from 1 for 0
    movdqa     xmm5, xmm2    // 8 bytes from 2 for 1
    por        xmm0, xmm4    // 4 bytes from 1 for 0
    pslldq     xmm5, 8       // 8 bytes from 2 for 1
    movdqu     [edx], xmm0   // store 0
    por        xmm1, xmm5    // 8 bytes from 2 for 1
    psrldq     xmm2, 8       // 4 bytes from 2
    pslldq     xmm3, 4       // 12 bytes from 3 for 2
    por        xmm2, xmm3    // 12 bytes from 3 for 2
    movdqu     [edx + 16], xmm1  // store 1
    movdqu     [edx + 32], xmm2  // store 2
    lea        edx, [edx + 48]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm6, kShuffleMaskARGBToRAW

    align      4
  convertloop:
    movdqu     xmm0, [eax]   // fetch 16 pixels of argb
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    pshufb     xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb     xmm1, xmm6
    pshufb     xmm2, xmm6
    pshufb     xmm3, xmm6
    movdqa     xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq     xmm1, 4       // 8 bytes from 1
    pslldq     xmm4, 12      // 4 bytes from 1 for 0
    movdqa     xmm5, xmm2    // 8 bytes from 2 for 1
    por        xmm0, xmm4    // 4 bytes from 1 for 0
    pslldq     xmm5, 8       // 8 bytes from 2 for 1
    movdqu     [edx], xmm0   // store 0
    por        xmm1, xmm5    // 8 bytes from 2 for 1
    psrldq     xmm2, 8       // 4 bytes from 2
    pslldq     xmm3, 4       // 12 bytes from 3 for 2
    por        xmm2, xmm3    // 12 bytes from 3 for 2
    movdqu     [edx + 16], xmm1  // store 1
    movdqu     [edx + 32], xmm2  // store 2
    lea        edx, [edx + 48]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}
__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm3, xmm3       // generate mask 0x0000001f
    psrld      xmm3, 27
    pcmpeqb    xmm4, xmm4       // generate mask 0x000007e0
    psrld      xmm4, 26
    pslld      xmm4, 5
    pcmpeqb    xmm5, xmm5       // generate mask 0xfffff800
    pslld      xmm5, 11

    align      4
  convertloop:
    movdqa     xmm0, [eax]   // fetch 4 pixels of argb
    movdqa     xmm1, xmm0    // B
    movdqa     xmm2, xmm0    // G
    pslld      xmm0, 8       // R
    psrld      xmm1, 3       // B
    psrld      xmm2, 5       // G
    psrad      xmm0, 16      // R
    pand       xmm1, xmm3    // B
    pand       xmm2, xmm4    // G
    pand       xmm0, xmm5    // R
    por        xmm1, xmm2    // BG
    por        xmm0, xmm1    // BGR
    packssdw   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm4, xmm4       // generate mask 0x0000001f
    psrld      xmm4, 27
    movdqa     xmm5, xmm4       // generate mask 0x000003e0
    pslld      xmm5, 5
    movdqa     xmm6, xmm4       // generate mask 0x00007c00
    pslld      xmm6, 10
    pcmpeqb    xmm7, xmm7       // generate mask 0xffff8000
    pslld      xmm7, 15

    align      4
  convertloop:
    movdqa     xmm0, [eax]   // fetch 4 pixels of argb
    movdqa     xmm1, xmm0    // B
    movdqa     xmm2, xmm0    // G
    movdqa     xmm3, xmm0    // R
    psrad      xmm0, 16      // A
    psrld      xmm1, 3       // B
    psrld      xmm2, 6       // G
    psrld      xmm3, 9       // R
    pand       xmm0, xmm7    // A
    pand       xmm1, xmm4    // B
    pand       xmm2, xmm5    // G
    pand       xmm3, xmm6    // R
    por        xmm0, xmm1    // BA
    por        xmm2, xmm3    // GR
    por        xmm0, xmm2    // BGRA
    packssdw   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_rgb
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm4, xmm4       // generate mask 0xf000f000
    psllw      xmm4, 12
    movdqa     xmm3, xmm4       // generate mask 0x00f000f0
    psrlw      xmm3, 8

    align      4
  convertloop:
    movdqa     xmm0, [eax]   // fetch 4 pixels of argb
    movdqa     xmm1, xmm0
    pand       xmm0, xmm3    // low nibble
    pand       xmm1, xmm4    // high nibble
    psrld      xmm0, 4
    psrld      xmm1, 8
    por        xmm0, xmm1
    packuswb   xmm0, xmm0
    lea        eax, [eax + 16]
    movq       qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea        edx, [edx + 8]
    sub        ecx, 4
    jg         convertloop
    ret
  }
}
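// Scalar sketches (names are ours) of the packed formats the three kernels
// above produce. Low bits are truncated, matching the SIMD shift-and-mask
// sequences; no rounding is applied.
static uint16 PackRGB565_Sketch(uint8 r, uint8 g, uint8 b) {
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
static uint16 PackARGB1555_Sketch(uint8 a, uint8 r, uint8 g, uint8 b) {
  return (uint16)(((a >> 7) << 15) | ((r >> 3) << 10) | ((g >> 3) << 5) |
                  (b >> 3));
}
static uint16 PackARGB4444_Sketch(uint8 a, uint8 r, uint8 g, uint8 b) {
  return (uint16)(((a >> 4) << 12) | ((r >> 4) << 8) | ((g >> 4) << 4) |
                  (b >> 4));
}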
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5    // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqa    ymm6, kPermdARGBToY_AVX

    align      4
  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    vpaddb     ymm0, ymm0, ymm5
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2
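// Why the vpermd above is needed, sketched in scalar form (names are ours):
// vphaddw and vpackuswb each operate within 128-bit lanes of a ymm register,
// so the 32 Y bytes emerge as eight dwords in lane-interleaved order.
// vpermd with kPermdARGBToY_AVX selects source dwords {0, 4, 1, 5, 2, 6, 3, 7}
// to restore sequential pixel order.
static void PermdDwords_Sketch(uint32 dst[8], const uint32 src[8]) {
  static const int kPermd[8] = {0, 4, 1, 5, 2, 6, 3, 7};  // kPermdARGBToY_AVX
  int i;
  for (i = 0; i < 8; ++i) {
    dst[i] = src[kPermd[i]];  // dword i comes from source dword kPermd[i].
  }
}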
#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqa    ymm6, kPermdARGBToY_AVX

    align      4
  convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1  // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
    vpaddw     ymm2, ymm2, ymm5
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2  // mutates.
    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y,
                                 int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm4, kARGBToYJ
    movdqa     xmm5, kAddYJ64

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

    align      4
  convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kRGBAToY

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_y */
    mov        ecx, [esp + 12]  /* pix */
    movdqa     xmm5, kAddY16
kAddY16 1186 movdqa xmm4, kRGBAToY 1187 1188 align 4 1189 convertloop: 1190 movdqu xmm0, [eax] 1191 movdqu xmm1, [eax + 16] 1192 movdqu xmm2, [eax + 32] 1193 movdqu xmm3, [eax + 48] 1194 pmaddubsw xmm0, xmm4 1195 pmaddubsw xmm1, xmm4 1196 pmaddubsw xmm2, xmm4 1197 pmaddubsw xmm3, xmm4 1198 lea eax, [eax + 64] 1199 phaddw xmm0, xmm1 1200 phaddw xmm2, xmm3 1201 psrlw xmm0, 7 1202 psrlw xmm2, 7 1203 packuswb xmm0, xmm2 1204 paddb xmm0, xmm5 1205 sub ecx, 16 1206 movdqu [edx], xmm0 1207 lea edx, [edx + 16] 1208 jg convertloop 1209 ret 1210 } 1211 } 1212 1213 __declspec(naked) __declspec(align(16)) 1214 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1215 uint8* dst_u, uint8* dst_v, int width) { 1216 __asm { 1217 push esi 1218 push edi 1219 mov eax, [esp + 8 + 4] // src_argb 1220 mov esi, [esp + 8 + 8] // src_stride_argb 1221 mov edx, [esp + 8 + 12] // dst_u 1222 mov edi, [esp + 8 + 16] // dst_v 1223 mov ecx, [esp + 8 + 20] // pix 1224 movdqa xmm7, kARGBToU 1225 movdqa xmm6, kARGBToV 1226 movdqa xmm5, kAddUV128 1227 sub edi, edx // stride from u to v 1228 1229 align 4 1230 convertloop: 1231 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1232 movdqa xmm0, [eax] 1233 movdqa xmm1, [eax + 16] 1234 movdqa xmm2, [eax + 32] 1235 movdqa xmm3, [eax + 48] 1236 pavgb xmm0, [eax + esi] 1237 pavgb xmm1, [eax + esi + 16] 1238 pavgb xmm2, [eax + esi + 32] 1239 pavgb xmm3, [eax + esi + 48] 1240 lea eax, [eax + 64] 1241 movdqa xmm4, xmm0 1242 shufps xmm0, xmm1, 0x88 1243 shufps xmm4, xmm1, 0xdd 1244 pavgb xmm0, xmm4 1245 movdqa xmm4, xmm2 1246 shufps xmm2, xmm3, 0x88 1247 shufps xmm4, xmm3, 0xdd 1248 pavgb xmm2, xmm4 1249 1250 // step 2 - convert to U and V 1251 // from here down is very similar to Y code except 1252 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1253 movdqa xmm1, xmm0 1254 movdqa xmm3, xmm2 1255 pmaddubsw xmm0, xmm7 // U 1256 pmaddubsw xmm2, xmm7 1257 pmaddubsw xmm1, xmm6 // V 1258 pmaddubsw xmm3, xmm6 1259 phaddw xmm0, xmm2 1260 phaddw xmm1, xmm3 1261 psraw xmm0, 8 1262 psraw xmm1, 8 1263 packsswb xmm0, xmm1 1264 paddb xmm0, xmm5 // -> unsigned 1265 1266 // step 3 - store 8 U and 8 V values 1267 sub ecx, 16 1268 movlps qword ptr [edx], xmm0 // U 1269 movhps qword ptr [edx + edi], xmm0 // V 1270 lea edx, [edx + 8] 1271 jg convertloop 1272 1273 pop edi 1274 pop esi 1275 ret 1276 } 1277 } 1278 1279 __declspec(naked) __declspec(align(16)) 1280 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1281 uint8* dst_u, uint8* dst_v, int width) { 1282 __asm { 1283 push esi 1284 push edi 1285 mov eax, [esp + 8 + 4] // src_argb 1286 mov esi, [esp + 8 + 8] // src_stride_argb 1287 mov edx, [esp + 8 + 12] // dst_u 1288 mov edi, [esp + 8 + 16] // dst_v 1289 mov ecx, [esp + 8 + 20] // pix 1290 movdqa xmm7, kARGBToUJ 1291 movdqa xmm6, kARGBToVJ 1292 movdqa xmm5, kAddUVJ128 1293 sub edi, edx // stride from u to v 1294 1295 align 4 1296 convertloop: 1297 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1298 movdqa xmm0, [eax] 1299 movdqa xmm1, [eax + 16] 1300 movdqa xmm2, [eax + 32] 1301 movdqa xmm3, [eax + 48] 1302 pavgb xmm0, [eax + esi] 1303 pavgb xmm1, [eax + esi + 16] 1304 pavgb xmm2, [eax + esi + 32] 1305 pavgb xmm3, [eax + esi + 48] 1306 lea eax, [eax + 64] 1307 movdqa xmm4, xmm0 1308 shufps xmm0, xmm1, 0x88 1309 shufps xmm4, xmm1, 0xdd 1310 pavgb xmm0, xmm4 1311 movdqa xmm4, xmm2 1312 shufps xmm2, xmm3, 0x88 1313 shufps xmm4, xmm3, 0xdd 1314 pavgb xmm2, xmm4 1315 1316 // step 2 - convert to U and V 1317 // from here down is very similar to Y code 
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vpavgb     ymm2, ymm2, [eax + esi + 64]
    vpavgb     ymm3, ymm3, [eax + esi + 96]
    lea        eax, [eax + 128]
    vshufps    ymm4, ymm0, ymm1, 0x88
    vshufps    ymm0, ymm0, ymm1, 0xdd
    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps    ymm4, ymm2, ymm3, 0x88
    vshufps    ymm2, ymm2, ymm3, 0xdd
    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw    ymm1, ymm1, ymm3  // mutates
    vphaddw    ymm0, ymm0, ymm2
    vpsraw     ymm1, ymm1, 8
    vpsraw     ymm0, ymm0, 8
    vpacksswb  ymm0, ymm1, ymm0  // mutates
    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
    vpaddb     ymm0, ymm0, ymm5  // -> unsigned

    // step 3 - store 16 U and 16 V values
    sub        ecx, 32
    vextractf128 [edx], ymm0, 0        // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kARGBToUJ
    movdqa     xmm6, kARGBToVJ
    movdqa     xmm5, kAddUVJ128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    paddw      xmm0, xmm5  // +.5 rounding -> unsigned
    paddw      xmm1, xmm5
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* convert to U and V */
    movdqa     xmm0, [eax]          // U
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0

    movdqa     xmm0, [eax]          // V
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    lea        eax, [eax + 64]
    movdqa     [edx + edi], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* convert to U and V */
    movdqu     xmm0, [eax]          // U
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm3, xmm7
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    sub        ecx, 16
    movdqu     [edx], xmm0

    movdqu     xmm0, [eax]          // V
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm6
    pmaddubsw  xmm1, xmm6
    pmaddubsw  xmm2, xmm6
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    psraw      xmm0, 8
    psraw      xmm2, 8
    packsswb   xmm0, xmm2
    paddb      xmm0, xmm5
    lea        eax, [eax + 64]
    movdqu     [edx + edi], xmm0
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_argb
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5  // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + 32]
    movdqa     xmm3, [eax + 48]
    pavgb      xmm0, [eax + esi]
    pavgb      xmm1, [eax + esi + 16]
    pavgb      xmm2, [eax + esi + 32]
    pavgb      xmm3, [eax + esi + 48]
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5             // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, kRGBAToU
    movdqa     xmm6, kRGBAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx             // stride from u to v

    align      4
 convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 16]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 32]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 48]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 64]
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7  // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6  // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5             // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub        ecx, 16
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3
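// Illustrative scalar sketch of the U and V math used by the *ToUVRow
// functions above (not part of the original source). It assumes the usual
// BT.601 fixed-point coefficients behind the kARGBToU/kARGBToV-style tables;
// the function names here are hypothetical, not libyuv API.
static __inline int ScalarRGBToU(uint8 r, uint8 g, uint8 b) {
  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;  // 0x8080 rounds and biases to unsigned.
}
static __inline int ScalarRGBToV(uint8 r, uint8 g, uint8 b) {
  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
}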
#ifdef HAS_I422TOARGBROW_AVX2

static const lvec8 kUVToB_AVX = {
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
};
static const lvec8 kUVToR_AVX = {
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
};
static const lvec8 kUVToG_AVX = {
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
};
static const lvec16 kYToRgb_AVX = {
  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
};
static const lvec16 kYSub16_AVX = {
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
};
static const lvec16 kUVBiasB_AVX = {
  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
};
static const lvec16 kUVBiasG_AVX = {
  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
};
static const lvec16 kUVBiasR_AVX = {
  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
};

// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_AVX2(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_argb,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
    vpxor      ymm4, ymm4, ymm4

    align      4
 convertloop:
    vmovq      xmm0, qword ptr [esi]        //  U
    vmovq      xmm1, qword ptr [esi + edi]  //  V
    lea        esi, [esi + 8]
    vpunpcklbw ymm0, ymm0, ymm1             // UV
    vpermq     ymm0, ymm0, 0xd8
    vpunpcklwd ymm0, ymm0, ymm0             // UVUV
    vpmaddubsw ymm2, ymm0, kUVToB_AVX       // scale B UV
    vpmaddubsw ymm1, ymm0, kUVToG_AVX       // scale G UV
    vpmaddubsw ymm0, ymm0, kUVToR_AVX       // scale R UV
    vpsubw     ymm2, ymm2, kUVBiasB_AVX     // unbias back to signed
    vpsubw     ymm1, ymm1, kUVBiasG_AVX
    vpsubw     ymm0, ymm0, kUVBiasR_AVX

    // Step 2: Find Y contribution to 16 R,G,B values
    vmovdqu    xmm3, [eax]                  // NOLINT
    lea        eax, [eax + 16]
    vpermq     ymm3, ymm3, 0xd8
    vpunpcklbw ymm3, ymm3, ymm4
    vpsubsw    ymm3, ymm3, kYSub16_AVX
    vpmullw    ymm3, ymm3, kYToRgb_AVX
    vpaddsw    ymm2, ymm2, ymm3             // B += Y
    vpaddsw    ymm1, ymm1, ymm3             // G += Y
    vpaddsw    ymm0, ymm0, ymm3             // R += Y
    vpsraw     ymm2, ymm2, 6
    vpsraw     ymm1, ymm1, 6
    vpsraw     ymm0, ymm0, 6
    vpackuswb  ymm2, ymm2, ymm2             // B
    vpackuswb  ymm1, ymm1, ymm1             // G
    vpackuswb  ymm0, ymm0, ymm0             // R

    // Step 3: Weave into ARGB
    vpunpcklbw ymm2, ymm2, ymm1             // BG
    vpermq     ymm2, ymm2, 0xd8
    vpunpcklbw ymm0, ymm0, ymm5             // RA
    vpermq     ymm0, ymm0, 0xd8
    vpunpcklwd ymm1, ymm2, ymm0             // BGRA first 8 pixels
    vpunpckhwd ymm2, ymm2, ymm0             // BGRA next 8 pixels
    vmovdqu    [edx], ymm1
    vmovdqu    [edx + 32], ymm2
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop
    vzeroupper

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_I422TOARGBROW_AVX2

#ifdef HAS_I422TOARGBROW_SSSE3

// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.

// Read 8 UV from 444.
#define READYUV444 __asm {                                          \
    __asm movq       xmm0, qword ptr [esi] /* U */ /* NOLINT */     \
    __asm movq       xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
    __asm lea        esi, [esi + 8]                                 \
    __asm punpcklbw  xmm0, xmm1           /* UV */                  \
  }

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 __asm {                                          \
    __asm movd       xmm0, [esi]          /* U */                   \
    __asm movd       xmm1, [esi + edi]    /* V */                   \
    __asm lea        esi, [esi + 4]                                 \
    __asm punpcklbw  xmm0, xmm1           /* UV */                  \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */     \
  }
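// Note (explanatory, not in the original): READYUV422 interleaves 4 U and
// 4 V bytes into UV pairs and then punpcklwd duplicates each pair, so the
// chroma sample shared by two horizontal pixels is replicated rather than
// interpolated.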
// Read 2 UV from 411, upsample to 8 UV.
#define READYUV411 __asm {                                          \
    __asm movzx      ebx, word ptr [esi]        /* U */ /* NOLINT */ \
    __asm movd       xmm0, ebx                                      \
    __asm movzx      ebx, word ptr [esi + edi]  /* V */ /* NOLINT */ \
    __asm movd       xmm1, ebx                                      \
    __asm lea        esi, [esi + 2]                                 \
    __asm punpcklbw  xmm0, xmm1           /* UV */                  \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */     \
    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */     \
  }

// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 __asm {                                            \
    __asm movq       xmm0, qword ptr [esi] /* UV */ /* NOLINT */    \
    __asm lea        esi, [esi + 8]                                 \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */     \
  }

// Convert 8 pixels: 8 UV and 8 Y.
#define YUVTORGB __asm {                                            \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */         \
    __asm movdqa     xmm1, xmm0                                     \
    __asm movdqa     xmm2, xmm0                                     \
    __asm pmaddubsw  xmm0, kUVToB         /* scale B UV */          \
    __asm pmaddubsw  xmm1, kUVToG         /* scale G UV */          \
    __asm pmaddubsw  xmm2, kUVToR         /* scale R UV */          \
    __asm psubw      xmm0, kUVBiasB       /* unbias back to signed */ \
    __asm psubw      xmm1, kUVBiasG                                 \
    __asm psubw      xmm2, kUVBiasR                                 \
    /* Step 2: Find Y contribution to 8 R,G,B values */             \
    __asm movq       xmm3, qword ptr [eax] /* NOLINT */             \
    __asm lea        eax, [eax + 8]                                 \
    __asm punpcklbw  xmm3, xmm4                                     \
    __asm psubsw     xmm3, kYSub16                                  \
    __asm pmullw     xmm3, kYToRgb                                  \
    __asm paddsw     xmm0, xmm3           /* B += Y */              \
    __asm paddsw     xmm1, xmm3           /* G += Y */              \
    __asm paddsw     xmm2, xmm3           /* R += Y */              \
    __asm psraw      xmm0, 6                                        \
    __asm psraw      xmm1, 6                                        \
    __asm psraw      xmm2, 6                                        \
    __asm packuswb   xmm0, xmm0           /* B */                   \
    __asm packuswb   xmm1, xmm1           /* G */                   \
    __asm packuswb   xmm2, xmm2           /* R */                   \
  }

// Convert 8 pixels: 8 VU and 8 Y.
#define YVUTORGB __asm {                                            \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */         \
    __asm movdqa     xmm1, xmm0                                     \
    __asm movdqa     xmm2, xmm0                                     \
    __asm pmaddubsw  xmm0, kVUToB         /* scale B UV */          \
    __asm pmaddubsw  xmm1, kVUToG         /* scale G UV */          \
    __asm pmaddubsw  xmm2, kVUToR         /* scale R UV */          \
    __asm psubw      xmm0, kUVBiasB       /* unbias back to signed */ \
    __asm psubw      xmm1, kUVBiasG                                 \
    __asm psubw      xmm2, kUVBiasR                                 \
    /* Step 2: Find Y contribution to 8 R,G,B values */             \
    __asm movq       xmm3, qword ptr [eax] /* NOLINT */             \
    __asm lea        eax, [eax + 8]                                 \
    __asm punpcklbw  xmm3, xmm4                                     \
    __asm psubsw     xmm3, kYSub16                                  \
    __asm pmullw     xmm3, kYToRgb                                  \
    __asm paddsw     xmm0, xmm3           /* B += Y */              \
    __asm paddsw     xmm1, xmm3           /* G += Y */              \
    __asm paddsw     xmm2, xmm3           /* R += Y */              \
    __asm psraw      xmm0, 6                                        \
    __asm psraw      xmm1, 6                                        \
    __asm psraw      xmm2, 6                                        \
    __asm packuswb   xmm0, xmm0           /* B */                   \
    __asm packuswb   xmm1, xmm1           /* G */                   \
    __asm packuswb   xmm2, xmm2           /* R */                   \
  }
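// Illustrative scalar equivalent of YUVTORGB (not part of the original
// source), written against the YG/UB/UG/VG/VR and BB/BG/BR constants defined
// at the top of this file. The helper names here are hypothetical.
static __inline uint8 ScalarClamp0To255(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static __inline void ScalarYuvPixel(uint8 y, uint8 u, uint8 v,
                                    uint8* b, uint8* g, uint8* r) {
  int y1 = ((int)(y) - 16) * YG;  // psubsw kYSub16 + pmullw kYToRgb
  *b = ScalarClamp0To255((u * UB + v * VB - (BB) + y1) >> 6);
  *g = ScalarClamp0To255((u * UG + v * VG - (BG) + y1) >> 6);
  *r = ScalarClamp0To255((u * UR + v * VR - (BR) + y1) >> 6);
}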
// 8 pixels, dest aligned 16.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV444
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB24Row_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
                          uint8* dst_rgb24,
                          int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // rgb24
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4
    movdqa     xmm5, kShuffleMaskARGBToRGB24_0
    movdqa     xmm6, kShuffleMaskARGBToRGB24

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RRGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm2           // RR
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
    movq       qword ptr [edx], xmm0  // First 8 bytes
    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
    lea        edx, [edx + 24]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
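// Note on the RGB24/RAW packing above and below (explanatory, not in the
// original): after the weave each pixel occupies a BGRR dword, so the
// kShuffleMaskARGBToRGB24* / kShuffleMaskARGBToRAW* tables (defined earlier
// in this file) select 3 bytes per pixel and discard the duplicate R; the
// palignr then stitches the two 12-byte halves into 24 contiguous output
// bytes for 8 pixels.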
// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRAWRow_SSSE3(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* dst_raw,
                        int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // raw
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4
    movdqa     xmm5, kShuffleMaskARGBToRAW_0
    movdqa     xmm6, kShuffleMaskARGBToRAW

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RRGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm2           // RR
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
    movq       qword ptr [edx], xmm0  // First 8 bytes
    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
    lea        edx, [edx + 24]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels, dest unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) __declspec(align(16))
void I422ToRGB565Row_SSSE3(const uint8* y_buf,
                           const uint8* u_buf,
                           const uint8* v_buf,
                           uint8* rgb565_buf,
                           int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // rgb565
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4
    pcmpeqb    xmm5, xmm5           // generate mask 0x0000001f
    psrld      xmm5, 27
    pcmpeqb    xmm6, xmm6           // generate mask 0x000007e0
    psrld      xmm6, 26
    pslld      xmm6, 5
    pcmpeqb    xmm7, xmm7           // generate mask 0xfffff800
    pslld      xmm7, 11

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RRGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm2           // RR
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels

    // Step 3b: RRGB -> RGB565
    movdqa     xmm3, xmm0           // B  first 4 pixels of argb
    movdqa     xmm2, xmm0           // G
    pslld      xmm0, 8              // R
    psrld      xmm3, 3              // B
    psrld      xmm2, 5              // G
    psrad      xmm0, 16             // R
    pand       xmm3, xmm5           // B
    pand       xmm2, xmm6           // G
    pand       xmm0, xmm7           // R
    por        xmm3, xmm2           // BG
    por        xmm0, xmm3           // BGR
    movdqa     xmm3, xmm1           // B  next 4 pixels of argb
    movdqa     xmm2, xmm1           // G
    pslld      xmm1, 8              // R
    psrld      xmm3, 3              // B
    psrld      xmm2, 5              // G
    psrad      xmm1, 16             // R
    pand       xmm3, xmm5           // B
    pand       xmm2, xmm6           // G
    pand       xmm1, xmm7           // R
    por        xmm3, xmm2           // BG
    por        xmm1, xmm3           // BGR
    packssdw   xmm0, xmm1
    sub        ecx, 8
    movdqu     [edx], xmm0          // store 8 pixels of RGB565
    lea        edx, [edx + 16]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
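// Illustrative scalar equivalent of the "Step 3b: RRGB -> RGB565" packing in
// I422ToRGB565Row_SSSE3 above (not part of the original source; the name is
// hypothetical).
static __inline uint16 ScalarPackRGB565(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}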
// 8 pixels, dest aligned 16.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ecx, [esp + 12 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV411  // modifies EBX
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1            // BG
    punpcklbw  xmm2, xmm5            // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2            // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2            // BGRA next 4 pixels
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // UV
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READNV12
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}

// 8 pixels, dest aligned 16.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* uv_buf,
                         uint8* dst_argb,
                         int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // VU
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READNV12
    YVUTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}
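// Note (explanatory, not in the original): NV21 differs from NV12 only in
// storing VU instead of UV in the interleaved chroma plane, so NV21 reuses
// READNV12 and compensates in YVUTORGB, whose kVUToB/kVUToG/kVUToR tables
// carry the coefficient pairs in swapped order.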
// 8 pixels, unaligned.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV444
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels, unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

// 8 pixels, unaligned.
// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Similar to I420 but duplicate UV once more.
__declspec(naked) __declspec(align(16))
void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ecx, [esp + 12 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV411  // modifies EBX
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1            // BG
    punpcklbw  xmm2, xmm5            // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2            // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2            // BGRA next 4 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
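// Note (explanatory, not in the original): the _Unaligned variants above and
// below match their aligned counterparts instruction for instruction, except
// that the final stores use movdqu instead of movdqa, so the destination
// need not be 16-byte aligned.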
// 8 pixels, dest unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // UV
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READNV12
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}

// 8 pixels, dest unaligned.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) __declspec(align(16))
void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* uv_buf,
                                   uint8* dst_argb,
                                   int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // Y
    mov        esi, [esp + 4 + 8]   // VU
    mov        edx, [esp + 4 + 12]  // argb
    mov        ecx, [esp + 4 + 16]  // width
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READNV12
    YVUTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToBGRARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_bgra,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // bgra
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into BGRA
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    punpcklbw  xmm1, xmm0           // GB
    punpcklbw  xmm5, xmm2           // AR
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
    movdqa     [edx], xmm5
    movdqa     [edx + 16], xmm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
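// Note (explanatory, not in the original): the BGRA, ABGR and RGBA variants
// here all share READYUV422 and YUVTORGB; only the weave step differs.
// Changing which registers feed punpcklbw (e.g. GB and AR above instead of
// BG and RA) is what produces the different output channel orders.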
__declspec(naked) __declspec(align(16))
void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_bgra,
                                   int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // bgra
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into BGRA
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    punpcklbw  xmm1, xmm0           // GB
    punpcklbw  xmm5, xmm2           // AR
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
    movdqu     [edx], xmm5
    movdqu     [edx + 16], xmm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToABGRRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_abgr,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // abgr
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ABGR
    punpcklbw  xmm2, xmm1           // RG
    punpcklbw  xmm0, xmm5           // BA
    movdqa     xmm1, xmm2
    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
    movdqa     [edx], xmm2
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_abgr,
                                   int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // abgr
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into ABGR
    punpcklbw  xmm2, xmm1           // RG
    punpcklbw  xmm0, xmm5           // BA
    movdqa     xmm1, xmm2
    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
    movdqu     [edx], xmm2
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToRGBARow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_rgba,
                         int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // rgba
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RGBA
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    punpcklbw  xmm1, xmm2           // GR
    punpcklbw  xmm5, xmm0           // AB
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
    movdqa     [edx], xmm5
    movdqa     [edx + 16], xmm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
__declspec(naked) __declspec(align(16))
void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                   const uint8* u_buf,
                                   const uint8* v_buf,
                                   uint8* dst_rgba,
                                   int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // rgba
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi
    pxor       xmm4, xmm4

    align      4
 convertloop:
    READYUV422
    YUVTORGB

    // Step 3: Weave into RGBA
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    punpcklbw  xmm1, xmm2           // GR
    punpcklbw  xmm5, xmm0           // AB
    movdqa     xmm0, xmm5
    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
    movdqu     [edx], xmm5
    movdqu     [edx + 16], xmm0
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

#endif  // HAS_I422TOARGBROW_SSSE3

#ifdef HAS_YTOARGBROW_SSE2
__declspec(naked) __declspec(align(16))
void YToARGBRow_SSE2(const uint8* y_buf,
                     uint8* rgb_buf,
                     int width) {
  __asm {
    pxor       xmm5, xmm5
    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
    pslld      xmm4, 24
    mov        eax, 0x00100010
    movd       xmm3, eax
    pshufd     xmm3, xmm3, 0
    mov        eax, 0x004a004a      // 74
    movd       xmm2, eax
    pshufd     xmm2, xmm2, 0
    mov        eax, [esp + 4]       // Y
    mov        edx, [esp + 8]       // rgb
    mov        ecx, [esp + 12]      // width

    align      4
 convertloop:
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    movq       xmm0, qword ptr [eax]
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm5           // 0.Y
    psubusw    xmm0, xmm3
    pmullw     xmm0, xmm2
    psrlw      xmm0, 6
    packuswb   xmm0, xmm0           // G

    // Step 2: Weave into ARGB
    punpcklbw  xmm0, xmm0           // GG
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
    por        xmm0, xmm4
    por        xmm1, xmm4
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    ret
  }
}
#endif  // HAS_YTOARGBROW_SSE2
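// Illustrative scalar equivalent of YToARGBRow_SSE2 above (not part of the
// original source; the name is hypothetical): 0x0010 is the 16 bias and
// 0x004a is 74, i.e. round(1.164 * 64), so the gray value is replicated into
// B, G and R under an opaque alpha.
static __inline uint8 ScalarYToGray(uint8 y) {
  int g = (((int)(y) - 16) * 74) >> 6;
  return (uint8)(g < 0 ? 0 : (g > 255 ? 255 : g));
}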
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // width
    movdqa     xmm5, kShuffleMirror
    lea        eax, [eax - 16]

    align      4
 convertloop:
    movdqa     xmm0, [eax + ecx]
    pshufb     xmm0, xmm5
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec8 kShuffleMirror_AVX2 = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // width
    vmovdqa    ymm5, kShuffleMirror_AVX2
    lea        eax, [eax - 32]

    align      4
 convertloop:
    vmovdqu    ymm0, [eax + ecx]
    vpshufb    ymm0, ymm0, ymm5
    vpermq     ymm0, ymm0, 0x4e  // swap high and low halves
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORROW_SSE2
// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
// version cannot.
__declspec(naked) __declspec(align(16))
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // width
    lea        eax, [eax - 16]

    align      4
 convertloop:
    movdqu     xmm0, [eax + ecx]
    movdqa     xmm1, xmm0        // swap bytes
    psllw      xmm0, 8
    psrlw      xmm1, 8
    por        xmm0, xmm1
    pshuflw    xmm0, xmm0, 0x1b  // swap words
    pshufhw    xmm0, xmm0, 0x1b
    pshufd     xmm0, xmm0, 0x4e  // swap qwords
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSE2

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

__declspec(naked) __declspec(align(16))
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // width
    movdqa     xmm1, kShuffleMirrorUV
    lea        eax, [eax + ecx * 2 - 16]
    sub        edi, edx

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    lea        eax, [eax - 16]
    pshufb     xmm0, xmm1
    sub        ecx, 8
    movlpd     qword ptr [edx], xmm0
    movhpd     qword ptr [edx + edi], xmm0
    lea        edx, [edx + 8]
    jg         convertloop

    pop        edi
    ret
  }
}
#endif  // HAS_MIRRORROW_UV_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};

__declspec(naked) __declspec(align(16))
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // width
    lea        eax, [eax - 16 + ecx * 4]  // last 4 pixels.
    movdqa     xmm5, kARGBShuffleMirror

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    lea        eax, [eax - 16]
    pshufb     xmm0, xmm5
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_SSSE3
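// Note (explanatory, not in the original): MirrorRow_SSSE3/_AVX2/_SSE2 above
// pre-bias the source pointer by one vector and then address [eax + ecx], so
// ecx doubles as the remaining-width counter and a descending source offset
// while the destination walks forward; the shuffle tables reverse the bytes
// within each vector.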
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the dwords (one ARGB pixel each).
static const ulvec32 kARGBShuffleMirror_AVX2 = {
  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

__declspec(naked) __declspec(align(16))
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // width
    lea        eax, [eax - 32]
    vmovdqa    ymm5, kARGBShuffleMirror_AVX2

    align      4
 convertloop:
    vpermd     ymm0, ymm5, [eax + ecx * 4]  // permute dword order
    sub        ecx, 8
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_AVX2

#ifdef HAS_SPLITUVROW_SSE2
__declspec(naked) __declspec(align(16))
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_uv
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    pcmpeqb    xmm5, xmm5           // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm5   // even bytes
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm2, 8      // odd bytes
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqa     [edx], xmm0
    movdqa     [edx + edi], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_uv
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    pcmpeqb    xmm5, xmm5           // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm0
    movdqa     xmm3, xmm1
    pand       xmm0, xmm5   // even bytes
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm2, 8      // odd bytes
    psrlw      xmm3, 8
    packuswb   xmm2, xmm3
    movdqu     [edx], xmm0
    movdqu     [edx + edi], xmm2
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
#endif  // HAS_SPLITUVROW_SSE2

#ifdef HAS_SPLITUVROW_AVX2
__declspec(naked) __declspec(align(16))
void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_uv
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    vpcmpeqb   ymm5, ymm5, ymm5     // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpsrlw     ymm2, ymm0, 8      // odd bytes
    vpsrlw     ymm3, ymm1, 8
    vpand      ymm0, ymm0, ymm5   // even bytes
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1
    vpackuswb  ymm2, ymm2, ymm3
    vpermq     ymm0, ymm0, 0xd8
    vpermq     ymm2, ymm2, 0xd8
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + edi], ymm2
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
#endif  // HAS_SPLITUVROW_AVX2
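// Illustrative scalar equivalent of SplitUVRow (not part of the original
// source; the name is hypothetical): the pand/psrlw/packuswb sequences above
// de-interleave even bytes (U) and odd bytes (V).
static __inline void ScalarSplitUV(const uint8* src_uv, uint8* dst_u,
                                   uint8* dst_v, int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[2 * i + 0];
    dst_v[i] = src_uv[2 * i + 1];
  }
}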
#ifdef HAS_MERGEUVROW_SSE2
__declspec(naked) __declspec(align(16))
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_u
    mov        edx, [esp + 4 + 8]   // src_v
    mov        edi, [esp + 4 + 12]  // dst_uv
    mov        ecx, [esp + 4 + 16]  // width
    sub        edx, eax

    align      4
 convertloop:
    movdqa     xmm0, [eax]          // read 16 U's
    movdqa     xmm1, [eax + edx]    // and 16 V's
    lea        eax, [eax + 16]
    movdqa     xmm2, xmm0
    punpcklbw  xmm0, xmm1           // first 8 UV pairs
    punpckhbw  xmm2, xmm1           // next 8 UV pairs
    movdqa     [edi], xmm0
    movdqa     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_u
    mov        edx, [esp + 4 + 8]   // src_v
    mov        edi, [esp + 4 + 12]  // dst_uv
    mov        ecx, [esp + 4 + 16]  // width
    sub        edx, eax

    align      4
 convertloop:
    movdqu     xmm0, [eax]          // read 16 U's
    movdqu     xmm1, [eax + edx]    // and 16 V's
    lea        eax, [eax + 16]
    movdqa     xmm2, xmm0
    punpcklbw  xmm0, xmm1           // first 8 UV pairs
    punpckhbw  xmm2, xmm1           // next 8 UV pairs
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
#endif  // HAS_MERGEUVROW_SSE2

#ifdef HAS_MERGEUVROW_AVX2
__declspec(naked) __declspec(align(16))
void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_u
    mov        edx, [esp + 4 + 8]   // src_v
    mov        edi, [esp + 4 + 12]  // dst_uv
    mov        ecx, [esp + 4 + 16]  // width
    sub        edx, eax

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]           // read 32 U's
    vmovdqu    ymm1, [eax + edx]     // and 32 V's
    lea        eax, [eax + 32]
    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
    vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
    vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
    vmovdqu    [edi], ymm1
    vmovdqu    [edi + 32], ymm2
    lea        edi, [edi + 64]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
#endif  // HAS_MERGEUVROW_AVX2

#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
__declspec(naked) __declspec(align(16))
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    ret
  }
}
#endif  // HAS_COPYROW_SSE2
// Unaligned; any multiple of 1 byte.
__declspec(naked) __declspec(align(16))
void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, esi
    mov        edx, edi
    mov        esi, [esp + 4]   // src
    mov        edi, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    rep movsb
    mov        edi, edx
    mov        esi, eax
    ret
  }
}

#ifdef HAS_COPYROW_X86
__declspec(naked) __declspec(align(16))
void CopyRow_X86(const uint8* src, uint8* dst, int count) {
  __asm {
    mov        eax, esi
    mov        edx, edi
    mov        esi, [esp + 4]   // src
    mov        edi, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    shr        ecx, 2
    rep movsd
    mov        edi, edx
    mov        esi, eax
    ret
  }
}
#endif  // HAS_COPYROW_X86

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
__declspec(naked) __declspec(align(16))
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
    pslld      xmm0, 24
    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
    psrld      xmm1, 8

    align      4
 convertloop:
    movdqa     xmm2, [eax]
    movdqa     xmm3, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm4, [edx]
    movdqa     xmm5, [edx + 16]
    pand       xmm2, xmm0
    pand       xmm3, xmm0
    pand       xmm4, xmm1
    pand       xmm5, xmm1
    por        xmm2, xmm4
    por        xmm3, xmm5
    movdqa     [edx], xmm2
    movdqa     [edx + 16], xmm3
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
__declspec(naked) __declspec(align(16))
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    vpcmpeqb   ymm0, ymm0, ymm0
    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff

    align      4
 convertloop:
    vmovdqu    ymm1, [eax]
    vmovdqu    ymm2, [eax + 32]
    lea        eax, [eax + 64]
    vpblendvb  ymm1, ymm1, [edx], ymm0
    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
    vmovdqu    [edx], ymm1
    vmovdqu    [edx + 32], ymm2
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
__declspec(naked) __declspec(align(16))
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
    pslld      xmm0, 24
    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
    psrld      xmm1, 8

    align      4
 convertloop:
    movq       xmm2, qword ptr [eax]  // 8 Y's
    lea        eax, [eax + 8]
    punpcklbw  xmm2, xmm2
    punpckhwd  xmm3, xmm2
    punpcklwd  xmm2, xmm2
    movdqa     xmm4, [edx]
    movdqa     xmm5, [edx + 16]
    pand       xmm2, xmm0
    pand       xmm3, xmm0
    pand       xmm4, xmm1
    pand       xmm5, xmm1
    por        xmm2, xmm4
    por        xmm3, xmm5
    movdqa     [edx], xmm2
    movdqa     [edx + 16], xmm3
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop

    ret
  }
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
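// Illustrative scalar equivalent of ARGBCopyYToAlphaRow (not part of the
// original source; the name is hypothetical): the 0xff000000/0x00ffffff
// masks above merge a Y byte into the alpha channel while preserving BGR.
static __inline void ScalarARGBCopyYToAlpha(const uint8* src_y,
                                            uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[4 * i + 3] = src_y[i];  // alpha = Y; B, G, R unchanged.
  }
}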
#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
__declspec(naked) __declspec(align(16))
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov        eax, [esp + 4]   // src
    mov        edx, [esp + 8]   // dst
    mov        ecx, [esp + 12]  // count
    vpcmpeqb   ymm0, ymm0, ymm0
    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff

    align      4
 convertloop:
    vpmovzxbd  ymm1, qword ptr [eax]
    vpmovzxbd  ymm2, qword ptr [eax + 8]
    lea        eax, [eax + 16]
    vpslld     ymm1, ymm1, 24
    vpslld     ymm2, ymm2, 24
    vpblendvb  ymm1, ymm1, [edx], ymm0
    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
    vmovdqu    [edx], ymm1
    vmovdqu    [edx + 32], ymm2
    lea        edx, [edx + 64]
    sub        ecx, 16
    jg         convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
// SetRow_X86 writes 'count' bytes using a 32 bit value repeated.
__declspec(naked) __declspec(align(16))
void SetRow_X86(uint8* dst, uint32 v32, int count) {
  __asm {
    mov        edx, edi
    mov        edi, [esp + 4]   // dst
    mov        eax, [esp + 8]   // v32
    mov        ecx, [esp + 12]  // count
    shr        ecx, 2
    rep stosd
    mov        edi, edx
    ret
  }
}

// ARGBSetRows_X86 writes 'width' dwords (one per ARGB pixel) using a 32 bit
// value repeated, for 'height' rows.
__declspec(naked) __declspec(align(16))
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
                     int dst_stride, int height) {
  __asm {
    push       esi
    push       edi
    push       ebp
    mov        edi, [esp + 12 + 4]   // dst
    mov        eax, [esp + 12 + 8]   // v32
    mov        ebp, [esp + 12 + 12]  // width
    mov        edx, [esp + 12 + 16]  // dst_stride
    mov        esi, [esp + 12 + 20]  // height
    lea        ecx, [ebp * 4]
    sub        edx, ecx              // stride - width * 4

    align      4
 convertloop:
    mov        ecx, ebp
    rep stosd
    add        edi, edx
    sub        esi, 1
    jg         convertloop

    pop        ebp
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_AVX2
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_AVX2(const uint8* src_yuy2,
                     uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_yuy2
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpand      ymm0, ymm0, ymm5   // even bytes are Y
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
    vzeroupper
    ret
  }
}
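// Note (explanatory, not in the original): YUY2 stores Y0 U0 Y1 V0 per pixel
// pair, so luma sits in the even bytes (kept with the 0x00ff00ff mask above)
// and chroma in the odd bytes (extracted with a right shift). UYVY, handled
// further below, is the byte-swapped layout: chroma even, luma odd.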
__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    lea        eax, [eax + 64]
    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5   // U
    vpsrlw     ymm0, ymm0, 8      // V
    vpackuswb  ymm1, ymm1, ymm1   // mutates.
    vpackuswb  ymm0, ymm0, ymm0   // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0        // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_yuy2
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    vpcmpeqb   ymm5, ymm5, ymm5     // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5   // U
    vpsrlw     ymm0, ymm0, 8      // V
    vpackuswb  ymm1, ymm1, ymm1   // mutates.
    vpackuswb  ymm0, ymm0, ymm0   // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0        // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToYRow_AVX2(const uint8* src_uyvy,
                     uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_uyvy
    mov        edx, [esp + 8]    // dst_y
    mov        ecx, [esp + 12]   // pix

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
    vpsrlw     ymm1, ymm1, 8
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8
    sub        ecx, 32
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    jg         convertloop
    vzeroupper  // was unreachable after ret; must precede it to execute.
    ret
  }
}
__declspec(naked) __declspec(align(16))
void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_uyvy
    mov        esi, [esp + 8 + 8]    // stride_uyvy
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    lea        eax, [eax + 64]
    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5   // U
    vpsrlw     ymm0, ymm0, 8      // V
    vpackuswb  ymm1, ymm1, ymm1   // mutates.
    vpackuswb  ymm0, ymm0, ymm0   // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0        // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_uyvy
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    vpcmpeqb   ymm5, ymm5, ymm5     // generate mask 0x00ff00ff
    vpsrlw     ymm5, ymm5, 8
    sub        edi, edx

    align      4
 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1   // mutates.
    vpermq     ymm0, ymm0, 0xd8
    vpand      ymm1, ymm0, ymm5   // U
    vpsrlw     ymm0, ymm0, 8      // V
    vpackuswb  ymm1, ymm1, ymm1   // mutates.
    vpackuswb  ymm0, ymm0, ymm0   // mutates.
    vpermq     ymm1, ymm1, 0xd8
    vpermq     ymm0, ymm0, 0xd8
    vextractf128 [edx], ymm1, 0        // U
    vextractf128 [edx + edi], ymm0, 0  // V
    lea        edx, [edx + 16]
    sub        ecx, 32
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
#endif  // HAS_YUY2TOYROW_AVX2

#ifdef HAS_YUY2TOYROW_SSE2
__declspec(naked) __declspec(align(16))
void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                     uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_yuy2
    mov        edx, [esp + 8]   // dst_y
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm5   // even bytes are Y
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
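// Illustrative scalar equivalent of YUY2ToUVRow (not part of the original
// source; the name is hypothetical): chroma from two adjacent rows is
// averaged with rounding, matching the pavgb above.
static __inline void ScalarYUY2ToUV(const uint8* src_yuy2, int stride_yuy2,
                                    uint8* dst_u, uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; x += 2) {  // 4 bytes hold Y0 U Y1 V for 2 pixels.
    const uint8* row0 = src_yuy2 + x * 2;
    const uint8* row1 = row0 + stride_yuy2;
    dst_u[x / 2] = (uint8)((row0[1] + row1[1] + 1) >> 1);
    dst_v[x / 2] = (uint8)((row0[3] + row1[3] + 1) >> 1);
  }
}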
__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_yuy2
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    pcmpeqb    xmm5, xmm5           // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      4
 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_yuy2
    mov        edx, [esp + 8]   // dst_y
    mov        ecx, [esp + 12]  // pix
    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm5   // even bytes are Y
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_yuy2
    mov        esi, [esp + 8 + 8]    // stride_yuy2
    mov        edx, [esp + 8 + 12]   // dst_u
    mov        edi, [esp + 8 + 16]   // dst_v
    mov        ecx, [esp + 8 + 20]   // pix
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_yuy2
    mov        edx, [esp + 4 + 8]   // dst_u
    mov        edi, [esp + 4 + 12]  // dst_v
    mov        ecx, [esp + 4 + 16]  // pix
    pcmpeqb    xmm5, xmm5           // generate mask 0x00ff00ff
    psrlw      xmm5, 8
    sub        edi, edx

    align      4
 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8      // YUYV -> UVUV
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     xmm1, xmm0
    pand       xmm0, xmm5   // U
    packuswb   xmm0, xmm0
    psrlw      xmm1, 8      // V
    packuswb   xmm1, xmm1
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + edi], xmm1
    lea        edx, [edx + 8]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    ret
  }
}
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_uyvy
    mov esi, [esp + 8 + 8] // stride_uyvy
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3
    pand xmm0, xmm5 // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5 // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8 // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_uyvy
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5 // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5 // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8 // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] // src_uyvy
    mov edx, [esp + 8] // dst_y
    mov ecx, [esp + 12] // pix

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8 // odd bytes are Y
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_uyvy
    mov esi, [esp + 8 + 8] // stride_uyvy
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3
    pand xmm0, xmm5 // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5 // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8 // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_uyvy
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5 // UYVY -> UVUV
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    movdqa xmm1, xmm0
    pand xmm0, xmm5 // U
    packuswb xmm0, xmm0
    psrlw xmm1, 8 // V
    packuswb xmm1, xmm1
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + edi], xmm1
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif // HAS_YUY2TOYROW_SSE2
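
// Illustrative scalar sketch (hypothetical name, not part of the library):
// the kernels above pick Y, U and V bytes out of packed 4:2:2 data, where
// YUY2 memory order is Y0 U0 Y1 V0 and UYVY is U0 Y0 V0 Y1. This function
// only documents the byte layout the SIMD code exploits.
static void YUY2ToUV422Row_SketchC(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; x += 2) {  // each 4-byte group holds 2 pixels.
    dst_u[0] = src_yuy2[1];  // U is shared by the 2 pixels.
    dst_v[0] = src_yuy2[3];  // V is shared by the 2 pixels.
    src_yuy2 += 4;
    dst_u += 1;
    dst_v += 1;
  }
}
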
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 4 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    pcmpeqb xmm7, xmm7 // generate constant 1
    psrlw xmm7, 15
    pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
    psrlw xmm6, 8
    pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
    psllw xmm5, 8
    pcmpeqb xmm4, xmm4 // generate mask 0xff000000
    pslld xmm4, 24

    sub ecx, 1
    je convertloop1 // only 1 pixel?
    jl convertloop1b

    // 1 pixel loop until destination pointer is aligned.
  alignloop1:
    test edx, 15 // aligned?
    je alignloop1b
    movd xmm3, [eax]
    lea eax, [eax + 4]
    movdqa xmm0, xmm3 // src argb
    pxor xmm3, xmm4 // ~alpha
    movd xmm2, [esi] // _r_b
    psrlw xmm3, 8 // alpha
    pshufhw xmm3, xmm3, 0F5h // 8 alpha words
    pshuflw xmm3, xmm3, 0F5h
    pand xmm2, xmm6 // _r_b
    paddw xmm3, xmm7 // 256 - alpha
    pmullw xmm2, xmm3 // _r_b * alpha
    movd xmm1, [esi] // _a_g
    lea esi, [esi + 4]
    psrlw xmm1, 8 // _a_g
    por xmm0, xmm4 // set alpha to 255
    pmullw xmm1, xmm3 // _a_g * alpha
    psrlw xmm2, 8 // _r_b convert to 8 bits again
    paddusb xmm0, xmm2 // + src argb
    pand xmm1, xmm5 // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1 // + src argb
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge alignloop1

  alignloop1b:
    add ecx, 1 - 4
    jl convertloop4b

    // 4 pixel loop.
  convertloop4:
    movdqu xmm3, [eax] // src argb
    lea eax, [eax + 16]
    movdqa xmm0, xmm3 // src argb
    pxor xmm3, xmm4 // ~alpha
    movdqu xmm2, [esi] // _r_b
    psrlw xmm3, 8 // alpha
    pshufhw xmm3, xmm3, 0F5h // 8 alpha words
    pshuflw xmm3, xmm3, 0F5h
    pand xmm2, xmm6 // _r_b
    paddw xmm3, xmm7 // 256 - alpha
    pmullw xmm2, xmm3 // _r_b * alpha
    movdqu xmm1, [esi] // _a_g
    lea esi, [esi + 16]
    psrlw xmm1, 8 // _a_g
    por xmm0, xmm4 // set alpha to 255
    pmullw xmm1, xmm3 // _a_g * alpha
    psrlw xmm2, 8 // _r_b convert to 8 bits again
    paddusb xmm0, xmm2 // + src argb
    pand xmm1, xmm5 // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1 // + src argb
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jge convertloop4

  convertloop4b:
    add ecx, 4 - 1
    jl convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd xmm3, [eax] // src argb
    lea eax, [eax + 4]
    movdqa xmm0, xmm3 // src argb
    pxor xmm3, xmm4 // ~alpha
    movd xmm2, [esi] // _r_b
    psrlw xmm3, 8 // alpha
    pshufhw xmm3, xmm3, 0F5h // 8 alpha words
    pshuflw xmm3, xmm3, 0F5h
    pand xmm2, xmm6 // _r_b
    paddw xmm3, xmm7 // 256 - alpha
    pmullw xmm2, xmm3 // _r_b * alpha
    movd xmm1, [esi] // _a_g
    lea esi, [esi + 4]
    psrlw xmm1, 8 // _a_g
    por xmm0, xmm4 // set alpha to 255
    pmullw xmm1, xmm3 // _a_g * alpha
    psrlw xmm2, 8 // _r_b convert to 8 bits again
    paddusb xmm0, xmm2 // + src argb
    pand xmm1, xmm5 // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1 // + src argb
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge convertloop1

  convertloop1b:
    pop esi
    ret
  }
}
#endif // HAS_ARGBBLENDROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
static const uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
// Same as SSE2, but replaces:
//    psrlw xmm3, 8 // alpha
//    pshufhw xmm3, xmm3, 0F5h // 8 alpha words
//    pshuflw xmm3, xmm3, 0F5h
// with..
//    pshufb xmm3, kShuffleAlpha // alpha
// Blend 4 pixels at a time.

__declspec(naked) __declspec(align(16))
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    pcmpeqb xmm7, xmm7 // generate constant 0x0001
    psrlw xmm7, 15
    pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
    psrlw xmm6, 8
    pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
    psllw xmm5, 8
    pcmpeqb xmm4, xmm4 // generate mask 0xff000000
    pslld xmm4, 24

    sub ecx, 1
    je convertloop1 // only 1 pixel?
    jl convertloop1b

    // 1 pixel loop until destination pointer is aligned.
  alignloop1:
    test edx, 15 // aligned?
    je alignloop1b
    movd xmm3, [eax]
    lea eax, [eax + 4]
    movdqa xmm0, xmm3 // src argb
    pxor xmm3, xmm4 // ~alpha
    movd xmm2, [esi] // _r_b
    pshufb xmm3, kShuffleAlpha // alpha
    pand xmm2, xmm6 // _r_b
    paddw xmm3, xmm7 // 256 - alpha
    pmullw xmm2, xmm3 // _r_b * alpha
    movd xmm1, [esi] // _a_g
    lea esi, [esi + 4]
    psrlw xmm1, 8 // _a_g
    por xmm0, xmm4 // set alpha to 255
    pmullw xmm1, xmm3 // _a_g * alpha
    psrlw xmm2, 8 // _r_b convert to 8 bits again
    paddusb xmm0, xmm2 // + src argb
    pand xmm1, xmm5 // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1 // + src argb
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge alignloop1

  alignloop1b:
    add ecx, 1 - 4
    jl convertloop4b

    test eax, 15 // unaligned?
    jne convertuloop4
    test esi, 15 // unaligned?
    jne convertuloop4

    // 4 pixel loop.
  convertloop4:
    movdqa xmm3, [eax] // src argb
    lea eax, [eax + 16]
    movdqa xmm0, xmm3 // src argb
    pxor xmm3, xmm4 // ~alpha
    movdqa xmm2, [esi] // _r_b
    pshufb xmm3, kShuffleAlpha // alpha
    pand xmm2, xmm6 // _r_b
    paddw xmm3, xmm7 // 256 - alpha
    pmullw xmm2, xmm3 // _r_b * alpha
    movdqa xmm1, [esi] // _a_g
    lea esi, [esi + 16]
    psrlw xmm1, 8 // _a_g
    por xmm0, xmm4 // set alpha to 255
    pmullw xmm1, xmm3 // _a_g * alpha
    psrlw xmm2, 8 // _r_b convert to 8 bits again
    paddusb xmm0, xmm2 // + src argb
    pand xmm1, xmm5 // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1 // + src argb
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jge convertloop4
    jmp convertloop4b

    // 4 pixel unaligned loop.
  convertuloop4:
    movdqu xmm3, [eax] // src argb
    lea eax, [eax + 16]
    movdqa xmm0, xmm3 // src argb
    pxor xmm3, xmm4 // ~alpha
    movdqu xmm2, [esi] // _r_b
    pshufb xmm3, kShuffleAlpha // alpha
    pand xmm2, xmm6 // _r_b
    paddw xmm3, xmm7 // 256 - alpha
    pmullw xmm2, xmm3 // _r_b * alpha
    movdqu xmm1, [esi] // _a_g
    lea esi, [esi + 16]
    psrlw xmm1, 8 // _a_g
    por xmm0, xmm4 // set alpha to 255
    pmullw xmm1, xmm3 // _a_g * alpha
    psrlw xmm2, 8 // _r_b convert to 8 bits again
    paddusb xmm0, xmm2 // + src argb
    pand xmm1, xmm5 // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1 // + src argb
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jge convertuloop4

  convertloop4b:
    add ecx, 4 - 1
    jl convertloop1b

    // 1 pixel loop.
  convertloop1:
    movd xmm3, [eax] // src argb
    lea eax, [eax + 4]
    movdqa xmm0, xmm3 // src argb
    pxor xmm3, xmm4 // ~alpha
    movd xmm2, [esi] // _r_b
    pshufb xmm3, kShuffleAlpha // alpha
    pand xmm2, xmm6 // _r_b
    paddw xmm3, xmm7 // 256 - alpha
    pmullw xmm2, xmm3 // _r_b * alpha
    movd xmm1, [esi] // _a_g
    lea esi, [esi + 4]
    psrlw xmm1, 8 // _a_g
    por xmm0, xmm4 // set alpha to 255
    pmullw xmm1, xmm3 // _a_g * alpha
    psrlw xmm2, 8 // _r_b convert to 8 bits again
    paddusb xmm0, xmm2 // + src argb
    pand xmm1, xmm5 // a_g_ convert to 8 bits again
    paddusb xmm0, xmm1 // + src argb
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge convertloop1

  convertloop1b:
    pop esi
    ret
  }
}
#endif // HAS_ARGBBLENDROW_SSSE3
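
// Illustrative scalar sketch (hypothetical name, not part of the library) of
// the per-pixel math both blend kernels above implement:
//   dst = src0 + ((src1 * (256 - src0_alpha)) >> 8)
// with the result alpha forced to 255 and each channel saturated, matching
// the por and paddusb at the end of the loops.
static void ARGBBlendRow_SketchC(const uint8* src_argb0,
                                 const uint8* src_argb1,
                                 uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int a = src_argb0[3];
    int i;
    for (i = 0; i < 3; ++i) {
      const int b = src_argb0[i] + ((src_argb1[i] * (256 - a)) >> 8);
      dst_argb[i] = (uint8)(b > 255 ? 255 : b);  // saturate like paddusb.
    }
    dst_argb[3] = 255;  // alpha is set to opaque.
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}
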
#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm4, xmm4 // generate mask 0xff000000
    pslld xmm4, 24
    pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
    psrld xmm5, 8

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 4 pixels
    punpcklbw xmm0, xmm0 // first 2
    pshufhw xmm2, xmm0, 0FFh // 8 alpha words
    pshuflw xmm2, xmm2, 0FFh
    pmulhuw xmm0, xmm2 // rgb * a
    movdqa xmm1, [eax] // read 4 pixels
    punpckhbw xmm1, xmm1 // next 2 pixels
    pshufhw xmm2, xmm1, 0FFh // 8 alpha words
    pshuflw xmm2, xmm2, 0FFh
    pmulhuw xmm1, xmm2 // rgb * a
    movdqa xmm2, [eax] // alphas
    lea eax, [eax + 16]
    psrlw xmm0, 8
    pand xmm2, xmm4
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    pand xmm0, xmm5 // keep original alphas
    por xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    ret
  }
}
#endif // HAS_ARGBATTENUATEROW_SSE2

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm3, xmm3 // generate mask 0xff000000
    pslld xmm3, 24
    movdqa xmm4, kShuffleAlpha0
    movdqa xmm5, kShuffleAlpha1

    align 4
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    pshufb xmm0, xmm4 // isolate first 2 alphas
    movdqu xmm1, [eax] // read 4 pixels
    punpcklbw xmm1, xmm1 // first 2 pixel rgbs
    pmulhuw xmm0, xmm1 // rgb * a
    movdqu xmm1, [eax] // read 4 pixels
    pshufb xmm1, xmm5 // isolate next 2 alphas
    movdqu xmm2, [eax] // read 4 pixels
    punpckhbw xmm2, xmm2 // next 2 pixel rgbs
    pmulhuw xmm1, xmm2 // rgb * a
    movdqu xmm2, [eax] // mask original alpha
    lea eax, [eax + 16]
    pand xmm2, xmm3
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    por xmm0, xmm2 // copy original alpha
    sub ecx, 4
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    ret
  }
}
#endif // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const ulvec8 kShuffleAlpha_AVX2 = {
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
};
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    vmovdqa ymm4, kShuffleAlpha_AVX2
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
    vpslld ymm5, ymm5, 24

    align 4
  convertloop:
    vmovdqu ymm6, [eax] // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
    vpshufb ymm2, ymm0, ymm4 // low 4 alphas
    vpshufb ymm3, ymm1, ymm4 // high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2 // rgb * a
    vpmulhuw ymm1, ymm1, ymm3 // rgb * a
    vpand ymm6, ymm6, ymm5 // isolate alpha
    vpsrlw ymm0, ymm0, 8
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1 // unmutated.
    vpor ymm0, ymm0, ymm6 // copy original alpha
    sub ecx, 8
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    jg convertloop

    vzeroupper
    ret
  }
}
#endif // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb0
    mov edx, [esp + 8 + 8] // dst_argb
    mov ecx, [esp + 8 + 12] // width

    align 4
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    movzx esi, byte ptr [eax + 3] // first alpha
    movzx edi, byte ptr [eax + 7] // second alpha
    punpcklbw xmm0, xmm0 // first 2
    movd xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
    pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps xmm2, xmm3
    pmulhuw xmm0, xmm2 // rgb * ia

    movdqu xmm1, [eax] // read 4 pixels
    movzx esi, byte ptr [eax + 11] // third alpha
    movzx edi, byte ptr [eax + 15] // fourth alpha
    punpckhbw xmm1, xmm1 // next 2
    movd xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
    pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps xmm2, xmm3
    pmulhuw xmm1, xmm2 // rgb * ia
    lea eax, [eax + 16]

    packuswb xmm0, xmm1
    sub ecx, 4
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_ARGBUNATTENUATEROW_SSE2

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15,
};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    vmovdqa ymm4, kUnattenShuffleAlpha_AVX2

    align 4
  convertloop:
    vmovdqu ymm6, [eax] // read 8 pixels.
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
    vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
    vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
    vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
    vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
    vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
    vpackuswb ymm0, ymm0, ymm1 // unmutated.
    sub ecx, 8
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    jg convertloop

    vzeroupper
    ret
  }
}
#else // USE_GATHER
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    vmovdqa ymm5, kUnattenShuffleAlpha_AVX2

    push esi
    push edi

    align 4
  convertloop:
    // replace VPGATHER
    movzx esi, byte ptr [eax + 3] // alpha0
    movzx edi, byte ptr [eax + 7] // alpha1
    vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
    vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
    movzx esi, byte ptr [eax + 11] // alpha2
    movzx edi, byte ptr [eax + 15] // alpha3
    vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
    vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
    vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
    movzx esi, byte ptr [eax + 19] // alpha4
    movzx edi, byte ptr [eax + 23] // alpha5
    vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
    vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
    vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
    movzx esi, byte ptr [eax + 27] // alpha6
    movzx edi, byte ptr [eax + 31] // alpha7
    vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
    vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
    vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
    vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
    vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
    vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
    // end of VPGATHER

    vmovdqu ymm6, [eax] // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
    vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
    vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
    vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
    vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
    vpackuswb ymm0, ymm0, ymm1 // unmutated.
    sub ecx, 8
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // USE_GATHER
#endif // HAS_ARGBUNATTENUATEROW_AVX2
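
// Illustrative scalar sketch (hypothetical name) of what the attenuate and
// unattenuate kernels above approximate. Attenuate scales color channels by
// alpha / 255 (the SIMD path uses duplicated bytes and pmulhuw, so its
// rounding differs slightly); unattenuate multiplies by a fixed-point
// reciprocal of alpha looked up in fixed_invtbl8.
static void ARGBAttenuateRow_SketchC(const uint8* src_argb, uint8* dst_argb,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int a = src_argb[3];
    dst_argb[0] = (uint8)(src_argb[0] * a / 255);  // B
    dst_argb[1] = (uint8)(src_argb[1] * a / 255);  // G
    dst_argb[2] = (uint8)(src_argb[2] * a / 255);  // R
    dst_argb[3] = (uint8)a;  // alpha is copied through unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}
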
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
__declspec(naked) __declspec(align(16))
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, kARGBToYJ
    movdqa xmm5, kAddYJ64

    align 4
  convertloop:
    movdqa xmm0, [eax] // G
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm0, xmm1
    paddw xmm0, xmm5 // Add .5 for rounding.
    psrlw xmm0, 7
    packuswb xmm0, xmm0 // 8 G bytes
    movdqa xmm2, [eax] // A
    movdqa xmm3, [eax + 16]
    lea eax, [eax + 32]
    psrld xmm2, 24
    psrld xmm3, 24
    packuswb xmm2, xmm3
    packuswb xmm2, xmm2 // 8 A bytes
    movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
    punpcklbw xmm0, xmm0 // 8 GG words
    punpcklbw xmm3, xmm2 // 8 GA words
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm3 // GGGA first 4
    punpckhwd xmm1, xmm3 // GGGA next 4
    sub ecx, 8
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static const vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static const vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
__declspec(naked) __declspec(align(16))
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] /* dst_argb */
    mov ecx, [esp + 8] /* width */
    movdqa xmm2, kARGBToSepiaB
    movdqa xmm3, kARGBToSepiaG
    movdqa xmm4, kARGBToSepiaR

    align 4
  convertloop:
    movdqa xmm0, [eax] // B
    movdqa xmm6, [eax + 16]
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm6, xmm2
    phaddw xmm0, xmm6
    psrlw xmm0, 7
    packuswb xmm0, xmm0 // 8 B values
    movdqa xmm5, [eax] // G
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm5, xmm3
    pmaddubsw xmm1, xmm3
    phaddw xmm5, xmm1
    psrlw xmm5, 7
    packuswb xmm5, xmm5 // 8 G values
    punpcklbw xmm0, xmm5 // 8 BG values
    movdqa xmm5, [eax] // R
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm5, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm5, xmm1
    psrlw xmm5, 7
    packuswb xmm5, xmm5 // 8 R values
    movdqa xmm6, [eax] // A
    movdqa xmm1, [eax + 16]
    psrld xmm6, 24
    psrld xmm1, 24
    packuswb xmm6, xmm1
    packuswb xmm6, xmm6 // 8 A values
    punpcklbw xmm5, xmm6 // 8 RA values
    movdqa xmm1, xmm0 // Weave BG, RA together
    punpcklwd xmm0, xmm5 // BGRA first 4
    punpckhwd xmm1, xmm5 // BGRA next 4
    sub ecx, 8
    movdqa [eax], xmm0
    movdqa [eax + 16], xmm1
    lea eax, [eax + 32]
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBSEPIAROW_SSSE3
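
// Illustrative scalar sketch (hypothetical name) of the sepia math above,
// taken directly from the weights in kARGBToSepia{B,G,R}:
static void ARGBSepiaRow_SketchC(uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    const int b = dst_argb[0];
    const int g = dst_argb[1];
    const int r = dst_argb[2];
    const int sb = (r * 35 + g * 68 + b * 17) >> 7;
    const int sg = (r * 45 + g * 88 + b * 22) >> 7;
    const int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8)(sb > 255 ? 255 : sb);  // packuswb saturates.
    dst_argb[1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8)(sr > 255 ? 255 : sr);
    dst_argb += 4;  // alpha (dst_argb[3]) is preserved.
  }
}
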
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) __declspec(align(16))
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* matrix_argb */
    movdqu xmm5, [ecx]
    pshufd xmm2, xmm5, 0x00
    pshufd xmm3, xmm5, 0x55
    pshufd xmm4, xmm5, 0xaa
    pshufd xmm5, xmm5, 0xff
    mov ecx, [esp + 16] /* width */

    align 4
  convertloop:
    movdqa xmm0, [eax] // B
    movdqa xmm7, [eax + 16]
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm7, xmm2
    movdqa xmm6, [eax] // G
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm6, xmm3
    pmaddubsw xmm1, xmm3
    phaddsw xmm0, xmm7 // B
    phaddsw xmm6, xmm1 // G
    psraw xmm0, 6 // B
    psraw xmm6, 6 // G
    packuswb xmm0, xmm0 // 8 B values
    packuswb xmm6, xmm6 // 8 G values
    punpcklbw xmm0, xmm6 // 8 BG values
    movdqa xmm1, [eax] // R
    movdqa xmm7, [eax + 16]
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm7, xmm4
    phaddsw xmm1, xmm7 // R
    movdqa xmm6, [eax] // A
    movdqa xmm7, [eax + 16]
    pmaddubsw xmm6, xmm5
    pmaddubsw xmm7, xmm5
    phaddsw xmm6, xmm7 // A
    psraw xmm1, 6 // R
    psraw xmm6, 6 // A
    packuswb xmm1, xmm1 // 8 R values
    packuswb xmm6, xmm6 // 8 A values
    punpcklbw xmm1, xmm6 // 8 RA values
    movdqa xmm6, xmm0 // Weave BG, RA together
    punpcklwd xmm0, xmm1 // BGRA first 4
    punpckhwd xmm6, xmm1 // BGRA next 4
    sub ecx, 8
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm6
    lea eax, [eax + 32]
    lea edx, [edx + 32]
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBCOLORMATRIXROW_SSSE3

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  __asm {
    mov eax, [esp + 4] /* dst_argb */
    movd xmm2, [esp + 8] /* scale */
    movd xmm3, [esp + 12] /* interval_size */
    movd xmm4, [esp + 16] /* interval_offset */
    mov ecx, [esp + 20] /* width */
    pshuflw xmm2, xmm2, 040h
    pshufd xmm2, xmm2, 044h
    pshuflw xmm3, xmm3, 040h
    pshufd xmm3, xmm3, 044h
    pshuflw xmm4, xmm4, 040h
    pshufd xmm4, xmm4, 044h
    pxor xmm5, xmm5 // constant 0
    pcmpeqb xmm6, xmm6 // generate mask 0xff000000
    pslld xmm6, 24

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 4 pixels
    punpcklbw xmm0, xmm5 // first 2 pixels
    pmulhuw xmm0, xmm2 // pixel * scale >> 16
    movdqa xmm1, [eax] // read 4 pixels
    punpckhbw xmm1, xmm5 // next 2 pixels
    pmulhuw xmm1, xmm2
    pmullw xmm0, xmm3 // * interval_size
    movdqa xmm7, [eax] // read 4 pixels
    pmullw xmm1, xmm3
    pand xmm7, xmm6 // mask alpha
    paddw xmm0, xmm4 // + interval_offset (typically interval_size / 2)
    paddw xmm1, xmm4
    packuswb xmm0, xmm1
    por xmm0, xmm7
    sub ecx, 4
    movdqa [eax], xmm0
    lea eax, [eax + 16]
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBQUANTIZEROW_SSE2
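
// Illustrative scalar sketch (hypothetical name) of the quantize math above:
// each color channel is posterized with
//   c = (c * scale >> 16) * interval_size + interval_offset
// while alpha is preserved via the 0xff000000 mask.
static void ARGBQuantizeRow_SketchC(uint8* dst_argb, int scale,
                                    int interval_size, int interval_offset,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int i;
    for (i = 0; i < 3; ++i) {  // B, G, R; alpha at [3] is left alone.
      dst_argb[i] = (uint8)((dst_argb[i] * scale >> 16) * interval_size +
                            interval_offset);
    }
    dst_argb += 4;
  }
}
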
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    movd xmm2, [esp + 16] // value
    punpcklbw xmm2, xmm2
    punpcklqdq xmm2, xmm2

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 4 pixels
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    pmulhuw xmm0, xmm2 // argb * value
    pmulhuw xmm1, xmm2 // argb * value
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    ret
  }
}
#endif // HAS_ARGBSHADEROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    pxor xmm5, xmm5 // constant 0

    align 4
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    movdqu xmm2, [esi] // read 4 pixels from src_argb1
    movdqu xmm1, xmm0
    movdqu xmm3, xmm2
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    punpcklbw xmm2, xmm5 // first 2
    punpckhbw xmm3, xmm5 // next 2
    pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
    pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
    lea eax, [eax + 16]
    lea esi, [esi + 16]
    packuswb xmm0, xmm1
    sub ecx, 4
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_ARGBMULTIPLYROW_SSE2

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked) __declspec(align(16))
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

    sub ecx, 4
    jl convertloop49

    align 4
  convertloop4:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    sub ecx, 4
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jge convertloop4

  convertloop49:
    add ecx, 4 - 1
    jl convertloop19

  convertloop1:
    movd xmm0, [eax] // read 1 pixel from src_argb0
    lea eax, [eax + 4]
    movd xmm1, [esi] // read 1 pixel from src_argb1
    lea esi, [esi + 4]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge convertloop1

  convertloop19:
    pop esi
    ret
  }
}
#endif // HAS_ARGBADDROW_SSE2
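
// Illustrative scalar sketch (hypothetical name): ARGBAddRow_SSE2 above is a
// per-byte saturating add, and ARGBMultiplyRow_SSE2 keeps roughly the high
// byte of the per-channel product (via byte duplication and pmulhuw, which
// approximates c0 * c1 / 255).
static void ARGBAddRow_SketchC(const uint8* src_argb0, const uint8* src_argb1,
                               uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    const int sum = src_argb0[i] + src_argb1[i];
    dst_argb[i] = (uint8)(sum > 255 ? 255 : sum);  // paddusb saturates.
  }
}
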
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

    align 4
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    psubusb xmm0, xmm1 // src_argb0 - src_argb1
    sub ecx, 4
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    vpxor ymm5, ymm5, ymm5 // constant 0

    align 4
  convertloop:
    vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
    lea esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1 // low 4
    vpunpckhbw ymm1, ymm1, ymm1 // high 4
    vpunpcklbw ymm2, ymm3, ymm5 // low 4
    vpunpckhbw ymm3, ymm3, ymm5 // high 4
    vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
    vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
    vpackuswb ymm0, ymm0, ymm1
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

    align 4
  convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

    align 4
  convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
__declspec(naked) __declspec(align(16))
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y0
    mov esi, [esp + 8 + 8] // src_y1
    mov edi, [esp + 8 + 12] // src_y2
    mov edx, [esp + 8 + 16] // dst_sobelx
    mov ecx, [esp + 8 + 20] // width
    sub esi, eax
    sub edi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0

    align 4
  convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
    movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    sub ecx, 8
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
#endif // HAS_SOBELXROW_SSE2

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
__declspec(naked) __declspec(align(16))
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_y0
    mov esi, [esp + 4 + 8] // src_y1
    mov edx, [esp + 4 + 12] // dst_sobely
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0

    align 4
  convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
    movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    sub ecx, 8
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_SOBELYROW_SSE2

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
__declspec(naked) __declspec(align(16))
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    pcmpeqb xmm5, xmm5 // alpha 255
    pslld xmm5, 24 // 0xff000000

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 16 pixels src_sobelx
    movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    movdqa xmm2, xmm0 // GG
    punpcklbw xmm2, xmm0 // First 8
    punpckhbw xmm0, xmm0 // Next 8
    movdqa xmm1, xmm2 // GGGG
    punpcklwd xmm1, xmm2 // First 4
    punpckhwd xmm2, xmm2 // Next 4
    por xmm1, xmm5 // GGGA
    por xmm2, xmm5
    movdqa xmm3, xmm0 // GGGG
    punpcklwd xmm3, xmm0 // Next 4
    punpckhwd xmm0, xmm0 // Last 4
    por xmm3, xmm5 // GGGA
    por xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm1
    movdqa [edx + 16], xmm2
    movdqa [edx + 32], xmm3
    movdqa [edx + 48], xmm0
    lea edx, [edx + 64]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_SOBELROW_SSE2
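
// Illustrative scalar sketch (hypothetical name) of one SobelX output byte,
// matching the 3x3 kernel in the comments above; SobelY is the same with
// rows and columns exchanged. The asm computes left minus right and then
// takes the absolute value, which is equivalent under abs.
static uint8 SobelXPixel_SketchC(const uint8* src_y0, const uint8* src_y1,
                                 const uint8* src_y2) {
  int sobel = (src_y0[0] - src_y0[2]) +
              2 * (src_y1[0] - src_y1[2]) +
              (src_y2[0] - src_y2[2]);
  if (sobel < 0) sobel = -sobel;  // abs, done as max(x, -x) in the asm.
  return (uint8)(sobel > 255 ? 255 : sobel);  // packuswb saturates.
}
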
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
__declspec(naked) __declspec(align(16))
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_y
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 16 pixels src_sobelx
    movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
__declspec(naked) __declspec(align(16))
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    pcmpeqb xmm5, xmm5 // alpha 255

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 16 pixels src_sobelx
    movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    paddusb xmm2, xmm1 // sobel = sobelx + sobely
    movdqa xmm3, xmm0 // XA
    punpcklbw xmm3, xmm5
    punpckhbw xmm0, xmm5
    movdqa xmm4, xmm1 // YS
    punpcklbw xmm4, xmm2
    punpckhbw xmm1, xmm2
    movdqa xmm6, xmm4 // YSXA
    punpcklwd xmm6, xmm3 // First 4
    punpckhwd xmm4, xmm3 // Next 4
    movdqa xmm7, xmm1 // YSXA
    punpcklwd xmm7, xmm0 // Next 4
    punpckhwd xmm1, xmm0 // Last 4
    sub ecx, 16
    movdqa [edx], xmm6
    movdqa [edx + 16], xmm4
    movdqa [edx + 32], xmm7
    movdqa [edx + 48], xmm1
    lea edx, [edx + 64]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_SOBELXYROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
// in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
// aligned.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  __asm {
    mov eax, topleft // eax topleft
    mov esi, botleft // esi botleft
    mov edx, width
    movd xmm5, area
    mov edi, dst
    mov ecx, count
    cvtdq2ps xmm5, xmm5
    rcpss xmm4, xmm5 // 1.0f / area
    pshufd xmm4, xmm4, 0
    sub ecx, 4
    jl l4b

    cmp area, 128 // 128 pixels will not overflow 15 bits.
    ja l4

    pshufd xmm5, xmm5, 0 // area
    pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
    psrld xmm6, 16
    cvtdq2ps xmm6, xmm6
    addps xmm5, xmm6 // (65536.0 + area - 1)
    mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
    cvtps2dq xmm5, xmm5 // 0.16 fixed point
    packssdw xmm5, xmm5 // 16 bit shorts

    // 4 pixel loop small blocks.
    align 4
  s4:
    // top left
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]

    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]

    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]

    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]

    packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
    packssdw xmm2, xmm3

    pmulhuw xmm0, xmm5
    pmulhuw xmm2, xmm5

    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge s4

    jmp l4b

    // 4 pixel loop
    align 4
  l4:
    // top left
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]

    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]

    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]

    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]

    cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
    cvtdq2ps xmm1, xmm1
    mulps xmm0, xmm4
    mulps xmm1, xmm4
    cvtdq2ps xmm2, xmm2
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    cvtps2dq xmm0, xmm0
    cvtps2dq xmm1, xmm1
    cvtps2dq xmm2, xmm2
    cvtps2dq xmm3, xmm3
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
    align 4
  l1:
    movdqa xmm0, [eax]
    psubd xmm0, [eax + edx * 4]
    lea eax, [eax + 16]
    psubd xmm0, [esi]
    paddd xmm0, [esi + edx * 4]
    lea esi, [esi + 16]
    cvtdq2ps xmm0, xmm0
    mulps xmm0, xmm4
    cvtps2dq xmm0, xmm0
    packssdw xmm0, xmm0
    packuswb xmm0, xmm0
    movd dword ptr [edi], xmm0
    lea edi, [edi + 4]
    sub ecx, 1
    jge l1
  l1b:
  }
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
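
// Illustrative scalar sketch (hypothetical name) of the box-filter average
// the function above computes from the summed-area table, per int32 channel
// (width here is the rectangle width in ints, as documented above):
static void CumulativeSumToAverageRow_SketchC(const int32* topleft,
                                              const int32* botleft,
                                              int width, int area, uint8* dst,
                                              int count) {
  const float ooa = 1.0f / area;  // reciprocal of the pixel count.
  int i;
  for (i = 0; i < count * 4; ++i) {  // 4 int32 channels per pixel.
    // Classic summed-area-table rectangle sum, then scale to an average.
    const int32 sum =
        topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width];
    dst[i] = (uint8)(sum * ooa);
  }
}
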
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  __asm {
    mov eax, row
    mov edx, cumsum
    mov esi, previous_cumsum
    mov ecx, width
    pxor xmm0, xmm0
    pxor xmm1, xmm1

    sub ecx, 4
    jl l4b
    test edx, 15
    jne l4b

    // 4 pixel loop
    align 4
  l4:
    movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
    lea eax, [eax + 16]
    movdqa xmm4, xmm2

    punpcklbw xmm2, xmm1
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm1
    punpckhwd xmm3, xmm1

    punpckhbw xmm4, xmm1
    movdqa xmm5, xmm4
    punpcklwd xmm4, xmm1
    punpckhwd xmm5, xmm1

    paddd xmm0, xmm2
    movdqa xmm2, [esi] // previous row above.
    paddd xmm2, xmm0

    paddd xmm0, xmm3
    movdqa xmm3, [esi + 16]
    paddd xmm3, xmm0

    paddd xmm0, xmm4
    movdqa xmm4, [esi + 32]
    paddd xmm4, xmm0

    paddd xmm0, xmm5
    movdqa xmm5, [esi + 48]
    lea esi, [esi + 64]
    paddd xmm5, xmm0

    movdqa [edx], xmm2
    movdqa [edx + 16], xmm3
    movdqa [edx + 32], xmm4
    movdqa [edx + 48], xmm5

    lea edx, [edx + 64]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
    align 4
  l1:
    movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
    lea eax, [eax + 4]
    punpcklbw xmm2, xmm1
    punpcklwd xmm2, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [esi]
    lea esi, [esi + 16]
    paddd xmm2, xmm0
    movdqu [edx], xmm2
    lea edx, [edx + 16]
    sub ecx, 1
    jge l1

  l1b:
  }
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
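
// Illustrative scalar sketch (hypothetical name) of the running sums above:
// each output is the sum of this row so far, plus the cumulative row above.
static void ComputeCumulativeSumRow_SketchC(const uint8* row, int32* cumsum,
                                            const int32* previous_cumsum,
                                            int width) {
  int32 sum[4] = {0, 0, 0, 0};  // running B, G, R, A sums along the row.
  int x;
  for (x = 0; x < width; ++x) {
    int i;
    for (i = 0; i < 4; ++i) {
      sum[i] += row[x * 4 + i];
      cumsum[x * 4 + i] = sum[i] + previous_cumsum[x * 4 + i];
    }
  }
}
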
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
__declspec(naked) __declspec(align(16))
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* uv_dudv, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 12] // src_argb
    mov esi, [esp + 16] // stride
    mov edx, [esp + 20] // dst_argb
    mov ecx, [esp + 24] // pointer to uv_dudv
    movq xmm2, qword ptr [ecx] // uv
    movq xmm7, qword ptr [ecx + 8] // dudv
    mov ecx, [esp + 28] // width
    shl esi, 16 // 4, stride
    add esi, 4
    movd xmm5, esi
    sub ecx, 4
    jl l4b

    // setup for 4 pixel loop
    pshufd xmm7, xmm7, 0x44 // dup dudv
    pshufd xmm5, xmm5, 0 // dup 4, stride
    movdqa xmm0, xmm2 // x0, y0, x1, y1
    addps xmm0, xmm7
    movlhps xmm2, xmm0
    movdqa xmm4, xmm7
    addps xmm4, xmm4 // dudv *= 2
    movdqa xmm3, xmm2 // x2, y2, x3, y3
    addps xmm3, xmm4
    addps xmm4, xmm4 // dudv *= 4

    // 4 pixel loop
    align 4
  l4:
    cvttps2dq xmm0, xmm2 // x, y float to int first 2
    cvttps2dq xmm1, xmm3 // x, y float to int next 2
    packssdw xmm0, xmm1 // x, y as 8 shorts
    pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd edi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd xmm1, [eax + esi] // read pixel 0
    movd xmm6, [eax + edi] // read pixel 1
    punpckldq xmm1, xmm6 // combine pixel 0 and 1
    addps xmm2, xmm4 // x, y += dx, dy first 2
    movq qword ptr [edx], xmm1
    movd esi, xmm0
    pshufd xmm0, xmm0, 0x39 // shift right
    movd edi, xmm0
    movd xmm6, [eax + esi] // read pixel 2
    movd xmm0, [eax + edi] // read pixel 3
    punpckldq xmm6, xmm0 // combine pixel 2 and 3
    addps xmm3, xmm4 // x, y += dx, dy next 2
    sub ecx, 4
    movq qword ptr [edx + 8], xmm6
    lea edx, [edx + 16]
    jge l4

  l4b:
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
    align 4
  l1:
    cvttps2dq xmm0, xmm2 // x, y float to int
    packssdw xmm0, xmm0 // x, y as shorts
    pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
    addps xmm2, xmm7 // x, y += dx, dy
    movd esi, xmm0
    movd xmm0, [eax + esi] // copy a pixel
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge l1
  l1b:
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_ARGBAFFINEROW_SSE2
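
// Illustrative scalar sketch (hypothetical name) of the affine sampling
// above: (u,v) starts at uv_dudv[0..1], steps by uv_dudv[2..3] per output
// pixel, and each ARGB sample is fetched at x * 4 + y * stride.
static void ARGBAffineRow_SketchC(const uint8* src_argb, int src_argb_stride,
                                  uint8* dst_argb, const float* uv_dudv,
                                  int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  int x;
  for (x = 0; x < width; ++x) {
    const int xi = (int)u;  // truncate toward zero, like cvttps2dq.
    const int yi = (int)v;
    *(uint32*)dst_argb =
        *(const uint32*)(src_argb + xi * 4 + yi * src_argb_stride);
    dst_argb += 4;
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}
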
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4] // dst_ptr
    mov esi, [esp + 8 + 8] // src_ptr
    mov edx, [esp + 8 + 12] // src_stride
    mov ecx, [esp + 8 + 16] // dst_width
    mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
    shr eax, 1
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100 // 0 / 128. Blend 100 / 0.
    sub edi, esi
    cmp eax, 32
    je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
    cmp eax, 64
    je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
    cmp eax, 96
    je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.

    vmovd xmm0, eax // high fraction 0..127
    neg eax
    add eax, 128
    vmovd xmm5, eax // low fraction 128..1
    vpunpcklbw xmm5, xmm5, xmm0
    vpunpcklwd xmm5, xmm5, xmm5
    vpxor ymm0, ymm0, ymm0
    vpermd ymm5, ymm0, ymm5

    align 4
  xloop:
    vmovdqu ymm0, [esi]
    vmovdqu ymm2, [esi + edx]
    vpunpckhbw ymm1, ymm0, ymm2 // mutates
    vpunpcklbw ymm0, ymm0, ymm2 // mutates
    vpmaddubsw ymm0, ymm0, ymm5
    vpmaddubsw ymm1, ymm1, ymm5
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm1, ymm1, 7
    vpackuswb ymm0, ymm0, ymm1 // unmutates
    sub ecx, 32
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    jg xloop
    jmp xloop99

    // Blend 25 / 75.
    align 4
  xloop25:
    vmovdqu ymm0, [esi]
    vpavgb ymm0, ymm0, [esi + edx]
    vpavgb ymm0, ymm0, [esi + edx]
    sub ecx, 32
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    jg xloop25
    jmp xloop99

    // Blend 50 / 50.
    align 4
  xloop50:
    vmovdqu ymm0, [esi]
    vpavgb ymm0, ymm0, [esi + edx]
    sub ecx, 32
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    jg xloop50
    jmp xloop99

    // Blend 75 / 25.
    align 4
  xloop75:
    vmovdqu ymm0, [esi + edx]
    vpavgb ymm0, ymm0, [esi]
    vpavgb ymm0, ymm0, [esi]
    sub ecx, 32
    vmovdqu [esi + edi], ymm0
    lea esi, [esi + 32]
    jg xloop75
    jmp xloop99

    // Blend 100 / 0 - Copy row unchanged.
    align 4
  xloop100:
    rep movsb

  xloop99:
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_INTERPOLATEROW_AVX2

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4] // dst_ptr
    mov esi, [esp + 8 + 8] // src_ptr
    mov edx, [esp + 8 + 12] // src_stride
    mov ecx, [esp + 8 + 16] // dst_width
    mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
    sub edi, esi
    shr eax, 1
    // Dispatch to specialized filters if applicable.
    cmp eax, 0
    je xloop100 // 0 / 128. Blend 100 / 0.
    cmp eax, 32
    je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
    cmp eax, 64
    je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
    cmp eax, 96
    je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.

    movd xmm0, eax // high fraction 0..127
    neg eax
    add eax, 128
    movd xmm5, eax // low fraction 128..1
    punpcklbw xmm5, xmm0
    punpcklwd xmm5, xmm5
    pshufd xmm5, xmm5, 0

    align 4
  xloop:
    movdqa xmm0, [esi]
    movdqa xmm2, [esi + edx]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    pmaddubsw xmm0, xmm5
    pmaddubsw xmm1, xmm5
    psrlw xmm0, 7
    psrlw xmm1, 7
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop
    jmp xloop99

    // Blend 25 / 75.
    align 4
  xloop25:
    movdqa xmm0, [esi]
    movdqa xmm1, [esi + edx]
    pavgb xmm0, xmm1
    pavgb xmm0, xmm1
    sub ecx, 16
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop25
    jmp xloop99

    // Blend 50 / 50.
    align 4
  xloop50:
    movdqa xmm0, [esi]
    movdqa xmm1, [esi + edx]
    pavgb xmm0, xmm1
    sub ecx, 16
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop50
    jmp xloop99

    // Blend 75 / 25.
    align 4
  xloop75:
    movdqa xmm1, [esi]
    movdqa xmm0, [esi + edx]
    pavgb xmm0, xmm1
    pavgb xmm0, xmm1
    sub ecx, 16
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop75
    jmp xloop99

    // Blend 100 / 0 - Copy row unchanged.
    align 4
  xloop100:
    movdqa xmm0, [esi]
    sub ecx, 16
    movdqa [esi + edi], xmm0
    lea esi, [esi + 16]
    jg xloop100

  xloop99:
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_INTERPOLATEROW_SSSE3
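
// Illustrative scalar sketch (hypothetical name) of the general blend path
// in the SSSE3/AVX2 filters above: the fraction is halved to 0..128 and each
// byte becomes (row0 * (128 - f) + row1 * f) >> 7 via pmaddubsw.
static void InterpolateRow_SketchC(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  const int f = source_y_fraction >> 1;  // 0..128, as in the dispatch above.
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * (128 - f) + src_ptr1[x] * f) >> 7);
  }
}
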
#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    cmp        eax, 64
    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
    cmp        eax, 128
    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
    cmp        eax, 192
    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.

    movd       xmm5, eax  // xmm5 = y fraction
    punpcklbw  xmm5, xmm5  // replicate byte: each word is f * 257
    psrlw      xmm5, 1     // 15 bit scale factor for pmulhw
    punpcklwd  xmm5, xmm5
    punpckldq  xmm5, xmm5
    punpcklqdq xmm5, xmm5
    pxor       xmm4, xmm4

    align      4
  xloop:
    movdqa     xmm0, [esi]  // row0
    movdqa     xmm2, [esi + edx]  // row1
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    psubw      xmm2, xmm0  // row1 - row0
    psubw      xmm3, xmm1
    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
    paddw      xmm3, xmm3
    pmulhw     xmm2, xmm5  // scale diff
    pmulhw     xmm3, xmm5
    paddw      xmm0, xmm2  // sum rows
    paddw      xmm1, xmm3
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop
    jmp        xloop99

    // Blend 25 / 75.
    align      4
  xloop25:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop25
    jmp        xloop99

    // Blend 50 / 50.
    align      4
  xloop50:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop50
    jmp        xloop99

    // Blend 75 / 25.
    align      4
  xloop75:
    movdqa     xmm1, [esi]
    movdqa     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop75
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
    align      4
  xloop100:
    movdqa     xmm0, [esi]
    sub        ecx, 16
    movdqa     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_INTERPOLATEROW_SSE2
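// InterpolateRow_SSE2 has no pmaddubsw, so it widens bytes to words and
// scales the row difference instead: the fraction f is replicated to
// f * 257 per word, halved to a 15 bit multiplier, the difference doubled,
// and pmulhw keeps the high 16 bits, giving roughly
// row0 + (row1 - row0) * f / 256.  A scalar model of that fixed-point math
// (illustrative helper, not a library API):
static void InterpolateRowFixedPointSketch_C(uint8* dst_ptr,
                                             const uint8* src_ptr,
                                             ptrdiff_t src_stride,
                                             int dst_width,
                                             int source_y_fraction) {
  int scale = (source_y_fraction * 257) >> 1;  // the 15 bit pmulhw operand.
  const uint8* src_ptr1 = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    int diff = src_ptr1[x] - src_ptr[x];
    dst_ptr[x] = (uint8)(src_ptr[x] + ((diff * 2 * scale) >> 16));
  }
}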
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    shr        eax, 1
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 128.  Blend 100 / 0.
    cmp        eax, 32
    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
    cmp        eax, 64
    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
    cmp        eax, 96
    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.

    movd       xmm0, eax  // high fraction 0..127
    neg        eax
    add        eax, 128
    movd       xmm5, eax  // low fraction 128..1
    punpcklbw  xmm5, xmm0
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0

    align      4
  xloop:
    movdqu     xmm0, [esi]
    movdqu     xmm2, [esi + edx]
    movdqa     xmm1, xmm0  // register copy; alignment is irrelevant here.
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    pmaddubsw  xmm0, xmm5
    pmaddubsw  xmm1, xmm5
    psrlw      xmm0, 7
    psrlw      xmm1, 7
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop
    jmp        xloop99

    // Blend 25 / 75.
    align      4
  xloop25:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop25
    jmp        xloop99

    // Blend 50 / 50.
    align      4
  xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop50
    jmp        xloop99

    // Blend 75 / 25.
    align      4
  xloop75:
    movdqu     xmm1, [esi]
    movdqu     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop75
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
    align      4
  xloop100:
    movdqu     xmm0, [esi]
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}

#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
__declspec(naked) __declspec(align(16))
void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    sub        edi, esi
    // Dispatch to specialized filters if applicable.
    cmp        eax, 0
    je         xloop100  // 0 / 256.  Blend 100 / 0.
    cmp        eax, 64
    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
    cmp        eax, 128
    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
    cmp        eax, 192
    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.

    movd       xmm5, eax  // xmm5 = y fraction
    punpcklbw  xmm5, xmm5
    psrlw      xmm5, 1
    punpcklwd  xmm5, xmm5
    punpckldq  xmm5, xmm5
    punpcklqdq xmm5, xmm5
    pxor       xmm4, xmm4

    align      4
  xloop:
    movdqu     xmm0, [esi]  // row0
    movdqu     xmm2, [esi + edx]  // row1
    movdqa     xmm1, xmm0  // register copies; alignment is irrelevant here.
    movdqa     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    psubw      xmm2, xmm0  // row1 - row0
    psubw      xmm3, xmm1
    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
    paddw      xmm3, xmm3
    pmulhw     xmm2, xmm5  // scale diff
    pmulhw     xmm3, xmm5
    paddw      xmm0, xmm2  // sum rows
    paddw      xmm1, xmm3
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop
    jmp        xloop99

    // Blend 25 / 75.
    align      4
  xloop25:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop25
    jmp        xloop99

    // Blend 50 / 50.
    align      4
  xloop50:
    movdqu     xmm0, [esi]
    movdqu     xmm1, [esi + edx]
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop50
    jmp        xloop99

    // Blend 75 / 25.
    align      4
  xloop75:
    movdqu     xmm1, [esi]
    movdqu     xmm0, [esi + edx]
    pavgb      xmm0, xmm1
    pavgb      xmm0, xmm1
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop75
    jmp        xloop99

    // Blend 100 / 0 - Copy row unchanged.
    align      4
  xloop100:
    movdqu     xmm0, [esi]
    sub        ecx, 16
    movdqu     [esi + edi], xmm0
    lea        esi, [esi + 16]
    jg         xloop100

  xloop99:
    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_INTERPOLATEROW_SSE2
__declspec(naked) __declspec(align(16))
void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_uv
    mov        edx, [esp + 4 + 8]   // src_uv_stride
    mov        edi, [esp + 4 + 12]  // dst_uv
    mov        ecx, [esp + 4 + 16]  // pix
    sub        edi, eax

    align      4
  convertloop:
    movdqa     xmm0, [eax]
    pavgb      xmm0, [eax + edx]
    sub        ecx, 16
    movdqa     [eax + edi], xmm0
    lea        eax, [eax + 16]
    jg         convertloop
    pop        edi
    ret
  }
}

#ifdef HAS_HALFROW_AVX2
__declspec(naked) __declspec(align(16))
void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  __asm {
    push       edi
    mov        eax, [esp + 4 + 4]   // src_uv
    mov        edx, [esp + 4 + 8]   // src_uv_stride
    mov        edi, [esp + 4 + 12]  // dst_uv
    mov        ecx, [esp + 4 + 16]  // pix
    sub        edi, eax

    align      4
  convertloop:
    vmovdqu    ymm0, [eax]
    vpavgb     ymm0, ymm0, [eax + edx]
    sub        ecx, 32
    vmovdqu    [eax + edi], ymm0
    lea        eax, [eax + 32]
    jg         convertloop

    pop        edi
    vzeroupper
    ret
  }
}
#endif  // HAS_HALFROW_AVX2
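// HalfRow averages two rows of UV with pavgb, which rounds up.  Scalar
// equivalent (illustrative helper, not part of the library API):
static void HalfRowSketch_C(const uint8* src_uv, int src_uv_stride,
                            uint8* dst_uv, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    // pavgb computes (a + b + 1) >> 1.
    dst_uv[x] = (uint8)((src_uv[x] + src_uv[x + src_uv_stride] + 1) >> 1);
  }
}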
__declspec(naked) __declspec(align(16))
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_bayer
    movd       xmm5, [esp + 12]  // selector
    mov        ecx, [esp + 16]  // pix
    pshufd     xmm5, xmm5, 0

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    punpckldq  xmm0, xmm1
    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop
    ret
  }
}

// Specialized ARGB to Bayer that just isolates G channel.
__declspec(naked) __declspec(align(16))
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_bayer
    // selector argument at [esp + 12] is unused; G is extracted directly.
    mov        ecx, [esp + 16]  // pix
    pcmpeqb    xmm5, xmm5  // generate mask 0x000000ff
    psrld      xmm5, 24

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm0, 8  // Move green to bottom.
    psrld      xmm1, 8
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packssdw   xmm0, xmm1
    packuswb   xmm0, xmm1
    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop
    ret
  }
}
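// ARGBToBayerRow broadcasts the 4-byte selector to every 16-byte (4 pixel)
// group, so selector byte n names the source byte for output pixel n of
// that group.  A scalar reading of the pshufb math (illustrative sketch;
// assumes pix is a multiple of 4):
static void ARGBToBayerRowSketch_C(const uint8* src_argb, uint8* dst_bayer,
                                   uint32 selector, int pix) {
  int x;
  for (x = 0; x < pix; x += 4) {
    dst_bayer[0] = src_argb[selector & 0xff];
    dst_bayer[1] = src_argb[(selector >> 8) & 0xff];
    dst_bayer[2] = src_argb[(selector >> 16) & 0xff];
    dst_bayer[3] = src_argb[(selector >> 24) & 0xff];
    src_argb += 16;  // 4 ARGB pixels consumed per group.
    dst_bayer += 4;
  }
}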
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    movdqa     xmm5, [ecx]
    mov        ecx, [esp + 16]  // pix

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    sub        ecx, 8
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    movdqa     xmm5, [ecx]
    mov        ecx, [esp + 16]  // pix

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm5
    sub        ecx, 8
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop
    ret
  }
}

#ifdef HAS_ARGBSHUFFLEROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // shuffler
    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
    mov        ecx, [esp + 16]  // pix

    align      4
  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax, [eax + 64]
    vpshufb    ymm0, ymm0, ymm5
    vpshufb    ymm1, ymm1, ymm5
    sub        ecx, 16
    vmovdqu    [edx], ymm0
    vmovdqu    [edx + 32], ymm1
    lea        edx, [edx + 64]
    jg         wloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  __asm {
    push       ebx
    push       esi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        edx, [esp + 8 + 8]   // dst_argb
    mov        esi, [esp + 8 + 12]  // shuffler
    mov        ecx, [esp + 8 + 16]  // pix
    pxor       xmm5, xmm5

    mov        ebx, [esi]  // shuffler
    cmp        ebx, 0x03000102
    je         shuf_3012
    cmp        ebx, 0x00010203
    je         shuf_0123
    cmp        ebx, 0x00030201
    je         shuf_0321
    cmp        ebx, 0x02010003
    je         shuf_2103

    // TODO(fbarchard): Use one source pointer and 3 offsets.
  shuf_any1:
    movzx      ebx, byte ptr [esi]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx], bl
    movzx      ebx, byte ptr [esi + 1]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 1], bl
    movzx      ebx, byte ptr [esi + 2]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 2], bl
    movzx      ebx, byte ptr [esi + 3]
    movzx      ebx, byte ptr [eax + ebx]
    mov        [edx + 3], bl
    lea        eax, [eax + 4]
    lea        edx, [edx + 4]
    sub        ecx, 1
    jg         shuf_any1
    jmp        shuf99

    align      4
  shuf_0123:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 01Bh  // 1B = 00011011 = 0x0123 = BGRAToARGB
    pshuflw    xmm0, xmm0, 01Bh
    pshufhw    xmm1, xmm1, 01Bh
    pshuflw    xmm1, xmm1, 01Bh
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_0123
    jmp        shuf99

    align      4
  shuf_0321:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 039h  // 39 = 00111001 = 0x0321 = RGBAToARGB
    pshuflw    xmm0, xmm0, 039h
    pshufhw    xmm1, xmm1, 039h
    pshuflw    xmm1, xmm1, 039h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_0321
    jmp        shuf99

    align      4
  shuf_2103:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 093h  // 93 = 10010011 = 0x2103 = ARGBToRGBA
    pshuflw    xmm0, xmm0, 093h
    pshufhw    xmm1, xmm1, 093h
    pshuflw    xmm1, xmm1, 093h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_2103
    jmp        shuf99

    align      4
  shuf_3012:
    movdqu     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm5
    punpckhbw  xmm1, xmm5
    pshufhw    xmm0, xmm0, 0C6h  // C6 = 11000110 = 0x3012 = ABGRToARGB
    pshuflw    xmm0, xmm0, 0C6h
    pshufhw    xmm1, xmm1, 0C6h
    pshuflw    xmm1, xmm1, 0C6h
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         shuf_3012

  shuf99:
    pop        esi
    pop        ebx
    ret
  }
}
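// Every ARGBShuffleRow variant applies the same 4-byte permutation to each
// pixel: shuffler[n] names the source byte that supplies output byte n.
// Scalar equivalent (illustrative helper):
static void ARGBShuffleRowSketch_C(const uint8* src_argb, uint8* dst_argb,
                                   const uint8* shuffler, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    // e.g. shuffler = {3, 2, 1, 0} reverses the channel order per pixel.
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}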
// YUY2 - Macro-pixel = 2 image pixels
// Y0U0Y1V0 Y2U2Y3V2 Y4U4Y5V4 ...

// UYVY - Macro-pixel = 2 image pixels
// U0Y0V0Y1 ...

__declspec(naked) __declspec(align(16))
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y
    mov        esi, [esp + 8 + 8]   // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi

    align      4
  convertloop:
    movq       xmm2, qword ptr [esi]        // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3  // UV
    movdqu     xmm0, [eax]  // Y
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2  // YUYV
    punpckhbw  xmm1, xmm2
    movdqu     [edi], xmm0
    movdqu     [edi + 16], xmm1
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_y
    mov        esi, [esp + 8 + 8]   // src_u
    mov        edx, [esp + 8 + 12]  // src_v
    mov        edi, [esp + 8 + 16]  // dst_frame
    mov        ecx, [esp + 8 + 20]  // width
    sub        edx, esi

    align      4
  convertloop:
    movq       xmm2, qword ptr [esi]        // U
    movq       xmm3, qword ptr [esi + edx]  // V
    lea        esi, [esi + 8]
    punpcklbw  xmm2, xmm3  // UV
    movdqu     xmm0, [eax]  // Y
    movdqa     xmm1, xmm2
    lea        eax, [eax + 16]
    punpcklbw  xmm1, xmm0  // UYVY
    punpckhbw  xmm2, xmm0
    movdqu     [edi], xmm1
    movdqu     [edi + 16], xmm2
    lea        edi, [edi + 32]
    sub        ecx, 16
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
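// Both packers interleave one U and one V sample with two Y samples per
// 4-byte macro-pixel; only the byte order differs.  Scalar sketch of the
// YUY2 ordering (illustrative helper; assumes width is even):
static void I422ToYUY2RowSketch_C(const uint8* src_y, const uint8* src_u,
                                  const uint8* src_v, uint8* dst_frame,
                                  int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];  // UYVY would emit U, Y0, V, Y1 instead.
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}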
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
__declspec(naked) __declspec(align(16))
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* src_argb */
    mov        edx, [esp + 4 + 8]   /* dst_argb */
    mov        esi, [esp + 4 + 12]  /* poly */
    mov        ecx, [esp + 4 + 16]  /* width */
    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.

    // 2 pixel loop.
    align      4
  convertloop:
    // pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel (SSE4.1 alternative)
    // pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
    movq       xmm0, qword ptr [eax]  // BGRABGRA
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm3
    movdqa     xmm4, xmm0
    punpcklwd  xmm0, xmm3  // pixel 0
    punpckhwd  xmm4, xmm3  // pixel 1
    cvtdq2ps   xmm0, xmm0  // 4 floats
    cvtdq2ps   xmm4, xmm4
    movdqa     xmm1, xmm0  // X
    movdqa     xmm5, xmm4
    mulps      xmm0, [esi + 16]  // C1 * X
    mulps      xmm4, [esi + 16]
    addps      xmm0, [esi]  // result = C0 + C1 * X
    addps      xmm4, [esi]
    movdqa     xmm2, xmm1
    movdqa     xmm6, xmm5
    mulps      xmm2, xmm1  // X * X
    mulps      xmm6, xmm5
    mulps      xmm1, xmm2  // X * X * X
    mulps      xmm5, xmm6
    mulps      xmm2, [esi + 32]  // C2 * X * X
    mulps      xmm6, [esi + 32]
    mulps      xmm1, [esi + 48]  // C3 * X * X * X
    mulps      xmm5, [esi + 48]
    addps      xmm0, xmm2  // result += C2 * X * X
    addps      xmm4, xmm6
    addps      xmm0, xmm1  // result += C3 * X * X * X
    addps      xmm4, xmm5
    cvttps2dq  xmm0, xmm0
    cvttps2dq  xmm4, xmm4
    packuswb   xmm0, xmm4
    packuswb   xmm0, xmm0
    sub        ecx, 2
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
__declspec(naked) __declspec(align(16))
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  __asm {
    mov        eax, [esp + 4]   /* src_argb */
    mov        edx, [esp + 8]   /* dst_argb */
    mov        ecx, [esp + 12]  /* poly */
    vbroadcastf128 ymm4, [ecx]       // C0
    vbroadcastf128 ymm5, [ecx + 16]  // C1
    vbroadcastf128 ymm6, [ecx + 32]  // C2
    vbroadcastf128 ymm7, [ecx + 48]  // C3
    mov        ecx, [esp + 16]  /* width */

    // 2 pixel loop.
    align      4
  convertloop:
    vpmovzxbd  ymm0, qword ptr [eax]  // 2 BGRA pixels
    lea        eax, [eax + 8]
    vcvtdq2ps  ymm0, ymm0  // X 8 floats
    vmulps     ymm2, ymm0, ymm0  // X * X
    vmulps     ymm3, ymm0, ymm7  // C3 * X
    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
    vcvttps2dq ymm0, ymm0
    vpackusdw  ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
    vpermq     ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
    vpackuswb  xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
    sub        ecx, 2
    vmovq      qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
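// Both polynomial rows evaluate, per channel, C0 + C1*x + C2*x^2 + C3*x^3,
// where poly stores four coefficients per term in B, G, R, A lane order and
// the result saturates to 0..255 on pack.  A scalar model (illustrative
// helper; the float clamp stands in for the pack saturation):
static void ARGBPolynomialRowSketch_C(const uint8* src_argb, uint8* dst_argb,
                                      const float* poly, int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    int c = i & 3;  // channel selects the coefficient lane.
    float x = (float)src_argb[i];
    float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
              poly[c + 12] * x * x * x;
    if (v < 0.f) v = 0.f;
    if (v > 255.f) v = 255.f;
    dst_argb[i] = (uint8)v;
  }
}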
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
__declspec(naked) __declspec(align(16))
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
    align      4
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    movzx      edx, byte ptr [eax - 4 + 3]
    movzx      edx, byte ptr [esi + edx * 4 + 3]
    mov        byte ptr [eax - 4 + 3], dl
    dec        ecx
    jg         convertloop
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table.
__declspec(naked) __declspec(align(16))
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   /* dst_argb */
    mov        esi, [esp + 4 + 8]   /* table_argb */
    mov        ecx, [esp + 4 + 12]  /* width */

    // 1 pixel loop.
    align      4
  convertloop:
    movzx      edx, byte ptr [eax]
    lea        eax, [eax + 4]
    movzx      edx, byte ptr [esi + edx * 4]
    mov        byte ptr [eax - 4], dl
    movzx      edx, byte ptr [eax - 4 + 1]
    movzx      edx, byte ptr [esi + edx * 4 + 1]
    mov        byte ptr [eax - 4 + 1], dl
    movzx      edx, byte ptr [eax - 4 + 2]
    movzx      edx, byte ptr [esi + edx * 4 + 2]
    mov        byte ptr [eax - 4 + 2], dl
    dec        ecx
    jg         convertloop

    pop        esi
    ret
  }
}
#endif  // HAS_RGBCOLORTABLEROW_X86
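// The color table rows remap each channel in place through an interleaved
// 256-entry ARGB table: new_b = table[b * 4 + 0], new_g = table[g * 4 + 1],
// and so on.  Scalar equivalent (illustrative helper):
static void ARGBColorTableRowSketch_C(uint8* dst_argb, const uint8* table_argb,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // RGB variant skips alpha.
    dst_argb += 4;
  }
}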
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table.
__declspec(naked) __declspec(align(16))
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   /* src_argb */
    mov        edi, [esp + 8 + 8]   /* dst_argb */
    mov        ecx, [esp + 8 + 12]  /* width */
    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
    pshufd     xmm2, xmm2, 0
    pshufd     xmm3, xmm3, 0
    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
    psllw      xmm4, 8
    pxor       xmm5, xmm5

    // 4 pixel loop.
    align      4
  convertloop:
    movdqu     xmm0, [eax]  // generate luma ptr
    pmaddubsw  xmm0, xmm3
    phaddw     xmm0, xmm0
    pand       xmm0, xmm4  // mask out low bits
    punpcklwd  xmm0, xmm5
    paddd      xmm0, xmm2  // add table base
    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi], dl
    movzx      edx, byte ptr [eax + 1]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 1], dl
    movzx      edx, byte ptr [eax + 2]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 2], dl
    movzx      edx, byte ptr [eax + 3]  // copy alpha.
    mov        byte ptr [edi + 3], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 4]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 4], dl
    movzx      edx, byte ptr [eax + 5]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 5], dl
    movzx      edx, byte ptr [eax + 6]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 6], dl
    movzx      edx, byte ptr [eax + 7]  // copy alpha.
    mov        byte ptr [edi + 7], dl

    movd       esi, xmm0
    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32

    movzx      edx, byte ptr [eax + 8]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 8], dl
    movzx      edx, byte ptr [eax + 9]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 9], dl
    movzx      edx, byte ptr [eax + 10]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 10], dl
    movzx      edx, byte ptr [eax + 11]  // copy alpha.
    mov        byte ptr [edi + 11], dl

    movd       esi, xmm0

    movzx      edx, byte ptr [eax + 12]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 12], dl
    movzx      edx, byte ptr [eax + 13]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 13], dl
    movzx      edx, byte ptr [eax + 14]
    movzx      edx, byte ptr [esi + edx]
    mov        byte ptr [edi + 14], dl
    movzx      edx, byte ptr [eax + 15]  // copy alpha.
    mov        byte ptr [edi + 15], dl

    sub        ecx, 4
    lea        eax, [eax + 16]
    lea        edi, [edi + 16]
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#endif  // defined(_M_X64)
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif