/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

// This module is for Visual C 32/64 bit and clangcl 32 bit
#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
    (defined(_M_IX86) || (defined(_M_X64) && !defined(__clang__)))

#if defined(_M_X64)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// 64 bit
#if defined(_M_X64)

// The READ* macros below expect the enclosing function to declare local
// __m128i variables xmm0..xmm5 and the pointers y_buf/u_buf/a_buf, plus
// `offset` = (v_buf - u_buf), so one pointer serves both chroma planes.

// Read 4 UV from 422, upsample to 8 UV.
// On exit: xmm0 holds 8 interleaved UV pairs, xmm4 holds 8 Y values
// duplicated into 16-bit lanes (for the mulhi scale in YUVTORGB).
#define READYUV422                                      \
  xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);            \
  xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                 \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                \
  u_buf += 4;                                           \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);              \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                 \
  y_buf += 8;

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
// Same as READYUV422, but additionally loads 8 alpha bytes into xmm5.
#define READYUVA422                                     \
  xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);            \
  xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                 \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                \
  u_buf += 4;                                           \
  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);              \
  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                 \
  y_buf += 8;                                           \
  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);              \
  a_buf += 8;

// Convert 8 pixels: 8 UV and 8 Y.
// Fixed-point YUV->RGB: consumes xmm0 (UV) and xmm4 (Y) from READYUV422,
// producing packed unsigned bytes B in xmm0, G in xmm1, R in xmm2.
// Bias constants are pre-negated, hence the (bias - product) subtraction.
#define YUVTORGB(yuvconstants)                                     \
  xmm1 = _mm_loadu_si128(&xmm0);                                   \
  xmm2 = _mm_loadu_si128(&xmm0);                                   \
  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
  xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);   \
  xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);   \
  xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);   \
  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);  \
  xmm0 = _mm_adds_epi16(xmm0, xmm4);                               \
  xmm1 = _mm_adds_epi16(xmm1, xmm4);                               \
  xmm2 = _mm_adds_epi16(xmm2, xmm4);                               \
  xmm0 = _mm_srai_epi16(xmm0, 6);                                  \
  xmm1 = _mm_srai_epi16(xmm1, 6);                                  \
  xmm2 = _mm_srai_epi16(xmm2, 6);                                  \
  xmm0 = _mm_packus_epi16(xmm0, xmm0);                             \
  xmm1 = _mm_packus_epi16(xmm1, xmm1);                             \
  xmm2 = _mm_packus_epi16(xmm2, xmm2);

// Store 8 ARGB values.
// Interleaves B (xmm0), G (xmm1), R (xmm2) and A (xmm5) into two 16-byte
// BGRA stores, then advances dst_argb by 32 bytes (8 pixels).
#define STOREARGB                                    \
  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);              \
  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);              \
  xmm1 = _mm_loadu_si128(&xmm0);                     \
  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);             \
  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);             \
  _mm_storeu_si128((__m128i*)dst_argb, xmm0);        \
  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
  dst_argb += 32;

#if defined(HAS_I422TOARGBROW_SSSE3)
// Convert one row of I422 (planar Y, half-width U and V) to ARGB,
// 8 pixels per iteration with opaque (0xff) alpha.
// NOTE(review): width is assumed to be handled in multiples of 8 by the
// caller; a trailing width % 8 would over-read/over-write — confirm callers.
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm4;
  const __m128i xmm5 = _mm_set1_epi8(-1);  // alpha = 0xff for all pixels
  // Single u_buf pointer plus this offset reaches the V plane too.
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
  while (width > 0) {
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
// Same as I422ToARGBRow_SSSE3 but reads per-pixel alpha from a_buf.
void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              const uint8* a_buf,
                              uint8* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
  __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
  while (width > 0) {
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    width -= 8;
  }
}
#endif

// 32 bit
#else  // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};

// JPeg full range.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};

static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};

// Constants for BGRA.
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR.
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

// Luma bias for video-range Y (Y = dot(RGB) + 16).
static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

// Chroma bias: centers U/V around 128.
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24.  First 8.
// (Index 128 produces zero with pshufb.)
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8,  10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4,  4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9,  11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5,  5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};

// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_SSE2(const uint8* src_y,
                                          uint8* dst_argb,
                                          int width) {
  __asm {
    mov        eax, [esp + 4]  // src_y
    mov        edx, [esp + 8]  // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm5, xmm5  // generate mask 0xff000000
    pslld      xmm5, 24

  convertloop:
    movq       xmm0, qword ptr [eax]  // 8 gray bytes
    lea        eax, [eax + 8]
    punpcklbw  xmm0, xmm0  // Y -> YY (gray duplicated)
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm0  // YY -> YYYY (low 4 pixels)
    punpckhwd  xmm1, xmm1  // YY -> YYYY (high 4 pixels)
    por        xmm0, xmm5  // set alpha to 0xff
    por        xmm1, xmm5
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         convertloop
    ret
  }
}

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) void J400ToARGBRow_AVX2(const uint8* src_y,
                                          uint8* dst_argb,
                                          int width) {
  __asm {
    mov         eax, [esp + 4]  // src_y
    mov         edx, [esp + 8]  // dst_argb
    mov         ecx, [esp + 12]  // width
    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld      ymm5, ymm5, 24

  convertloop:
    vmovdqu     xmm0, [eax]  // 16 gray bytes
    lea         eax, [eax + 16]
    vpermq      ymm0, ymm0, 0xd8  // mutate for cross-lane unpack
    vpunpcklbw  ymm0, ymm0, ymm0  // Y -> YY
    vpermq      ymm0, ymm0, 0xd8
    vpunpckhwd  ymm1, ymm0, ymm0  // YY -> YYYY (high 8 pixels)
    vpunpcklwd  ymm0, ymm0, ymm0  // YY -> YYYY (low 8 pixels)
    vpor        ymm0, ymm0, ymm5  // set alpha to 0xff
    vpor        ymm1, ymm1, ymm5
    vmovdqu     [edx], ymm0
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 16
    jg          convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

// Convert 48 bytes of RGB24 to 64 bytes of ARGB, 16 pixels per iteration.
__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24,
                                            uint8* dst_argb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_rgb24
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB

  convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12  // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15] }
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

// Convert 48 bytes of RAW (BGR) to 64 bytes of ARGB, 16 pixels per iteration.
// Identical structure to RGB24ToARGBRow_SSSE3 except for the shuffle mask.
__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw,
                                          uint8* dst_argb,
                                          int width) {
  __asm {
    mov       eax, [esp + 4]  // src_raw
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
    pslld     xmm5, 24
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB

  convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 16]
    movdqu    xmm3, [eax + 32]
    lea       eax, [eax + 48]
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb    xmm2, xmm4
    por       xmm2, xmm5
    palignr   xmm1, xmm0, 12  // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb    xmm0, xmm4
    movdqu    [edx + 32], xmm2
    por       xmm0, xmm5
    pshufb    xmm1, xmm4
    movdqu    [edx], xmm0
    por       xmm1, xmm5
    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15] }
    pshufb    xmm3, xmm4
    movdqu    [edx + 16], xmm1
    por       xmm3, xmm5
    movdqu    [edx + 48], xmm3
    lea       edx, [edx + 64]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

// Convert 24 bytes of RAW to 24 bytes of RGB24 (byte swap B<->R),
// 8 pixels per iteration via three overlapping 16-byte loads.
__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8* src_raw,
                                           uint8* dst_rgb24,
                                           int width) {
  __asm {
    mov       eax, [esp + 4]  // src_raw
    mov       edx, [esp + 8]  // dst_rgb24
    mov       ecx, [esp + 12]  // width
    movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
    movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
    movdqa    xmm5, xmmword ptr kShuffleMaskRAWToRGB24_2

  convertloop:
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + 4]
    movdqu    xmm2, [eax + 8]
    lea       eax, [eax + 24]
    pshufb    xmm0, xmm3
    pshufb    xmm1, xmm4
    pshufb    xmm2, xmm5
    movq      qword ptr [edx], xmm0
    movq      qword ptr [edx + 8], xmm1
    movq      qword ptr [edx + 16], xmm2
    lea       edx, [edx + 24]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565,
                                            uint8* dst_argb,
                                            int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    pcmpeqb   xmm4, xmm4  // generate mask 0x07e007e0 for Green
    psllw     xmm4, 10
    psrlw     xmm4, 5
    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]  // src_rgb565
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    // edx = dst - src * 2, so [eax * 2 + edx] addresses dst while only
    // eax needs to advance (dst moves at twice the source rate).
    sub       edx, eax
    sub       edx, eax

  convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of bgr565
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    pand      xmm1, xmm3  // R in upper 5 bits
    psllw     xmm2, 11  // B in upper 5 bits
    pmulhuw   xmm1, xmm5  // * (256 + 8)
    pmulhuw   xmm2, xmm5  // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2  // RB
    pand      xmm0, xmm4  // G in middle 6 bits
    pmulhuw   xmm0, xmm6  // << 5 * (256 + 4)
    por       xmm0, xmm7  // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8* src_rgb565,
                                            uint8* dst_argb,
                                            int width) {
  __asm {
    mov          eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd        xmm5, eax
    vbroadcastss ymm5, xmm5
    mov          eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    vmovd        xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb     ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw       ymm3, ymm3, 11
    vpcmpeqb     ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
    vpsllw       ymm4, ymm4, 10
    vpsrlw       ymm4, ymm4, 5
    vpcmpeqb     ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw       ymm7, ymm7, 8

    mov          eax, [esp + 4]  // src_rgb565
    mov          edx, [esp + 8]  // dst_argb
    mov          ecx, [esp + 12]  // width
    sub          edx, eax  // edx = dst - src * 2 (dst advances 2x src)
    sub          edx, eax

  convertloop:
    vmovdqu      ymm0, [eax]  // fetch 16 pixels of bgr565
    vpand        ymm1, ymm0, ymm3  // R in upper 5 bits
    vpsllw       ymm2, ymm0, 11  // B in upper 5 bits
    vpmulhuw     ymm1, ymm1, ymm5  // * (256 + 8)
    vpmulhuw     ymm2, ymm2, ymm5  // * (256 + 8)
    vpsllw       ymm1, ymm1, 8
    vpor         ymm1, ymm1, ymm2  // RB
    vpand        ymm0, ymm0, ymm4  // G in middle 6 bits
    vpmulhuw     ymm0, ymm0, ymm6  // << 5 * (256 + 4)
    vpor         ymm0, ymm0, ymm7  // AG
    vpermq       ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq       ymm1, ymm1, 0xd8
    vpunpckhbw   ymm2, ymm1, ymm0
    vpunpcklbw   ymm1, ymm1, ymm0
    vmovdqu      [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu      [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea          eax, [eax + 32]
    sub          ecx, 16
    jg           convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov          eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd        xmm5, eax
    vbroadcastss ymm5, xmm5
    mov          eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    vmovd        xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb     ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw       ymm3, ymm3, 11
    vpsrlw       ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
    vpcmpeqb     ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw       ymm7, ymm7, 8

    mov          eax, [esp + 4]  // src_argb1555
    mov          edx, [esp + 8]  // dst_argb
    mov          ecx, [esp + 12]  // width
    sub          edx, eax  // edx = dst - src * 2 (dst advances 2x src)
    sub          edx, eax

  convertloop:
    vmovdqu      ymm0, [eax]  // fetch 16 pixels of 1555
    vpsllw       ymm1, ymm0, 1  // R in upper 5 bits
    vpsllw       ymm2, ymm0, 11  // B in upper 5 bits
    vpand        ymm1, ymm1, ymm3
    vpmulhuw     ymm2, ymm2, ymm5  // * (256 + 8)
    vpmulhuw     ymm1, ymm1, ymm5  // * (256 + 8)
    vpsllw       ymm1, ymm1, 8
    vpor         ymm1, ymm1, ymm2  // RB
    vpsraw       ymm2, ymm0, 8  // A (sign-extend bit 15 through the byte)
    vpand        ymm0, ymm0, ymm4  // G in middle 5 bits
    vpmulhuw     ymm0, ymm0, ymm6  // << 6 * (256 + 8)
    vpand        ymm2, ymm2, ymm7
    vpor         ymm0, ymm0, ymm2  // AG
    vpermq       ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq       ymm1, ymm1, 0xd8
    vpunpckhbw   ymm2, ymm1, ymm0
    vpunpcklbw   ymm1, ymm1, ymm0
    vmovdqu      [eax * 2 + edx], ymm1  // store 8 pixels of ARGB
    vmovdqu      [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea          eax, [eax + 32]
    sub          ecx, 16
    jg           convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov          eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    vmovd        xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld       ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
    mov          eax, [esp + 4]  // src_argb4444
    mov          edx, [esp + 8]  // dst_argb
    mov          ecx, [esp + 12]  // width
    sub          edx, eax  // edx = dst - src * 2 (dst advances 2x src)
    sub          edx, eax

  convertloop:
    vmovdqu      ymm0, [eax]  // fetch 16 pixels of bgra4444
    vpand        ymm2, ymm0, ymm5  // mask high nibbles
    vpand        ymm0, ymm0, ymm4  // mask low nibbles
    vpsrlw       ymm3, ymm2, 4
    vpsllw       ymm1, ymm0, 4
    vpor         ymm2, ymm2, ymm3  // replicate nibble into both halves
    vpor         ymm0, ymm0, ymm1
    vpermq       ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq       ymm2, ymm2, 0xd8
    vpunpckhbw   ymm1, ymm0, ymm2
    vpunpcklbw   ymm0, ymm0, ymm2
    vmovdqu      [eax * 2 + edx], ymm0  // store 8 pixels of ARGB
    vmovdqu      [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea          eax, [eax + 32]
    sub          ecx, 16
    jg           convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd      xmm5, eax
    pshufd    xmm5, xmm5, 0
    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd      xmm6, eax
    pshufd    xmm6, xmm6, 0
    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
    psllw     xmm3, 11
    movdqa    xmm4, xmm3  // generate mask 0x03e003e0 for Green
    psrlw     xmm4, 6
    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
    psllw     xmm7, 8

    mov       eax, [esp + 4]  // src_argb1555
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax  // edx = dst - src * 2 (dst advances 2x src)
    sub       edx, eax

  convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of 1555
    movdqa    xmm1, xmm0
    movdqa    xmm2, xmm0
    psllw     xmm1, 1  // R in upper 5 bits
    psllw     xmm2, 11  // B in upper 5 bits
    pand      xmm1, xmm3
    pmulhuw   xmm2, xmm5  // * (256 + 8)
    pmulhuw   xmm1, xmm5  // * (256 + 8)
    psllw     xmm1, 8
    por       xmm1, xmm2  // RB
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4  // G in middle 5 bits
    psraw     xmm2, 8  // A (sign-extend bit 15 through the byte)
    pmulhuw   xmm0, xmm6  // << 6 * (256 + 8)
    pand      xmm2, xmm7
    por       xmm0, xmm2  // AG
    movdqa    xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444,
                                              uint8* dst_argb,
                                              int width) {
  __asm {
    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd      xmm4, eax
    pshufd    xmm4, xmm4, 0
    movdqa    xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
    pslld     xmm5, 4
    mov       eax, [esp + 4]  // src_argb4444
    mov       edx, [esp + 8]  // dst_argb
    mov       ecx, [esp + 12]  // width
    sub       edx, eax  // edx = dst - src * 2 (dst advances 2x src)
    sub       edx, eax

  convertloop:
    movdqu    xmm0, [eax]  // fetch 8 pixels of bgra4444
    movdqa    xmm2, xmm0
    pand      xmm0, xmm4  // mask low nibbles
    pand      xmm2, xmm5  // mask high nibbles
    movdqa    xmm1, xmm0
    movdqa    xmm3, xmm2
    psllw     xmm1, 4
    psrlw     xmm3, 4
    por       xmm0, xmm1  // replicate nibble into both halves
    por       xmm2, xmm3
    movdqa    xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
    movdqu    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea       eax, [eax + 16]
    sub       ecx, 8
    jg        convertloop
    ret
  }
}

// Convert 64 bytes of ARGB to 48 bytes of RGB24, 16 pixels per iteration.
__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb,
                                            uint8* dst_rgb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24

  convertloop:
    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq    xmm1, 4  // 8 bytes from 1
    pslldq    xmm4, 12  // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
    por       xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq    xmm5, 8  // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq    xmm2, 8  // 4 bytes from 2
    pslldq    xmm3, 4  // 12 bytes from 3 for 2
    por       xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1  // store 1
    movdqu    [edx + 32], xmm2  // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

// Convert 64 bytes of ARGB to 48 bytes of RAW (BGR), 16 pixels per iteration.
// Identical structure to ARGBToRGB24Row_SSSE3 except for the shuffle mask.
__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb,
                                          uint8* dst_rgb,
                                          int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW

  convertloop:
    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
    movdqu    xmm1, [eax + 16]
    movdqu    xmm2, [eax + 32]
    movdqu    xmm3, [eax + 48]
    lea       eax, [eax + 64]
    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb    xmm1, xmm6
    pshufb    xmm2, xmm6
    pshufb    xmm3, xmm6
    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
    psrldq    xmm1, 4  // 8 bytes from 1
    pslldq    xmm4, 12  // 4 bytes from 1 for 0
    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
    por       xmm0, xmm4  // 4 bytes from 1 for 0
    pslldq    xmm5, 8  // 8 bytes from 2 for 1
    movdqu    [edx], xmm0  // store 0
    por       xmm1, xmm5  // 8 bytes from 2 for 1
    psrldq    xmm2, 8  // 4 bytes from 2
    pslldq    xmm3, 4  // 12 bytes from 3 for 2
    por       xmm2, xmm3  // 12 bytes from 3 for 2
    movdqu    [edx + 16], xmm1  // store 1
    movdqu    [edx + 32], xmm2  // store 2
    lea       edx, [edx + 48]
    sub       ecx, 16
    jg        convertloop
    ret
  }
}

// Convert 4 ARGB pixels to 4 RGB565 pixels per iteration.
__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb,
                                            uint8* dst_rgb,
                                            int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
    pslld     xmm5, 11

  convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    pslld     xmm0, 8  // R
    psrld     xmm1, 3  // B
    psrld     xmm2, 5  // G
    psrad     xmm0, 16  // R
    pand      xmm1, xmm3  // B
    pand      xmm2, xmm4  // G
    pand      xmm0, xmm5  // R
    por       xmm1, xmm2  // BG
    por       xmm0, xmm1  // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

// Same as ARGBToRGB565Row_SSE2 but adds a 4-pixel ordered dither
// pattern (dither4, one byte per pixel) before quantizing.
__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb,
                                                  uint8* dst_rgb,
                                                  const uint32 dither4,
                                                  int width) {
  __asm {

    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    movd      xmm6, [esp + 12]  // dither4
    mov       ecx, [esp + 16]  // width
    punpcklbw xmm6, xmm6  // make dither 16 bytes
    movdqa    xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
    psrld     xmm3, 27
    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
    psrld     xmm4, 26
    pslld     xmm4, 5
    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
    pslld     xmm5, 11

  convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    paddusb   xmm0, xmm6  // add dither
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    pslld     xmm0, 8  // R
    psrld     xmm1, 3  // B
    psrld     xmm2, 5  // G
    psrad     xmm0, 16  // R
    pand      xmm1, xmm3  // B
    pand      xmm2, xmm4  // G
    pand      xmm0, xmm5  // R
    por       xmm1, xmm2  // BG
    por       xmm0, xmm1  // BGR
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb,
                                                  uint8* dst_rgb,
                                                  const uint32 dither4,
                                                  int width) {
  __asm {
    mov          eax, [esp + 4]  // src_argb
    mov          edx, [esp + 8]  // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov          ecx, [esp + 16]  // width
    vpunpcklbw   xmm6, xmm6, xmm6  // make dither 32 bytes
    vpermq       ymm6, ymm6, 0xd8
    vpunpcklwd   ymm6, ymm6, ymm6
    vpcmpeqb     ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld       ymm3, ymm3, 27
    vpcmpeqb     ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld       ymm4, ymm4, 26
    vpslld       ymm4, ymm4, 5
    vpslld       ymm5, ymm3, 11  // generate mask 0x0000f800

  convertloop:
    vmovdqu      ymm0, [eax]  // fetch 8 pixels of argb
    vpaddusb     ymm0, ymm0, ymm6  // add dither
    vpsrld       ymm2, ymm0, 5  // G
    vpsrld       ymm1, ymm0, 3  // B
    vpsrld       ymm0, ymm0, 8  // R
    vpand        ymm2, ymm2, ymm4  // G
    vpand        ymm1, ymm1, ymm3  // B
    vpand        ymm0, ymm0, ymm5  // R
    vpor         ymm1, ymm1, ymm2  // BG
    vpor         ymm0, ymm0, ymm1  // BGR
    vpackusdw    ymm0, ymm0, ymm0
    vpermq       ymm0, ymm0, 0xd8
    lea          eax, [eax + 32]
    vmovdqu      [edx], xmm0  // store 8 pixels of RGB565
    lea          edx, [edx + 16]
    sub          ecx, 8
    jg           convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
// Convert 4 ARGB pixels to 4 ARGB1555 pixels per iteration.
__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb,
                                              uint8* dst_rgb,
                                              int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4  // generate mask 0x0000001f
    psrld     xmm4, 27
    movdqa    xmm5, xmm4  // generate mask 0x000003e0
    pslld     xmm5, 5
    movdqa    xmm6, xmm4  // generate mask 0x00007c00
    pslld     xmm6, 10
    pcmpeqb   xmm7, xmm7  // generate mask 0xffff8000
    pslld     xmm7, 15

  convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0  // B
    movdqa    xmm2, xmm0  // G
    movdqa    xmm3, xmm0  // R
    psrad     xmm0, 16  // A
    psrld     xmm1, 3  // B
    psrld     xmm2, 6  // G
    psrld     xmm3, 9  // R
    pand      xmm0, xmm7  // A
    pand      xmm1, xmm4  // B
    pand      xmm2, xmm5  // G
    pand      xmm3, xmm6  // R
    por       xmm0, xmm1  // BA
    por       xmm2, xmm3  // GR
    por       xmm0, xmm2  // BGRA
    packssdw  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

// Convert 4 ARGB pixels to 4 ARGB4444 pixels per iteration.
__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb,
                                              uint8* dst_rgb,
                                              int width) {
  __asm {
    mov       eax, [esp + 4]  // src_argb
    mov       edx, [esp + 8]  // dst_rgb
    mov       ecx, [esp + 12]  // width
    pcmpeqb   xmm4, xmm4  // generate mask 0xf000f000
    psllw     xmm4, 12
    movdqa    xmm3, xmm4  // generate mask 0x00f000f0
    psrlw     xmm3, 8

  convertloop:
    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
    movdqa    xmm1, xmm0
    pand      xmm0, xmm3  // low nibble
    pand      xmm1, xmm4  // high nibble
    psrld     xmm0, 4
    psrld     xmm1, 8
    por       xmm0, xmm1
    packuswb  xmm0, xmm0
    lea       eax, [eax + 16]
    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea       edx, [edx + 8]
    sub       ecx, 4
    jg        convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb,
                                            uint8* dst_rgb,
                                            int width) {
  __asm {
    mov          eax, [esp + 4]  // src_argb
    mov          edx, [esp + 8]  // dst_rgb
    mov          ecx, [esp + 12]  // width
    vpcmpeqb     ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld       ymm3, ymm3, 27
    vpcmpeqb     ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld       ymm4, ymm4, 26
    vpslld       ymm4, ymm4, 5
    vpslld       ymm5, ymm3, 11  // generate mask 0x0000f800

  convertloop:
    vmovdqu      ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld       ymm2, ymm0, 5  // G
    vpsrld       ymm1, ymm0, 3  // B
    vpsrld       ymm0, ymm0, 8  // R
    vpand        ymm2, ymm2, ymm4  // G
    vpand        ymm1, ymm1, ymm3  // B
    vpand        ymm0, ymm0, ymm5  // R
    vpor         ymm1, ymm1, ymm2  // BG
    vpor         ymm0, ymm0, ymm1  // BGR
    vpackusdw    ymm0, ymm0, ymm0
    vpermq       ymm0, ymm0, 0xd8
    lea          eax, [eax + 32]
    vmovdqu      [edx], xmm0  // store 8 pixels of RGB565
    lea          edx, [edx + 16]
    sub          ecx, 8
    jg           convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb,
                                              uint8* dst_rgb,
                                              int width) {
  __asm {
    mov          eax, [esp + 4]  // src_argb
    mov          edx, [esp + 8]  // dst_rgb
    mov          ecx, [esp + 12]  // width
    vpcmpeqb     ymm4, ymm4, ymm4
    vpsrld       ymm4, ymm4, 27  // generate mask 0x0000001f
    vpslld       ymm5, ymm4, 5  // generate mask 0x000003e0
    vpslld       ymm6, ymm4, 10  // generate mask 0x00007c00
    vpcmpeqb     ymm7, ymm7, ymm7  // generate mask 0xffff8000
    vpslld       ymm7, ymm7, 15

  convertloop:
    vmovdqu      ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld       ymm3, ymm0, 9  // R
    vpsrld       ymm2, ymm0, 6  // G
    vpsrld       ymm1, ymm0, 3  // B
    vpsrad       ymm0, ymm0, 16  // A
    vpand        ymm3, ymm3, ymm6  // R
    vpand        ymm2, ymm2, ymm5  // G
    vpand        ymm1, ymm1, ymm4  // B
    vpand        ymm0, ymm0, ymm7  // A
    vpor         ymm0, ymm0, ymm1  // BA
    vpor         ymm2, ymm2, ymm3  // GR
    vpor         ymm0, ymm0, ymm2  // BGRA
    vpackssdw    ymm0, ymm0, ymm0
    vpermq       ymm0, ymm0, 0xd8
    lea          eax, [eax + 32]
    vmovdqu      [edx], xmm0  // store 8 pixels of ARGB1555
    lea          edx, [edx + 16]
    sub          ecx, 8
    jg           convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb,
                                              uint8* dst_rgb,
                                              int width) {
  __asm {
    mov          eax, [esp + 4]  // src_argb
    mov          edx, [esp + 8]  // dst_rgb
    mov          ecx, [esp + 12]  // width
    vpcmpeqb     ymm4, ymm4, ymm4  // generate mask 0xf000f000
    vpsllw       ymm4, ymm4, 12
    vpsrlw       ymm3, ymm4, 8  // generate mask 0x00f000f0

  convertloop:
    vmovdqu      ymm0, [eax]  // fetch 8 pixels of argb
    vpand        ymm1, ymm0, ymm4  // high nibble
    vpand        ymm0, ymm0, ymm3  // low nibble
    vpsrld       ymm1, ymm1, 8
    vpsrld       ymm0, ymm0, 4
    vpor         ymm0, ymm0, ymm1
    vpackuswb    ymm0, ymm0, ymm0
    vpermq       ymm0, ymm0, 0xd8
    lea          eax, [eax + 32]
    vmovdqu      [edx], xmm0  // store 8 pixels of ARGB4444
    lea          edx, [edx + 16]
    sub          ecx, 8
    jg           convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Convert 16 ARGB pixels (64 bytes) to 16 Y values using the kARGBToY
// coefficients, then bias by kAddY16 (video-range Y).  Processes 16 pixels
// per iteration; width is assumed to be a multiple of 16.
__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb,
                                        uint8* dst_y,
                                        int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kARGBToY
    movdqa     xmm5, xmmword ptr kAddY16

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4      // weighted B+G / R+A pairs per pixel
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1      // sum pairs -> one word per pixel
    phaddw     xmm2, xmm3
    psrlw      xmm0, 7         // 8.7 fixed point -> integer
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2      // 16 Y bytes
    paddb      xmm0, xmm5      // + 16 for video range
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb,
                                         uint8* dst_y,
                                         int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    movdqa     xmm4, xmmword ptr kARGBToYJ
    movdqa     xmm5, xmmword ptr kAddYJ64

 convertloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + 32]
    movdqu     xmm3, [eax + 48]
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 64]
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    paddw      xmm0, xmm5 // Add .5 for rounding.
    paddw      xmm2, xmm5
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
// Cross-lane permute that undoes the in-lane interleaving performed by
// vphaddw followed by vpackuswb (both operate within 128-bit lanes).
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYRow_SSSE3: same kARGBToY weights and +16 bias,
// twice the pixels per iteration.
__declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb,
                                       uint8* dst_y,
                                       int width) {
  __asm {
    mov        eax, [esp + 4] /* src_argb */
    mov        edx, [esp + 8] /* dst_y */
    mov        ecx, [esp + 12] /* width */
    vbroadcastf128 ymm4, xmmword ptr kARGBToY
    vbroadcastf128 ymm5, xmmword ptr kAddY16
    vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX

 convertloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    vmovdqu    ymm2, [eax + 64]
    vmovdqu    ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea        eax, [eax + 128]
    vphaddw    ymm0, ymm0, ymm1 // mutates.
    vphaddw    ymm2, ymm2, ymm3
    vpsrlw     ymm0, ymm0, 7
    vpsrlw     ymm2, ymm2, 7
    vpackuswb  ymm0, ymm0, ymm2 // mutates.
    vpermd     ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
    vpaddb     ymm0, ymm0, ymm5 // add 16 for Y
    vmovdqu    [edx], ymm0
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
1228 __declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb, 1229 uint8* dst_y, 1230 int width) { 1231 __asm { 1232 mov eax, [esp + 4] /* src_argb */ 1233 mov edx, [esp + 8] /* dst_y */ 1234 mov ecx, [esp + 12] /* width */ 1235 vbroadcastf128 ymm4, xmmword ptr kARGBToYJ 1236 vbroadcastf128 ymm5, xmmword ptr kAddYJ64 1237 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX 1238 1239 convertloop: 1240 vmovdqu ymm0, [eax] 1241 vmovdqu ymm1, [eax + 32] 1242 vmovdqu ymm2, [eax + 64] 1243 vmovdqu ymm3, [eax + 96] 1244 vpmaddubsw ymm0, ymm0, ymm4 1245 vpmaddubsw ymm1, ymm1, ymm4 1246 vpmaddubsw ymm2, ymm2, ymm4 1247 vpmaddubsw ymm3, ymm3, ymm4 1248 lea eax, [eax + 128] 1249 vphaddw ymm0, ymm0, ymm1 // mutates. 1250 vphaddw ymm2, ymm2, ymm3 1251 vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding. 1252 vpaddw ymm2, ymm2, ymm5 1253 vpsrlw ymm0, ymm0, 7 1254 vpsrlw ymm2, ymm2, 7 1255 vpackuswb ymm0, ymm0, ymm2 // mutates. 1256 vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation. 1257 vmovdqu [edx], ymm0 1258 lea edx, [edx + 32] 1259 sub ecx, 32 1260 jg convertloop 1261 1262 vzeroupper 1263 ret 1264 } 1265 } 1266 #endif // HAS_ARGBTOYJROW_AVX2 1267 1268 __declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb, 1269 uint8* dst_y, 1270 int width) { 1271 __asm { 1272 mov eax, [esp + 4] /* src_argb */ 1273 mov edx, [esp + 8] /* dst_y */ 1274 mov ecx, [esp + 12] /* width */ 1275 movdqa xmm4, xmmword ptr kBGRAToY 1276 movdqa xmm5, xmmword ptr kAddY16 1277 1278 convertloop: 1279 movdqu xmm0, [eax] 1280 movdqu xmm1, [eax + 16] 1281 movdqu xmm2, [eax + 32] 1282 movdqu xmm3, [eax + 48] 1283 pmaddubsw xmm0, xmm4 1284 pmaddubsw xmm1, xmm4 1285 pmaddubsw xmm2, xmm4 1286 pmaddubsw xmm3, xmm4 1287 lea eax, [eax + 64] 1288 phaddw xmm0, xmm1 1289 phaddw xmm2, xmm3 1290 psrlw xmm0, 7 1291 psrlw xmm2, 7 1292 packuswb xmm0, xmm2 1293 paddb xmm0, xmm5 1294 movdqu [edx], xmm0 1295 lea edx, [edx + 16] 1296 sub ecx, 16 1297 jg convertloop 1298 ret 1299 } 1300 } 1301 1302 __declspec(naked) 
void ABGRToYRow_SSSE3(const uint8* src_argb, 1303 uint8* dst_y, 1304 int width) { 1305 __asm { 1306 mov eax, [esp + 4] /* src_argb */ 1307 mov edx, [esp + 8] /* dst_y */ 1308 mov ecx, [esp + 12] /* width */ 1309 movdqa xmm4, xmmword ptr kABGRToY 1310 movdqa xmm5, xmmword ptr kAddY16 1311 1312 convertloop: 1313 movdqu xmm0, [eax] 1314 movdqu xmm1, [eax + 16] 1315 movdqu xmm2, [eax + 32] 1316 movdqu xmm3, [eax + 48] 1317 pmaddubsw xmm0, xmm4 1318 pmaddubsw xmm1, xmm4 1319 pmaddubsw xmm2, xmm4 1320 pmaddubsw xmm3, xmm4 1321 lea eax, [eax + 64] 1322 phaddw xmm0, xmm1 1323 phaddw xmm2, xmm3 1324 psrlw xmm0, 7 1325 psrlw xmm2, 7 1326 packuswb xmm0, xmm2 1327 paddb xmm0, xmm5 1328 movdqu [edx], xmm0 1329 lea edx, [edx + 16] 1330 sub ecx, 16 1331 jg convertloop 1332 ret 1333 } 1334 } 1335 1336 __declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb, 1337 uint8* dst_y, 1338 int width) { 1339 __asm { 1340 mov eax, [esp + 4] /* src_argb */ 1341 mov edx, [esp + 8] /* dst_y */ 1342 mov ecx, [esp + 12] /* width */ 1343 movdqa xmm4, xmmword ptr kRGBAToY 1344 movdqa xmm5, xmmword ptr kAddY16 1345 1346 convertloop: 1347 movdqu xmm0, [eax] 1348 movdqu xmm1, [eax + 16] 1349 movdqu xmm2, [eax + 32] 1350 movdqu xmm3, [eax + 48] 1351 pmaddubsw xmm0, xmm4 1352 pmaddubsw xmm1, xmm4 1353 pmaddubsw xmm2, xmm4 1354 pmaddubsw xmm3, xmm4 1355 lea eax, [eax + 64] 1356 phaddw xmm0, xmm1 1357 phaddw xmm2, xmm3 1358 psrlw xmm0, 7 1359 psrlw xmm2, 7 1360 packuswb xmm0, xmm2 1361 paddb xmm0, xmm5 1362 movdqu [edx], xmm0 1363 lea edx, [edx + 16] 1364 sub ecx, 16 1365 jg convertloop 1366 ret 1367 } 1368 } 1369 1370 __declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, 1371 int src_stride_argb, 1372 uint8* dst_u, 1373 uint8* dst_v, 1374 int width) { 1375 __asm { 1376 push esi 1377 push edi 1378 mov eax, [esp + 8 + 4] // src_argb 1379 mov esi, [esp + 8 + 8] // src_stride_argb 1380 mov edx, [esp + 8 + 12] // dst_u 1381 mov edi, [esp + 8 + 16] // dst_v 1382 mov ecx, [esp + 8 + 20] // 
width 1383 movdqa xmm5, xmmword ptr kAddUV128 1384 movdqa xmm6, xmmword ptr kARGBToV 1385 movdqa xmm7, xmmword ptr kARGBToU 1386 sub edi, edx // stride from u to v 1387 1388 convertloop: 1389 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1390 movdqu xmm0, [eax] 1391 movdqu xmm4, [eax + esi] 1392 pavgb xmm0, xmm4 1393 movdqu xmm1, [eax + 16] 1394 movdqu xmm4, [eax + esi + 16] 1395 pavgb xmm1, xmm4 1396 movdqu xmm2, [eax + 32] 1397 movdqu xmm4, [eax + esi + 32] 1398 pavgb xmm2, xmm4 1399 movdqu xmm3, [eax + 48] 1400 movdqu xmm4, [eax + esi + 48] 1401 pavgb xmm3, xmm4 1402 1403 lea eax, [eax + 64] 1404 movdqa xmm4, xmm0 1405 shufps xmm0, xmm1, 0x88 1406 shufps xmm4, xmm1, 0xdd 1407 pavgb xmm0, xmm4 1408 movdqa xmm4, xmm2 1409 shufps xmm2, xmm3, 0x88 1410 shufps xmm4, xmm3, 0xdd 1411 pavgb xmm2, xmm4 1412 1413 // step 2 - convert to U and V 1414 // from here down is very similar to Y code except 1415 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1416 movdqa xmm1, xmm0 1417 movdqa xmm3, xmm2 1418 pmaddubsw xmm0, xmm7 // U 1419 pmaddubsw xmm2, xmm7 1420 pmaddubsw xmm1, xmm6 // V 1421 pmaddubsw xmm3, xmm6 1422 phaddw xmm0, xmm2 1423 phaddw xmm1, xmm3 1424 psraw xmm0, 8 1425 psraw xmm1, 8 1426 packsswb xmm0, xmm1 1427 paddb xmm0, xmm5 // -> unsigned 1428 1429 // step 3 - store 8 U and 8 V values 1430 movlps qword ptr [edx], xmm0 // U 1431 movhps qword ptr [edx + edi], xmm0 // V 1432 lea edx, [edx + 8] 1433 sub ecx, 16 1434 jg convertloop 1435 1436 pop edi 1437 pop esi 1438 ret 1439 } 1440 } 1441 1442 __declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0, 1443 int src_stride_argb, 1444 uint8* dst_u, 1445 uint8* dst_v, 1446 int width) { 1447 __asm { 1448 push esi 1449 push edi 1450 mov eax, [esp + 8 + 4] // src_argb 1451 mov esi, [esp + 8 + 8] // src_stride_argb 1452 mov edx, [esp + 8 + 12] // dst_u 1453 mov edi, [esp + 8 + 16] // dst_v 1454 mov ecx, [esp + 8 + 20] // width 1455 movdqa xmm5, xmmword ptr kAddUVJ128 1456 movdqa xmm6, xmmword ptr 
kARGBToVJ 1457 movdqa xmm7, xmmword ptr kARGBToUJ 1458 sub edi, edx // stride from u to v 1459 1460 convertloop: 1461 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1462 movdqu xmm0, [eax] 1463 movdqu xmm4, [eax + esi] 1464 pavgb xmm0, xmm4 1465 movdqu xmm1, [eax + 16] 1466 movdqu xmm4, [eax + esi + 16] 1467 pavgb xmm1, xmm4 1468 movdqu xmm2, [eax + 32] 1469 movdqu xmm4, [eax + esi + 32] 1470 pavgb xmm2, xmm4 1471 movdqu xmm3, [eax + 48] 1472 movdqu xmm4, [eax + esi + 48] 1473 pavgb xmm3, xmm4 1474 1475 lea eax, [eax + 64] 1476 movdqa xmm4, xmm0 1477 shufps xmm0, xmm1, 0x88 1478 shufps xmm4, xmm1, 0xdd 1479 pavgb xmm0, xmm4 1480 movdqa xmm4, xmm2 1481 shufps xmm2, xmm3, 0x88 1482 shufps xmm4, xmm3, 0xdd 1483 pavgb xmm2, xmm4 1484 1485 // step 2 - convert to U and V 1486 // from here down is very similar to Y code except 1487 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1488 movdqa xmm1, xmm0 1489 movdqa xmm3, xmm2 1490 pmaddubsw xmm0, xmm7 // U 1491 pmaddubsw xmm2, xmm7 1492 pmaddubsw xmm1, xmm6 // V 1493 pmaddubsw xmm3, xmm6 1494 phaddw xmm0, xmm2 1495 phaddw xmm1, xmm3 1496 paddw xmm0, xmm5 // +.5 rounding -> unsigned 1497 paddw xmm1, xmm5 1498 psraw xmm0, 8 1499 psraw xmm1, 8 1500 packsswb xmm0, xmm1 1501 1502 // step 3 - store 8 U and 8 V values 1503 movlps qword ptr [edx], xmm0 // U 1504 movhps qword ptr [edx + edi], xmm0 // V 1505 lea edx, [edx + 8] 1506 sub ecx, 16 1507 jg convertloop 1508 1509 pop edi 1510 pop esi 1511 ret 1512 } 1513 } 1514 1515 #ifdef HAS_ARGBTOUVROW_AVX2 1516 __declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0, 1517 int src_stride_argb, 1518 uint8* dst_u, 1519 uint8* dst_v, 1520 int width) { 1521 __asm { 1522 push esi 1523 push edi 1524 mov eax, [esp + 8 + 4] // src_argb 1525 mov esi, [esp + 8 + 8] // src_stride_argb 1526 mov edx, [esp + 8 + 12] // dst_u 1527 mov edi, [esp + 8 + 16] // dst_v 1528 mov ecx, [esp + 8 + 20] // width 1529 vbroadcastf128 ymm5, xmmword ptr kAddUV128 1530 vbroadcastf128 ymm6, 
xmmword ptr kARGBToV 1531 vbroadcastf128 ymm7, xmmword ptr kARGBToU 1532 sub edi, edx // stride from u to v 1533 1534 convertloop: 1535 /* step 1 - subsample 32x2 argb pixels to 16x1 */ 1536 vmovdqu ymm0, [eax] 1537 vmovdqu ymm1, [eax + 32] 1538 vmovdqu ymm2, [eax + 64] 1539 vmovdqu ymm3, [eax + 96] 1540 vpavgb ymm0, ymm0, [eax + esi] 1541 vpavgb ymm1, ymm1, [eax + esi + 32] 1542 vpavgb ymm2, ymm2, [eax + esi + 64] 1543 vpavgb ymm3, ymm3, [eax + esi + 96] 1544 lea eax, [eax + 128] 1545 vshufps ymm4, ymm0, ymm1, 0x88 1546 vshufps ymm0, ymm0, ymm1, 0xdd 1547 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps 1548 vshufps ymm4, ymm2, ymm3, 0x88 1549 vshufps ymm2, ymm2, ymm3, 0xdd 1550 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps 1551 1552 // step 2 - convert to U and V 1553 // from here down is very similar to Y code except 1554 // instead of 32 different pixels, its 16 pixels of U and 16 of V 1555 vpmaddubsw ymm1, ymm0, ymm7 // U 1556 vpmaddubsw ymm3, ymm2, ymm7 1557 vpmaddubsw ymm0, ymm0, ymm6 // V 1558 vpmaddubsw ymm2, ymm2, ymm6 1559 vphaddw ymm1, ymm1, ymm3 // mutates 1560 vphaddw ymm0, ymm0, ymm2 1561 vpsraw ymm1, ymm1, 8 1562 vpsraw ymm0, ymm0, 8 1563 vpacksswb ymm0, ymm1, ymm0 // mutates 1564 vpermq ymm0, ymm0, 0xd8 // For vpacksswb 1565 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw 1566 vpaddb ymm0, ymm0, ymm5 // -> unsigned 1567 1568 // step 3 - store 16 U and 16 V values 1569 vextractf128 [edx], ymm0, 0 // U 1570 vextractf128 [edx + edi], ymm0, 1 // V 1571 lea edx, [edx + 16] 1572 sub ecx, 32 1573 jg convertloop 1574 1575 pop edi 1576 pop esi 1577 vzeroupper 1578 ret 1579 } 1580 } 1581 #endif // HAS_ARGBTOUVROW_AVX2 1582 1583 #ifdef HAS_ARGBTOUVJROW_AVX2 1584 __declspec(naked) void ARGBToUVJRow_AVX2(const uint8* src_argb0, 1585 int src_stride_argb, 1586 uint8* dst_u, 1587 uint8* dst_v, 1588 int width) { 1589 __asm { 1590 push esi 1591 push edi 1592 mov eax, [esp + 8 + 4] // src_argb 1593 mov esi, [esp + 8 + 8] // src_stride_argb 1594 
mov edx, [esp + 8 + 12] // dst_u 1595 mov edi, [esp + 8 + 16] // dst_v 1596 mov ecx, [esp + 8 + 20] // width 1597 vbroadcastf128 ymm5, xmmword ptr kAddUV128 1598 vbroadcastf128 ymm6, xmmword ptr kARGBToV 1599 vbroadcastf128 ymm7, xmmword ptr kARGBToU 1600 sub edi, edx // stride from u to v 1601 1602 convertloop: 1603 /* step 1 - subsample 32x2 argb pixels to 16x1 */ 1604 vmovdqu ymm0, [eax] 1605 vmovdqu ymm1, [eax + 32] 1606 vmovdqu ymm2, [eax + 64] 1607 vmovdqu ymm3, [eax + 96] 1608 vpavgb ymm0, ymm0, [eax + esi] 1609 vpavgb ymm1, ymm1, [eax + esi + 32] 1610 vpavgb ymm2, ymm2, [eax + esi + 64] 1611 vpavgb ymm3, ymm3, [eax + esi + 96] 1612 lea eax, [eax + 128] 1613 vshufps ymm4, ymm0, ymm1, 0x88 1614 vshufps ymm0, ymm0, ymm1, 0xdd 1615 vpavgb ymm0, ymm0, ymm4 // mutated by vshufps 1616 vshufps ymm4, ymm2, ymm3, 0x88 1617 vshufps ymm2, ymm2, ymm3, 0xdd 1618 vpavgb ymm2, ymm2, ymm4 // mutated by vshufps 1619 1620 // step 2 - convert to U and V 1621 // from here down is very similar to Y code except 1622 // instead of 32 different pixels, its 16 pixels of U and 16 of V 1623 vpmaddubsw ymm1, ymm0, ymm7 // U 1624 vpmaddubsw ymm3, ymm2, ymm7 1625 vpmaddubsw ymm0, ymm0, ymm6 // V 1626 vpmaddubsw ymm2, ymm2, ymm6 1627 vphaddw ymm1, ymm1, ymm3 // mutates 1628 vphaddw ymm0, ymm0, ymm2 1629 vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned 1630 vpaddw ymm0, ymm0, ymm5 1631 vpsraw ymm1, ymm1, 8 1632 vpsraw ymm0, ymm0, 8 1633 vpacksswb ymm0, ymm1, ymm0 // mutates 1634 vpermq ymm0, ymm0, 0xd8 // For vpacksswb 1635 vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw 1636 1637 // step 3 - store 16 U and 16 V values 1638 vextractf128 [edx], ymm0, 0 // U 1639 vextractf128 [edx + edi], ymm0, 1 // V 1640 lea edx, [edx + 16] 1641 sub ecx, 32 1642 jg convertloop 1643 1644 pop edi 1645 pop esi 1646 vzeroupper 1647 ret 1648 } 1649 } 1650 #endif // HAS_ARGBTOUVJROW_AVX2 1651 1652 __declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, 1653 uint8* dst_u, 
1654 uint8* dst_v, 1655 int width) { 1656 __asm { 1657 push edi 1658 mov eax, [esp + 4 + 4] // src_argb 1659 mov edx, [esp + 4 + 8] // dst_u 1660 mov edi, [esp + 4 + 12] // dst_v 1661 mov ecx, [esp + 4 + 16] // width 1662 movdqa xmm5, xmmword ptr kAddUV128 1663 movdqa xmm6, xmmword ptr kARGBToV 1664 movdqa xmm7, xmmword ptr kARGBToU 1665 sub edi, edx // stride from u to v 1666 1667 convertloop: 1668 /* convert to U and V */ 1669 movdqu xmm0, [eax] // U 1670 movdqu xmm1, [eax + 16] 1671 movdqu xmm2, [eax + 32] 1672 movdqu xmm3, [eax + 48] 1673 pmaddubsw xmm0, xmm7 1674 pmaddubsw xmm1, xmm7 1675 pmaddubsw xmm2, xmm7 1676 pmaddubsw xmm3, xmm7 1677 phaddw xmm0, xmm1 1678 phaddw xmm2, xmm3 1679 psraw xmm0, 8 1680 psraw xmm2, 8 1681 packsswb xmm0, xmm2 1682 paddb xmm0, xmm5 1683 movdqu [edx], xmm0 1684 1685 movdqu xmm0, [eax] // V 1686 movdqu xmm1, [eax + 16] 1687 movdqu xmm2, [eax + 32] 1688 movdqu xmm3, [eax + 48] 1689 pmaddubsw xmm0, xmm6 1690 pmaddubsw xmm1, xmm6 1691 pmaddubsw xmm2, xmm6 1692 pmaddubsw xmm3, xmm6 1693 phaddw xmm0, xmm1 1694 phaddw xmm2, xmm3 1695 psraw xmm0, 8 1696 psraw xmm2, 8 1697 packsswb xmm0, xmm2 1698 paddb xmm0, xmm5 1699 lea eax, [eax + 64] 1700 movdqu [edx + edi], xmm0 1701 lea edx, [edx + 16] 1702 sub ecx, 16 1703 jg convertloop 1704 1705 pop edi 1706 ret 1707 } 1708 } 1709 1710 __declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, 1711 int src_stride_argb, 1712 uint8* dst_u, 1713 uint8* dst_v, 1714 int width) { 1715 __asm { 1716 push esi 1717 push edi 1718 mov eax, [esp + 8 + 4] // src_argb 1719 mov esi, [esp + 8 + 8] // src_stride_argb 1720 mov edx, [esp + 8 + 12] // dst_u 1721 mov edi, [esp + 8 + 16] // dst_v 1722 mov ecx, [esp + 8 + 20] // width 1723 movdqa xmm5, xmmword ptr kAddUV128 1724 movdqa xmm6, xmmword ptr kBGRAToV 1725 movdqa xmm7, xmmword ptr kBGRAToU 1726 sub edi, edx // stride from u to v 1727 1728 convertloop: 1729 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1730 movdqu xmm0, [eax] 1731 movdqu xmm4, [eax + 
esi] 1732 pavgb xmm0, xmm4 1733 movdqu xmm1, [eax + 16] 1734 movdqu xmm4, [eax + esi + 16] 1735 pavgb xmm1, xmm4 1736 movdqu xmm2, [eax + 32] 1737 movdqu xmm4, [eax + esi + 32] 1738 pavgb xmm2, xmm4 1739 movdqu xmm3, [eax + 48] 1740 movdqu xmm4, [eax + esi + 48] 1741 pavgb xmm3, xmm4 1742 1743 lea eax, [eax + 64] 1744 movdqa xmm4, xmm0 1745 shufps xmm0, xmm1, 0x88 1746 shufps xmm4, xmm1, 0xdd 1747 pavgb xmm0, xmm4 1748 movdqa xmm4, xmm2 1749 shufps xmm2, xmm3, 0x88 1750 shufps xmm4, xmm3, 0xdd 1751 pavgb xmm2, xmm4 1752 1753 // step 2 - convert to U and V 1754 // from here down is very similar to Y code except 1755 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1756 movdqa xmm1, xmm0 1757 movdqa xmm3, xmm2 1758 pmaddubsw xmm0, xmm7 // U 1759 pmaddubsw xmm2, xmm7 1760 pmaddubsw xmm1, xmm6 // V 1761 pmaddubsw xmm3, xmm6 1762 phaddw xmm0, xmm2 1763 phaddw xmm1, xmm3 1764 psraw xmm0, 8 1765 psraw xmm1, 8 1766 packsswb xmm0, xmm1 1767 paddb xmm0, xmm5 // -> unsigned 1768 1769 // step 3 - store 8 U and 8 V values 1770 movlps qword ptr [edx], xmm0 // U 1771 movhps qword ptr [edx + edi], xmm0 // V 1772 lea edx, [edx + 8] 1773 sub ecx, 16 1774 jg convertloop 1775 1776 pop edi 1777 pop esi 1778 ret 1779 } 1780 } 1781 1782 __declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, 1783 int src_stride_argb, 1784 uint8* dst_u, 1785 uint8* dst_v, 1786 int width) { 1787 __asm { 1788 push esi 1789 push edi 1790 mov eax, [esp + 8 + 4] // src_argb 1791 mov esi, [esp + 8 + 8] // src_stride_argb 1792 mov edx, [esp + 8 + 12] // dst_u 1793 mov edi, [esp + 8 + 16] // dst_v 1794 mov ecx, [esp + 8 + 20] // width 1795 movdqa xmm5, xmmword ptr kAddUV128 1796 movdqa xmm6, xmmword ptr kABGRToV 1797 movdqa xmm7, xmmword ptr kABGRToU 1798 sub edi, edx // stride from u to v 1799 1800 convertloop: 1801 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1802 movdqu xmm0, [eax] 1803 movdqu xmm4, [eax + esi] 1804 pavgb xmm0, xmm4 1805 movdqu xmm1, [eax + 16] 1806 movdqu xmm4, 
[eax + esi + 16] 1807 pavgb xmm1, xmm4 1808 movdqu xmm2, [eax + 32] 1809 movdqu xmm4, [eax + esi + 32] 1810 pavgb xmm2, xmm4 1811 movdqu xmm3, [eax + 48] 1812 movdqu xmm4, [eax + esi + 48] 1813 pavgb xmm3, xmm4 1814 1815 lea eax, [eax + 64] 1816 movdqa xmm4, xmm0 1817 shufps xmm0, xmm1, 0x88 1818 shufps xmm4, xmm1, 0xdd 1819 pavgb xmm0, xmm4 1820 movdqa xmm4, xmm2 1821 shufps xmm2, xmm3, 0x88 1822 shufps xmm4, xmm3, 0xdd 1823 pavgb xmm2, xmm4 1824 1825 // step 2 - convert to U and V 1826 // from here down is very similar to Y code except 1827 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1828 movdqa xmm1, xmm0 1829 movdqa xmm3, xmm2 1830 pmaddubsw xmm0, xmm7 // U 1831 pmaddubsw xmm2, xmm7 1832 pmaddubsw xmm1, xmm6 // V 1833 pmaddubsw xmm3, xmm6 1834 phaddw xmm0, xmm2 1835 phaddw xmm1, xmm3 1836 psraw xmm0, 8 1837 psraw xmm1, 8 1838 packsswb xmm0, xmm1 1839 paddb xmm0, xmm5 // -> unsigned 1840 1841 // step 3 - store 8 U and 8 V values 1842 movlps qword ptr [edx], xmm0 // U 1843 movhps qword ptr [edx + edi], xmm0 // V 1844 lea edx, [edx + 8] 1845 sub ecx, 16 1846 jg convertloop 1847 1848 pop edi 1849 pop esi 1850 ret 1851 } 1852 } 1853 1854 __declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0, 1855 int src_stride_argb, 1856 uint8* dst_u, 1857 uint8* dst_v, 1858 int width) { 1859 __asm { 1860 push esi 1861 push edi 1862 mov eax, [esp + 8 + 4] // src_argb 1863 mov esi, [esp + 8 + 8] // src_stride_argb 1864 mov edx, [esp + 8 + 12] // dst_u 1865 mov edi, [esp + 8 + 16] // dst_v 1866 mov ecx, [esp + 8 + 20] // width 1867 movdqa xmm5, xmmword ptr kAddUV128 1868 movdqa xmm6, xmmword ptr kRGBAToV 1869 movdqa xmm7, xmmword ptr kRGBAToU 1870 sub edi, edx // stride from u to v 1871 1872 convertloop: 1873 /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1874 movdqu xmm0, [eax] 1875 movdqu xmm4, [eax + esi] 1876 pavgb xmm0, xmm4 1877 movdqu xmm1, [eax + 16] 1878 movdqu xmm4, [eax + esi + 16] 1879 pavgb xmm1, xmm4 1880 movdqu xmm2, [eax + 32] 1881 
movdqu xmm4, [eax + esi + 32] 1882 pavgb xmm2, xmm4 1883 movdqu xmm3, [eax + 48] 1884 movdqu xmm4, [eax + esi + 48] 1885 pavgb xmm3, xmm4 1886 1887 lea eax, [eax + 64] 1888 movdqa xmm4, xmm0 1889 shufps xmm0, xmm1, 0x88 1890 shufps xmm4, xmm1, 0xdd 1891 pavgb xmm0, xmm4 1892 movdqa xmm4, xmm2 1893 shufps xmm2, xmm3, 0x88 1894 shufps xmm4, xmm3, 0xdd 1895 pavgb xmm2, xmm4 1896 1897 // step 2 - convert to U and V 1898 // from here down is very similar to Y code except 1899 // instead of 16 different pixels, its 8 pixels of U and 8 of V 1900 movdqa xmm1, xmm0 1901 movdqa xmm3, xmm2 1902 pmaddubsw xmm0, xmm7 // U 1903 pmaddubsw xmm2, xmm7 1904 pmaddubsw xmm1, xmm6 // V 1905 pmaddubsw xmm3, xmm6 1906 phaddw xmm0, xmm2 1907 phaddw xmm1, xmm3 1908 psraw xmm0, 8 1909 psraw xmm1, 8 1910 packsswb xmm0, xmm1 1911 paddb xmm0, xmm5 // -> unsigned 1912 1913 // step 3 - store 8 U and 8 V values 1914 movlps qword ptr [edx], xmm0 // U 1915 movhps qword ptr [edx + edi], xmm0 // V 1916 lea edx, [edx + 8] 1917 sub ecx, 16 1918 jg convertloop 1919 1920 pop edi 1921 pop esi 1922 ret 1923 } 1924 } 1925 #endif // HAS_ARGBTOYROW_SSSE3 1926 1927 // Read 16 UV from 444 1928 #define READYUV444_AVX2 \ 1929 __asm { \ 1930 __asm vmovdqu xmm0, [esi] /* U */ \ 1931 __asm vmovdqu xmm1, [esi + edi] /* V */ \ 1932 __asm lea esi, [esi + 16] \ 1933 __asm vpermq ymm0, ymm0, 0xd8 \ 1934 __asm vpermq ymm1, ymm1, 0xd8 \ 1935 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 1936 __asm vmovdqu xmm4, [eax] /* Y */ \ 1937 __asm vpermq ymm4, ymm4, 0xd8 \ 1938 __asm vpunpcklbw ymm4, ymm4, ymm4 \ 1939 __asm lea eax, [eax + 16]} 1940 1941 // Read 8 UV from 422, upsample to 16 UV. 
// Registers: eax = y_buf, esi = u_buf, edi = v_buf - u_buf offset.
// Loads 8 U and 8 V, interleaves and doubles them to 16 UV pairs in ymm0,
// and loads 16 Y duplicated into ymm4 for vpmulhuw scaling.
#define READYUV422_AVX2 \
  __asm {                                                                      \
    __asm vmovq      xmm0, qword ptr [esi] /* U */                             \
    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                       \
    __asm lea        esi,  [esi + 8]                                           \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                                 \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */                    \
    __asm vmovdqu    xmm4, [eax] /* Y */                                       \
    __asm vpermq     ymm4, ymm4, 0xd8                                          \
    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
    __asm lea        eax, [eax + 16]}

// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
// Same as READYUV422_AVX2 plus 16 alpha bytes from ebp into ymm5.
#define READYUVA422_AVX2 \
  __asm {                                                                      \
    __asm vmovq      xmm0, qword ptr [esi] /* U */                             \
    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                       \
    __asm lea        esi,  [esi + 8]                                           \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                                 \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */                    \
    __asm vmovdqu    xmm4, [eax] /* Y */                                       \
    __asm vpermq     ymm4, ymm4, 0xd8                                          \
    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
    __asm lea        eax, [eax + 16]                                           \
    __asm vmovdqu    xmm5, [ebp] /* A */                                       \
    __asm vpermq     ymm5, ymm5, 0xd8                                          \
    __asm lea        ebp, [ebp + 16]}

// Read 8 UV from NV12, upsample to 16 UV.
// esi points at interleaved UV; the pairs are widened in place.
#define READNV12_AVX2 \
  __asm {                                                                      \
    __asm vmovdqu    xmm0, [esi] /* UV */                                      \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */                    \
    __asm vmovdqu    xmm4, [eax] /* Y */                                       \
    __asm vpermq     ymm4, ymm4, 0xd8                                          \
    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
    __asm lea        eax, [eax + 16]}

// Read 8 UV from NV21, upsample to 16 UV.
// Same as READNV12_AVX2 but the source is VU order; kShuffleNV21 swaps the
// bytes into UV order while upsampling.
#define READNV21_AVX2 \
  __asm {                                                                      \
    __asm vmovdqu    xmm0, [esi] /* UV */                                      \
    __asm lea        esi,  [esi + 16]                                          \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
    __asm vmovdqu    xmm4, [eax] /* Y */                                       \
    __asm vpermq     ymm4, ymm4, 0xd8                                          \
    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
    __asm lea        eax, [eax + 16]}

// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
// Packed YUY2 is loaded once and split by two shuffles.
#define READYUY2_AVX2 \
  __asm {                                                                      \
    __asm vmovdqu    ymm4, [eax] /* YUY2 */                                    \
    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
    __asm vmovdqu    ymm0, [eax] /* UV */                                      \
    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
    __asm lea        eax, [eax + 32]}

// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
// Same scheme as READYUY2_AVX2 with UYVY byte order.
#define READUYVY_AVX2 \
  __asm {                                                                      \
    __asm vmovdqu    ymm4, [eax] /* UYVY */                                    \
    __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
    __asm vmovdqu    ymm0, [eax] /* UV */                                      \
    __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
    __asm lea        eax, [eax + 32]}

// Convert 16 pixels: 16 UV and 16 Y.
// Convert 16 pixels of YUV (ymm0 = UV pairs, ymm4 = duplicated Y) to
// 16 B/G/R bytes in ymm0/ymm1/ymm2 using the coefficient tables at the
// register named by YuvConstants.  ymm3 is clobbered as a scratch register.
#define YUVTORGB_AVX2(YuvConstants) \
  __asm {                                                                      \
    __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
    __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
    __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASR]               \
    __asm vpsubw     ymm2, ymm3, ymm2                                          \
    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
    __asm vpsubw     ymm1, ymm3, ymm1                                          \
    __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
    __asm vpsubw     ymm0, ymm3, ymm0                                          \
    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
    __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
    __asm vpaddsw    ymm0, ymm0, ymm4 /* B += Y */                             \
    __asm vpaddsw    ymm1, ymm1, ymm4 /* G += Y */                             \
    __asm vpaddsw    ymm2, ymm2, ymm4 /* R += Y */                             \
    __asm vpsraw     ymm0, ymm0, 6                                             \
    __asm vpsraw     ymm1, ymm1, 6                                             \
    __asm vpsraw     ymm2, ymm2, 6                                             \
    __asm vpackuswb  ymm0, ymm0, ymm0 /* B */                                  \
    __asm vpackuswb  ymm1, ymm1, ymm1 /* G */                                  \
    __asm vpackuswb  ymm2, ymm2, ymm2 /* R */                                  \
  }

// Store 16 ARGB values.
// Interleaves ymm0/ymm1/ymm2 (B/G/R) with ymm5 (alpha) and writes 64 bytes
// of BGRA-memory-order pixels to edx.
#define STOREARGB_AVX2 \
  __asm {                                                                      \
    __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */                                 \
    __asm vpermq     ymm0, ymm0, 0xd8                                          \
    __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */                                 \
    __asm vpermq     ymm2, ymm2, 0xd8                                          \
    __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */                \
    __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */                 \
    __asm vmovdqu    0[edx], ymm1                                              \
    __asm vmovdqu    32[edx], ymm0                                             \
    __asm lea        edx,  [edx + 64]}

// Store 16 RGBA values.
// Interleaves ymm0/ymm1/ymm2 (B/G/R) with ymm5 (alpha) and writes 64 bytes
// of ABGR-memory-order (RGBA) pixels to edx.
#define STORERGBA_AVX2 \
  __asm {                                                                      \
    __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */                                 \
    __asm vpermq     ymm1, ymm1, 0xd8                                          \
    __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */                                 \
    __asm vpermq     ymm2, ymm2, 0xd8                                          \
    __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */                \
    __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */                 \
    __asm vmovdqu    [edx], ymm0                                               \
    __asm vmovdqu    [edx + 32], ymm1                                          \
    __asm lea        edx, [edx + 64]}

#ifdef HAS_I422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void I422ToARGBRow_AVX2(
    const uint8* y_buf,
    const uint8* u_buf,
    const uint8* v_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // edi = v_buf - u_buf, used by READYUV422
    vpcmpeqb   ymm5, ymm5, ymm5      // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TOARGBROW_AVX2

#ifdef HAS_I422ALPHATOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// Alpha comes from a_buf (via ebp in READYUVA422_AVX2) rather than 0xff.
__declspec(naked) void I422AlphaToARGBRow_AVX2(
    const uint8* y_buf,
    const uint8* u_buf,
    const uint8* v_buf,
    const uint8* a_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        eax, [esp + 16 + 4]   // Y
    mov        esi, [esp + 16 + 8]   // U
    mov        edi, [esp + 16 + 12]  // V
    mov        ebp, [esp + 16 + 16]  // A
    mov        edx, [esp + 16 + 20]  // argb
    mov        ebx, [esp + 16 + 24]  // yuvconstants
    mov        ecx, [esp + 16 + 28]  // width
    sub        edi, esi              // edi = v_buf - u_buf

 convertloop:
    READYUVA422_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2

#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void I444ToARGBRow_AVX2(
    const uint8* y_buf,
    const uint8* u_buf,
    const uint8* v_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push       esi
    push       edi
    push       ebx
    mov        eax, [esp + 12 + 4]   // Y
    mov        esi, [esp + 12 + 8]   // U
    mov        edi, [esp + 12 + 12]  // V
    mov        edx, [esp + 12 + 16]  // argb
    mov        ebx, [esp + 12 + 20]  // yuvconstants
    mov        ecx, [esp + 12 + 24]  // width
    sub        edi, esi              // edi = v_buf - u_buf
    vpcmpeqb   ymm5, ymm5, ymm5      // generate 0xffffffffffffffff for alpha
 convertloop:
    READYUV444_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub        ecx, 16
    jg         convertloop

    pop        ebx
    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I444TOARGBROW_AVX2

#ifdef HAS_NV12TOARGBROW_AVX2
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void NV12ToARGBRow_AVX2(
    const uint8* y_buf,
    const uint8* uv_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push ebx
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // UV
    mov edx, [esp + 8 + 12]  // argb
    mov ebx, [esp + 8 + 16]  // yuvconstants
    mov ecx, [esp + 8 + 20]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV12_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV12TOARGBROW_AVX2

#ifdef HAS_NV21TOARGBROW_AVX2
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) void NV21ToARGBRow_AVX2(
    const uint8* y_buf,
    const uint8* vu_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push ebx
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // VU
    mov edx, [esp + 8 + 12]  // argb
    mov ebx, [esp + 8 + 16]  // yuvconstants
    mov ecx, [esp + 8 + 20]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READNV21_AVX2  // swaps VU to UV while reading
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_NV21TOARGBROW_AVX2

#ifdef HAS_YUY2TOARGBROW_AVX2
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked) void YUY2ToARGBRow_AVX2(
    const uint8* src_yuy2,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push ebx
    mov eax, [esp + 4 + 4]  // yuy2
    mov edx, [esp + 4 + 8]  // argb
    mov ebx, [esp + 4 + 12]  // yuvconstants
    mov ecx, [esp + 4 + 16]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUY2_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    vzeroupper
    ret
  }
}
#endif  // HAS_YUY2TOARGBROW_AVX2

#ifdef HAS_UYVYTOARGBROW_AVX2
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
__declspec(naked) void UYVYToARGBRow_AVX2(
    const uint8* src_uyvy,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push ebx
    mov eax, [esp + 4 + 4]  // uyvy
    mov edx, [esp + 4 + 8]  // argb
    mov ebx, [esp + 4 + 12]  // yuvconstants
    mov ecx, [esp + 4 + 16]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READUYVY_AVX2
    YUVTORGB_AVX2(ebx)
    STOREARGB_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    vzeroupper
    ret
  }
}
#endif  // HAS_UYVYTOARGBROW_AVX2

#ifdef HAS_I422TORGBAROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
__declspec(naked) void I422ToRGBARow_AVX2(
    const uint8* y_buf,
    const uint8* u_buf,
    const uint8* v_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // abgr
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // edi = v_buf - u_buf, so V is read as [esi + edi]
    vpcmpeqb ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha

 convertloop:
    READYUV422_AVX2
    YUVTORGB_AVX2(ebx)
    STORERGBA_AVX2

    sub ecx, 16
    jg convertloop

    pop ebx
    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_I422TORGBAROW_AVX2

#if defined(HAS_I422TOARGBROW_SSSE3)
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Allows a conversion with half size scaling.

// Read 8 UV from 444.
// eax = Y, esi = U, edi = V - U offset.  Leaves UV in xmm0, Y doubled in xmm4.
#define READYUV444 \
  __asm { \
    __asm movq xmm0, qword ptr [esi] /* U */ \
    __asm movq xmm1, qword ptr [esi + edi] /* V */ \
    __asm lea esi, [esi + 8] \
    __asm punpcklbw xmm0, xmm1 /* UV */ \
    __asm movq xmm4, qword ptr [eax] \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8]}

// Read 4 UV from 422, upsample to 8 UV.
#define READYUV422 \
  __asm { \
    __asm movd xmm0, [esi] /* U */ \
    __asm movd xmm1, [esi + edi] /* V */ \
    __asm lea esi, [esi + 4] \
    __asm punpcklbw xmm0, xmm1 /* UV */ \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
    __asm movq xmm4, qword ptr [eax] \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8]}

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
// Alpha (8 bytes from ebp) is left in xmm5 for STOREARGB.
#define READYUVA422 \
  __asm { \
    __asm movd xmm0, [esi] /* U */ \
    __asm movd xmm1, [esi + edi] /* V */ \
    __asm lea esi, [esi + 4] \
    __asm punpcklbw xmm0, xmm1 /* UV */ \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
    __asm movq xmm4, qword ptr [eax] /* Y */ \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8] \
    __asm movq xmm5, qword ptr [ebp] /* A */ \
    __asm lea ebp, [ebp + 8]}

// Read 4 UV from NV12, upsample to 8 UV.
#define READNV12 \
  __asm { \
    __asm movq xmm0, qword ptr [esi] /* UV */ \
    __asm lea esi, [esi + 8] \
    __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
    __asm movq xmm4, qword ptr [eax] \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8]}

// Read 4 VU from NV21, upsample to 8 UV.
// kShuffleNV21 swaps V/U byte order while upsampling.
#define READNV21 \
  __asm { \
    __asm movq xmm0, qword ptr [esi] /* UV */ \
    __asm lea esi, [esi + 8] \
    __asm pshufb xmm0, xmmword ptr kShuffleNV21 \
    __asm movq xmm4, qword ptr [eax] \
    __asm punpcklbw xmm4, xmm4 \
    __asm lea eax, [eax + 8]}

// Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
#define READYUY2 \
  __asm { \
    __asm movdqu xmm4, [eax] /* YUY2 */ \
    __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \
    __asm movdqu xmm0, [eax] /* UV */ \
    __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \
    __asm lea eax, [eax + 16]}

// Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
#define READUYVY \
  __asm { \
    __asm movdqu xmm4, [eax] /* UYVY */ \
    __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \
    __asm movdqu xmm0, [eax] /* UV */ \
    __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \
    __asm lea eax, [eax + 16]}

// Convert 8 pixels: 8 UV and 8 Y.
// Input: xmm0 = interleaved UV, xmm4 = Y doubled to 16 bits.
// Output: xmm0/xmm1/xmm2 = packed unsigned B/G/R bytes (low 8 lanes).
// Fixed point: bias-subtract then add Y contribution, >> 6 to descale.
#define YUVTORGB(YuvConstants) \
  __asm { \
    __asm movdqa xmm1, xmm0 \
    __asm movdqa xmm2, xmm0 \
    __asm movdqa xmm3, xmm0 \
    __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \
    __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \
    __asm psubw xmm0, xmm1 \
    __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \
    __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \
    __asm psubw xmm1, xmm2 \
    __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \
    __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \
    __asm psubw xmm2, xmm3 \
    __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \
    __asm paddsw xmm0, xmm4 /* B += Y */ \
    __asm paddsw xmm1, xmm4 /* G += Y */ \
    __asm paddsw xmm2, xmm4 /* R += Y */ \
    __asm psraw xmm0, 6 \
    __asm psraw xmm1, 6 \
    __asm psraw xmm2, 6 \
    __asm packuswb xmm0, xmm0 /* B */ \
    __asm packuswb xmm1, xmm1 /* G */ \
    __asm packuswb xmm2, xmm2 /* R */ \
  }

// Store 8 ARGB values.  Expects alpha in xmm5.
#define STOREARGB \
  __asm { \
    __asm punpcklbw xmm0, xmm1 /* BG */ \
    __asm punpcklbw xmm2, xmm5 /* RA */ \
    __asm movdqa xmm1, xmm0 \
    __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \
    __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \
    __asm movdqu 0[edx], xmm0 \
    __asm movdqu 16[edx], xmm1 \
    __asm lea edx, [edx + 32]}

// Store 8 BGRA values.  Generates its own 0xff alpha (clobbers xmm5).
#define STOREBGRA \
  __asm { \
    __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
    __asm punpcklbw xmm1, xmm0 /* GB */ \
    __asm punpcklbw xmm5, xmm2 /* AR */ \
    __asm movdqa xmm0, xmm5 \
    __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \
    __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \
    __asm movdqu 0[edx], xmm5 \
    __asm movdqu 16[edx], xmm0 \
    __asm lea edx, [edx + 32]}

// Store 8 RGBA values.
// Generates its own 0xff alpha (clobbers xmm5).
#define STORERGBA \
  __asm { \
    __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \
    __asm punpcklbw xmm1, xmm2 /* GR */ \
    __asm punpcklbw xmm5, xmm0 /* AB */ \
    __asm movdqa xmm0, xmm5 \
    __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \
    __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \
    __asm movdqu 0[edx], xmm5 \
    __asm movdqu 16[edx], xmm0 \
    __asm lea edx, [edx + 32]}

// Store 8 RGB24 values.  Expects pack shuffles in xmm5/xmm6.
#define STORERGB24 \
  __asm {/* Weave into RRGB */ \
    __asm punpcklbw xmm0, xmm1 /* BG */ \
    __asm punpcklbw xmm2, xmm2 /* RR */ \
    __asm movdqa xmm1, xmm0 \
    __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
    __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \
    __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
    __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \
    __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
    __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \
    __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \
    __asm lea edx, [edx + 24]}

// Store 8 RGB565 values.
// Expects channel masks in xmm5 (0x0000001f B), xmm6 (0x000007e0 G),
// xmm7 (0xfffff800 R); packs two groups of 4 ARGB into 8 RGB565 pixels.
#define STORERGB565 \
  __asm {/* Weave into RRGB */ \
    __asm punpcklbw xmm0, xmm1 /* BG */ \
    __asm punpcklbw xmm2, xmm2 /* RR */ \
    __asm movdqa xmm1, xmm0 \
    __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \
    __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \
    __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \
    __asm movdqa xmm2, xmm0 /* G */ \
    __asm pslld xmm0, 8 /* R */ \
    __asm psrld xmm3, 3 /* B */ \
    __asm psrld xmm2, 5 /* G */ \
    __asm psrad xmm0, 16 /* R */ \
    __asm pand xmm3, xmm5 /* B */ \
    __asm pand xmm2, xmm6 /* G */ \
    __asm pand xmm0, xmm7 /* R */ \
    __asm por xmm3, xmm2 /* BG */ \
    __asm por xmm0, xmm3 /* BGR */ \
    __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \
    __asm movdqa xmm2, xmm1 /* G */ \
    __asm pslld xmm1, 8 /* R */ \
    __asm psrld xmm3, 3 /* B */ \
    __asm psrld xmm2, 5 /* G */ \
    __asm psrad xmm1, 16 /* R */ \
    __asm pand xmm3, xmm5 /* B */ \
    __asm pand xmm2, xmm6 /* G */ \
    __asm pand xmm1, xmm7 /* R */ \
    __asm por xmm3, xmm2 /* BG */ \
    __asm por xmm1, xmm3 /* BGR */ \
    __asm packssdw xmm0, xmm1 \
    __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \
    __asm lea edx, [edx + 16]}

// 8 pixels.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void I444ToARGBRow_SSSE3(
    const uint8* y_buf,
    const uint8* u_buf,
    const uint8* v_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // edi = v_buf - u_buf, so V is read as [esi + edi]
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READYUV444
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    pop edi
    pop esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
__declspec(naked) void I422ToRGB24Row_SSSE3(
    const uint8* y_buf,
    const uint8* u_buf,
    const uint8* v_buf,
    uint8* dst_rgb24,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // rgb24
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // edi = v_buf - u_buf, so V is read as [esi + edi]
    movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0  // for STORERGB24
    movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGB24

    sub ecx, 8
    jg convertloop

    pop ebx
    pop edi
    pop esi
    ret
  }
}

// 8 pixels
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
__declspec(naked) void I422ToRGB565Row_SSSE3(
    const uint8* y_buf,
    const uint8* u_buf,
    const uint8* v_buf,
    uint8* rgb565_buf,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // rgb565
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // edi = v_buf - u_buf, so V is read as [esi + edi]
    pcmpeqb xmm5, xmm5  // generate mask 0x0000001f
    psrld xmm5, 27
    pcmpeqb xmm6, xmm6  // generate mask 0x000007e0
    psrld xmm6, 26
    pslld xmm6, 5
    pcmpeqb xmm7, xmm7  // generate mask 0xfffff800
    pslld xmm7, 11

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGB565

    sub ecx, 8
    jg convertloop

    pop ebx
    pop edi
    pop esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void I422ToARGBRow_SSSE3(
    const uint8* y_buf,
    const uint8* u_buf,
    const uint8* v_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // argb
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // edi = v_buf - u_buf, so V is read as [esi + edi]
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    pop edi
    pop esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
__declspec(naked) void I422AlphaToARGBRow_SSSE3(
    const uint8* y_buf,
    const uint8* u_buf,
    const uint8* v_buf,
    const uint8* a_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    push ebp
    mov eax, [esp + 16 + 4]  // Y
    mov esi, [esp + 16 + 8]  // U
    mov edi, [esp + 16 + 12]  // V
    mov ebp, [esp + 16 + 16]  // A
    mov edx, [esp + 16 + 20]  // argb
    mov ebx, [esp + 16 + 24]  // yuvconstants
    mov ecx, [esp + 16 + 28]  // width
    sub edi, esi  // edi = v_buf - u_buf, so V is read as [esi + edi]

 convertloop:
    READYUVA422  // loads alpha into xmm5; no constant alpha needed
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebp
    pop ebx
    pop edi
    pop esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void NV12ToARGBRow_SSSE3(
    const uint8* y_buf,
    const uint8* uv_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push ebx
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // UV
    mov edx, [esp + 8 + 12]  // argb
    mov ebx, [esp + 8 + 16]  // yuvconstants
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READNV12
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    pop esi
    ret
  }
}

// 8 pixels.
// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
__declspec(naked) void NV21ToARGBRow_SSSE3(
    const uint8* y_buf,
    const uint8* vu_buf,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push ebx
    mov eax, [esp + 8 + 4]  // Y
    mov esi, [esp + 8 + 8]  // VU
    mov edx, [esp + 8 + 12]  // argb
    mov ebx, [esp + 8 + 16]  // yuvconstants
    mov ecx, [esp + 8 + 20]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READNV21  // swaps VU to UV while reading
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    pop esi
    ret
  }
}

// 8 pixels.
// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked) void YUY2ToARGBRow_SSSE3(
    const uint8* src_yuy2,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push ebx
    mov eax, [esp + 4 + 4]  // yuy2
    mov edx, [esp + 4 + 8]  // argb
    mov ebx, [esp + 4 + 12]  // yuvconstants
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READYUY2
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    ret
  }
}

// 8 pixels.
// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
__declspec(naked) void UYVYToARGBRow_SSSE3(
    const uint8* src_uyvy,
    uint8* dst_argb,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push ebx
    mov eax, [esp + 4 + 4]  // uyvy
    mov edx, [esp + 4 + 8]  // argb
    mov ebx, [esp + 4 + 12]  // yuvconstants
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate 0xffffffff for alpha

 convertloop:
    READUYVY
    YUVTORGB(ebx)
    STOREARGB

    sub ecx, 8
    jg convertloop

    pop ebx
    ret
  }
}

__declspec(naked) void I422ToRGBARow_SSSE3(
    const uint8* y_buf,
    const uint8* u_buf,
    const uint8* v_buf,
    uint8* dst_rgba,
    const struct YuvConstants* yuvconstants,
    int width) {
  __asm {
    push esi
    push edi
    push ebx
    mov eax, [esp + 12 + 4]  // Y
    mov esi, [esp + 12 + 8]  // U
    mov edi, [esp + 12 + 12]  // V
    mov edx, [esp + 12 + 16]  // rgba
    mov ebx, [esp + 12 + 20]  // yuvconstants
    mov ecx, [esp + 12 + 24]  // width
    sub edi, esi  // edi = v_buf - u_buf, so V is read as [esi + edi]

 convertloop:
    READYUV422
    YUVTORGB(ebx)
    STORERGBA  // generates its own alpha

    sub ecx, 8
    jg convertloop

    pop ebx
    pop edi
    pop esi
    ret
  }
}
#endif  // HAS_I422TOARGBROW_SSSE3

#ifdef HAS_I400TOARGBROW_SSE2
// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
__declspec(naked) void I400ToARGBRow_SSE2(const uint8* y_buf,
                                          uint8* rgb_buf,
                                          int width) {
  __asm {
    mov eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
    movd xmm2, eax
    pshufd xmm2, xmm2, 0
    mov eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
    movd xmm3, eax
    pshufd xmm3, xmm3, 0
    pcmpeqb xmm4, xmm4  // generate mask 0xff000000
    pslld xmm4, 24

    mov eax, [esp + 4]  // Y
    mov edx, [esp + 8]  // rgb
    mov ecx, [esp + 12]  // width

 convertloop:
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0  // Y.Y
    pmulhuw xmm0, xmm2
    psubusw xmm0, xmm3
    psrlw xmm0, 6
    packuswb xmm0, xmm0  // G

    // Step 2: Weave into ARGB
    punpcklbw xmm0, xmm0  // GG
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0  // BGRA first 4 pixels
    punpckhwd xmm1, xmm1  // BGRA next 4 pixels
    por xmm0, xmm4  // set alpha to 0xff
    por xmm1, xmm4
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
#endif  // HAS_I400TOARGBROW_SSE2

#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
__declspec(naked) void I400ToARGBRow_AVX2(const uint8* y_buf,
                                          uint8* rgb_buf,
                                          int width) {
  __asm {
    mov eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
    vmovd xmm2, eax
    vbroadcastss ymm2, xmm2
    mov eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
    vmovd xmm3, eax
    vbroadcastss ymm3, xmm3
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0xff000000
    vpslld ymm4, ymm4, 24

    mov eax, [esp + 4]  // Y
    mov edx, [esp + 8]  // rgb
    mov ecx, [esp + 12]  // width

 convertloop:
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    vmovdqu xmm0, [eax]
    lea eax, [eax + 16]
    vpermq ymm0, ymm0, 0xd8  // vpunpcklbw mutates
    vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
    vpmulhuw ymm0, ymm0, ymm2
    vpsubusw ymm0, ymm0, ymm3
    vpsrlw ymm0, ymm0, 6
    vpackuswb ymm0, ymm0, ymm0  // G. still mutated: 3120

    // TODO(fbarchard): Weave alpha with unpack.
    // Step 2: Weave into ARGB
    vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
    vpermq ymm1, ymm1, 0xd8
    vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
    vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
    vpor ymm0, ymm0, ymm4  // set alpha to 0xff
    vpor ymm1, ymm1, ymm4
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_I400TOARGBROW_AVX2

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// TODO(fbarchard): Replace lea with -16 offset.
__declspec(naked) void MirrorRow_SSSE3(const uint8* src,
                                       uint8* dst,
                                       int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    movdqa xmm5, xmmword ptr kShuffleMirror

 convertloop:
    movdqu xmm0, [eax - 16 + ecx]  // read from the tail of src
    pshufb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
#endif  // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_AVX2
__declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    vbroadcastf128 ymm5, xmmword ptr kShuffleMirror

 convertloop:
    vmovdqu ymm0, [eax - 32 + ecx]  // read from the tail of src
    vpshufb ymm0, ymm0, ymm5
    vpermq ymm0, ymm0, 0x4e  // swap high and low halves
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_MIRRORROW_AVX2

#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};

// Mirrors interleaved UV and deinterleaves into separate U and V rows.
__declspec(naked) void MirrorUVRow_SSSE3(const uint8* src,
                                         uint8* dst_u,
                                         uint8* dst_v,
                                         int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    movdqa xmm1, xmmword ptr kShuffleMirrorUV
    lea eax, [eax + ecx * 2 - 16]  // start at last 8 UV pairs
    sub edi, edx  // edi = dst_v - dst_u, so V is written at [edx + edi]

 convertloop:
    movdqu xmm0, [eax]
    lea eax, [eax - 16]
    pshufb xmm0, xmm1  // reversed U in low 8 bytes, reversed V in high 8
    movlpd qword ptr [edx], xmm0
    movhpd qword ptr [edx + edi], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_MIRRORUVROW_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSE2
__declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src,
                                          uint8* dst,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    lea eax, [eax - 16 + ecx * 4]  // last 4 pixels.

 convertloop:
    movdqu xmm0, [eax]
    lea eax, [eax - 16]
    pshufd xmm0, xmm0, 0x1b  // reverse the 4 dwords (pixels)
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    jg convertloop
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_SSE2

#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};

__declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src,
                                          uint8* dst,
                                          int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // width
    vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2

 convertloop:
    vpermd ymm0, ymm5, [eax - 32 + ecx * 4]  // permute dword order
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBMIRRORROW_AVX2

#ifdef HAS_SPLITUVROW_SSE2
// Deinterleaves 16 UV pairs into 16 U and 16 V bytes per iteration.
__declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv,
                                       uint8* dst_u,
                                       uint8* dst_v,
                                       int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_uv
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    pcmpeqb xmm5, xmm5  // generate mask 0x00ff00ff
    psrlw xmm5, 8
    sub edi, edx  // edi = dst_v - dst_u, so V is written at [edx + edi]

 convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    movdqa xmm3, xmm1
    pand xmm0, xmm5  // even bytes
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm2, 8  // odd bytes
    psrlw xmm3, 8
    packuswb xmm2, xmm3
    movdqu [edx], xmm0
    movdqu [edx + edi], xmm2
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

#endif  // HAS_SPLITUVROW_SSE2

#ifdef HAS_SPLITUVROW_AVX2
__declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv,
                                       uint8* dst_u,
                                       uint8* dst_v,
                                       int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_uv
    mov edx, [esp + 4 + 8]  // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // width
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
    vpsrlw ymm5, ymm5, 8
    sub edi, edx  // edi = dst_v - dst_u, so V is written at [edx + edi]

 convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vpsrlw ymm2, ymm0, 8  // odd bytes
    vpsrlw ymm3, ymm1, 8
    vpand ymm0, ymm0, ymm5  // even bytes
    vpand ymm1, ymm1, ymm5
    vpackuswb ymm0, ymm0, ymm1
    vpackuswb ymm2, ymm2, ymm3
    vpermq ymm0, ymm0, 0xd8  // unmutate vpackuswb lane interleave
    vpermq ymm2, ymm2, 0xd8
    vmovdqu [edx], ymm0
    vmovdqu [edx + edi], ymm2
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
#endif  // HAS_SPLITUVROW_AVX2

#ifdef HAS_MERGEUVROW_SSE2
// Interleaves 16 U and 16 V bytes into 16 UV pairs per iteration.
__declspec(naked) void MergeUVRow_SSE2(const uint8* src_u,
                                       const uint8* src_v,
                                       uint8* dst_uv,
                                       int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_u
    mov edx, [esp + 4 + 8]  // src_v
    mov edi, [esp + 4 + 12]  // dst_uv
    mov ecx, [esp + 4 + 16]  // width
    sub edx, eax  // edx = src_v - src_u, so V is read as [eax + edx]

 convertloop:
    movdqu xmm0, [eax]  // read 16 U's
    movdqu xmm1, [eax + edx]  // and 16 V's
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    punpcklbw xmm0, xmm1  // first 8 UV pairs
    punpckhbw xmm2, xmm1  // next 8 UV pairs
    movdqu [edi], xmm0
    movdqu [edi + 16], xmm2
    lea edi, [edi + 32]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}
#endif  // HAS_MERGEUVROW_SSE2

#ifdef HAS_MERGEUVROW_AVX2
__declspec(naked) void MergeUVRow_AVX2(const uint8* src_u,
                                       const uint8* src_v,
                                       uint8* dst_uv,
                                       int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]  // src_u
    mov edx, [esp + 4 + 8]  // src_v
    mov edi, [esp + 4 + 12]  // dst_uv
    mov ecx, [esp + 4 + 16]  // width
    sub edx, eax  // edx = src_v - src_u, so V is read as [eax + edx]

 convertloop:
    vmovdqu ymm0, [eax]  // read 32 U's
    vmovdqu ymm1, [eax + edx]  // and 32 V's
    lea eax, [eax + 32]
    vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
    vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
    vextractf128 [edi], ymm2, 0  // bytes 0..15
    vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
    vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
    vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
    lea edi, [edi + 64]
    sub ecx, 32
    jg convertloop

    pop edi
    vzeroupper
    ret
  }
}
#endif  // HAS_MERGEUVROW_AVX2

#ifdef HAS_COPYROW_SSE2
// CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time.
__declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // count
    test eax, 15  // use aligned loop only if both pointers are 16-aligned
    jne convertloopu
    test edx, 15
    jne convertloopu

 convertloopa:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloopa
    ret

 convertloopu:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloopu
    ret
  }
}
#endif  // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_AVX
// CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time.
__declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // count

 convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    lea eax, [eax + 64]
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 64
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_COPYROW_AVX

// Multiple of 1.
// rep movsb copy; esi/edi are preserved in eax/edx (no stack frame in a
// naked function).
__declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
  __asm {
    mov eax, esi  // save esi/edi in scratch registers
    mov edx, edi
    mov esi, [esp + 4]  // src
    mov edi, [esp + 8]  // dst
    mov ecx, [esp + 12]  // count
    rep movsb
    mov edi, edx  // restore
    mov esi, eax
    ret
  }
}

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
// Copies only the alpha channel of src over dst, preserving dst BGR.
__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src,
                                             uint8* dst,
                                             int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // count
    pcmpeqb xmm0, xmm0  // generate mask 0xff000000
    pslld xmm0, 24
    pcmpeqb xmm1, xmm1  // generate mask 0x00ffffff
    psrld xmm1, 8

 convertloop:
    movdqu xmm2, [eax]
    movdqu xmm3, [eax + 16]
    lea eax, [eax + 32]
    movdqu xmm4, [edx]
    movdqu xmm5, [edx + 16]
    pand xmm2, xmm0  // keep src alpha
    pand xmm3, xmm0
    pand xmm4, xmm1  // keep dst BGR
    pand xmm5, xmm1
    por xmm2, xmm4
    por xmm3, xmm5
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    ret
  }
}
#endif  // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src,
                                             uint8* dst,
                                             int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // count
    vpcmpeqb ymm0, ymm0, ymm0
    vpsrld ymm0, ymm0, 8  // generate mask 0x00ffffff

 convertloop:
    vmovdqu ymm1, [eax]
    vmovdqu ymm2, [eax + 32]
    lea eax, [eax + 64]
    vpblendvb ymm1, ymm1, [edx], ymm0  // take dst BGR, src alpha
    vpblendvb ymm2, ymm2, [edx + 32], ymm0
    vmovdqu [edx], ymm1
    vmovdqu [edx + 32], ymm2
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
// width in pixels
__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8* src_argb,
                                                uint8* dst_a,
                                                int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_a
    mov ecx, [esp + 12]  // width

 extractloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrld xmm0, 24  // isolate alpha in low byte of each dword
    psrld xmm1, 24
    packssdw xmm0, xmm1
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    sub ecx, 8
    jg extractloop

    ret
  }
}
#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2

#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
// width in pixels
__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8* src_argb,
                                                uint8* dst_a,
                                                int width) {
  __asm {
    mov eax, [esp + 4]  // src_argb
    mov edx, [esp + 8]  // dst_a
    mov ecx, [esp + 12]  // width
    vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX

 extractloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vpsrld ymm0, ymm0, 24
    vpsrld ymm1, ymm1, 24
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    lea eax, [eax + 128]
    vpackssdw ymm0, ymm0, ymm1  // mutates
    vpsrld ymm2, ymm2, 24
    vpsrld ymm3, ymm3, 24
    vpackssdw ymm2, ymm2, ymm3  // mutates
    vpackuswb ymm0, ymm0, ymm2  // mutates
    vpermd ymm0, ymm4, ymm0  // unmutate
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg extractloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
// Copies Y bytes from src into the alpha channel of dst ARGB.
__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src,
                                                uint8* dst,
                                                int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // count
    pcmpeqb xmm0, xmm0  // generate mask 0xff000000
    pslld xmm0, 24
    pcmpeqb xmm1, xmm1  // generate mask 0x00ffffff
    psrld xmm1, 8

 convertloop:
    movq xmm2, qword ptr [eax]  // 8 Y's
    lea eax, [eax + 8]
    punpcklbw xmm2, xmm2
    punpckhwd xmm3, xmm2
    punpcklwd xmm2, xmm2
    movdqu xmm4, [edx]
    movdqu xmm5, [edx + 16]
    pand xmm2, xmm0  // keep Y in alpha position
    pand xmm3, xmm0
    pand xmm4, xmm1  // keep dst BGR
    pand xmm5, xmm1
    por xmm2, xmm4
    por xmm3, xmm5
    movdqu [edx], xmm2
    movdqu [edx + 16], xmm3
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    ret
  }
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src,
                                                uint8* dst,
                                                int width) {
  __asm {
    mov eax, [esp + 4]  // src
    mov edx, [esp + 8]  // dst
    mov ecx, [esp + 12]  // count
    vpcmpeqb ymm0, ymm0, ymm0
    vpsrld ymm0, ymm0, 8  // generate mask 0x00ffffff

 convertloop:
    vpmovzxbd ymm1, qword ptr [eax]  // zero-extend 8 Y bytes to dwords
    vpmovzxbd ymm2, qword ptr [eax + 8]
    lea eax, [eax + 16]
    vpslld ymm1, ymm1, 24  // move Y into alpha byte
    vpslld ymm2, ymm2, 24
    vpblendvb ymm1, ymm1, [edx], ymm0  // take dst BGR, Y alpha
    vpblendvb ymm2, ymm2, [edx + 32], ymm0
    vmovdqu [edx], ymm1
    vmovdqu [edx + 32], ymm2
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
// Write 'count' bytes using an 8 bit value repeated.
// Count should be multiple of 4.
__declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) {
  __asm {
    movzx eax, byte ptr [esp + 8]  // v8
    mov edx, 0x01010101  // Duplicate byte to all bytes.
    mul edx  // overwrites edx with upper part of result.
    mov edx, edi  // save edi (no stack frame in a naked function)
    mov edi, [esp + 4]  // dst
    mov ecx, [esp + 12]  // count
    shr ecx, 2  // stosd writes 4 bytes at a time
    rep stosd
    mov edi, edx  // restore
    ret
  }
}

// Write 'count' bytes using an 8 bit value repeated.
3572 __declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) { 3573 __asm { 3574 mov edx, edi 3575 mov edi, [esp + 4] // dst 3576 mov eax, [esp + 8] // v8 3577 mov ecx, [esp + 12] // count 3578 rep stosb 3579 mov edi, edx 3580 ret 3581 } 3582 } 3583 3584 // Write 'count' 32 bit values. 3585 __declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { 3586 __asm { 3587 mov edx, edi 3588 mov edi, [esp + 4] // dst 3589 mov eax, [esp + 8] // v32 3590 mov ecx, [esp + 12] // count 3591 rep stosd 3592 mov edi, edx 3593 ret 3594 } 3595 } 3596 #endif // HAS_SETROW_X86 3597 3598 #ifdef HAS_YUY2TOYROW_AVX2 3599 __declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2, 3600 uint8* dst_y, 3601 int width) { 3602 __asm { 3603 mov eax, [esp + 4] // src_yuy2 3604 mov edx, [esp + 8] // dst_y 3605 mov ecx, [esp + 12] // width 3606 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3607 vpsrlw ymm5, ymm5, 8 3608 3609 convertloop: 3610 vmovdqu ymm0, [eax] 3611 vmovdqu ymm1, [eax + 32] 3612 lea eax, [eax + 64] 3613 vpand ymm0, ymm0, ymm5 // even bytes are Y 3614 vpand ymm1, ymm1, ymm5 3615 vpackuswb ymm0, ymm0, ymm1 // mutates. 
3616 vpermq ymm0, ymm0, 0xd8 3617 vmovdqu [edx], ymm0 3618 lea edx, [edx + 32] 3619 sub ecx, 32 3620 jg convertloop 3621 vzeroupper 3622 ret 3623 } 3624 } 3625 3626 __declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2, 3627 int stride_yuy2, 3628 uint8* dst_u, 3629 uint8* dst_v, 3630 int width) { 3631 __asm { 3632 push esi 3633 push edi 3634 mov eax, [esp + 8 + 4] // src_yuy2 3635 mov esi, [esp + 8 + 8] // stride_yuy2 3636 mov edx, [esp + 8 + 12] // dst_u 3637 mov edi, [esp + 8 + 16] // dst_v 3638 mov ecx, [esp + 8 + 20] // width 3639 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3640 vpsrlw ymm5, ymm5, 8 3641 sub edi, edx 3642 3643 convertloop: 3644 vmovdqu ymm0, [eax] 3645 vmovdqu ymm1, [eax + 32] 3646 vpavgb ymm0, ymm0, [eax + esi] 3647 vpavgb ymm1, ymm1, [eax + esi + 32] 3648 lea eax, [eax + 64] 3649 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV 3650 vpsrlw ymm1, ymm1, 8 3651 vpackuswb ymm0, ymm0, ymm1 // mutates. 3652 vpermq ymm0, ymm0, 0xd8 3653 vpand ymm1, ymm0, ymm5 // U 3654 vpsrlw ymm0, ymm0, 8 // V 3655 vpackuswb ymm1, ymm1, ymm1 // mutates. 3656 vpackuswb ymm0, ymm0, ymm0 // mutates. 
3657 vpermq ymm1, ymm1, 0xd8 3658 vpermq ymm0, ymm0, 0xd8 3659 vextractf128 [edx], ymm1, 0 // U 3660 vextractf128 [edx + edi], ymm0, 0 // V 3661 lea edx, [edx + 16] 3662 sub ecx, 32 3663 jg convertloop 3664 3665 pop edi 3666 pop esi 3667 vzeroupper 3668 ret 3669 } 3670 } 3671 3672 __declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, 3673 uint8* dst_u, 3674 uint8* dst_v, 3675 int width) { 3676 __asm { 3677 push edi 3678 mov eax, [esp + 4 + 4] // src_yuy2 3679 mov edx, [esp + 4 + 8] // dst_u 3680 mov edi, [esp + 4 + 12] // dst_v 3681 mov ecx, [esp + 4 + 16] // width 3682 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3683 vpsrlw ymm5, ymm5, 8 3684 sub edi, edx 3685 3686 convertloop: 3687 vmovdqu ymm0, [eax] 3688 vmovdqu ymm1, [eax + 32] 3689 lea eax, [eax + 64] 3690 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV 3691 vpsrlw ymm1, ymm1, 8 3692 vpackuswb ymm0, ymm0, ymm1 // mutates. 3693 vpermq ymm0, ymm0, 0xd8 3694 vpand ymm1, ymm0, ymm5 // U 3695 vpsrlw ymm0, ymm0, 8 // V 3696 vpackuswb ymm1, ymm1, ymm1 // mutates. 3697 vpackuswb ymm0, ymm0, ymm0 // mutates. 3698 vpermq ymm1, ymm1, 0xd8 3699 vpermq ymm0, ymm0, 0xd8 3700 vextractf128 [edx], ymm1, 0 // U 3701 vextractf128 [edx + edi], ymm0, 0 // V 3702 lea edx, [edx + 16] 3703 sub ecx, 32 3704 jg convertloop 3705 3706 pop edi 3707 vzeroupper 3708 ret 3709 } 3710 } 3711 3712 __declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy, 3713 uint8* dst_y, 3714 int width) { 3715 __asm { 3716 mov eax, [esp + 4] // src_uyvy 3717 mov edx, [esp + 8] // dst_y 3718 mov ecx, [esp + 12] // width 3719 3720 convertloop: 3721 vmovdqu ymm0, [eax] 3722 vmovdqu ymm1, [eax + 32] 3723 lea eax, [eax + 64] 3724 vpsrlw ymm0, ymm0, 8 // odd bytes are Y 3725 vpsrlw ymm1, ymm1, 8 3726 vpackuswb ymm0, ymm0, ymm1 // mutates. 
3727 vpermq ymm0, ymm0, 0xd8 3728 vmovdqu [edx], ymm0 3729 lea edx, [edx + 32] 3730 sub ecx, 32 3731 jg convertloop 3732 vzeroupper 3733 ret 3734 } 3735 } 3736 3737 __declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy, 3738 int stride_uyvy, 3739 uint8* dst_u, 3740 uint8* dst_v, 3741 int width) { 3742 __asm { 3743 push esi 3744 push edi 3745 mov eax, [esp + 8 + 4] // src_yuy2 3746 mov esi, [esp + 8 + 8] // stride_yuy2 3747 mov edx, [esp + 8 + 12] // dst_u 3748 mov edi, [esp + 8 + 16] // dst_v 3749 mov ecx, [esp + 8 + 20] // width 3750 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3751 vpsrlw ymm5, ymm5, 8 3752 sub edi, edx 3753 3754 convertloop: 3755 vmovdqu ymm0, [eax] 3756 vmovdqu ymm1, [eax + 32] 3757 vpavgb ymm0, ymm0, [eax + esi] 3758 vpavgb ymm1, ymm1, [eax + esi + 32] 3759 lea eax, [eax + 64] 3760 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV 3761 vpand ymm1, ymm1, ymm5 3762 vpackuswb ymm0, ymm0, ymm1 // mutates. 3763 vpermq ymm0, ymm0, 0xd8 3764 vpand ymm1, ymm0, ymm5 // U 3765 vpsrlw ymm0, ymm0, 8 // V 3766 vpackuswb ymm1, ymm1, ymm1 // mutates. 3767 vpackuswb ymm0, ymm0, ymm0 // mutates. 
3768 vpermq ymm1, ymm1, 0xd8 3769 vpermq ymm0, ymm0, 0xd8 3770 vextractf128 [edx], ymm1, 0 // U 3771 vextractf128 [edx + edi], ymm0, 0 // V 3772 lea edx, [edx + 16] 3773 sub ecx, 32 3774 jg convertloop 3775 3776 pop edi 3777 pop esi 3778 vzeroupper 3779 ret 3780 } 3781 } 3782 3783 __declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy, 3784 uint8* dst_u, 3785 uint8* dst_v, 3786 int width) { 3787 __asm { 3788 push edi 3789 mov eax, [esp + 4 + 4] // src_yuy2 3790 mov edx, [esp + 4 + 8] // dst_u 3791 mov edi, [esp + 4 + 12] // dst_v 3792 mov ecx, [esp + 4 + 16] // width 3793 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3794 vpsrlw ymm5, ymm5, 8 3795 sub edi, edx 3796 3797 convertloop: 3798 vmovdqu ymm0, [eax] 3799 vmovdqu ymm1, [eax + 32] 3800 lea eax, [eax + 64] 3801 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV 3802 vpand ymm1, ymm1, ymm5 3803 vpackuswb ymm0, ymm0, ymm1 // mutates. 3804 vpermq ymm0, ymm0, 0xd8 3805 vpand ymm1, ymm0, ymm5 // U 3806 vpsrlw ymm0, ymm0, 8 // V 3807 vpackuswb ymm1, ymm1, ymm1 // mutates. 3808 vpackuswb ymm0, ymm0, ymm0 // mutates. 
3809 vpermq ymm1, ymm1, 0xd8 3810 vpermq ymm0, ymm0, 0xd8 3811 vextractf128 [edx], ymm1, 0 // U 3812 vextractf128 [edx + edi], ymm0, 0 // V 3813 lea edx, [edx + 16] 3814 sub ecx, 32 3815 jg convertloop 3816 3817 pop edi 3818 vzeroupper 3819 ret 3820 } 3821 } 3822 #endif // HAS_YUY2TOYROW_AVX2 3823 3824 #ifdef HAS_YUY2TOYROW_SSE2 3825 __declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2, 3826 uint8* dst_y, 3827 int width) { 3828 __asm { 3829 mov eax, [esp + 4] // src_yuy2 3830 mov edx, [esp + 8] // dst_y 3831 mov ecx, [esp + 12] // width 3832 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3833 psrlw xmm5, 8 3834 3835 convertloop: 3836 movdqu xmm0, [eax] 3837 movdqu xmm1, [eax + 16] 3838 lea eax, [eax + 32] 3839 pand xmm0, xmm5 // even bytes are Y 3840 pand xmm1, xmm5 3841 packuswb xmm0, xmm1 3842 movdqu [edx], xmm0 3843 lea edx, [edx + 16] 3844 sub ecx, 16 3845 jg convertloop 3846 ret 3847 } 3848 } 3849 3850 __declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2, 3851 int stride_yuy2, 3852 uint8* dst_u, 3853 uint8* dst_v, 3854 int width) { 3855 __asm { 3856 push esi 3857 push edi 3858 mov eax, [esp + 8 + 4] // src_yuy2 3859 mov esi, [esp + 8 + 8] // stride_yuy2 3860 mov edx, [esp + 8 + 12] // dst_u 3861 mov edi, [esp + 8 + 16] // dst_v 3862 mov ecx, [esp + 8 + 20] // width 3863 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3864 psrlw xmm5, 8 3865 sub edi, edx 3866 3867 convertloop: 3868 movdqu xmm0, [eax] 3869 movdqu xmm1, [eax + 16] 3870 movdqu xmm2, [eax + esi] 3871 movdqu xmm3, [eax + esi + 16] 3872 lea eax, [eax + 32] 3873 pavgb xmm0, xmm2 3874 pavgb xmm1, xmm3 3875 psrlw xmm0, 8 // YUYV -> UVUV 3876 psrlw xmm1, 8 3877 packuswb xmm0, xmm1 3878 movdqa xmm1, xmm0 3879 pand xmm0, xmm5 // U 3880 packuswb xmm0, xmm0 3881 psrlw xmm1, 8 // V 3882 packuswb xmm1, xmm1 3883 movq qword ptr [edx], xmm0 3884 movq qword ptr [edx + edi], xmm1 3885 lea edx, [edx + 8] 3886 sub ecx, 16 3887 jg convertloop 3888 3889 pop edi 3890 pop esi 3891 ret 3892 } 3893 } 3894 
3895 __declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 3896 uint8* dst_u, 3897 uint8* dst_v, 3898 int width) { 3899 __asm { 3900 push edi 3901 mov eax, [esp + 4 + 4] // src_yuy2 3902 mov edx, [esp + 4 + 8] // dst_u 3903 mov edi, [esp + 4 + 12] // dst_v 3904 mov ecx, [esp + 4 + 16] // width 3905 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3906 psrlw xmm5, 8 3907 sub edi, edx 3908 3909 convertloop: 3910 movdqu xmm0, [eax] 3911 movdqu xmm1, [eax + 16] 3912 lea eax, [eax + 32] 3913 psrlw xmm0, 8 // YUYV -> UVUV 3914 psrlw xmm1, 8 3915 packuswb xmm0, xmm1 3916 movdqa xmm1, xmm0 3917 pand xmm0, xmm5 // U 3918 packuswb xmm0, xmm0 3919 psrlw xmm1, 8 // V 3920 packuswb xmm1, xmm1 3921 movq qword ptr [edx], xmm0 3922 movq qword ptr [edx + edi], xmm1 3923 lea edx, [edx + 8] 3924 sub ecx, 16 3925 jg convertloop 3926 3927 pop edi 3928 ret 3929 } 3930 } 3931 3932 __declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy, 3933 uint8* dst_y, 3934 int width) { 3935 __asm { 3936 mov eax, [esp + 4] // src_uyvy 3937 mov edx, [esp + 8] // dst_y 3938 mov ecx, [esp + 12] // width 3939 3940 convertloop: 3941 movdqu xmm0, [eax] 3942 movdqu xmm1, [eax + 16] 3943 lea eax, [eax + 32] 3944 psrlw xmm0, 8 // odd bytes are Y 3945 psrlw xmm1, 8 3946 packuswb xmm0, xmm1 3947 movdqu [edx], xmm0 3948 lea edx, [edx + 16] 3949 sub ecx, 16 3950 jg convertloop 3951 ret 3952 } 3953 } 3954 3955 __declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy, 3956 int stride_uyvy, 3957 uint8* dst_u, 3958 uint8* dst_v, 3959 int width) { 3960 __asm { 3961 push esi 3962 push edi 3963 mov eax, [esp + 8 + 4] // src_yuy2 3964 mov esi, [esp + 8 + 8] // stride_yuy2 3965 mov edx, [esp + 8 + 12] // dst_u 3966 mov edi, [esp + 8 + 16] // dst_v 3967 mov ecx, [esp + 8 + 20] // width 3968 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3969 psrlw xmm5, 8 3970 sub edi, edx 3971 3972 convertloop: 3973 movdqu xmm0, [eax] 3974 movdqu xmm1, [eax + 16] 3975 movdqu xmm2, [eax + esi] 3976 movdqu xmm3, [eax + 
esi + 16] 3977 lea eax, [eax + 32] 3978 pavgb xmm0, xmm2 3979 pavgb xmm1, xmm3 3980 pand xmm0, xmm5 // UYVY -> UVUV 3981 pand xmm1, xmm5 3982 packuswb xmm0, xmm1 3983 movdqa xmm1, xmm0 3984 pand xmm0, xmm5 // U 3985 packuswb xmm0, xmm0 3986 psrlw xmm1, 8 // V 3987 packuswb xmm1, xmm1 3988 movq qword ptr [edx], xmm0 3989 movq qword ptr [edx + edi], xmm1 3990 lea edx, [edx + 8] 3991 sub ecx, 16 3992 jg convertloop 3993 3994 pop edi 3995 pop esi 3996 ret 3997 } 3998 } 3999 4000 __declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy, 4001 uint8* dst_u, 4002 uint8* dst_v, 4003 int width) { 4004 __asm { 4005 push edi 4006 mov eax, [esp + 4 + 4] // src_yuy2 4007 mov edx, [esp + 4 + 8] // dst_u 4008 mov edi, [esp + 4 + 12] // dst_v 4009 mov ecx, [esp + 4 + 16] // width 4010 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 4011 psrlw xmm5, 8 4012 sub edi, edx 4013 4014 convertloop: 4015 movdqu xmm0, [eax] 4016 movdqu xmm1, [eax + 16] 4017 lea eax, [eax + 32] 4018 pand xmm0, xmm5 // UYVY -> UVUV 4019 pand xmm1, xmm5 4020 packuswb xmm0, xmm1 4021 movdqa xmm1, xmm0 4022 pand xmm0, xmm5 // U 4023 packuswb xmm0, xmm0 4024 psrlw xmm1, 8 // V 4025 packuswb xmm1, xmm1 4026 movq qword ptr [edx], xmm0 4027 movq qword ptr [edx + edi], xmm1 4028 lea edx, [edx + 8] 4029 sub ecx, 16 4030 jg convertloop 4031 4032 pop edi 4033 ret 4034 } 4035 } 4036 #endif // HAS_YUY2TOYROW_SSE2 4037 4038 #ifdef HAS_BLENDPLANEROW_SSSE3 4039 // Blend 8 pixels at a time. 4040 // unsigned version of math 4041 // =((A2*C2)+(B2*(255-C2))+255)/256 4042 // signed version of math 4043 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 4044 __declspec(naked) void BlendPlaneRow_SSSE3(const uint8* src0, 4045 const uint8* src1, 4046 const uint8* alpha, 4047 uint8* dst, 4048 int width) { 4049 __asm { 4050 push esi 4051 push edi 4052 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 4053 psllw xmm5, 8 4054 mov eax, 0x80808080 // 128 for biasing image to signed. 
4055 movd xmm6, eax 4056 pshufd xmm6, xmm6, 0x00 4057 4058 mov eax, 0x807f807f // 32768 + 127 for unbias and round. 4059 movd xmm7, eax 4060 pshufd xmm7, xmm7, 0x00 4061 mov eax, [esp + 8 + 4] // src0 4062 mov edx, [esp + 8 + 8] // src1 4063 mov esi, [esp + 8 + 12] // alpha 4064 mov edi, [esp + 8 + 16] // dst 4065 mov ecx, [esp + 8 + 20] // width 4066 sub eax, esi 4067 sub edx, esi 4068 sub edi, esi 4069 4070 // 8 pixel loop. 4071 convertloop8: 4072 movq xmm0, qword ptr [esi] // alpha 4073 punpcklbw xmm0, xmm0 4074 pxor xmm0, xmm5 // a, 255-a 4075 movq xmm1, qword ptr [eax + esi] // src0 4076 movq xmm2, qword ptr [edx + esi] // src1 4077 punpcklbw xmm1, xmm2 4078 psubb xmm1, xmm6 // bias src0/1 - 128 4079 pmaddubsw xmm0, xmm1 4080 paddw xmm0, xmm7 // unbias result - 32768 and round. 4081 psrlw xmm0, 8 4082 packuswb xmm0, xmm0 4083 movq qword ptr [edi + esi], xmm0 4084 lea esi, [esi + 8] 4085 sub ecx, 8 4086 jg convertloop8 4087 4088 pop edi 4089 pop esi 4090 ret 4091 } 4092 } 4093 #endif // HAS_BLENDPLANEROW_SSSE3 4094 4095 #ifdef HAS_BLENDPLANEROW_AVX2 4096 // Blend 32 pixels at a time. 4097 // unsigned version of math 4098 // =((A2*C2)+(B2*(255-C2))+255)/256 4099 // signed version of math 4100 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 4101 __declspec(naked) void BlendPlaneRow_AVX2(const uint8* src0, 4102 const uint8* src1, 4103 const uint8* alpha, 4104 uint8* dst, 4105 int width) { 4106 __asm { 4107 push esi 4108 push edi 4109 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 4110 vpsllw ymm5, ymm5, 8 4111 mov eax, 0x80808080 // 128 for biasing image to signed. 4112 vmovd xmm6, eax 4113 vbroadcastss ymm6, xmm6 4114 mov eax, 0x807f807f // 32768 + 127 for unbias and round. 
4115 vmovd xmm7, eax 4116 vbroadcastss ymm7, xmm7 4117 mov eax, [esp + 8 + 4] // src0 4118 mov edx, [esp + 8 + 8] // src1 4119 mov esi, [esp + 8 + 12] // alpha 4120 mov edi, [esp + 8 + 16] // dst 4121 mov ecx, [esp + 8 + 20] // width 4122 sub eax, esi 4123 sub edx, esi 4124 sub edi, esi 4125 4126 // 32 pixel loop. 4127 convertloop32: 4128 vmovdqu ymm0, [esi] // alpha 4129 vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 4130 vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 4131 vpxor ymm3, ymm3, ymm5 // a, 255-a 4132 vpxor ymm0, ymm0, ymm5 // a, 255-a 4133 vmovdqu ymm1, [eax + esi] // src0 4134 vmovdqu ymm2, [edx + esi] // src1 4135 vpunpckhbw ymm4, ymm1, ymm2 4136 vpunpcklbw ymm1, ymm1, ymm2 4137 vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 4138 vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 4139 vpmaddubsw ymm3, ymm3, ymm4 4140 vpmaddubsw ymm0, ymm0, ymm1 4141 vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. 4142 vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. 4143 vpsrlw ymm3, ymm3, 8 4144 vpsrlw ymm0, ymm0, 8 4145 vpackuswb ymm0, ymm0, ymm3 4146 vmovdqu [edi + esi], ymm0 4147 lea esi, [esi + 32] 4148 sub ecx, 32 4149 jg convertloop32 4150 4151 pop edi 4152 pop esi 4153 vzeroupper 4154 ret 4155 } 4156 } 4157 #endif // HAS_BLENDPLANEROW_AVX2 4158 4159 #ifdef HAS_ARGBBLENDROW_SSSE3 4160 // Shuffle table for isolating alpha. 4161 static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 4162 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; 4163 4164 // Blend 8 pixels at a time. 
4165 __declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, 4166 const uint8* src_argb1, 4167 uint8* dst_argb, 4168 int width) { 4169 __asm { 4170 push esi 4171 mov eax, [esp + 4 + 4] // src_argb0 4172 mov esi, [esp + 4 + 8] // src_argb1 4173 mov edx, [esp + 4 + 12] // dst_argb 4174 mov ecx, [esp + 4 + 16] // width 4175 pcmpeqb xmm7, xmm7 // generate constant 0x0001 4176 psrlw xmm7, 15 4177 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 4178 psrlw xmm6, 8 4179 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 4180 psllw xmm5, 8 4181 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4182 pslld xmm4, 24 4183 sub ecx, 4 4184 jl convertloop4b // less than 4 pixels? 4185 4186 // 4 pixel loop. 4187 convertloop4: 4188 movdqu xmm3, [eax] // src argb 4189 lea eax, [eax + 16] 4190 movdqa xmm0, xmm3 // src argb 4191 pxor xmm3, xmm4 // ~alpha 4192 movdqu xmm2, [esi] // _r_b 4193 pshufb xmm3, xmmword ptr kShuffleAlpha // alpha 4194 pand xmm2, xmm6 // _r_b 4195 paddw xmm3, xmm7 // 256 - alpha 4196 pmullw xmm2, xmm3 // _r_b * alpha 4197 movdqu xmm1, [esi] // _a_g 4198 lea esi, [esi + 16] 4199 psrlw xmm1, 8 // _a_g 4200 por xmm0, xmm4 // set alpha to 255 4201 pmullw xmm1, xmm3 // _a_g * alpha 4202 psrlw xmm2, 8 // _r_b convert to 8 bits again 4203 paddusb xmm0, xmm2 // + src argb 4204 pand xmm1, xmm5 // a_g_ convert to 8 bits again 4205 paddusb xmm0, xmm1 // + src argb 4206 movdqu [edx], xmm0 4207 lea edx, [edx + 16] 4208 sub ecx, 4 4209 jge convertloop4 4210 4211 convertloop4b: 4212 add ecx, 4 - 1 4213 jl convertloop1b 4214 4215 // 1 pixel loop. 
4216 convertloop1: 4217 movd xmm3, [eax] // src argb 4218 lea eax, [eax + 4] 4219 movdqa xmm0, xmm3 // src argb 4220 pxor xmm3, xmm4 // ~alpha 4221 movd xmm2, [esi] // _r_b 4222 pshufb xmm3, xmmword ptr kShuffleAlpha // alpha 4223 pand xmm2, xmm6 // _r_b 4224 paddw xmm3, xmm7 // 256 - alpha 4225 pmullw xmm2, xmm3 // _r_b * alpha 4226 movd xmm1, [esi] // _a_g 4227 lea esi, [esi + 4] 4228 psrlw xmm1, 8 // _a_g 4229 por xmm0, xmm4 // set alpha to 255 4230 pmullw xmm1, xmm3 // _a_g * alpha 4231 psrlw xmm2, 8 // _r_b convert to 8 bits again 4232 paddusb xmm0, xmm2 // + src argb 4233 pand xmm1, xmm5 // a_g_ convert to 8 bits again 4234 paddusb xmm0, xmm1 // + src argb 4235 movd [edx], xmm0 4236 lea edx, [edx + 4] 4237 sub ecx, 1 4238 jge convertloop1 4239 4240 convertloop1b: 4241 pop esi 4242 ret 4243 } 4244 } 4245 #endif // HAS_ARGBBLENDROW_SSSE3 4246 4247 #ifdef HAS_ARGBATTENUATEROW_SSSE3 4248 // Shuffle table duplicating alpha. 4249 static const uvec8 kShuffleAlpha0 = { 4250 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 4251 }; 4252 static const uvec8 kShuffleAlpha1 = { 4253 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 4254 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 4255 }; 4256 __declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb, 4257 uint8* dst_argb, 4258 int width) { 4259 __asm { 4260 mov eax, [esp + 4] // src_argb0 4261 mov edx, [esp + 8] // dst_argb 4262 mov ecx, [esp + 12] // width 4263 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 4264 pslld xmm3, 24 4265 movdqa xmm4, xmmword ptr kShuffleAlpha0 4266 movdqa xmm5, xmmword ptr kShuffleAlpha1 4267 4268 convertloop: 4269 movdqu xmm0, [eax] // read 4 pixels 4270 pshufb xmm0, xmm4 // isolate first 2 alphas 4271 movdqu xmm1, [eax] // read 4 pixels 4272 punpcklbw xmm1, xmm1 // first 2 pixel rgbs 4273 pmulhuw xmm0, xmm1 // rgb * a 4274 movdqu xmm1, [eax] // read 4 pixels 4275 pshufb xmm1, xmm5 // isolate next 2 alphas 4276 movdqu xmm2, [eax] // read 4 pixels 4277 punpckhbw xmm2, 
xmm2 // next 2 pixel rgbs 4278 pmulhuw xmm1, xmm2 // rgb * a 4279 movdqu xmm2, [eax] // mask original alpha 4280 lea eax, [eax + 16] 4281 pand xmm2, xmm3 4282 psrlw xmm0, 8 4283 psrlw xmm1, 8 4284 packuswb xmm0, xmm1 4285 por xmm0, xmm2 // copy original alpha 4286 movdqu [edx], xmm0 4287 lea edx, [edx + 16] 4288 sub ecx, 4 4289 jg convertloop 4290 4291 ret 4292 } 4293 } 4294 #endif // HAS_ARGBATTENUATEROW_SSSE3 4295 4296 #ifdef HAS_ARGBATTENUATEROW_AVX2 4297 // Shuffle table duplicating alpha. 4298 static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, 4299 128u, 128u, 14u, 15u, 14u, 15u, 4300 14u, 15u, 128u, 128u}; 4301 __declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb, 4302 uint8* dst_argb, 4303 int width) { 4304 __asm { 4305 mov eax, [esp + 4] // src_argb0 4306 mov edx, [esp + 8] // dst_argb 4307 mov ecx, [esp + 12] // width 4308 sub edx, eax 4309 vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 4310 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 4311 vpslld ymm5, ymm5, 24 4312 4313 convertloop: 4314 vmovdqu ymm6, [eax] // read 8 pixels. 4315 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 4316 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 4317 vpshufb ymm2, ymm0, ymm4 // low 4 alphas 4318 vpshufb ymm3, ymm1, ymm4 // high 4 alphas 4319 vpmulhuw ymm0, ymm0, ymm2 // rgb * a 4320 vpmulhuw ymm1, ymm1, ymm3 // rgb * a 4321 vpand ymm6, ymm6, ymm5 // isolate alpha 4322 vpsrlw ymm0, ymm0, 8 4323 vpsrlw ymm1, ymm1, 8 4324 vpackuswb ymm0, ymm0, ymm1 // unmutated. 4325 vpor ymm0, ymm0, ymm6 // copy original alpha 4326 vmovdqu [eax + edx], ymm0 4327 lea eax, [eax + 32] 4328 sub ecx, 8 4329 jg convertloop 4330 4331 vzeroupper 4332 ret 4333 } 4334 } 4335 #endif // HAS_ARGBATTENUATEROW_AVX2 4336 4337 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 4338 // Unattenuate 4 pixels at a time. 
4339 __declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb, 4340 uint8* dst_argb, 4341 int width) { 4342 __asm { 4343 push ebx 4344 push esi 4345 push edi 4346 mov eax, [esp + 12 + 4] // src_argb 4347 mov edx, [esp + 12 + 8] // dst_argb 4348 mov ecx, [esp + 12 + 12] // width 4349 lea ebx, fixed_invtbl8 4350 4351 convertloop: 4352 movdqu xmm0, [eax] // read 4 pixels 4353 movzx esi, byte ptr [eax + 3] // first alpha 4354 movzx edi, byte ptr [eax + 7] // second alpha 4355 punpcklbw xmm0, xmm0 // first 2 4356 movd xmm2, dword ptr [ebx + esi * 4] 4357 movd xmm3, dword ptr [ebx + edi * 4] 4358 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a 4359 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words 4360 movlhps xmm2, xmm3 4361 pmulhuw xmm0, xmm2 // rgb * a 4362 4363 movdqu xmm1, [eax] // read 4 pixels 4364 movzx esi, byte ptr [eax + 11] // third alpha 4365 movzx edi, byte ptr [eax + 15] // forth alpha 4366 punpckhbw xmm1, xmm1 // next 2 4367 movd xmm2, dword ptr [ebx + esi * 4] 4368 movd xmm3, dword ptr [ebx + edi * 4] 4369 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words 4370 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words 4371 movlhps xmm2, xmm3 4372 pmulhuw xmm1, xmm2 // rgb * a 4373 lea eax, [eax + 16] 4374 packuswb xmm0, xmm1 4375 movdqu [edx], xmm0 4376 lea edx, [edx + 16] 4377 sub ecx, 4 4378 jg convertloop 4379 4380 pop edi 4381 pop esi 4382 pop ebx 4383 ret 4384 } 4385 } 4386 #endif // HAS_ARGBUNATTENUATEROW_SSE2 4387 4388 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 4389 // Shuffle table duplicating alpha. 4390 static const uvec8 kUnattenShuffleAlpha_AVX2 = { 4391 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; 4392 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. 4393 // USE_GATHER is not on by default, due to being a slow instruction. 
4394 #ifdef USE_GATHER 4395 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, 4396 uint8* dst_argb, 4397 int width) { 4398 __asm { 4399 mov eax, [esp + 4] // src_argb0 4400 mov edx, [esp + 8] // dst_argb 4401 mov ecx, [esp + 12] // width 4402 sub edx, eax 4403 vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 4404 4405 convertloop: 4406 vmovdqu ymm6, [eax] // read 8 pixels. 4407 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. 4408 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. 4409 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 4410 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 4411 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a 4412 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a 4413 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. 4414 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a 4415 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas 4416 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia 4417 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia 4418 vpackuswb ymm0, ymm0, ymm1 // unmutated. 
4419 vmovdqu [eax + edx], ymm0 4420 lea eax, [eax + 32] 4421 sub ecx, 8 4422 jg convertloop 4423 4424 vzeroupper 4425 ret 4426 } 4427 } 4428 #else // USE_GATHER 4429 __declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, 4430 uint8* dst_argb, 4431 int width) { 4432 __asm { 4433 4434 push ebx 4435 push esi 4436 push edi 4437 mov eax, [esp + 12 + 4] // src_argb 4438 mov edx, [esp + 12 + 8] // dst_argb 4439 mov ecx, [esp + 12 + 12] // width 4440 sub edx, eax 4441 lea ebx, fixed_invtbl8 4442 vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 4443 4444 convertloop: 4445 // replace VPGATHER 4446 movzx esi, byte ptr [eax + 3] // alpha0 4447 movzx edi, byte ptr [eax + 7] // alpha1 4448 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] 4449 vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] 4450 movzx esi, byte ptr [eax + 11] // alpha2 4451 movzx edi, byte ptr [eax + 15] // alpha3 4452 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] 4453 vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] 4454 vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] 4455 movzx esi, byte ptr [eax + 19] // alpha4 4456 movzx edi, byte ptr [eax + 23] // alpha5 4457 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] 4458 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] 4459 vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] 4460 movzx esi, byte ptr [eax + 27] // alpha6 4461 movzx edi, byte ptr [eax + 31] // alpha7 4462 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] 4463 vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] 4464 vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] 4465 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] 4466 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] 4467 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] 4468 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] 4469 // end of VPGATHER 4470 4471 vmovdqu ymm6, [eax] // read 8 pixels. 4472 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 4473 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 
4474 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a 4475 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. 4476 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a 4477 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas 4478 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia 4479 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia 4480 vpackuswb ymm0, ymm0, ymm1 // unmutated. 4481 vmovdqu [eax + edx], ymm0 4482 lea eax, [eax + 32] 4483 sub ecx, 8 4484 jg convertloop 4485 4486 pop edi 4487 pop esi 4488 pop ebx 4489 vzeroupper 4490 ret 4491 } 4492 } 4493 #endif // USE_GATHER 4494 #endif // HAS_ARGBATTENUATEROW_AVX2 4495 4496 #ifdef HAS_ARGBGRAYROW_SSSE3 4497 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. 4498 __declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb, 4499 uint8* dst_argb, 4500 int width) { 4501 __asm { 4502 mov eax, [esp + 4] /* src_argb */ 4503 mov edx, [esp + 8] /* dst_argb */ 4504 mov ecx, [esp + 12] /* width */ 4505 movdqa xmm4, xmmword ptr kARGBToYJ 4506 movdqa xmm5, xmmword ptr kAddYJ64 4507 4508 convertloop: 4509 movdqu xmm0, [eax] // G 4510 movdqu xmm1, [eax + 16] 4511 pmaddubsw xmm0, xmm4 4512 pmaddubsw xmm1, xmm4 4513 phaddw xmm0, xmm1 4514 paddw xmm0, xmm5 // Add .5 for rounding. 
4515 psrlw xmm0, 7 4516 packuswb xmm0, xmm0 // 8 G bytes 4517 movdqu xmm2, [eax] // A 4518 movdqu xmm3, [eax + 16] 4519 lea eax, [eax + 32] 4520 psrld xmm2, 24 4521 psrld xmm3, 24 4522 packuswb xmm2, xmm3 4523 packuswb xmm2, xmm2 // 8 A bytes 4524 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA 4525 punpcklbw xmm0, xmm0 // 8 GG words 4526 punpcklbw xmm3, xmm2 // 8 GA words 4527 movdqa xmm1, xmm0 4528 punpcklwd xmm0, xmm3 // GGGA first 4 4529 punpckhwd xmm1, xmm3 // GGGA next 4 4530 movdqu [edx], xmm0 4531 movdqu [edx + 16], xmm1 4532 lea edx, [edx + 32] 4533 sub ecx, 8 4534 jg convertloop 4535 ret 4536 } 4537 } 4538 #endif // HAS_ARGBGRAYROW_SSSE3 4539 4540 #ifdef HAS_ARGBSEPIAROW_SSSE3 4541 // b = (r * 35 + g * 68 + b * 17) >> 7 4542 // g = (r * 45 + g * 88 + b * 22) >> 7 4543 // r = (r * 50 + g * 98 + b * 24) >> 7 4544 // Constant for ARGB color to sepia tone. 4545 static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, 4546 17, 68, 35, 0, 17, 68, 35, 0}; 4547 4548 static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, 4549 22, 88, 45, 0, 22, 88, 45, 0}; 4550 4551 static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, 4552 24, 98, 50, 0, 24, 98, 50, 0}; 4553 4554 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
4555 __declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { 4556 __asm { 4557 mov eax, [esp + 4] /* dst_argb */ 4558 mov ecx, [esp + 8] /* width */ 4559 movdqa xmm2, xmmword ptr kARGBToSepiaB 4560 movdqa xmm3, xmmword ptr kARGBToSepiaG 4561 movdqa xmm4, xmmword ptr kARGBToSepiaR 4562 4563 convertloop: 4564 movdqu xmm0, [eax] // B 4565 movdqu xmm6, [eax + 16] 4566 pmaddubsw xmm0, xmm2 4567 pmaddubsw xmm6, xmm2 4568 phaddw xmm0, xmm6 4569 psrlw xmm0, 7 4570 packuswb xmm0, xmm0 // 8 B values 4571 movdqu xmm5, [eax] // G 4572 movdqu xmm1, [eax + 16] 4573 pmaddubsw xmm5, xmm3 4574 pmaddubsw xmm1, xmm3 4575 phaddw xmm5, xmm1 4576 psrlw xmm5, 7 4577 packuswb xmm5, xmm5 // 8 G values 4578 punpcklbw xmm0, xmm5 // 8 BG values 4579 movdqu xmm5, [eax] // R 4580 movdqu xmm1, [eax + 16] 4581 pmaddubsw xmm5, xmm4 4582 pmaddubsw xmm1, xmm4 4583 phaddw xmm5, xmm1 4584 psrlw xmm5, 7 4585 packuswb xmm5, xmm5 // 8 R values 4586 movdqu xmm6, [eax] // A 4587 movdqu xmm1, [eax + 16] 4588 psrld xmm6, 24 4589 psrld xmm1, 24 4590 packuswb xmm6, xmm1 4591 packuswb xmm6, xmm6 // 8 A values 4592 punpcklbw xmm5, xmm6 // 8 RA values 4593 movdqa xmm1, xmm0 // Weave BG, RA together 4594 punpcklwd xmm0, xmm5 // BGRA first 4 4595 punpckhwd xmm1, xmm5 // BGRA next 4 4596 movdqu [eax], xmm0 4597 movdqu [eax + 16], xmm1 4598 lea eax, [eax + 32] 4599 sub ecx, 8 4600 jg convertloop 4601 ret 4602 } 4603 } 4604 #endif // HAS_ARGBSEPIAROW_SSSE3 4605 4606 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 4607 // Tranform 8 ARGB pixels (32 bytes) with color matrix. 4608 // Same as Sepia except matrix is provided. 4609 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R 4610 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
4611 __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, 4612 uint8* dst_argb, 4613 const int8* matrix_argb, 4614 int width) { 4615 __asm { 4616 mov eax, [esp + 4] /* src_argb */ 4617 mov edx, [esp + 8] /* dst_argb */ 4618 mov ecx, [esp + 12] /* matrix_argb */ 4619 movdqu xmm5, [ecx] 4620 pshufd xmm2, xmm5, 0x00 4621 pshufd xmm3, xmm5, 0x55 4622 pshufd xmm4, xmm5, 0xaa 4623 pshufd xmm5, xmm5, 0xff 4624 mov ecx, [esp + 16] /* width */ 4625 4626 convertloop: 4627 movdqu xmm0, [eax] // B 4628 movdqu xmm7, [eax + 16] 4629 pmaddubsw xmm0, xmm2 4630 pmaddubsw xmm7, xmm2 4631 movdqu xmm6, [eax] // G 4632 movdqu xmm1, [eax + 16] 4633 pmaddubsw xmm6, xmm3 4634 pmaddubsw xmm1, xmm3 4635 phaddsw xmm0, xmm7 // B 4636 phaddsw xmm6, xmm1 // G 4637 psraw xmm0, 6 // B 4638 psraw xmm6, 6 // G 4639 packuswb xmm0, xmm0 // 8 B values 4640 packuswb xmm6, xmm6 // 8 G values 4641 punpcklbw xmm0, xmm6 // 8 BG values 4642 movdqu xmm1, [eax] // R 4643 movdqu xmm7, [eax + 16] 4644 pmaddubsw xmm1, xmm4 4645 pmaddubsw xmm7, xmm4 4646 phaddsw xmm1, xmm7 // R 4647 movdqu xmm6, [eax] // A 4648 movdqu xmm7, [eax + 16] 4649 pmaddubsw xmm6, xmm5 4650 pmaddubsw xmm7, xmm5 4651 phaddsw xmm6, xmm7 // A 4652 psraw xmm1, 6 // R 4653 psraw xmm6, 6 // A 4654 packuswb xmm1, xmm1 // 8 R values 4655 packuswb xmm6, xmm6 // 8 A values 4656 punpcklbw xmm1, xmm6 // 8 RA values 4657 movdqa xmm6, xmm0 // Weave BG, RA together 4658 punpcklwd xmm0, xmm1 // BGRA first 4 4659 punpckhwd xmm6, xmm1 // BGRA next 4 4660 movdqu [edx], xmm0 4661 movdqu [edx + 16], xmm6 4662 lea eax, [eax + 32] 4663 lea edx, [edx + 32] 4664 sub ecx, 8 4665 jg convertloop 4666 ret 4667 } 4668 } 4669 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 4670 4671 #ifdef HAS_ARGBQUANTIZEROW_SSE2 4672 // Quantize 4 ARGB pixels (16 bytes). 
4673 __declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb, 4674 int scale, 4675 int interval_size, 4676 int interval_offset, 4677 int width) { 4678 __asm { 4679 mov eax, [esp + 4] /* dst_argb */ 4680 movd xmm2, [esp + 8] /* scale */ 4681 movd xmm3, [esp + 12] /* interval_size */ 4682 movd xmm4, [esp + 16] /* interval_offset */ 4683 mov ecx, [esp + 20] /* width */ 4684 pshuflw xmm2, xmm2, 040h 4685 pshufd xmm2, xmm2, 044h 4686 pshuflw xmm3, xmm3, 040h 4687 pshufd xmm3, xmm3, 044h 4688 pshuflw xmm4, xmm4, 040h 4689 pshufd xmm4, xmm4, 044h 4690 pxor xmm5, xmm5 // constant 0 4691 pcmpeqb xmm6, xmm6 // generate mask 0xff000000 4692 pslld xmm6, 24 4693 4694 convertloop: 4695 movdqu xmm0, [eax] // read 4 pixels 4696 punpcklbw xmm0, xmm5 // first 2 pixels 4697 pmulhuw xmm0, xmm2 // pixel * scale >> 16 4698 movdqu xmm1, [eax] // read 4 pixels 4699 punpckhbw xmm1, xmm5 // next 2 pixels 4700 pmulhuw xmm1, xmm2 4701 pmullw xmm0, xmm3 // * interval_size 4702 movdqu xmm7, [eax] // read 4 pixels 4703 pmullw xmm1, xmm3 4704 pand xmm7, xmm6 // mask alpha 4705 paddw xmm0, xmm4 // + interval_size / 2 4706 paddw xmm1, xmm4 4707 packuswb xmm0, xmm1 4708 por xmm0, xmm7 4709 movdqu [eax], xmm0 4710 lea eax, [eax + 16] 4711 sub ecx, 4 4712 jg convertloop 4713 ret 4714 } 4715 } 4716 #endif // HAS_ARGBQUANTIZEROW_SSE2 4717 4718 #ifdef HAS_ARGBSHADEROW_SSE2 4719 // Shade 4 pixels at a time by specified value. 
4720 __declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb, 4721 uint8* dst_argb, 4722 int width, 4723 uint32 value) { 4724 __asm { 4725 mov eax, [esp + 4] // src_argb 4726 mov edx, [esp + 8] // dst_argb 4727 mov ecx, [esp + 12] // width 4728 movd xmm2, [esp + 16] // value 4729 punpcklbw xmm2, xmm2 4730 punpcklqdq xmm2, xmm2 4731 4732 convertloop: 4733 movdqu xmm0, [eax] // read 4 pixels 4734 lea eax, [eax + 16] 4735 movdqa xmm1, xmm0 4736 punpcklbw xmm0, xmm0 // first 2 4737 punpckhbw xmm1, xmm1 // next 2 4738 pmulhuw xmm0, xmm2 // argb * value 4739 pmulhuw xmm1, xmm2 // argb * value 4740 psrlw xmm0, 8 4741 psrlw xmm1, 8 4742 packuswb xmm0, xmm1 4743 movdqu [edx], xmm0 4744 lea edx, [edx + 16] 4745 sub ecx, 4 4746 jg convertloop 4747 4748 ret 4749 } 4750 } 4751 #endif // HAS_ARGBSHADEROW_SSE2 4752 4753 #ifdef HAS_ARGBMULTIPLYROW_SSE2 4754 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 4755 __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0, 4756 const uint8* src_argb1, 4757 uint8* dst_argb, 4758 int width) { 4759 __asm { 4760 push esi 4761 mov eax, [esp + 4 + 4] // src_argb0 4762 mov esi, [esp + 4 + 8] // src_argb1 4763 mov edx, [esp + 4 + 12] // dst_argb 4764 mov ecx, [esp + 4 + 16] // width 4765 pxor xmm5, xmm5 // constant 0 4766 4767 convertloop: 4768 movdqu xmm0, [eax] // read 4 pixels from src_argb0 4769 movdqu xmm2, [esi] // read 4 pixels from src_argb1 4770 movdqu xmm1, xmm0 4771 movdqu xmm3, xmm2 4772 punpcklbw xmm0, xmm0 // first 2 4773 punpckhbw xmm1, xmm1 // next 2 4774 punpcklbw xmm2, xmm5 // first 2 4775 punpckhbw xmm3, xmm5 // next 2 4776 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 4777 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 4778 lea eax, [eax + 16] 4779 lea esi, [esi + 16] 4780 packuswb xmm0, xmm1 4781 movdqu [edx], xmm0 4782 lea edx, [edx + 16] 4783 sub ecx, 4 4784 jg convertloop 4785 4786 pop esi 4787 ret 4788 } 4789 } 4790 #endif // HAS_ARGBMULTIPLYROW_SSE2 4791 4792 #ifdef 
HAS_ARGBADDROW_SSE2 4793 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 4794 // TODO(fbarchard): Port this to posix, neon and other math functions. 4795 __declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0, 4796 const uint8* src_argb1, 4797 uint8* dst_argb, 4798 int width) { 4799 __asm { 4800 push esi 4801 mov eax, [esp + 4 + 4] // src_argb0 4802 mov esi, [esp + 4 + 8] // src_argb1 4803 mov edx, [esp + 4 + 12] // dst_argb 4804 mov ecx, [esp + 4 + 16] // width 4805 4806 sub ecx, 4 4807 jl convertloop49 4808 4809 convertloop4: 4810 movdqu xmm0, [eax] // read 4 pixels from src_argb0 4811 lea eax, [eax + 16] 4812 movdqu xmm1, [esi] // read 4 pixels from src_argb1 4813 lea esi, [esi + 16] 4814 paddusb xmm0, xmm1 // src_argb0 + src_argb1 4815 movdqu [edx], xmm0 4816 lea edx, [edx + 16] 4817 sub ecx, 4 4818 jge convertloop4 4819 4820 convertloop49: 4821 add ecx, 4 - 1 4822 jl convertloop19 4823 4824 convertloop1: 4825 movd xmm0, [eax] // read 1 pixels from src_argb0 4826 lea eax, [eax + 4] 4827 movd xmm1, [esi] // read 1 pixels from src_argb1 4828 lea esi, [esi + 4] 4829 paddusb xmm0, xmm1 // src_argb0 + src_argb1 4830 movd [edx], xmm0 4831 lea edx, [edx + 4] 4832 sub ecx, 1 4833 jge convertloop1 4834 4835 convertloop19: 4836 pop esi 4837 ret 4838 } 4839 } 4840 #endif // HAS_ARGBADDROW_SSE2 4841 4842 #ifdef HAS_ARGBSUBTRACTROW_SSE2 4843 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
4844 __declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0, 4845 const uint8* src_argb1, 4846 uint8* dst_argb, 4847 int width) { 4848 __asm { 4849 push esi 4850 mov eax, [esp + 4 + 4] // src_argb0 4851 mov esi, [esp + 4 + 8] // src_argb1 4852 mov edx, [esp + 4 + 12] // dst_argb 4853 mov ecx, [esp + 4 + 16] // width 4854 4855 convertloop: 4856 movdqu xmm0, [eax] // read 4 pixels from src_argb0 4857 lea eax, [eax + 16] 4858 movdqu xmm1, [esi] // read 4 pixels from src_argb1 4859 lea esi, [esi + 16] 4860 psubusb xmm0, xmm1 // src_argb0 - src_argb1 4861 movdqu [edx], xmm0 4862 lea edx, [edx + 16] 4863 sub ecx, 4 4864 jg convertloop 4865 4866 pop esi 4867 ret 4868 } 4869 } 4870 #endif // HAS_ARGBSUBTRACTROW_SSE2 4871 4872 #ifdef HAS_ARGBMULTIPLYROW_AVX2 4873 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 4874 __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0, 4875 const uint8* src_argb1, 4876 uint8* dst_argb, 4877 int width) { 4878 __asm { 4879 push esi 4880 mov eax, [esp + 4 + 4] // src_argb0 4881 mov esi, [esp + 4 + 8] // src_argb1 4882 mov edx, [esp + 4 + 12] // dst_argb 4883 mov ecx, [esp + 4 + 16] // width 4884 vpxor ymm5, ymm5, ymm5 // constant 0 4885 4886 convertloop: 4887 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 4888 lea eax, [eax + 32] 4889 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 4890 lea esi, [esi + 32] 4891 vpunpcklbw ymm0, ymm1, ymm1 // low 4 4892 vpunpckhbw ymm1, ymm1, ymm1 // high 4 4893 vpunpcklbw ymm2, ymm3, ymm5 // low 4 4894 vpunpckhbw ymm3, ymm3, ymm5 // high 4 4895 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 4896 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 4897 vpackuswb ymm0, ymm0, ymm1 4898 vmovdqu [edx], ymm0 4899 lea edx, [edx + 32] 4900 sub ecx, 8 4901 jg convertloop 4902 4903 pop esi 4904 vzeroupper 4905 ret 4906 } 4907 } 4908 #endif // HAS_ARGBMULTIPLYROW_AVX2 4909 4910 #ifdef HAS_ARGBADDROW_AVX2 4911 // Add 2 rows of ARGB pixels together, 8 pixels 
at a time. 4912 __declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0, 4913 const uint8* src_argb1, 4914 uint8* dst_argb, 4915 int width) { 4916 __asm { 4917 push esi 4918 mov eax, [esp + 4 + 4] // src_argb0 4919 mov esi, [esp + 4 + 8] // src_argb1 4920 mov edx, [esp + 4 + 12] // dst_argb 4921 mov ecx, [esp + 4 + 16] // width 4922 4923 convertloop: 4924 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 4925 lea eax, [eax + 32] 4926 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 4927 lea esi, [esi + 32] 4928 vmovdqu [edx], ymm0 4929 lea edx, [edx + 32] 4930 sub ecx, 8 4931 jg convertloop 4932 4933 pop esi 4934 vzeroupper 4935 ret 4936 } 4937 } 4938 #endif // HAS_ARGBADDROW_AVX2 4939 4940 #ifdef HAS_ARGBSUBTRACTROW_AVX2 4941 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 4942 __declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0, 4943 const uint8* src_argb1, 4944 uint8* dst_argb, 4945 int width) { 4946 __asm { 4947 push esi 4948 mov eax, [esp + 4 + 4] // src_argb0 4949 mov esi, [esp + 4 + 8] // src_argb1 4950 mov edx, [esp + 4 + 12] // dst_argb 4951 mov ecx, [esp + 4 + 16] // width 4952 4953 convertloop: 4954 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 4955 lea eax, [eax + 32] 4956 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 4957 lea esi, [esi + 32] 4958 vmovdqu [edx], ymm0 4959 lea edx, [edx + 32] 4960 sub ecx, 8 4961 jg convertloop 4962 4963 pop esi 4964 vzeroupper 4965 ret 4966 } 4967 } 4968 #endif // HAS_ARGBSUBTRACTROW_AVX2 4969 4970 #ifdef HAS_SOBELXROW_SSE2 4971 // SobelX as a matrix is 4972 // -1 0 1 4973 // -2 0 2 4974 // -1 0 1 4975 __declspec(naked) void SobelXRow_SSE2(const uint8* src_y0, 4976 const uint8* src_y1, 4977 const uint8* src_y2, 4978 uint8* dst_sobelx, 4979 int width) { 4980 __asm { 4981 push esi 4982 push edi 4983 mov eax, [esp + 8 + 4] // src_y0 4984 mov esi, [esp + 8 + 8] // src_y1 4985 mov edi, [esp + 8 + 12] // src_y2 4986 mov edx, [esp + 8 + 16] // dst_sobelx 4987 mov ecx, 
[esp + 8 + 20] // width 4988 sub esi, eax 4989 sub edi, eax 4990 sub edx, eax 4991 pxor xmm5, xmm5 // constant 0 4992 4993 convertloop: 4994 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] 4995 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] 4996 punpcklbw xmm0, xmm5 4997 punpcklbw xmm1, xmm5 4998 psubw xmm0, xmm1 4999 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] 5000 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] 5001 punpcklbw xmm1, xmm5 5002 punpcklbw xmm2, xmm5 5003 psubw xmm1, xmm2 5004 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] 5005 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] 5006 punpcklbw xmm2, xmm5 5007 punpcklbw xmm3, xmm5 5008 psubw xmm2, xmm3 5009 paddw xmm0, xmm2 5010 paddw xmm0, xmm1 5011 paddw xmm0, xmm1 5012 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw 5013 psubw xmm1, xmm0 5014 pmaxsw xmm0, xmm1 5015 packuswb xmm0, xmm0 5016 movq qword ptr [eax + edx], xmm0 5017 lea eax, [eax + 8] 5018 sub ecx, 8 5019 jg convertloop 5020 5021 pop edi 5022 pop esi 5023 ret 5024 } 5025 } 5026 #endif // HAS_SOBELXROW_SSE2 5027 5028 #ifdef HAS_SOBELYROW_SSE2 5029 // SobelY as a matrix is 5030 // -1 -2 -1 5031 // 0 0 0 5032 // 1 2 1 5033 __declspec(naked) void SobelYRow_SSE2(const uint8* src_y0, 5034 const uint8* src_y1, 5035 uint8* dst_sobely, 5036 int width) { 5037 __asm { 5038 push esi 5039 mov eax, [esp + 4 + 4] // src_y0 5040 mov esi, [esp + 4 + 8] // src_y1 5041 mov edx, [esp + 4 + 12] // dst_sobely 5042 mov ecx, [esp + 4 + 16] // width 5043 sub esi, eax 5044 sub edx, eax 5045 pxor xmm5, xmm5 // constant 0 5046 5047 convertloop: 5048 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] 5049 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] 5050 punpcklbw xmm0, xmm5 5051 punpcklbw xmm1, xmm5 5052 psubw xmm0, xmm1 5053 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] 5054 movq xmm2, qword ptr [eax + 
esi + 1] // read 8 pixels from src_y1[1] 5055 punpcklbw xmm1, xmm5 5056 punpcklbw xmm2, xmm5 5057 psubw xmm1, xmm2 5058 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] 5059 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] 5060 punpcklbw xmm2, xmm5 5061 punpcklbw xmm3, xmm5 5062 psubw xmm2, xmm3 5063 paddw xmm0, xmm2 5064 paddw xmm0, xmm1 5065 paddw xmm0, xmm1 5066 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw 5067 psubw xmm1, xmm0 5068 pmaxsw xmm0, xmm1 5069 packuswb xmm0, xmm0 5070 movq qword ptr [eax + edx], xmm0 5071 lea eax, [eax + 8] 5072 sub ecx, 8 5073 jg convertloop 5074 5075 pop esi 5076 ret 5077 } 5078 } 5079 #endif // HAS_SOBELYROW_SSE2 5080 5081 #ifdef HAS_SOBELROW_SSE2 5082 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 5083 // A = 255 5084 // R = Sobel 5085 // G = Sobel 5086 // B = Sobel 5087 __declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx, 5088 const uint8* src_sobely, 5089 uint8* dst_argb, 5090 int width) { 5091 __asm { 5092 push esi 5093 mov eax, [esp + 4 + 4] // src_sobelx 5094 mov esi, [esp + 4 + 8] // src_sobely 5095 mov edx, [esp + 4 + 12] // dst_argb 5096 mov ecx, [esp + 4 + 16] // width 5097 sub esi, eax 5098 pcmpeqb xmm5, xmm5 // alpha 255 5099 pslld xmm5, 24 // 0xff000000 5100 5101 convertloop: 5102 movdqu xmm0, [eax] // read 16 pixels src_sobelx 5103 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely 5104 lea eax, [eax + 16] 5105 paddusb xmm0, xmm1 // sobel = sobelx + sobely 5106 movdqa xmm2, xmm0 // GG 5107 punpcklbw xmm2, xmm0 // First 8 5108 punpckhbw xmm0, xmm0 // Next 8 5109 movdqa xmm1, xmm2 // GGGG 5110 punpcklwd xmm1, xmm2 // First 4 5111 punpckhwd xmm2, xmm2 // Next 4 5112 por xmm1, xmm5 // GGGA 5113 por xmm2, xmm5 5114 movdqa xmm3, xmm0 // GGGG 5115 punpcklwd xmm3, xmm0 // Next 4 5116 punpckhwd xmm0, xmm0 // Last 4 5117 por xmm3, xmm5 // GGGA 5118 por xmm0, xmm5 5119 movdqu [edx], xmm1 5120 movdqu [edx + 16], xmm2 5121 movdqu [edx + 32], xmm3 5122 
movdqu [edx + 48], xmm0 5123 lea edx, [edx + 64] 5124 sub ecx, 16 5125 jg convertloop 5126 5127 pop esi 5128 ret 5129 } 5130 } 5131 #endif // HAS_SOBELROW_SSE2 5132 5133 #ifdef HAS_SOBELTOPLANEROW_SSE2 5134 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 5135 __declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx, 5136 const uint8* src_sobely, 5137 uint8* dst_y, 5138 int width) { 5139 __asm { 5140 push esi 5141 mov eax, [esp + 4 + 4] // src_sobelx 5142 mov esi, [esp + 4 + 8] // src_sobely 5143 mov edx, [esp + 4 + 12] // dst_argb 5144 mov ecx, [esp + 4 + 16] // width 5145 sub esi, eax 5146 5147 convertloop: 5148 movdqu xmm0, [eax] // read 16 pixels src_sobelx 5149 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely 5150 lea eax, [eax + 16] 5151 paddusb xmm0, xmm1 // sobel = sobelx + sobely 5152 movdqu [edx], xmm0 5153 lea edx, [edx + 16] 5154 sub ecx, 16 5155 jg convertloop 5156 5157 pop esi 5158 ret 5159 } 5160 } 5161 #endif // HAS_SOBELTOPLANEROW_SSE2 5162 5163 #ifdef HAS_SOBELXYROW_SSE2 5164 // Mixes Sobel X, Sobel Y and Sobel into ARGB. 
5165 // A = 255 5166 // R = Sobel X 5167 // G = Sobel 5168 // B = Sobel Y 5169 __declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx, 5170 const uint8* src_sobely, 5171 uint8* dst_argb, 5172 int width) { 5173 __asm { 5174 push esi 5175 mov eax, [esp + 4 + 4] // src_sobelx 5176 mov esi, [esp + 4 + 8] // src_sobely 5177 mov edx, [esp + 4 + 12] // dst_argb 5178 mov ecx, [esp + 4 + 16] // width 5179 sub esi, eax 5180 pcmpeqb xmm5, xmm5 // alpha 255 5181 5182 convertloop: 5183 movdqu xmm0, [eax] // read 16 pixels src_sobelx 5184 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely 5185 lea eax, [eax + 16] 5186 movdqa xmm2, xmm0 5187 paddusb xmm2, xmm1 // sobel = sobelx + sobely 5188 movdqa xmm3, xmm0 // XA 5189 punpcklbw xmm3, xmm5 5190 punpckhbw xmm0, xmm5 5191 movdqa xmm4, xmm1 // YS 5192 punpcklbw xmm4, xmm2 5193 punpckhbw xmm1, xmm2 5194 movdqa xmm6, xmm4 // YSXA 5195 punpcklwd xmm6, xmm3 // First 4 5196 punpckhwd xmm4, xmm3 // Next 4 5197 movdqa xmm7, xmm1 // YSXA 5198 punpcklwd xmm7, xmm0 // Next 4 5199 punpckhwd xmm1, xmm0 // Last 4 5200 movdqu [edx], xmm6 5201 movdqu [edx + 16], xmm4 5202 movdqu [edx + 32], xmm7 5203 movdqu [edx + 48], xmm1 5204 lea edx, [edx + 64] 5205 sub ecx, 16 5206 jg convertloop 5207 5208 pop esi 5209 ret 5210 } 5211 } 5212 #endif // HAS_SOBELXYROW_SSE2 5213 5214 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 5215 // Consider float CumulativeSum. 5216 // Consider calling CumulativeSum one row at time as needed. 5217 // Consider circular CumulativeSum buffer of radius * 2 + 1 height. 5218 // Convert cumulative sum for an area to an average for 1 pixel. 5219 // topleft is pointer to top left of CumulativeSum buffer for area. 5220 // botleft is pointer to bottom left of CumulativeSum buffer. 5221 // width is offset from left to right of area in CumulativeSum buffer measured 5222 // in number of ints. 5223 // area is the number of pixels in the area being averaged. 5224 // dst points to pixel to store result to. 
5225 // count is number of averaged pixels to produce. 5226 // Does 4 pixels at a time. 5227 // This function requires alignment on accumulation buffer pointers. 5228 void CumulativeSumToAverageRow_SSE2(const int32* topleft, 5229 const int32* botleft, 5230 int width, 5231 int area, 5232 uint8* dst, 5233 int count) { 5234 __asm { 5235 mov eax, topleft // eax topleft 5236 mov esi, botleft // esi botleft 5237 mov edx, width 5238 movd xmm5, area 5239 mov edi, dst 5240 mov ecx, count 5241 cvtdq2ps xmm5, xmm5 5242 rcpss xmm4, xmm5 // 1.0f / area 5243 pshufd xmm4, xmm4, 0 5244 sub ecx, 4 5245 jl l4b 5246 5247 cmp area, 128 // 128 pixels will not overflow 15 bits. 5248 ja l4 5249 5250 pshufd xmm5, xmm5, 0 // area 5251 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 5252 psrld xmm6, 16 5253 cvtdq2ps xmm6, xmm6 5254 addps xmm5, xmm6 // (65536.0 + area - 1) 5255 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area 5256 cvtps2dq xmm5, xmm5 // 0.16 fixed point 5257 packssdw xmm5, xmm5 // 16 bit shorts 5258 5259 // 4 pixel loop small blocks. 
5260 s4: 5261 // top left 5262 movdqu xmm0, [eax] 5263 movdqu xmm1, [eax + 16] 5264 movdqu xmm2, [eax + 32] 5265 movdqu xmm3, [eax + 48] 5266 5267 // - top right 5268 psubd xmm0, [eax + edx * 4] 5269 psubd xmm1, [eax + edx * 4 + 16] 5270 psubd xmm2, [eax + edx * 4 + 32] 5271 psubd xmm3, [eax + edx * 4 + 48] 5272 lea eax, [eax + 64] 5273 5274 // - bottom left 5275 psubd xmm0, [esi] 5276 psubd xmm1, [esi + 16] 5277 psubd xmm2, [esi + 32] 5278 psubd xmm3, [esi + 48] 5279 5280 // + bottom right 5281 paddd xmm0, [esi + edx * 4] 5282 paddd xmm1, [esi + edx * 4 + 16] 5283 paddd xmm2, [esi + edx * 4 + 32] 5284 paddd xmm3, [esi + edx * 4 + 48] 5285 lea esi, [esi + 64] 5286 5287 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers 5288 packssdw xmm2, xmm3 5289 5290 pmulhuw xmm0, xmm5 5291 pmulhuw xmm2, xmm5 5292 5293 packuswb xmm0, xmm2 5294 movdqu [edi], xmm0 5295 lea edi, [edi + 16] 5296 sub ecx, 4 5297 jge s4 5298 5299 jmp l4b 5300 5301 // 4 pixel loop 5302 l4: 5303 // top left 5304 movdqu xmm0, [eax] 5305 movdqu xmm1, [eax + 16] 5306 movdqu xmm2, [eax + 32] 5307 movdqu xmm3, [eax + 48] 5308 5309 // - top right 5310 psubd xmm0, [eax + edx * 4] 5311 psubd xmm1, [eax + edx * 4 + 16] 5312 psubd xmm2, [eax + edx * 4 + 32] 5313 psubd xmm3, [eax + edx * 4 + 48] 5314 lea eax, [eax + 64] 5315 5316 // - bottom left 5317 psubd xmm0, [esi] 5318 psubd xmm1, [esi + 16] 5319 psubd xmm2, [esi + 32] 5320 psubd xmm3, [esi + 48] 5321 5322 // + bottom right 5323 paddd xmm0, [esi + edx * 4] 5324 paddd xmm1, [esi + edx * 4 + 16] 5325 paddd xmm2, [esi + edx * 4 + 32] 5326 paddd xmm3, [esi + edx * 4 + 48] 5327 lea esi, [esi + 64] 5328 5329 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area 5330 cvtdq2ps xmm1, xmm1 5331 mulps xmm0, xmm4 5332 mulps xmm1, xmm4 5333 cvtdq2ps xmm2, xmm2 5334 cvtdq2ps xmm3, xmm3 5335 mulps xmm2, xmm4 5336 mulps xmm3, xmm4 5337 cvtps2dq xmm0, xmm0 5338 cvtps2dq xmm1, xmm1 5339 cvtps2dq xmm2, xmm2 5340 cvtps2dq xmm3, xmm3 5341 packssdw xmm0, xmm1 5342 packssdw xmm2, xmm3 
5343 packuswb xmm0, xmm2 5344 movdqu [edi], xmm0 5345 lea edi, [edi + 16] 5346 sub ecx, 4 5347 jge l4 5348 5349 l4b: 5350 add ecx, 4 - 1 5351 jl l1b 5352 5353 // 1 pixel loop 5354 l1: 5355 movdqu xmm0, [eax] 5356 psubd xmm0, [eax + edx * 4] 5357 lea eax, [eax + 16] 5358 psubd xmm0, [esi] 5359 paddd xmm0, [esi + edx * 4] 5360 lea esi, [esi + 16] 5361 cvtdq2ps xmm0, xmm0 5362 mulps xmm0, xmm4 5363 cvtps2dq xmm0, xmm0 5364 packssdw xmm0, xmm0 5365 packuswb xmm0, xmm0 5366 movd dword ptr [edi], xmm0 5367 lea edi, [edi + 4] 5368 sub ecx, 1 5369 jge l1 5370 l1b: 5371 } 5372 } 5373 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 5374 5375 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 5376 // Creates a table of cumulative sums where each value is a sum of all values 5377 // above and to the left of the value. 5378 void ComputeCumulativeSumRow_SSE2(const uint8* row, 5379 int32* cumsum, 5380 const int32* previous_cumsum, 5381 int width) { 5382 __asm { 5383 mov eax, row 5384 mov edx, cumsum 5385 mov esi, previous_cumsum 5386 mov ecx, width 5387 pxor xmm0, xmm0 5388 pxor xmm1, xmm1 5389 5390 sub ecx, 4 5391 jl l4b 5392 test edx, 15 5393 jne l4b 5394 5395 // 4 pixel loop 5396 l4: 5397 movdqu xmm2, [eax] // 4 argb pixels 16 bytes. 5398 lea eax, [eax + 16] 5399 movdqa xmm4, xmm2 5400 5401 punpcklbw xmm2, xmm1 5402 movdqa xmm3, xmm2 5403 punpcklwd xmm2, xmm1 5404 punpckhwd xmm3, xmm1 5405 5406 punpckhbw xmm4, xmm1 5407 movdqa xmm5, xmm4 5408 punpcklwd xmm4, xmm1 5409 punpckhwd xmm5, xmm1 5410 5411 paddd xmm0, xmm2 5412 movdqu xmm2, [esi] // previous row above. 
5413 paddd xmm2, xmm0 5414 5415 paddd xmm0, xmm3 5416 movdqu xmm3, [esi + 16] 5417 paddd xmm3, xmm0 5418 5419 paddd xmm0, xmm4 5420 movdqu xmm4, [esi + 32] 5421 paddd xmm4, xmm0 5422 5423 paddd xmm0, xmm5 5424 movdqu xmm5, [esi + 48] 5425 lea esi, [esi + 64] 5426 paddd xmm5, xmm0 5427 5428 movdqu [edx], xmm2 5429 movdqu [edx + 16], xmm3 5430 movdqu [edx + 32], xmm4 5431 movdqu [edx + 48], xmm5 5432 5433 lea edx, [edx + 64] 5434 sub ecx, 4 5435 jge l4 5436 5437 l4b: 5438 add ecx, 4 - 1 5439 jl l1b 5440 5441 // 1 pixel loop 5442 l1: 5443 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. 5444 lea eax, [eax + 4] 5445 punpcklbw xmm2, xmm1 5446 punpcklwd xmm2, xmm1 5447 paddd xmm0, xmm2 5448 movdqu xmm2, [esi] 5449 lea esi, [esi + 16] 5450 paddd xmm2, xmm0 5451 movdqu [edx], xmm2 5452 lea edx, [edx + 16] 5453 sub ecx, 1 5454 jge l1 5455 5456 l1b: 5457 } 5458 } 5459 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 5460 5461 #ifdef HAS_ARGBAFFINEROW_SSE2 5462 // Copy ARGB pixels from source image with slope to a row of destination. 
5463 __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, 5464 int src_argb_stride, 5465 uint8* dst_argb, 5466 const float* uv_dudv, 5467 int width) { 5468 __asm { 5469 push esi 5470 push edi 5471 mov eax, [esp + 12] // src_argb 5472 mov esi, [esp + 16] // stride 5473 mov edx, [esp + 20] // dst_argb 5474 mov ecx, [esp + 24] // pointer to uv_dudv 5475 movq xmm2, qword ptr [ecx] // uv 5476 movq xmm7, qword ptr [ecx + 8] // dudv 5477 mov ecx, [esp + 28] // width 5478 shl esi, 16 // 4, stride 5479 add esi, 4 5480 movd xmm5, esi 5481 sub ecx, 4 5482 jl l4b 5483 5484 // setup for 4 pixel loop 5485 pshufd xmm7, xmm7, 0x44 // dup dudv 5486 pshufd xmm5, xmm5, 0 // dup 4, stride 5487 movdqa xmm0, xmm2 // x0, y0, x1, y1 5488 addps xmm0, xmm7 5489 movlhps xmm2, xmm0 5490 movdqa xmm4, xmm7 5491 addps xmm4, xmm4 // dudv *= 2 5492 movdqa xmm3, xmm2 // x2, y2, x3, y3 5493 addps xmm3, xmm4 5494 addps xmm4, xmm4 // dudv *= 4 5495 5496 // 4 pixel loop 5497 l4: 5498 cvttps2dq xmm0, xmm2 // x, y float to int first 2 5499 cvttps2dq xmm1, xmm3 // x, y float to int next 2 5500 packssdw xmm0, xmm1 // x, y as 8 shorts 5501 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
5502 movd esi, xmm0 5503 pshufd xmm0, xmm0, 0x39 // shift right 5504 movd edi, xmm0 5505 pshufd xmm0, xmm0, 0x39 // shift right 5506 movd xmm1, [eax + esi] // read pixel 0 5507 movd xmm6, [eax + edi] // read pixel 1 5508 punpckldq xmm1, xmm6 // combine pixel 0 and 1 5509 addps xmm2, xmm4 // x, y += dx, dy first 2 5510 movq qword ptr [edx], xmm1 5511 movd esi, xmm0 5512 pshufd xmm0, xmm0, 0x39 // shift right 5513 movd edi, xmm0 5514 movd xmm6, [eax + esi] // read pixel 2 5515 movd xmm0, [eax + edi] // read pixel 3 5516 punpckldq xmm6, xmm0 // combine pixel 2 and 3 5517 addps xmm3, xmm4 // x, y += dx, dy next 2 5518 movq qword ptr 8[edx], xmm6 5519 lea edx, [edx + 16] 5520 sub ecx, 4 5521 jge l4 5522 5523 l4b: 5524 add ecx, 4 - 1 5525 jl l1b 5526 5527 // 1 pixel loop 5528 l1: 5529 cvttps2dq xmm0, xmm2 // x, y float to int 5530 packssdw xmm0, xmm0 // x, y as shorts 5531 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride 5532 addps xmm2, xmm7 // x, y += dx, dy 5533 movd esi, xmm0 5534 movd xmm0, [eax + esi] // copy a pixel 5535 movd [edx], xmm0 5536 lea edx, [edx + 4] 5537 sub ecx, 1 5538 jge l1 5539 l1b: 5540 pop edi 5541 pop esi 5542 ret 5543 } 5544 } 5545 #endif // HAS_ARGBAFFINEROW_SSE2 5546 5547 #ifdef HAS_INTERPOLATEROW_AVX2 5548 // Bilinear filter 32x2 -> 32x1 5549 __declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr, 5550 const uint8* src_ptr, 5551 ptrdiff_t src_stride, 5552 int dst_width, 5553 int source_y_fraction) { 5554 __asm { 5555 push esi 5556 push edi 5557 mov edi, [esp + 8 + 4] // dst_ptr 5558 mov esi, [esp + 8 + 8] // src_ptr 5559 mov edx, [esp + 8 + 12] // src_stride 5560 mov ecx, [esp + 8 + 16] // dst_width 5561 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5562 // Dispatch to specialized filters if applicable. 5563 cmp eax, 0 5564 je xloop100 // 0 / 256. Blend 100 / 0. 5565 sub edi, esi 5566 cmp eax, 128 5567 je xloop50 // 128 /256 is 0.50. Blend 50 / 50. 
5568 5569 vmovd xmm0, eax // high fraction 0..255 5570 neg eax 5571 add eax, 256 5572 vmovd xmm5, eax // low fraction 256..1 5573 vpunpcklbw xmm5, xmm5, xmm0 5574 vpunpcklwd xmm5, xmm5, xmm5 5575 vbroadcastss ymm5, xmm5 5576 5577 mov eax, 0x80808080 // 128b for bias and rounding. 5578 vmovd xmm4, eax 5579 vbroadcastss ymm4, xmm4 5580 5581 xloop: 5582 vmovdqu ymm0, [esi] 5583 vmovdqu ymm2, [esi + edx] 5584 vpunpckhbw ymm1, ymm0, ymm2 // mutates 5585 vpunpcklbw ymm0, ymm0, ymm2 5586 vpsubb ymm1, ymm1, ymm4 // bias to signed image 5587 vpsubb ymm0, ymm0, ymm4 5588 vpmaddubsw ymm1, ymm5, ymm1 5589 vpmaddubsw ymm0, ymm5, ymm0 5590 vpaddw ymm1, ymm1, ymm4 // unbias and round 5591 vpaddw ymm0, ymm0, ymm4 5592 vpsrlw ymm1, ymm1, 8 5593 vpsrlw ymm0, ymm0, 8 5594 vpackuswb ymm0, ymm0, ymm1 // unmutates 5595 vmovdqu [esi + edi], ymm0 5596 lea esi, [esi + 32] 5597 sub ecx, 32 5598 jg xloop 5599 jmp xloop99 5600 5601 // Blend 50 / 50. 5602 xloop50: 5603 vmovdqu ymm0, [esi] 5604 vpavgb ymm0, ymm0, [esi + edx] 5605 vmovdqu [esi + edi], ymm0 5606 lea esi, [esi + 32] 5607 sub ecx, 32 5608 jg xloop50 5609 jmp xloop99 5610 5611 // Blend 100 / 0 - Copy row unchanged. 5612 xloop100: 5613 rep movsb 5614 5615 xloop99: 5616 pop edi 5617 pop esi 5618 vzeroupper 5619 ret 5620 } 5621 } 5622 #endif // HAS_INTERPOLATEROW_AVX2 5623 5624 // Bilinear filter 16x2 -> 16x1 5625 // TODO(fbarchard): Consider allowing 256 using memcpy. 5626 __declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, 5627 const uint8* src_ptr, 5628 ptrdiff_t src_stride, 5629 int dst_width, 5630 int source_y_fraction) { 5631 __asm { 5632 push esi 5633 push edi 5634 5635 mov edi, [esp + 8 + 4] // dst_ptr 5636 mov esi, [esp + 8 + 8] // src_ptr 5637 mov edx, [esp + 8 + 12] // src_stride 5638 mov ecx, [esp + 8 + 16] // dst_width 5639 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5640 sub edi, esi 5641 // Dispatch to specialized filters if applicable. 5642 cmp eax, 0 5643 je xloop100 // 0 /256. Blend 100 / 0. 
5644 cmp eax, 128 5645 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 5646 5647 movd xmm0, eax // high fraction 0..255 5648 neg eax 5649 add eax, 256 5650 movd xmm5, eax // low fraction 255..1 5651 punpcklbw xmm5, xmm0 5652 punpcklwd xmm5, xmm5 5653 pshufd xmm5, xmm5, 0 5654 mov eax, 0x80808080 // 128 for biasing image to signed. 5655 movd xmm4, eax 5656 pshufd xmm4, xmm4, 0x00 5657 5658 xloop: 5659 movdqu xmm0, [esi] 5660 movdqu xmm2, [esi + edx] 5661 movdqu xmm1, xmm0 5662 punpcklbw xmm0, xmm2 5663 punpckhbw xmm1, xmm2 5664 psubb xmm0, xmm4 // bias image by -128 5665 psubb xmm1, xmm4 5666 movdqa xmm2, xmm5 5667 movdqa xmm3, xmm5 5668 pmaddubsw xmm2, xmm0 5669 pmaddubsw xmm3, xmm1 5670 paddw xmm2, xmm4 5671 paddw xmm3, xmm4 5672 psrlw xmm2, 8 5673 psrlw xmm3, 8 5674 packuswb xmm2, xmm3 5675 movdqu [esi + edi], xmm2 5676 lea esi, [esi + 16] 5677 sub ecx, 16 5678 jg xloop 5679 jmp xloop99 5680 5681 // Blend 50 / 50. 5682 xloop50: 5683 movdqu xmm0, [esi] 5684 movdqu xmm1, [esi + edx] 5685 pavgb xmm0, xmm1 5686 movdqu [esi + edi], xmm0 5687 lea esi, [esi + 16] 5688 sub ecx, 16 5689 jg xloop50 5690 jmp xloop99 5691 5692 // Blend 100 / 0 - Copy row unchanged. 5693 xloop100: 5694 movdqu xmm0, [esi] 5695 movdqu [esi + edi], xmm0 5696 lea esi, [esi + 16] 5697 sub ecx, 16 5698 jg xloop100 5699 5700 xloop99: 5701 pop edi 5702 pop esi 5703 ret 5704 } 5705 } 5706 5707 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
5708 __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb, 5709 uint8* dst_argb, 5710 const uint8* shuffler, 5711 int width) { 5712 __asm { 5713 mov eax, [esp + 4] // src_argb 5714 mov edx, [esp + 8] // dst_argb 5715 mov ecx, [esp + 12] // shuffler 5716 movdqu xmm5, [ecx] 5717 mov ecx, [esp + 16] // width 5718 5719 wloop: 5720 movdqu xmm0, [eax] 5721 movdqu xmm1, [eax + 16] 5722 lea eax, [eax + 32] 5723 pshufb xmm0, xmm5 5724 pshufb xmm1, xmm5 5725 movdqu [edx], xmm0 5726 movdqu [edx + 16], xmm1 5727 lea edx, [edx + 32] 5728 sub ecx, 8 5729 jg wloop 5730 ret 5731 } 5732 } 5733 5734 #ifdef HAS_ARGBSHUFFLEROW_AVX2 5735 __declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb, 5736 uint8* dst_argb, 5737 const uint8* shuffler, 5738 int width) { 5739 __asm { 5740 mov eax, [esp + 4] // src_argb 5741 mov edx, [esp + 8] // dst_argb 5742 mov ecx, [esp + 12] // shuffler 5743 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 5744 mov ecx, [esp + 16] // width 5745 5746 wloop: 5747 vmovdqu ymm0, [eax] 5748 vmovdqu ymm1, [eax + 32] 5749 lea eax, [eax + 64] 5750 vpshufb ymm0, ymm0, ymm5 5751 vpshufb ymm1, ymm1, ymm5 5752 vmovdqu [edx], ymm0 5753 vmovdqu [edx + 32], ymm1 5754 lea edx, [edx + 64] 5755 sub ecx, 16 5756 jg wloop 5757 5758 vzeroupper 5759 ret 5760 } 5761 } 5762 #endif // HAS_ARGBSHUFFLEROW_AVX2 5763 5764 __declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb, 5765 uint8* dst_argb, 5766 const uint8* shuffler, 5767 int width) { 5768 __asm { 5769 push ebx 5770 push esi 5771 mov eax, [esp + 8 + 4] // src_argb 5772 mov edx, [esp + 8 + 8] // dst_argb 5773 mov esi, [esp + 8 + 12] // shuffler 5774 mov ecx, [esp + 8 + 16] // width 5775 pxor xmm5, xmm5 5776 5777 mov ebx, [esi] // shuffler 5778 cmp ebx, 0x03000102 5779 je shuf_3012 5780 cmp ebx, 0x00010203 5781 je shuf_0123 5782 cmp ebx, 0x00030201 5783 je shuf_0321 5784 cmp ebx, 0x02010003 5785 je shuf_2103 5786 5787 // TODO(fbarchard): Use one source pointer and 3 offsets. 
5788 shuf_any1: 5789 movzx ebx, byte ptr [esi] 5790 movzx ebx, byte ptr [eax + ebx] 5791 mov [edx], bl 5792 movzx ebx, byte ptr [esi + 1] 5793 movzx ebx, byte ptr [eax + ebx] 5794 mov [edx + 1], bl 5795 movzx ebx, byte ptr [esi + 2] 5796 movzx ebx, byte ptr [eax + ebx] 5797 mov [edx + 2], bl 5798 movzx ebx, byte ptr [esi + 3] 5799 movzx ebx, byte ptr [eax + ebx] 5800 mov [edx + 3], bl 5801 lea eax, [eax + 4] 5802 lea edx, [edx + 4] 5803 sub ecx, 1 5804 jg shuf_any1 5805 jmp shuf99 5806 5807 shuf_0123: 5808 movdqu xmm0, [eax] 5809 lea eax, [eax + 16] 5810 movdqa xmm1, xmm0 5811 punpcklbw xmm0, xmm5 5812 punpckhbw xmm1, xmm5 5813 pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB 5814 pshuflw xmm0, xmm0, 01Bh 5815 pshufhw xmm1, xmm1, 01Bh 5816 pshuflw xmm1, xmm1, 01Bh 5817 packuswb xmm0, xmm1 5818 movdqu [edx], xmm0 5819 lea edx, [edx + 16] 5820 sub ecx, 4 5821 jg shuf_0123 5822 jmp shuf99 5823 5824 shuf_0321: 5825 movdqu xmm0, [eax] 5826 lea eax, [eax + 16] 5827 movdqa xmm1, xmm0 5828 punpcklbw xmm0, xmm5 5829 punpckhbw xmm1, xmm5 5830 pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB 5831 pshuflw xmm0, xmm0, 039h 5832 pshufhw xmm1, xmm1, 039h 5833 pshuflw xmm1, xmm1, 039h 5834 packuswb xmm0, xmm1 5835 movdqu [edx], xmm0 5836 lea edx, [edx + 16] 5837 sub ecx, 4 5838 jg shuf_0321 5839 jmp shuf99 5840 5841 shuf_2103: 5842 movdqu xmm0, [eax] 5843 lea eax, [eax + 16] 5844 movdqa xmm1, xmm0 5845 punpcklbw xmm0, xmm5 5846 punpckhbw xmm1, xmm5 5847 pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA 5848 pshuflw xmm0, xmm0, 093h 5849 pshufhw xmm1, xmm1, 093h 5850 pshuflw xmm1, xmm1, 093h 5851 packuswb xmm0, xmm1 5852 movdqu [edx], xmm0 5853 lea edx, [edx + 16] 5854 sub ecx, 4 5855 jg shuf_2103 5856 jmp shuf99 5857 5858 shuf_3012: 5859 movdqu xmm0, [eax] 5860 lea eax, [eax + 16] 5861 movdqa xmm1, xmm0 5862 punpcklbw xmm0, xmm5 5863 punpckhbw xmm1, xmm5 5864 pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB 5865 pshuflw 
xmm0, xmm0, 0C6h 5866 pshufhw xmm1, xmm1, 0C6h 5867 pshuflw xmm1, xmm1, 0C6h 5868 packuswb xmm0, xmm1 5869 movdqu [edx], xmm0 5870 lea edx, [edx + 16] 5871 sub ecx, 4 5872 jg shuf_3012 5873 5874 shuf99: 5875 pop esi 5876 pop ebx 5877 ret 5878 } 5879 } 5880 5881 // YUY2 - Macro-pixel = 2 image pixels 5882 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 5883 5884 // UYVY - Macro-pixel = 2 image pixels 5885 // U0Y0V0Y1 5886 5887 __declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y, 5888 const uint8* src_u, 5889 const uint8* src_v, 5890 uint8* dst_frame, 5891 int width) { 5892 __asm { 5893 push esi 5894 push edi 5895 mov eax, [esp + 8 + 4] // src_y 5896 mov esi, [esp + 8 + 8] // src_u 5897 mov edx, [esp + 8 + 12] // src_v 5898 mov edi, [esp + 8 + 16] // dst_frame 5899 mov ecx, [esp + 8 + 20] // width 5900 sub edx, esi 5901 5902 convertloop: 5903 movq xmm2, qword ptr [esi] // U 5904 movq xmm3, qword ptr [esi + edx] // V 5905 lea esi, [esi + 8] 5906 punpcklbw xmm2, xmm3 // UV 5907 movdqu xmm0, [eax] // Y 5908 lea eax, [eax + 16] 5909 movdqa xmm1, xmm0 5910 punpcklbw xmm0, xmm2 // YUYV 5911 punpckhbw xmm1, xmm2 5912 movdqu [edi], xmm0 5913 movdqu [edi + 16], xmm1 5914 lea edi, [edi + 32] 5915 sub ecx, 16 5916 jg convertloop 5917 5918 pop edi 5919 pop esi 5920 ret 5921 } 5922 } 5923 5924 __declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y, 5925 const uint8* src_u, 5926 const uint8* src_v, 5927 uint8* dst_frame, 5928 int width) { 5929 __asm { 5930 push esi 5931 push edi 5932 mov eax, [esp + 8 + 4] // src_y 5933 mov esi, [esp + 8 + 8] // src_u 5934 mov edx, [esp + 8 + 12] // src_v 5935 mov edi, [esp + 8 + 16] // dst_frame 5936 mov ecx, [esp + 8 + 20] // width 5937 sub edx, esi 5938 5939 convertloop: 5940 movq xmm2, qword ptr [esi] // U 5941 movq xmm3, qword ptr [esi + edx] // V 5942 lea esi, [esi + 8] 5943 punpcklbw xmm2, xmm3 // UV 5944 movdqu xmm0, [eax] // Y 5945 movdqa xmm1, xmm2 5946 lea eax, [eax + 16] 5947 punpcklbw xmm1, xmm0 // UYVY 5948 punpckhbw xmm2, xmm0 
5949 movdqu [edi], xmm1 5950 movdqu [edi + 16], xmm2 5951 lea edi, [edi + 32] 5952 sub ecx, 16 5953 jg convertloop 5954 5955 pop edi 5956 pop esi 5957 ret 5958 } 5959 } 5960 5961 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 5962 __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb, 5963 uint8* dst_argb, 5964 const float* poly, 5965 int width) { 5966 __asm { 5967 push esi 5968 mov eax, [esp + 4 + 4] /* src_argb */ 5969 mov edx, [esp + 4 + 8] /* dst_argb */ 5970 mov esi, [esp + 4 + 12] /* poly */ 5971 mov ecx, [esp + 4 + 16] /* width */ 5972 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. 5973 5974 // 2 pixel loop. 5975 convertloop: 5976 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel 5977 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel 5978 movq xmm0, qword ptr [eax] // BGRABGRA 5979 lea eax, [eax + 8] 5980 punpcklbw xmm0, xmm3 5981 movdqa xmm4, xmm0 5982 punpcklwd xmm0, xmm3 // pixel 0 5983 punpckhwd xmm4, xmm3 // pixel 1 5984 cvtdq2ps xmm0, xmm0 // 4 floats 5985 cvtdq2ps xmm4, xmm4 5986 movdqa xmm1, xmm0 // X 5987 movdqa xmm5, xmm4 5988 mulps xmm0, [esi + 16] // C1 * X 5989 mulps xmm4, [esi + 16] 5990 addps xmm0, [esi] // result = C0 + C1 * X 5991 addps xmm4, [esi] 5992 movdqa xmm2, xmm1 5993 movdqa xmm6, xmm5 5994 mulps xmm2, xmm1 // X * X 5995 mulps xmm6, xmm5 5996 mulps xmm1, xmm2 // X * X * X 5997 mulps xmm5, xmm6 5998 mulps xmm2, [esi + 32] // C2 * X * X 5999 mulps xmm6, [esi + 32] 6000 mulps xmm1, [esi + 48] // C3 * X * X * X 6001 mulps xmm5, [esi + 48] 6002 addps xmm0, xmm2 // result += C2 * X * X 6003 addps xmm4, xmm6 6004 addps xmm0, xmm1 // result += C3 * X * X * X 6005 addps xmm4, xmm5 6006 cvttps2dq xmm0, xmm0 6007 cvttps2dq xmm4, xmm4 6008 packuswb xmm0, xmm4 6009 packuswb xmm0, xmm0 6010 movq qword ptr [edx], xmm0 6011 lea edx, [edx + 8] 6012 sub ecx, 2 6013 jg convertloop 6014 pop esi 6015 ret 6016 } 6017 } 6018 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 6019 6020 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 6021 __declspec(naked) void 
ARGBPolynomialRow_AVX2(const uint8* src_argb, 6022 uint8* dst_argb, 6023 const float* poly, 6024 int width) { 6025 __asm { 6026 mov eax, [esp + 4] /* src_argb */ 6027 mov edx, [esp + 8] /* dst_argb */ 6028 mov ecx, [esp + 12] /* poly */ 6029 vbroadcastf128 ymm4, [ecx] // C0 6030 vbroadcastf128 ymm5, [ecx + 16] // C1 6031 vbroadcastf128 ymm6, [ecx + 32] // C2 6032 vbroadcastf128 ymm7, [ecx + 48] // C3 6033 mov ecx, [esp + 16] /* width */ 6034 6035 // 2 pixel loop. 6036 convertloop: 6037 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels 6038 lea eax, [eax + 8] 6039 vcvtdq2ps ymm0, ymm0 // X 8 floats 6040 vmulps ymm2, ymm0, ymm0 // X * X 6041 vmulps ymm3, ymm0, ymm7 // C3 * X 6042 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X 6043 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X 6044 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X 6045 vcvttps2dq ymm0, ymm0 6046 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 6047 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 6048 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 6049 vmovq qword ptr [edx], xmm0 6050 lea edx, [edx + 8] 6051 sub ecx, 2 6052 jg convertloop 6053 vzeroupper 6054 ret 6055 } 6056 } 6057 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 6058 6059 #ifdef HAS_HALFFLOATROW_SSE2 6060 static float kExpBias = 1.9259299444e-34f; 6061 __declspec(naked) void HalfFloatRow_SSE2(const uint16* src, 6062 uint16* dst, 6063 float scale, 6064 int width) { 6065 __asm { 6066 mov eax, [esp + 4] /* src */ 6067 mov edx, [esp + 8] /* dst */ 6068 movd xmm4, dword ptr [esp + 12] /* scale */ 6069 mov ecx, [esp + 16] /* width */ 6070 mulss xmm4, kExpBias 6071 pshufd xmm4, xmm4, 0 6072 pxor xmm5, xmm5 6073 sub edx, eax 6074 6075 // 8 pixel loop. 
6076 convertloop: 6077 movdqu xmm2, xmmword ptr [eax] // 8 shorts 6078 add eax, 16 6079 movdqa xmm3, xmm2 6080 punpcklwd xmm2, xmm5 6081 cvtdq2ps xmm2, xmm2 // convert 8 ints to floats 6082 punpckhwd xmm3, xmm5 6083 cvtdq2ps xmm3, xmm3 6084 mulps xmm2, xmm4 6085 mulps xmm3, xmm4 6086 psrld xmm2, 13 6087 psrld xmm3, 13 6088 packssdw xmm2, xmm3 6089 movdqu [eax + edx - 16], xmm2 6090 sub ecx, 8 6091 jg convertloop 6092 ret 6093 } 6094 } 6095 #endif // HAS_HALFFLOATROW_SSE2 6096 6097 #ifdef HAS_HALFFLOATROW_AVX2 6098 __declspec(naked) void HalfFloatRow_AVX2(const uint16* src, 6099 uint16* dst, 6100 float scale, 6101 int width) { 6102 __asm { 6103 mov eax, [esp + 4] /* src */ 6104 mov edx, [esp + 8] /* dst */ 6105 movd xmm4, dword ptr [esp + 12] /* scale */ 6106 mov ecx, [esp + 16] /* width */ 6107 6108 vmulss xmm4, xmm4, kExpBias 6109 vbroadcastss ymm4, xmm4 6110 vpxor ymm5, ymm5, ymm5 6111 sub edx, eax 6112 6113 // 16 pixel loop. 6114 convertloop: 6115 vmovdqu ymm2, [eax] // 16 shorts 6116 add eax, 32 6117 vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints 6118 vpunpcklwd ymm2, ymm2, ymm5 6119 vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats 6120 vcvtdq2ps ymm2, ymm2 6121 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. 6122 vmulps ymm2, ymm2, ymm4 6123 vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate 6124 vpsrld ymm2, ymm2, 13 6125 vpackssdw ymm2, ymm2, ymm3 6126 vmovdqu [eax + edx - 32], ymm2 6127 sub ecx, 16 6128 jg convertloop 6129 vzeroupper 6130 ret 6131 } 6132 } 6133 #endif // HAS_HALFFLOATROW_AVX2 6134 6135 #ifdef HAS_HALFFLOATROW_F16C 6136 __declspec(naked) void HalfFloatRow_F16C(const uint16* src, 6137 uint16* dst, 6138 float scale, 6139 int width) { 6140 __asm { 6141 mov eax, [esp + 4] /* src */ 6142 mov edx, [esp + 8] /* dst */ 6143 vbroadcastss ymm4, [esp + 12] /* scale */ 6144 mov ecx, [esp + 16] /* width */ 6145 sub edx, eax 6146 6147 // 16 pixel loop. 
6148 convertloop: 6149 vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints 6150 vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts 6151 add eax, 32 6152 vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats 6153 vcvtdq2ps ymm3, ymm3 6154 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 6155 vmulps ymm3, ymm3, ymm4 6156 vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate 6157 vcvtps2ph xmm3, ymm3, 3 6158 vmovdqu [eax + edx + 32], xmm2 6159 vmovdqu [eax + edx + 32 + 16], xmm3 6160 sub ecx, 16 6161 jg convertloop 6162 vzeroupper 6163 ret 6164 } 6165 } 6166 #endif // HAS_HALFFLOATROW_F16C 6167 6168 #ifdef HAS_ARGBCOLORTABLEROW_X86 6169 // Tranform ARGB pixels with color table. 6170 __declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb, 6171 const uint8* table_argb, 6172 int width) { 6173 __asm { 6174 push esi 6175 mov eax, [esp + 4 + 4] /* dst_argb */ 6176 mov esi, [esp + 4 + 8] /* table_argb */ 6177 mov ecx, [esp + 4 + 12] /* width */ 6178 6179 // 1 pixel loop. 6180 convertloop: 6181 movzx edx, byte ptr [eax] 6182 lea eax, [eax + 4] 6183 movzx edx, byte ptr [esi + edx * 4] 6184 mov byte ptr [eax - 4], dl 6185 movzx edx, byte ptr [eax - 4 + 1] 6186 movzx edx, byte ptr [esi + edx * 4 + 1] 6187 mov byte ptr [eax - 4 + 1], dl 6188 movzx edx, byte ptr [eax - 4 + 2] 6189 movzx edx, byte ptr [esi + edx * 4 + 2] 6190 mov byte ptr [eax - 4 + 2], dl 6191 movzx edx, byte ptr [eax - 4 + 3] 6192 movzx edx, byte ptr [esi + edx * 4 + 3] 6193 mov byte ptr [eax - 4 + 3], dl 6194 dec ecx 6195 jg convertloop 6196 pop esi 6197 ret 6198 } 6199 } 6200 #endif // HAS_ARGBCOLORTABLEROW_X86 6201 6202 #ifdef HAS_RGBCOLORTABLEROW_X86 6203 // Tranform RGB pixels with color table. 
6204 __declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb, 6205 const uint8* table_argb, 6206 int width) { 6207 __asm { 6208 push esi 6209 mov eax, [esp + 4 + 4] /* dst_argb */ 6210 mov esi, [esp + 4 + 8] /* table_argb */ 6211 mov ecx, [esp + 4 + 12] /* width */ 6212 6213 // 1 pixel loop. 6214 convertloop: 6215 movzx edx, byte ptr [eax] 6216 lea eax, [eax + 4] 6217 movzx edx, byte ptr [esi + edx * 4] 6218 mov byte ptr [eax - 4], dl 6219 movzx edx, byte ptr [eax - 4 + 1] 6220 movzx edx, byte ptr [esi + edx * 4 + 1] 6221 mov byte ptr [eax - 4 + 1], dl 6222 movzx edx, byte ptr [eax - 4 + 2] 6223 movzx edx, byte ptr [esi + edx * 4 + 2] 6224 mov byte ptr [eax - 4 + 2], dl 6225 dec ecx 6226 jg convertloop 6227 6228 pop esi 6229 ret 6230 } 6231 } 6232 #endif // HAS_RGBCOLORTABLEROW_X86 6233 6234 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 6235 // Tranform RGB pixels with luma table. 6236 __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, 6237 uint8* dst_argb, 6238 int width, 6239 const uint8* luma, 6240 uint32 lumacoeff) { 6241 __asm { 6242 push esi 6243 push edi 6244 mov eax, [esp + 8 + 4] /* src_argb */ 6245 mov edi, [esp + 8 + 8] /* dst_argb */ 6246 mov ecx, [esp + 8 + 12] /* width */ 6247 movd xmm2, dword ptr [esp + 8 + 16] // luma table 6248 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff 6249 pshufd xmm2, xmm2, 0 6250 pshufd xmm3, xmm3, 0 6251 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 6252 psllw xmm4, 8 6253 pxor xmm5, xmm5 6254 6255 // 4 pixel loop. 
6256 convertloop: 6257 movdqu xmm0, xmmword ptr [eax] // generate luma ptr 6258 pmaddubsw xmm0, xmm3 6259 phaddw xmm0, xmm0 6260 pand xmm0, xmm4 // mask out low bits 6261 punpcklwd xmm0, xmm5 6262 paddd xmm0, xmm2 // add table base 6263 movd esi, xmm0 6264 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 6265 6266 movzx edx, byte ptr [eax] 6267 movzx edx, byte ptr [esi + edx] 6268 mov byte ptr [edi], dl 6269 movzx edx, byte ptr [eax + 1] 6270 movzx edx, byte ptr [esi + edx] 6271 mov byte ptr [edi + 1], dl 6272 movzx edx, byte ptr [eax + 2] 6273 movzx edx, byte ptr [esi + edx] 6274 mov byte ptr [edi + 2], dl 6275 movzx edx, byte ptr [eax + 3] // copy alpha. 6276 mov byte ptr [edi + 3], dl 6277 6278 movd esi, xmm0 6279 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 6280 6281 movzx edx, byte ptr [eax + 4] 6282 movzx edx, byte ptr [esi + edx] 6283 mov byte ptr [edi + 4], dl 6284 movzx edx, byte ptr [eax + 5] 6285 movzx edx, byte ptr [esi + edx] 6286 mov byte ptr [edi + 5], dl 6287 movzx edx, byte ptr [eax + 6] 6288 movzx edx, byte ptr [esi + edx] 6289 mov byte ptr [edi + 6], dl 6290 movzx edx, byte ptr [eax + 7] // copy alpha. 6291 mov byte ptr [edi + 7], dl 6292 6293 movd esi, xmm0 6294 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 6295 6296 movzx edx, byte ptr [eax + 8] 6297 movzx edx, byte ptr [esi + edx] 6298 mov byte ptr [edi + 8], dl 6299 movzx edx, byte ptr [eax + 9] 6300 movzx edx, byte ptr [esi + edx] 6301 mov byte ptr [edi + 9], dl 6302 movzx edx, byte ptr [eax + 10] 6303 movzx edx, byte ptr [esi + edx] 6304 mov byte ptr [edi + 10], dl 6305 movzx edx, byte ptr [eax + 11] // copy alpha. 
6306 mov byte ptr [edi + 11], dl 6307 6308 movd esi, xmm0 6309 6310 movzx edx, byte ptr [eax + 12] 6311 movzx edx, byte ptr [esi + edx] 6312 mov byte ptr [edi + 12], dl 6313 movzx edx, byte ptr [eax + 13] 6314 movzx edx, byte ptr [esi + edx] 6315 mov byte ptr [edi + 13], dl 6316 movzx edx, byte ptr [eax + 14] 6317 movzx edx, byte ptr [esi + edx] 6318 mov byte ptr [edi + 14], dl 6319 movzx edx, byte ptr [eax + 15] // copy alpha. 6320 mov byte ptr [edi + 15], dl 6321 6322 lea eax, [eax + 16] 6323 lea edi, [edi + 16] 6324 sub ecx, 4 6325 jg convertloop 6326 6327 pop edi 6328 pop esi 6329 ret 6330 } 6331 } 6332 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6333 6334 #endif // defined(_M_X64) 6335 6336 #ifdef __cplusplus 6337 } // extern "C" 6338 } // namespace libyuv 6339 #endif 6340 6341 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6342