/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \
    defined(_MSC_VER) && !defined(__clang__)
#include <emmintrin.h>
#include <tmmintrin.h>  // For _mm_maddubs_epi16
#endif

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C.
#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \
    defined(_MSC_VER) && !defined(__clang__)

struct YuvConstants {
  lvec8 kUVToB;     // 0
  lvec8 kUVToG;     // 32
  lvec8 kUVToR;     // 64
  lvec16 kUVBiasB;  // 96
  lvec16 kUVBiasG;  // 128
  lvec16 kUVBiasR;  // 160
  lvec16 kYToRgb;   // 192
};

// BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164              - V * -1.596
// G = (Y - 16) * 1.164 - U *  0.391 - V *  0.813
// B = (Y - 16) * 1.164 - U * -2.018

// Y contribution to R,G,B.  Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)

// BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};
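
// A scalar sketch of the fixed-point math the SIMD rows in this file perform
// with the constants above: Y is duplicated into both bytes of a 16-bit lane
// and scaled by YG (vpmulhuw), the UV products are subtracted from the BB/BG/BR
// biases, and the sums are shifted down 6 bits and clamped to bytes.  The
// helper names below are illustrative only and are not part of the library;
// the block is kept out of the build.
#if 0
static uint8 ClampToByte(int32 v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvPixelSketch(uint8 y, uint8 u, uint8 v,
                           uint8* b, uint8* g, uint8* r) {
  int32 y1 = (int32)((uint32)(y * 0x0101 * YG) >> 16);  // ~ y * 1.164 * 64
  *b = ClampToByte((y1 + BB - UB * u) >> 6);
  *g = ClampToByte((y1 + BG - UG * u - VG * v) >> 6);
  *r = ClampToByte((y1 + BR - VR * v) >> 6);
}
#endif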

// BT601 constants for NV21 where chroma plane is VU instead of UV.
static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

#undef YG
#undef YGB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR

// JPEG YUV to RGB reference
// * R = Y                - V * -1.40200
// * G = Y - U *  0.34414 - V *  0.71414
// * B = Y - U * -1.77200

// Y contribution to R,G,B.  Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGBJ 32 /* 64 / 2 */

// U and V contributions to R,G,B.
#define UBJ -113 /* round(-1.77200 * 64) */
#define UGJ 22 /* round(0.34414 * 64) */
#define VGJ 46 /* round(0.71414 * 64) */
#define VRJ -90 /* round(-1.40200 * 64) */

// Bias values to round, and to subtract 128 from U and V.
#define BBJ (UBJ * 128 + YGBJ)
#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
#define BRJ (VRJ * 128 + YGBJ)

// JPEG constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
};

#undef YGJ
#undef YGBJ
#undef UBJ
#undef UGJ
#undef VGJ
#undef VRJ
#undef BBJ
#undef BGJ
#undef BRJ

// 64 bit
#if defined(_M_X64)
#if defined(HAS_I422TOARGBROW_SSSE3)
void I422ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* dst_argb,
                         int width) {
  __m128i xmm0, xmm1, xmm2, xmm3;
  const __m128i xmm5 = _mm_set1_epi8(-1);
  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;

  while (width > 0) {
    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
    xmm1 = _mm_loadu_si128(&xmm0);
    xmm2 = _mm_loadu_si128(&xmm0);
    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kYuvConstants.kUVToB);
    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kYuvConstants.kUVToG);
    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kYuvConstants.kUVToR);
    xmm0 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasB, xmm0);
    xmm1 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasG, xmm1);
    xmm2 = _mm_sub_epi16(*(__m128i*)kYuvConstants.kUVBiasR, xmm2);
    xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
    xmm3 = _mm_unpacklo_epi8(xmm3, xmm3);
    xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)kYuvConstants.kYToRgb);
    xmm0 = _mm_adds_epi16(xmm0, xmm3);
    xmm1 = _mm_adds_epi16(xmm1, xmm3);
    xmm2 = _mm_adds_epi16(xmm2, xmm3);
    xmm0 = _mm_srai_epi16(xmm0, 6);
    xmm1 = _mm_srai_epi16(xmm1, 6);
    xmm2 = _mm_srai_epi16(xmm2, 6);
    xmm0 = _mm_packus_epi16(xmm0, xmm0);
    xmm1 = _mm_packus_epi16(xmm1, xmm1);
    xmm2 = _mm_packus_epi16(xmm2, xmm2);
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
    xmm1 = _mm_loadu_si128(&xmm0);
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);

    _mm_storeu_si128((__m128i *)dst_argb, xmm0);
    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);

    y_buf += 8;
    u_buf += 4;
    dst_argb += 32;
    width -= 8;
  }
}
#endif
// 32 bit
#else  // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// 7 bit fixed point 0.5.
static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
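
// The shuffle tables above are pshufb control masks: each output byte i is
// src[mask[i] & 15], or zero when bit 7 of mask[i] is set (the 128u entries).
// A scalar sketch of that semantic follows; the helper name is illustrative
// only and the block is not compiled.
#if 0
static void Pshufb16Sketch(const uint8* src, const uint8* mask, uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (uint8)((mask[i] & 0x80) ? 0 : src[mask[i] & 15]);
  }
}
#endif
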
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_y
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    pslld xmm5, 24

  convertloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}

#ifdef HAS_J400TOARGBROW_AVX2
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked)
void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_y
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    vpcmpeqb ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld ymm5, ymm5, 24

  convertloop:
    vmovdqu xmm0, [eax]
    lea eax, [eax + 16]
    vpermq ymm0, ymm0, 0xd8
    vpunpcklbw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    vpunpckhwd ymm1, ymm0, ymm0
    vpunpcklwd ymm0, ymm0, ymm0
    vpor ymm0, ymm0, ymm5
    vpor ymm1, ymm1, ymm5
    vmovdqu [edx], ymm0
    vmovdqu [edx + 32], ymm1
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_J400TOARGBROW_AVX2

__declspec(naked)
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_rgb24
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, kShuffleMaskRGB24ToARGB

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov eax, [esp + 4]   // src_raw
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, kShuffleMaskRAWToARGB

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqu [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqu [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqu [edx + 16], xmm1
    por xmm3, xmm5
    movdqu [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
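
// Scalar sketch of the bit replication above: a 5-bit field expands to 8 bits
// as (v << 3) | (v >> 2) and a 6-bit field as (v << 2) | (v >> 4).  The SIMD
// code folds the shifts into the 0x0108 and 0x2080 multipliers used with
// pmulhuw.  Helper names are illustrative only and the block is not compiled.
#if 0
static uint8 Replicate5(uint8 v) { return (uint8)((v << 3) | (v >> 2)); }
static uint8 Replicate6(uint8 v) { return (uint8)((v << 2) | (v >> 4)); }

static void RGB565PixelToARGBSketch(uint16 pix565, uint8* dst_argb) {
  dst_argb[0] = Replicate5((uint8)(pix565 & 0x1f));         // B
  dst_argb[1] = Replicate6((uint8)((pix565 >> 5) & 0x3f));  // G
  dst_argb[2] = Replicate5((uint8)(pix565 >> 11));          // R
  dst_argb[3] = 255;                                        // A
}
#endif
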
__declspec(naked)
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3   // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    pcmpeqb xmm4, xmm4   // generate mask 0x07e007e0 for Green
    psllw xmm4, 10
    psrlw xmm4, 5
    pcmpeqb xmm7, xmm7   // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4]   // src_rgb565
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    pand xmm1, xmm3      // R in upper 5 bits
    psllw xmm2, 11       // B in upper 5 bits
    pmulhuw xmm1, xmm5   // * (256 + 8)
    pmulhuw xmm2, xmm5   // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2       // RB
    pand xmm0, xmm4      // G in middle 6 bits
    pmulhuw xmm0, xmm6   // << 5 * (256 + 4)
    por xmm0, xmm7       // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

#ifdef HAS_RGB565TOARGBROW_AVX2
// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
__declspec(naked)
void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
    vpsllw ymm4, ymm4, 10
    vpsrlw ymm4, ymm4, 5
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8

    mov eax, [esp + 4]   // src_rgb565
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of bgr565
    vpand ymm1, ymm0, ymm3     // R in upper 5 bits
    vpsllw ymm2, ymm0, 11      // B in upper 5 bits
    vpmulhuw ymm1, ymm1, ymm5  // * (256 + 8)
    vpmulhuw ymm2, ymm2, ymm5  // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2      // RB
    vpand ymm0, ymm0, ymm4     // G in middle 6 bits
    vpmulhuw ymm0, ymm0, ymm6  // << 5 * (256 + 4)
    vpor ymm0, ymm0, ymm7      // AG
    vpermq ymm0, ymm0, 0xd8    // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1       // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_RGB565TOARGBROW_AVX2

#ifdef HAS_ARGB1555TOARGBROW_AVX2
__declspec(naked)
void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    vmovd xmm5, eax
    vbroadcastss ymm5, xmm5
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    vbroadcastss ymm6, xmm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
    vpsllw ymm3, ymm3, 11
    vpsrlw ymm4, ymm3, 6       // generate mask 0x03e003e0 for Green
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
    vpsllw ymm7, ymm7, 8

    mov eax, [esp + 4]   // src_argb1555
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 16 pixels of 1555
    vpsllw ymm1, ymm0, 1       // R in upper 5 bits
    vpsllw ymm2, ymm0, 11      // B in upper 5 bits
    vpand ymm1, ymm1, ymm3
    vpmulhuw ymm2, ymm2, ymm5  // * (256 + 8)
    vpmulhuw ymm1, ymm1, ymm5  // * (256 + 8)
    vpsllw ymm1, ymm1, 8
    vpor ymm1, ymm1, ymm2      // RB
    vpsraw ymm2, ymm0, 8       // A
    vpand ymm0, ymm0, ymm4     // G in middle 5 bits
    vpmulhuw ymm0, ymm0, ymm6  // << 6 * (256 + 8)
    vpand ymm2, ymm2, ymm7
    vpor ymm0, ymm0, ymm2      // AG
    vpermq ymm0, ymm0, 0xd8    // mutate for unpack
    vpermq ymm1, ymm1, 0xd8
    vpunpckhbw ymm2, ymm1, ymm0
    vpunpcklbw ymm1, ymm1, ymm0
    vmovdqu [eax * 2 + edx], ymm1       // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm2  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB1555TOARGBROW_AVX2

#ifdef HAS_ARGB4444TOARGBROW_AVX2
__declspec(naked)
void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    vmovd xmm4, eax
    vbroadcastss ymm4, xmm4
    vpslld ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
    mov eax, [esp + 4]    // src_argb4444
    mov edx, [esp + 8]    // dst_argb
    mov ecx, [esp + 12]   // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    vmovdqu ymm0, [eax]   // fetch 16 pixels of bgra4444
    vpand ymm2, ymm0, ymm5  // mask high nibbles
    vpand ymm0, ymm0, ymm4  // mask low nibbles
    vpsrlw ymm3, ymm2, 4
    vpsllw ymm1, ymm0, 4
    vpor ymm2, ymm2, ymm3
    vpor ymm0, ymm0, ymm1
    vpermq ymm0, ymm0, 0xd8  // mutate for unpack
    vpermq ymm2, ymm2, 0xd8
    vpunpckhbw ymm1, ymm0, ymm2
    vpunpcklbw ymm0, ymm0, ymm2
    vmovdqu [eax * 2 + edx], ymm0       // store 8 pixels of ARGB
    vmovdqu [eax * 2 + edx + 32], ymm1  // store next 8 pixels of ARGB
    lea eax, [eax + 32]
    sub ecx, 16
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGB4444TOARGBROW_AVX2

// 24 instructions
__declspec(naked)
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3   // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    movdqa xmm4, xmm3    // generate mask 0x03e003e0 for Green
    psrlw xmm4, 6
    pcmpeqb xmm7, xmm7   // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4]   // src_argb1555
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    psllw xmm1, 1        // R in upper 5 bits
    psllw xmm2, 11       // B in upper 5 bits
    pand xmm1, xmm3
    pmulhuw xmm2, xmm5   // * (256 + 8)
    pmulhuw xmm1, xmm5   // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2       // RB
    movdqa xmm2, xmm0
    pand xmm0, xmm4      // G in middle 5 bits
    psraw xmm2, 8        // A
    pmulhuw xmm0, xmm6   // << 6 * (256 + 8)
    pand xmm2, xmm7
    por xmm0, xmm2       // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqu [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked)
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4    // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4]   // src_argb4444
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

  convertloop:
    movdqu xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4      // mask low nibbles
    pand xmm2, xmm5      // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1
    por xmm2, xmm3
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqu [eax * 2 + edx], xmm0       // store 4 pixels of ARGB
    movdqu [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    movdqa xmm6, kShuffleMaskARGBToRGB24

  convertloop:
    movdqu xmm0, [eax]   // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq xmm1, 4       // 8 bytes from 1
    pslldq xmm4, 12      // 4 bytes from 1 for 0
    movdqa xmm5, xmm2    // 8 bytes from 2 for 1
    por xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq xmm5, 8       // 8 bytes from 2 for 1
    movdqu [edx], xmm0   // store 0
    por xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq xmm2, 8       // 4 bytes from 2
    pslldq xmm3, 4       // 12 bytes from 3 for 2
    por xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    movdqa xmm6, kShuffleMaskARGBToRAW

  convertloop:
    movdqu xmm0, [eax]   // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq xmm1, 4       // 8 bytes from 1
    pslldq xmm4, 12      // 4 bytes from 1 for 0
    movdqa xmm5, xmm2    // 8 bytes from 2 for 1
    por xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq xmm5, 8       // 8 bytes from 2 for 1
    movdqu [edx], xmm0   // store 0
    por xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq xmm2, 8       // 4 bytes from 2
    pslldq xmm3, 4       // 12 bytes from 3 for 2
    por xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

// 4 pixels
__declspec(naked)
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm3, xmm3   // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4   // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5   // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax]   // fetch 4 pixels of argb
    movdqa xmm1, xmm0    // B
    movdqa xmm2, xmm0    // G
    pslld xmm0, 8        // R
    psrld xmm1, 3        // B
    psrld xmm2, 5        // G
    psrad xmm0, 16       // R
    pand xmm1, xmm3      // B
    pand xmm2, xmm4      // G
    pand xmm0, xmm5      // R
    por xmm1, xmm2       // BG
    por xmm0, xmm1       // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

// 4 pixels
__declspec(naked)
void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int pix) {
  __asm {

    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    movd xmm6, [esp + 12]  // dither4
    mov ecx, [esp + 16]  // pix
    punpcklbw xmm6, xmm6  // make dither 16 bytes
    movdqa xmm7, xmm6
    punpcklwd xmm6, xmm6
    punpckhwd xmm7, xmm7
    pcmpeqb xmm3, xmm3   // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4   // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5   // generate mask 0xfffff800
    pslld xmm5, 11

  convertloop:
    movdqu xmm0, [eax]   // fetch 4 pixels of argb
    paddusb xmm0, xmm6   // add dither
    movdqa xmm1, xmm0    // B
    movdqa xmm2, xmm0    // G
    pslld xmm0, 8        // R
    psrld xmm1, 3        // B
    psrld xmm2, 5        // G
    psrad xmm0, 16       // R
    pand xmm1, xmm3      // B
    pand xmm2, xmm4      // G
    pand xmm0, xmm5      // R
    por xmm1, xmm2       // BG
    por xmm0, xmm1       // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
__declspec(naked)
void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
                                const uint32 dither4, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    vbroadcastss xmm6, [esp + 12]  // dither4
    mov ecx, [esp + 16]  // pix
    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
    vpermq ymm6, ymm6, 0xd8
    vpunpcklwd ymm6, ymm6, ymm6
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11      // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpaddusb ymm0, ymm0, ymm6  // add dither
    vpsrld ymm2, ymm0, 5       // G
    vpsrld ymm1, ymm0, 3       // B
    vpsrld ymm0, ymm0, 8       // R
    vpand ymm2, ymm2, ymm4     // G
    vpand ymm1, ymm1, ymm3     // B
    vpand ymm0, ymm0, ymm5     // R
    vpor ymm1, ymm1, ymm2      // BG
    vpor ymm0, ymm0, ymm1      // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked)
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm4, xmm4   // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4    // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4    // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7   // generate mask 0xffff8000
    pslld xmm7, 15

  convertloop:
    movdqu xmm0, [eax]   // fetch 4 pixels of argb
    movdqa xmm1, xmm0    // B
    movdqa xmm2, xmm0    // G
    movdqa xmm3, xmm0    // R
    psrad xmm0, 16       // A
    psrld xmm1, 3        // B
    psrld xmm2, 6        // G
    psrld xmm3, 9        // R
    pand xmm0, xmm7      // A
    pand xmm1, xmm4      // B
    pand xmm2, xmm5      // G
    pand xmm3, xmm6      // R
    por xmm0, xmm1       // BA
    por xmm2, xmm3       // GR
    por xmm0, xmm2       // BGRA
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

__declspec(naked)
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm4, xmm4   // generate mask 0xf000f000
    psllw xmm4, 12
    movdqa xmm3, xmm4    // generate mask 0x00f000f0
    psrlw xmm3, 8

  convertloop:
    movdqu xmm0, [eax]   // fetch 4 pixels of argb
    movdqa xmm1, xmm0
    pand xmm0, xmm3      // low nibble
    pand xmm1, xmm4      // high nibble
    psrld xmm0, 4
    psrld xmm1, 8
    por xmm0, xmm1
    packuswb xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTORGB565ROW_AVX2
__declspec(naked)
void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    vpcmpeqb ymm3, ymm3, ymm3  // generate mask 0x0000001f
    vpsrld ymm3, ymm3, 27
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0x000007e0
    vpsrld ymm4, ymm4, 26
    vpslld ymm4, ymm4, 5
    vpslld ymm5, ymm3, 11      // generate mask 0x0000f800

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld ymm2, ymm0, 5       // G
    vpsrld ymm1, ymm0, 3       // B
    vpsrld ymm0, ymm0, 8       // R
    vpand ymm2, ymm2, ymm4     // G
    vpand ymm1, ymm1, ymm3     // B
    vpand ymm0, ymm0, ymm5     // R
    vpor ymm1, ymm1, ymm2      // BG
    vpor ymm0, ymm0, ymm1      // BGR
    vpackusdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of RGB565
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTORGB565ROW_AVX2

#ifdef HAS_ARGBTOARGB1555ROW_AVX2
__declspec(naked)
void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    vpcmpeqb ymm4, ymm4, ymm4
    vpsrld ymm4, ymm4, 27      // generate mask 0x0000001f
    vpslld ymm5, ymm4, 5       // generate mask 0x000003e0
    vpslld ymm6, ymm4, 10      // generate mask 0x00007c00
    vpcmpeqb ymm7, ymm7, ymm7  // generate mask 0xffff8000
    vpslld ymm7, ymm7, 15

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpsrld ymm3, ymm0, 9       // R
    vpsrld ymm2, ymm0, 6       // G
    vpsrld ymm1, ymm0, 3       // B
    vpsrad ymm0, ymm0, 16      // A
    vpand ymm3, ymm3, ymm6     // R
    vpand ymm2, ymm2, ymm5     // G
    vpand ymm1, ymm1, ymm4     // B
    vpand ymm0, ymm0, ymm7     // A
    vpor ymm0, ymm0, ymm1      // BA
    vpor ymm2, ymm2, ymm3      // GR
    vpor ymm0, ymm0, ymm2      // BGRA
    vpackssdw ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of ARGB1555
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB1555ROW_AVX2

#ifdef HAS_ARGBTOARGB4444ROW_AVX2
__declspec(naked)
void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    vpcmpeqb ymm4, ymm4, ymm4  // generate mask 0xf000f000
    vpsllw ymm4, ymm4, 12
    vpsrlw ymm3, ymm4, 8       // generate mask 0x00f000f0

  convertloop:
    vmovdqu ymm0, [eax]  // fetch 8 pixels of argb
    vpand ymm1, ymm0, ymm4     // high nibble
    vpand ymm0, ymm0, ymm3     // low nibble
    vpsrld ymm1, ymm1, 8
    vpsrld ymm0, ymm0, 4
    vpor ymm0, ymm0, ymm1
    vpackuswb ymm0, ymm0, ymm0
    vpermq ymm0, ymm0, 0xd8
    lea eax, [eax + 32]
    vmovdqu [edx], xmm0  // store 8 pixels of ARGB4444
    lea edx, [edx + 16]
    sub ecx, 8
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOARGB4444ROW_AVX2

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kARGBToY
    movdqa xmm5, kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}
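
// Scalar sketch of the ARGBToY rows (SSSE3 above, AVX2 below): pmaddubsw with
// kARGBToY forms 13 * B + 65 * G + 33 * R per pixel (ARGB is stored B,G,R,A in
// memory), phaddw folds the pairs together, the sum is shifted down 7 bits and
// 16 is added from kAddY16.  The helper name is illustrative only and the
// block is not compiled.
#if 0
static uint8 ARGBPixelToYSketch(const uint8* argb) {
  return (uint8)(((13 * argb[0] + 65 * argb[1] + 33 * argb[2]) >> 7) + 16);
}
#endif
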
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
__declspec(naked)
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kARGBToYJ
    movdqa xmm5, kAddYJ64

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    paddw xmm0, xmm5     // Add .5 for rounding.
    paddw xmm2, xmm5
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked)
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqu ymm6, kPermdARGBToY_AVX

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1  // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2  // mutates.
    vpermd ymm0, ymm6, ymm0     // For vphaddw + vpackuswb mutation.
    vpaddb ymm0, ymm0, ymm5     // add 16 for Y
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked)
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqu ymm6, kPermdARGBToY_AVX

  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1  // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpaddw ymm0, ymm0, ymm5   // Add .5 for rounding.
    vpaddw ymm2, ymm2, ymm5
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2  // mutates.
    vpermd ymm0, ymm6, ymm0     // For vphaddw + vpackuswb mutation.
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 32
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYJROW_AVX2

__declspec(naked)
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kBGRAToY
    movdqa xmm5, kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kABGRToY
    movdqa xmm5, kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kRGBAToY
    movdqa xmm5, kAddY16

  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kARGBToV
    movdqa xmm7, kARGBToU
    sub edi, edx             // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked)
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm5, kAddUVJ128
    movdqa xmm6, kARGBToVJ
    movdqa xmm7, kARGBToUJ
    sub edi, edx             // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    paddw xmm0, xmm5      // +.5 rounding -> unsigned
    paddw xmm1, xmm5
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked)
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub edi, edx             // stride from u to v

  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi]
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88
    vshufps ymm0, ymm0, ymm1, 0xdd
    vpavgb ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, its 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3     // mutates
    vphaddw ymm0, ymm0, ymm2
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0   // mutates
    vpermq ymm0, ymm0, 0xd8      // For vpacksswb
    vpshufb ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
    vpaddb ymm0, ymm0, ymm5      // -> unsigned

    // step 3 - store 16 U and 16 V values
    vextractf128 [edx], ymm0, 0        // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea edx, [edx + 16]
    sub ecx, 32
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2

__declspec(naked)
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]   // src_argb
    mov edx, [esp + 4 + 8]   // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kARGBToV
    movdqa xmm7, kARGBToU
    sub edi, edx             // stride from u to v

  convertloop:
    /* convert to U and V */
    movdqu xmm0, [eax]       // U
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    movdqu [edx], xmm0

    movdqu xmm0, [eax]       // V
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm6
    pmaddubsw xmm1, xmm6
    pmaddubsw xmm2, xmm6
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    lea eax, [eax + 64]
    movdqu [edx + edi], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked)
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4]   // src_argb
    mov edx, [esp + 4 + 8]   // dst_u
    mov edi, [esp + 4 + 12]  // dst_v
    mov ecx, [esp + 4 + 16]  // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kARGBToV
    movdqa xmm7, kARGBToU
    sub edi, edx             // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked)
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kBGRAToV
    movdqa xmm7, kBGRAToU
    sub edi, edx             // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked)
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kABGRToV
    movdqa xmm7, kABGRToU
    sub edi, edx             // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked)
void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm5, kAddUV128
    movdqa xmm6, kRGBAToV
    movdqa xmm7, kRGBAToU
    sub edi, edx             // stride from u to v

  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm1, [eax + 16]
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm2, [eax + 32]
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4

    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, its 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    sub ecx, 16
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
#endif  // HAS_ARGBTOYROW_SSSE3

// Read 16 UV from 444
#define READYUV444_AVX2 __asm {                                                \
    __asm vmovdqu xmm0, [esi]        /* U */ /* NOLINT */                      \
    __asm vmovdqu xmm1, [esi + edi]  /* V */ /* NOLINT */                      \
    __asm lea esi, [esi + 16]                                                  \
    __asm vpermq ymm0, ymm0, 0xd8                                              \
    __asm vpermq ymm1, ymm1, 0xd8                                              \
    __asm vpunpcklbw ymm0, ymm0, ymm1  /* UV */                                \
  }
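
// The 422/411 read macros below replicate each chroma sample across the
// pixels it covers before YUVTORGB_AVX2 runs; a scalar sketch of that
// nearest-neighbour upsample for 4:2:2 (one U,V pair per two pixels) follows.
// The helper name is illustrative only and the block is not compiled.
#if 0
static void UpsampleUV422Sketch(const uint8* u, const uint8* v, int width,
                                uint8* u_out, uint8* v_out) {
  int x;
  for (x = 0; x < width; x += 2) {  // width assumed even
    u_out[x] = u_out[x + 1] = u[x >> 1];
    v_out[x] = v_out[x + 1] = v[x >> 1];
  }
}
#endif
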
// Read 8 UV from 422, upsample to 16 UV.
#define READYUV422_AVX2 __asm {                                                \
    __asm vmovq xmm0, qword ptr [esi]        /* U */ /* NOLINT */              \
    __asm vmovq xmm1, qword ptr [esi + edi]  /* V */ /* NOLINT */              \
    __asm lea esi, [esi + 8]                                                   \
    __asm vpunpcklbw ymm0, ymm0, ymm1  /* UV */                                \
    __asm vpermq ymm0, ymm0, 0xd8                                              \
    __asm vpunpcklwd ymm0, ymm0, ymm0  /* UVUV (upsample) */                   \
  }

// Read 4 UV from 411, upsample to 16 UV.
#define READYUV411_AVX2 __asm {                                                \
    __asm vmovd xmm0, dword ptr [esi]        /* U */ /* NOLINT */              \
    __asm vmovd xmm1, dword ptr [esi + edi]  /* V */ /* NOLINT */              \
    __asm lea esi, [esi + 4]                                                   \
    __asm vpunpcklbw ymm0, ymm0, ymm1  /* UV */                                \
    __asm vpunpcklwd ymm0, ymm0, ymm0  /* UVUV (upsample) */                   \
    __asm vpermq ymm0, ymm0, 0xd8                                              \
    __asm vpunpckldq ymm0, ymm0, ymm0  /* UVUVUVUV (upsample) */               \
  }

// Read 8 UV from NV12, upsample to 16 UV.
#define READNV12_AVX2 __asm {                                                  \
    __asm vmovdqu xmm0, [esi]  /* UV */                                        \
    __asm lea esi, [esi + 16]                                                  \
    __asm vpermq ymm0, ymm0, 0xd8                                              \
    __asm vpunpcklwd ymm0, ymm0, ymm0  /* UVUV (upsample) */                   \
  }

// Convert 16 pixels: 16 UV and 16 Y.
#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
    /* Step 1: Find 8 UV contributions to 16 R,G,B values */                   \
    __asm vpmaddubsw ymm2, ymm0, YuvConstants.kUVToR  /* scale R UV */         \
    __asm vpmaddubsw ymm1, ymm0, YuvConstants.kUVToG  /* scale G UV */         \
    __asm vpmaddubsw ymm0, ymm0, YuvConstants.kUVToB  /* scale B UV */         \
    __asm vmovdqu ymm3, YuvConstants.kUVBiasR                                  \
    __asm vpsubw ymm2, ymm3, ymm2                                              \
    __asm vmovdqu ymm3, YuvConstants.kUVBiasG                                  \
    __asm vpsubw ymm1, ymm3, ymm1                                              \
    __asm vmovdqu ymm3, YuvConstants.kUVBiasB                                  \
    __asm vpsubw ymm0, ymm3, ymm0                                              \
    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
    __asm vmovdqu xmm3, [eax]  /* NOLINT */                                    \
    __asm lea eax, [eax + 16]                                                  \
    __asm vpermq ymm3, ymm3, 0xd8                                              \
    __asm vpunpcklbw ymm3, ymm3, ymm3                                          \
    __asm vpmulhuw ymm3, ymm3, YuvConstants.kYToRgb                            \
    __asm vpaddsw ymm0, ymm0, ymm3  /* B += Y */                               \
    __asm vpaddsw ymm1, ymm1, ymm3  /* G += Y */                               \
    __asm vpaddsw ymm2, ymm2, ymm3  /* R += Y */                               \
    __asm vpsraw ymm0, ymm0, 6                                                 \
    __asm vpsraw ymm1, ymm1, 6                                                 \
    __asm vpsraw ymm2, ymm2, 6                                                 \
    __asm vpackuswb ymm0, ymm0, ymm0  /* B */                                  \
    __asm vpackuswb ymm1, ymm1, ymm1  /* G */                                  \
    __asm vpackuswb ymm2, ymm2, ymm2  /* R */                                  \
  }

// Store 16 ARGB values.
#define STOREARGB_AVX2 __asm {                                                 \
    /* Step 3: Weave into ARGB */                                              \
    __asm vpunpcklbw ymm0, ymm0, ymm1  /* BG */                                \
    __asm vpermq ymm0, ymm0, 0xd8                                              \
    __asm vpunpcklbw ymm2, ymm2, ymm5  /* RA */                                \
    __asm vpermq ymm2, ymm2, 0xd8                                              \
    __asm vpunpcklwd ymm1, ymm0, ymm2  /* BGRA first 8 pixels */               \
    __asm vpunpckhwd ymm0, ymm0, ymm2  /* BGRA next 8 pixels */                \
    __asm vmovdqu 0[edx], ymm1                                                 \
    __asm vmovdqu 32[edx], ymm0                                                \
    __asm lea edx, [edx + 64]                                                  \
  }

#ifdef HAS_I422TOARGBROW_AVX2
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2010 __declspec(naked) 2011 void I422ToARGBRow_AVX2(const uint8* y_buf, 2012 const uint8* u_buf, 2013 const uint8* v_buf, 2014 uint8* dst_argb, 2015 int width) { 2016 __asm { 2017 push esi 2018 push edi 2019 mov eax, [esp + 8 + 4] // Y 2020 mov esi, [esp + 8 + 8] // U 2021 mov edi, [esp + 8 + 12] // V 2022 mov edx, [esp + 8 + 16] // argb 2023 mov ecx, [esp + 8 + 20] // width 2024 sub edi, esi 2025 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2026 2027 convertloop: 2028 READYUV422_AVX2 2029 YUVTORGB_AVX2(kYuvConstants) 2030 STOREARGB_AVX2 2031 2032 sub ecx, 16 2033 jg convertloop 2034 2035 pop edi 2036 pop esi 2037 vzeroupper 2038 ret 2039 } 2040 } 2041 #endif // HAS_I422TOARGBROW_AVX2 2042 2043 #ifdef HAS_J422TOARGBROW_AVX2 2044 // 16 pixels 2045 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2046 __declspec(naked) 2047 void J422ToARGBRow_AVX2(const uint8* y_buf, 2048 const uint8* u_buf, 2049 const uint8* v_buf, 2050 uint8* dst_argb, 2051 int width) { 2052 __asm { 2053 push esi 2054 push edi 2055 mov eax, [esp + 8 + 4] // Y 2056 mov esi, [esp + 8 + 8] // U 2057 mov edi, [esp + 8 + 12] // V 2058 mov edx, [esp + 8 + 16] // argb 2059 mov ecx, [esp + 8 + 20] // width 2060 sub edi, esi 2061 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2062 2063 convertloop: 2064 READYUV422_AVX2 2065 YUVTORGB_AVX2(kYuvJConstants) 2066 STOREARGB_AVX2 2067 2068 sub ecx, 16 2069 jg convertloop 2070 2071 pop edi 2072 pop esi 2073 vzeroupper 2074 ret 2075 } 2076 } 2077 #endif // HAS_J422TOARGBROW_AVX2 2078 2079 #ifdef HAS_I444TOARGBROW_AVX2 2080 // 16 pixels 2081 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). 2082 __declspec(naked) 2083 void I444ToARGBRow_AVX2(const uint8* y_buf, 2084 const uint8* u_buf, 2085 const uint8* v_buf, 2086 uint8* dst_argb, 2087 int width) { 2088 __asm { 2089 push esi 2090 push edi 2091 mov eax, [esp + 8 + 4] // Y 2092 mov esi, [esp + 8 + 8] // U 2093 mov edi, [esp + 8 + 12] // V 2094 mov edx, [esp + 8 + 16] // argb 2095 mov ecx, [esp + 8 + 20] // width 2096 sub edi, esi 2097 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2098 2099 convertloop: 2100 READYUV444_AVX2 2101 YUVTORGB_AVX2(kYuvConstants) 2102 STOREARGB_AVX2 2103 2104 sub ecx, 16 2105 jg convertloop 2106 2107 pop edi 2108 pop esi 2109 vzeroupper 2110 ret 2111 } 2112 } 2113 #endif // HAS_I444TOARGBROW_AVX2 2114 2115 #ifdef HAS_I411TOARGBROW_AVX2 2116 // 16 pixels 2117 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 2118 __declspec(naked) 2119 void I411ToARGBRow_AVX2(const uint8* y_buf, 2120 const uint8* u_buf, 2121 const uint8* v_buf, 2122 uint8* dst_argb, 2123 int width) { 2124 __asm { 2125 push esi 2126 push edi 2127 mov eax, [esp + 8 + 4] // Y 2128 mov esi, [esp + 8 + 8] // U 2129 mov edi, [esp + 8 + 12] // V 2130 mov edx, [esp + 8 + 16] // argb 2131 mov ecx, [esp + 8 + 20] // width 2132 sub edi, esi 2133 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2134 2135 convertloop: 2136 READYUV411_AVX2 2137 YUVTORGB_AVX2(kYuvConstants) 2138 STOREARGB_AVX2 2139 2140 sub ecx, 16 2141 jg convertloop 2142 2143 pop edi 2144 pop esi 2145 vzeroupper 2146 ret 2147 } 2148 } 2149 #endif // HAS_I411TOARGBROW_AVX2 2150 2151 #ifdef HAS_NV12TOARGBROW_AVX2 2152 // 16 pixels. 2153 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
2154 __declspec(naked) 2155 void NV12ToARGBRow_AVX2(const uint8* y_buf, 2156 const uint8* uv_buf, 2157 uint8* dst_argb, 2158 int width) { 2159 __asm { 2160 push esi 2161 mov eax, [esp + 4 + 4] // Y 2162 mov esi, [esp + 4 + 8] // UV 2163 mov edx, [esp + 4 + 12] // argb 2164 mov ecx, [esp + 4 + 16] // width 2165 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2166 2167 convertloop: 2168 READNV12_AVX2 2169 YUVTORGB_AVX2(kYuvConstants) 2170 STOREARGB_AVX2 2171 2172 sub ecx, 16 2173 jg convertloop 2174 2175 pop esi 2176 vzeroupper 2177 ret 2178 } 2179 } 2180 #endif // HAS_NV12TOARGBROW_AVX2 2181 2182 #ifdef HAS_NV21TOARGBROW_AVX2 2183 // 16 pixels. 2184 // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). 2185 __declspec(naked) 2186 void NV21ToARGBRow_AVX2(const uint8* y_buf, 2187 const uint8* uv_buf, 2188 uint8* dst_argb, 2189 int width) { 2190 __asm { 2191 push esi 2192 mov eax, [esp + 4 + 4] // Y 2193 mov esi, [esp + 4 + 8] // UV 2194 mov edx, [esp + 4 + 12] // argb 2195 mov ecx, [esp + 4 + 16] // width 2196 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2197 2198 convertloop: 2199 READNV12_AVX2 2200 YUVTORGB_AVX2(kYvuConstants) 2201 STOREARGB_AVX2 2202 2203 sub ecx, 16 2204 jg convertloop 2205 2206 pop esi 2207 vzeroupper 2208 ret 2209 } 2210 } 2211 #endif // HAS_NV21TOARGBROW_AVX2 2212 2213 #ifdef HAS_I422TOBGRAROW_AVX2 2214 // 16 pixels 2215 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). 2216 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 2217 __declspec(naked) 2218 void I422ToBGRARow_AVX2(const uint8* y_buf, 2219 const uint8* u_buf, 2220 const uint8* v_buf, 2221 uint8* dst_argb, 2222 int width) { 2223 __asm { 2224 push esi 2225 push edi 2226 mov eax, [esp + 8 + 4] // Y 2227 mov esi, [esp + 8 + 8] // U 2228 mov edi, [esp + 8 + 12] // V 2229 mov edx, [esp + 8 + 16] // argb 2230 mov ecx, [esp + 8 + 20] // width 2231 sub edi, esi 2232 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2233 2234 convertloop: 2235 READYUV422_AVX2 2236 YUVTORGB_AVX2(kYuvConstants) 2237 2238 // Step 3: Weave into BGRA 2239 vpunpcklbw ymm1, ymm1, ymm0 // GB 2240 vpermq ymm1, ymm1, 0xd8 2241 vpunpcklbw ymm2, ymm5, ymm2 // AR 2242 vpermq ymm2, ymm2, 0xd8 2243 vpunpcklwd ymm0, ymm2, ymm1 // ARGB first 8 pixels 2244 vpunpckhwd ymm2, ymm2, ymm1 // ARGB next 8 pixels 2245 vmovdqu [edx], ymm0 2246 vmovdqu [edx + 32], ymm2 2247 lea edx, [edx + 64] 2248 sub ecx, 16 2249 jg convertloop 2250 2251 pop edi 2252 pop esi 2253 vzeroupper 2254 ret 2255 } 2256 } 2257 #endif // HAS_I422TOBGRAROW_AVX2 2258 2259 #ifdef HAS_I422TORGBAROW_AVX2 2260 // 16 pixels 2261 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 2262 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 
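// The ARGB, BGRA, RGBA and ABGR variants share the read and convert steps;
// only the step-3 weave order differs. For this RGBA row the weave comments
// below show the bytes landing as A,B,G,R in memory. A per-pixel sketch of
// that store (illustrative only):
static void WeaveRGBAPixelSketch_C(uint8 b, uint8 g, uint8 r, uint8 a,
                                   uint8* dst) {
  dst[0] = a;
  dst[1] = b;
  dst[2] = g;
  dst[3] = r;
}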
2263 __declspec(naked) 2264 void I422ToRGBARow_AVX2(const uint8* y_buf, 2265 const uint8* u_buf, 2266 const uint8* v_buf, 2267 uint8* dst_argb, 2268 int width) { 2269 __asm { 2270 push esi 2271 push edi 2272 mov eax, [esp + 8 + 4] // Y 2273 mov esi, [esp + 8 + 8] // U 2274 mov edi, [esp + 8 + 12] // V 2275 mov edx, [esp + 8 + 16] // argb 2276 mov ecx, [esp + 8 + 20] // width 2277 sub edi, esi 2278 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2279 2280 convertloop: 2281 READYUV422_AVX2 2282 YUVTORGB_AVX2(kYuvConstants) 2283 2284 // Step 3: Weave into RGBA 2285 vpunpcklbw ymm1, ymm1, ymm2 // GR 2286 vpermq ymm1, ymm1, 0xd8 2287 vpunpcklbw ymm2, ymm5, ymm0 // AB 2288 vpermq ymm2, ymm2, 0xd8 2289 vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels 2290 vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels 2291 vmovdqu [edx], ymm0 2292 vmovdqu [edx + 32], ymm1 2293 lea edx, [edx + 64] 2294 sub ecx, 16 2295 jg convertloop 2296 2297 pop edi 2298 pop esi 2299 vzeroupper 2300 ret 2301 } 2302 } 2303 #endif // HAS_I422TORGBAROW_AVX2 2304 2305 #ifdef HAS_I422TOABGRROW_AVX2 2306 // 16 pixels 2307 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). 2308 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 2309 __declspec(naked) 2310 void I422ToABGRRow_AVX2(const uint8* y_buf, 2311 const uint8* u_buf, 2312 const uint8* v_buf, 2313 uint8* dst_argb, 2314 int width) { 2315 __asm { 2316 push esi 2317 push edi 2318 mov eax, [esp + 8 + 4] // Y 2319 mov esi, [esp + 8 + 8] // U 2320 mov edi, [esp + 8 + 12] // V 2321 mov edx, [esp + 8 + 16] // argb 2322 mov ecx, [esp + 8 + 20] // width 2323 sub edi, esi 2324 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha 2325 2326 convertloop: 2327 READYUV422_AVX2 2328 YUVTORGB_AVX2(kYuvConstants) 2329 2330 // Step 3: Weave into ABGR 2331 vpunpcklbw ymm1, ymm2, ymm1 // RG 2332 vpermq ymm1, ymm1, 0xd8 2333 vpunpcklbw ymm2, ymm0, ymm5 // BA 2334 vpermq ymm2, ymm2, 0xd8 2335 vpunpcklwd ymm0, ymm1, ymm2 // RGBA first 8 pixels 2336 vpunpckhwd ymm1, ymm1, ymm2 // RGBA next 8 pixels 2337 vmovdqu [edx], ymm0 2338 vmovdqu [edx + 32], ymm1 2339 lea edx, [edx + 64] 2340 sub ecx, 16 2341 jg convertloop 2342 2343 pop edi 2344 pop esi 2345 vzeroupper 2346 ret 2347 } 2348 } 2349 #endif // HAS_I422TOABGRROW_AVX2 2350 2351 #if defined(HAS_I422TOARGBROW_SSSE3) 2352 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. 2353 2354 // Read 8 UV from 444. 2355 #define READYUV444 __asm { \ 2356 __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ 2357 __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ 2358 __asm lea esi, [esi + 8] \ 2359 __asm punpcklbw xmm0, xmm1 /* UV */ \ 2360 } 2361 2362 // Read 4 UV from 422, upsample to 8 UV. 2363 #define READYUV422 __asm { \ 2364 __asm movd xmm0, [esi] /* U */ \ 2365 __asm movd xmm1, [esi + edi] /* V */ \ 2366 __asm lea esi, [esi + 4] \ 2367 __asm punpcklbw xmm0, xmm1 /* UV */ \ 2368 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ 2369 } 2370 2371 // Read 2 UV from 411, upsample to 8 UV. 2372 #define READYUV411 __asm { \ 2373 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ 2374 __asm movd xmm0, ebx \ 2375 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ 2376 __asm movd xmm1, ebx \ 2377 __asm lea esi, [esi + 2] \ 2378 __asm punpcklbw xmm0, xmm1 /* UV */ \ 2379 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ 2380 __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ 2381 } 2382 2383 // Read 4 UV from NV12, upsample to 8 UV. 
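// NV12 stores chroma as a single interleaved UV plane, so the read below
// only needs to widen each UV pair to cover two pixels; no second chroma
// pointer or interleave step is required. Scalar sketch (illustrative only):
static void ReadNV12PairSketch_C(const uint8* uv_buf, uint8 uv_out[4]) {
  uv_out[0] = uv_buf[0];  // U, pixel 0
  uv_out[1] = uv_buf[1];  // V, pixel 0
  uv_out[2] = uv_buf[0];  // U, pixel 1 (2x horizontal upsample)
  uv_out[3] = uv_buf[1];  // V, pixel 1
}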
2384 #define READNV12 __asm { \ 2385 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ 2386 __asm lea esi, [esi + 8] \ 2387 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ 2388 } 2389 2390 // Convert 8 pixels: 8 UV and 8 Y. 2391 #define YUVTORGB(YuvConstants) __asm { \ 2392 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ 2393 __asm movdqa xmm1, xmm0 \ 2394 __asm movdqa xmm2, xmm0 \ 2395 __asm movdqa xmm3, xmm0 \ 2396 __asm movdqa xmm0, YuvConstants.kUVBiasB /* unbias back to signed */ \ 2397 __asm pmaddubsw xmm1, YuvConstants.kUVToB /* scale B UV */ \ 2398 __asm psubw xmm0, xmm1 \ 2399 __asm movdqa xmm1, YuvConstants.kUVBiasG \ 2400 __asm pmaddubsw xmm2, YuvConstants.kUVToG /* scale G UV */ \ 2401 __asm psubw xmm1, xmm2 \ 2402 __asm movdqa xmm2, YuvConstants.kUVBiasR \ 2403 __asm pmaddubsw xmm3, YuvConstants.kUVToR /* scale R UV */ \ 2404 __asm psubw xmm2, xmm3 \ 2405 /* Step 2: Find Y contribution to 8 R,G,B values */ \ 2406 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ 2407 __asm lea eax, [eax + 8] \ 2408 __asm punpcklbw xmm3, xmm3 \ 2409 __asm pmulhuw xmm3, YuvConstants.kYToRgb \ 2410 __asm paddsw xmm0, xmm3 /* B += Y */ \ 2411 __asm paddsw xmm1, xmm3 /* G += Y */ \ 2412 __asm paddsw xmm2, xmm3 /* R += Y */ \ 2413 __asm psraw xmm0, 6 \ 2414 __asm psraw xmm1, 6 \ 2415 __asm psraw xmm2, 6 \ 2416 __asm packuswb xmm0, xmm0 /* B */ \ 2417 __asm packuswb xmm1, xmm1 /* G */ \ 2418 __asm packuswb xmm2, xmm2 /* R */ \ 2419 } 2420 2421 // Store 8 ARGB values. 2422 #define STOREARGB __asm { \ 2423 /* Step 3: Weave into ARGB */ \ 2424 __asm punpcklbw xmm0, xmm1 /* BG */ \ 2425 __asm punpcklbw xmm2, xmm5 /* RA */ \ 2426 __asm movdqa xmm1, xmm0 \ 2427 __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ 2428 __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ 2429 __asm movdqu 0[edx], xmm0 \ 2430 __asm movdqu 16[edx], xmm1 \ 2431 __asm lea edx, [edx + 32] \ 2432 } 2433 2434 // Store 8 BGRA values. 2435 #define STOREBGRA __asm { \ 2436 /* Step 3: Weave into BGRA */ \ 2437 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ 2438 __asm punpcklbw xmm1, xmm0 /* GB */ \ 2439 __asm punpcklbw xmm5, xmm2 /* AR */ \ 2440 __asm movdqa xmm0, xmm5 \ 2441 __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ 2442 __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ 2443 __asm movdqu 0[edx], xmm5 \ 2444 __asm movdqu 16[edx], xmm0 \ 2445 __asm lea edx, [edx + 32] \ 2446 } 2447 2448 // Store 8 ABGR values. 2449 #define STOREABGR __asm { \ 2450 /* Step 3: Weave into ABGR */ \ 2451 __asm punpcklbw xmm2, xmm1 /* RG */ \ 2452 __asm punpcklbw xmm0, xmm5 /* BA */ \ 2453 __asm movdqa xmm1, xmm2 \ 2454 __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ 2455 __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ 2456 __asm movdqu 0[edx], xmm2 \ 2457 __asm movdqu 16[edx], xmm1 \ 2458 __asm lea edx, [edx + 32] \ 2459 } 2460 2461 // Store 8 RGBA values. 2462 #define STORERGBA __asm { \ 2463 /* Step 3: Weave into RGBA */ \ 2464 __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ 2465 __asm punpcklbw xmm1, xmm2 /* GR */ \ 2466 __asm punpcklbw xmm5, xmm0 /* AB */ \ 2467 __asm movdqa xmm0, xmm5 \ 2468 __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ 2469 __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ 2470 __asm movdqu 0[edx], xmm5 \ 2471 __asm movdqu 16[edx], xmm0 \ 2472 __asm lea edx, [edx + 32] \ 2473 } 2474 2475 // Store 8 RGB24 values. 
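// RGB24 keeps the same B,G,R values as ARGB but drops the alpha lane and
// packs pixels 3 bytes apart; the macro below does this for 8 pixels with a
// pair of shuffles. Per-pixel scalar sketch (illustrative only):
static void StoreRGB24PixelSketch_C(uint8 b, uint8 g, uint8 r,
                                    uint8* dst_rgb24) {
  dst_rgb24[0] = b;
  dst_rgb24[1] = g;
  dst_rgb24[2] = r;
}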
2476 #define STORERGB24 __asm { \ 2477 /* Step 3: Weave into RRGB */ \ 2478 __asm punpcklbw xmm0, xmm1 /* BG */ \ 2479 __asm punpcklbw xmm2, xmm2 /* RR */ \ 2480 __asm movdqa xmm1, xmm0 \ 2481 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ 2482 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ 2483 /* Step 4: RRGB -> RGB24 */ \ 2484 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ 2485 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ 2486 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ 2487 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ 2488 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ 2489 __asm lea edx, [edx + 24] \ 2490 } 2491 2492 // Store 8 RAW values. 2493 #define STORERAW __asm { \ 2494 /* Step 3: Weave into RRGB */ \ 2495 __asm punpcklbw xmm0, xmm1 /* BG */ \ 2496 __asm punpcklbw xmm2, xmm2 /* RR */ \ 2497 __asm movdqa xmm1, xmm0 \ 2498 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ 2499 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ 2500 /* Step 4: RRGB -> RAW */ \ 2501 __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ 2502 __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ 2503 __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ 2504 __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ 2505 __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ 2506 __asm lea edx, [edx + 24] \ 2507 } 2508 2509 // Store 8 RGB565 values. 2510 #define STORERGB565 __asm { \ 2511 /* Step 3: Weave into RRGB */ \ 2512 __asm punpcklbw xmm0, xmm1 /* BG */ \ 2513 __asm punpcklbw xmm2, xmm2 /* RR */ \ 2514 __asm movdqa xmm1, xmm0 \ 2515 __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ 2516 __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ 2517 /* Step 4: RRGB -> RGB565 */ \ 2518 __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ 2519 __asm movdqa xmm2, xmm0 /* G */ \ 2520 __asm pslld xmm0, 8 /* R */ \ 2521 __asm psrld xmm3, 3 /* B */ \ 2522 __asm psrld xmm2, 5 /* G */ \ 2523 __asm psrad xmm0, 16 /* R */ \ 2524 __asm pand xmm3, xmm5 /* B */ \ 2525 __asm pand xmm2, xmm6 /* G */ \ 2526 __asm pand xmm0, xmm7 /* R */ \ 2527 __asm por xmm3, xmm2 /* BG */ \ 2528 __asm por xmm0, xmm3 /* BGR */ \ 2529 __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ 2530 __asm movdqa xmm2, xmm1 /* G */ \ 2531 __asm pslld xmm1, 8 /* R */ \ 2532 __asm psrld xmm3, 3 /* B */ \ 2533 __asm psrld xmm2, 5 /* G */ \ 2534 __asm psrad xmm1, 16 /* R */ \ 2535 __asm pand xmm3, xmm5 /* B */ \ 2536 __asm pand xmm2, xmm6 /* G */ \ 2537 __asm pand xmm1, xmm7 /* R */ \ 2538 __asm por xmm3, xmm2 /* BG */ \ 2539 __asm por xmm1, xmm3 /* BGR */ \ 2540 __asm packssdw xmm0, xmm1 \ 2541 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ 2542 __asm lea edx, [edx + 16] \ 2543 } 2544 2545 // 8 pixels. 2546 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 2547 __declspec(naked) 2548 void I444ToARGBRow_SSSE3(const uint8* y_buf, 2549 const uint8* u_buf, 2550 const uint8* v_buf, 2551 uint8* dst_argb, 2552 int width) { 2553 __asm { 2554 push esi 2555 push edi 2556 mov eax, [esp + 8 + 4] // Y 2557 mov esi, [esp + 8 + 8] // U 2558 mov edi, [esp + 8 + 12] // V 2559 mov edx, [esp + 8 + 16] // argb 2560 mov ecx, [esp + 8 + 20] // width 2561 sub edi, esi 2562 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2563 2564 convertloop: 2565 READYUV444 2566 YUVTORGB(kYuvConstants) 2567 STOREARGB 2568 2569 sub ecx, 8 2570 jg convertloop 2571 2572 pop edi 2573 pop esi 2574 ret 2575 } 2576 } 2577 2578 // 8 pixels. 
2579 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 2580 __declspec(naked) 2581 void I422ToRGB24Row_SSSE3(const uint8* y_buf, 2582 const uint8* u_buf, 2583 const uint8* v_buf, 2584 uint8* dst_rgb24, 2585 int width) { 2586 __asm { 2587 push esi 2588 push edi 2589 mov eax, [esp + 8 + 4] // Y 2590 mov esi, [esp + 8 + 8] // U 2591 mov edi, [esp + 8 + 12] // V 2592 mov edx, [esp + 8 + 16] // rgb24 2593 mov ecx, [esp + 8 + 20] // width 2594 sub edi, esi 2595 movdqa xmm5, kShuffleMaskARGBToRGB24_0 2596 movdqa xmm6, kShuffleMaskARGBToRGB24 2597 2598 convertloop: 2599 READYUV422 2600 YUVTORGB(kYuvConstants) 2601 STORERGB24 2602 2603 sub ecx, 8 2604 jg convertloop 2605 2606 pop edi 2607 pop esi 2608 ret 2609 } 2610 } 2611 2612 // 8 pixels. 2613 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). 2614 __declspec(naked) 2615 void I422ToRAWRow_SSSE3(const uint8* y_buf, 2616 const uint8* u_buf, 2617 const uint8* v_buf, 2618 uint8* dst_raw, 2619 int width) { 2620 __asm { 2621 push esi 2622 push edi 2623 mov eax, [esp + 8 + 4] // Y 2624 mov esi, [esp + 8 + 8] // U 2625 mov edi, [esp + 8 + 12] // V 2626 mov edx, [esp + 8 + 16] // raw 2627 mov ecx, [esp + 8 + 20] // width 2628 sub edi, esi 2629 movdqa xmm5, kShuffleMaskARGBToRAW_0 2630 movdqa xmm6, kShuffleMaskARGBToRAW 2631 2632 convertloop: 2633 READYUV422 2634 YUVTORGB(kYuvConstants) 2635 STORERAW 2636 2637 sub ecx, 8 2638 jg convertloop 2639 2640 pop edi 2641 pop esi 2642 ret 2643 } 2644 } 2645 2646 // 8 pixels 2647 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). 2648 __declspec(naked) 2649 void I422ToRGB565Row_SSSE3(const uint8* y_buf, 2650 const uint8* u_buf, 2651 const uint8* v_buf, 2652 uint8* rgb565_buf, 2653 int width) { 2654 __asm { 2655 push esi 2656 push edi 2657 mov eax, [esp + 8 + 4] // Y 2658 mov esi, [esp + 8 + 8] // U 2659 mov edi, [esp + 8 + 12] // V 2660 mov edx, [esp + 8 + 16] // rgb565 2661 mov ecx, [esp + 8 + 20] // width 2662 sub edi, esi 2663 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f 2664 psrld xmm5, 27 2665 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 2666 psrld xmm6, 26 2667 pslld xmm6, 5 2668 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 2669 pslld xmm7, 11 2670 2671 convertloop: 2672 READYUV422 2673 YUVTORGB(kYuvConstants) 2674 STORERGB565 2675 2676 sub ecx, 8 2677 jg convertloop 2678 2679 pop edi 2680 pop esi 2681 ret 2682 } 2683 } 2684 2685 // 8 pixels. 2686 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2687 __declspec(naked) 2688 void I422ToARGBRow_SSSE3(const uint8* y_buf, 2689 const uint8* u_buf, 2690 const uint8* v_buf, 2691 uint8* dst_argb, 2692 int width) { 2693 __asm { 2694 push esi 2695 push edi 2696 mov eax, [esp + 8 + 4] // Y 2697 mov esi, [esp + 8 + 8] // U 2698 mov edi, [esp + 8 + 12] // V 2699 mov edx, [esp + 8 + 16] // argb 2700 mov ecx, [esp + 8 + 20] // width 2701 sub edi, esi 2702 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2703 2704 convertloop: 2705 READYUV422 2706 YUVTORGB(kYuvConstants) 2707 STOREARGB 2708 2709 sub ecx, 8 2710 jg convertloop 2711 2712 pop edi 2713 pop esi 2714 ret 2715 } 2716 } 2717 2718 // 8 pixels. 2719 // JPeg color space version of I422ToARGB 2720 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
2721 __declspec(naked) 2722 void J422ToARGBRow_SSSE3(const uint8* y_buf, 2723 const uint8* u_buf, 2724 const uint8* v_buf, 2725 uint8* dst_argb, 2726 int width) { 2727 __asm { 2728 push esi 2729 push edi 2730 mov eax, [esp + 8 + 4] // Y 2731 mov esi, [esp + 8 + 8] // U 2732 mov edi, [esp + 8 + 12] // V 2733 mov edx, [esp + 8 + 16] // argb 2734 mov ecx, [esp + 8 + 20] // width 2735 sub edi, esi 2736 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2737 2738 convertloop: 2739 READYUV422 2740 YUVTORGB(kYuvJConstants) 2741 STOREARGB 2742 2743 sub ecx, 8 2744 jg convertloop 2745 2746 pop edi 2747 pop esi 2748 ret 2749 } 2750 } 2751 2752 // 8 pixels. 2753 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2754 // Similar to I420 but duplicate UV once more. 2755 __declspec(naked) 2756 void I411ToARGBRow_SSSE3(const uint8* y_buf, 2757 const uint8* u_buf, 2758 const uint8* v_buf, 2759 uint8* dst_argb, 2760 int width) { 2761 __asm { 2762 push ebx 2763 push esi 2764 push edi 2765 mov eax, [esp + 12 + 4] // Y 2766 mov esi, [esp + 12 + 8] // U 2767 mov edi, [esp + 12 + 12] // V 2768 mov edx, [esp + 12 + 16] // argb 2769 mov ecx, [esp + 12 + 20] // width 2770 sub edi, esi 2771 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2772 2773 convertloop: 2774 READYUV411 // modifies EBX 2775 YUVTORGB(kYuvConstants) 2776 STOREARGB 2777 2778 sub ecx, 8 2779 jg convertloop 2780 2781 pop edi 2782 pop esi 2783 pop ebx 2784 ret 2785 } 2786 } 2787 2788 // 8 pixels. 2789 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 2790 __declspec(naked) 2791 void NV12ToARGBRow_SSSE3(const uint8* y_buf, 2792 const uint8* uv_buf, 2793 uint8* dst_argb, 2794 int width) { 2795 __asm { 2796 push esi 2797 mov eax, [esp + 4 + 4] // Y 2798 mov esi, [esp + 4 + 8] // UV 2799 mov edx, [esp + 4 + 12] // argb 2800 mov ecx, [esp + 4 + 16] // width 2801 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2802 2803 convertloop: 2804 READNV12 2805 YUVTORGB(kYuvConstants) 2806 STOREARGB 2807 2808 sub ecx, 8 2809 jg convertloop 2810 2811 pop esi 2812 ret 2813 } 2814 } 2815 2816 // 8 pixels. 2817 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). 
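// NV21 carries the chroma plane as V,U pairs. Rather than swapping bytes
// after the load, the row below reuses READNV12 and passes the VU-ordered
// kYvuConstants table to YUVTORGB, so each multiply picks up the correct
// channel. Scalar sketch of the byte view (illustrative only):
static void ReadNV21PairSketch_C(const uint8* vu_buf, uint8* u, uint8* v) {
  *v = vu_buf[0];  // V comes first in NV21
  *u = vu_buf[1];
}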
2818 __declspec(naked) 2819 void NV21ToARGBRow_SSSE3(const uint8* y_buf, 2820 const uint8* uv_buf, 2821 uint8* dst_argb, 2822 int width) { 2823 __asm { 2824 push esi 2825 mov eax, [esp + 4 + 4] // Y 2826 mov esi, [esp + 4 + 8] // UV 2827 mov edx, [esp + 4 + 12] // argb 2828 mov ecx, [esp + 4 + 16] // width 2829 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2830 2831 convertloop: 2832 READNV12 2833 YUVTORGB(kYvuConstants) 2834 STOREARGB 2835 2836 sub ecx, 8 2837 jg convertloop 2838 2839 pop esi 2840 ret 2841 } 2842 } 2843 2844 __declspec(naked) 2845 void I422ToBGRARow_SSSE3(const uint8* y_buf, 2846 const uint8* u_buf, 2847 const uint8* v_buf, 2848 uint8* dst_bgra, 2849 int width) { 2850 __asm { 2851 push esi 2852 push edi 2853 mov eax, [esp + 8 + 4] // Y 2854 mov esi, [esp + 8 + 8] // U 2855 mov edi, [esp + 8 + 12] // V 2856 mov edx, [esp + 8 + 16] // bgra 2857 mov ecx, [esp + 8 + 20] // width 2858 sub edi, esi 2859 2860 convertloop: 2861 READYUV422 2862 YUVTORGB(kYuvConstants) 2863 STOREBGRA 2864 2865 sub ecx, 8 2866 jg convertloop 2867 2868 pop edi 2869 pop esi 2870 ret 2871 } 2872 } 2873 2874 __declspec(naked) 2875 void I422ToABGRRow_SSSE3(const uint8* y_buf, 2876 const uint8* u_buf, 2877 const uint8* v_buf, 2878 uint8* dst_abgr, 2879 int width) { 2880 __asm { 2881 push esi 2882 push edi 2883 mov eax, [esp + 8 + 4] // Y 2884 mov esi, [esp + 8 + 8] // U 2885 mov edi, [esp + 8 + 12] // V 2886 mov edx, [esp + 8 + 16] // abgr 2887 mov ecx, [esp + 8 + 20] // width 2888 sub edi, esi 2889 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 2890 2891 convertloop: 2892 READYUV422 2893 YUVTORGB(kYuvConstants) 2894 STOREABGR 2895 2896 sub ecx, 8 2897 jg convertloop 2898 2899 pop edi 2900 pop esi 2901 ret 2902 } 2903 } 2904 2905 __declspec(naked) 2906 void I422ToRGBARow_SSSE3(const uint8* y_buf, 2907 const uint8* u_buf, 2908 const uint8* v_buf, 2909 uint8* dst_rgba, 2910 int width) { 2911 __asm { 2912 push esi 2913 push edi 2914 mov eax, [esp + 8 + 4] // Y 2915 mov esi, [esp + 8 + 8] // U 2916 mov edi, [esp + 8 + 12] // V 2917 mov edx, [esp + 8 + 16] // rgba 2918 mov ecx, [esp + 8 + 20] // width 2919 sub edi, esi 2920 2921 convertloop: 2922 READYUV422 2923 YUVTORGB(kYuvConstants) 2924 STORERGBA 2925 2926 sub ecx, 8 2927 jg convertloop 2928 2929 pop edi 2930 pop esi 2931 ret 2932 } 2933 } 2934 2935 #endif // HAS_I422TOARGBROW_SSSE3 2936 2937 #ifdef HAS_I400TOARGBROW_SSE2 2938 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). 2939 __declspec(naked) 2940 void I400ToARGBRow_SSE2(const uint8* y_buf, 2941 uint8* rgb_buf, 2942 int width) { 2943 __asm { 2944 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) 2945 movd xmm2, eax 2946 pshufd xmm2, xmm2,0 2947 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) 2948 movd xmm3, eax 2949 pshufd xmm3, xmm3, 0 2950 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 2951 pslld xmm4, 24 2952 2953 mov eax, [esp + 4] // Y 2954 mov edx, [esp + 8] // rgb 2955 mov ecx, [esp + 12] // width 2956 2957 convertloop: 2958 // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 2959 movq xmm0, qword ptr [eax] 2960 lea eax, [eax + 8] 2961 punpcklbw xmm0, xmm0 // Y.Y 2962 pmulhuw xmm0, xmm2 2963 psubusw xmm0, xmm3 2964 psrlw xmm0, 6 2965 packuswb xmm0, xmm0 // G 2966 2967 // Step 2: Weave into ARGB 2968 punpcklbw xmm0, xmm0 // GG 2969 movdqa xmm1, xmm0 2970 punpcklwd xmm0, xmm0 // BGRA first 4 pixels 2971 punpckhwd xmm1, xmm1 // BGRA next 4 pixels 2972 por xmm0, xmm4 2973 por xmm1, xmm4 2974 movdqu [edx], xmm0 2975 movdqu [edx + 16], xmm1 2976 lea edx, [edx + 32] 2977 sub ecx, 8 2978 jg convertloop 2979 ret 2980 } 2981 } 2982 #endif // HAS_I400TOARGBROW_SSE2 2983 2984 #ifdef HAS_I400TOARGBROW_AVX2 2985 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). 2986 // note: vpunpcklbw mutates and vpackuswb unmutates. 2987 __declspec(naked) 2988 void I400ToARGBRow_AVX2(const uint8* y_buf, 2989 uint8* rgb_buf, 2990 int width) { 2991 __asm { 2992 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) 2993 vmovd xmm2, eax 2994 vbroadcastss ymm2, xmm2 2995 mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) 2996 vmovd xmm3, eax 2997 vbroadcastss ymm3, xmm3 2998 vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 2999 vpslld ymm4, ymm4, 24 3000 3001 mov eax, [esp + 4] // Y 3002 mov edx, [esp + 8] // rgb 3003 mov ecx, [esp + 12] // width 3004 3005 convertloop: 3006 // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 3007 vmovdqu xmm0, [eax] 3008 lea eax, [eax + 16] 3009 vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates 3010 vpunpcklbw ymm0, ymm0, ymm0 // Y.Y 3011 vpmulhuw ymm0, ymm0, ymm2 3012 vpsubusw ymm0, ymm0, ymm3 3013 vpsrlw ymm0, ymm0, 6 3014 vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 3015 3016 // TODO(fbarchard): Weave alpha with unpack. 3017 // Step 2: Weave into ARGB 3018 vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates 3019 vpermq ymm1, ymm1, 0xd8 3020 vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels 3021 vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels 3022 vpor ymm0, ymm0, ymm4 3023 vpor ymm1, ymm1, ymm4 3024 vmovdqu [edx], ymm0 3025 vmovdqu [edx + 32], ymm1 3026 lea edx, [edx + 64] 3027 sub ecx, 16 3028 jg convertloop 3029 vzeroupper 3030 ret 3031 } 3032 } 3033 #endif // HAS_I400TOARGBROW_AVX2 3034 3035 #ifdef HAS_MIRRORROW_SSSE3 3036 // Shuffle table for reversing the bytes. 3037 static const uvec8 kShuffleMirror = { 3038 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 3039 }; 3040 3041 // TODO(fbarchard): Replace lea with -16 offset.
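// MirrorRow_SSSE3 below walks the source backwards 16 bytes at a time and
// reverses each block with the kShuffleMirror table above. Scalar sketch of
// the same row reversal (illustrative only):
static void MirrorRowSketch_C(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}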
3042 __declspec(naked) 3043 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 3044 __asm { 3045 mov eax, [esp + 4] // src 3046 mov edx, [esp + 8] // dst 3047 mov ecx, [esp + 12] // width 3048 movdqa xmm5, kShuffleMirror 3049 3050 convertloop: 3051 movdqu xmm0, [eax - 16 + ecx] 3052 pshufb xmm0, xmm5 3053 movdqu [edx], xmm0 3054 lea edx, [edx + 16] 3055 sub ecx, 16 3056 jg convertloop 3057 ret 3058 } 3059 } 3060 #endif // HAS_MIRRORROW_SSSE3 3061 3062 #ifdef HAS_MIRRORROW_AVX2 3063 __declspec(naked) 3064 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 3065 __asm { 3066 mov eax, [esp + 4] // src 3067 mov edx, [esp + 8] // dst 3068 mov ecx, [esp + 12] // width 3069 vbroadcastf128 ymm5, kShuffleMirror 3070 3071 convertloop: 3072 vmovdqu ymm0, [eax - 32 + ecx] 3073 vpshufb ymm0, ymm0, ymm5 3074 vpermq ymm0, ymm0, 0x4e // swap high and low halfs 3075 vmovdqu [edx], ymm0 3076 lea edx, [edx + 32] 3077 sub ecx, 32 3078 jg convertloop 3079 vzeroupper 3080 ret 3081 } 3082 } 3083 #endif // HAS_MIRRORROW_AVX2 3084 3085 #ifdef HAS_MIRRORROW_SSE2 3086 __declspec(naked) 3087 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { 3088 __asm { 3089 mov eax, [esp + 4] // src 3090 mov edx, [esp + 8] // dst 3091 mov ecx, [esp + 12] // width 3092 3093 convertloop: 3094 movdqu xmm0, [eax - 16 + ecx] 3095 movdqa xmm1, xmm0 // swap bytes 3096 psllw xmm0, 8 3097 psrlw xmm1, 8 3098 por xmm0, xmm1 3099 pshuflw xmm0, xmm0, 0x1b // swap words 3100 pshufhw xmm0, xmm0, 0x1b 3101 pshufd xmm0, xmm0, 0x4e // swap qwords 3102 movdqu [edx], xmm0 3103 lea edx, [edx + 16] 3104 sub ecx, 16 3105 jg convertloop 3106 ret 3107 } 3108 } 3109 #endif // HAS_MIRRORROW_SSE2 3110 3111 #ifdef HAS_MIRRORROW_UV_SSSE3 3112 // Shuffle table for reversing the bytes of UV channels. 3113 static const uvec8 kShuffleMirrorUV = { 3114 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 3115 }; 3116 3117 __declspec(naked) 3118 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, 3119 int width) { 3120 __asm { 3121 push edi 3122 mov eax, [esp + 4 + 4] // src 3123 mov edx, [esp + 4 + 8] // dst_u 3124 mov edi, [esp + 4 + 12] // dst_v 3125 mov ecx, [esp + 4 + 16] // width 3126 movdqa xmm1, kShuffleMirrorUV 3127 lea eax, [eax + ecx * 2 - 16] 3128 sub edi, edx 3129 3130 convertloop: 3131 movdqu xmm0, [eax] 3132 lea eax, [eax - 16] 3133 pshufb xmm0, xmm1 3134 movlpd qword ptr [edx], xmm0 3135 movhpd qword ptr [edx + edi], xmm0 3136 lea edx, [edx + 8] 3137 sub ecx, 8 3138 jg convertloop 3139 3140 pop edi 3141 ret 3142 } 3143 } 3144 #endif // HAS_MIRRORROW_UV_SSSE3 3145 3146 #ifdef HAS_ARGBMIRRORROW_SSE2 3147 __declspec(naked) 3148 void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { 3149 __asm { 3150 mov eax, [esp + 4] // src 3151 mov edx, [esp + 8] // dst 3152 mov ecx, [esp + 12] // width 3153 lea eax, [eax - 16 + ecx * 4] // last 4 pixels. 3154 3155 convertloop: 3156 movdqu xmm0, [eax] 3157 lea eax, [eax - 16] 3158 pshufd xmm0, xmm0, 0x1b 3159 movdqu [edx], xmm0 3160 lea edx, [edx + 16] 3161 sub ecx, 4 3162 jg convertloop 3163 ret 3164 } 3165 } 3166 #endif // HAS_ARGBMIRRORROW_SSE2 3167 3168 #ifdef HAS_ARGBMIRRORROW_AVX2 3169 // Shuffle table for reversing the bytes. 
3170 static const ulvec32 kARGBShuffleMirror_AVX2 = { 3171 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 3172 }; 3173 3174 __declspec(naked) 3175 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { 3176 __asm { 3177 mov eax, [esp + 4] // src 3178 mov edx, [esp + 8] // dst 3179 mov ecx, [esp + 12] // width 3180 vmovdqu ymm5, kARGBShuffleMirror_AVX2 3181 3182 convertloop: 3183 vpermd ymm0, ymm5, [eax - 32 + ecx * 4] // permute dword order 3184 vmovdqu [edx], ymm0 3185 lea edx, [edx + 32] 3186 sub ecx, 8 3187 jg convertloop 3188 vzeroupper 3189 ret 3190 } 3191 } 3192 #endif // HAS_ARGBMIRRORROW_AVX2 3193 3194 #ifdef HAS_SPLITUVROW_SSE2 3195 __declspec(naked) 3196 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 3197 __asm { 3198 push edi 3199 mov eax, [esp + 4 + 4] // src_uv 3200 mov edx, [esp + 4 + 8] // dst_u 3201 mov edi, [esp + 4 + 12] // dst_v 3202 mov ecx, [esp + 4 + 16] // pix 3203 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3204 psrlw xmm5, 8 3205 sub edi, edx 3206 3207 convertloop: 3208 movdqu xmm0, [eax] 3209 movdqu xmm1, [eax + 16] 3210 lea eax, [eax + 32] 3211 movdqa xmm2, xmm0 3212 movdqa xmm3, xmm1 3213 pand xmm0, xmm5 // even bytes 3214 pand xmm1, xmm5 3215 packuswb xmm0, xmm1 3216 psrlw xmm2, 8 // odd bytes 3217 psrlw xmm3, 8 3218 packuswb xmm2, xmm3 3219 movdqu [edx], xmm0 3220 movdqu [edx + edi], xmm2 3221 lea edx, [edx + 16] 3222 sub ecx, 16 3223 jg convertloop 3224 3225 pop edi 3226 ret 3227 } 3228 } 3229 3230 #endif // HAS_SPLITUVROW_SSE2 3231 3232 #ifdef HAS_SPLITUVROW_AVX2 3233 __declspec(naked) 3234 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 3235 __asm { 3236 push edi 3237 mov eax, [esp + 4 + 4] // src_uv 3238 mov edx, [esp + 4 + 8] // dst_u 3239 mov edi, [esp + 4 + 12] // dst_v 3240 mov ecx, [esp + 4 + 16] // pix 3241 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3242 vpsrlw ymm5, ymm5, 8 3243 sub edi, edx 3244 3245 convertloop: 3246 vmovdqu ymm0, [eax] 3247 vmovdqu ymm1, [eax + 32] 3248 lea eax, [eax + 64] 3249 vpsrlw ymm2, ymm0, 8 // odd bytes 3250 vpsrlw ymm3, ymm1, 8 3251 vpand ymm0, ymm0, ymm5 // even bytes 3252 vpand ymm1, ymm1, ymm5 3253 vpackuswb ymm0, ymm0, ymm1 3254 vpackuswb ymm2, ymm2, ymm3 3255 vpermq ymm0, ymm0, 0xd8 3256 vpermq ymm2, ymm2, 0xd8 3257 vmovdqu [edx], ymm0 3258 vmovdqu [edx + edi], ymm2 3259 lea edx, [edx + 32] 3260 sub ecx, 32 3261 jg convertloop 3262 3263 pop edi 3264 vzeroupper 3265 ret 3266 } 3267 } 3268 #endif // HAS_SPLITUVROW_AVX2 3269 3270 #ifdef HAS_MERGEUVROW_SSE2 3271 __declspec(naked) 3272 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 3273 int width) { 3274 __asm { 3275 push edi 3276 mov eax, [esp + 4 + 4] // src_u 3277 mov edx, [esp + 4 + 8] // src_v 3278 mov edi, [esp + 4 + 12] // dst_uv 3279 mov ecx, [esp + 4 + 16] // width 3280 sub edx, eax 3281 3282 convertloop: 3283 movdqu xmm0, [eax] // read 16 U's 3284 movdqu xmm1, [eax + edx] // and 16 V's 3285 lea eax, [eax + 16] 3286 movdqa xmm2, xmm0 3287 punpcklbw xmm0, xmm1 // first 8 UV pairs 3288 punpckhbw xmm2, xmm1 // next 8 UV pairs 3289 movdqu [edi], xmm0 3290 movdqu [edi + 16], xmm2 3291 lea edi, [edi + 32] 3292 sub ecx, 16 3293 jg convertloop 3294 3295 pop edi 3296 ret 3297 } 3298 } 3299 #endif // HAS_MERGEUVROW_SSE2 3300 3301 #ifdef HAS_MERGEUVROW_AVX2 3302 __declspec(naked) 3303 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, 3304 int width) { 3305 __asm { 3306 push edi 3307 mov eax, [esp + 4 + 4] // src_u 3308 mov edx, [esp + 4 + 8] 
// src_v 3309 mov edi, [esp + 4 + 12] // dst_uv 3310 mov ecx, [esp + 4 + 16] // width 3311 sub edx, eax 3312 3313 convertloop: 3314 vmovdqu ymm0, [eax] // read 32 U's 3315 vmovdqu ymm1, [eax + edx] // and 32 V's 3316 lea eax, [eax + 32] 3317 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 3318 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 3319 vextractf128 [edi], ymm2, 0 // bytes 0..15 3320 vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 3321 vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 3322 vextractf128 [edi + 48], ymm0, 1 // bytes 48..63 3323 lea edi, [edi + 64] 3324 sub ecx, 32 3325 jg convertloop 3326 3327 pop edi 3328 vzeroupper 3329 ret 3330 } 3331 } 3332 #endif // HAS_MERGEUVROW_AVX2 3333 3334 #ifdef HAS_COPYROW_SSE2 3335 // CopyRow copies 'count' bytes using a 16 byte load/store, 32 bytes at a time. 3336 __declspec(naked) 3337 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { 3338 __asm { 3339 mov eax, [esp + 4] // src 3340 mov edx, [esp + 8] // dst 3341 mov ecx, [esp + 12] // count 3342 3343 convertloop: 3344 movdqu xmm0, [eax] 3345 movdqu xmm1, [eax + 16] 3346 lea eax, [eax + 32] 3347 movdqu [edx], xmm0 3348 movdqu [edx + 16], xmm1 3349 lea edx, [edx + 32] 3350 sub ecx, 32 3351 jg convertloop 3352 ret 3353 } 3354 } 3355 #endif // HAS_COPYROW_SSE2 3356 3357 #ifdef HAS_COPYROW_AVX 3358 // CopyRow copies 'count' bytes using a 32 byte load/store, 64 bytes at a time. 3359 __declspec(naked) 3360 void CopyRow_AVX(const uint8* src, uint8* dst, int count) { 3361 __asm { 3362 mov eax, [esp + 4] // src 3363 mov edx, [esp + 8] // dst 3364 mov ecx, [esp + 12] // count 3365 3366 convertloop: 3367 vmovdqu ymm0, [eax] 3368 vmovdqu ymm1, [eax + 32] 3369 lea eax, [eax + 64] 3370 vmovdqu [edx], ymm0 3371 vmovdqu [edx + 32], ymm1 3372 lea edx, [edx + 64] 3373 sub ecx, 64 3374 jg convertloop 3375 3376 vzeroupper 3377 ret 3378 } 3379 } 3380 #endif // HAS_COPYROW_AVX 3381 3382 // Multiple of 1.
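// CopyRow_ERMS below leans on enhanced rep movsb (ERMS): a single rep movsb
// handles any byte count, which is why no width multiple is required. The
// esi/edi shuffling exists only because rep movsb consumes those registers.
// Scalar sketch of the same contract (illustrative only):
static void CopyRowSketch_C(const uint8* src, uint8* dst, int count) {
  int x;
  for (x = 0; x < count; ++x) {
    dst[x] = src[x];
  }
}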
3383 __declspec(naked) 3384 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { 3385 __asm { 3386 mov eax, esi 3387 mov edx, edi 3388 mov esi, [esp + 4] // src 3389 mov edi, [esp + 8] // dst 3390 mov ecx, [esp + 12] // count 3391 rep movsb 3392 mov edi, edx 3393 mov esi, eax 3394 ret 3395 } 3396 } 3397 3398 #ifdef HAS_ARGBCOPYALPHAROW_SSE2 3399 // width in pixels 3400 __declspec(naked) 3401 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { 3402 __asm { 3403 mov eax, [esp + 4] // src 3404 mov edx, [esp + 8] // dst 3405 mov ecx, [esp + 12] // count 3406 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 3407 pslld xmm0, 24 3408 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff 3409 psrld xmm1, 8 3410 3411 convertloop: 3412 movdqu xmm2, [eax] 3413 movdqu xmm3, [eax + 16] 3414 lea eax, [eax + 32] 3415 movdqu xmm4, [edx] 3416 movdqu xmm5, [edx + 16] 3417 pand xmm2, xmm0 3418 pand xmm3, xmm0 3419 pand xmm4, xmm1 3420 pand xmm5, xmm1 3421 por xmm2, xmm4 3422 por xmm3, xmm5 3423 movdqu [edx], xmm2 3424 movdqu [edx + 16], xmm3 3425 lea edx, [edx + 32] 3426 sub ecx, 8 3427 jg convertloop 3428 3429 ret 3430 } 3431 } 3432 #endif // HAS_ARGBCOPYALPHAROW_SSE2 3433 3434 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 3435 // width in pixels 3436 __declspec(naked) 3437 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { 3438 __asm { 3439 mov eax, [esp + 4] // src 3440 mov edx, [esp + 8] // dst 3441 mov ecx, [esp + 12] // count 3442 vpcmpeqb ymm0, ymm0, ymm0 3443 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff 3444 3445 convertloop: 3446 vmovdqu ymm1, [eax] 3447 vmovdqu ymm2, [eax + 32] 3448 lea eax, [eax + 64] 3449 vpblendvb ymm1, ymm1, [edx], ymm0 3450 vpblendvb ymm2, ymm2, [edx + 32], ymm0 3451 vmovdqu [edx], ymm1 3452 vmovdqu [edx + 32], ymm2 3453 lea edx, [edx + 64] 3454 sub ecx, 16 3455 jg convertloop 3456 3457 vzeroupper 3458 ret 3459 } 3460 } 3461 #endif // HAS_ARGBCOPYALPHAROW_AVX2 3462 3463 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 3464 // width in pixels 3465 __declspec(naked) 3466 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { 3467 __asm { 3468 mov eax, [esp + 4] // src 3469 mov edx, [esp + 8] // dst 3470 mov ecx, [esp + 12] // count 3471 pcmpeqb xmm0, xmm0 // generate mask 0xff000000 3472 pslld xmm0, 24 3473 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff 3474 psrld xmm1, 8 3475 3476 convertloop: 3477 movq xmm2, qword ptr [eax] // 8 Y's 3478 lea eax, [eax + 8] 3479 punpcklbw xmm2, xmm2 3480 punpckhwd xmm3, xmm2 3481 punpcklwd xmm2, xmm2 3482 movdqu xmm4, [edx] 3483 movdqu xmm5, [edx + 16] 3484 pand xmm2, xmm0 3485 pand xmm3, xmm0 3486 pand xmm4, xmm1 3487 pand xmm5, xmm1 3488 por xmm2, xmm4 3489 por xmm3, xmm5 3490 movdqu [edx], xmm2 3491 movdqu [edx + 16], xmm3 3492 lea edx, [edx + 32] 3493 sub ecx, 8 3494 jg convertloop 3495 3496 ret 3497 } 3498 } 3499 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 3500 3501 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 3502 // width in pixels 3503 __declspec(naked) 3504 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { 3505 __asm { 3506 mov eax, [esp + 4] // src 3507 mov edx, [esp + 8] // dst 3508 mov ecx, [esp + 12] // count 3509 vpcmpeqb ymm0, ymm0, ymm0 3510 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff 3511 3512 convertloop: 3513 vpmovzxbd ymm1, qword ptr [eax] 3514 vpmovzxbd ymm2, qword ptr [eax + 8] 3515 lea eax, [eax + 16] 3516 vpslld ymm1, ymm1, 24 3517 vpslld ymm2, ymm2, 24 3518 vpblendvb ymm1, ymm1, [edx], ymm0 3519 vpblendvb ymm2, ymm2, [edx + 32], ymm0 3520 vmovdqu [edx], ymm1 3521 vmovdqu 
[edx + 32], ymm2 3522 lea edx, [edx + 64] 3523 sub ecx, 16 3524 jg convertloop 3525 3526 vzeroupper 3527 ret 3528 } 3529 } 3530 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 3531 3532 #ifdef HAS_SETROW_X86 3533 // Write 'count' bytes using an 8 bit value repeated. 3534 // Count should be multiple of 4. 3535 __declspec(naked) 3536 void SetRow_X86(uint8* dst, uint8 v8, int count) { 3537 __asm { 3538 movzx eax, byte ptr [esp + 8] // v8 3539 mov edx, 0x01010101 // Duplicate byte to all bytes. 3540 mul edx // overwrites edx with upper part of result. 3541 mov edx, edi 3542 mov edi, [esp + 4] // dst 3543 mov ecx, [esp + 12] // count 3544 shr ecx, 2 3545 rep stosd 3546 mov edi, edx 3547 ret 3548 } 3549 } 3550 3551 // Write 'count' bytes using an 8 bit value repeated. 3552 __declspec(naked) 3553 void SetRow_ERMS(uint8* dst, uint8 v8, int count) { 3554 __asm { 3555 mov edx, edi 3556 mov edi, [esp + 4] // dst 3557 mov eax, [esp + 8] // v8 3558 mov ecx, [esp + 12] // count 3559 rep stosb 3560 mov edi, edx 3561 ret 3562 } 3563 } 3564 3565 // Write 'count' 32 bit values. 3566 __declspec(naked) 3567 void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { 3568 __asm { 3569 mov edx, edi 3570 mov edi, [esp + 4] // dst 3571 mov eax, [esp + 8] // v32 3572 mov ecx, [esp + 12] // count 3573 rep stosd 3574 mov edi, edx 3575 ret 3576 } 3577 } 3578 #endif // HAS_SETROW_X86 3579 3580 #ifdef HAS_YUY2TOYROW_AVX2 3581 __declspec(naked) 3582 void YUY2ToYRow_AVX2(const uint8* src_yuy2, 3583 uint8* dst_y, int pix) { 3584 __asm { 3585 mov eax, [esp + 4] // src_yuy2 3586 mov edx, [esp + 8] // dst_y 3587 mov ecx, [esp + 12] // pix 3588 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3589 vpsrlw ymm5, ymm5, 8 3590 3591 convertloop: 3592 vmovdqu ymm0, [eax] 3593 vmovdqu ymm1, [eax + 32] 3594 lea eax, [eax + 64] 3595 vpand ymm0, ymm0, ymm5 // even bytes are Y 3596 vpand ymm1, ymm1, ymm5 3597 vpackuswb ymm0, ymm0, ymm1 // mutates. 3598 vpermq ymm0, ymm0, 0xd8 3599 vmovdqu [edx], ymm0 3600 lea edx, [edx + 32] 3601 sub ecx, 32 3602 jg convertloop 3603 vzeroupper 3604 ret 3605 } 3606 } 3607 3608 __declspec(naked) 3609 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, 3610 uint8* dst_u, uint8* dst_v, int pix) { 3611 __asm { 3612 push esi 3613 push edi 3614 mov eax, [esp + 8 + 4] // src_yuy2 3615 mov esi, [esp + 8 + 8] // stride_yuy2 3616 mov edx, [esp + 8 + 12] // dst_u 3617 mov edi, [esp + 8 + 16] // dst_v 3618 mov ecx, [esp + 8 + 20] // pix 3619 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3620 vpsrlw ymm5, ymm5, 8 3621 sub edi, edx 3622 3623 convertloop: 3624 vmovdqu ymm0, [eax] 3625 vmovdqu ymm1, [eax + 32] 3626 vpavgb ymm0, ymm0, [eax + esi] 3627 vpavgb ymm1, ymm1, [eax + esi + 32] 3628 lea eax, [eax + 64] 3629 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV 3630 vpsrlw ymm1, ymm1, 8 3631 vpackuswb ymm0, ymm0, ymm1 // mutates. 3632 vpermq ymm0, ymm0, 0xd8 3633 vpand ymm1, ymm0, ymm5 // U 3634 vpsrlw ymm0, ymm0, 8 // V 3635 vpackuswb ymm1, ymm1, ymm1 // mutates. 3636 vpackuswb ymm0, ymm0, ymm0 // mutates. 
3637 vpermq ymm1, ymm1, 0xd8 3638 vpermq ymm0, ymm0, 0xd8 3639 vextractf128 [edx], ymm1, 0 // U 3640 vextractf128 [edx + edi], ymm0, 0 // V 3641 lea edx, [edx + 16] 3642 sub ecx, 32 3643 jg convertloop 3644 3645 pop edi 3646 pop esi 3647 vzeroupper 3648 ret 3649 } 3650 } 3651 3652 __declspec(naked) 3653 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, 3654 uint8* dst_u, uint8* dst_v, int pix) { 3655 __asm { 3656 push edi 3657 mov eax, [esp + 4 + 4] // src_yuy2 3658 mov edx, [esp + 4 + 8] // dst_u 3659 mov edi, [esp + 4 + 12] // dst_v 3660 mov ecx, [esp + 4 + 16] // pix 3661 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3662 vpsrlw ymm5, ymm5, 8 3663 sub edi, edx 3664 3665 convertloop: 3666 vmovdqu ymm0, [eax] 3667 vmovdqu ymm1, [eax + 32] 3668 lea eax, [eax + 64] 3669 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV 3670 vpsrlw ymm1, ymm1, 8 3671 vpackuswb ymm0, ymm0, ymm1 // mutates. 3672 vpermq ymm0, ymm0, 0xd8 3673 vpand ymm1, ymm0, ymm5 // U 3674 vpsrlw ymm0, ymm0, 8 // V 3675 vpackuswb ymm1, ymm1, ymm1 // mutates. 3676 vpackuswb ymm0, ymm0, ymm0 // mutates. 3677 vpermq ymm1, ymm1, 0xd8 3678 vpermq ymm0, ymm0, 0xd8 3679 vextractf128 [edx], ymm1, 0 // U 3680 vextractf128 [edx + edi], ymm0, 0 // V 3681 lea edx, [edx + 16] 3682 sub ecx, 32 3683 jg convertloop 3684 3685 pop edi 3686 vzeroupper 3687 ret 3688 } 3689 } 3690 3691 __declspec(naked) 3692 void UYVYToYRow_AVX2(const uint8* src_uyvy, 3693 uint8* dst_y, int pix) { 3694 __asm { 3695 mov eax, [esp + 4] // src_uyvy 3696 mov edx, [esp + 8] // dst_y 3697 mov ecx, [esp + 12] // pix 3698 3699 convertloop: 3700 vmovdqu ymm0, [eax] 3701 vmovdqu ymm1, [eax + 32] 3702 lea eax, [eax + 64] 3703 vpsrlw ymm0, ymm0, 8 // odd bytes are Y 3704 vpsrlw ymm1, ymm1, 8 3705 vpackuswb ymm0, ymm0, ymm1 // mutates. 3706 vpermq ymm0, ymm0, 0xd8 3707 vmovdqu [edx], ymm0 3708 lea edx, [edx + 32] 3709 sub ecx, 32 3710 jg convertloop 3711 vzeroupper 3712 ret 3713 } 3714 } 3715 3716 __declspec(naked) 3717 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, 3718 uint8* dst_u, uint8* dst_v, int pix) { 3719 __asm { 3720 push esi 3721 push edi 3722 mov eax, [esp + 8 + 4] // src_yuy2 3723 mov esi, [esp + 8 + 8] // stride_yuy2 3724 mov edx, [esp + 8 + 12] // dst_u 3725 mov edi, [esp + 8 + 16] // dst_v 3726 mov ecx, [esp + 8 + 20] // pix 3727 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3728 vpsrlw ymm5, ymm5, 8 3729 sub edi, edx 3730 3731 convertloop: 3732 vmovdqu ymm0, [eax] 3733 vmovdqu ymm1, [eax + 32] 3734 vpavgb ymm0, ymm0, [eax + esi] 3735 vpavgb ymm1, ymm1, [eax + esi + 32] 3736 lea eax, [eax + 64] 3737 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV 3738 vpand ymm1, ymm1, ymm5 3739 vpackuswb ymm0, ymm0, ymm1 // mutates. 3740 vpermq ymm0, ymm0, 0xd8 3741 vpand ymm1, ymm0, ymm5 // U 3742 vpsrlw ymm0, ymm0, 8 // V 3743 vpackuswb ymm1, ymm1, ymm1 // mutates. 3744 vpackuswb ymm0, ymm0, ymm0 // mutates. 
3745 vpermq ymm1, ymm1, 0xd8 3746 vpermq ymm0, ymm0, 0xd8 3747 vextractf128 [edx], ymm1, 0 // U 3748 vextractf128 [edx + edi], ymm0, 0 // V 3749 lea edx, [edx + 16] 3750 sub ecx, 32 3751 jg convertloop 3752 3753 pop edi 3754 pop esi 3755 vzeroupper 3756 ret 3757 } 3758 } 3759 3760 __declspec(naked) 3761 void UYVYToUV422Row_AVX2(const uint8* src_uyvy, 3762 uint8* dst_u, uint8* dst_v, int pix) { 3763 __asm { 3764 push edi 3765 mov eax, [esp + 4 + 4] // src_yuy2 3766 mov edx, [esp + 4 + 8] // dst_u 3767 mov edi, [esp + 4 + 12] // dst_v 3768 mov ecx, [esp + 4 + 16] // pix 3769 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff 3770 vpsrlw ymm5, ymm5, 8 3771 sub edi, edx 3772 3773 convertloop: 3774 vmovdqu ymm0, [eax] 3775 vmovdqu ymm1, [eax + 32] 3776 lea eax, [eax + 64] 3777 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV 3778 vpand ymm1, ymm1, ymm5 3779 vpackuswb ymm0, ymm0, ymm1 // mutates. 3780 vpermq ymm0, ymm0, 0xd8 3781 vpand ymm1, ymm0, ymm5 // U 3782 vpsrlw ymm0, ymm0, 8 // V 3783 vpackuswb ymm1, ymm1, ymm1 // mutates. 3784 vpackuswb ymm0, ymm0, ymm0 // mutates. 3785 vpermq ymm1, ymm1, 0xd8 3786 vpermq ymm0, ymm0, 0xd8 3787 vextractf128 [edx], ymm1, 0 // U 3788 vextractf128 [edx + edi], ymm0, 0 // V 3789 lea edx, [edx + 16] 3790 sub ecx, 32 3791 jg convertloop 3792 3793 pop edi 3794 vzeroupper 3795 ret 3796 } 3797 } 3798 #endif // HAS_YUY2TOYROW_AVX2 3799 3800 #ifdef HAS_YUY2TOYROW_SSE2 3801 __declspec(naked) 3802 void YUY2ToYRow_SSE2(const uint8* src_yuy2, 3803 uint8* dst_y, int pix) { 3804 __asm { 3805 mov eax, [esp + 4] // src_yuy2 3806 mov edx, [esp + 8] // dst_y 3807 mov ecx, [esp + 12] // pix 3808 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3809 psrlw xmm5, 8 3810 3811 convertloop: 3812 movdqu xmm0, [eax] 3813 movdqu xmm1, [eax + 16] 3814 lea eax, [eax + 32] 3815 pand xmm0, xmm5 // even bytes are Y 3816 pand xmm1, xmm5 3817 packuswb xmm0, xmm1 3818 movdqu [edx], xmm0 3819 lea edx, [edx + 16] 3820 sub ecx, 16 3821 jg convertloop 3822 ret 3823 } 3824 } 3825 3826 __declspec(naked) 3827 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 3828 uint8* dst_u, uint8* dst_v, int pix) { 3829 __asm { 3830 push esi 3831 push edi 3832 mov eax, [esp + 8 + 4] // src_yuy2 3833 mov esi, [esp + 8 + 8] // stride_yuy2 3834 mov edx, [esp + 8 + 12] // dst_u 3835 mov edi, [esp + 8 + 16] // dst_v 3836 mov ecx, [esp + 8 + 20] // pix 3837 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3838 psrlw xmm5, 8 3839 sub edi, edx 3840 3841 convertloop: 3842 movdqu xmm0, [eax] 3843 movdqu xmm1, [eax + 16] 3844 movdqu xmm2, [eax + esi] 3845 movdqu xmm3, [eax + esi + 16] 3846 lea eax, [eax + 32] 3847 pavgb xmm0, xmm2 3848 pavgb xmm1, xmm3 3849 psrlw xmm0, 8 // YUYV -> UVUV 3850 psrlw xmm1, 8 3851 packuswb xmm0, xmm1 3852 movdqa xmm1, xmm0 3853 pand xmm0, xmm5 // U 3854 packuswb xmm0, xmm0 3855 psrlw xmm1, 8 // V 3856 packuswb xmm1, xmm1 3857 movq qword ptr [edx], xmm0 3858 movq qword ptr [edx + edi], xmm1 3859 lea edx, [edx + 8] 3860 sub ecx, 16 3861 jg convertloop 3862 3863 pop edi 3864 pop esi 3865 ret 3866 } 3867 } 3868 3869 __declspec(naked) 3870 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 3871 uint8* dst_u, uint8* dst_v, int pix) { 3872 __asm { 3873 push edi 3874 mov eax, [esp + 4 + 4] // src_yuy2 3875 mov edx, [esp + 4 + 8] // dst_u 3876 mov edi, [esp + 4 + 12] // dst_v 3877 mov ecx, [esp + 4 + 16] // pix 3878 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3879 psrlw xmm5, 8 3880 sub edi, edx 3881 3882 convertloop: 3883 movdqu xmm0, [eax] 3884 movdqu xmm1, [eax + 16] 3885 lea eax, [eax + 32] 3886 
psrlw xmm0, 8 // YUYV -> UVUV 3887 psrlw xmm1, 8 3888 packuswb xmm0, xmm1 3889 movdqa xmm1, xmm0 3890 pand xmm0, xmm5 // U 3891 packuswb xmm0, xmm0 3892 psrlw xmm1, 8 // V 3893 packuswb xmm1, xmm1 3894 movq qword ptr [edx], xmm0 3895 movq qword ptr [edx + edi], xmm1 3896 lea edx, [edx + 8] 3897 sub ecx, 16 3898 jg convertloop 3899 3900 pop edi 3901 ret 3902 } 3903 } 3904 3905 __declspec(naked) 3906 void UYVYToYRow_SSE2(const uint8* src_uyvy, 3907 uint8* dst_y, int pix) { 3908 __asm { 3909 mov eax, [esp + 4] // src_uyvy 3910 mov edx, [esp + 8] // dst_y 3911 mov ecx, [esp + 12] // pix 3912 3913 convertloop: 3914 movdqu xmm0, [eax] 3915 movdqu xmm1, [eax + 16] 3916 lea eax, [eax + 32] 3917 psrlw xmm0, 8 // odd bytes are Y 3918 psrlw xmm1, 8 3919 packuswb xmm0, xmm1 3920 movdqu [edx], xmm0 3921 lea edx, [edx + 16] 3922 sub ecx, 16 3923 jg convertloop 3924 ret 3925 } 3926 } 3927 3928 __declspec(naked) 3929 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, 3930 uint8* dst_u, uint8* dst_v, int pix) { 3931 __asm { 3932 push esi 3933 push edi 3934 mov eax, [esp + 8 + 4] // src_yuy2 3935 mov esi, [esp + 8 + 8] // stride_yuy2 3936 mov edx, [esp + 8 + 12] // dst_u 3937 mov edi, [esp + 8 + 16] // dst_v 3938 mov ecx, [esp + 8 + 20] // pix 3939 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3940 psrlw xmm5, 8 3941 sub edi, edx 3942 3943 convertloop: 3944 movdqu xmm0, [eax] 3945 movdqu xmm1, [eax + 16] 3946 movdqu xmm2, [eax + esi] 3947 movdqu xmm3, [eax + esi + 16] 3948 lea eax, [eax + 32] 3949 pavgb xmm0, xmm2 3950 pavgb xmm1, xmm3 3951 pand xmm0, xmm5 // UYVY -> UVUV 3952 pand xmm1, xmm5 3953 packuswb xmm0, xmm1 3954 movdqa xmm1, xmm0 3955 pand xmm0, xmm5 // U 3956 packuswb xmm0, xmm0 3957 psrlw xmm1, 8 // V 3958 packuswb xmm1, xmm1 3959 movq qword ptr [edx], xmm0 3960 movq qword ptr [edx + edi], xmm1 3961 lea edx, [edx + 8] 3962 sub ecx, 16 3963 jg convertloop 3964 3965 pop edi 3966 pop esi 3967 ret 3968 } 3969 } 3970 3971 __declspec(naked) 3972 void UYVYToUV422Row_SSE2(const uint8* src_uyvy, 3973 uint8* dst_u, uint8* dst_v, int pix) { 3974 __asm { 3975 push edi 3976 mov eax, [esp + 4 + 4] // src_yuy2 3977 mov edx, [esp + 4 + 8] // dst_u 3978 mov edi, [esp + 4 + 12] // dst_v 3979 mov ecx, [esp + 4 + 16] // pix 3980 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 3981 psrlw xmm5, 8 3982 sub edi, edx 3983 3984 convertloop: 3985 movdqu xmm0, [eax] 3986 movdqu xmm1, [eax + 16] 3987 lea eax, [eax + 32] 3988 pand xmm0, xmm5 // UYVY -> UVUV 3989 pand xmm1, xmm5 3990 packuswb xmm0, xmm1 3991 movdqa xmm1, xmm0 3992 pand xmm0, xmm5 // U 3993 packuswb xmm0, xmm0 3994 psrlw xmm1, 8 // V 3995 packuswb xmm1, xmm1 3996 movq qword ptr [edx], xmm0 3997 movq qword ptr [edx + edi], xmm1 3998 lea edx, [edx + 8] 3999 sub ecx, 16 4000 jg convertloop 4001 4002 pop edi 4003 ret 4004 } 4005 } 4006 #endif // HAS_YUY2TOYROW_SSE2 4007 4008 #ifdef HAS_ARGBBLENDROW_SSE2 4009 // Blend 8 pixels at a time. 
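// The blend below is "src over dst" with an 8-bit approximation: each dst
// channel is scaled by (256 - src alpha), shifted down, and added with
// saturation to the src channel, and the output alpha is forced to 255.
// Per-pixel scalar sketch (illustrative only):
static void BlendPixelSketch_C(const uint8* src_argb, const uint8* dst_argb,
                               uint8* out_argb) {
  int a = src_argb[3];
  int i;
  for (i = 0; i < 3; ++i) {  // B, G, R
    int v = src_argb[i] + ((dst_argb[i] * (256 - a)) >> 8);
    out_argb[i] = (uint8)(v > 255 ? 255 : v);
  }
  out_argb[3] = 255;  // the asm ORs in 0xff000000
}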
4010 __declspec(naked) 4011 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4012 uint8* dst_argb, int width) { 4013 __asm { 4014 push esi 4015 mov eax, [esp + 4 + 4] // src_argb0 4016 mov esi, [esp + 4 + 8] // src_argb1 4017 mov edx, [esp + 4 + 12] // dst_argb 4018 mov ecx, [esp + 4 + 16] // width 4019 pcmpeqb xmm7, xmm7 // generate constant 1 4020 psrlw xmm7, 15 4021 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 4022 psrlw xmm6, 8 4023 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 4024 psllw xmm5, 8 4025 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4026 pslld xmm4, 24 4027 sub ecx, 4 4028 jl convertloop4b // less than 4 pixels? 4029 4030 // 4 pixel loop. 4031 convertloop4: 4032 movdqu xmm3, [eax] // src argb 4033 lea eax, [eax + 16] 4034 movdqa xmm0, xmm3 // src argb 4035 pxor xmm3, xmm4 // ~alpha 4036 movdqu xmm2, [esi] // _r_b 4037 psrlw xmm3, 8 // alpha 4038 pshufhw xmm3, xmm3, 0F5h // 8 alpha words 4039 pshuflw xmm3, xmm3, 0F5h 4040 pand xmm2, xmm6 // _r_b 4041 paddw xmm3, xmm7 // 256 - alpha 4042 pmullw xmm2, xmm3 // _r_b * alpha 4043 movdqu xmm1, [esi] // _a_g 4044 lea esi, [esi + 16] 4045 psrlw xmm1, 8 // _a_g 4046 por xmm0, xmm4 // set alpha to 255 4047 pmullw xmm1, xmm3 // _a_g * alpha 4048 psrlw xmm2, 8 // _r_b convert to 8 bits again 4049 paddusb xmm0, xmm2 // + src argb 4050 pand xmm1, xmm5 // a_g_ convert to 8 bits again 4051 paddusb xmm0, xmm1 // + src argb 4052 movdqu [edx], xmm0 4053 lea edx, [edx + 16] 4054 sub ecx, 4 4055 jge convertloop4 4056 4057 convertloop4b: 4058 add ecx, 4 - 1 4059 jl convertloop1b 4060 4061 // 1 pixel loop. 4062 convertloop1: 4063 movd xmm3, [eax] // src argb 4064 lea eax, [eax + 4] 4065 movdqa xmm0, xmm3 // src argb 4066 pxor xmm3, xmm4 // ~alpha 4067 movd xmm2, [esi] // _r_b 4068 psrlw xmm3, 8 // alpha 4069 pshufhw xmm3, xmm3, 0F5h // 8 alpha words 4070 pshuflw xmm3, xmm3, 0F5h 4071 pand xmm2, xmm6 // _r_b 4072 paddw xmm3, xmm7 // 256 - alpha 4073 pmullw xmm2, xmm3 // _r_b * alpha 4074 movd xmm1, [esi] // _a_g 4075 lea esi, [esi + 4] 4076 psrlw xmm1, 8 // _a_g 4077 por xmm0, xmm4 // set alpha to 255 4078 pmullw xmm1, xmm3 // _a_g * alpha 4079 psrlw xmm2, 8 // _r_b convert to 8 bits again 4080 paddusb xmm0, xmm2 // + src argb 4081 pand xmm1, xmm5 // a_g_ convert to 8 bits again 4082 paddusb xmm0, xmm1 // + src argb 4083 movd [edx], xmm0 4084 lea edx, [edx + 4] 4085 sub ecx, 1 4086 jge convertloop1 4087 4088 convertloop1b: 4089 pop esi 4090 ret 4091 } 4092 } 4093 #endif // HAS_ARGBBLENDROW_SSE2 4094 4095 #ifdef HAS_ARGBBLENDROW_SSSE3 4096 // Shuffle table for isolating alpha. 4097 static const uvec8 kShuffleAlpha = { 4098 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 4099 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 4100 }; 4101 // Same as SSE2, but replaces: 4102 // psrlw xmm3, 8 // alpha 4103 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words 4104 // pshuflw xmm3, xmm3, 0F5h 4105 // with.. 4106 // pshufb xmm3, kShuffleAlpha // alpha 4107 // Blend 8 pixels at a time. 
4108 4109 __declspec(naked) 4110 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 4111 uint8* dst_argb, int width) { 4112 __asm { 4113 push esi 4114 mov eax, [esp + 4 + 4] // src_argb0 4115 mov esi, [esp + 4 + 8] // src_argb1 4116 mov edx, [esp + 4 + 12] // dst_argb 4117 mov ecx, [esp + 4 + 16] // width 4118 pcmpeqb xmm7, xmm7 // generate constant 0x0001 4119 psrlw xmm7, 15 4120 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 4121 psrlw xmm6, 8 4122 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 4123 psllw xmm5, 8 4124 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4125 pslld xmm4, 24 4126 sub ecx, 4 4127 jl convertloop4b // less than 4 pixels? 4128 4129 // 4 pixel loop. 4130 convertloop4: 4131 movdqu xmm3, [eax] // src argb 4132 lea eax, [eax + 16] 4133 movdqa xmm0, xmm3 // src argb 4134 pxor xmm3, xmm4 // ~alpha 4135 movdqu xmm2, [esi] // _r_b 4136 pshufb xmm3, kShuffleAlpha // alpha 4137 pand xmm2, xmm6 // _r_b 4138 paddw xmm3, xmm7 // 256 - alpha 4139 pmullw xmm2, xmm3 // _r_b * alpha 4140 movdqu xmm1, [esi] // _a_g 4141 lea esi, [esi + 16] 4142 psrlw xmm1, 8 // _a_g 4143 por xmm0, xmm4 // set alpha to 255 4144 pmullw xmm1, xmm3 // _a_g * alpha 4145 psrlw xmm2, 8 // _r_b convert to 8 bits again 4146 paddusb xmm0, xmm2 // + src argb 4147 pand xmm1, xmm5 // a_g_ convert to 8 bits again 4148 paddusb xmm0, xmm1 // + src argb 4149 movdqu [edx], xmm0 4150 lea edx, [edx + 16] 4151 sub ecx, 4 4152 jge convertloop4 4153 4154 convertloop4b: 4155 add ecx, 4 - 1 4156 jl convertloop1b 4157 4158 // 1 pixel loop. 4159 convertloop1: 4160 movd xmm3, [eax] // src argb 4161 lea eax, [eax + 4] 4162 movdqa xmm0, xmm3 // src argb 4163 pxor xmm3, xmm4 // ~alpha 4164 movd xmm2, [esi] // _r_b 4165 pshufb xmm3, kShuffleAlpha // alpha 4166 pand xmm2, xmm6 // _r_b 4167 paddw xmm3, xmm7 // 256 - alpha 4168 pmullw xmm2, xmm3 // _r_b * alpha 4169 movd xmm1, [esi] // _a_g 4170 lea esi, [esi + 4] 4171 psrlw xmm1, 8 // _a_g 4172 por xmm0, xmm4 // set alpha to 255 4173 pmullw xmm1, xmm3 // _a_g * alpha 4174 psrlw xmm2, 8 // _r_b convert to 8 bits again 4175 paddusb xmm0, xmm2 // + src argb 4176 pand xmm1, xmm5 // a_g_ convert to 8 bits again 4177 paddusb xmm0, xmm1 // + src argb 4178 movd [edx], xmm0 4179 lea edx, [edx + 4] 4180 sub ecx, 1 4181 jge convertloop1 4182 4183 convertloop1b: 4184 pop esi 4185 ret 4186 } 4187 } 4188 #endif // HAS_ARGBBLENDROW_SSSE3 4189 4190 #ifdef HAS_ARGBATTENUATEROW_SSE2 4191 // Attenuate 4 pixels at a time. 
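// Attenuation premultiplies each color channel by the pixel's alpha while
// leaving alpha itself unchanged. The SIMD code approximates the divide by
// 255 by duplicating bytes into words (value * 257), keeping the high 16
// bits of the product, then shifting right by 8. Scalar sketch of the intent
// (illustrative approximation, not the exact bit pattern of the asm):
static void AttenuatePixelSketch_C(const uint8* src_argb, uint8* dst_argb) {
  int a = src_argb[3];
  dst_argb[0] = (uint8)((src_argb[0] * a) >> 8);  // B
  dst_argb[1] = (uint8)((src_argb[1] * a) >> 8);  // G
  dst_argb[2] = (uint8)((src_argb[2] * a) >> 8);  // R
  dst_argb[3] = (uint8)a;                         // alpha preserved
}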
4192 __declspec(naked) 4193 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { 4194 __asm { 4195 mov eax, [esp + 4] // src_argb0 4196 mov edx, [esp + 8] // dst_argb 4197 mov ecx, [esp + 12] // width 4198 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 4199 pslld xmm4, 24 4200 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff 4201 psrld xmm5, 8 4202 4203 convertloop: 4204 movdqu xmm0, [eax] // read 4 pixels 4205 punpcklbw xmm0, xmm0 // first 2 4206 pshufhw xmm2, xmm0, 0FFh // 8 alpha words 4207 pshuflw xmm2, xmm2, 0FFh 4208 pmulhuw xmm0, xmm2 // rgb * a 4209 movdqu xmm1, [eax] // read 4 pixels 4210 punpckhbw xmm1, xmm1 // next 2 pixels 4211 pshufhw xmm2, xmm1, 0FFh // 8 alpha words 4212 pshuflw xmm2, xmm2, 0FFh 4213 pmulhuw xmm1, xmm2 // rgb * a 4214 movdqu xmm2, [eax] // alphas 4215 lea eax, [eax + 16] 4216 psrlw xmm0, 8 4217 pand xmm2, xmm4 4218 psrlw xmm1, 8 4219 packuswb xmm0, xmm1 4220 pand xmm0, xmm5 // keep original alphas 4221 por xmm0, xmm2 4222 movdqu [edx], xmm0 4223 lea edx, [edx + 16] 4224 sub ecx, 4 4225 jg convertloop 4226 4227 ret 4228 } 4229 } 4230 #endif // HAS_ARGBATTENUATEROW_SSE2 4231 4232 #ifdef HAS_ARGBATTENUATEROW_SSSE3 4233 // Shuffle table duplicating alpha. 4234 static const uvec8 kShuffleAlpha0 = { 4235 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 4236 }; 4237 static const uvec8 kShuffleAlpha1 = { 4238 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 4239 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 4240 }; 4241 __declspec(naked) 4242 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 4243 __asm { 4244 mov eax, [esp + 4] // src_argb0 4245 mov edx, [esp + 8] // dst_argb 4246 mov ecx, [esp + 12] // width 4247 pcmpeqb xmm3, xmm3 // generate mask 0xff000000 4248 pslld xmm3, 24 4249 movdqa xmm4, kShuffleAlpha0 4250 movdqa xmm5, kShuffleAlpha1 4251 4252 convertloop: 4253 movdqu xmm0, [eax] // read 4 pixels 4254 pshufb xmm0, xmm4 // isolate first 2 alphas 4255 movdqu xmm1, [eax] // read 4 pixels 4256 punpcklbw xmm1, xmm1 // first 2 pixel rgbs 4257 pmulhuw xmm0, xmm1 // rgb * a 4258 movdqu xmm1, [eax] // read 4 pixels 4259 pshufb xmm1, xmm5 // isolate next 2 alphas 4260 movdqu xmm2, [eax] // read 4 pixels 4261 punpckhbw xmm2, xmm2 // next 2 pixel rgbs 4262 pmulhuw xmm1, xmm2 // rgb * a 4263 movdqu xmm2, [eax] // mask original alpha 4264 lea eax, [eax + 16] 4265 pand xmm2, xmm3 4266 psrlw xmm0, 8 4267 psrlw xmm1, 8 4268 packuswb xmm0, xmm1 4269 por xmm0, xmm2 // copy original alpha 4270 movdqu [edx], xmm0 4271 lea edx, [edx + 16] 4272 sub ecx, 4 4273 jg convertloop 4274 4275 ret 4276 } 4277 } 4278 #endif // HAS_ARGBATTENUATEROW_SSSE3 4279 4280 #ifdef HAS_ARGBATTENUATEROW_AVX2 4281 // Shuffle table duplicating alpha. 4282 static const uvec8 kShuffleAlpha_AVX2 = { 4283 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u 4284 }; 4285 __declspec(naked) 4286 void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { 4287 __asm { 4288 mov eax, [esp + 4] // src_argb0 4289 mov edx, [esp + 8] // dst_argb 4290 mov ecx, [esp + 12] // width 4291 sub edx, eax 4292 vbroadcastf128 ymm4,kShuffleAlpha_AVX2 4293 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 4294 vpslld ymm5, ymm5, 24 4295 4296 convertloop: 4297 vmovdqu ymm6, [eax] // read 8 pixels. 4298 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 4299 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 
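    // The steps that follow broadcast each pixel's alpha into its channel
    // words, multiply in 8.8 fixed point (keeping the high 16 bits), shift
    // back down to bytes, repack, and OR the original alpha back in.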
4300 vpshufb ymm2, ymm0, ymm4 // low 4 alphas 4301 vpshufb ymm3, ymm1, ymm4 // high 4 alphas 4302 vpmulhuw ymm0, ymm0, ymm2 // rgb * a 4303 vpmulhuw ymm1, ymm1, ymm3 // rgb * a 4304 vpand ymm6, ymm6, ymm5 // isolate alpha 4305 vpsrlw ymm0, ymm0, 8 4306 vpsrlw ymm1, ymm1, 8 4307 vpackuswb ymm0, ymm0, ymm1 // unmutated. 4308 vpor ymm0, ymm0, ymm6 // copy original alpha 4309 vmovdqu [eax + edx], ymm0 4310 lea eax, [eax + 32] 4311 sub ecx, 8 4312 jg convertloop 4313 4314 vzeroupper 4315 ret 4316 } 4317 } 4318 #endif // HAS_ARGBATTENUATEROW_AVX2 4319 4320 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 4321 // Unattenuate 4 pixels at a time. 4322 __declspec(naked) 4323 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, 4324 int width) { 4325 __asm { 4326 push esi 4327 push edi 4328 mov eax, [esp + 8 + 4] // src_argb0 4329 mov edx, [esp + 8 + 8] // dst_argb 4330 mov ecx, [esp + 8 + 12] // width 4331 4332 convertloop: 4333 movdqu xmm0, [eax] // read 4 pixels 4334 movzx esi, byte ptr [eax + 3] // first alpha 4335 movzx edi, byte ptr [eax + 7] // second alpha 4336 punpcklbw xmm0, xmm0 // first 2 4337 movd xmm2, dword ptr fixed_invtbl8[esi * 4] 4338 movd xmm3, dword ptr fixed_invtbl8[edi * 4] 4339 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a 4340 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words 4341 movlhps xmm2, xmm3 4342 pmulhuw xmm0, xmm2 // rgb * a 4343 4344 movdqu xmm1, [eax] // read 4 pixels 4345 movzx esi, byte ptr [eax + 11] // third alpha 4346 movzx edi, byte ptr [eax + 15] // forth alpha 4347 punpckhbw xmm1, xmm1 // next 2 4348 movd xmm2, dword ptr fixed_invtbl8[esi * 4] 4349 movd xmm3, dword ptr fixed_invtbl8[edi * 4] 4350 pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words 4351 pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words 4352 movlhps xmm2, xmm3 4353 pmulhuw xmm1, xmm2 // rgb * a 4354 lea eax, [eax + 16] 4355 4356 packuswb xmm0, xmm1 4357 movdqu [edx], xmm0 4358 lea edx, [edx + 16] 4359 sub ecx, 4 4360 jg convertloop 4361 pop edi 4362 pop esi 4363 ret 4364 } 4365 } 4366 #endif // HAS_ARGBUNATTENUATEROW_SSE2 4367 4368 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 4369 // Shuffle table duplicating alpha. 4370 static const uvec8 kUnattenShuffleAlpha_AVX2 = { 4371 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u 4372 }; 4373 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. 4374 // USE_GATHER is not on by default, due to being a slow instruction. 4375 #ifdef USE_GATHER 4376 __declspec(naked) 4377 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, 4378 int width) { 4379 __asm { 4380 mov eax, [esp + 4] // src_argb0 4381 mov edx, [esp + 8] // dst_argb 4382 mov ecx, [esp + 12] // width 4383 sub edx, eax 4384 vbroadcastf128 ymm4, kUnattenShuffleAlpha_AVX2 4385 4386 convertloop: 4387 vmovdqu ymm6, [eax] // read 8 pixels. 4388 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. 4389 vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. 4390 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 4391 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 4392 vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a 4393 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a 4394 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. 4395 vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 
1, a, a, a 4396 vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas 4397 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia 4398 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia 4399 vpackuswb ymm0, ymm0, ymm1 // unmutated. 4400 vmovdqu [eax + edx], ymm0 4401 lea eax, [eax + 32] 4402 sub ecx, 8 4403 jg convertloop 4404 4405 vzeroupper 4406 ret 4407 } 4408 } 4409 #else // USE_GATHER 4410 __declspec(naked) 4411 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, 4412 int width) { 4413 __asm { 4414 4415 mov eax, [esp + 4] // src_argb0 4416 mov edx, [esp + 8] // dst_argb 4417 mov ecx, [esp + 12] // width 4418 sub edx, eax 4419 vbroadcastf128 ymm5, kUnattenShuffleAlpha_AVX2 4420 4421 push esi 4422 push edi 4423 4424 convertloop: 4425 // replace VPGATHER 4426 movzx esi, byte ptr [eax + 3] // alpha0 4427 movzx edi, byte ptr [eax + 7] // alpha1 4428 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] 4429 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] 4430 movzx esi, byte ptr [eax + 11] // alpha2 4431 movzx edi, byte ptr [eax + 15] // alpha3 4432 vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] 4433 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] 4434 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] 4435 movzx esi, byte ptr [eax + 19] // alpha4 4436 movzx edi, byte ptr [eax + 23] // alpha5 4437 vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] 4438 vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4] 4439 vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] 4440 movzx esi, byte ptr [eax + 27] // alpha6 4441 movzx edi, byte ptr [eax + 31] // alpha7 4442 vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] 4443 vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] 4444 vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] 4445 vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] 4446 vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] 4447 vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] 4448 vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] 4449 // end of VPGATHER 4450 4451 vmovdqu ymm6, [eax] // read 8 pixels. 4452 vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. 4453 vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. 4454 vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a 4455 vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. 4456 vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a 4457 vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas 4458 vpmulhuw ymm0, ymm0, ymm2 // rgb * ia 4459 vpmulhuw ymm1, ymm1, ymm3 // rgb * ia 4460 vpackuswb ymm0, ymm0, ymm1 // unmutated. 4461 vmovdqu [eax + edx], ymm0 4462 lea eax, [eax + 32] 4463 sub ecx, 8 4464 jg convertloop 4465 4466 pop edi 4467 pop esi 4468 vzeroupper 4469 ret 4470 } 4471 } 4472 #endif // USE_GATHER 4473 #endif // HAS_ARGBATTENUATEROW_AVX2 4474 4475 #ifdef HAS_ARGBGRAYROW_SSSE3 4476 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. 4477 __declspec(naked) 4478 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 4479 __asm { 4480 mov eax, [esp + 4] /* src_argb */ 4481 mov edx, [esp + 8] /* dst_argb */ 4482 mov ecx, [esp + 12] /* width */ 4483 movdqa xmm4, kARGBToYJ 4484 movdqa xmm5, kAddYJ64 4485 4486 convertloop: 4487 movdqu xmm0, [eax] // G 4488 movdqu xmm1, [eax + 16] 4489 pmaddubsw xmm0, xmm4 4490 pmaddubsw xmm1, xmm4 4491 phaddw xmm0, xmm1 4492 paddw xmm0, xmm5 // Add .5 for rounding. 
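    // The pmaddubsw/phaddw above summed B, G and R against the kARGBToYJ
    // weights (7-bit fixed point); the shift below yields one gray byte per
    // pixel, which is then replicated into B, G and R while alpha is kept.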
4493 psrlw xmm0, 7 4494 packuswb xmm0, xmm0 // 8 G bytes 4495 movdqu xmm2, [eax] // A 4496 movdqu xmm3, [eax + 16] 4497 lea eax, [eax + 32] 4498 psrld xmm2, 24 4499 psrld xmm3, 24 4500 packuswb xmm2, xmm3 4501 packuswb xmm2, xmm2 // 8 A bytes 4502 movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA 4503 punpcklbw xmm0, xmm0 // 8 GG words 4504 punpcklbw xmm3, xmm2 // 8 GA words 4505 movdqa xmm1, xmm0 4506 punpcklwd xmm0, xmm3 // GGGA first 4 4507 punpckhwd xmm1, xmm3 // GGGA next 4 4508 movdqu [edx], xmm0 4509 movdqu [edx + 16], xmm1 4510 lea edx, [edx + 32] 4511 sub ecx, 8 4512 jg convertloop 4513 ret 4514 } 4515 } 4516 #endif // HAS_ARGBGRAYROW_SSSE3 4517 4518 #ifdef HAS_ARGBSEPIAROW_SSSE3 4519 // b = (r * 35 + g * 68 + b * 17) >> 7 4520 // g = (r * 45 + g * 88 + b * 22) >> 7 4521 // r = (r * 50 + g * 98 + b * 24) >> 7 4522 // Constant for ARGB color to sepia tone. 4523 static const vec8 kARGBToSepiaB = { 4524 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 4525 }; 4526 4527 static const vec8 kARGBToSepiaG = { 4528 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 4529 }; 4530 4531 static const vec8 kARGBToSepiaR = { 4532 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 4533 }; 4534 4535 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 4536 __declspec(naked) 4537 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { 4538 __asm { 4539 mov eax, [esp + 4] /* dst_argb */ 4540 mov ecx, [esp + 8] /* width */ 4541 movdqa xmm2, kARGBToSepiaB 4542 movdqa xmm3, kARGBToSepiaG 4543 movdqa xmm4, kARGBToSepiaR 4544 4545 convertloop: 4546 movdqu xmm0, [eax] // B 4547 movdqu xmm6, [eax + 16] 4548 pmaddubsw xmm0, xmm2 4549 pmaddubsw xmm6, xmm2 4550 phaddw xmm0, xmm6 4551 psrlw xmm0, 7 4552 packuswb xmm0, xmm0 // 8 B values 4553 movdqu xmm5, [eax] // G 4554 movdqu xmm1, [eax + 16] 4555 pmaddubsw xmm5, xmm3 4556 pmaddubsw xmm1, xmm3 4557 phaddw xmm5, xmm1 4558 psrlw xmm5, 7 4559 packuswb xmm5, xmm5 // 8 G values 4560 punpcklbw xmm0, xmm5 // 8 BG values 4561 movdqu xmm5, [eax] // R 4562 movdqu xmm1, [eax + 16] 4563 pmaddubsw xmm5, xmm4 4564 pmaddubsw xmm1, xmm4 4565 phaddw xmm5, xmm1 4566 psrlw xmm5, 7 4567 packuswb xmm5, xmm5 // 8 R values 4568 movdqu xmm6, [eax] // A 4569 movdqu xmm1, [eax + 16] 4570 psrld xmm6, 24 4571 psrld xmm1, 24 4572 packuswb xmm6, xmm1 4573 packuswb xmm6, xmm6 // 8 A values 4574 punpcklbw xmm5, xmm6 // 8 RA values 4575 movdqa xmm1, xmm0 // Weave BG, RA together 4576 punpcklwd xmm0, xmm5 // BGRA first 4 4577 punpckhwd xmm1, xmm5 // BGRA next 4 4578 movdqu [eax], xmm0 4579 movdqu [eax + 16], xmm1 4580 lea eax, [eax + 32] 4581 sub ecx, 8 4582 jg convertloop 4583 ret 4584 } 4585 } 4586 #endif // HAS_ARGBSEPIAROW_SSSE3 4587 4588 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 4589 // Tranform 8 ARGB pixels (32 bytes) with color matrix. 4590 // Same as Sepia except matrix is provided. 4591 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R 4592 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
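// For reference, a minimal scalar sketch of the per-channel matrix math used
// below (illustrative only; the helper name is ours): each output channel is
// a signed sum of the input B, G, R, A bytes weighted in 1/64 units, then
// clamped to [0, 255]. The sepia row above follows the same pattern with
// fixed coefficients and a 7-bit shift.
static __inline uint8 ColorMatrixChannelSketch(const uint8* bgra,
                                               const int8* coeff) {
  int sum = (bgra[0] * coeff[0] + bgra[1] * coeff[1] +
             bgra[2] * coeff[2] + bgra[3] * coeff[3]) >> 6;
  return (uint8)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}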
4593 __declspec(naked) 4594 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 4595 const int8* matrix_argb, int width) { 4596 __asm { 4597 mov eax, [esp + 4] /* src_argb */ 4598 mov edx, [esp + 8] /* dst_argb */ 4599 mov ecx, [esp + 12] /* matrix_argb */ 4600 movdqu xmm5, [ecx] 4601 pshufd xmm2, xmm5, 0x00 4602 pshufd xmm3, xmm5, 0x55 4603 pshufd xmm4, xmm5, 0xaa 4604 pshufd xmm5, xmm5, 0xff 4605 mov ecx, [esp + 16] /* width */ 4606 4607 convertloop: 4608 movdqu xmm0, [eax] // B 4609 movdqu xmm7, [eax + 16] 4610 pmaddubsw xmm0, xmm2 4611 pmaddubsw xmm7, xmm2 4612 movdqu xmm6, [eax] // G 4613 movdqu xmm1, [eax + 16] 4614 pmaddubsw xmm6, xmm3 4615 pmaddubsw xmm1, xmm3 4616 phaddsw xmm0, xmm7 // B 4617 phaddsw xmm6, xmm1 // G 4618 psraw xmm0, 6 // B 4619 psraw xmm6, 6 // G 4620 packuswb xmm0, xmm0 // 8 B values 4621 packuswb xmm6, xmm6 // 8 G values 4622 punpcklbw xmm0, xmm6 // 8 BG values 4623 movdqu xmm1, [eax] // R 4624 movdqu xmm7, [eax + 16] 4625 pmaddubsw xmm1, xmm4 4626 pmaddubsw xmm7, xmm4 4627 phaddsw xmm1, xmm7 // R 4628 movdqu xmm6, [eax] // A 4629 movdqu xmm7, [eax + 16] 4630 pmaddubsw xmm6, xmm5 4631 pmaddubsw xmm7, xmm5 4632 phaddsw xmm6, xmm7 // A 4633 psraw xmm1, 6 // R 4634 psraw xmm6, 6 // A 4635 packuswb xmm1, xmm1 // 8 R values 4636 packuswb xmm6, xmm6 // 8 A values 4637 punpcklbw xmm1, xmm6 // 8 RA values 4638 movdqa xmm6, xmm0 // Weave BG, RA together 4639 punpcklwd xmm0, xmm1 // BGRA first 4 4640 punpckhwd xmm6, xmm1 // BGRA next 4 4641 movdqu [edx], xmm0 4642 movdqu [edx + 16], xmm6 4643 lea eax, [eax + 32] 4644 lea edx, [edx + 32] 4645 sub ecx, 8 4646 jg convertloop 4647 ret 4648 } 4649 } 4650 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 4651 4652 #ifdef HAS_ARGBQUANTIZEROW_SSE2 4653 // Quantize 4 ARGB pixels (16 bytes). 4654 __declspec(naked) 4655 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, 4656 int interval_offset, int width) { 4657 __asm { 4658 mov eax, [esp + 4] /* dst_argb */ 4659 movd xmm2, [esp + 8] /* scale */ 4660 movd xmm3, [esp + 12] /* interval_size */ 4661 movd xmm4, [esp + 16] /* interval_offset */ 4662 mov ecx, [esp + 20] /* width */ 4663 pshuflw xmm2, xmm2, 040h 4664 pshufd xmm2, xmm2, 044h 4665 pshuflw xmm3, xmm3, 040h 4666 pshufd xmm3, xmm3, 044h 4667 pshuflw xmm4, xmm4, 040h 4668 pshufd xmm4, xmm4, 044h 4669 pxor xmm5, xmm5 // constant 0 4670 pcmpeqb xmm6, xmm6 // generate mask 0xff000000 4671 pslld xmm6, 24 4672 4673 convertloop: 4674 movdqu xmm0, [eax] // read 4 pixels 4675 punpcklbw xmm0, xmm5 // first 2 pixels 4676 pmulhuw xmm0, xmm2 // pixel * scale >> 16 4677 movdqu xmm1, [eax] // read 4 pixels 4678 punpckhbw xmm1, xmm5 // next 2 pixels 4679 pmulhuw xmm1, xmm2 4680 pmullw xmm0, xmm3 // * interval_size 4681 movdqu xmm7, [eax] // read 4 pixels 4682 pmullw xmm1, xmm3 4683 pand xmm7, xmm6 // mask alpha 4684 paddw xmm0, xmm4 // + interval_size / 2 4685 paddw xmm1, xmm4 4686 packuswb xmm0, xmm1 4687 por xmm0, xmm7 4688 movdqu [eax], xmm0 4689 lea eax, [eax + 16] 4690 sub ecx, 4 4691 jg convertloop 4692 ret 4693 } 4694 } 4695 #endif // HAS_ARGBQUANTIZEROW_SSE2 4696 4697 #ifdef HAS_ARGBSHADEROW_SSE2 4698 // Shade 4 pixels at a time by specified value. 
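// For reference, a minimal scalar sketch of the shade math (illustrative
// only; the helper name is ours): each channel, alpha included, is scaled by
// the corresponding byte of 'value' in 8.8 fixed point, keeping the high
// byte, i.e. roughly c * v / 255.
static __inline uint32 ShadeChannelSketch(uint32 c, uint32 v) {
  return ((c | (c << 8)) * (v | (v << 8))) >> 24;
}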
4699 __declspec(naked) 4700 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, 4701 uint32 value) { 4702 __asm { 4703 mov eax, [esp + 4] // src_argb 4704 mov edx, [esp + 8] // dst_argb 4705 mov ecx, [esp + 12] // width 4706 movd xmm2, [esp + 16] // value 4707 punpcklbw xmm2, xmm2 4708 punpcklqdq xmm2, xmm2 4709 4710 convertloop: 4711 movdqu xmm0, [eax] // read 4 pixels 4712 lea eax, [eax + 16] 4713 movdqa xmm1, xmm0 4714 punpcklbw xmm0, xmm0 // first 2 4715 punpckhbw xmm1, xmm1 // next 2 4716 pmulhuw xmm0, xmm2 // argb * value 4717 pmulhuw xmm1, xmm2 // argb * value 4718 psrlw xmm0, 8 4719 psrlw xmm1, 8 4720 packuswb xmm0, xmm1 4721 movdqu [edx], xmm0 4722 lea edx, [edx + 16] 4723 sub ecx, 4 4724 jg convertloop 4725 4726 ret 4727 } 4728 } 4729 #endif // HAS_ARGBSHADEROW_SSE2 4730 4731 #ifdef HAS_ARGBMULTIPLYROW_SSE2 4732 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 4733 __declspec(naked) 4734 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4735 uint8* dst_argb, int width) { 4736 __asm { 4737 push esi 4738 mov eax, [esp + 4 + 4] // src_argb0 4739 mov esi, [esp + 4 + 8] // src_argb1 4740 mov edx, [esp + 4 + 12] // dst_argb 4741 mov ecx, [esp + 4 + 16] // width 4742 pxor xmm5, xmm5 // constant 0 4743 4744 convertloop: 4745 movdqu xmm0, [eax] // read 4 pixels from src_argb0 4746 movdqu xmm2, [esi] // read 4 pixels from src_argb1 4747 movdqu xmm1, xmm0 4748 movdqu xmm3, xmm2 4749 punpcklbw xmm0, xmm0 // first 2 4750 punpckhbw xmm1, xmm1 // next 2 4751 punpcklbw xmm2, xmm5 // first 2 4752 punpckhbw xmm3, xmm5 // next 2 4753 pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 4754 pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 4755 lea eax, [eax + 16] 4756 lea esi, [esi + 16] 4757 packuswb xmm0, xmm1 4758 movdqu [edx], xmm0 4759 lea edx, [edx + 16] 4760 sub ecx, 4 4761 jg convertloop 4762 4763 pop esi 4764 ret 4765 } 4766 } 4767 #endif // HAS_ARGBMULTIPLYROW_SSE2 4768 4769 #ifdef HAS_ARGBADDROW_SSE2 4770 // Add 2 rows of ARGB pixels together, 4 pixels at a time. 4771 // TODO(fbarchard): Port this to posix, neon and other math functions. 4772 __declspec(naked) 4773 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4774 uint8* dst_argb, int width) { 4775 __asm { 4776 push esi 4777 mov eax, [esp + 4 + 4] // src_argb0 4778 mov esi, [esp + 4 + 8] // src_argb1 4779 mov edx, [esp + 4 + 12] // dst_argb 4780 mov ecx, [esp + 4 + 16] // width 4781 4782 sub ecx, 4 4783 jl convertloop49 4784 4785 convertloop4: 4786 movdqu xmm0, [eax] // read 4 pixels from src_argb0 4787 lea eax, [eax + 16] 4788 movdqu xmm1, [esi] // read 4 pixels from src_argb1 4789 lea esi, [esi + 16] 4790 paddusb xmm0, xmm1 // src_argb0 + src_argb1 4791 movdqu [edx], xmm0 4792 lea edx, [edx + 16] 4793 sub ecx, 4 4794 jge convertloop4 4795 4796 convertloop49: 4797 add ecx, 4 - 1 4798 jl convertloop19 4799 4800 convertloop1: 4801 movd xmm0, [eax] // read 1 pixels from src_argb0 4802 lea eax, [eax + 4] 4803 movd xmm1, [esi] // read 1 pixels from src_argb1 4804 lea esi, [esi + 4] 4805 paddusb xmm0, xmm1 // src_argb0 + src_argb1 4806 movd [edx], xmm0 4807 lea edx, [edx + 4] 4808 sub ecx, 1 4809 jge convertloop1 4810 4811 convertloop19: 4812 pop esi 4813 ret 4814 } 4815 } 4816 #endif // HAS_ARGBADDROW_SSE2 4817 4818 #ifdef HAS_ARGBSUBTRACTROW_SSE2 4819 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. 
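// For reference, a minimal scalar sketch of this arithmetic family
// (illustrative only; the helper names are ours): the multiply rows
// approximate (a * b) / 255 per channel via 8.8 fixed point, while the add
// and subtract rows clamp to [0, 255] with unsigned saturation.
static __inline uint32 MultiplyChannelSketch(uint32 a, uint32 b) {
  return ((a | (a << 8)) * b) >> 16;  // same scaling as pmulhuw on widened a
}
static __inline uint32 AddChannelSketch(uint32 a, uint32 b) {
  uint32 sum = a + b;
  return sum > 255 ? 255 : sum;  // paddusb saturates high
}
static __inline uint32 SubtractChannelSketch(uint32 a, uint32 b) {
  return a > b ? a - b : 0;  // psubusb saturates at zero
}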
4820 __declspec(naked) 4821 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 4822 uint8* dst_argb, int width) { 4823 __asm { 4824 push esi 4825 mov eax, [esp + 4 + 4] // src_argb0 4826 mov esi, [esp + 4 + 8] // src_argb1 4827 mov edx, [esp + 4 + 12] // dst_argb 4828 mov ecx, [esp + 4 + 16] // width 4829 4830 convertloop: 4831 movdqu xmm0, [eax] // read 4 pixels from src_argb0 4832 lea eax, [eax + 16] 4833 movdqu xmm1, [esi] // read 4 pixels from src_argb1 4834 lea esi, [esi + 16] 4835 psubusb xmm0, xmm1 // src_argb0 - src_argb1 4836 movdqu [edx], xmm0 4837 lea edx, [edx + 16] 4838 sub ecx, 4 4839 jg convertloop 4840 4841 pop esi 4842 ret 4843 } 4844 } 4845 #endif // HAS_ARGBSUBTRACTROW_SSE2 4846 4847 #ifdef HAS_ARGBMULTIPLYROW_AVX2 4848 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. 4849 __declspec(naked) 4850 void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4851 uint8* dst_argb, int width) { 4852 __asm { 4853 push esi 4854 mov eax, [esp + 4 + 4] // src_argb0 4855 mov esi, [esp + 4 + 8] // src_argb1 4856 mov edx, [esp + 4 + 12] // dst_argb 4857 mov ecx, [esp + 4 + 16] // width 4858 vpxor ymm5, ymm5, ymm5 // constant 0 4859 4860 convertloop: 4861 vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 4862 lea eax, [eax + 32] 4863 vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 4864 lea esi, [esi + 32] 4865 vpunpcklbw ymm0, ymm1, ymm1 // low 4 4866 vpunpckhbw ymm1, ymm1, ymm1 // high 4 4867 vpunpcklbw ymm2, ymm3, ymm5 // low 4 4868 vpunpckhbw ymm3, ymm3, ymm5 // high 4 4869 vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 4870 vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 4871 vpackuswb ymm0, ymm0, ymm1 4872 vmovdqu [edx], ymm0 4873 lea edx, [edx + 32] 4874 sub ecx, 8 4875 jg convertloop 4876 4877 pop esi 4878 vzeroupper 4879 ret 4880 } 4881 } 4882 #endif // HAS_ARGBMULTIPLYROW_AVX2 4883 4884 #ifdef HAS_ARGBADDROW_AVX2 4885 // Add 2 rows of ARGB pixels together, 8 pixels at a time. 4886 __declspec(naked) 4887 void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4888 uint8* dst_argb, int width) { 4889 __asm { 4890 push esi 4891 mov eax, [esp + 4 + 4] // src_argb0 4892 mov esi, [esp + 4 + 8] // src_argb1 4893 mov edx, [esp + 4 + 12] // dst_argb 4894 mov ecx, [esp + 4 + 16] // width 4895 4896 convertloop: 4897 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 4898 lea eax, [eax + 32] 4899 vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 4900 lea esi, [esi + 32] 4901 vmovdqu [edx], ymm0 4902 lea edx, [edx + 32] 4903 sub ecx, 8 4904 jg convertloop 4905 4906 pop esi 4907 vzeroupper 4908 ret 4909 } 4910 } 4911 #endif // HAS_ARGBADDROW_AVX2 4912 4913 #ifdef HAS_ARGBSUBTRACTROW_AVX2 4914 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. 
4915 __declspec(naked) 4916 void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, 4917 uint8* dst_argb, int width) { 4918 __asm { 4919 push esi 4920 mov eax, [esp + 4 + 4] // src_argb0 4921 mov esi, [esp + 4 + 8] // src_argb1 4922 mov edx, [esp + 4 + 12] // dst_argb 4923 mov ecx, [esp + 4 + 16] // width 4924 4925 convertloop: 4926 vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 4927 lea eax, [eax + 32] 4928 vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 4929 lea esi, [esi + 32] 4930 vmovdqu [edx], ymm0 4931 lea edx, [edx + 32] 4932 sub ecx, 8 4933 jg convertloop 4934 4935 pop esi 4936 vzeroupper 4937 ret 4938 } 4939 } 4940 #endif // HAS_ARGBSUBTRACTROW_AVX2 4941 4942 #ifdef HAS_SOBELXROW_SSE2 4943 // SobelX as a matrix is 4944 // -1 0 1 4945 // -2 0 2 4946 // -1 0 1 4947 __declspec(naked) 4948 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, 4949 const uint8* src_y2, uint8* dst_sobelx, int width) { 4950 __asm { 4951 push esi 4952 push edi 4953 mov eax, [esp + 8 + 4] // src_y0 4954 mov esi, [esp + 8 + 8] // src_y1 4955 mov edi, [esp + 8 + 12] // src_y2 4956 mov edx, [esp + 8 + 16] // dst_sobelx 4957 mov ecx, [esp + 8 + 20] // width 4958 sub esi, eax 4959 sub edi, eax 4960 sub edx, eax 4961 pxor xmm5, xmm5 // constant 0 4962 4963 convertloop: 4964 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] 4965 movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] 4966 punpcklbw xmm0, xmm5 4967 punpcklbw xmm1, xmm5 4968 psubw xmm0, xmm1 4969 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] 4970 movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] 4971 punpcklbw xmm1, xmm5 4972 punpcklbw xmm2, xmm5 4973 psubw xmm1, xmm2 4974 movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] 4975 movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] 4976 punpcklbw xmm2, xmm5 4977 punpcklbw xmm3, xmm5 4978 psubw xmm2, xmm3 4979 paddw xmm0, xmm2 4980 paddw xmm0, xmm1 4981 paddw xmm0, xmm1 4982 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw 4983 psubw xmm1, xmm0 4984 pmaxsw xmm0, xmm1 4985 packuswb xmm0, xmm0 4986 movq qword ptr [eax + edx], xmm0 4987 lea eax, [eax + 8] 4988 sub ecx, 8 4989 jg convertloop 4990 4991 pop edi 4992 pop esi 4993 ret 4994 } 4995 } 4996 #endif // HAS_SOBELXROW_SSE2 4997 4998 #ifdef HAS_SOBELYROW_SSE2 4999 // SobelY as a matrix is 5000 // -1 -2 -1 5001 // 0 0 0 5002 // 1 2 1 5003 __declspec(naked) 5004 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, 5005 uint8* dst_sobely, int width) { 5006 __asm { 5007 push esi 5008 mov eax, [esp + 4 + 4] // src_y0 5009 mov esi, [esp + 4 + 8] // src_y1 5010 mov edx, [esp + 4 + 12] // dst_sobely 5011 mov ecx, [esp + 4 + 16] // width 5012 sub esi, eax 5013 sub edx, eax 5014 pxor xmm5, xmm5 // constant 0 5015 5016 convertloop: 5017 movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] 5018 movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] 5019 punpcklbw xmm0, xmm5 5020 punpcklbw xmm1, xmm5 5021 psubw xmm0, xmm1 5022 movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] 5023 movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] 5024 punpcklbw xmm1, xmm5 5025 punpcklbw xmm2, xmm5 5026 psubw xmm1, xmm2 5027 movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] 5028 movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] 5029 punpcklbw xmm2, xmm5 5030 punpcklbw xmm3, xmm5 5031 psubw xmm2, xmm3 5032 paddw xmm0, xmm2 5033 paddw xmm0, xmm1 5034 paddw xmm0, xmm1 5035 pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw 5036 psubw xmm1, xmm0 5037 pmaxsw xmm0, xmm1 5038 packuswb xmm0, xmm0 5039 movq qword ptr [eax + edx], xmm0 5040 lea eax, [eax + 8] 5041 sub ecx, 8 5042 jg convertloop 5043 5044 pop esi 5045 ret 5046 } 5047 } 5048 #endif // HAS_SOBELYROW_SSE2 5049 5050 #ifdef HAS_SOBELROW_SSE2 5051 // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 5052 // A = 255 5053 // R = Sobel 5054 // G = Sobel 5055 // B = Sobel 5056 __declspec(naked) 5057 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 5058 uint8* dst_argb, int width) { 5059 __asm { 5060 push esi 5061 mov eax, [esp + 4 + 4] // src_sobelx 5062 mov esi, [esp + 4 + 8] // src_sobely 5063 mov edx, [esp + 4 + 12] // dst_argb 5064 mov ecx, [esp + 4 + 16] // width 5065 sub esi, eax 5066 pcmpeqb xmm5, xmm5 // alpha 255 5067 pslld xmm5, 24 // 0xff000000 5068 5069 convertloop: 5070 movdqu xmm0, [eax] // read 16 pixels src_sobelx 5071 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely 5072 lea eax, [eax + 16] 5073 paddusb xmm0, xmm1 // sobel = sobelx + sobely 5074 movdqa xmm2, xmm0 // GG 5075 punpcklbw xmm2, xmm0 // First 8 5076 punpckhbw xmm0, xmm0 // Next 8 5077 movdqa xmm1, xmm2 // GGGG 5078 punpcklwd xmm1, xmm2 // First 4 5079 punpckhwd xmm2, xmm2 // Next 4 5080 por xmm1, xmm5 // GGGA 5081 por xmm2, xmm5 5082 movdqa xmm3, xmm0 // GGGG 5083 punpcklwd xmm3, xmm0 // Next 4 5084 punpckhwd xmm0, xmm0 // Last 4 5085 por xmm3, xmm5 // GGGA 5086 por xmm0, xmm5 5087 movdqu [edx], xmm1 5088 movdqu [edx + 16], xmm2 5089 movdqu [edx + 32], xmm3 5090 movdqu [edx + 48], xmm0 5091 lea edx, [edx + 64] 5092 sub ecx, 16 5093 jg convertloop 5094 5095 pop esi 5096 ret 5097 } 5098 } 5099 #endif // HAS_SOBELROW_SSE2 5100 5101 #ifdef HAS_SOBELTOPLANEROW_SSE2 5102 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 
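// For reference, a minimal scalar sketch of the Sobel math used by the rows
// in this group (illustrative only; the helper names are ours). SobelX and
// SobelY take the absolute value of the 3x3 convolution described above,
// clamped to a byte; the combined Sobel value is their saturated sum.
static __inline uint8 SobelXPixelSketch(const uint8* y0, const uint8* y1,
                                        const uint8* y2) {
  int sobel = (y0[0] - y0[2]) + 2 * (y1[0] - y1[2]) + (y2[0] - y2[2]);
  if (sobel < 0) sobel = -sobel;
  return (uint8)(sobel > 255 ? 255 : sobel);
}
static __inline uint8 SobelCombineSketch(uint8 sobelx, uint8 sobely) {
  int sum = sobelx + sobely;
  return (uint8)(sum > 255 ? 255 : sum);
}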
5103 __declspec(naked) 5104 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 5105 uint8* dst_y, int width) { 5106 __asm { 5107 push esi 5108 mov eax, [esp + 4 + 4] // src_sobelx 5109 mov esi, [esp + 4 + 8] // src_sobely 5110 mov edx, [esp + 4 + 12] // dst_argb 5111 mov ecx, [esp + 4 + 16] // width 5112 sub esi, eax 5113 5114 convertloop: 5115 movdqu xmm0, [eax] // read 16 pixels src_sobelx 5116 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely 5117 lea eax, [eax + 16] 5118 paddusb xmm0, xmm1 // sobel = sobelx + sobely 5119 movdqu [edx], xmm0 5120 lea edx, [edx + 16] 5121 sub ecx, 16 5122 jg convertloop 5123 5124 pop esi 5125 ret 5126 } 5127 } 5128 #endif // HAS_SOBELTOPLANEROW_SSE2 5129 5130 #ifdef HAS_SOBELXYROW_SSE2 5131 // Mixes Sobel X, Sobel Y and Sobel into ARGB. 5132 // A = 255 5133 // R = Sobel X 5134 // G = Sobel 5135 // B = Sobel Y 5136 __declspec(naked) 5137 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 5138 uint8* dst_argb, int width) { 5139 __asm { 5140 push esi 5141 mov eax, [esp + 4 + 4] // src_sobelx 5142 mov esi, [esp + 4 + 8] // src_sobely 5143 mov edx, [esp + 4 + 12] // dst_argb 5144 mov ecx, [esp + 4 + 16] // width 5145 sub esi, eax 5146 pcmpeqb xmm5, xmm5 // alpha 255 5147 5148 convertloop: 5149 movdqu xmm0, [eax] // read 16 pixels src_sobelx 5150 movdqu xmm1, [eax + esi] // read 16 pixels src_sobely 5151 lea eax, [eax + 16] 5152 movdqa xmm2, xmm0 5153 paddusb xmm2, xmm1 // sobel = sobelx + sobely 5154 movdqa xmm3, xmm0 // XA 5155 punpcklbw xmm3, xmm5 5156 punpckhbw xmm0, xmm5 5157 movdqa xmm4, xmm1 // YS 5158 punpcklbw xmm4, xmm2 5159 punpckhbw xmm1, xmm2 5160 movdqa xmm6, xmm4 // YSXA 5161 punpcklwd xmm6, xmm3 // First 4 5162 punpckhwd xmm4, xmm3 // Next 4 5163 movdqa xmm7, xmm1 // YSXA 5164 punpcklwd xmm7, xmm0 // Next 4 5165 punpckhwd xmm1, xmm0 // Last 4 5166 movdqu [edx], xmm6 5167 movdqu [edx + 16], xmm4 5168 movdqu [edx + 32], xmm7 5169 movdqu [edx + 48], xmm1 5170 lea edx, [edx + 64] 5171 sub ecx, 16 5172 jg convertloop 5173 5174 pop esi 5175 ret 5176 } 5177 } 5178 #endif // HAS_SOBELXYROW_SSE2 5179 5180 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 5181 // Consider float CumulativeSum. 5182 // Consider calling CumulativeSum one row at time as needed. 5183 // Consider circular CumulativeSum buffer of radius * 2 + 1 height. 5184 // Convert cumulative sum for an area to an average for 1 pixel. 5185 // topleft is pointer to top left of CumulativeSum buffer for area. 5186 // botleft is pointer to bottom left of CumulativeSum buffer. 5187 // width is offset from left to right of area in CumulativeSum buffer measured 5188 // in number of ints. 5189 // area is the number of pixels in the area being averaged. 5190 // dst points to pixel to store result to. 5191 // count is number of averaged pixels to produce. 5192 // Does 4 pixels at a time. 5193 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, 5194 int width, int area, uint8* dst, 5195 int count) { 5196 __asm { 5197 mov eax, topleft // eax topleft 5198 mov esi, botleft // esi botleft 5199 mov edx, width 5200 movd xmm5, area 5201 mov edi, dst 5202 mov ecx, count 5203 cvtdq2ps xmm5, xmm5 5204 rcpss xmm4, xmm5 // 1.0f / area 5205 pshufd xmm4, xmm4, 0 5206 sub ecx, 4 5207 jl l4b 5208 5209 cmp area, 128 // 128 pixels will not overflow 15 bits. 
5210 ja l4 5211 5212 pshufd xmm5, xmm5, 0 // area 5213 pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 5214 psrld xmm6, 16 5215 cvtdq2ps xmm6, xmm6 5216 addps xmm5, xmm6 // (65536.0 + area - 1) 5217 mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area 5218 cvtps2dq xmm5, xmm5 // 0.16 fixed point 5219 packssdw xmm5, xmm5 // 16 bit shorts 5220 5221 // 4 pixel loop small blocks. 5222 s4: 5223 // top left 5224 movdqu xmm0, [eax] 5225 movdqu xmm1, [eax + 16] 5226 movdqu xmm2, [eax + 32] 5227 movdqu xmm3, [eax + 48] 5228 5229 // - top right 5230 psubd xmm0, [eax + edx * 4] 5231 psubd xmm1, [eax + edx * 4 + 16] 5232 psubd xmm2, [eax + edx * 4 + 32] 5233 psubd xmm3, [eax + edx * 4 + 48] 5234 lea eax, [eax + 64] 5235 5236 // - bottom left 5237 psubd xmm0, [esi] 5238 psubd xmm1, [esi + 16] 5239 psubd xmm2, [esi + 32] 5240 psubd xmm3, [esi + 48] 5241 5242 // + bottom right 5243 paddd xmm0, [esi + edx * 4] 5244 paddd xmm1, [esi + edx * 4 + 16] 5245 paddd xmm2, [esi + edx * 4 + 32] 5246 paddd xmm3, [esi + edx * 4 + 48] 5247 lea esi, [esi + 64] 5248 5249 packssdw xmm0, xmm1 // pack 4 pixels into 2 registers 5250 packssdw xmm2, xmm3 5251 5252 pmulhuw xmm0, xmm5 5253 pmulhuw xmm2, xmm5 5254 5255 packuswb xmm0, xmm2 5256 movdqu [edi], xmm0 5257 lea edi, [edi + 16] 5258 sub ecx, 4 5259 jge s4 5260 5261 jmp l4b 5262 5263 // 4 pixel loop 5264 l4: 5265 // top left 5266 movdqu xmm0, [eax] 5267 movdqu xmm1, [eax + 16] 5268 movdqu xmm2, [eax + 32] 5269 movdqu xmm3, [eax + 48] 5270 5271 // - top right 5272 psubd xmm0, [eax + edx * 4] 5273 psubd xmm1, [eax + edx * 4 + 16] 5274 psubd xmm2, [eax + edx * 4 + 32] 5275 psubd xmm3, [eax + edx * 4 + 48] 5276 lea eax, [eax + 64] 5277 5278 // - bottom left 5279 psubd xmm0, [esi] 5280 psubd xmm1, [esi + 16] 5281 psubd xmm2, [esi + 32] 5282 psubd xmm3, [esi + 48] 5283 5284 // + bottom right 5285 paddd xmm0, [esi + edx * 4] 5286 paddd xmm1, [esi + edx * 4 + 16] 5287 paddd xmm2, [esi + edx * 4 + 32] 5288 paddd xmm3, [esi + edx * 4 + 48] 5289 lea esi, [esi + 64] 5290 5291 cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area 5292 cvtdq2ps xmm1, xmm1 5293 mulps xmm0, xmm4 5294 mulps xmm1, xmm4 5295 cvtdq2ps xmm2, xmm2 5296 cvtdq2ps xmm3, xmm3 5297 mulps xmm2, xmm4 5298 mulps xmm3, xmm4 5299 cvtps2dq xmm0, xmm0 5300 cvtps2dq xmm1, xmm1 5301 cvtps2dq xmm2, xmm2 5302 cvtps2dq xmm3, xmm3 5303 packssdw xmm0, xmm1 5304 packssdw xmm2, xmm3 5305 packuswb xmm0, xmm2 5306 movdqu [edi], xmm0 5307 lea edi, [edi + 16] 5308 sub ecx, 4 5309 jge l4 5310 5311 l4b: 5312 add ecx, 4 - 1 5313 jl l1b 5314 5315 // 1 pixel loop 5316 l1: 5317 movdqu xmm0, [eax] 5318 psubd xmm0, [eax + edx * 4] 5319 lea eax, [eax + 16] 5320 psubd xmm0, [esi] 5321 paddd xmm0, [esi + edx * 4] 5322 lea esi, [esi + 16] 5323 cvtdq2ps xmm0, xmm0 5324 mulps xmm0, xmm4 5325 cvtps2dq xmm0, xmm0 5326 packssdw xmm0, xmm0 5327 packuswb xmm0, xmm0 5328 movd dword ptr [edi], xmm0 5329 lea edi, [edi + 4] 5330 sub ecx, 1 5331 jge l1 5332 l1b: 5333 } 5334 } 5335 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 5336 5337 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 5338 // Creates a table of cumulative sums where each value is a sum of all values 5339 // above and to the left of the value. 
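// For reference, a minimal scalar sketch of the recurrence (illustrative
// only; the helper name is ours): each ARGB pixel contributes 4 ints and
//   cumsum[x] = running_sum_of_this_row(0..x) + previous_cumsum[x]
// so a rectangle sum can later be recovered from four corner lookups, as
// CumulativeSumToAverageRow above does.
static void ComputeCumulativeSumRowSketch(const uint8* row, int32* cumsum,
                                          const int32* previous_cumsum,
                                          int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int x;
  for (x = 0; x < width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}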
5340 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 5341 const int32* previous_cumsum, int width) { 5342 __asm { 5343 mov eax, row 5344 mov edx, cumsum 5345 mov esi, previous_cumsum 5346 mov ecx, width 5347 pxor xmm0, xmm0 5348 pxor xmm1, xmm1 5349 5350 sub ecx, 4 5351 jl l4b 5352 test edx, 15 5353 jne l4b 5354 5355 // 4 pixel loop 5356 l4: 5357 movdqu xmm2, [eax] // 4 argb pixels 16 bytes. 5358 lea eax, [eax + 16] 5359 movdqa xmm4, xmm2 5360 5361 punpcklbw xmm2, xmm1 5362 movdqa xmm3, xmm2 5363 punpcklwd xmm2, xmm1 5364 punpckhwd xmm3, xmm1 5365 5366 punpckhbw xmm4, xmm1 5367 movdqa xmm5, xmm4 5368 punpcklwd xmm4, xmm1 5369 punpckhwd xmm5, xmm1 5370 5371 paddd xmm0, xmm2 5372 movdqu xmm2, [esi] // previous row above. 5373 paddd xmm2, xmm0 5374 5375 paddd xmm0, xmm3 5376 movdqu xmm3, [esi + 16] 5377 paddd xmm3, xmm0 5378 5379 paddd xmm0, xmm4 5380 movdqu xmm4, [esi + 32] 5381 paddd xmm4, xmm0 5382 5383 paddd xmm0, xmm5 5384 movdqu xmm5, [esi + 48] 5385 lea esi, [esi + 64] 5386 paddd xmm5, xmm0 5387 5388 movdqu [edx], xmm2 5389 movdqu [edx + 16], xmm3 5390 movdqu [edx + 32], xmm4 5391 movdqu [edx + 48], xmm5 5392 5393 lea edx, [edx + 64] 5394 sub ecx, 4 5395 jge l4 5396 5397 l4b: 5398 add ecx, 4 - 1 5399 jl l1b 5400 5401 // 1 pixel loop 5402 l1: 5403 movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. 5404 lea eax, [eax + 4] 5405 punpcklbw xmm2, xmm1 5406 punpcklwd xmm2, xmm1 5407 paddd xmm0, xmm2 5408 movdqu xmm2, [esi] 5409 lea esi, [esi + 16] 5410 paddd xmm2, xmm0 5411 movdqu [edx], xmm2 5412 lea edx, [edx + 16] 5413 sub ecx, 1 5414 jge l1 5415 5416 l1b: 5417 } 5418 } 5419 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 5420 5421 #ifdef HAS_ARGBAFFINEROW_SSE2 5422 // Copy ARGB pixels from source image with slope to a row of destination. 5423 __declspec(naked) 5424 LIBYUV_API 5425 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 5426 uint8* dst_argb, const float* uv_dudv, int width) { 5427 __asm { 5428 push esi 5429 push edi 5430 mov eax, [esp + 12] // src_argb 5431 mov esi, [esp + 16] // stride 5432 mov edx, [esp + 20] // dst_argb 5433 mov ecx, [esp + 24] // pointer to uv_dudv 5434 movq xmm2, qword ptr [ecx] // uv 5435 movq xmm7, qword ptr [ecx + 8] // dudv 5436 mov ecx, [esp + 28] // width 5437 shl esi, 16 // 4, stride 5438 add esi, 4 5439 movd xmm5, esi 5440 sub ecx, 4 5441 jl l4b 5442 5443 // setup for 4 pixel loop 5444 pshufd xmm7, xmm7, 0x44 // dup dudv 5445 pshufd xmm5, xmm5, 0 // dup 4, stride 5446 movdqa xmm0, xmm2 // x0, y0, x1, y1 5447 addps xmm0, xmm7 5448 movlhps xmm2, xmm0 5449 movdqa xmm4, xmm7 5450 addps xmm4, xmm4 // dudv *= 2 5451 movdqa xmm3, xmm2 // x2, y2, x3, y3 5452 addps xmm3, xmm4 5453 addps xmm4, xmm4 // dudv *= 4 5454 5455 // 4 pixel loop 5456 l4: 5457 cvttps2dq xmm0, xmm2 // x, y float to int first 2 5458 cvttps2dq xmm1, xmm3 // x, y float to int next 2 5459 packssdw xmm0, xmm1 // x, y as 8 shorts 5460 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
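    // The four byte offsets are now packed in xmm0; the code below extracts
    // them one at a time with movd/pshufd, gathers the source pixels, and
    // advances the x,y coordinates for the next group of four.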
5461 movd esi, xmm0 5462 pshufd xmm0, xmm0, 0x39 // shift right 5463 movd edi, xmm0 5464 pshufd xmm0, xmm0, 0x39 // shift right 5465 movd xmm1, [eax + esi] // read pixel 0 5466 movd xmm6, [eax + edi] // read pixel 1 5467 punpckldq xmm1, xmm6 // combine pixel 0 and 1 5468 addps xmm2, xmm4 // x, y += dx, dy first 2 5469 movq qword ptr [edx], xmm1 5470 movd esi, xmm0 5471 pshufd xmm0, xmm0, 0x39 // shift right 5472 movd edi, xmm0 5473 movd xmm6, [eax + esi] // read pixel 2 5474 movd xmm0, [eax + edi] // read pixel 3 5475 punpckldq xmm6, xmm0 // combine pixel 2 and 3 5476 addps xmm3, xmm4 // x, y += dx, dy next 2 5477 movq qword ptr 8[edx], xmm6 5478 lea edx, [edx + 16] 5479 sub ecx, 4 5480 jge l4 5481 5482 l4b: 5483 add ecx, 4 - 1 5484 jl l1b 5485 5486 // 1 pixel loop 5487 l1: 5488 cvttps2dq xmm0, xmm2 // x, y float to int 5489 packssdw xmm0, xmm0 // x, y as shorts 5490 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride 5491 addps xmm2, xmm7 // x, y += dx, dy 5492 movd esi, xmm0 5493 movd xmm0, [eax + esi] // copy a pixel 5494 movd [edx], xmm0 5495 lea edx, [edx + 4] 5496 sub ecx, 1 5497 jge l1 5498 l1b: 5499 pop edi 5500 pop esi 5501 ret 5502 } 5503 } 5504 #endif // HAS_ARGBAFFINEROW_SSE2 5505 5506 #ifdef HAS_INTERPOLATEROW_AVX2 5507 // Bilinear filter 32x2 -> 32x1 5508 __declspec(naked) 5509 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, 5510 ptrdiff_t src_stride, int dst_width, 5511 int source_y_fraction) { 5512 __asm { 5513 push esi 5514 push edi 5515 mov edi, [esp + 8 + 4] // dst_ptr 5516 mov esi, [esp + 8 + 8] // src_ptr 5517 mov edx, [esp + 8 + 12] // src_stride 5518 mov ecx, [esp + 8 + 16] // dst_width 5519 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5520 shr eax, 1 5521 // Dispatch to specialized filters if applicable. 5522 cmp eax, 0 5523 je xloop100 // 0 / 128. Blend 100 / 0. 5524 sub edi, esi 5525 cmp eax, 32 5526 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. 5527 cmp eax, 64 5528 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 5529 cmp eax, 96 5530 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 5531 5532 vmovd xmm0, eax // high fraction 0..127 5533 neg eax 5534 add eax, 128 5535 vmovd xmm5, eax // low fraction 128..1 5536 vpunpcklbw xmm5, xmm5, xmm0 5537 vpunpcklwd xmm5, xmm5, xmm5 5538 vpxor ymm0, ymm0, ymm0 5539 vpermd ymm5, ymm0, ymm5 5540 5541 xloop: 5542 vmovdqu ymm0, [esi] 5543 vmovdqu ymm2, [esi + edx] 5544 vpunpckhbw ymm1, ymm0, ymm2 // mutates 5545 vpunpcklbw ymm0, ymm0, ymm2 // mutates 5546 vpmaddubsw ymm0, ymm0, ymm5 5547 vpmaddubsw ymm1, ymm1, ymm5 5548 vpsrlw ymm0, ymm0, 7 5549 vpsrlw ymm1, ymm1, 7 5550 vpackuswb ymm0, ymm0, ymm1 // unmutates 5551 vmovdqu [esi + edi], ymm0 5552 lea esi, [esi + 32] 5553 sub ecx, 32 5554 jg xloop 5555 jmp xloop99 5556 5557 // Blend 25 / 75. 5558 xloop25: 5559 vmovdqu ymm0, [esi] 5560 vmovdqu ymm1, [esi + edx] 5561 vpavgb ymm0, ymm0, ymm1 5562 vpavgb ymm0, ymm0, ymm1 5563 vmovdqu [esi + edi], ymm0 5564 lea esi, [esi + 32] 5565 sub ecx, 32 5566 jg xloop25 5567 jmp xloop99 5568 5569 // Blend 50 / 50. 5570 xloop50: 5571 vmovdqu ymm0, [esi] 5572 vpavgb ymm0, ymm0, [esi + edx] 5573 vmovdqu [esi + edi], ymm0 5574 lea esi, [esi + 32] 5575 sub ecx, 32 5576 jg xloop50 5577 jmp xloop99 5578 5579 // Blend 75 / 25. 5580 xloop75: 5581 vmovdqu ymm1, [esi] 5582 vmovdqu ymm0, [esi + edx] 5583 vpavgb ymm0, ymm0, ymm1 5584 vpavgb ymm0, ymm0, ymm1 5585 vmovdqu [esi + edi], ymm0 5586 lea esi, [esi + 32] 5587 sub ecx, 32 5588 jg xloop75 5589 jmp xloop99 5590 5591 // Blend 100 / 0 - Copy row unchanged. 
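    // Straight copy: the jump here is taken before 'sub edi, esi', so edi
    // still holds dst_ptr, esi holds src_ptr and ecx the byte count for
    // rep movsb.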
5592 xloop100: 5593 rep movsb 5594 5595 xloop99: 5596 pop edi 5597 pop esi 5598 vzeroupper 5599 ret 5600 } 5601 } 5602 #endif // HAS_INTERPOLATEROW_AVX2 5603 5604 // Bilinear filter 16x2 -> 16x1 5605 __declspec(naked) 5606 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 5607 ptrdiff_t src_stride, int dst_width, 5608 int source_y_fraction) { 5609 __asm { 5610 push esi 5611 push edi 5612 mov edi, [esp + 8 + 4] // dst_ptr 5613 mov esi, [esp + 8 + 8] // src_ptr 5614 mov edx, [esp + 8 + 12] // src_stride 5615 mov ecx, [esp + 8 + 16] // dst_width 5616 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5617 sub edi, esi 5618 shr eax, 1 5619 // Dispatch to specialized filters if applicable. 5620 cmp eax, 0 5621 je xloop100 // 0 / 128. Blend 100 / 0. 5622 cmp eax, 32 5623 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. 5624 cmp eax, 64 5625 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. 5626 cmp eax, 96 5627 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. 5628 5629 movd xmm0, eax // high fraction 0..127 5630 neg eax 5631 add eax, 128 5632 movd xmm5, eax // low fraction 128..1 5633 punpcklbw xmm5, xmm0 5634 punpcklwd xmm5, xmm5 5635 pshufd xmm5, xmm5, 0 5636 5637 xloop: 5638 movdqu xmm0, [esi] 5639 movdqu xmm2, [esi + edx] 5640 movdqu xmm1, xmm0 5641 punpcklbw xmm0, xmm2 5642 punpckhbw xmm1, xmm2 5643 pmaddubsw xmm0, xmm5 5644 pmaddubsw xmm1, xmm5 5645 psrlw xmm0, 7 5646 psrlw xmm1, 7 5647 packuswb xmm0, xmm1 5648 movdqu [esi + edi], xmm0 5649 lea esi, [esi + 16] 5650 sub ecx, 16 5651 jg xloop 5652 jmp xloop99 5653 5654 // Blend 25 / 75. 5655 xloop25: 5656 movdqu xmm0, [esi] 5657 movdqu xmm1, [esi + edx] 5658 pavgb xmm0, xmm1 5659 pavgb xmm0, xmm1 5660 movdqu [esi + edi], xmm0 5661 lea esi, [esi + 16] 5662 sub ecx, 16 5663 jg xloop25 5664 jmp xloop99 5665 5666 // Blend 50 / 50. 5667 xloop50: 5668 movdqu xmm0, [esi] 5669 movdqu xmm1, [esi + edx] 5670 pavgb xmm0, xmm1 5671 movdqu [esi + edi], xmm0 5672 lea esi, [esi + 16] 5673 sub ecx, 16 5674 jg xloop50 5675 jmp xloop99 5676 5677 // Blend 75 / 25. 5678 xloop75: 5679 movdqu xmm1, [esi] 5680 movdqu xmm0, [esi + edx] 5681 pavgb xmm0, xmm1 5682 pavgb xmm0, xmm1 5683 movdqu [esi + edi], xmm0 5684 lea esi, [esi + 16] 5685 sub ecx, 16 5686 jg xloop75 5687 jmp xloop99 5688 5689 // Blend 100 / 0 - Copy row unchanged. 5690 xloop100: 5691 movdqu xmm0, [esi] 5692 movdqu [esi + edi], xmm0 5693 lea esi, [esi + 16] 5694 sub ecx, 16 5695 jg xloop100 5696 5697 xloop99: 5698 pop edi 5699 pop esi 5700 ret 5701 } 5702 } 5703 5704 #ifdef HAS_INTERPOLATEROW_SSE2 5705 // Bilinear filter 16x2 -> 16x1 5706 __declspec(naked) 5707 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, 5708 ptrdiff_t src_stride, int dst_width, 5709 int source_y_fraction) { 5710 __asm { 5711 push esi 5712 push edi 5713 mov edi, [esp + 8 + 4] // dst_ptr 5714 mov esi, [esp + 8 + 8] // src_ptr 5715 mov edx, [esp + 8 + 12] // src_stride 5716 mov ecx, [esp + 8 + 16] // dst_width 5717 mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 5718 sub edi, esi 5719 // Dispatch to specialized filters if applicable. 5720 cmp eax, 0 5721 je xloop100 // 0 / 256. Blend 100 / 0. 5722 cmp eax, 64 5723 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. 5724 cmp eax, 128 5725 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 5726 cmp eax, 192 5727 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. 
5728 5729 movd xmm5, eax // xmm5 = y fraction 5730 punpcklbw xmm5, xmm5 5731 psrlw xmm5, 1 5732 punpcklwd xmm5, xmm5 5733 punpckldq xmm5, xmm5 5734 punpcklqdq xmm5, xmm5 5735 pxor xmm4, xmm4 5736 5737 xloop: 5738 movdqu xmm0, [esi] // row0 5739 movdqu xmm2, [esi + edx] // row1 5740 movdqu xmm1, xmm0 5741 movdqu xmm3, xmm2 5742 punpcklbw xmm2, xmm4 5743 punpckhbw xmm3, xmm4 5744 punpcklbw xmm0, xmm4 5745 punpckhbw xmm1, xmm4 5746 psubw xmm2, xmm0 // row1 - row0 5747 psubw xmm3, xmm1 5748 paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 5749 paddw xmm3, xmm3 5750 pmulhw xmm2, xmm5 // scale diff 5751 pmulhw xmm3, xmm5 5752 paddw xmm0, xmm2 // sum rows 5753 paddw xmm1, xmm3 5754 packuswb xmm0, xmm1 5755 movdqu [esi + edi], xmm0 5756 lea esi, [esi + 16] 5757 sub ecx, 16 5758 jg xloop 5759 jmp xloop99 5760 5761 // Blend 25 / 75. 5762 xloop25: 5763 movdqu xmm0, [esi] 5764 movdqu xmm1, [esi + edx] 5765 pavgb xmm0, xmm1 5766 pavgb xmm0, xmm1 5767 movdqu [esi + edi], xmm0 5768 lea esi, [esi + 16] 5769 sub ecx, 16 5770 jg xloop25 5771 jmp xloop99 5772 5773 // Blend 50 / 50. 5774 xloop50: 5775 movdqu xmm0, [esi] 5776 movdqu xmm1, [esi + edx] 5777 pavgb xmm0, xmm1 5778 movdqu [esi + edi], xmm0 5779 lea esi, [esi + 16] 5780 sub ecx, 16 5781 jg xloop50 5782 jmp xloop99 5783 5784 // Blend 75 / 25. 5785 xloop75: 5786 movdqu xmm1, [esi] 5787 movdqu xmm0, [esi + edx] 5788 pavgb xmm0, xmm1 5789 pavgb xmm0, xmm1 5790 movdqu [esi + edi], xmm0 5791 lea esi, [esi + 16] 5792 sub ecx, 16 5793 jg xloop75 5794 jmp xloop99 5795 5796 // Blend 100 / 0 - Copy row unchanged. 5797 xloop100: 5798 movdqu xmm0, [esi] 5799 movdqu [esi + edi], xmm0 5800 lea esi, [esi + 16] 5801 sub ecx, 16 5802 jg xloop100 5803 5804 xloop99: 5805 pop edi 5806 pop esi 5807 ret 5808 } 5809 } 5810 #endif // HAS_INTERPOLATEROW_SSE2 5811 5812 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 5813 __declspec(naked) 5814 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 5815 const uint8* shuffler, int pix) { 5816 __asm { 5817 mov eax, [esp + 4] // src_argb 5818 mov edx, [esp + 8] // dst_argb 5819 mov ecx, [esp + 12] // shuffler 5820 movdqu xmm5, [ecx] 5821 mov ecx, [esp + 16] // pix 5822 5823 wloop: 5824 movdqu xmm0, [eax] 5825 movdqu xmm1, [eax + 16] 5826 lea eax, [eax + 32] 5827 pshufb xmm0, xmm5 5828 pshufb xmm1, xmm5 5829 movdqu [edx], xmm0 5830 movdqu [edx + 16], xmm1 5831 lea edx, [edx + 32] 5832 sub ecx, 8 5833 jg wloop 5834 ret 5835 } 5836 } 5837 5838 #ifdef HAS_ARGBSHUFFLEROW_AVX2 5839 __declspec(naked) 5840 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, 5841 const uint8* shuffler, int pix) { 5842 __asm { 5843 mov eax, [esp + 4] // src_argb 5844 mov edx, [esp + 8] // dst_argb 5845 mov ecx, [esp + 12] // shuffler 5846 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 
5847 mov ecx, [esp + 16] // pix 5848 5849 wloop: 5850 vmovdqu ymm0, [eax] 5851 vmovdqu ymm1, [eax + 32] 5852 lea eax, [eax + 64] 5853 vpshufb ymm0, ymm0, ymm5 5854 vpshufb ymm1, ymm1, ymm5 5855 vmovdqu [edx], ymm0 5856 vmovdqu [edx + 32], ymm1 5857 lea edx, [edx + 64] 5858 sub ecx, 16 5859 jg wloop 5860 5861 vzeroupper 5862 ret 5863 } 5864 } 5865 #endif // HAS_ARGBSHUFFLEROW_AVX2 5866 5867 __declspec(naked) 5868 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, 5869 const uint8* shuffler, int pix) { 5870 __asm { 5871 push ebx 5872 push esi 5873 mov eax, [esp + 8 + 4] // src_argb 5874 mov edx, [esp + 8 + 8] // dst_argb 5875 mov esi, [esp + 8 + 12] // shuffler 5876 mov ecx, [esp + 8 + 16] // pix 5877 pxor xmm5, xmm5 5878 5879 mov ebx, [esi] // shuffler 5880 cmp ebx, 0x03000102 5881 je shuf_3012 5882 cmp ebx, 0x00010203 5883 je shuf_0123 5884 cmp ebx, 0x00030201 5885 je shuf_0321 5886 cmp ebx, 0x02010003 5887 je shuf_2103 5888 5889 // TODO(fbarchard): Use one source pointer and 3 offsets. 5890 shuf_any1: 5891 movzx ebx, byte ptr [esi] 5892 movzx ebx, byte ptr [eax + ebx] 5893 mov [edx], bl 5894 movzx ebx, byte ptr [esi + 1] 5895 movzx ebx, byte ptr [eax + ebx] 5896 mov [edx + 1], bl 5897 movzx ebx, byte ptr [esi + 2] 5898 movzx ebx, byte ptr [eax + ebx] 5899 mov [edx + 2], bl 5900 movzx ebx, byte ptr [esi + 3] 5901 movzx ebx, byte ptr [eax + ebx] 5902 mov [edx + 3], bl 5903 lea eax, [eax + 4] 5904 lea edx, [edx + 4] 5905 sub ecx, 1 5906 jg shuf_any1 5907 jmp shuf99 5908 5909 shuf_0123: 5910 movdqu xmm0, [eax] 5911 lea eax, [eax + 16] 5912 movdqa xmm1, xmm0 5913 punpcklbw xmm0, xmm5 5914 punpckhbw xmm1, xmm5 5915 pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB 5916 pshuflw xmm0, xmm0, 01Bh 5917 pshufhw xmm1, xmm1, 01Bh 5918 pshuflw xmm1, xmm1, 01Bh 5919 packuswb xmm0, xmm1 5920 movdqu [edx], xmm0 5921 lea edx, [edx + 16] 5922 sub ecx, 4 5923 jg shuf_0123 5924 jmp shuf99 5925 5926 shuf_0321: 5927 movdqu xmm0, [eax] 5928 lea eax, [eax + 16] 5929 movdqa xmm1, xmm0 5930 punpcklbw xmm0, xmm5 5931 punpckhbw xmm1, xmm5 5932 pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB 5933 pshuflw xmm0, xmm0, 039h 5934 pshufhw xmm1, xmm1, 039h 5935 pshuflw xmm1, xmm1, 039h 5936 packuswb xmm0, xmm1 5937 movdqu [edx], xmm0 5938 lea edx, [edx + 16] 5939 sub ecx, 4 5940 jg shuf_0321 5941 jmp shuf99 5942 5943 shuf_2103: 5944 movdqu xmm0, [eax] 5945 lea eax, [eax + 16] 5946 movdqa xmm1, xmm0 5947 punpcklbw xmm0, xmm5 5948 punpckhbw xmm1, xmm5 5949 pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA 5950 pshuflw xmm0, xmm0, 093h 5951 pshufhw xmm1, xmm1, 093h 5952 pshuflw xmm1, xmm1, 093h 5953 packuswb xmm0, xmm1 5954 movdqu [edx], xmm0 5955 lea edx, [edx + 16] 5956 sub ecx, 4 5957 jg shuf_2103 5958 jmp shuf99 5959 5960 shuf_3012: 5961 movdqu xmm0, [eax] 5962 lea eax, [eax + 16] 5963 movdqa xmm1, xmm0 5964 punpcklbw xmm0, xmm5 5965 punpckhbw xmm1, xmm5 5966 pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB 5967 pshuflw xmm0, xmm0, 0C6h 5968 pshufhw xmm1, xmm1, 0C6h 5969 pshuflw xmm1, xmm1, 0C6h 5970 packuswb xmm0, xmm1 5971 movdqu [edx], xmm0 5972 lea edx, [edx + 16] 5973 sub ecx, 4 5974 jg shuf_3012 5975 5976 shuf99: 5977 pop esi 5978 pop ebx 5979 ret 5980 } 5981 } 5982 5983 // YUY2 - Macro-pixel = 2 image pixels 5984 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
5985 5986 // UYVY - Macro-pixel = 2 image pixels 5987 // U0Y0V0Y1 5988 5989 __declspec(naked) 5990 void I422ToYUY2Row_SSE2(const uint8* src_y, 5991 const uint8* src_u, 5992 const uint8* src_v, 5993 uint8* dst_frame, int width) { 5994 __asm { 5995 push esi 5996 push edi 5997 mov eax, [esp + 8 + 4] // src_y 5998 mov esi, [esp + 8 + 8] // src_u 5999 mov edx, [esp + 8 + 12] // src_v 6000 mov edi, [esp + 8 + 16] // dst_frame 6001 mov ecx, [esp + 8 + 20] // width 6002 sub edx, esi 6003 6004 convertloop: 6005 movq xmm2, qword ptr [esi] // U 6006 movq xmm3, qword ptr [esi + edx] // V 6007 lea esi, [esi + 8] 6008 punpcklbw xmm2, xmm3 // UV 6009 movdqu xmm0, [eax] // Y 6010 lea eax, [eax + 16] 6011 movdqa xmm1, xmm0 6012 punpcklbw xmm0, xmm2 // YUYV 6013 punpckhbw xmm1, xmm2 6014 movdqu [edi], xmm0 6015 movdqu [edi + 16], xmm1 6016 lea edi, [edi + 32] 6017 sub ecx, 16 6018 jg convertloop 6019 6020 pop edi 6021 pop esi 6022 ret 6023 } 6024 } 6025 6026 __declspec(naked) 6027 void I422ToUYVYRow_SSE2(const uint8* src_y, 6028 const uint8* src_u, 6029 const uint8* src_v, 6030 uint8* dst_frame, int width) { 6031 __asm { 6032 push esi 6033 push edi 6034 mov eax, [esp + 8 + 4] // src_y 6035 mov esi, [esp + 8 + 8] // src_u 6036 mov edx, [esp + 8 + 12] // src_v 6037 mov edi, [esp + 8 + 16] // dst_frame 6038 mov ecx, [esp + 8 + 20] // width 6039 sub edx, esi 6040 6041 convertloop: 6042 movq xmm2, qword ptr [esi] // U 6043 movq xmm3, qword ptr [esi + edx] // V 6044 lea esi, [esi + 8] 6045 punpcklbw xmm2, xmm3 // UV 6046 movdqu xmm0, [eax] // Y 6047 movdqa xmm1, xmm2 6048 lea eax, [eax + 16] 6049 punpcklbw xmm1, xmm0 // UYVY 6050 punpckhbw xmm2, xmm0 6051 movdqu [edi], xmm1 6052 movdqu [edi + 16], xmm2 6053 lea edi, [edi + 32] 6054 sub ecx, 16 6055 jg convertloop 6056 6057 pop edi 6058 pop esi 6059 ret 6060 } 6061 } 6062 6063 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 6064 __declspec(naked) 6065 void ARGBPolynomialRow_SSE2(const uint8* src_argb, 6066 uint8* dst_argb, const float* poly, 6067 int width) { 6068 __asm { 6069 push esi 6070 mov eax, [esp + 4 + 4] /* src_argb */ 6071 mov edx, [esp + 4 + 8] /* dst_argb */ 6072 mov esi, [esp + 4 + 12] /* poly */ 6073 mov ecx, [esp + 4 + 16] /* width */ 6074 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. 6075 6076 // 2 pixel loop. 
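    // Each channel c is converted to float and evaluated as
    // C0 + C1*c + C2*c*c + C3*c*c*c using the four coefficient vectors in
    // poly, then truncated and packed back to bytes with saturation.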
6077 convertloop: 6078 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel 6079 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel 6080 movq xmm0, qword ptr [eax] // BGRABGRA 6081 lea eax, [eax + 8] 6082 punpcklbw xmm0, xmm3 6083 movdqa xmm4, xmm0 6084 punpcklwd xmm0, xmm3 // pixel 0 6085 punpckhwd xmm4, xmm3 // pixel 1 6086 cvtdq2ps xmm0, xmm0 // 4 floats 6087 cvtdq2ps xmm4, xmm4 6088 movdqa xmm1, xmm0 // X 6089 movdqa xmm5, xmm4 6090 mulps xmm0, [esi + 16] // C1 * X 6091 mulps xmm4, [esi + 16] 6092 addps xmm0, [esi] // result = C0 + C1 * X 6093 addps xmm4, [esi] 6094 movdqa xmm2, xmm1 6095 movdqa xmm6, xmm5 6096 mulps xmm2, xmm1 // X * X 6097 mulps xmm6, xmm5 6098 mulps xmm1, xmm2 // X * X * X 6099 mulps xmm5, xmm6 6100 mulps xmm2, [esi + 32] // C2 * X * X 6101 mulps xmm6, [esi + 32] 6102 mulps xmm1, [esi + 48] // C3 * X * X * X 6103 mulps xmm5, [esi + 48] 6104 addps xmm0, xmm2 // result += C2 * X * X 6105 addps xmm4, xmm6 6106 addps xmm0, xmm1 // result += C3 * X * X * X 6107 addps xmm4, xmm5 6108 cvttps2dq xmm0, xmm0 6109 cvttps2dq xmm4, xmm4 6110 packuswb xmm0, xmm4 6111 packuswb xmm0, xmm0 6112 movq qword ptr [edx], xmm0 6113 lea edx, [edx + 8] 6114 sub ecx, 2 6115 jg convertloop 6116 pop esi 6117 ret 6118 } 6119 } 6120 #endif // HAS_ARGBPOLYNOMIALROW_SSE2 6121 6122 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 6123 __declspec(naked) 6124 void ARGBPolynomialRow_AVX2(const uint8* src_argb, 6125 uint8* dst_argb, const float* poly, 6126 int width) { 6127 __asm { 6128 mov eax, [esp + 4] /* src_argb */ 6129 mov edx, [esp + 8] /* dst_argb */ 6130 mov ecx, [esp + 12] /* poly */ 6131 vbroadcastf128 ymm4, [ecx] // C0 6132 vbroadcastf128 ymm5, [ecx + 16] // C1 6133 vbroadcastf128 ymm6, [ecx + 32] // C2 6134 vbroadcastf128 ymm7, [ecx + 48] // C3 6135 mov ecx, [esp + 16] /* width */ 6136 6137 // 2 pixel loop. 6138 convertloop: 6139 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels 6140 lea eax, [eax + 8] 6141 vcvtdq2ps ymm0, ymm0 // X 8 floats 6142 vmulps ymm2, ymm0, ymm0 // X * X 6143 vmulps ymm3, ymm0, ymm7 // C3 * X 6144 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X 6145 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X 6146 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X 6147 vcvttps2dq ymm0, ymm0 6148 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 6149 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 6150 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 6151 vmovq qword ptr [edx], xmm0 6152 lea edx, [edx + 8] 6153 sub ecx, 2 6154 jg convertloop 6155 vzeroupper 6156 ret 6157 } 6158 } 6159 #endif // HAS_ARGBPOLYNOMIALROW_AVX2 6160 6161 #ifdef HAS_ARGBCOLORTABLEROW_X86 6162 // Tranform ARGB pixels with color table. 6163 __declspec(naked) 6164 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, 6165 int width) { 6166 __asm { 6167 push esi 6168 mov eax, [esp + 4 + 4] /* dst_argb */ 6169 mov esi, [esp + 4 + 8] /* table_argb */ 6170 mov ecx, [esp + 4 + 12] /* width */ 6171 6172 // 1 pixel loop. 
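    // In-place lookup: each B, G, R and A byte indexes its own column of the
    // 256-entry table_argb (4 bytes per entry), one channel at a time.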
6173 convertloop: 6174 movzx edx, byte ptr [eax] 6175 lea eax, [eax + 4] 6176 movzx edx, byte ptr [esi + edx * 4] 6177 mov byte ptr [eax - 4], dl 6178 movzx edx, byte ptr [eax - 4 + 1] 6179 movzx edx, byte ptr [esi + edx * 4 + 1] 6180 mov byte ptr [eax - 4 + 1], dl 6181 movzx edx, byte ptr [eax - 4 + 2] 6182 movzx edx, byte ptr [esi + edx * 4 + 2] 6183 mov byte ptr [eax - 4 + 2], dl 6184 movzx edx, byte ptr [eax - 4 + 3] 6185 movzx edx, byte ptr [esi + edx * 4 + 3] 6186 mov byte ptr [eax - 4 + 3], dl 6187 dec ecx 6188 jg convertloop 6189 pop esi 6190 ret 6191 } 6192 } 6193 #endif // HAS_ARGBCOLORTABLEROW_X86 6194 6195 #ifdef HAS_RGBCOLORTABLEROW_X86 6196 // Tranform RGB pixels with color table. 6197 __declspec(naked) 6198 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { 6199 __asm { 6200 push esi 6201 mov eax, [esp + 4 + 4] /* dst_argb */ 6202 mov esi, [esp + 4 + 8] /* table_argb */ 6203 mov ecx, [esp + 4 + 12] /* width */ 6204 6205 // 1 pixel loop. 6206 convertloop: 6207 movzx edx, byte ptr [eax] 6208 lea eax, [eax + 4] 6209 movzx edx, byte ptr [esi + edx * 4] 6210 mov byte ptr [eax - 4], dl 6211 movzx edx, byte ptr [eax - 4 + 1] 6212 movzx edx, byte ptr [esi + edx * 4 + 1] 6213 mov byte ptr [eax - 4 + 1], dl 6214 movzx edx, byte ptr [eax - 4 + 2] 6215 movzx edx, byte ptr [esi + edx * 4 + 2] 6216 mov byte ptr [eax - 4 + 2], dl 6217 dec ecx 6218 jg convertloop 6219 6220 pop esi 6221 ret 6222 } 6223 } 6224 #endif // HAS_RGBCOLORTABLEROW_X86 6225 6226 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 6227 // Tranform RGB pixels with luma table. 6228 __declspec(naked) 6229 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 6230 int width, 6231 const uint8* luma, uint32 lumacoeff) { 6232 __asm { 6233 push esi 6234 push edi 6235 mov eax, [esp + 8 + 4] /* src_argb */ 6236 mov edi, [esp + 8 + 8] /* dst_argb */ 6237 mov ecx, [esp + 8 + 12] /* width */ 6238 movd xmm2, dword ptr [esp + 8 + 16] // luma table 6239 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff 6240 pshufd xmm2, xmm2, 0 6241 pshufd xmm3, xmm3, 0 6242 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 6243 psllw xmm4, 8 6244 pxor xmm5, xmm5 6245 6246 // 4 pixel loop. 6247 convertloop: 6248 movdqu xmm0, qword ptr [eax] // generate luma ptr 6249 pmaddubsw xmm0, xmm3 6250 phaddw xmm0, xmm0 6251 pand xmm0, xmm4 // mask out low bits 6252 punpcklwd xmm0, xmm5 6253 paddd xmm0, xmm2 // add table base 6254 movd esi, xmm0 6255 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 6256 6257 movzx edx, byte ptr [eax] 6258 movzx edx, byte ptr [esi + edx] 6259 mov byte ptr [edi], dl 6260 movzx edx, byte ptr [eax + 1] 6261 movzx edx, byte ptr [esi + edx] 6262 mov byte ptr [edi + 1], dl 6263 movzx edx, byte ptr [eax + 2] 6264 movzx edx, byte ptr [esi + edx] 6265 mov byte ptr [edi + 2], dl 6266 movzx edx, byte ptr [eax + 3] // copy alpha. 6267 mov byte ptr [edi + 3], dl 6268 6269 movd esi, xmm0 6270 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 6271 6272 movzx edx, byte ptr [eax + 4] 6273 movzx edx, byte ptr [esi + edx] 6274 mov byte ptr [edi + 4], dl 6275 movzx edx, byte ptr [eax + 5] 6276 movzx edx, byte ptr [esi + edx] 6277 mov byte ptr [edi + 5], dl 6278 movzx edx, byte ptr [eax + 6] 6279 movzx edx, byte ptr [esi + edx] 6280 mov byte ptr [edi + 6], dl 6281 movzx edx, byte ptr [eax + 7] // copy alpha. 
6282 mov byte ptr [edi + 7], dl 6283 6284 movd esi, xmm0 6285 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 6286 6287 movzx edx, byte ptr [eax + 8] 6288 movzx edx, byte ptr [esi + edx] 6289 mov byte ptr [edi + 8], dl 6290 movzx edx, byte ptr [eax + 9] 6291 movzx edx, byte ptr [esi + edx] 6292 mov byte ptr [edi + 9], dl 6293 movzx edx, byte ptr [eax + 10] 6294 movzx edx, byte ptr [esi + edx] 6295 mov byte ptr [edi + 10], dl 6296 movzx edx, byte ptr [eax + 11] // copy alpha. 6297 mov byte ptr [edi + 11], dl 6298 6299 movd esi, xmm0 6300 6301 movzx edx, byte ptr [eax + 12] 6302 movzx edx, byte ptr [esi + edx] 6303 mov byte ptr [edi + 12], dl 6304 movzx edx, byte ptr [eax + 13] 6305 movzx edx, byte ptr [esi + edx] 6306 mov byte ptr [edi + 13], dl 6307 movzx edx, byte ptr [eax + 14] 6308 movzx edx, byte ptr [esi + edx] 6309 mov byte ptr [edi + 14], dl 6310 movzx edx, byte ptr [eax + 15] // copy alpha. 6311 mov byte ptr [edi + 15], dl 6312 6313 lea eax, [eax + 16] 6314 lea edi, [edi + 16] 6315 sub ecx, 4 6316 jg convertloop 6317 6318 pop edi 6319 pop esi 6320 ret 6321 } 6322 } 6323 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6324 6325 #endif // defined(_M_X64) 6326 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6327 6328 #ifdef __cplusplus 6329 } // extern "C" 6330 } // namespace libyuv 6331 #endif 6332